From bb0531aeea534db4bf10b3105bae0b88a5d65030 Mon Sep 17 00:00:00 2001 From: Tomi Eckert Date: Wed, 1 Oct 2025 13:10:57 +0200 Subject: [PATCH] feat(monitor): Add HANA statement queue monitoring This commit introduces a new feature to monitor the HANA statement queue. Added STATEMENT_QUEUE_THRESHOLD and STATEMENT_QUEUE_CONSECUTIVE_RUNS to monitor/monitor.conf. Implemented logic in monitor/monitor.sh to query the statement queue length, track consecutive breaches of the defined threshold, and send notifications. Updated the script version to 1.3.0. Refactored log segment checks to only run when segments are found; the early `exit 0` is removed so that the remaining checks still run when no segments are found. Restructured the backup status check into an if/else block, replacing the early top-level `return`. --- monitor/monitor.conf | 5 +- monitor/monitor.sh | 107 ++++++++++++++++++++++++++----------------- 2 files changed, 70 insertions(+), 42 deletions(-) diff --git a/monitor/monitor.conf b/monitor/monitor.conf index 280432c..48b72f4 100644 --- a/monitor/monitor.conf +++ b/monitor/monitor.conf @@ -29,8 +29,11 @@ TRUNCATED_PERCENTAGE_THRESHOLD=50 FREE_PERCENTAGE_THRESHOLD=25 # Maximum age of the last successful full data backup in hours. BACKUP_THRESHOLD_HOURS=25 +# Statement queue length above which a run counts as a threshold breach +STATEMENT_QUEUE_THRESHOLD=100 +# Number of consecutive runs the queue must be over threshold to trigger an alert +STATEMENT_QUEUE_CONSECUTIVE_RUNS=3 # --- Monitored Directories --- # List of directories to check for disk usage (space-separated) DIRECTORIES_TO_MONITOR=("/hana/log" "/hana/shared" "/hana/data" "/usr/sap") - diff --git a/monitor/monitor.sh b/monitor/monitor.sh index 87eabbd..30c1925 100644 --- a/monitor/monitor.sh +++ b/monitor/monitor.sh @@ -1,9 +1,9 @@ #!/bin/bash -# Version: 1.2.3 +# Version: 1.3.0 # ============================================================================= # SAP HANA Monitoring Script # -# Checks HANA processes, disk usage, and log segment state. +# Checks HANA processes, disk usage, log segments, and statement queue. # Sends ntfy.sh notifications if thresholds are exceeded. 
# ============================================================================= @@ -114,7 +114,7 @@ for dir in "${DIRECTORIES_TO_MONITOR[@]}"; do continue fi usage=$(df -h "$dir" | awk 'NR==2 {print $5}' | sed 's/%//') - echo " - ${dir} is at ${usage}%" + echo " - ${dir} is at ${usage}%" if (( $(echo "$usage > $DISK_USAGE_THRESHOLD" | bc -l) )); then echo "🚨 Alert: ${dir} usage is at ${usage}% which is above the ${DISK_USAGE_THRESHOLD}% threshold." >&2 send_notification_if_changed "disk_usage_${dir//\//_}" "HANA Disk" "Disk usage for ${dir} is at ${usage}%." "true" "${usage}%" @@ -161,58 +161,83 @@ echo "â„šī¸ Free Segments: ${free_segments}" if [ $total_segments -eq 0 ]; then echo "âš ī¸ Warning: No log segments found. Skipping percentage checks." >&2 send_notification_if_changed "hana_log_segments_total" "HANA Log Segment Warning" "No log segments found. Skipping percentage checks." "true" "NO_LOG_SEGMENTS" - exit 0 else send_notification_if_changed "hana_log_segments_total" "HANA Log Segment" "Log segments found." "false" "OK" + truncated_percentage=$((truncated_segments * 100 / total_segments)) + if (( $(echo "$truncated_percentage > $TRUNCATED_PERCENTAGE_THRESHOLD" | bc -l) )); then + echo "🚨 Alert: ${truncated_percentage}% of log segments are 'Truncated'." >&2 + send_notification_if_changed "hana_log_truncated" "HANA Log Segment" "${truncated_percentage}% of HANA log segments are in 'Truncated' state." "true" "${truncated_percentage}%" + else + send_notification_if_changed "hana_log_truncated" "HANA Log Segment" "${truncated_percentage}% of HANA log segments are in 'Truncated' state (below threshold)." "false" "OK" + fi + + free_percentage=$((free_segments * 100 / total_segments)) + if (( $(echo "$free_percentage < $FREE_PERCENTAGE_THRESHOLD" | bc -l) )); then + echo "🚨 Alert: Only ${free_percentage}% of log segments are 'Free'." 
>&2 + send_notification_if_changed "hana_log_free" "HANA Log Segment" "Only ${free_percentage}% of HANA log segments are in 'Free' state." "true" "${free_percentage}%" + else + send_notification_if_changed "hana_log_free" "HANA Log Segment" "Only ${free_percentage}% of HANA log segments are in 'Free' state (above threshold)." "false" "OK" + fi fi -truncated_percentage=$((truncated_segments * 100 / total_segments)) -if (( $(echo "$truncated_percentage > $TRUNCATED_PERCENTAGE_THRESHOLD" | bc -l) )); then - echo "🚨 Alert: ${truncated_percentage}% of log segments are 'Truncated'." >&2 - send_notification_if_changed "hana_log_truncated" "HANA Log Segment" "${truncated_percentage}% of HANA log segments are in 'Truncated' state." "true" "${truncated_percentage}%" +# --- HANA Statement Queue Monitoring --- +echo "âš™ī¸ Checking HANA statement queue..." +STATEMENT_QUEUE_SQL="SELECT COUNT(*) FROM M_SERVICE_THREADS WHERE THREAD_TYPE = 'SqlExecutor' AND THREAD_STATE = 'Queueing';" +queue_count=$("$HDBSQL_PATH" -U "$HANA_USER_KEY" -j -a -x "$STATEMENT_QUEUE_SQL" 2>/dev/null | tr -d '"') + +if ! [[ "$queue_count" =~ ^[0-9]+$ ]]; then + echo "âš ī¸ Warning: Could not retrieve HANA statement queue count. Skipping check." >&2 + send_notification_if_changed "hana_statement_queue_check_fail" "HANA Monitor Warning" "Could not retrieve statement queue count." "true" "QUEUE_CHECK_FAIL" else - send_notification_if_changed "hana_log_truncated" "HANA Log Segment" "${truncated_percentage}% of HANA log segments are in 'Truncated' state (below threshold)." "false" "OK" + send_notification_if_changed "hana_statement_queue_check_fail" "HANA Monitor Warning" "Statement queue check is working." 
"false" "OK" + echo "â„šī¸ Current statement queue length: ${queue_count}" + + breach_count=$(get_state "statement_queue_breach_count") + breach_count=${breach_count:-0} + + if (( queue_count > STATEMENT_QUEUE_THRESHOLD )); then + breach_count=$((breach_count + 1)) + echo "📈 Statement queue is above threshold. Consecutive breach count: ${breach_count}/${STATEMENT_QUEUE_CONSECUTIVE_RUNS}." + else + breach_count=0 + fi + set_state "statement_queue_breach_count" "$breach_count" + + if (( breach_count >= STATEMENT_QUEUE_CONSECUTIVE_RUNS )); then + message="Statement queue has been over ${STATEMENT_QUEUE_THRESHOLD} for ${breach_count} checks. Current count: ${queue_count}." + send_notification_if_changed "hana_statement_queue_status" "HANA Statement Queue" "${message}" "true" "ALERT:${queue_count}" + else + message="Statement queue is normal. Current count: ${queue_count}." + send_notification_if_changed "hana_statement_queue_status" "HANA Statement Queue" "${message}" "false" "OK" + fi fi -free_percentage=$((free_segments * 100 / total_segments)) -if (( $(echo "$free_percentage < $FREE_PERCENTAGE_THRESHOLD" | bc -l) )); then - echo "🚨 Alert: Only ${free_percentage}% of log segments are 'Free'." >&2 - send_notification_if_changed "hana_log_free" "HANA Log Segment" "Only ${free_percentage}% of HANA log segments are in 'Free' state." "true" "${free_percentage}%" -else - send_notification_if_changed "hana_log_free" "HANA Log Segment" "Only ${free_percentage}% of HANA log segments are in 'Free' state (above threshold)." "false" "OK" -fi -echo "â„šī¸ Checking last successful data backup status..." - -# Query to get the start time of the most recent successful complete data backup +# --- HANA Backup Status Monitoring --- +echo "â„šī¸ Checking last successful data backup status..." 
last_backup_date=$("$HDBSQL_PATH" -U "$HANA_USER_KEY" -j -a -x \ - "SELECT TOP 1 SYS_START_TIME FROM M_BACKUP_CATALOG WHERE ENTRY_TYPE_NAME = 'complete data backup' AND STATE_NAME = 'successful' ORDER BY SYS_START_TIME DESC" 2>/dev/null | tr -d "\"" | sed 's/\..*//') # sed removes fractional seconds + "SELECT TOP 1 SYS_START_TIME FROM M_BACKUP_CATALOG WHERE ENTRY_TYPE_NAME = 'complete data backup' AND STATE_NAME = 'successful' ORDER BY SYS_START_TIME DESC" 2>/dev/null | tr -d "\"" | sed 's/\..*//') if [[ -z "$last_backup_date" ]]; then - # No successful backup found at all message="No successful complete data backup found for ${COMPANY_NAME} HANA." echo "🚨 Critical: ${message}" send_notification_if_changed "hana_backup_status" "HANA Backup" "${message}" "true" "NO_BACKUP" - return -fi - -# Convert dates to epoch seconds for comparison -last_backup_epoch=$(date -d "$last_backup_date" +%s) -current_epoch=$(date +%s) -threshold_seconds=$((BACKUP_THRESHOLD_HOURS * 3600)) - -age_seconds=$((current_epoch - last_backup_epoch)) -age_hours=$((age_seconds / 3600)) - -if (( age_seconds > threshold_seconds )); then - message="Last successful HANA backup for ${COMPANY_NAME} is ${age_hours} hours old, which exceeds the threshold of ${BACKUP_THRESHOLD_HOURS} hours. Last backup was on: ${last_backup_date}." - echo "🚨 Critical: ${message}" - send_notification_if_changed "hana_backup_status" "HANA Backup" "${message}" "true" "${age_hours}h" else - message="Last successful backup is ${age_hours} hours old (Threshold: ${BACKUP_THRESHOLD_HOURS} hours)." - echo "✅ Success! 
${message}" - send_notification_if_changed "hana_backup_status" "HANA Backup" "${message}" "false" "OK" + last_backup_epoch=$(date -d "$last_backup_date" +%s) + current_epoch=$(date +%s) + threshold_seconds=$((BACKUP_THRESHOLD_HOURS * 3600)) + age_seconds=$((current_epoch - last_backup_epoch)) + age_hours=$((age_seconds / 3600)) + + if (( age_seconds > threshold_seconds )); then + message="Last successful HANA backup for ${COMPANY_NAME} is ${age_hours} hours old, which exceeds the threshold of ${BACKUP_THRESHOLD_HOURS} hours. Last backup was on: ${last_backup_date}." + echo "🚨 Critical: ${message}" + send_notification_if_changed "hana_backup_status" "HANA Backup" "${message}" "true" "${age_hours}h" + else + message="Last successful backup is ${age_hours} hours old (Threshold: ${BACKUP_THRESHOLD_HOURS} hours)." + echo "✅ Success! ${message}" + send_notification_if_changed "hana_backup_status" "HANA Backup" "${message}" "false" "OK" + fi fi -echo "✅ Success! HANA monitoring check complete." - +echo "✅ Success! HANA monitoring check complete." \ No newline at end of file