diff --git a/monitor/monitor.conf b/monitor/monitor.conf
index 280432c..48b72f4 100644
--- a/monitor/monitor.conf
+++ b/monitor/monitor.conf
@@ -29,8 +29,11 @@ TRUNCATED_PERCENTAGE_THRESHOLD=50
 FREE_PERCENTAGE_THRESHOLD=25
 # Maximum age of the last successful full data backup in hours.
 BACKUP_THRESHOLD_HOURS=25
+# Statement queue length that triggers a check
+STATEMENT_QUEUE_THRESHOLD=100
+# Number of consecutive runs the queue must be over threshold to trigger an alert
+STATEMENT_QUEUE_CONSECUTIVE_RUNS=3
 
 # --- Monitored Directories ---
 # List of directories to check for disk usage (space-separated)
 DIRECTORIES_TO_MONITOR=("/hana/log" "/hana/shared" "/hana/data" "/usr/sap")
-
diff --git a/monitor/monitor.sh b/monitor/monitor.sh
index 87eabbd..30c1925 100644
--- a/monitor/monitor.sh
+++ b/monitor/monitor.sh
@@ -1,9 +1,9 @@
 #!/bin/bash
-# Version: 1.2.3
+# Version: 1.3.0
 # =============================================================================
 # SAP HANA Monitoring Script
 #
-# Checks HANA processes, disk usage, and log segment state.
+# Checks HANA processes, disk usage, log segments, and statement queue.
 # Sends ntfy.sh notifications if thresholds are exceeded.
 # =============================================================================
 
@@ -114,7 +114,7 @@ for dir in "${DIRECTORIES_TO_MONITOR[@]}"; do
         continue
     fi
     usage=$(df -h "$dir" | awk 'NR==2 {print $5}' | sed 's/%//')
-    echo "  - ${dir} is at ${usage}%"
+    echo "   - ${dir} is at ${usage}%"
     if (( $(echo "$usage > $DISK_USAGE_THRESHOLD" | bc -l) )); then
         echo "🚨 Alert: ${dir} usage is at ${usage}% which is above the ${DISK_USAGE_THRESHOLD}% threshold." >&2
         send_notification_if_changed "disk_usage_${dir//\//_}" "HANA Disk" "Disk usage for ${dir} is at ${usage}%." "true" "${usage}%"
@@ -161,58 +161,83 @@ echo "ℹ️ Free Segments: ${free_segments}"
 
 if [ $total_segments -eq 0 ]; then
     echo "⚠️ Warning: No log segments found. Skipping percentage checks." >&2
     send_notification_if_changed "hana_log_segments_total" "HANA Log Segment Warning" "No log segments found. Skipping percentage checks." "true" "NO_LOG_SEGMENTS"
-    exit 0
 else
     send_notification_if_changed "hana_log_segments_total" "HANA Log Segment" "Log segments found." "false" "OK"
+    truncated_percentage=$((truncated_segments * 100 / total_segments))
+    if (( $(echo "$truncated_percentage > $TRUNCATED_PERCENTAGE_THRESHOLD" | bc -l) )); then
+        echo "🚨 Alert: ${truncated_percentage}% of log segments are 'Truncated'." >&2
+        send_notification_if_changed "hana_log_truncated" "HANA Log Segment" "${truncated_percentage}% of HANA log segments are in 'Truncated' state." "true" "${truncated_percentage}%"
+    else
+        send_notification_if_changed "hana_log_truncated" "HANA Log Segment" "${truncated_percentage}% of HANA log segments are in 'Truncated' state (below threshold)." "false" "OK"
+    fi
+
+    free_percentage=$((free_segments * 100 / total_segments))
+    if (( $(echo "$free_percentage < $FREE_PERCENTAGE_THRESHOLD" | bc -l) )); then
+        echo "🚨 Alert: Only ${free_percentage}% of log segments are 'Free'." >&2
+        send_notification_if_changed "hana_log_free" "HANA Log Segment" "Only ${free_percentage}% of HANA log segments are in 'Free' state." "true" "${free_percentage}%"
+    else
+        send_notification_if_changed "hana_log_free" "HANA Log Segment" "Only ${free_percentage}% of HANA log segments are in 'Free' state (above threshold)." "false" "OK"
+    fi
 fi
 
-truncated_percentage=$((truncated_segments * 100 / total_segments))
-if (( $(echo "$truncated_percentage > $TRUNCATED_PERCENTAGE_THRESHOLD" | bc -l) )); then
-    echo "🚨 Alert: ${truncated_percentage}% of log segments are 'Truncated'." >&2
-    send_notification_if_changed "hana_log_truncated" "HANA Log Segment" "${truncated_percentage}% of HANA log segments are in 'Truncated' state." "true" "${truncated_percentage}%"
+# --- HANA Statement Queue Monitoring ---
+echo "⚙️ Checking HANA statement queue..."
+STATEMENT_QUEUE_SQL="SELECT COUNT(*) FROM M_SERVICE_THREADS WHERE THREAD_TYPE = 'SqlExecutor' AND THREAD_STATE = 'Queueing';"
+queue_count=$("$HDBSQL_PATH" -U "$HANA_USER_KEY" -j -a -x "$STATEMENT_QUEUE_SQL" 2>/dev/null | tr -d '"')
+
+if ! [[ "$queue_count" =~ ^[0-9]+$ ]]; then
+    echo "⚠️ Warning: Could not retrieve HANA statement queue count. Skipping check." >&2
+    send_notification_if_changed "hana_statement_queue_check_fail" "HANA Monitor Warning" "Could not retrieve statement queue count." "true" "QUEUE_CHECK_FAIL"
 else
-    send_notification_if_changed "hana_log_truncated" "HANA Log Segment" "${truncated_percentage}% of HANA log segments are in 'Truncated' state (below threshold)." "false" "OK"
+    send_notification_if_changed "hana_statement_queue_check_fail" "HANA Monitor Warning" "Statement queue check is working." "false" "OK"
+    echo "ℹ️ Current statement queue length: ${queue_count}"
+
+    breach_count=$(get_state "statement_queue_breach_count")
+    breach_count=${breach_count:-0}
+
+    if (( queue_count > STATEMENT_QUEUE_THRESHOLD )); then
+        breach_count=$((breach_count + 1))
+        echo "📈 Statement queue is above threshold. Consecutive breach count: ${breach_count}/${STATEMENT_QUEUE_CONSECUTIVE_RUNS}."
+    else
+        breach_count=0
+    fi
+    set_state "statement_queue_breach_count" "$breach_count"
+
+    if (( breach_count >= STATEMENT_QUEUE_CONSECUTIVE_RUNS )); then
+        message="Statement queue has been over ${STATEMENT_QUEUE_THRESHOLD} for ${breach_count} checks. Current count: ${queue_count}."
+        send_notification_if_changed "hana_statement_queue_status" "HANA Statement Queue" "${message}" "true" "ALERT:${queue_count}"
+    else
+        message="Statement queue is normal. Current count: ${queue_count}."
+        send_notification_if_changed "hana_statement_queue_status" "HANA Statement Queue" "${message}" "false" "OK"
+    fi
 fi
 
-free_percentage=$((free_segments * 100 / total_segments))
-if (( $(echo "$free_percentage < $FREE_PERCENTAGE_THRESHOLD" | bc -l) )); then
-    echo "🚨 Alert: Only ${free_percentage}% of log segments are 'Free'." >&2
-    send_notification_if_changed "hana_log_free" "HANA Log Segment" "Only ${free_percentage}% of HANA log segments are in 'Free' state." "true" "${free_percentage}%"
-else
-    send_notification_if_changed "hana_log_free" "HANA Log Segment" "Only ${free_percentage}% of HANA log segments are in 'Free' state (above threshold)." "false" "OK"
-fi
-echo "ℹ️ Checking last successful data backup status..."
-
-# Query to get the start time of the most recent successful complete data backup
+# --- HANA Backup Status Monitoring ---
+echo "ℹ️ Checking last successful data backup status..."
 last_backup_date=$("$HDBSQL_PATH" -U "$HANA_USER_KEY" -j -a -x \
-    "SELECT TOP 1 SYS_START_TIME FROM M_BACKUP_CATALOG WHERE ENTRY_TYPE_NAME = 'complete data backup' AND STATE_NAME = 'successful' ORDER BY SYS_START_TIME DESC" 2>/dev/null | tr -d "\"" | sed 's/\..*//') # sed removes fractional seconds
+    "SELECT TOP 1 SYS_START_TIME FROM M_BACKUP_CATALOG WHERE ENTRY_TYPE_NAME = 'complete data backup' AND STATE_NAME = 'successful' ORDER BY SYS_START_TIME DESC" 2>/dev/null | tr -d "\"" | sed 's/\..*//')
 
 if [[ -z "$last_backup_date" ]]; then
-    # No successful backup found at all
     message="No successful complete data backup found for ${COMPANY_NAME} HANA."
     echo "🚨 Critical: ${message}"
     send_notification_if_changed "hana_backup_status" "HANA Backup" "${message}" "true" "NO_BACKUP"
-    return
-fi
-
-# Convert dates to epoch seconds for comparison
-last_backup_epoch=$(date -d "$last_backup_date" +%s)
-current_epoch=$(date +%s)
-threshold_seconds=$((BACKUP_THRESHOLD_HOURS * 3600))
-
-age_seconds=$((current_epoch - last_backup_epoch))
-age_hours=$((age_seconds / 3600))
-
-if (( age_seconds > threshold_seconds )); then
-    message="Last successful HANA backup for ${COMPANY_NAME} is ${age_hours} hours old, which exceeds the threshold of ${BACKUP_THRESHOLD_HOURS} hours. Last backup was on: ${last_backup_date}."
-    echo "🚨 Critical: ${message}"
-    send_notification_if_changed "hana_backup_status" "HANA Backup" "${message}" "true" "${age_hours}h"
 else
-    message="Last successful backup is ${age_hours} hours old (Threshold: ${BACKUP_THRESHOLD_HOURS} hours)."
-    echo "✅ Success! ${message}"
-    send_notification_if_changed "hana_backup_status" "HANA Backup" "${message}" "false" "OK"
+    last_backup_epoch=$(date -d "$last_backup_date" +%s)
+    current_epoch=$(date +%s)
+    threshold_seconds=$((BACKUP_THRESHOLD_HOURS * 3600))
+    age_seconds=$((current_epoch - last_backup_epoch))
+    age_hours=$((age_seconds / 3600))
+
+    if (( age_seconds > threshold_seconds )); then
+        message="Last successful HANA backup for ${COMPANY_NAME} is ${age_hours} hours old, which exceeds the threshold of ${BACKUP_THRESHOLD_HOURS} hours. Last backup was on: ${last_backup_date}."
+        echo "🚨 Critical: ${message}"
+        send_notification_if_changed "hana_backup_status" "HANA Backup" "${message}" "true" "${age_hours}h"
+    else
+        message="Last successful backup is ${age_hours} hours old (Threshold: ${BACKUP_THRESHOLD_HOURS} hours)."
+        echo "✅ Success! ${message}"
+        send_notification_if_changed "hana_backup_status" "HANA Backup" "${message}" "false" "OK"
+    fi
 fi
 
-echo "✅ Success! HANA monitoring check complete."
-
+echo "✅ Success! HANA monitoring check complete."
\ No newline at end of file