feat(monitor): Add HANA statement queue monitoring

This commit introduces a new feature to monitor the HANA statement queue.

Added STATEMENT_QUEUE_THRESHOLD and STATEMENT_QUEUE_CONSECUTIVE_RUNS to monitor/monitor.conf.
Implemented logic in monitor/monitor.sh to query the statement queue length, track consecutive breaches of the defined threshold, and send notifications.
Updated the script version to 1.3.0.
Refactored log segment checks to only run when segments are found.
This commit is contained in:
2025-10-01 13:10:57 +02:00
parent 92a2b963c4
commit bb0531aeea
2 changed files with 70 additions and 42 deletions

View File

@@ -29,8 +29,11 @@ TRUNCATED_PERCENTAGE_THRESHOLD=50
FREE_PERCENTAGE_THRESHOLD=25
# Maximum age of the last successful full data backup in hours.
BACKUP_THRESHOLD_HOURS=25
# Statement queue length that triggers a check
STATEMENT_QUEUE_THRESHOLD=100
# Number of consecutive runs the queue must be over threshold to trigger an alert
STATEMENT_QUEUE_CONSECUTIVE_RUNS=3
# --- Monitored Directories ---
# List of directories to check for disk usage (space-separated)
DIRECTORIES_TO_MONITOR=("/hana/log" "/hana/shared" "/hana/data" "/usr/sap")

View File

@@ -1,9 +1,9 @@
#!/bin/bash
# Version: 1.2.3
# Version: 1.3.0
# =============================================================================
# SAP HANA Monitoring Script
#
# Checks HANA processes, disk usage, and log segment state.
# Checks HANA processes, disk usage, log segments, and statement queue.
# Sends ntfy.sh notifications if thresholds are exceeded.
# =============================================================================
@@ -161,58 +161,83 @@ echo " Free Segments: ${free_segments}"
if [ $total_segments -eq 0 ]; then
echo "⚠️ Warning: No log segments found. Skipping percentage checks." >&2
send_notification_if_changed "hana_log_segments_total" "HANA Log Segment Warning" "No log segments found. Skipping percentage checks." "true" "NO_LOG_SEGMENTS"
exit 0
else
send_notification_if_changed "hana_log_segments_total" "HANA Log Segment" "Log segments found." "false" "OK"
fi
truncated_percentage=$((truncated_segments * 100 / total_segments))
if (( $(echo "$truncated_percentage > $TRUNCATED_PERCENTAGE_THRESHOLD" | bc -l) )); then
truncated_percentage=$((truncated_segments * 100 / total_segments))
if (( $(echo "$truncated_percentage > $TRUNCATED_PERCENTAGE_THRESHOLD" | bc -l) )); then
echo "🚨 Alert: ${truncated_percentage}% of log segments are 'Truncated'." >&2
send_notification_if_changed "hana_log_truncated" "HANA Log Segment" "${truncated_percentage}% of HANA log segments are in 'Truncated' state." "true" "${truncated_percentage}%"
else
else
send_notification_if_changed "hana_log_truncated" "HANA Log Segment" "${truncated_percentage}% of HANA log segments are in 'Truncated' state (below threshold)." "false" "OK"
fi
fi
free_percentage=$((free_segments * 100 / total_segments))
if (( $(echo "$free_percentage < $FREE_PERCENTAGE_THRESHOLD" | bc -l) )); then
free_percentage=$((free_segments * 100 / total_segments))
if (( $(echo "$free_percentage < $FREE_PERCENTAGE_THRESHOLD" | bc -l) )); then
echo "🚨 Alert: Only ${free_percentage}% of log segments are 'Free'." >&2
send_notification_if_changed "hana_log_free" "HANA Log Segment" "Only ${free_percentage}% of HANA log segments are in 'Free' state." "true" "${free_percentage}%"
else
else
send_notification_if_changed "hana_log_free" "HANA Log Segment" "Only ${free_percentage}% of HANA log segments are in 'Free' state (above threshold)." "false" "OK"
fi
fi
echo " Checking last successful data backup status..."
# --- HANA Statement Queue Monitoring ---
echo "⚙️ Checking HANA statement queue..."
STATEMENT_QUEUE_SQL="SELECT COUNT(*) FROM M_SERVICE_THREADS WHERE THREAD_TYPE = 'SqlExecutor' AND THREAD_STATE = 'Queueing';"
queue_count=$("$HDBSQL_PATH" -U "$HANA_USER_KEY" -j -a -x "$STATEMENT_QUEUE_SQL" 2>/dev/null | tr -d '"')
# Query to get the start time of the most recent successful complete data backup
if ! [[ "$queue_count" =~ ^[0-9]+$ ]]; then
echo "⚠️ Warning: Could not retrieve HANA statement queue count. Skipping check." >&2
send_notification_if_changed "hana_statement_queue_check_fail" "HANA Monitor Warning" "Could not retrieve statement queue count." "true" "QUEUE_CHECK_FAIL"
else
send_notification_if_changed "hana_statement_queue_check_fail" "HANA Monitor Warning" "Statement queue check is working." "false" "OK"
echo " Current statement queue length: ${queue_count}"
breach_count=$(get_state "statement_queue_breach_count")
breach_count=${breach_count:-0}
if (( queue_count > STATEMENT_QUEUE_THRESHOLD )); then
breach_count=$((breach_count + 1))
echo "📈 Statement queue is above threshold. Consecutive breach count: ${breach_count}/${STATEMENT_QUEUE_CONSECUTIVE_RUNS}."
else
breach_count=0
fi
set_state "statement_queue_breach_count" "$breach_count"
if (( breach_count >= STATEMENT_QUEUE_CONSECUTIVE_RUNS )); then
message="Statement queue has been over ${STATEMENT_QUEUE_THRESHOLD} for ${breach_count} checks. Current count: ${queue_count}."
send_notification_if_changed "hana_statement_queue_status" "HANA Statement Queue" "${message}" "true" "ALERT:${queue_count}"
else
message="Statement queue is normal. Current count: ${queue_count}."
send_notification_if_changed "hana_statement_queue_status" "HANA Statement Queue" "${message}" "false" "OK"
fi
fi
# --- HANA Backup Status Monitoring ---
echo " Checking last successful data backup status..."
last_backup_date=$("$HDBSQL_PATH" -U "$HANA_USER_KEY" -j -a -x \
"SELECT TOP 1 SYS_START_TIME FROM M_BACKUP_CATALOG WHERE ENTRY_TYPE_NAME = 'complete data backup' AND STATE_NAME = 'successful' ORDER BY SYS_START_TIME DESC" 2>/dev/null | tr -d "\"" | sed 's/\..*//') # sed removes fractional seconds
"SELECT TOP 1 SYS_START_TIME FROM M_BACKUP_CATALOG WHERE ENTRY_TYPE_NAME = 'complete data backup' AND STATE_NAME = 'successful' ORDER BY SYS_START_TIME DESC" 2>/dev/null | tr -d "\"" | sed 's/\..*//')
if [[ -z "$last_backup_date" ]]; then
# No successful backup found at all
message="No successful complete data backup found for ${COMPANY_NAME} HANA."
echo "🚨 Critical: ${message}"
send_notification_if_changed "hana_backup_status" "HANA Backup" "${message}" "true" "NO_BACKUP"
return
fi
else
last_backup_epoch=$(date -d "$last_backup_date" +%s)
current_epoch=$(date +%s)
threshold_seconds=$((BACKUP_THRESHOLD_HOURS * 3600))
age_seconds=$((current_epoch - last_backup_epoch))
age_hours=$((age_seconds / 3600))
# Convert dates to epoch seconds for comparison
last_backup_epoch=$(date -d "$last_backup_date" +%s)
current_epoch=$(date +%s)
threshold_seconds=$((BACKUP_THRESHOLD_HOURS * 3600))
age_seconds=$((current_epoch - last_backup_epoch))
age_hours=$((age_seconds / 3600))
if (( age_seconds > threshold_seconds )); then
if (( age_seconds > threshold_seconds )); then
message="Last successful HANA backup for ${COMPANY_NAME} is ${age_hours} hours old, which exceeds the threshold of ${BACKUP_THRESHOLD_HOURS} hours. Last backup was on: ${last_backup_date}."
echo "🚨 Critical: ${message}"
send_notification_if_changed "hana_backup_status" "HANA Backup" "${message}" "true" "${age_hours}h"
else
else
message="Last successful backup is ${age_hours} hours old (Threshold: ${BACKUP_THRESHOLD_HOURS} hours)."
echo "✅ Success! ${message}"
send_notification_if_changed "hana_backup_status" "HANA Backup" "${message}" "false" "OK"
fi
fi
echo "✅ Success! HANA monitoring check complete."