feat(monitor): Add HANA statement queue monitoring
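This commit adds monitoring of the HANA statement queue. It adds STATEMENT_QUEUE_THRESHOLD and STATEMENT_QUEUE_CONSECUTIVE_RUNS to monitor/monitor.conf, and implements logic in monitor/monitor.sh that queries the statement queue length, tracks how many consecutive runs exceeded the threshold, and sends a notification once that count reaches STATEMENT_QUEUE_CONSECUTIVE_RUNS. The script version is updated to 1.3.0. The log segment percentage checks were refactored to run only when segments are found; when none are found the script no longer exits early, so the statement queue and backup checks still run. The backup age evaluation was likewise moved into an else branch, replacing the early return used when no successful backup exists.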
This commit is contained in:
@@ -29,8 +29,11 @@ TRUNCATED_PERCENTAGE_THRESHOLD=50
|
|||||||
FREE_PERCENTAGE_THRESHOLD=25
|
FREE_PERCENTAGE_THRESHOLD=25
|
||||||
# Maximum age of the last successful full data backup in hours.
|
# Maximum age of the last successful full data backup in hours.
|
||||||
BACKUP_THRESHOLD_HOURS=25
|
BACKUP_THRESHOLD_HOURS=25
|
||||||
|
# Statement queue length above which a run counts as a threshold breach
|
||||||
|
STATEMENT_QUEUE_THRESHOLD=100
|
||||||
|
# Number of consecutive runs the queue must be over threshold to trigger an alert
|
||||||
|
STATEMENT_QUEUE_CONSECUTIVE_RUNS=3
|
||||||
|
|
||||||
# --- Monitored Directories ---
|
# --- Monitored Directories ---
|
||||||
# List of directories to check for disk usage (space-separated)
|
# List of directories to check for disk usage (space-separated)
|
||||||
DIRECTORIES_TO_MONITOR=("/hana/log" "/hana/shared" "/hana/data" "/usr/sap")
|
DIRECTORIES_TO_MONITOR=("/hana/log" "/hana/shared" "/hana/data" "/usr/sap")
|
||||||
|
|
||||||
|
|||||||
@@ -1,9 +1,9 @@
|
|||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
# Version: 1.2.3
|
# Version: 1.3.0
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
# SAP HANA Monitoring Script
|
# SAP HANA Monitoring Script
|
||||||
#
|
#
|
||||||
# Checks HANA processes, disk usage, and log segment state.
|
# Checks HANA processes, disk usage, log segments, and statement queue.
|
||||||
# Sends ntfy.sh notifications if thresholds are exceeded.
|
# Sends ntfy.sh notifications if thresholds are exceeded.
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
|
|
||||||
@@ -114,7 +114,7 @@ for dir in "${DIRECTORIES_TO_MONITOR[@]}"; do
|
|||||||
continue
|
continue
|
||||||
fi
|
fi
|
||||||
usage=$(df -h "$dir" | awk 'NR==2 {print $5}' | sed 's/%//')
|
usage=$(df -h "$dir" | awk 'NR==2 {print $5}' | sed 's/%//')
|
||||||
echo " - ${dir} is at ${usage}%"
|
echo " - ${dir} is at ${usage}%"
|
||||||
if (( $(echo "$usage > $DISK_USAGE_THRESHOLD" | bc -l) )); then
|
if (( $(echo "$usage > $DISK_USAGE_THRESHOLD" | bc -l) )); then
|
||||||
echo "🚨 Alert: ${dir} usage is at ${usage}% which is above the ${DISK_USAGE_THRESHOLD}% threshold." >&2
|
echo "🚨 Alert: ${dir} usage is at ${usage}% which is above the ${DISK_USAGE_THRESHOLD}% threshold." >&2
|
||||||
send_notification_if_changed "disk_usage_${dir//\//_}" "HANA Disk" "Disk usage for ${dir} is at ${usage}%." "true" "${usage}%"
|
send_notification_if_changed "disk_usage_${dir//\//_}" "HANA Disk" "Disk usage for ${dir} is at ${usage}%." "true" "${usage}%"
|
||||||
@@ -161,58 +161,83 @@ echo "ℹ️ Free Segments: ${free_segments}"
|
|||||||
if [ $total_segments -eq 0 ]; then
|
if [ $total_segments -eq 0 ]; then
|
||||||
echo "⚠️ Warning: No log segments found. Skipping percentage checks." >&2
|
echo "⚠️ Warning: No log segments found. Skipping percentage checks." >&2
|
||||||
send_notification_if_changed "hana_log_segments_total" "HANA Log Segment Warning" "No log segments found. Skipping percentage checks." "true" "NO_LOG_SEGMENTS"
|
send_notification_if_changed "hana_log_segments_total" "HANA Log Segment Warning" "No log segments found. Skipping percentage checks." "true" "NO_LOG_SEGMENTS"
|
||||||
exit 0
|
|
||||||
else
|
else
|
||||||
send_notification_if_changed "hana_log_segments_total" "HANA Log Segment" "Log segments found." "false" "OK"
|
send_notification_if_changed "hana_log_segments_total" "HANA Log Segment" "Log segments found." "false" "OK"
|
||||||
|
truncated_percentage=$((truncated_segments * 100 / total_segments))
|
||||||
|
if (( $(echo "$truncated_percentage > $TRUNCATED_PERCENTAGE_THRESHOLD" | bc -l) )); then
|
||||||
|
echo "🚨 Alert: ${truncated_percentage}% of log segments are 'Truncated'." >&2
|
||||||
|
send_notification_if_changed "hana_log_truncated" "HANA Log Segment" "${truncated_percentage}% of HANA log segments are in 'Truncated' state." "true" "${truncated_percentage}%"
|
||||||
|
else
|
||||||
|
send_notification_if_changed "hana_log_truncated" "HANA Log Segment" "${truncated_percentage}% of HANA log segments are in 'Truncated' state (below threshold)." "false" "OK"
|
||||||
|
fi
|
||||||
|
|
||||||
|
free_percentage=$((free_segments * 100 / total_segments))
|
||||||
|
if (( $(echo "$free_percentage < $FREE_PERCENTAGE_THRESHOLD" | bc -l) )); then
|
||||||
|
echo "🚨 Alert: Only ${free_percentage}% of log segments are 'Free'." >&2
|
||||||
|
send_notification_if_changed "hana_log_free" "HANA Log Segment" "Only ${free_percentage}% of HANA log segments are in 'Free' state." "true" "${free_percentage}%"
|
||||||
|
else
|
||||||
|
send_notification_if_changed "hana_log_free" "HANA Log Segment" "Only ${free_percentage}% of HANA log segments are in 'Free' state (above threshold)." "false" "OK"
|
||||||
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
truncated_percentage=$((truncated_segments * 100 / total_segments))
|
# --- HANA Statement Queue Monitoring ---
|
||||||
if (( $(echo "$truncated_percentage > $TRUNCATED_PERCENTAGE_THRESHOLD" | bc -l) )); then
|
echo "⚙️ Checking HANA statement queue..."
|
||||||
echo "🚨 Alert: ${truncated_percentage}% of log segments are 'Truncated'." >&2
|
STATEMENT_QUEUE_SQL="SELECT COUNT(*) FROM M_SERVICE_THREADS WHERE THREAD_TYPE = 'SqlExecutor' AND THREAD_STATE = 'Queueing';"
|
||||||
send_notification_if_changed "hana_log_truncated" "HANA Log Segment" "${truncated_percentage}% of HANA log segments are in 'Truncated' state." "true" "${truncated_percentage}%"
|
queue_count=$("$HDBSQL_PATH" -U "$HANA_USER_KEY" -j -a -x "$STATEMENT_QUEUE_SQL" 2>/dev/null | tr -d '"')
|
||||||
|
|
||||||
|
if ! [[ "$queue_count" =~ ^[0-9]+$ ]]; then
|
||||||
|
echo "⚠️ Warning: Could not retrieve HANA statement queue count. Skipping check." >&2
|
||||||
|
send_notification_if_changed "hana_statement_queue_check_fail" "HANA Monitor Warning" "Could not retrieve statement queue count." "true" "QUEUE_CHECK_FAIL"
|
||||||
else
|
else
|
||||||
send_notification_if_changed "hana_log_truncated" "HANA Log Segment" "${truncated_percentage}% of HANA log segments are in 'Truncated' state (below threshold)." "false" "OK"
|
send_notification_if_changed "hana_statement_queue_check_fail" "HANA Monitor Warning" "Statement queue check is working." "false" "OK"
|
||||||
|
echo "ℹ️ Current statement queue length: ${queue_count}"
|
||||||
|
|
||||||
|
breach_count=$(get_state "statement_queue_breach_count")
|
||||||
|
breach_count=${breach_count:-0}
|
||||||
|
|
||||||
|
if (( queue_count > STATEMENT_QUEUE_THRESHOLD )); then
|
||||||
|
breach_count=$((breach_count + 1))
|
||||||
|
echo "📈 Statement queue is above threshold. Consecutive breach count: ${breach_count}/${STATEMENT_QUEUE_CONSECUTIVE_RUNS}."
|
||||||
|
else
|
||||||
|
breach_count=0
|
||||||
|
fi
|
||||||
|
set_state "statement_queue_breach_count" "$breach_count"
|
||||||
|
|
||||||
|
if (( breach_count >= STATEMENT_QUEUE_CONSECUTIVE_RUNS )); then
|
||||||
|
message="Statement queue has been over ${STATEMENT_QUEUE_THRESHOLD} for ${breach_count} checks. Current count: ${queue_count}."
|
||||||
|
send_notification_if_changed "hana_statement_queue_status" "HANA Statement Queue" "${message}" "true" "ALERT:${queue_count}"
|
||||||
|
else
|
||||||
|
message="Statement queue is normal. Current count: ${queue_count}."
|
||||||
|
send_notification_if_changed "hana_statement_queue_status" "HANA Statement Queue" "${message}" "false" "OK"
|
||||||
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
free_percentage=$((free_segments * 100 / total_segments))
|
|
||||||
if (( $(echo "$free_percentage < $FREE_PERCENTAGE_THRESHOLD" | bc -l) )); then
|
|
||||||
echo "🚨 Alert: Only ${free_percentage}% of log segments are 'Free'." >&2
|
|
||||||
send_notification_if_changed "hana_log_free" "HANA Log Segment" "Only ${free_percentage}% of HANA log segments are in 'Free' state." "true" "${free_percentage}%"
|
|
||||||
else
|
|
||||||
send_notification_if_changed "hana_log_free" "HANA Log Segment" "Only ${free_percentage}% of HANA log segments are in 'Free' state (above threshold)." "false" "OK"
|
|
||||||
fi
|
|
||||||
|
|
||||||
echo "ℹ️ Checking last successful data backup status..."
|
# --- HANA Backup Status Monitoring ---
|
||||||
|
echo "ℹ️ Checking last successful data backup status..."
|
||||||
# Query to get the start time of the most recent successful complete data backup
|
|
||||||
last_backup_date=$("$HDBSQL_PATH" -U "$HANA_USER_KEY" -j -a -x \
|
last_backup_date=$("$HDBSQL_PATH" -U "$HANA_USER_KEY" -j -a -x \
|
||||||
"SELECT TOP 1 SYS_START_TIME FROM M_BACKUP_CATALOG WHERE ENTRY_TYPE_NAME = 'complete data backup' AND STATE_NAME = 'successful' ORDER BY SYS_START_TIME DESC" 2>/dev/null | tr -d "\"" | sed 's/\..*//') # sed removes fractional seconds
|
"SELECT TOP 1 SYS_START_TIME FROM M_BACKUP_CATALOG WHERE ENTRY_TYPE_NAME = 'complete data backup' AND STATE_NAME = 'successful' ORDER BY SYS_START_TIME DESC" 2>/dev/null | tr -d "\"" | sed 's/\..*//')
|
||||||
|
|
||||||
if [[ -z "$last_backup_date" ]]; then
|
if [[ -z "$last_backup_date" ]]; then
|
||||||
# No successful backup found at all
|
|
||||||
message="No successful complete data backup found for ${COMPANY_NAME} HANA."
|
message="No successful complete data backup found for ${COMPANY_NAME} HANA."
|
||||||
echo "🚨 Critical: ${message}"
|
echo "🚨 Critical: ${message}"
|
||||||
send_notification_if_changed "hana_backup_status" "HANA Backup" "${message}" "true" "NO_BACKUP"
|
send_notification_if_changed "hana_backup_status" "HANA Backup" "${message}" "true" "NO_BACKUP"
|
||||||
return
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Convert dates to epoch seconds for comparison
|
|
||||||
last_backup_epoch=$(date -d "$last_backup_date" +%s)
|
|
||||||
current_epoch=$(date +%s)
|
|
||||||
threshold_seconds=$((BACKUP_THRESHOLD_HOURS * 3600))
|
|
||||||
|
|
||||||
age_seconds=$((current_epoch - last_backup_epoch))
|
|
||||||
age_hours=$((age_seconds / 3600))
|
|
||||||
|
|
||||||
if (( age_seconds > threshold_seconds )); then
|
|
||||||
message="Last successful HANA backup for ${COMPANY_NAME} is ${age_hours} hours old, which exceeds the threshold of ${BACKUP_THRESHOLD_HOURS} hours. Last backup was on: ${last_backup_date}."
|
|
||||||
echo "🚨 Critical: ${message}"
|
|
||||||
send_notification_if_changed "hana_backup_status" "HANA Backup" "${message}" "true" "${age_hours}h"
|
|
||||||
else
|
else
|
||||||
message="Last successful backup is ${age_hours} hours old (Threshold: ${BACKUP_THRESHOLD_HOURS} hours)."
|
last_backup_epoch=$(date -d "$last_backup_date" +%s)
|
||||||
echo "✅ Success! ${message}"
|
current_epoch=$(date +%s)
|
||||||
send_notification_if_changed "hana_backup_status" "HANA Backup" "${message}" "false" "OK"
|
threshold_seconds=$((BACKUP_THRESHOLD_HOURS * 3600))
|
||||||
|
age_seconds=$((current_epoch - last_backup_epoch))
|
||||||
|
age_hours=$((age_seconds / 3600))
|
||||||
|
|
||||||
|
if (( age_seconds > threshold_seconds )); then
|
||||||
|
message="Last successful HANA backup for ${COMPANY_NAME} is ${age_hours} hours old, which exceeds the threshold of ${BACKUP_THRESHOLD_HOURS} hours. Last backup was on: ${last_backup_date}."
|
||||||
|
echo "🚨 Critical: ${message}"
|
||||||
|
send_notification_if_changed "hana_backup_status" "HANA Backup" "${message}" "true" "${age_hours}h"
|
||||||
|
else
|
||||||
|
message="Last successful backup is ${age_hours} hours old (Threshold: ${BACKUP_THRESHOLD_HOURS} hours)."
|
||||||
|
echo "✅ Success! ${message}"
|
||||||
|
send_notification_if_changed "hana_backup_status" "HANA Backup" "${message}" "false" "OK"
|
||||||
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
echo "✅ Success! HANA monitoring check complete."
|
echo "✅ Success! HANA monitoring check complete."
|
||||||
|
|
||||||
Reference in New Issue
Block a user