This commit introduces a new feature to monitor the HANA statement queue. Added STATEMENT_QUEUE_THRESHOLD and STATEMENT_QUEUE_CONSECUTIVE_RUNS to monitor/monitor.conf. Implemented logic in monitor/monitor.sh to query the statement queue length, track consecutive breaches of the defined threshold, and send notifications. Updated the script version to 1.3.0. Refactored log segment checks to only run when segments are found.
243 lines
11 KiB
Bash
243 lines
11 KiB
Bash
#!/bin/bash
# Version: 1.3.0
# =============================================================================
# SAP HANA Monitoring Script
#
# Checks HANA processes, disk usage, log segments, statement queue, and the
# age of the last successful data backup.
# Sends ntfy.sh notifications if thresholds are exceeded.
# =============================================================================

# --- Lock File Implementation ---
# Only one instance may run at a time. The lock file is created with noclobber
# set, so "does it exist?" and "create it" happen as one atomic step; a separate
# existence test followed by touch would let two instances started together
# both pass the test. The file holds this script's PID for diagnostics.
LOCK_FILE="/tmp/hana_monitor.lock"
if ! (set -o noclobber; printf '%s\n' "$$" > "$LOCK_FILE") 2>/dev/null; then
  if [ -e "$LOCK_FILE" ]; then
    echo "▶️ Script is already running. Exiting."
  else
    echo "❌ Error: Could not create lock file ${LOCK_FILE}" >&2
  fi
  exit 1
fi
# Ensure lock file is removed on script exit. The trap is installed only after
# the lock has been acquired, so an instance that failed to get the lock never
# deletes the lock of the instance that holds it.
trap 'rm -f "$LOCK_FILE"' EXIT

# --- Configuration and Setup ---
# Resolve the directory that contains this script so that monitor.conf and the
# state directory are located relative to the script, not the caller's cwd.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"
CONFIG_FILE="${SCRIPT_DIR}/monitor.conf"

if [ ! -f "$CONFIG_FILE" ]; then
  echo "❌ Error: Configuration file not found at ${CONFIG_FILE}" >&2
  rm -f "$LOCK_FILE"
  exit 1
fi
# The configuration is plain shell and is executed in this shell's context.
# shellcheck source=/dev/null
source "$CONFIG_FILE"

# One "<key>.state" file per alert is kept here between runs.
STATE_DIR="${SCRIPT_DIR}/monitor_state"
if ! mkdir -p "${STATE_DIR}"; then
  # Without a state directory every check would fail to read or record state,
  # so stop here instead of continuing with broken bookkeeping.
  echo "❌ Error: Could not create state directory ${STATE_DIR}" >&2
  exit 1
fi

# Helper functions for state management

#######################################
# Print the value stored for a state key.
# Globals:   STATE_DIR (read)
# Arguments: $1 - state key; the value lives in ${STATE_DIR}/<key>.state
# Outputs:   the file's contents on stdout, or an empty line when the key has
#            no state file yet
#######################################
get_state() {
  local key="$1"
  local state_file="${STATE_DIR}/${key}.state"
  if [[ -f "$state_file" ]]; then
    cat -- "$state_file"
  else
    echo ""
  fi
}

#######################################
# Store a value for a state key, replacing any previous value.
# Globals:   STATE_DIR (read)
# Arguments: $1 - state key; written to ${STATE_DIR}/<key>.state
#            $2 - value to store
#######################################
set_state() {
  local key="$1"
  local value="$2"
  # printf rather than echo: echo would treat a value such as "-n" or "-e" as
  # an option and write something other than the value itself.
  printf '%s\n' "$value" > "${STATE_DIR}/${key}.state"
}

# Host name included in every notification message.
HOSTNAME=$(hostname)
# Counts log segments per host, service and segment state; the log segment
# check reads the state and count columns of the result.
SQL_QUERY="SELECT b.host, b.service_name, a.state, count(*) FROM PUBLIC.M_LOG_SEGMENTS a JOIN PUBLIC.M_SERVICES b ON (a.host = b.host AND a.port = b.port) GROUP BY b.host, b.service_name, a.state;"

#######################################
# Send an ntfy notification when the stored state of an alert changes.
#
# The value stored under the alert key is compared with the new value:
#   - unchanged: nothing is sent and nothing is written;
#   - changed, alert condition: an "<title> Alert" notification is sent;
#   - changed, no alert, previous value non-empty and not "OK": a
#     "<title> Resolved" notification is sent;
#   - changed, no alert, nothing to resolve: the state is updated silently.
# The new state is recorded only after the notification was delivered, so a
# failed delivery is attempted again on the next run instead of being lost.
#
# Globals:   COMPANY_NAME, HOSTNAME, NTFY_TOKEN, NTFY_TOPIC_URL (read)
# Arguments: $1 - alert key (state file name)
#            $2 - title prefix, e.g. "HANA Process"
#            $3 - message text
#            $4 - "true" when this is an alert condition, otherwise "false"
#            $5 - value to store as state (e.g. "85%", "GREEN", "ALERT")
# Outputs:   a confirmation line on stdout after a notification was sent;
#            a warning on stderr when delivery failed
# Returns:   0 normally, 1 when the notification could not be delivered
#######################################
send_notification_if_changed() {
  local alert_key="$1"
  local title_prefix="$2"       # e.g., "HANA Process"
  local current_message="$3"
  local is_alert_condition="$4" # "true" or "false"
  local current_value="$5"      # The value to store as state (e.g., "85%", "GREEN", "ALERT")

  # Declared separately so the assignment does not hide get_state's status.
  local previous_value
  previous_value=$(get_state "${alert_key}")

  # Nothing changed since the last run: stay quiet.
  if [[ "$current_value" == "$previous_value" ]]; then
    return 0
  fi

  local full_title=""
  local full_message=""

  if [[ "$is_alert_condition" == "true" ]]; then
    full_title="${title_prefix} Alert"
    full_message="🚨 Critical: ${current_message}"
  elif [[ -n "$previous_value" && "$previous_value" != "OK" ]]; then
    # The previous value was an alert value, so this run resolves it.
    full_title="${title_prefix} Resolved"
    full_message="✅ Resolved: ${current_message}"
  else
    # No alert, and no previous alert to resolve, so just update state silently
    set_state "${alert_key}" "$current_value"
    return 0
  fi

  local final_message="[${COMPANY_NAME} | ${HOSTNAME}] ${full_message}"
  # -f turns HTTP error responses into a non-zero exit status; --max-time keeps
  # an unreachable server from blocking the run (and holding the lock).
  if ! curl -fsS --max-time 30 \
    -H "Authorization: Bearer ${NTFY_TOKEN}" \
    -H "Title: ${full_title}" \
    -d "${final_message}" \
    "${NTFY_TOPIC_URL}" > /dev/null 2>&1; then
    echo "⚠️ Warning: Failed to send notification for ${alert_key}. State not updated; will retry on next run." >&2
    return 1
  fi

  set_state "${alert_key}" "$current_value"
  echo "🔔 Notification sent for ${alert_key}: ${full_message}"
}

# --- HANA Process Status ---
# Lists the instance's processes through sapcontrol and alerts when any listed
# process is not GREEN, or when no process is listed at all.
echo "⚙️ Checking HANA process status..."
if [ ! -x "$SAPCONTROL_PATH" ]; then
  echo "❌ Error: sapcontrol not found or not executable at ${SAPCONTROL_PATH}" >&2
  send_notification_if_changed "hana_sapcontrol_path" "HANA Monitor Error" "sapcontrol not found or not executable at ${SAPCONTROL_PATH}" "true" "SAPCONTROL_ERROR"
  exit 1
fi

# The first five output lines are skipped as preamble; every remaining line is
# treated as one process entry.
# NOTE(review): presumably the preamble is timestamp/function/status/column
# header — confirm against the sapcontrol version in use.
process_list=$("${SAPCONTROL_PATH}" -nr "${HANA_INSTANCE_NR}" -function GetProcessList | tail -n +6)
non_green_processes=$(grep -v 'GREEN' <<< "$process_list")

if [[ -z "${process_list//[[:space:]]/}" ]]; then
  # No process lines at all (e.g. sapcontrol failed or printed only an error).
  # An empty list contains no non-GREEN line either, so without this branch
  # the check would report success.
  echo "🚨 Alert: sapcontrol returned no process list!" >&2
  send_notification_if_changed "hana_processes" "HANA Process" "sapcontrol GetProcessList returned no processes. HANA may be down or sapcontrol failed." "true" "PROCESS_ALERT:NO_PROCESS_LIST"
  exit 1 # Exit early as other checks might fail
elif [ -n "$non_green_processes" ]; then
  echo "🚨 Alert: One or more HANA processes are not running!" >&2
  echo "$non_green_processes" >&2
  send_notification_if_changed "hana_processes" "HANA Process" "One or more HANA processes are not GREEN. Problem processes: ${non_green_processes}" "true" "PROCESS_ALERT:${non_green_processes}"
  exit 1 # Exit early as other checks might fail
else
  send_notification_if_changed "hana_processes" "HANA Process" "All HANA processes are GREEN." "false" "OK"
  echo "✅ Success! All HANA processes are GREEN."
fi

# --- Disk Space Monitoring ---
# For every configured directory, compares the usage percentage reported by df
# with DISK_USAGE_THRESHOLD and raises or resolves a per-directory alert.
echo "ℹ️ Checking disk usage..."
for dir in "${DIRECTORIES_TO_MONITOR[@]}"; do
  # State keys embed the path with every "/" replaced by "_".
  dir_key="${dir//\//_}"

  if [ ! -d "$dir" ]; then
    echo "⚠️ Warning: Directory '$dir' not found. Skipping." >&2
    send_notification_if_changed "disk_dir_not_found_${dir_key}" "HANA Disk Warning" "Directory '$dir' not found." "true" "DIR_NOT_FOUND"
    continue
  fi
  # The directory exists: clear a "not found" alert raised by an earlier run.
  send_notification_if_changed "disk_dir_not_found_${dir_key}" "HANA Disk Warning" "Directory '$dir' is available again." "false" "OK"

  # -P selects the POSIX output format, which keeps each file system on a
  # single line so that line 2, field 5 is always the usage percentage.
  usage=$(df -P "$dir" | awk 'NR==2 {sub(/%/, "", $5); print $5}')
  if ! [[ "$usage" =~ ^[0-9]+$ ]]; then
    # Without a number the comparison below would error out and fall into the
    # "below threshold" branch, wrongly resolving an open alert.
    echo "⚠️ Warning: Could not determine disk usage for '$dir'. Skipping." >&2
    continue
  fi
  echo " - ${dir} is at ${usage}%"

  # bc performs the comparison so that a fractional threshold also works.
  if (( $(echo "$usage > $DISK_USAGE_THRESHOLD" | bc -l) )); then
    echo "🚨 Alert: ${dir} usage is at ${usage}% which is above the ${DISK_USAGE_THRESHOLD}% threshold." >&2
    send_notification_if_changed "disk_usage_${dir_key}" "HANA Disk" "Disk usage for ${dir} is at ${usage}%." "true" "${usage}%"
  else
    send_notification_if_changed "disk_usage_${dir_key}" "HANA Disk" "Disk usage for ${dir} is at ${usage}% (below threshold)." "false" "OK"
  fi
done

# --- HANA Log Segment Monitoring ---
# Runs SQL_QUERY through hdbsql and leaves the output, one line per element,
# in the sql_output array for the parsing step that follows.
echo "⚙️ Executing HANA SQL query..."
if [ ! -x "$HDBSQL_PATH" ]; then
  echo "❌ Error: hdbsql not found or not executable at ${HDBSQL_PATH}" >&2
  send_notification_if_changed "hana_hdbsql_path" "HANA Monitor Error" "hdbsql not found or not executable at ${HDBSQL_PATH}" "true" "HDBSQL_ERROR"
  exit 1
fi

# Capture the output with command substitution so that $? really is hdbsql's
# exit status. After "readarray < <(cmd)", $? is readarray's status and the
# failure branch below could never be taken.
sql_raw=$("$HDBSQL_PATH" -U "$HANA_USER_KEY" -c ";" "$SQL_QUERY" 2>&1)
sql_status=$?
sql_output=()
if [[ -n "$sql_raw" ]]; then
  readarray -t sql_output <<< "$sql_raw"
fi

if [ "$sql_status" -ne 0 ]; then
  echo "❌ Failure! The hdbsql command failed. Please check logs." >&2
  error_message=$(printf '%s\n' "${sql_output[@]}")
  send_notification_if_changed "hana_hdbsql_command" "HANA Monitor Error" "The hdbsql command failed. Details: ${error_message}" "true" "HDBSQL_COMMAND_FAILED"
  exit 1
fi
# The command worked: clear a failure alert raised by an earlier run.
send_notification_if_changed "hana_hdbsql_command" "HANA Monitor Error" "The hdbsql command is working again." "false" "OK"

# Sum the log segment counts found in sql_output: overall, and separately for
# the 'Truncated' and 'Free' states. Each data line is comma-separated with the
# state in field 3 and the count in field 4.
total_segments=0
truncated_segments=0
free_segments=0
for line in "${sql_output[@]}"; do
  # Skip blank lines and the column-header line.
  if [[ -z "$line" || "$line" == *"STATE"* ]]; then continue; fi

  # Drop the double quotes around values, then split on commas in the shell
  # itself (no external processes per line).
  cleaned_line=${line//\"/}
  IFS=',' read -r _ _ state count _ <<< "$cleaned_line"
  count=${count//[[:space:]]/}

  # Ignore anything that is not a data row (status or trailer lines): only a
  # plain non-negative integer may enter the arithmetic below.
  if ! [[ "$count" =~ ^[0-9]+$ ]]; then continue; fi
  count=$((10#$count)) # force base 10 so a leading zero is not read as octal

  total_segments=$((total_segments + count))
  if [[ "$state" == "Truncated" ]]; then
    truncated_segments=$((truncated_segments + count))
  elif [[ "$state" == "Free" ]]; then
    free_segments=$((free_segments + count))
  fi
done

echo "ℹ️ Total Segments: ${total_segments}"
echo "ℹ️ Truncated Segments: ${truncated_segments}"
echo "ℹ️ Free Segments: ${free_segments}"

# Compare the share of 'Truncated' and 'Free' log segments with their
# thresholds. The percentages are only computed when at least one segment was
# counted, which also avoids a division by zero.
if (( ${total_segments:-0} == 0 )); then
  echo "⚠️ Warning: No log segments found. Skipping percentage checks." >&2
  send_notification_if_changed "hana_log_segments_total" "HANA Log Segment Warning" "No log segments found. Skipping percentage checks." "true" "NO_LOG_SEGMENTS"
else
  send_notification_if_changed "hana_log_segments_total" "HANA Log Segment" "Log segments found." "false" "OK"

  # Integer percentages (rounded down). bc performs the comparisons so that
  # fractional thresholds also work.
  truncated_percentage=$((truncated_segments * 100 / total_segments))
  if (( $(echo "$truncated_percentage > $TRUNCATED_PERCENTAGE_THRESHOLD" | bc -l) )); then
    echo "🚨 Alert: ${truncated_percentage}% of log segments are 'Truncated'." >&2
    send_notification_if_changed "hana_log_truncated" "HANA Log Segment" "${truncated_percentage}% of HANA log segments are in 'Truncated' state." "true" "${truncated_percentage}%"
  else
    send_notification_if_changed "hana_log_truncated" "HANA Log Segment" "${truncated_percentage}% of HANA log segments are in 'Truncated' state (below threshold)." "false" "OK"
  fi

  free_percentage=$((free_segments * 100 / total_segments))
  if (( $(echo "$free_percentage < $FREE_PERCENTAGE_THRESHOLD" | bc -l) )); then
    echo "🚨 Alert: Only ${free_percentage}% of log segments are 'Free'." >&2
    send_notification_if_changed "hana_log_free" "HANA Log Segment" "Only ${free_percentage}% of HANA log segments are in 'Free' state." "true" "${free_percentage}%"
  else
    # No "Only" here: this text is used for the "Resolved" notification.
    send_notification_if_changed "hana_log_free" "HANA Log Segment" "${free_percentage}% of HANA log segments are in 'Free' state (above threshold)." "false" "OK"
  fi
fi

# --- HANA Statement Queue Monitoring ---
# Counts SqlExecutor threads in state 'Queueing' and alerts once the count has
# been above STATEMENT_QUEUE_THRESHOLD for STATEMENT_QUEUE_CONSECUTIVE_RUNS
# consecutive runs. The breach counter is persisted between runs.
echo "⚙️ Checking HANA statement queue..."
STATEMENT_QUEUE_SQL="SELECT COUNT(*) FROM M_SERVICE_THREADS WHERE THREAD_TYPE = 'SqlExecutor' AND THREAD_STATE = 'Queueing';"
# Quotes and any whitespace are stripped so that padding around the number
# does not make the numeric check below fail.
queue_count=$("$HDBSQL_PATH" -U "$HANA_USER_KEY" -j -a -x "$STATEMENT_QUEUE_SQL" 2>/dev/null | tr -d '"[:space:]')

if ! [[ "$queue_count" =~ ^[0-9]+$ ]]; then
  echo "⚠️ Warning: Could not retrieve HANA statement queue count. Skipping check." >&2
  send_notification_if_changed "hana_statement_queue_check_fail" "HANA Monitor Warning" "Could not retrieve statement queue count." "true" "QUEUE_CHECK_FAIL"
else
  send_notification_if_changed "hana_statement_queue_check_fail" "HANA Monitor Warning" "Statement queue check is working." "false" "OK"
  echo "ℹ️ Current statement queue length: ${queue_count}"

  # A missing or damaged state file counts as "no breaches so far".
  breach_count=$(get_state "statement_queue_breach_count")
  if ! [[ "$breach_count" =~ ^[0-9]+$ ]]; then
    breach_count=0
  fi

  if (( queue_count > STATEMENT_QUEUE_THRESHOLD )); then
    breach_count=$((breach_count + 1))
    echo "📈 Statement queue is above threshold. Consecutive breach count: ${breach_count}/${STATEMENT_QUEUE_CONSECUTIVE_RUNS}."
  else
    # A single run at or below the threshold ends the streak.
    breach_count=0
  fi
  set_state "statement_queue_breach_count" "$breach_count"

  if (( breach_count >= STATEMENT_QUEUE_CONSECUTIVE_RUNS )); then
    message="Statement queue has been over ${STATEMENT_QUEUE_THRESHOLD} for ${breach_count} checks. Current count: ${queue_count}."
    send_notification_if_changed "hana_statement_queue_status" "HANA Statement Queue" "${message}" "true" "ALERT:${queue_count}"
  else
    message="Statement queue is normal. Current count: ${queue_count}."
    send_notification_if_changed "hana_statement_queue_status" "HANA Statement Queue" "${message}" "false" "OK"
  fi
fi

# --- HANA Backup Status Monitoring ---
# Reads the start time of the most recent successful complete data backup from
# M_BACKUP_CATALOG and alerts when none is found or when it is older than
# BACKUP_THRESHOLD_HOURS.
echo "ℹ️ Checking last successful data backup status..."
# Quotes and the fractional-seconds part (everything from the first ".") are
# removed from the timestamp before it is handed to date.
last_backup_date=$("$HDBSQL_PATH" -U "$HANA_USER_KEY" -j -a -x \
  "SELECT TOP 1 SYS_START_TIME FROM M_BACKUP_CATALOG WHERE ENTRY_TYPE_NAME = 'complete data backup' AND STATE_NAME = 'successful' ORDER BY SYS_START_TIME DESC" 2>/dev/null | tr -d "\"" | sed 's/\..*//')

if [[ -z "$last_backup_date" ]]; then
  # hdbsql errors are discarded above, so a failed query also ends up here.
  message="No successful complete data backup found for ${COMPANY_NAME} HANA."
  echo "🚨 Critical: ${message}"
  send_notification_if_changed "hana_backup_status" "HANA Backup" "${message}" "true" "NO_BACKUP"
elif ! last_backup_epoch=$(date -d "$last_backup_date" +%s 2>/dev/null); then
  # An unparseable timestamp would otherwise be treated as epoch 0 and be
  # reported as a backup that is hundreds of thousands of hours old.
  message="Could not parse the last backup timestamp '${last_backup_date}' for ${COMPANY_NAME} HANA."
  echo "🚨 Critical: ${message}" >&2
  send_notification_if_changed "hana_backup_status" "HANA Backup" "${message}" "true" "BACKUP_DATE_PARSE_ERROR"
else
  current_epoch=$(date +%s)
  threshold_seconds=$((BACKUP_THRESHOLD_HOURS * 3600))
  age_seconds=$((current_epoch - last_backup_epoch))
  age_hours=$((age_seconds / 3600))

  if (( age_seconds > threshold_seconds )); then
    message="Last successful HANA backup for ${COMPANY_NAME} is ${age_hours} hours old, which exceeds the threshold of ${BACKUP_THRESHOLD_HOURS} hours. Last backup was on: ${last_backup_date}."
    echo "🚨 Critical: ${message}"
    send_notification_if_changed "hana_backup_status" "HANA Backup" "${message}" "true" "${age_hours}h"
  else
    message="Last successful backup is ${age_hours} hours old (Threshold: ${BACKUP_THRESHOLD_HOURS} hours)."
    echo "✅ Success! ${message}"
    send_notification_if_changed "hana_backup_status" "HANA Backup" "${message}" "false" "OK"
  fi
fi

echo "✅ Success! HANA monitoring check complete."