Files
Scripts/monitor/monitor.sh
Tomi Eckert eeb5b2eb7b feat(monitor): Implement state-based notifications to prevent alert spam
Introduces state management to 'monitor.sh' to send notifications only when a monitored status changes (e.g., from healthy to alert, or alert to resolved). This prevents repetitive alerts for persistent issues. Creates a 'monitor_state' directory for tracking. Updates script version to 1.2.0.
2025-09-25 18:34:40 +02:00

222 lines
9.8 KiB
Bash
Raw Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/bin/bash
# Version: 1.2.0
# =============================================================================
# SAP HANA Monitoring Script
#
# Checks HANA processes, disk usage, and log segment state.
# Sends ntfy.sh notifications if thresholds are exceeded.
# =============================================================================
# --- Lock File Implementation ---
LOCK_FILE="/tmp/hana_monitor.lock"
if [ -e "$LOCK_FILE" ]; then
echo "▶️ Script is already running. Exiting."
exit 1
fi
touch "$LOCK_FILE"
# Ensure lock file is removed on script exit
trap 'rm -f "$LOCK_FILE"' EXIT
# --- Configuration and Setup ---
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"
CONFIG_FILE="${SCRIPT_DIR}/monitor.conf"
if [ ! -f "$CONFIG_FILE" ]; then
echo "❌ Error: Configuration file not found at ${CONFIG_FILE}" >&2
rm -f "$LOCK_FILE"
exit 1
fi
source "$CONFIG_FILE"
STATE_DIR="${SCRIPT_DIR}/monitor_state"
mkdir -p "${STATE_DIR}"
# Helper functions for state management
get_state() {
local key="$1"
if [ -f "${STATE_DIR}/${key}.state" ]; then
cat "${STATE_DIR}/${key}.state"
else
echo ""
fi
}
set_state() {
local key="$1"
local value="$2"
echo "$value" > "${STATE_DIR}/${key}.state"
}
HOSTNAME=$(hostname)
SQL_QUERY="SELECT b.host, b.service_name, a.state, count(*) FROM PUBLIC.M_LOG_SEGMENTS a JOIN PUBLIC.M_SERVICES b ON (a.host = b.host AND a.port = b.port) GROUP BY b.host, b.service_name, a.state;"
send_notification_if_changed() {
local alert_key="$1"
local title_prefix="$2" # e.g., "HANA Process"
local current_message="$3"
local is_alert_condition="$4" # "true" or "false"
local current_value="$5" # The value to store as state (e.g., "85%", "GREEN", "ALERT")
local previous_value=$(get_state "${alert_key}")
if [ "$current_value" != "$previous_value" ]; then
local full_title=""
local full_message=""
if [ "$is_alert_condition" == "true" ]; then
full_title="${title_prefix} Alert"
full_message="🚨 Critical: ${current_message}"
else
# Check if it was previously an alert (i.e., previous_value was not "OK")
if [ -n "$previous_value" ] && [ "$previous_value" != "OK" ]; then
full_title="${title_prefix} Resolved"
full_message="✅ Resolved: ${current_message}"
else
# No alert, and no previous alert to resolve, so just update state silently
set_state "${alert_key}" "$current_value"
echo " State for ${alert_key} updated to ${current_value}. No notification sent."
return
fi
fi
local final_message="[${COMPANY_NAME} | ${HOSTNAME}] ${full_message}"
curl -H "Authorization: Bearer ${NTFY_TOKEN}" -H "Title: ${full_title}" -d "${final_message}" "${NTFY_TOPIC_URL}" > /dev/null 2>&1
set_state "${alert_key}" "$current_value"
echo "🔔 Notification sent for ${alert_key}: ${full_message}"
else
echo " State for ${alert_key} unchanged. No notification sent."
fi
}
# --- HANA Process Status ---
echo "⚙️ Checking HANA process status..."
if [ ! -x "$SAPCONTROL_PATH" ]; then
echo "❌ Error: sapcontrol not found or not executable at ${SAPCONTROL_PATH}" >&2
send_notification_if_changed "hana_sapcontrol_path" "HANA Monitor Error" "sapcontrol not found or not executable at ${SAPCONTROL_PATH}" "true" "SAPCONTROL_ERROR"
exit 1
fi
non_green_processes=$("${SAPCONTROL_PATH}" -nr "${HANA_INSTANCE_NR}" -function GetProcessList | tail -n +6 | grep -v 'GREEN')
if [ -n "$non_green_processes" ]; then
echo "🚨 Alert: One or more HANA processes are not running!" >&2
echo "$non_green_processes" >&2
send_notification_if_changed "hana_processes" "HANA Process" "One or more HANA processes are not GREEN. Problem processes: ${non_green_processes}" "true" "PROCESS_ALERT:${non_green_processes}"
exit 1 # Exit early as other checks might fail
else
send_notification_if_changed "hana_processes" "HANA Process" "All HANA processes are GREEN." "false" "OK"
echo "✅ Success! All HANA processes are GREEN."
fi
# --- Disk Space Monitoring ---
echo " Checking disk usage..."
for dir in "${DIRECTORIES_TO_MONITOR[@]}"; do
if [ ! -d "$dir" ]; then
echo "⚠️ Warning: Directory '$dir' not found. Skipping." >&2
send_notification_if_changed "disk_dir_not_found_${dir//\//_}" "HANA Disk Warning" "Directory '$dir' not found." "true" "DIR_NOT_FOUND"
continue
fi
usage=$(df -h "$dir" | awk 'NR==2 {print $5}' | sed 's/%//')
echo " - ${dir} is at ${usage}%"
if (( $(echo "$usage > $DISK_USAGE_THRESHOLD" | bc -l) )); then
echo "🚨 Alert: ${dir} usage is at ${usage}% which is above the ${DISK_USAGE_THRESHOLD}% threshold." >&2
send_notification_if_changed "disk_usage_${dir//\//_}" "HANA Disk" "Disk usage for ${dir} is at ${usage}%." "true" "${usage}%"
else
send_notification_if_changed "disk_usage_${dir//\//_}" "HANA Disk" "Disk usage for ${dir} is at ${usage}% (below threshold)." "false" "OK"
fi
done
# --- HANA Log Segment Monitoring ---
echo "⚙️ Executing HANA SQL query..."
if [ ! -x "$HDBSQL_PATH" ]; then
echo "❌ Error: hdbsql not found or not executable at ${HDBSQL_PATH}" >&2
send_notification_if_changed "hana_hdbsql_path" "HANA Monitor Error" "hdbsql not found or not executable at ${HDBSQL_PATH}" "true" "HDBSQL_ERROR"
exit 1
fi
readarray -t sql_output < <("$HDBSQL_PATH" -U "$HANA_USER_KEY" -c ";" "$SQL_QUERY" 2>&1)
if [ $? -ne 0 ]; then
echo "❌ Failure! The hdbsql command failed. Please check logs." >&2
error_message=$(printf '%s\n' "${sql_output[@]}")
send_notification_if_changed "hana_hdbsql_command" "HANA Monitor Error" "The hdbsql command failed. Details: ${error_message}" "true" "HDBSQL_COMMAND_FAILED"
exit 1
fi
total_segments=0
truncated_segments=0
free_segments=0
for line in "${sql_output[@]}"; do
if [[ -z "$line" || "$line" == *"STATE"* ]]; then continue; fi
cleaned_line=$(echo "$line" | tr -d '"')
state=$(echo "$cleaned_line" | awk -F',' '{print $3}')
count=$(echo "$cleaned_line" | awk -F',' '{print $4}')
total_segments=$((total_segments + count))
if [[ "$state" == "Truncated" ]]; then
truncated_segments=$((truncated_segments + count))
elif [[ "$state" == "Free" ]]; then
free_segments=$((free_segments + count))
fi
done
echo " Total Segments: ${total_segments}"
echo " Truncated Segments: ${truncated_segments}"
echo " Free Segments: ${free_segments}"
if [ $total_segments -eq 0 ]; then
echo "⚠️ Warning: No log segments found. Skipping percentage checks." >&2
send_notification_if_changed "hana_log_segments_total" "HANA Log Segment Warning" "No log segments found. Skipping percentage checks." "true" "NO_LOG_SEGMENTS"
exit 0
else
send_notification_if_changed "hana_log_segments_total" "HANA Log Segment" "Log segments found." "false" "OK"
fi
truncated_percentage=$((truncated_segments * 100 / total_segments))
if (( $(echo "$truncated_percentage > $TRUNCATED_PERCENTAGE_THRESHOLD" | bc -l) )); then
echo "🚨 Alert: ${truncated_percentage}% of log segments are 'Truncated'." >&2
send_notification_if_changed "hana_log_truncated" "HANA Log Segment" "${truncated_percentage}% of HANA log segments are in 'Truncated' state." "true" "${truncated_percentage}%"
else
send_notification_if_changed "hana_log_truncated" "HANA Log Segment" "${truncated_percentage}% of HANA log segments are in 'Truncated' state (below threshold)." "false" "OK"
fi
free_percentage=$((free_segments * 100 / total_segments))
if (( $(echo "$free_percentage < $FREE_PERCENTAGE_THRESHOLD" | bc -l) )); then
echo "🚨 Alert: Only ${free_percentage}% of log segments are 'Free'." >&2
send_notification_if_changed "hana_log_free" "HANA Log Segment" "Only ${free_percentage}% of HANA log segments are in 'Free' state." "true" "${free_percentage}%"
else
send_notification_if_changed "hana_log_free" "HANA Log Segment" "Only ${free_percentage}% of HANA log segments are in 'Free' state (above threshold)." "false" "OK"
fi
echo " Checking last successful data backup status..."
# Query to get the start time of the most recent successful complete data backup
last_backup_date=$("$HDBSQL_PATH" -U "$HANA_USER_KEY" -j -a -x \
"SELECT TOP 1 SYS_START_TIME FROM M_BACKUP_CATALOG WHERE ENTRY_TYPE_NAME = 'complete data backup' AND STATE_NAME = 'successful' ORDER BY SYS_START_TIME DESC" 2>/dev/null | tr -d "\"" | sed 's/\..*//') # sed removes fractional seconds
if [[ -z "$last_backup_date" ]]; then
# No successful backup found at all
local message="No successful complete data backup found for ${COMPANY_NAME} HANA."
echo "🚨 Critical: ${message}"
send_notification_if_changed "hana_backup_status" "HANA Backup" "${message}" "true" "NO_BACKUP"
return
fi
# Convert dates to epoch seconds for comparison
last_backup_epoch=$(date -d "$last_backup_date" +%s)
current_epoch=$(date +%s)
threshold_seconds=$((BACKUP_THRESHOLD_HOURS * 3600))
age_seconds=$((current_epoch - last_backup_epoch))
age_hours=$((age_seconds / 3600))
if (( age_seconds > threshold_seconds )); then
local message="Last successful HANA backup for ${COMPANY_NAME} is ${age_hours} hours old, which exceeds the threshold of ${BACKUP_THRESHOLD_HOURS} hours. Last backup was on: ${last_backup_date}."
echo "🚨 Critical: ${message}"
send_notification_if_changed "hana_backup_status" "HANA Backup" "${message}" "true" "${age_hours}h"
else
local message="Last successful backup is ${age_hours} hours old (Threshold: ${BACKUP_THRESHOLD_HOURS} hours)."
echo "✅ Success! ${message}"
send_notification_if_changed "hana_backup_status" "HANA Backup" "${message}" "false" "OK"
fi
echo "✅ Success! HANA monitoring check complete."