feat(monitor): Implement state-based notifications to prevent alert spam
Introduces state management to 'monitor.sh' to send notifications only when a monitored status changes (e.g., from healthy to alert, or alert to resolved). This prevents repetitive alerts for persistent issues. Creates a 'monitor_state' directory for tracking. Updates script version to 1.2.0.
This commit is contained in:
@@ -2,13 +2,13 @@
|
|||||||
|
|
||||||
# --- Company Information ---
|
# --- Company Information ---
|
||||||
# Used to identify which company the alert is for.
|
# Used to identify which company the alert is for.
|
||||||
COMPANY_NAME="Your Company Name"
|
COMPANY_NAME="Company"
|
||||||
|
|
||||||
# --- Notification Settings ---
|
# --- Notification Settings ---
|
||||||
# Your ntfy.sh topic URL
|
# Your ntfy.sh topic URL
|
||||||
NTFY_TOPIC_URL="https://ntfy.technopunk.space/sap"
|
NTFY_TOPIC_URL="https://ntfy.technopunk.space/sap"
|
||||||
# Your ntfy.sh bearer token (if required)
|
# Your ntfy.sh bearer token (if required)
|
||||||
NTFY_TOKEN="your_ntfy_token_here"
|
NTFY_TOKEN="tk_xxxxx"
|
||||||
|
|
||||||
# --- HANA Connection Settings ---
|
# --- HANA Connection Settings ---
|
||||||
# Full path to the sapcontrol executable
|
# Full path to the sapcontrol executable
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
# Version: 1.1.1
|
# Version: 1.2.0
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
# SAP HANA Monitoring Script
|
# SAP HANA Monitoring Script
|
||||||
#
|
#
|
||||||
@@ -28,21 +28,71 @@ if [ ! -f "$CONFIG_FILE" ]; then
|
|||||||
fi
|
fi
|
||||||
source "$CONFIG_FILE"
|
source "$CONFIG_FILE"
|
||||||
|
|
||||||
|
STATE_DIR="${SCRIPT_DIR}/monitor_state"
|
||||||
|
mkdir -p "${STATE_DIR}"
|
||||||
|
|
||||||
|
# Helper functions for state management
|
||||||
|
#######################################
# Print the last recorded state value for a monitored check.
# Globals:
#   STATE_DIR (read) - directory holding one "<key>.state" file per check
# Arguments:
#   $1 - state key; used verbatim as the file-name stem
# Outputs:
#   The stored value followed by a newline on stdout, or an empty line when
#   no readable state file exists for the key.
#######################################
get_state() {
    local key="$1"
    local state_file="${STATE_DIR}/${key}.state"

    # Require readability as well as existence: an unreadable file is treated
    # like "no previous state" instead of letting an error message leak into
    # the monitor's output.
    if [[ -f "$state_file" && -r "$state_file" ]]; then
        # $(<file) reads the file with a shell builtin (no external process).
        printf '%s\n' "$(<"$state_file")"
    else
        printf '\n'
    fi
}
|
||||||
|
|
||||||
|
#######################################
# Persist the state value for a monitored check.
# Globals:
#   STATE_DIR (read) - directory holding one "<key>.state" file per check
# Arguments:
#   $1 - state key; used verbatim as the file-name stem
#   $2 - value to store
# Returns:
#   0 when the value was written, 1 otherwise.
#######################################
set_state() {
    local key="$1"
    local value="$2"
    local state_file="${STATE_DIR}/${key}.state"
    local tmp_file="${state_file}.$$.tmp"

    # Recreate the directory if it was removed after the script started.
    mkdir -p -- "${STATE_DIR}" || return 1

    # printf instead of echo: echo would treat values such as "-n" or "-e"
    # as options and store an empty file.
    # Write to a temporary file and rename it so an interrupted run never
    # leaves a truncated state file behind.
    if printf '%s\n' "$value" > "$tmp_file" && mv -f -- "$tmp_file" "$state_file"; then
        return 0
    fi

    rm -f -- "$tmp_file"
    return 1
}
|
||||||
|
|
||||||
HOSTNAME=$(hostname)
|
HOSTNAME=$(hostname)
|
||||||
SQL_QUERY="SELECT b.host, b.service_name, a.state, count(*) FROM PUBLIC.M_LOG_SEGMENTS a JOIN PUBLIC.M_SERVICES b ON (a.host = b.host AND a.port = b.port) GROUP BY b.host, b.service_name, a.state;"
|
SQL_QUERY="SELECT b.host, b.service_name, a.state, count(*) FROM PUBLIC.M_LOG_SEGMENTS a JOIN PUBLIC.M_SERVICES b ON (a.host = b.host AND a.port = b.port) GROUP BY b.host, b.service_name, a.state;"
|
||||||
|
|
||||||
#######################################
# Send an ntfy notification only when the state of a check has changed.
#
# The previous state is read with get_state and compared to the new value as
# an exact string. Any stored value other than "OK" (or empty) counts as a
# previous alert. Because the comparison is exact, a caller that passes a
# fluctuating value (for example a percentage) is notified on every change.
#
# Globals:
#   COMPANY_NAME, HOSTNAME (read) - prefixed to the message body
#   NTFY_TOPIC_URL (read)         - destination URL
#   NTFY_TOKEN (read)             - bearer token; header omitted when empty
#   NTFY_TIMEOUT_SECONDS (read)   - optional request timeout, default 15
# Arguments:
#   $1 - alert key identifying the check (state file stem)
#   $2 - title prefix, e.g. "HANA Process"
#   $3 - message text for this run
#   $4 - "true" when the check is currently in an alert condition
#   $5 - value to store as the new state (e.g. "85%", "OK")
# Outputs:
#   A one-line status on stdout; a warning on stderr when delivery or the
#   state write fails.
# Returns:
#   0 always; notification delivery is best-effort and must not abort the
#   calling script.
#######################################
send_notification_if_changed() {
    local alert_key="$1"
    local title_prefix="$2" # e.g., "HANA Process"
    local current_message="$3"
    local is_alert_condition="$4" # "true" or "false"
    local current_value="$5" # The value to store as state (e.g., "85%", "GREEN", "ALERT")

    # Declared separately so the assignment does not hide get_state's status.
    local previous_value
    previous_value=$(get_state "${alert_key}")

    if [[ "$current_value" == "$previous_value" ]]; then
        echo "ℹ️ State for ${alert_key} unchanged. No notification sent."
        return 0
    fi

    local full_title=""
    local full_message=""

    if [[ "$is_alert_condition" == "true" ]]; then
        full_title="${title_prefix} Alert"
        full_message="🚨 Critical: ${current_message}"
    elif [[ -n "$previous_value" && "$previous_value" != "OK" ]]; then
        # Healthy now, but an alert state was recorded earlier.
        full_title="${title_prefix} Resolved"
        full_message="✅ Resolved: ${current_message}"
    else
        # No alert, and no previous alert to resolve, so just update state silently
        set_state "${alert_key}" "$current_value" \
            || echo "⚠️ Warning: Could not save state for ${alert_key}." >&2
        echo "ℹ️ State for ${alert_key} updated to ${current_value}. No notification sent."
        return 0
    fi

    local final_message="[${COMPANY_NAME} | ${HOSTNAME}] ${full_message}"

    # --fail turns HTTP errors (e.g. a rejected token) into a non-zero exit;
    # --max-time keeps an unresponsive server from hanging the monitor.
    local -a curl_args=(
        --silent --show-error --fail
        --max-time "${NTFY_TIMEOUT_SECONDS:-15}"
        -H "Title: ${full_title}"
    )
    # The token is optional; skip the header rather than send an empty one.
    if [[ -n "${NTFY_TOKEN:-}" ]]; then
        curl_args+=(-H "Authorization: Bearer ${NTFY_TOKEN}")
    fi
    curl_args+=(-d "${final_message}" "${NTFY_TOPIC_URL}")

    local curl_status=0
    curl "${curl_args[@]}" > /dev/null 2>&1 || curl_status=$?

    if (( curl_status != 0 )); then
        # Leave the stored state untouched so the next run sees the change
        # again and retries the notification.
        echo "⚠️ Warning: Notification for ${alert_key} could not be delivered (curl exit ${curl_status}). Will retry on next run." >&2
        return 0
    fi

    set_state "${alert_key}" "$current_value" \
        || echo "⚠️ Warning: Could not save state for ${alert_key}." >&2
    echo "🔔 Notification sent for ${alert_key}: ${full_message}"
    return 0
}
|
||||||
|
|
||||||
# --- HANA Process Status ---
|
# --- HANA Process Status ---
|
||||||
echo "⚙️ Checking HANA process status..."
|
echo "⚙️ Checking HANA process status..."
|
||||||
if [ ! -x "$SAPCONTROL_PATH" ]; then
|
if [ ! -x "$SAPCONTROL_PATH" ]; then
|
||||||
echo "❌ Error: sapcontrol not found or not executable at ${SAPCONTROL_PATH}" >&2
|
echo "❌ Error: sapcontrol not found or not executable at ${SAPCONTROL_PATH}" >&2
|
||||||
send_notification "HANA Monitor Error" "❌ Error: sapcontrol not found or not executable at ${SAPCONTROL_PATH}"
|
send_notification_if_changed "hana_sapcontrol_path" "HANA Monitor Error" "sapcontrol not found or not executable at ${SAPCONTROL_PATH}" "true" "SAPCONTROL_ERROR"
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
@@ -51,23 +101,28 @@ non_green_processes=$("${SAPCONTROL_PATH}" -nr "${HANA_INSTANCE_NR}" -function G
|
|||||||
if [ -n "$non_green_processes" ]; then
|
if [ -n "$non_green_processes" ]; then
|
||||||
echo "🚨 Alert: One or more HANA processes are not running!" >&2
|
echo "🚨 Alert: One or more HANA processes are not running!" >&2
|
||||||
echo "$non_green_processes" >&2
|
echo "$non_green_processes" >&2
|
||||||
send_notification "HANA Process Alert" "🚨 Critical: One or more HANA processes are not GREEN. Problem processes: ${non_green_processes}"
|
send_notification_if_changed "hana_processes" "HANA Process" "One or more HANA processes are not GREEN. Problem processes: ${non_green_processes}" "true" "PROCESS_ALERT:${non_green_processes}"
|
||||||
exit 1 # Exit early as other checks might fail
|
exit 1 # Exit early as other checks might fail
|
||||||
|
else
|
||||||
|
send_notification_if_changed "hana_processes" "HANA Process" "All HANA processes are GREEN." "false" "OK"
|
||||||
|
echo "✅ Success! All HANA processes are GREEN."
|
||||||
fi
|
fi
|
||||||
echo "✅ Success! All HANA processes are GREEN."
|
|
||||||
|
|
||||||
# --- Disk Space Monitoring ---
|
# --- Disk Space Monitoring ---
|
||||||
echo "ℹ️ Checking disk usage..."
|
echo "ℹ️ Checking disk usage..."
|
||||||
for dir in "${DIRECTORIES_TO_MONITOR[@]}"; do
|
for dir in "${DIRECTORIES_TO_MONITOR[@]}"; do
|
||||||
if [ ! -d "$dir" ]; then
|
if [ ! -d "$dir" ]; then
|
||||||
echo "⚠️ Warning: Directory '$dir' not found. Skipping." >&2
|
echo "⚠️ Warning: Directory '$dir' not found. Skipping." >&2
|
||||||
|
send_notification_if_changed "disk_dir_not_found_${dir//\//_}" "HANA Disk Warning" "Directory '$dir' not found." "true" "DIR_NOT_FOUND"
|
||||||
continue
|
continue
|
||||||
fi
|
fi
|
||||||
usage=$(df -h "$dir" | awk 'NR==2 {print $5}' | sed 's/%//')
|
usage=$(df -h "$dir" | awk 'NR==2 {print $5}' | sed 's/%//')
|
||||||
echo " - ${dir} is at ${usage}%"
|
echo " - ${dir} is at ${usage}%"
|
||||||
if (( $(echo "$usage > $DISK_USAGE_THRESHOLD" | bc -l) )); then
|
if (( $(echo "$usage > $DISK_USAGE_THRESHOLD" | bc -l) )); then
|
||||||
echo "🚨 Alert: ${dir} usage is at ${usage}% which is above the ${DISK_USAGE_THRESHOLD}% threshold." >&2
|
echo "🚨 Alert: ${dir} usage is at ${usage}% which is above the ${DISK_USAGE_THRESHOLD}% threshold." >&2
|
||||||
send_notification "HANA Disk Alert" "🚨 Critical: Disk usage for ${dir} is at ${usage}%."
|
send_notification_if_changed "disk_usage_${dir//\//_}" "HANA Disk" "Disk usage for ${dir} is at ${usage}%." "true" "${usage}%"
|
||||||
|
else
|
||||||
|
send_notification_if_changed "disk_usage_${dir//\//_}" "HANA Disk" "Disk usage for ${dir} is at ${usage}% (below threshold)." "false" "OK"
|
||||||
fi
|
fi
|
||||||
done
|
done
|
||||||
|
|
||||||
@@ -75,14 +130,14 @@ done
|
|||||||
echo "⚙️ Executing HANA SQL query..."
|
echo "⚙️ Executing HANA SQL query..."
|
||||||
if [ ! -x "$HDBSQL_PATH" ]; then
|
if [ ! -x "$HDBSQL_PATH" ]; then
|
||||||
echo "❌ Error: hdbsql not found or not executable at ${HDBSQL_PATH}" >&2
|
echo "❌ Error: hdbsql not found or not executable at ${HDBSQL_PATH}" >&2
|
||||||
send_notification "HANA Monitor Error" "❌ Error: hdbsql not found or not executable at ${HDBSQL_PATH}"
|
send_notification_if_changed "hana_hdbsql_path" "HANA Monitor Error" "hdbsql not found or not executable at ${HDBSQL_PATH}" "true" "HDBSQL_ERROR"
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
readarray -t sql_output < <("$HDBSQL_PATH" -U "$HANA_USER_KEY" -c ";" "$SQL_QUERY" 2>&1)
|
readarray -t sql_output < <("$HDBSQL_PATH" -U "$HANA_USER_KEY" -c ";" "$SQL_QUERY" 2>&1)
|
||||||
if [ $? -ne 0 ]; then
|
if [ $? -ne 0 ]; then
|
||||||
echo "❌ Failure! The hdbsql command failed. Please check logs." >&2
|
echo "❌ Failure! The hdbsql command failed. Please check logs." >&2
|
||||||
error_message=$(printf '%s\n' "${sql_output[@]}")
|
error_message=$(printf '%s\n' "${sql_output[@]}")
|
||||||
send_notification "HANA Monitor Error" "❌ Failure! The hdbsql command failed. Details: ${error_message}"
|
send_notification_if_changed "hana_hdbsql_command" "HANA Monitor Error" "The hdbsql command failed. Details: ${error_message}" "true" "HDBSQL_COMMAND_FAILED"
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
@@ -108,19 +163,26 @@ echo "ℹ️ Free Segments: ${free_segments}"
|
|||||||
|
|
||||||
if [ $total_segments -eq 0 ]; then
|
if [ $total_segments -eq 0 ]; then
|
||||||
echo "⚠️ Warning: No log segments found. Skipping percentage checks." >&2
|
echo "⚠️ Warning: No log segments found. Skipping percentage checks." >&2
|
||||||
|
send_notification_if_changed "hana_log_segments_total" "HANA Log Segment Warning" "No log segments found. Skipping percentage checks." "true" "NO_LOG_SEGMENTS"
|
||||||
exit 0
|
exit 0
|
||||||
|
else
|
||||||
|
send_notification_if_changed "hana_log_segments_total" "HANA Log Segment" "Log segments found." "false" "OK"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
truncated_percentage=$((truncated_segments * 100 / total_segments))
|
truncated_percentage=$((truncated_segments * 100 / total_segments))
|
||||||
if (( $(echo "$truncated_percentage > $TRUNCATED_PERCENTAGE_THRESHOLD" | bc -l) )); then
|
if (( $(echo "$truncated_percentage > $TRUNCATED_PERCENTAGE_THRESHOLD" | bc -l) )); then
|
||||||
echo "🚨 Alert: ${truncated_percentage}% of log segments are 'Truncated'." >&2
|
echo "🚨 Alert: ${truncated_percentage}% of log segments are 'Truncated'." >&2
|
||||||
send_notification "HANA Log Segment Alert" "🚨 Alert: ${truncated_percentage}% of HANA log segments are in 'Truncated' state."
|
send_notification_if_changed "hana_log_truncated" "HANA Log Segment" "${truncated_percentage}% of HANA log segments are in 'Truncated' state." "true" "${truncated_percentage}%"
|
||||||
|
else
|
||||||
|
send_notification_if_changed "hana_log_truncated" "HANA Log Segment" "${truncated_percentage}% of HANA log segments are in 'Truncated' state (below threshold)." "false" "OK"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
free_percentage=$((free_segments * 100 / total_segments))
|
free_percentage=$((free_segments * 100 / total_segments))
|
||||||
if (( $(echo "$free_percentage < $FREE_PERCENTAGE_THRESHOLD" | bc -l) )); then
|
if (( $(echo "$free_percentage < $FREE_PERCENTAGE_THRESHOLD" | bc -l) )); then
|
||||||
echo "🚨 Alert: Only ${free_percentage}% of log segments are 'Free'." >&2
|
echo "🚨 Alert: Only ${free_percentage}% of log segments are 'Free'." >&2
|
||||||
send_notification "HANA Log Segment Alert" "🚨 Alert: Only ${free_percentage}% of HANA log segments are in 'Free' state."
|
send_notification_if_changed "hana_log_free" "HANA Log Segment" "Only ${free_percentage}% of HANA log segments are in 'Free' state." "true" "${free_percentage}%"
|
||||||
|
else
|
||||||
|
send_notification_if_changed "hana_log_free" "HANA Log Segment" "Only ${free_percentage}% of HANA log segments are in 'Free' state (above threshold)." "false" "OK"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
echo "ℹ️ Checking last successful data backup status..."
|
echo "ℹ️ Checking last successful data backup status..."
|
||||||
@@ -131,9 +193,9 @@ last_backup_date=$("$HDBSQL_PATH" -U "$HANA_USER_KEY" -j -a -x \
|
|||||||
|
|
||||||
if [[ -z "$last_backup_date" ]]; then
|
if [[ -z "$last_backup_date" ]]; then
|
||||||
# No successful backup found at all
|
# No successful backup found at all
|
||||||
local message="🚨 Critical: No successful complete data backup found for ${COMPANY_NAME} HANA."
|
local message="No successful complete data backup found for ${COMPANY_NAME} HANA."
|
||||||
echo "$message"
|
echo "🚨 Critical: ${message}"
|
||||||
send_notification "HANA Backup Alert" "$message"
|
send_notification_if_changed "hana_backup_status" "HANA Backup" "${message}" "true" "NO_BACKUP"
|
||||||
return
|
return
|
||||||
fi
|
fi
|
||||||
|
|
||||||
@@ -146,11 +208,13 @@ age_seconds=$((current_epoch - last_backup_epoch))
|
|||||||
age_hours=$((age_seconds / 3600))
|
age_hours=$((age_seconds / 3600))
|
||||||
|
|
||||||
if (( age_seconds > threshold_seconds )); then
|
if (( age_seconds > threshold_seconds )); then
|
||||||
local message="🚨 Critical: Last successful HANA backup for ${COMPANY_NAME} is ${age_hours} hours old, which exceeds the threshold of ${BACKUP_THRESHOLD_HOURS} hours. Last backup was on: ${last_backup_date}."
|
local message="Last successful HANA backup for ${COMPANY_NAME} is ${age_hours} hours old, which exceeds the threshold of ${BACKUP_THRESHOLD_HOURS} hours. Last backup was on: ${last_backup_date}."
|
||||||
echo "$message"
|
echo "🚨 Critical: ${message}"
|
||||||
send_notification "HANA Backup Alert" "$message"
|
send_notification_if_changed "hana_backup_status" "HANA Backup" "${message}" "true" "${age_hours}h"
|
||||||
else
|
else
|
||||||
echo "✅ Success! Last successful backup is ${age_hours} hours old (Threshold: ${BACKUP_THRESHOLD_HOURS} hours)."
|
local message="Last successful backup is ${age_hours} hours old (Threshold: ${BACKUP_THRESHOLD_HOURS} hours)."
|
||||||
|
echo "✅ Success! ${message}"
|
||||||
|
send_notification_if_changed "hana_backup_status" "HANA Backup" "${message}" "false" "OK"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
echo "✅ Success! HANA monitoring check complete."
|
echo "✅ Success! HANA monitoring check complete."
|
||||||
|
|||||||
@@ -10,7 +10,7 @@ declare -A SCRIPT_PACKAGES
|
|||||||
# Format: short_name="Display Name|Version|Description|URL1 URL2..."
|
# Format: short_name="Display Name|Version|Description|URL1 URL2..."
|
||||||
SCRIPT_PACKAGES["aurora"]="Aurora Suite|2.1.0|A collection of scripts for managing Aurora database instances.|https://git.technopunk.space/tomi/Scripts/raw/branch/main/aurora/aurora.sh https://git.technopunk.space/tomi/Scripts/raw/branch/main/aurora/aurora.conf"
|
SCRIPT_PACKAGES["aurora"]="Aurora Suite|2.1.0|A collection of scripts for managing Aurora database instances.|https://git.technopunk.space/tomi/Scripts/raw/branch/main/aurora/aurora.sh https://git.technopunk.space/tomi/Scripts/raw/branch/main/aurora/aurora.conf"
|
||||||
SCRIPT_PACKAGES["backup"]="Backup Suite|1.0.5|A comprehensive script for backing up system files and databases.|https://git.technopunk.space/tomi/Scripts/raw/branch/main/backup/backup.sh https://git.technopunk.space/tomi/Scripts/raw/branch/main/backup/backup.conf"
|
SCRIPT_PACKAGES["backup"]="Backup Suite|1.0.5|A comprehensive script for backing up system files and databases.|https://git.technopunk.space/tomi/Scripts/raw/branch/main/backup/backup.sh https://git.technopunk.space/tomi/Scripts/raw/branch/main/backup/backup.conf"
|
||||||
SCRIPT_PACKAGES["monitor"]="Monitor Suite|1.1.1|Scripts for monitoring system health and performance metrics.|https://git.technopunk.space/tomi/Scripts/raw/branch/main/monitor/monitor.sh https://git.technopunk.space/tomi/Scripts/raw/branch/main/monitor/monitor.conf"
|
SCRIPT_PACKAGES["monitor"]="Monitor Suite|1.2.0|Scripts for monitoring system health and performance metrics.|https://git.technopunk.space/tomi/Scripts/raw/branch/main/monitor/monitor.sh https://git.technopunk.space/tomi/Scripts/raw/branch/main/monitor/monitor.conf"
|
||||||
SCRIPT_PACKAGES["keymanager"]="Key Manager|1.2.1|A utility for managing HDB user keys for SAP HANA.|https://git.technopunk.space/tomi/Scripts/raw/branch/main/keymanager.sh"
|
SCRIPT_PACKAGES["keymanager"]="Key Manager|1.2.1|A utility for managing HDB user keys for SAP HANA.|https://git.technopunk.space/tomi/Scripts/raw/branch/main/keymanager.sh"
|
||||||
SCRIPT_PACKAGES["cleaner"]="File Cleaner|1.1.0|A simple script to clean up temporary files and logs.|https://git.technopunk.space/tomi/Scripts/raw/branch/main/cleaner.sh"
|
SCRIPT_PACKAGES["cleaner"]="File Cleaner|1.1.0|A simple script to clean up temporary files and logs.|https://git.technopunk.space/tomi/Scripts/raw/branch/main/cleaner.sh"
|
||||||
SCRIPT_PACKAGES["hanatool"]="HANA Tool|1.5.0|A command-line tool for various SAP HANA administration tasks.|https://git.technopunk.space/tomi/Scripts/raw/branch/main/hanatool.sh"
|
SCRIPT_PACKAGES["hanatool"]="HANA Tool|1.5.0|A command-line tool for various SAP HANA administration tasks.|https://git.technopunk.space/tomi/Scripts/raw/branch/main/hanatool.sh"
|
||||||
|
|||||||
Reference in New Issue
Block a user