#!/bin/bash
# Version: 1.1.0
# =============================================================================
# SAP HANA Monitoring Script
#
# Checks HANA processes, disk usage, log segment state, and the age of the
# last successful complete data backup.
# Sends ntfy.sh notifications if thresholds are exceeded.
# =============================================================================

# --- Lock File Implementation ---
# Allows only one instance of the monitor to run at a time.
# The lock file stores the PID of the owning process so that a lock left
# behind by a killed run (SIGKILL, power loss) can be recognised as stale
# instead of blocking every future run.
LOCK_FILE="/tmp/hana_monitor.lock"

# With noclobber enabled, '>' refuses to overwrite an existing file, so the
# "does it exist?" check and the creation happen as one atomic step. The
# subshell keeps noclobber from leaking into the rest of the script; "$$"
# still expands to the PID of the main script inside it.
if ! (set -o noclobber; printf '%s\n' "$$" > "$LOCK_FILE") 2>/dev/null; then
  lock_owner_pid=$(cat "$LOCK_FILE" 2>/dev/null)
  if [[ "$lock_owner_pid" =~ ^[0-9]+$ ]] && ps -p "$lock_owner_pid" >/dev/null 2>&1; then
    echo "▶️ Script is already running. Exiting."
    exit 1
  fi
  # No live owner (or an empty / unreadable lock): treat the lock as stale
  # and take it over.
  echo "⚠️ Warning: Removing stale lock file ${LOCK_FILE}." >&2
  if ! printf '%s\n' "$$" > "$LOCK_FILE"; then
    echo "❌ Error: Cannot write lock file ${LOCK_FILE}" >&2
    exit 1
  fi
fi

# Ensure lock file is removed on script exit. The trap is installed only
# after the lock is ours, so bailing out above never deletes a lock that
# belongs to another running instance.
trap 'rm -f "$LOCK_FILE"' EXIT

# --- Configuration and Setup ---
# monitor.conf is looked up next to this script, independent of the caller's
# working directory.
_script_source_dir="$(dirname "${BASH_SOURCE[0]}")"
SCRIPT_DIR="$(cd "${_script_source_dir}" &>/dev/null && pwd)"
CONFIG_FILE="${SCRIPT_DIR}/monitor.conf"

if [[ -f "$CONFIG_FILE" ]]; then
  # The config file is plain shell and is executed in this shell.
  . "$CONFIG_FILE"
else
  printf '%s\n' "❌ Error: Configuration file not found at ${CONFIG_FILE}" >&2
  rm -f "$LOCK_FILE"
  exit 1
fi

# Host name included in every notification.
HOSTNAME="$(hostname)"

# Counts log segments grouped by host, service name and segment state.
SQL_QUERY="SELECT b.host, b.service_name, a.state, count(*) FROM PUBLIC.M_LOG_SEGMENTS a JOIN PUBLIC.M_SERVICES b ON (a.host = b.host AND a.port = b.port) GROUP BY b.host, b.service_name, a.state;"

#######################################
# Publishes a notification to the configured ntfy topic.
# Globals:
#   COMPANY_NAME, HOSTNAME (read) - prefixed to the message body
#   NTFY_TOKEN (read)             - bearer token for the Authorization header
#   NTFY_TOPIC_URL (read)         - URL the message is POSTed to
# Arguments:
#   $1 - notification title
#   $2 - notification message
# Outputs:
#   A warning on stderr when delivery fails; nothing on success.
# Returns:
#   0 on success, otherwise curl's exit status.
#######################################
send_notification() {
  local title="$1"
  local message="$2"
  local full_message="[${COMPANY_NAME} | ${HOSTNAME}] ${message}"
  local rc=0

  # --connect-timeout/--max-time keep an unreachable endpoint from hanging
  # the monitor (and holding its lock) indefinitely; --fail turns HTTP
  # error responses into a non-zero exit status instead of silent success.
  curl --silent --show-error --fail \
    --connect-timeout 10 --max-time 30 \
    -H "Authorization: Bearer ${NTFY_TOKEN}" \
    -H "Title: ${title}" \
    -d "${full_message}" \
    "${NTFY_TOPIC_URL}" > /dev/null || rc=$?

  if (( rc != 0 )); then
    echo "⚠️ Warning: Failed to deliver notification '${title}' (curl exit ${rc})." >&2
  fi
  return "$rc"
}

# --- HANA Process Status ---
# Runs "sapcontrol -function GetProcessList" for the configured instance and
# raises an alert unless every listed process line contains GREEN. Exits the
# script with status 1 on any problem, because the later checks depend on a
# running database.
echo "⚙️ Checking HANA process status..."
if [ ! -x "$SAPCONTROL_PATH" ]; then
  echo "❌ Error: sapcontrol not found or not executable at ${SAPCONTROL_PATH}" >&2
  send_notification "HANA Monitor Error" "❌ Error: sapcontrol not found or not executable at ${SAPCONTROL_PATH}"
  exit 1
fi

# The first five output lines are skipped as header; blank lines are dropped
# so that only process entries remain.
# NOTE(review): sapcontrol's exit status is deliberately not used here; it
# appears to encode the process state rather than plain success/failure.
# Confirm against the sapcontrol documentation before relying on it.
process_rows=$("${SAPCONTROL_PATH}" -nr "${HANA_INSTANCE_NR}" -function GetProcessList \
  | tail -n +6 \
  | grep -v '^[[:space:]]*$')

# An empty list means sapcontrol produced no process entries at all (for
# example because the call itself failed). That must not be reported as
# "all GREEN".
if [ -z "$process_rows" ]; then
  echo "🚨 Alert: sapcontrol returned no process list; HANA process state is unknown!" >&2
  send_notification "HANA Process Alert" "🚨 Critical: sapcontrol GetProcessList returned no process entries. HANA may be down or sapcontrol failed."
  exit 1
fi

non_green_processes=$(grep -v 'GREEN' <<<"$process_rows")

if [ -n "$non_green_processes" ]; then
  echo "🚨 Alert: One or more HANA processes are not running!" >&2
  echo "$non_green_processes" >&2
  send_notification "HANA Process Alert" "🚨 Critical: One or more HANA processes are not GREEN. Problem processes: ${non_green_processes}"
  exit 1 # Exit early as other checks might fail
fi
echo "✅ Success! All HANA processes are GREEN."

# --- Disk Space Monitoring ---

#######################################
# Prints the used-space percentage (digits only, no '%') of the filesystem
# that holds the given path.
# Arguments:
#   $1 - path to inspect
# Outputs:
#   The percentage on stdout.
# Returns:
#   0 on success, 1 if df produced no usable numeric value.
#######################################
disk_usage_percent() {
  local dir="$1"
  local pct
  # -P selects the POSIX output format: one line per filesystem, so the
  # percentage is always field 5 of line 2, even for long device names that
  # the default format may wrap onto a second line.
  pct=$(df -P -- "$dir" 2>/dev/null | awk 'NR==2 { sub(/%$/, "", $5); print $5 }')
  [[ "$pct" =~ ^[0-9]+$ ]] || return 1
  printf '%s\n' "$pct"
}

echo "ℹ️ Checking disk usage..."
for dir in "${DIRECTORIES_TO_MONITOR[@]}"; do
  if [ ! -d "$dir" ]; then
    echo "⚠️ Warning: Directory '$dir' not found. Skipping." >&2
    continue
  fi
  if ! usage=$(disk_usage_percent "$dir"); then
    echo "⚠️ Warning: Could not determine disk usage for '$dir'. Skipping." >&2
    continue
  fi
  echo " - ${dir} is at ${usage}%"
  # awk does the comparison so that a fractional threshold in the config
  # keeps working without requiring bc to be installed.
  if awk -v used="$usage" -v limit="$DISK_USAGE_THRESHOLD" 'BEGIN { exit !(used + 0 > limit + 0) }'; then
    echo "🚨 Alert: ${dir} usage is at ${usage}% which is above the ${DISK_USAGE_THRESHOLD}% threshold." >&2
    send_notification "HANA Disk Alert" "🚨 Critical: Disk usage for ${dir} is at ${usage}%."
  fi
done

# --- HANA Log Segment Monitoring ---
# Runs SQL_QUERY through hdbsql and stores the output lines in the
# sql_output array for the parsing step that follows. Exits with status 1
# (after sending a notification) if hdbsql is missing or the query fails.
echo "⚙️ Executing HANA SQL query..."
if [ ! -x "$HDBSQL_PATH" ]; then
  echo "❌ Error: hdbsql not found or not executable at ${HDBSQL_PATH}" >&2
  send_notification "HANA Monitor Error" "❌ Error: hdbsql not found or not executable at ${HDBSQL_PATH}"
  exit 1
fi

# Capture the output with a command substitution so that $? is hdbsql's own
# exit status. Feeding readarray from a process substitution would leave
# readarray's status in $? and the failure branch below could never trigger.
sql_raw_output=$("$HDBSQL_PATH" -U "$HANA_USER_KEY" -c ";" "$SQL_QUERY" 2>&1)
hdbsql_status=$?
if [ "$hdbsql_status" -ne 0 ]; then
  echo "❌ Failure! The hdbsql command failed. Please check logs." >&2
  # stderr was merged into the captured output, so it carries the error text.
  error_message="$sql_raw_output"
  send_notification "HANA Monitor Error" "❌ Failure! The hdbsql command failed. Details: ${error_message}"
  exit 1
fi
readarray -t sql_output <<<"$sql_raw_output"

#######################################
# Sums the per-state log segment counts found in the sql_output array.
# Each data row is expected as comma-separated "host,service,state,count";
# double quotes around fields are stripped before splitting.
# Globals:
#   sql_output (read)          - array of hdbsql output lines
#   total_segments (written)   - sum of all counts
#   truncated_segments (written) - sum of counts whose state is "Truncated"
#   free_segments (written)    - sum of counts whose state is "Free"
# Arguments:
#   None
#######################################
tally_log_segments() {
  local row state count _host _service _rest

  total_segments=0
  truncated_segments=0
  free_segments=0

  for row in "${sql_output[@]}"; do
    # Skip blank lines and the column header line.
    if [[ -z "$row" || "$row" == *"STATE"* ]]; then continue; fi

    # Split with the read builtin instead of spawning tr/awk per row;
    # _rest swallows any unexpected extra columns.
    IFS=',' read -r _host _service state count _rest <<<"${row//\"/}"
    # Trim surrounding blanks from the two fields that are used.
    IFS=$' \t' read -r state <<<"$state"
    IFS=$' \t' read -r count <<<"$count"

    # Only plain integers may reach the arithmetic below: this skips status
    # lines (no commas, hence no count) and keeps arbitrary text from being
    # evaluated as an arithmetic expression.
    [[ "$count" =~ ^[0-9]+$ ]] || continue

    # 10# forces base 10 so a value with leading zeros is not read as octal.
    total_segments=$(( total_segments + 10#$count ))
    if [[ "$state" == "Truncated" ]]; then
      truncated_segments=$(( truncated_segments + 10#$count ))
    elif [[ "$state" == "Free" ]]; then
      free_segments=$(( free_segments + 10#$count ))
    fi
  done
}

tally_log_segments

echo "ℹ️ Total Segments: ${total_segments}"
echo "ℹ️ Truncated Segments: ${truncated_segments}"
echo "ℹ️ Free Segments: ${free_segments}"

#######################################
# Compares the share of 'Truncated' and 'Free' log segments against the
# configured thresholds and sends a notification for each violation.
# Globals:
#   total_segments, truncated_segments, free_segments (read)
#   TRUNCATED_PERCENTAGE_THRESHOLD (read) - alert when truncated % is above
#   FREE_PERCENTAGE_THRESHOLD (read)      - alert when free % is below
#   truncated_percentage, free_percentage (written) - integer percentages
# Arguments:
#   None
# Returns:
#   0 always; with zero segments the percentage checks are skipped.
#######################################
check_log_segment_thresholds() {
  local total="${total_segments:-0}"
  local truncated="${truncated_segments:-0}"
  local free="${free_segments:-0}"

  if (( total == 0 )); then
    echo "⚠️ Warning: No log segments found. Skipping percentage checks." >&2
    # Return instead of exiting the script, so that the checks which follow
    # this one still run.
    return 0
  fi

  # Integer percentages (rounded down).
  truncated_percentage=$(( truncated * 100 / total ))
  free_percentage=$(( free * 100 / total ))

  # awk does the comparisons so that fractional thresholds in the config
  # keep working without requiring bc to be installed.
  if awk -v pct="$truncated_percentage" -v limit="$TRUNCATED_PERCENTAGE_THRESHOLD" \
    'BEGIN { exit !(pct + 0 > limit + 0) }'; then
    echo "🚨 Alert: ${truncated_percentage}% of log segments are 'Truncated'." >&2
    send_notification "HANA Log Segment Alert" "🚨 Alert: ${truncated_percentage}% of HANA log segments are in 'Truncated' state."
  fi

  if awk -v pct="$free_percentage" -v limit="$FREE_PERCENTAGE_THRESHOLD" \
    'BEGIN { exit !(pct + 0 < limit + 0) }'; then
    echo "🚨 Alert: Only ${free_percentage}% of log segments are 'Free'." >&2
    send_notification "HANA Log Segment Alert" "🚨 Alert: Only ${free_percentage}% of HANA log segments are in 'Free' state."
  fi
  return 0
}

check_log_segment_thresholds

#######################################
# Checks the age of the most recent successful complete data backup and
# sends a notification when none exists or it is older than the threshold.
# Lives in a function because it relies on 'local' and 'return', which are
# only valid inside one.
# Globals:
#   HDBSQL_PATH (read)             - hdbsql binary; falls back to 'hdbsql'
#                                    from PATH when unset
#   HANA_USER_KEY (read)           - hdbuserstore key; HANA_USERKEY is
#                                    accepted as a fallback spelling
#   BACKUP_THRESHOLD_HOURS (read)  - maximum accepted backup age in hours
#   COMPANY_NAME (read)            - used in the alert text
# Arguments:
#   None
# Returns:
#   0 when the check ran (whether or not it alerted), 1 when the backup
#   catalog could not be queried or the timestamp could not be parsed.
#######################################
check_backup_status() {
  local hdbsql_bin="${HDBSQL_PATH:-hdbsql}"
  local user_key="${HANA_USER_KEY:-${HANA_USERKEY:-}}"
  # Start time of the most recent successful complete data backup.
  local backup_query="SELECT TOP 1 SYS_START_TIME FROM M_BACKUP_CATALOG WHERE ENTRY_TYPE_NAME = 'complete data backup' AND STATE_NAME = 'successful' ORDER BY SYS_START_TIME DESC"
  local raw_output last_backup_date last_backup_epoch current_epoch
  local threshold_seconds age_seconds age_hours message

  # A failing hdbsql call is reported as a monitoring error instead of being
  # mistaken for "no backup exists".
  if ! raw_output=$("$hdbsql_bin" -U "$user_key" -j -a -x "$backup_query" 2>/dev/null); then
    message="❌ Failure! Could not query the HANA backup catalog (hdbsql failed)."
    echo "$message" >&2
    send_notification "HANA Monitor Error" "$message"
    return 1
  fi

  # Strip quotes, fractional seconds and surrounding blanks from the first
  # output line.
  last_backup_date=$(printf '%s\n' "$raw_output" \
    | head -n 1 \
    | tr -d '"' \
    | sed -e 's/\..*//' -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')

  if [[ -z "$last_backup_date" ]]; then
    # No successful backup found at all
    message="🚨 Critical: No successful complete data backup found for ${COMPANY_NAME} HANA."
    echo "$message"
    send_notification "HANA Backup Alert" "$message"
    return 0
  fi

  # Convert dates to epoch seconds for comparison
  if ! last_backup_epoch=$(date -d "$last_backup_date" +%s 2>/dev/null); then
    message="❌ Failure! Could not parse last backup timestamp '${last_backup_date}'."
    echo "$message" >&2
    send_notification "HANA Monitor Error" "$message"
    return 1
  fi
  current_epoch=$(date +%s)
  threshold_seconds=$(( BACKUP_THRESHOLD_HOURS * 3600 ))

  age_seconds=$(( current_epoch - last_backup_epoch ))
  age_hours=$(( age_seconds / 3600 ))

  if (( age_seconds > threshold_seconds )); then
    message="🚨 Critical: Last successful HANA backup for ${COMPANY_NAME} is ${age_hours} hours old, which exceeds the threshold of ${BACKUP_THRESHOLD_HOURS} hours. Last backup was on: ${last_backup_date}."
    echo "$message"
    send_notification "HANA Backup Alert" "$message"
  else
    echo "✅ Success! Last successful backup is ${age_hours} hours old (Threshold: ${BACKUP_THRESHOLD_HOURS} hours)."
  fi
  return 0
}

echo "ℹ️ Checking last successful data backup status..."
check_backup_status

echo "✅ Success! HANA monitoring check complete."