refactor(monitoring): simplify monitoring scripts and remove state tracking

- Remove consecutive breach tracking for statement queue (immediate alerts)
- Consolidate script initialization into init_script() function
- Remove unused helper functions (send_ok, run_as_hana_user, get_mount_point, get_available_space_kb)
- Flatten sld_watchdog.sh structure by removing main() wrapper
- Remove state directory and lock directory configuration from hana.conf
- Simplify alert messages to include threshold values

This continues the simplification effort from previous commits by removing stateful tracking mechanisms and streamlining the monitoring logic for easier maintenance.
This commit is contained in:
2026-03-12 22:18:29 +01:00
parent cf5b81889d
commit 0beef6fa48
6 changed files with 83 additions and 148 deletions

View File

@@ -33,7 +33,6 @@ DISK_USAGE_THRESHOLD=85
TRUNCATED_PERCENTAGE_THRESHOLD=50 TRUNCATED_PERCENTAGE_THRESHOLD=50
FREE_PERCENTAGE_THRESHOLD=10 FREE_PERCENTAGE_THRESHOLD=10
STATEMENT_QUEUE_THRESHOLD=10 STATEMENT_QUEUE_THRESHOLD=10
STATEMENT_QUEUE_CONSECUTIVE_RUNS=3
BACKUP_THRESHOLD_HOURS=32 BACKUP_THRESHOLD_HOURS=32
# --- Notification Configuration --- # --- Notification Configuration ---
@@ -45,10 +44,3 @@ COMPANY_NAME="My Company"
LOG_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd)" LOG_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"
LOG_FILE="${LOG_DIR}/hana_monitor.log" LOG_FILE="${LOG_DIR}/hana_monitor.log"
# --- State Directory ---
STATE_DIR="${LOG_DIR}/monitor_state"
mkdir -p "${STATE_DIR}"
# --- Lock Directory ---
LOCK_DIR="/tmp"

View File

@@ -1,7 +1,7 @@
#!/bin/bash #!/bin/bash
# ============================================================================= # =============================================================================
# SAP HANA Disk Space Monitoring Script # SAP HANA Disk Space Monitoring Script
# Checks disk usage for configured directories with auto-cleanup capability # Checks disk usage for configured directories
# ============================================================================= # =============================================================================
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd)" SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"

View File

@@ -3,13 +3,22 @@
# SAP HANA Monitoring Library - Shared Functions # SAP HANA Monitoring Library - Shared Functions
# ============================================================================= # =============================================================================
# Logging function with script name prefix # Initialize script with common setup
# Usage: log_message "SCRIPT_NAME" "message" # Usage: init_script "SCRIPT_NAME"
log_message() { # Sets up: SCRIPT_DIR, SCRIPT_NAME, LOG_FILE, LOCK_DIR
local script_name="$1" init_script() {
local message="$2" SCRIPT_NAME="$1"
local timestamp=$(date "+%Y-%m-%d %H:%M:%S") SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"
echo "[${timestamp}] [${script_name}] ${message}" | tee -a "${LOG_FILE}"
# Load configuration
source "${SCRIPT_DIR}/hana.conf"
# Setup logging
LOG_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"
LOG_FILE="${LOG_DIR}/hana_monitor.log"
# Setup lock directory
LOCK_DIR="/tmp"
} }
# Acquire lock for script execution # Acquire lock for script execution
@@ -38,6 +47,15 @@ release_lock() {
fi fi
} }
# Emit one timestamped log line tagged with the calling script's name.
# Usage: log_message "SCRIPT_NAME" "message"
# Globals: LOG_FILE (read) - file the line is appended to
# Outputs: the formatted line on stdout, and the same line appended to LOG_FILE
log_message() {
  local tag="$1"
  local text="$2"
  local now
  now=$(date "+%Y-%m-%d %H:%M:%S")
  # tee -a both shows the line on stdout and appends it to the log file.
  printf '%s\n' "[${now}] [${tag}] ${text}" | tee -a "${LOG_FILE}"
}
# Send notification via ntfy # Send notification via ntfy
# Usage: send_notification "TITLE" "MESSAGE" # Usage: send_notification "TITLE" "MESSAGE"
send_notification() { send_notification() {
@@ -64,23 +82,6 @@ send_alert() {
log_message "$script_name" "ALERT: ${message}" log_message "$script_name" "ALERT: ${message}"
} }
# Report that a condition has gone from alert back to normal.
# Usage: send_ok "SCRIPT_NAME" "TITLE_PREFIX" "MESSAGE"
# Sends a "<TITLE_PREFIX> Resolved" notification via send_notification, then
# writes a matching RESOLVED entry via log_message under SCRIPT_NAME.
send_ok() {
  local caller="$1" heading="$2" detail="$3"

  send_notification "${heading} Resolved" "✅ Resolved: ${detail}"
  log_message "${caller}" "RESOLVED: ${detail}"
}
# Execute a command string as the user named by HANA_USER, through `su -`.
# Usage: run_as_hana_user "COMMAND"
# Globals: HANA_USER (read)
# Returns: whatever status su returns
run_as_hana_user() {
  local cmd_string="$1"

  su - "${HANA_USER}" -c "${cmd_string}"
}
# Execute SQL query as HANA user # Execute SQL query as HANA user
# Usage: execute_hana_sql "SQL_QUERY" # Usage: execute_hana_sql "SQL_QUERY"
# Returns: SQL output on stdout, returns 0 on success, 1 on failure # Returns: SQL output on stdout, returns 0 on success, 1 on failure
@@ -131,19 +132,3 @@ get_disk_usage_percentage() {
fi fi
df "$dir" 2>/dev/null | awk 'NR==2 {gsub(/%/,"",$5); print $5}' df "$dir" 2>/dev/null | awk 'NR==2 {gsub(/%/,"",$5); print $5}'
} }
# Get mount point for a directory
# Usage: get_mount_point "/path/to/dir"
# Returns: Mount point path on stdout (last field of the df data row);
#          prints nothing if df fails for the path.
get_mount_point() {
  local dir="$1"
  # -P selects the POSIX output format: exactly one line per filesystem.
  # Without it, some df versions wrap a long device name (typical for LVM
  # paths) onto its own line, and row 2 would then be the device name
  # rather than the row holding the mount point.
  df -P "$dir" 2>/dev/null | awk 'NR==2 {print $NF}'
}
# Get available disk space in KB for a directory
# Usage: get_available_space_kb "/path/to/dir"
# Returns: Available space in KB on stdout (4th field of the df data row);
#          prints nothing if df fails for the path.
get_available_space_kb() {
  local dir="$1"
  # -k reports sizes in 1024-byte units; -P selects the POSIX output format
  # (one line per filesystem), so a long device name cannot wrap and shift
  # the columns off row 2.
  df -Pk "$dir" 2>/dev/null | awk 'NR==2 {print $4}'
}

View File

@@ -72,25 +72,22 @@ log_message "$SCRIPT_NAME" "Truncated Segments: ${truncated_segments}"
log_message "$SCRIPT_NAME" "Free Segments: ${free_segments}" log_message "$SCRIPT_NAME" "Free Segments: ${free_segments}"
if [ $total_segments -eq 0 ]; then if [ $total_segments -eq 0 ]; then
log_message "$SCRIPT_NAME" "WARNING: No log segments found. Skipping percentage checks." log_message "$SCRIPT_NAME" "WARNING: No log segments found."
send_alert "$SCRIPT_NAME" "HANA Log Segment Warning" "No log segments found." send_alert "$SCRIPT_NAME" "HANA Log Segment Warning" "No log segments found."
exit 1 exit 1
fi fi
# Calculate truncated percentage with integer arithmetic # Calculate percentages
truncated_percentage=$((truncated_segments * 100 / total_segments)) truncated_percentage=$((truncated_segments * 100 / total_segments))
if [ $truncated_percentage -gt $TRUNCATED_PERCENTAGE_THRESHOLD ]; then
log_message "$SCRIPT_NAME" "ALERT: ${truncated_percentage}% of log segments are 'Truncated'."
send_alert "$SCRIPT_NAME" "HANA Log Segment" "${truncated_percentage}% of HANA log segments are in 'Truncated' state."
fi
# Calculate free percentage with integer arithmetic
free_percentage=$((free_segments * 100 / total_segments)) free_percentage=$((free_segments * 100 / total_segments))
# Check thresholds and alert
if [ $truncated_percentage -gt $TRUNCATED_PERCENTAGE_THRESHOLD ]; then
send_alert "$SCRIPT_NAME" "HANA Log Segment" "${truncated_percentage}% of log segments are 'Truncated' (threshold: ${TRUNCATED_PERCENTAGE_THRESHOLD}%)."
fi
if [ $free_percentage -lt $FREE_PERCENTAGE_THRESHOLD ]; then if [ $free_percentage -lt $FREE_PERCENTAGE_THRESHOLD ]; then
log_message "$SCRIPT_NAME" "ALERT: Only ${free_percentage}% of log segments are 'Free'." send_alert "$SCRIPT_NAME" "HANA Log Segment" "Only ${free_percentage}% of log segments are 'Free' (threshold: ${FREE_PERCENTAGE_THRESHOLD}%)."
send_alert "$SCRIPT_NAME" "HANA Log Segment" "Only ${free_percentage}% of HANA log segments are in 'Free' state."
fi fi
log_message "$SCRIPT_NAME" "Log segment check complete." log_message "$SCRIPT_NAME" "Log segment check complete."

View File

@@ -30,7 +30,7 @@ fi
STATEMENT_QUEUE_SQL="SELECT COUNT(*) FROM M_SERVICE_THREADS WHERE THREAD_TYPE = 'SqlExecutor' AND THREAD_STATE = 'Queueing';" STATEMENT_QUEUE_SQL="SELECT COUNT(*) FROM M_SERVICE_THREADS WHERE THREAD_TYPE = 'SqlExecutor' AND THREAD_STATE = 'Queueing';"
# Execute SQL query # Execute SQL query
queue_result=$(execute_hana_sql_query "$STATEMENT_QUEUE_SQL") queue_count=$(execute_hana_sql_query "$STATEMENT_QUEUE_SQL")
sql_status=$? sql_status=$?
if [ $sql_status -ne 0 ]; then if [ $sql_status -ne 0 ]; then
@@ -48,30 +48,11 @@ fi
log_message "$SCRIPT_NAME" "Current statement queue length: ${queue_count}" log_message "$SCRIPT_NAME" "Current statement queue length: ${queue_count}"
# Get breach count from state file # Alert immediately if queue exceeds threshold
breach_count_file="${STATE_DIR}/statement_queue_breach_count"
breach_count=0
if [ -f "$breach_count_file" ]; then
breach_count=$(cat "$breach_count_file")
fi
if [ "$queue_count" -gt "$STATEMENT_QUEUE_THRESHOLD" ]; then if [ "$queue_count" -gt "$STATEMENT_QUEUE_THRESHOLD" ]; then
breach_count=$((breach_count + 1)) send_alert "$SCRIPT_NAME" "HANA Statement Queue" "Statement queue count is ${queue_count}, which exceeds threshold of ${STATEMENT_QUEUE_THRESHOLD}."
log_message "$SCRIPT_NAME" "Statement queue is above threshold (${queue_count} > ${STATEMENT_QUEUE_THRESHOLD}). Consecutive breach count: ${breach_count}/${STATEMENT_QUEUE_CONSECUTIVE_RUNS}."
else
if [ "$breach_count" -gt 0 ]; then
log_message "$SCRIPT_NAME" "Statement queue returned to normal. Resetting breach count from ${breach_count} to 0."
fi
breach_count=0
fi
echo "$breach_count" > "$breach_count_file"
if [ "$breach_count" -ge "$STATEMENT_QUEUE_CONSECUTIVE_RUNS" ]; then
message="Statement queue has been over ${STATEMENT_QUEUE_THRESHOLD} for ${breach_count} checks. Current count: ${queue_count}."
send_alert "$SCRIPT_NAME" "HANA Statement Queue" "$message"
exit 1 exit 1
else
log_message "$SCRIPT_NAME" "Statement queue is normal. Current count: ${queue_count}."
fi fi
log_message "$SCRIPT_NAME" "Statement queue is normal. Current count: ${queue_count}."
log_message "$SCRIPT_NAME" "Statement queue check complete." log_message "$SCRIPT_NAME" "Statement queue check complete."

View File

@@ -21,12 +21,10 @@ fi
trap 'release_lock "$SCRIPT_NAME"' EXIT trap 'release_lock "$SCRIPT_NAME"' EXIT
# Function to check SLD health # Function to check SLD health
# Returns HTTP status code or "0" for connection errors
check_sld_health() { check_sld_health() {
local http_status local http_status
http_status=$(curl -k -s -o /dev/null -w "%{http_code}" -m "$SLD_TIMEOUT" --connect-timeout "$SLD_TIMEOUT" "$SLD_URL" 2>/dev/null) http_status=$(curl -k -s -o /dev/null -w "%{http_code}" -m "$SLD_TIMEOUT" --connect-timeout "$SLD_TIMEOUT" "$SLD_URL" 2>/dev/null)
# Handle curl errors (returns 000 on connection failure)
if [ -z "$http_status" ] || [ "$http_status" == "000" ]; then if [ -z "$http_status" ] || [ "$http_status" == "000" ]; then
echo "0" echo "0"
else else
@@ -38,7 +36,6 @@ check_sld_health() {
restart_sld_service() { restart_sld_service() {
log_message "$SCRIPT_NAME" "Attempting to restart SLD service..." log_message "$SCRIPT_NAME" "Attempting to restart SLD service..."
# Try systemctl first
if command -v systemctl &> /dev/null; then if command -v systemctl &> /dev/null; then
systemctl restart sapb1servertools 2>&1 systemctl restart sapb1servertools 2>&1
local restart_status=$? local restart_status=$?
@@ -50,73 +47,56 @@ restart_sld_service() {
return 1 return 1
fi fi
else else
log_message "$SCRIPT_NAME" "systemctl not available, trying alternative restart methods" log_message "$SCRIPT_NAME" "systemctl not available, trying service command"
# Fallback: try service command
service sapb1servertools restart 2>&1 service sapb1servertools restart 2>&1
return $? return $?
fi fi
} }
# Main monitoring logic log_message "$SCRIPT_NAME" "Checking SLD service health at ${SLD_URL}..."
main() {
log_message "$SCRIPT_NAME" "Checking SLD service health at ${SLD_URL}..."
local http_status http_status=$(check_sld_health)
http_status=$(check_sld_health)
# 200 OK or 401 Unauthorized indicate the Tomcat layer is actively processing # 200 OK or 401 Unauthorized indicate the Tomcat layer is actively processing
if [[ $http_status == 200 || $http_status == 401 ]]; then if [[ $http_status == 200 || $http_status == 401 ]]; then
log_message "$SCRIPT_NAME" "[OK] SLD service healthy (HTTP Status: $http_status)" log_message "$SCRIPT_NAME" "[OK] SLD service healthy (HTTP Status: $http_status)"
return 0 exit 0
fi fi
# Service is down or unresponsive # Service is down or unresponsive
local status_detail if [ "$http_status" == "0" ]; then
if [ "$http_status" == "0" ]; then status_detail="Connection failed or timeout"
status_detail="Connection failed or timeout" else
status_detail="HTTP Status: ${http_status}"
fi
log_message "$SCRIPT_NAME" "[ALERT] SLD dead or hung (${status_detail}). Restarting service..."
send_alert "$SCRIPT_NAME" "SLD Service Critical" "SLD service is down (${status_detail}). Restarting ${SLD_URL}"
# Restart the service
if ! restart_sld_service; then
log_message "$SCRIPT_NAME" "[CRITICAL] Failed to restart SLD service"
send_alert "$SCRIPT_NAME" "SLD Service Critical" "Failed to restart SLD service"
exit 1
fi
# Allow service to spin up, then log recovery status
log_message "$SCRIPT_NAME" "Waiting 15 seconds for service to restart..."
sleep 15
recovery_status=$(check_sld_health)
if [[ $recovery_status == 200 || $recovery_status == 401 ]]; then
log_message "$SCRIPT_NAME" "[RECOVERY] SLD service recovered successfully (HTTP Status: $recovery_status)"
else
if [ "$recovery_status" == "0" ]; then
recovery_detail="Connection failed after restart"
else else
status_detail="HTTP Status: ${http_status}" recovery_detail="HTTP Status: $recovery_status"
fi fi
log_message "$SCRIPT_NAME" "[CRITICAL] SLD service failed to recover after restart (${recovery_detail})"
log_message "$SCRIPT_NAME" "[ALERT] SLD dead or hung (${status_detail}). Restarting service..." send_alert "$SCRIPT_NAME" "SLD Service Critical" "SLD service FAILED to recover after restart (${recovery_detail})"
exit 1
# Send notification fi
send_alert "$SCRIPT_NAME" "SLD Service Critical" "SLD service is down (${status_detail}). Restarting ${SLD_URL}"
# Restart the service
if ! restart_sld_service; then
log_message "$SCRIPT_NAME" "[CRITICAL] Failed to restart SLD service"
send_alert "$SCRIPT_NAME" "SLD Service Critical" "Failed to restart SLD service"
return 1
fi
# Allow service to spin up, then log recovery status
log_message "$SCRIPT_NAME" "Waiting 15 seconds for service to restart..."
sleep 15
local recovery_status
recovery_status=$(check_sld_health)
if [[ $recovery_status == 200 || $recovery_status == 401 ]]; then
log_message "$SCRIPT_NAME" "[RECOVERY] SLD service recovered successfully (HTTP Status: $recovery_status)"
else
local recovery_detail
if [ "$recovery_status" == "0" ]; then
recovery_detail="Connection failed after restart"
else
recovery_detail="HTTP Status: $recovery_status"
fi
log_message "$SCRIPT_NAME" "[CRITICAL] SLD service failed to recover after restart (${recovery_detail})"
send_alert "$SCRIPT_NAME" "SLD Service Critical" "SLD service FAILED to recover after restart (${recovery_detail})"
return 1
fi
return 0
}
# Run main function
main
exit_code=$?
log_message "$SCRIPT_NAME" "SLD watchdog check complete." log_message "$SCRIPT_NAME" "SLD watchdog check complete."
exit $exit_code