refactor(monitoring): simplify monitoring scripts and remove state tracking

- Remove consecutive breach tracking for statement queue (immediate alerts)
- Consolidate script initialization into init_script() function
- Remove unused helper functions (send_ok, run_as_hana_user, get_mount_point)
- Flatten sld_watchdog.sh structure by removing main() wrapper
- Remove state directory and lock directory configuration from hana.conf
- Simplify alert messages to include threshold values

This continues the simplification effort from previous commits by removing stateful tracking mechanisms and streamlining the monitoring logic for easier maintenance.
2026-03-12 22:18:29 +01:00
parent cf5b81889d
commit 0beef6fa48
6 changed files with 83 additions and 148 deletions
--- a/hana.conf
+++ b/hana.conf
@@ -33,7 +33,6 @@ DISK_USAGE_THRESHOLD=85
 TRUNCATED_PERCENTAGE_THRESHOLD=50
 FREE_PERCENTAGE_THRESHOLD=10
 STATEMENT_QUEUE_THRESHOLD=10
 STATEMENT_QUEUE_CONSECUTIVE_RUNS=3
 BACKUP_THRESHOLD_HOURS=32
 # --- Notification Configuration ---
@@ -45,10 +44,3 @@ COMPANY_NAME="My Company"
 LOG_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"
 LOG_FILE="${LOG_DIR}/hana_monitor.log"
 # --- State Directory ---
 STATE_DIR="${LOG_DIR}/monitor_state"
 mkdir -p "${STATE_DIR}"
 # --- Lock Directory ---
 LOCK_DIR="/tmp"
--- a/hana_disk.sh
+++ b/hana_disk.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 # =============================================================================
 # SAP HANA Disk Space Monitoring Script
-# Checks disk usage for configured directories with auto-cleanup capability
+# Checks disk usage for configured directories
 # =============================================================================
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"
--- a/hana_lib.sh
+++ b/hana_lib.sh
@@ -3,13 +3,22 @@
 # SAP HANA Monitoring Library - Shared Functions
 # =============================================================================
-# Logging function with script name prefix
+# Initialize script with common setup
-# Usage: log_message "SCRIPT_NAME" "message"
+# Usage: init_script "SCRIPT_NAME"
-log_message() {
+# Sets up: SCRIPT_DIR, SCRIPT_NAME, LOG_FILE, LOCK_DIR
-    local script_name="$1"
+init_script() {
-    local message="$2"
+    SCRIPT_NAME="$1"
-    local timestamp=$(date "+%Y-%m-%d %H:%M:%S")
+    SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"
-    echo "[${timestamp}] [${script_name}] ${message}" | tee -a "${LOG_FILE}"
+    
    # Load configuration
    source "${SCRIPT_DIR}/hana.conf"
    # Setup logging
    LOG_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"
    LOG_FILE="${LOG_DIR}/hana_monitor.log"
    # Setup lock directory
    LOCK_DIR="/tmp"
 }
 # Acquire lock for script execution
@@ -38,6 +47,15 @@ release_lock() {
    fi
 }
 # Logging function with script name prefix
 # Usage: log_message "SCRIPT_NAME" "message"
 log_message() {
    local script_name="$1"
    local message="$2"
    local timestamp=$(date "+%Y-%m-%d %H:%M:%S")
    echo "[${timestamp}] [${script_name}] ${message}" | tee -a "${LOG_FILE}"
 }
 # Send notification via ntfy
 # Usage: send_notification "TITLE" "MESSAGE"
 send_notification() {
@@ -64,23 +82,6 @@ send_alert() {
    log_message "$script_name" "ALERT: ${message}"
 }
 # Send OK notification (state change from alert to normal)
 # Usage: send_ok "SCRIPT_NAME" "TITLE_PREFIX" "MESSAGE"
 send_ok() {
    local script_name="$1"
    local title_prefix="$2"
    local message="$3"
    send_notification "${title_prefix} Resolved" "✅ Resolved: ${message}"
    log_message "$script_name" "RESOLVED: ${message}"
 }
 # Run command as HANA user using su
 # Usage: run_as_hana_user "COMMAND"
 run_as_hana_user() {
    local command="$1"
    su - "$HANA_USER" -c "$command"
 }
 # Execute SQL query as HANA user
 # Usage: execute_hana_sql "SQL_QUERY"
 # Returns: SQL output on stdout, returns 0 on success, 1 on failure
@@ -131,19 +132,3 @@ get_disk_usage_percentage() {
    fi
    df "$dir" 2>/dev/null | awk 'NR==2 {gsub(/%/,"",$5); print $5}'
 }
 # Get mount point for a directory
 # Usage: get_mount_point "/path/to/dir"
 # Returns: Mount point path
 get_mount_point() {
    local dir="$1"
    df "$dir" 2>/dev/null | awk 'NR==2 {print $NF}'
 }
 # Get available disk space in KB for a directory
 # Usage: get_available_space_kb "/path/to/dir"
 # Returns: Available space in KB
 get_available_space_kb() {
    local dir="$1"
    df -k "$dir" 2>/dev/null | awk 'NR==2 {print $4}'
 }
--- a/hana_log_segments.sh
+++ b/hana_log_segments.sh
@@ -72,25 +72,22 @@ log_message "$SCRIPT_NAME" "Truncated Segments: ${truncated_segments}"
 log_message "$SCRIPT_NAME" "Free Segments: ${free_segments}"
 if [ $total_segments -eq 0 ]; then
-    log_message "$SCRIPT_NAME" "WARNING: No log segments found. Skipping percentage checks."
+    log_message "$SCRIPT_NAME" "WARNING: No log segments found."
    send_alert "$SCRIPT_NAME" "HANA Log Segment Warning" "No log segments found."
    exit 1
 fi
-# Calculate truncated percentage with integer arithmetic
+# Calculate percentages
 truncated_percentage=$((truncated_segments * 100 / total_segments))
 if [ $truncated_percentage -gt $TRUNCATED_PERCENTAGE_THRESHOLD ]; then
    log_message "$SCRIPT_NAME" "ALERT: ${truncated_percentage}% of log segments are 'Truncated'."
    send_alert "$SCRIPT_NAME" "HANA Log Segment" "${truncated_percentage}% of HANA log segments are in 'Truncated' state."
 fi
 # Calculate free percentage with integer arithmetic
 free_percentage=$((free_segments * 100 / total_segments))
 # Check thresholds and alert
 if [ $truncated_percentage -gt $TRUNCATED_PERCENTAGE_THRESHOLD ]; then
    send_alert "$SCRIPT_NAME" "HANA Log Segment" "${truncated_percentage}% of log segments are 'Truncated' (threshold: ${TRUNCATED_PERCENTAGE_THRESHOLD}%)."
 fi
 if [ $free_percentage -lt $FREE_PERCENTAGE_THRESHOLD ]; then
-    log_message "$SCRIPT_NAME" "ALERT: Only ${free_percentage}% of log segments are 'Free'."
+    send_alert "$SCRIPT_NAME" "HANA Log Segment" "Only ${free_percentage}% of log segments are 'Free' (threshold: ${FREE_PERCENTAGE_THRESHOLD}%)."
    send_alert "$SCRIPT_NAME" "HANA Log Segment" "Only ${free_percentage}% of HANA log segments are in 'Free' state."
 fi
 log_message "$SCRIPT_NAME" "Log segment check complete."
--- a/hana_queue.sh
+++ b/hana_queue.sh
@@ -30,7 +30,7 @@ fi
 STATEMENT_QUEUE_SQL="SELECT COUNT(*) FROM M_SERVICE_THREADS WHERE THREAD_TYPE = 'SqlExecutor' AND THREAD_STATE = 'Queueing';"
 # Execute SQL query
-queue_result=$(execute_hana_sql_query "$STATEMENT_QUEUE_SQL")
+queue_count=$(execute_hana_sql_query "$STATEMENT_QUEUE_SQL")
 sql_status=$?
 if [ $sql_status -ne 0 ]; then
@@ -48,30 +48,11 @@ fi
 log_message "$SCRIPT_NAME" "Current statement queue length: ${queue_count}"
-# Get breach count from state file
+# Alert immediately if queue exceeds threshold
 breach_count_file="${STATE_DIR}/statement_queue_breach_count"
 breach_count=0
 if [ -f "$breach_count_file" ]; then
    breach_count=$(cat "$breach_count_file")
 fi
 if [ "$queue_count" -gt "$STATEMENT_QUEUE_THRESHOLD" ]; then
-    breach_count=$((breach_count + 1))
+    send_alert "$SCRIPT_NAME" "HANA Statement Queue" "Statement queue count is ${queue_count}, which exceeds threshold of ${STATEMENT_QUEUE_THRESHOLD}."
    log_message "$SCRIPT_NAME" "Statement queue is above threshold (${queue_count} > ${STATEMENT_QUEUE_THRESHOLD}). Consecutive breach count: ${breach_count}/${STATEMENT_QUEUE_CONSECUTIVE_RUNS}."
 else
    if [ "$breach_count" -gt 0 ]; then
        log_message "$SCRIPT_NAME" "Statement queue returned to normal. Resetting breach count from ${breach_count} to 0."
    fi
    breach_count=0
 fi
 echo "$breach_count" > "$breach_count_file"
 if [ "$breach_count" -ge "$STATEMENT_QUEUE_CONSECUTIVE_RUNS" ]; then
    message="Statement queue has been over ${STATEMENT_QUEUE_THRESHOLD} for ${breach_count} checks. Current count: ${queue_count}."
    send_alert "$SCRIPT_NAME" "HANA Statement Queue" "$message"
    exit 1
 else
    log_message "$SCRIPT_NAME" "Statement queue is normal. Current count: ${queue_count}."
 fi
 log_message "$SCRIPT_NAME" "Statement queue is normal. Current count: ${queue_count}."
 log_message "$SCRIPT_NAME" "Statement queue check complete."
--- a/sld_watchdog.sh
+++ b/sld_watchdog.sh
@@ -21,12 +21,10 @@ fi
 trap 'release_lock "$SCRIPT_NAME"' EXIT
 # Function to check SLD health
 # Returns HTTP status code or "0" for connection errors
 check_sld_health() {
    local http_status
    http_status=$(curl -k -s -o /dev/null -w "%{http_code}" -m "$SLD_TIMEOUT" --connect-timeout "$SLD_TIMEOUT" "$SLD_URL" 2>/dev/null)
    # Handle curl errors (returns 000 on connection failure)
    if [ -z "$http_status" ] || [ "$http_status" == "000" ]; then
        echo "0"
    else
@@ -38,7 +36,6 @@ check_sld_health() {
 restart_sld_service() {
    log_message "$SCRIPT_NAME" "Attempting to restart SLD service..."
    # Try systemctl first
    if command -v systemctl &> /dev/null; then
        systemctl restart sapb1servertools 2>&1
        local restart_status=$?
@@ -50,57 +47,48 @@ restart_sld_service() {
            return 1
        fi
    else
-        log_message "$SCRIPT_NAME" "systemctl not available, trying alternative restart methods"
+        log_message "$SCRIPT_NAME" "systemctl not available, trying service command"
        # Fallback: try service command
        service sapb1servertools restart 2>&1
        return $?
    fi
 }
-# Main monitoring logic
+log_message "$SCRIPT_NAME" "Checking SLD service health at ${SLD_URL}..."
 main() {
    log_message "$SCRIPT_NAME" "Checking SLD service health at ${SLD_URL}..."
-    local http_status
+http_status=$(check_sld_health)
    http_status=$(check_sld_health)
-    # 200 OK or 401 Unauthorized indicate the Tomcat layer is actively processing
+# 200 OK or 401 Unauthorized indicate the Tomcat layer is actively processing
-    if [[ $http_status == 200 || $http_status == 401 ]]; then
+if [[ $http_status == 200 || $http_status == 401 ]]; then
    log_message "$SCRIPT_NAME" "[OK] SLD service healthy (HTTP Status: $http_status)"
-        return 0
+    exit 0
-    fi
+fi
-    # Service is down or unresponsive
+# Service is down or unresponsive
-    local status_detail
+if [ "$http_status" == "0" ]; then
    if [ "$http_status" == "0" ]; then
    status_detail="Connection failed or timeout"
-    else
+else
    status_detail="HTTP Status: ${http_status}"
-    fi
+fi
-    log_message "$SCRIPT_NAME" "[ALERT] SLD dead or hung (${status_detail}). Restarting service..."
+log_message "$SCRIPT_NAME" "[ALERT] SLD dead or hung (${status_detail}). Restarting service..."
 send_alert "$SCRIPT_NAME" "SLD Service Critical" "SLD service is down (${status_detail}). Restarting ${SLD_URL}"
-    # Send notification
+# Restart the service
-    send_alert "$SCRIPT_NAME" "SLD Service Critical" "SLD service is down (${status_detail}). Restarting ${SLD_URL}"
+if ! restart_sld_service; then
    # Restart the service
    if ! restart_sld_service; then
    log_message "$SCRIPT_NAME" "[CRITICAL] Failed to restart SLD service"
    send_alert "$SCRIPT_NAME" "SLD Service Critical" "Failed to restart SLD service"
-        return 1
+    exit 1
-    fi
+fi
-    # Allow service to spin up, then log recovery status
+# Allow service to spin up, then log recovery status
-    log_message "$SCRIPT_NAME" "Waiting 15 seconds for service to restart..."
+log_message "$SCRIPT_NAME" "Waiting 15 seconds for service to restart..."
-    sleep 15
+sleep 15
-    local recovery_status
+recovery_status=$(check_sld_health)
    recovery_status=$(check_sld_health)
-    if [[ $recovery_status == 200 || $recovery_status == 401 ]]; then
+if [[ $recovery_status == 200 || $recovery_status == 401 ]]; then
    log_message "$SCRIPT_NAME" "[RECOVERY] SLD service recovered successfully (HTTP Status: $recovery_status)"
-    else
+else
        local recovery_detail
    if [ "$recovery_status" == "0" ]; then
        recovery_detail="Connection failed after restart"
    else
@@ -108,15 +96,7 @@ main() {
    fi
    log_message "$SCRIPT_NAME" "[CRITICAL] SLD service failed to recover after restart (${recovery_detail})"
    send_alert "$SCRIPT_NAME" "SLD Service Critical" "SLD service FAILED to recover after restart (${recovery_detail})"
-        return 1
+    exit 1
-    fi
+fi
    return 0
 }
 # Run main function
 main
 exit_code=$?
 log_message "$SCRIPT_NAME" "SLD watchdog check complete."
 exit $exit_code