refactor(monitoring): simplify monitoring scripts and remove state tracking

- Remove consecutive breach tracking for statement queue (alerts now fire
  immediately when the threshold is exceeded)
- Consolidate script initialization into init_script() function
- Remove unused helper functions (send_ok, run_as_hana_user, get_mount_point,
  get_available_space_kb)
- Flatten sld_watchdog.sh structure by removing main() wrapper
- Remove state directory and lock directory configuration from hana.conf
- Simplify alert messages to include threshold values

This continues the simplification effort from previous commits by removing
stateful tracking mechanisms and streamlining the monitoring logic for easier
maintenance.
2026-03-12 22:18:29 +01:00
parent cf5b81889d
commit 0beef6fa48
6 changed files with 83 additions and 148 deletions
--- a/hana.conf
+++ b/hana.conf
@@ -33,7 +33,6 @@ DISK_USAGE_THRESHOLD=85
 TRUNCATED_PERCENTAGE_THRESHOLD=50
 FREE_PERCENTAGE_THRESHOLD=10
 STATEMENT_QUEUE_THRESHOLD=10
-STATEMENT_QUEUE_CONSECUTIVE_RUNS=3
 BACKUP_THRESHOLD_HOURS=32

 # --- Notification Configuration ---
@@ -45,10 +44,3 @@ COMPANY_NAME="My Company"
 LOG_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"
 LOG_FILE="${LOG_DIR}/hana_monitor.log"

-# --- State Directory ---
-STATE_DIR="${LOG_DIR}/monitor_state"
-mkdir -p "${STATE_DIR}"
-
-# --- Lock Directory ---
-LOCK_DIR="/tmp"
-
--- a/hana_disk.sh
+++ b/hana_disk.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 # =============================================================================
 # SAP HANA Disk Space Monitoring Script
-# Checks disk usage for configured directories with auto-cleanup capability
+# Checks disk usage for configured directories
 # =============================================================================

 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"
--- a/hana_lib.sh
+++ b/hana_lib.sh
@@ -3,13 +3,22 @@
 # SAP HANA Monitoring Library - Shared Functions
 # =============================================================================

-# Logging function with script name prefix
-# Usage: log_message "SCRIPT_NAME" "message"
-log_message() {
-    local script_name="$1"
-    local message="$2"
-    local timestamp=$(date "+%Y-%m-%d %H:%M:%S")
-    echo "[${timestamp}] [${script_name}] ${message}" | tee -a "${LOG_FILE}"
+# Initialize script with common setup
+# Usage: init_script "SCRIPT_NAME"
+# Sets up: SCRIPT_NAME, SCRIPT_DIR, LOG_DIR, LOG_FILE, LOCK_DIR (also sources hana.conf)
+init_script() {
+    SCRIPT_NAME="$1"
+    SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"
+    
+    # Load configuration
+    source "${SCRIPT_DIR}/hana.conf"
+    
+    # Setup logging
+    LOG_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"
+    LOG_FILE="${LOG_DIR}/hana_monitor.log"
+    
+    # Setup lock directory
+    LOCK_DIR="/tmp"
 }

 # Acquire lock for script execution
@@ -38,6 +47,15 @@ release_lock() {
    fi
 }

+# Log a timestamped message, prefixed with the script name, to stdout and to LOG_FILE
+# Usage: log_message "SCRIPT_NAME" "message"
+log_message() {
+    local script_name="$1"
+    local message="$2"
+    local timestamp=$(date "+%Y-%m-%d %H:%M:%S")
+    echo "[${timestamp}] [${script_name}] ${message}" | tee -a "${LOG_FILE}"
+}
+
 # Send notification via ntfy
 # Usage: send_notification "TITLE" "MESSAGE"
 send_notification() {
@@ -64,23 +82,6 @@ send_alert() {
    log_message "$script_name" "ALERT: ${message}"
 }

-# Send OK notification (state change from alert to normal)
-# Usage: send_ok "SCRIPT_NAME" "TITLE_PREFIX" "MESSAGE"
-send_ok() {
-    local script_name="$1"
-    local title_prefix="$2"
-    local message="$3"
-    send_notification "${title_prefix} Resolved" "✅ Resolved: ${message}"
-    log_message "$script_name" "RESOLVED: ${message}"
-}
-
-# Run command as HANA user using su
-# Usage: run_as_hana_user "COMMAND"
-run_as_hana_user() {
-    local command="$1"
-    su - "$HANA_USER" -c "$command"
-}
-
 # Execute SQL query as HANA user
 # Usage: execute_hana_sql "SQL_QUERY"
 # Returns: SQL output on stdout, returns 0 on success, 1 on failure
@@ -131,19 +132,3 @@ get_disk_usage_percentage() {
    fi
    df "$dir" 2>/dev/null | awk 'NR==2 {gsub(/%/,"",$5); print $5}'
 }
-
-# Get mount point for a directory
-# Usage: get_mount_point "/path/to/dir"
-# Returns: Mount point path
-get_mount_point() {
-    local dir="$1"
-    df "$dir" 2>/dev/null | awk 'NR==2 {print $NF}'
-}
-
-# Get available disk space in KB for a directory
-# Usage: get_available_space_kb "/path/to/dir"
-# Returns: Available space in KB
-get_available_space_kb() {
-    local dir="$1"
-    df -k "$dir" 2>/dev/null | awk 'NR==2 {print $4}'
-}
--- a/hana_log_segments.sh
+++ b/hana_log_segments.sh
@@ -72,25 +72,22 @@ log_message "$SCRIPT_NAME" "Truncated Segments: ${truncated_segments}"
 log_message "$SCRIPT_NAME" "Free Segments: ${free_segments}"

 if [ $total_segments -eq 0 ]; then
-    log_message "$SCRIPT_NAME" "WARNING: No log segments found. Skipping percentage checks."
+    log_message "$SCRIPT_NAME" "WARNING: No log segments found."
    send_alert "$SCRIPT_NAME" "HANA Log Segment Warning" "No log segments found."
    exit 1
 fi

-# Calculate truncated percentage with integer arithmetic
+# Calculate percentages (integer arithmetic; results are truncated)
 truncated_percentage=$((truncated_segments * 100 / total_segments))
-
-if [ $truncated_percentage -gt $TRUNCATED_PERCENTAGE_THRESHOLD ]; then
-    log_message "$SCRIPT_NAME" "ALERT: ${truncated_percentage}% of log segments are 'Truncated'."
-    send_alert "$SCRIPT_NAME" "HANA Log Segment" "${truncated_percentage}% of HANA log segments are in 'Truncated' state."
-fi
-
-# Calculate free percentage with integer arithmetic
 free_percentage=$((free_segments * 100 / total_segments))

+# Check thresholds and alert
+if [ $truncated_percentage -gt $TRUNCATED_PERCENTAGE_THRESHOLD ]; then
+    send_alert "$SCRIPT_NAME" "HANA Log Segment" "${truncated_percentage}% of log segments are 'Truncated' (threshold: ${TRUNCATED_PERCENTAGE_THRESHOLD}%)."
+fi
+
 if [ $free_percentage -lt $FREE_PERCENTAGE_THRESHOLD ]; then
-    log_message "$SCRIPT_NAME" "ALERT: Only ${free_percentage}% of log segments are 'Free'."
-    send_alert "$SCRIPT_NAME" "HANA Log Segment" "Only ${free_percentage}% of HANA log segments are in 'Free' state."
+    send_alert "$SCRIPT_NAME" "HANA Log Segment" "Only ${free_percentage}% of log segments are 'Free' (threshold: ${FREE_PERCENTAGE_THRESHOLD}%)."
 fi

 log_message "$SCRIPT_NAME" "Log segment check complete."
--- a/hana_queue.sh
+++ b/hana_queue.sh
@@ -30,7 +30,7 @@ fi
 STATEMENT_QUEUE_SQL="SELECT COUNT(*) FROM M_SERVICE_THREADS WHERE THREAD_TYPE = 'SqlExecutor' AND THREAD_STATE = 'Queueing';"

 # Execute SQL query
-queue_result=$(execute_hana_sql_query "$STATEMENT_QUEUE_SQL")
+queue_count=$(execute_hana_sql_query "$STATEMENT_QUEUE_SQL")
 sql_status=$?

 if [ $sql_status -ne 0 ]; then
@@ -48,30 +48,11 @@ fi

 log_message "$SCRIPT_NAME" "Current statement queue length: ${queue_count}"

-# Get breach count from state file
-breach_count_file="${STATE_DIR}/statement_queue_breach_count"
-breach_count=0
-if [ -f "$breach_count_file" ]; then
-    breach_count=$(cat "$breach_count_file")
-fi
-
+# Alert immediately if queue exceeds threshold
 if [ "$queue_count" -gt "$STATEMENT_QUEUE_THRESHOLD" ]; then
-    breach_count=$((breach_count + 1))
-    log_message "$SCRIPT_NAME" "Statement queue is above threshold (${queue_count} > ${STATEMENT_QUEUE_THRESHOLD}). Consecutive breach count: ${breach_count}/${STATEMENT_QUEUE_CONSECUTIVE_RUNS}."
-else
-    if [ "$breach_count" -gt 0 ]; then
-        log_message "$SCRIPT_NAME" "Statement queue returned to normal. Resetting breach count from ${breach_count} to 0."
-    fi
-    breach_count=0
-fi
-echo "$breach_count" > "$breach_count_file"
-
-if [ "$breach_count" -ge "$STATEMENT_QUEUE_CONSECUTIVE_RUNS" ]; then
-    message="Statement queue has been over ${STATEMENT_QUEUE_THRESHOLD} for ${breach_count} checks. Current count: ${queue_count}."
-    send_alert "$SCRIPT_NAME" "HANA Statement Queue" "$message"
+    send_alert "$SCRIPT_NAME" "HANA Statement Queue" "Statement queue count is ${queue_count}, which exceeds threshold of ${STATEMENT_QUEUE_THRESHOLD}."
    exit 1
-else
-    log_message "$SCRIPT_NAME" "Statement queue is normal. Current count: ${queue_count}."
 fi

+log_message "$SCRIPT_NAME" "Statement queue is normal. Current count: ${queue_count}."
 log_message "$SCRIPT_NAME" "Statement queue check complete."
--- a/sld_watchdog.sh
+++ b/sld_watchdog.sh
@@ -21,12 +21,10 @@ fi
 trap 'release_lock "$SCRIPT_NAME"' EXIT

 # Function to check SLD health
-# Returns HTTP status code or "0" for connection errors
 check_sld_health() {
    local http_status
    http_status=$(curl -k -s -o /dev/null -w "%{http_code}" -m "$SLD_TIMEOUT" --connect-timeout "$SLD_TIMEOUT" "$SLD_URL" 2>/dev/null)
    
-    # Handle curl errors (returns 000 on connection failure)
    if [ -z "$http_status" ] || [ "$http_status" == "000" ]; then
        echo "0"
    else
@@ -38,7 +36,6 @@ check_sld_health() {
 restart_sld_service() {
    log_message "$SCRIPT_NAME" "Attempting to restart SLD service..."
    
-    # Try systemctl first
    if command -v systemctl &> /dev/null; then
        systemctl restart sapb1servertools 2>&1
        local restart_status=$?
@@ -50,73 +47,56 @@ restart_sld_service() {
            return 1
        fi
    else
-        log_message "$SCRIPT_NAME" "systemctl not available, trying alternative restart methods"
-        # Fallback: try service command
+        log_message "$SCRIPT_NAME" "systemctl not available, trying service command"
        service sapb1servertools restart 2>&1
        return $?
    fi
 }

-# Main monitoring logic
-main() {
-    log_message "$SCRIPT_NAME" "Checking SLD service health at ${SLD_URL}..."
+log_message "$SCRIPT_NAME" "Checking SLD service health at ${SLD_URL}..."

-    local http_status
-    http_status=$(check_sld_health)
+http_status=$(check_sld_health)

-    # 200 OK or 401 Unauthorized indicate the Tomcat layer is actively processing
-    if [[ $http_status == 200 || $http_status == 401 ]]; then
-        log_message "$SCRIPT_NAME" "[OK] SLD service healthy (HTTP Status: $http_status)"
-        return 0
-    fi
+# 200 OK or 401 Unauthorized indicate the Tomcat layer is actively processing
+if [[ $http_status == 200 || $http_status == 401 ]]; then
+    log_message "$SCRIPT_NAME" "[OK] SLD service healthy (HTTP Status: $http_status)"
+    exit 0
+fi

-    # Service is down or unresponsive
-    local status_detail
-    if [ "$http_status" == "0" ]; then
-        status_detail="Connection failed or timeout"
+# Service is down or unresponsive
+if [ "$http_status" == "0" ]; then
+    status_detail="Connection failed or timeout"
+else
+    status_detail="HTTP Status: ${http_status}"
+fi
+
+log_message "$SCRIPT_NAME" "[ALERT] SLD dead or hung (${status_detail}). Restarting service..."
+send_alert "$SCRIPT_NAME" "SLD Service Critical" "SLD service is down (${status_detail}). Restarting ${SLD_URL}"
+
+# Restart the service
+if ! restart_sld_service; then
+    log_message "$SCRIPT_NAME" "[CRITICAL] Failed to restart SLD service"
+    send_alert "$SCRIPT_NAME" "SLD Service Critical" "Failed to restart SLD service"
+    exit 1
+fi
+
+# Allow service to spin up, then log recovery status
+log_message "$SCRIPT_NAME" "Waiting 15 seconds for service to restart..."
+sleep 15
+
+recovery_status=$(check_sld_health)
+
+if [[ $recovery_status == 200 || $recovery_status == 401 ]]; then
+    log_message "$SCRIPT_NAME" "[RECOVERY] SLD service recovered successfully (HTTP Status: $recovery_status)"
+else
+    if [ "$recovery_status" == "0" ]; then
+        recovery_detail="Connection failed after restart"
    else
-        status_detail="HTTP Status: ${http_status}"
+        recovery_detail="HTTP Status: $recovery_status"
    fi
-    
-    log_message "$SCRIPT_NAME" "[ALERT] SLD dead or hung (${status_detail}). Restarting service..."
-    
-    # Send notification
-    send_alert "$SCRIPT_NAME" "SLD Service Critical" "SLD service is down (${status_detail}). Restarting ${SLD_URL}"
-    
-    # Restart the service
-    if ! restart_sld_service; then
-        log_message "$SCRIPT_NAME" "[CRITICAL] Failed to restart SLD service"
-        send_alert "$SCRIPT_NAME" "SLD Service Critical" "Failed to restart SLD service"
-        return 1
-    fi
-    
-    # Allow service to spin up, then log recovery status
-    log_message "$SCRIPT_NAME" "Waiting 15 seconds for service to restart..."
-    sleep 15
-    
-    local recovery_status
-    recovery_status=$(check_sld_health)
-    
-    if [[ $recovery_status == 200 || $recovery_status == 401 ]]; then
-        log_message "$SCRIPT_NAME" "[RECOVERY] SLD service recovered successfully (HTTP Status: $recovery_status)"
-    else
-        local recovery_detail
-        if [ "$recovery_status" == "0" ]; then
-            recovery_detail="Connection failed after restart"
-        else
-            recovery_detail="HTTP Status: $recovery_status"
-        fi
-        log_message "$SCRIPT_NAME" "[CRITICAL] SLD service failed to recover after restart (${recovery_detail})"
-        send_alert "$SCRIPT_NAME" "SLD Service Critical" "SLD service FAILED to recover after restart (${recovery_detail})"
-        return 1
-    fi
-    
-    return 0
-}
-
-# Run main function
-main
-exit_code=$?
+    log_message "$SCRIPT_NAME" "[CRITICAL] SLD service failed to recover after restart (${recovery_detail})"
+    send_alert "$SCRIPT_NAME" "SLD Service Critical" "SLD service FAILED to recover after restart (${recovery_detail})"
+    exit 1
+fi

 log_message "$SCRIPT_NAME" "SLD watchdog check complete."
-exit $exit_code