refactor(monitoring): simplify monitoring scripts and remove state tracking

- Remove consecutive breach tracking for statement queue (immediate alerts)
- Consolidate script initialization into init_script() function
- Remove unused helper functions (send_ok, run_as_hana_user, get_mount_point)
- Flatten sld_watchdog.sh structure by removing main() wrapper
- Remove state directory and lock directory configuration from hana.conf
- Simplify alert messages to include threshold values

This continues the simplification effort from previous commits by removing stateful tracking mechanisms and streamlining the monitoring logic for easier maintenance.
2026-03-12 22:18:29 +01:00
parent cf5b81889d
commit 0beef6fa48
6 changed files with 83 additions and 148 deletions
--- a/sld_watchdog.sh
+++ b/sld_watchdog.sh
@@ -21,12 +21,10 @@ fi
 trap 'release_lock "$SCRIPT_NAME"' EXIT

 # Function to check SLD health
-# Returns HTTP status code or "0" for connection errors
 check_sld_health() {
    local http_status
    http_status=$(curl -k -s -o /dev/null -w "%{http_code}" -m "$SLD_TIMEOUT" --connect-timeout "$SLD_TIMEOUT" "$SLD_URL" 2>/dev/null)
    
-    # Handle curl errors (returns 000 on connection failure)
    if [ -z "$http_status" ] || [ "$http_status" == "000" ]; then
        echo "0"
    else
@@ -38,7 +36,6 @@ check_sld_health() {
 restart_sld_service() {
    log_message "$SCRIPT_NAME" "Attempting to restart SLD service..."
    
-    # Try systemctl first
    if command -v systemctl &> /dev/null; then
        systemctl restart sapb1servertools 2>&1
        local restart_status=$?
@@ -50,73 +47,56 @@ restart_sld_service() {
            return 1
        fi
    else
-        log_message "$SCRIPT_NAME" "systemctl not available, trying alternative restart methods"
-        # Fallback: try service command
+        log_message "$SCRIPT_NAME" "systemctl not available, trying service command"
        service sapb1servertools restart 2>&1
        return $?
    fi
 }

-# Main monitoring logic
-main() {
-    log_message "$SCRIPT_NAME" "Checking SLD service health at ${SLD_URL}..."
-    
-    local http_status
-    http_status=$(check_sld_health)
-    
-    # 200 OK or 401 Unauthorized indicate the Tomcat layer is actively processing
-    if [[ $http_status == 200 || $http_status == 401 ]]; then
-        log_message "$SCRIPT_NAME" "[OK] SLD service healthy (HTTP Status: $http_status)"
-        return 0
-    fi
-    
-    # Service is down or unresponsive
-    local status_detail
-    if [ "$http_status" == "0" ]; then
-        status_detail="Connection failed or timeout"
-    else
-        status_detail="HTTP Status: ${http_status}"
-    fi
-    
-    log_message "$SCRIPT_NAME" "[ALERT] SLD dead or hung (${status_detail}). Restarting service..."
-    
-    # Send notification
-    send_alert "$SCRIPT_NAME" "SLD Service Critical" "SLD service is down (${status_detail}). Restarting ${SLD_URL}"
-    
-    # Restart the service
-    if ! restart_sld_service; then
-        log_message "$SCRIPT_NAME" "[CRITICAL] Failed to restart SLD service"
-        send_alert "$SCRIPT_NAME" "SLD Service Critical" "Failed to restart SLD service"
-        return 1
-    fi
-    
-    # Allow service to spin up, then log recovery status
-    log_message "$SCRIPT_NAME" "Waiting 15 seconds for service to restart..."
-    sleep 15
-    
-    local recovery_status
-    recovery_status=$(check_sld_health)
-    
-    if [[ $recovery_status == 200 || $recovery_status == 401 ]]; then
-        log_message "$SCRIPT_NAME" "[RECOVERY] SLD service recovered successfully (HTTP Status: $recovery_status)"
-    else
-        local recovery_detail
-        if [ "$recovery_status" == "0" ]; then
-            recovery_detail="Connection failed after restart"
-        else
-            recovery_detail="HTTP Status: $recovery_status"
-        fi
-        log_message "$SCRIPT_NAME" "[CRITICAL] SLD service failed to recover after restart (${recovery_detail})"
-        send_alert "$SCRIPT_NAME" "SLD Service Critical" "SLD service FAILED to recover after restart (${recovery_detail})"
-        return 1
-    fi
-    
-    return 0
-}
+log_message "$SCRIPT_NAME" "Checking SLD service health at ${SLD_URL}..."

-# Run main function
-main
-exit_code=$?
+http_status=$(check_sld_health)
+
+# 200 OK or 401 Unauthorized indicate the Tomcat layer is actively processing
+if [[ $http_status == 200 || $http_status == 401 ]]; then
+    log_message "$SCRIPT_NAME" "[OK] SLD service healthy (HTTP Status: $http_status)"
+    exit 0
+fi
+
+# Service is down or unresponsive
+if [ "$http_status" == "0" ]; then
+    status_detail="Connection failed or timeout"
+else
+    status_detail="HTTP Status: ${http_status}"
+fi
+
+log_message "$SCRIPT_NAME" "[ALERT] SLD dead or hung (${status_detail}). Restarting service..."
+send_alert "$SCRIPT_NAME" "SLD Service Critical" "SLD service is down (${status_detail}). Restarting ${SLD_URL}"
+
+# Restart the service
+if ! restart_sld_service; then
+    log_message "$SCRIPT_NAME" "[CRITICAL] Failed to restart SLD service"
+    send_alert "$SCRIPT_NAME" "SLD Service Critical" "Failed to restart SLD service"
+    exit 1
+fi
+
+# Allow service to spin up, then log recovery status
+log_message "$SCRIPT_NAME" "Waiting 15 seconds for service to restart..."
+sleep 15
+
+recovery_status=$(check_sld_health)
+
+if [[ $recovery_status == 200 || $recovery_status == 401 ]]; then
+    log_message "$SCRIPT_NAME" "[RECOVERY] SLD service recovered successfully (HTTP Status: $recovery_status)"
+else
+    if [ "$recovery_status" == "0" ]; then
+        recovery_detail="Connection failed after restart"
+    else
+        recovery_detail="HTTP Status: $recovery_status"
+    fi
+    log_message "$SCRIPT_NAME" "[CRITICAL] SLD service failed to recover after restart (${recovery_detail})"
+    send_alert "$SCRIPT_NAME" "SLD Service Critical" "SLD service FAILED to recover after restart (${recovery_detail})"
+    exit 1
+fi

 log_message "$SCRIPT_NAME" "SLD watchdog check complete."
-exit $exit_code