- Remove consecutive breach tracking for statement queue (immediate alerts)
- Consolidate script initialization into init_script() function
- Remove unused helper functions (send_ok, run_as_hana_user, get_mount_point)
- Flatten sld_watchdog.sh structure by removing main() wrapper
- Remove state directory and lock directory configuration from hana.conf
- Simplify alert messages to include threshold values

This continues the simplification effort from previous commits by removing stateful tracking mechanisms and streamlining the monitoring logic for easier maintenance.
103 lines
3.4 KiB
Bash
103 lines
3.4 KiB
Bash
#!/bin/bash
|
|
# =============================================================================
|
|
# sld_watchdog.sh - Monitors SLD service health and restarts if needed
|
|
# =============================================================================
|
|
|
|
# Directory containing this script; the config and library files are loaded
# from here so the script works regardless of the caller's working directory.
SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
SCRIPT_NAME="sld_watchdog"

# Load the shared configuration first, then the helper library.
. "$SCRIPT_DIR/hana.conf"
. "$SCRIPT_DIR/hana_lib.sh"

# SLD endpoint and probe timeout (seconds). A value that is already set and
# non-empty at this point is kept; otherwise the default below is used.
SLD_URL=${SLD_URL:-https://localhost:40000/sld/sld0100.svc}
SLD_TIMEOUT=${SLD_TIMEOUT:-5}

# Exit quietly (status 0) when the lock cannot be acquired
# (presumably another instance is already running -- confirm in hana_lib.sh).
acquire_lock "$SCRIPT_NAME" || exit 0

# Release the lock on every exit path from here on.
trap 'release_lock "$SCRIPT_NAME"' EXIT
|
|
|
|
# -----------------------------------------------------------------------------
# check_sld_health - Probe the SLD endpoint once and report its HTTP status.
# Globals:  SLD_URL (read), SLD_TIMEOUT (read; used for both the total and the
#           connect timeout)
# Outputs:  the HTTP status code on stdout, or "0" when curl yields no status
#           (empty output) or reports "000"
# -----------------------------------------------------------------------------
check_sld_health() {
  local code

  # The response body is discarded and only the status code is written out;
  # curl's own diagnostics on stderr are suppressed.
  code=$(curl -k -s -o /dev/null -w "%{http_code}" -m "$SLD_TIMEOUT" --connect-timeout "$SLD_TIMEOUT" "$SLD_URL" 2>/dev/null)

  case "$code" in
    "" | 000)
      echo "0"
      ;;
    *)
      echo "$code"
      ;;
  esac
}
|
|
|
|
# -----------------------------------------------------------------------------
# restart_sld_service - Restart the sapb1servertools service.
# Uses systemctl when it is available and falls back to the `service` command
# otherwise. The outcome of the restart command is logged on both paths.
# Globals:  SCRIPT_NAME (read)
# Outputs:  any output of the restart command (its stderr is merged into
#           stdout)
# Returns:  0 when the restart command succeeded;
#           1 when `systemctl restart` failed;
#           the `service` command's own exit code when the fallback failed
# -----------------------------------------------------------------------------
restart_sld_service() {
  local restart_status=0

  log_message "$SCRIPT_NAME" "Attempting to restart SLD service..."

  if command -v systemctl &> /dev/null; then
    systemctl restart sapb1servertools 2>&1 || restart_status=$?
    if [[ $restart_status -ne 0 ]]; then
      log_message "$SCRIPT_NAME" "Service restart failed with exit code ${restart_status}"
      return 1
    fi
  else
    log_message "$SCRIPT_NAME" "systemctl not available, trying service command"
    service sapb1servertools restart 2>&1 || restart_status=$?
    if [[ $restart_status -ne 0 ]]; then
      # Log the failure here too, so the fallback path is as traceable as the
      # systemctl path; the raw exit code is still returned to the caller.
      log_message "$SCRIPT_NAME" "Service restart failed with exit code ${restart_status}"
      return "$restart_status"
    fi
  fi

  log_message "$SCRIPT_NAME" "Service restart command executed successfully"
  return 0
}
|
|
|
|
# --- Initial health probe ----------------------------------------------------
log_message "$SCRIPT_NAME" "Checking SLD service health at ${SLD_URL}..."
http_status=$(check_sld_health)

# 200 OK or 401 Unauthorized indicate the Tomcat layer is actively processing,
# so there is nothing further to do.
case "$http_status" in
  200 | 401)
    log_message "$SCRIPT_NAME" "[OK] SLD service healthy (HTTP Status: $http_status)"
    exit 0
    ;;
esac

# --- Service is down or unresponsive -----------------------------------------
# check_sld_health reports "0" when no usable HTTP status was obtained.
case "$http_status" in
  0) status_detail="Connection failed or timeout" ;;
  *) status_detail="HTTP Status: ${http_status}" ;;
esac

log_message "$SCRIPT_NAME" "[ALERT] SLD dead or hung (${status_detail}). Restarting service..."
send_alert "$SCRIPT_NAME" "SLD Service Critical" "SLD service is down (${status_detail}). Restarting ${SLD_URL}"

# --- Restart -----------------------------------------------------------------
# A failed restart command is logged, alerted on, and ends the run with
# status 1.
restart_sld_service || {
  log_message "$SCRIPT_NAME" "[CRITICAL] Failed to restart SLD service"
  send_alert "$SCRIPT_NAME" "SLD Service Critical" "Failed to restart SLD service"
  exit 1
}

# --- Post-restart verification -----------------------------------------------
# Allow the service to spin up before probing it again.
log_message "$SCRIPT_NAME" "Waiting 15 seconds for service to restart..."
sleep 15

recovery_status=$(check_sld_health)

case "$recovery_status" in
  200 | 401)
    log_message "$SCRIPT_NAME" "[RECOVERY] SLD service recovered successfully (HTTP Status: $recovery_status)"
    ;;
  *)
    case "$recovery_status" in
      0) recovery_detail="Connection failed after restart" ;;
      *) recovery_detail="HTTP Status: $recovery_status" ;;
    esac
    log_message "$SCRIPT_NAME" "[CRITICAL] SLD service failed to recover after restart (${recovery_detail})"
    send_alert "$SCRIPT_NAME" "SLD Service Critical" "SLD service FAILED to recover after restart (${recovery_detail})"
    exit 1
    ;;
esac

log_message "$SCRIPT_NAME" "SLD watchdog check complete."
|