Files
hana-scripts/sld_watchdog.sh
2026-03-12 20:12:20 +01:00

135 lines
4.7 KiB
Bash

#!/bin/bash
#
# sld_watchdog.sh - Monitors SLD service health and restarts if needed
# Optimized for better error handling and reliability
#
# Resolve the directory containing this script, and set the name that is
# handed to the library helpers (locking, logging, notifications).
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
SCRIPT_NAME="sld_watchdog"

# Shared configuration and helper library are loaded from the script directory.
# SCRIPT_DIR and SCRIPT_NAME are set first so they are visible while sourcing.
. "$SCRIPT_DIR/hana.conf"
. "$SCRIPT_DIR/hana_lib.sh"

# SLD settings: a value that is already set and non-empty at this point
# (e.g. from hana.conf or the environment) wins over the default.
SLD_URL="${SLD_URL:-https://localhost:40000/sld/sld0100.svc}"
SLD_TIMEOUT="${SLD_TIMEOUT:-5}"
# NOTE(review): SLD_LOCK_FILE is not referenced in the visible part of this
# script; locking goes through acquire_lock below. Confirm whether the library
# reads it before removing.
SLD_LOCK_FILE="/tmp/hana_sld_watchdog.lock"

# Take the lock via the library; whatever acquire_lock prints is kept in
# LOCK_FILE and later passed to release_lock. A non-zero status ends the run
# quietly with exit code 0 (presumably another instance holds the lock).
LOCK_FILE=$(acquire_lock "$SCRIPT_NAME")
[[ $? -eq 0 ]] || exit 0

# Release the lock on every exit path from here on.
trap 'release_lock "$LOCK_FILE"' EXIT
# Probe the SLD endpoint once and report the result.
# Globals:  SLD_URL (read), SLD_TIMEOUT (read)
# Outputs:  the HTTP status code on stdout, or "0" when curl produced no
#           usable code (empty output or "000", i.e. no HTTP response)
check_sld_health() {
  local response
  # -k: skip TLS certificate verification; -s: silent; body is discarded and
  # only the status code is written. SLD_TIMEOUT bounds both the connect phase
  # and the whole transfer.
  local -a probe_opts=(
    -k -s
    -o /dev/null
    -w "%{http_code}"
    -m "$SLD_TIMEOUT"
    --connect-timeout "$SLD_TIMEOUT"
  )

  response=$(curl "${probe_opts[@]}" "$SLD_URL" 2>/dev/null)

  case "$response" in
    "" | "000")
      echo "0"
      ;;
    *)
      echo "$response"
      ;;
  esac
}
# Restart the SLD service (the "sapb1servertools" service).
# Uses systemctl when it is available and falls back to the `service`
# command otherwise. The outcome is logged the same way for both methods.
# Globals:  SCRIPT_NAME (read)
# Outputs:  output of the restart command on stdout (its stderr is merged in)
# Returns:  0 if the restart command succeeded, 1 if it failed or no
#           restart command is available
restart_sld_service() {
  local service_name="sapb1servertools"
  local restart_status=0

  log_message "$SCRIPT_NAME" "Attempting to restart SLD service..."

  if command -v systemctl &> /dev/null; then
    # "|| var=$?" records the failure without aborting if errexit is active
    systemctl restart "$service_name" 2>&1 || restart_status=$?
  else
    log_message "$SCRIPT_NAME" "systemctl not available, trying alternative restart methods"
    if ! command -v service &> /dev/null; then
      log_message "$SCRIPT_NAME" "Service restart failed: neither systemctl nor service is available"
      return 1
    fi
    service "$service_name" restart 2>&1 || restart_status=$?
  fi

  if (( restart_status == 0 )); then
    log_message "$SCRIPT_NAME" "Service restart command executed successfully"
    return 0
  fi

  log_message "$SCRIPT_NAME" "Service restart failed with exit code ${restart_status}"
  return 1
}
# Returns 0 when an HTTP status shows the SLD web layer is answering.
# 200 OK and 401 Unauthorized both qualify: either way the server processed
# the request.
# Arguments: $1 - status string as printed by check_sld_health
_sld_is_healthy() {
  [[ "$1" == "200" || "$1" == "401" ]]
}

# Prints a human-readable description of a health-check status.
# Arguments: $1 - status string as printed by check_sld_health
#            $2 - text to print when $1 is "0" (no HTTP response)
_sld_status_detail() {
  if [[ "$1" == "0" ]]; then
    printf '%s\n' "$2"
  else
    printf '%s\n' "HTTP Status: $1"
  fi
}

# Main monitoring logic: check the SLD endpoint, and when it is not healthy,
# notify, restart the service, wait, and check again.
# Globals:  SCRIPT_NAME, SLD_URL (read)
#           SLD_RESTART_WAIT (read, optional) - whole seconds to wait between
#           the restart and the follow-up check; defaults to 15, and any
#           value that is not a non-negative integer also falls back to 15
# Returns:  0 if the service is healthy, or recovered after the restart
#           1 if the restart command failed or the service did not recover
main() {
  local http_status status_detail recovery_status recovery_detail
  local restart_wait="${SLD_RESTART_WAIT:-15}"
  if [[ ! "$restart_wait" =~ ^[0-9]+$ ]]; then
    restart_wait=15
  fi

  log_message "$SCRIPT_NAME" "Checking SLD service health at ${SLD_URL}..."
  http_status=$(check_sld_health)

  if _sld_is_healthy "$http_status"; then
    log_message "$SCRIPT_NAME" "[OK] SLD service healthy (HTTP Status: $http_status)"
    send_notification_if_changed "$SCRIPT_NAME" "sld_status" "SLD Service" "SLD service is healthy (HTTP: $http_status)" "false" "OK"
    return 0
  fi

  # Service is down or unresponsive
  status_detail=$(_sld_status_detail "$http_status" "Connection failed or timeout")
  log_message "$SCRIPT_NAME" "[ALERT] SLD dead or hung (${status_detail}). Restarting service..."
  send_notification_if_changed "$SCRIPT_NAME" "sld_down" "SLD Service Critical" \
    "SLD service is down (${status_detail}). Restarting ${SLD_URL}" "true" "SLD_DOWN"

  if ! restart_sld_service; then
    log_message "$SCRIPT_NAME" "[CRITICAL] Failed to restart SLD service"
    send_notification_if_changed "$SCRIPT_NAME" "sld_restart_failed" "SLD Service Critical" \
      "Failed to restart SLD service" "true" "RESTART_FAILED"
    return 1
  fi

  # Give the service time to come up before judging the restart
  log_message "$SCRIPT_NAME" "Waiting ${restart_wait} seconds for service to restart..."
  sleep "$restart_wait"

  recovery_status=$(check_sld_health)
  if _sld_is_healthy "$recovery_status"; then
    log_message "$SCRIPT_NAME" "[RECOVERY] SLD service recovered successfully (HTTP Status: $recovery_status)"
    # Report OK on both keys that the failure paths above may have raised
    send_notification_if_changed "$SCRIPT_NAME" "sld_down" "SLD Service" \
      "SLD service recovered (HTTP: $recovery_status)" "false" "OK"
    send_notification_if_changed "$SCRIPT_NAME" "sld_restart_failed" "SLD Service" \
      "Service recovered successfully" "false" "OK"
    return 0
  fi

  recovery_detail=$(_sld_status_detail "$recovery_status" "Connection failed after restart")
  log_message "$SCRIPT_NAME" "[CRITICAL] SLD service failed to recover after restart (${recovery_detail})"
  send_notification_if_changed "$SCRIPT_NAME" "sld_down" "SLD Service Critical" \
    "SLD service FAILED to recover after restart (${recovery_detail})" "true" "RECOVERY_FAILED"
  return 1
}
# Run main function
main
# Capture main's status right away: the log_message call below would
# otherwise overwrite $?.
exit_code=$?
log_message "$SCRIPT_NAME" "SLD watchdog check complete."
# Propagate main's result as the script's exit status; exiting also fires the
# EXIT trap installed after lock acquisition.
exit $exit_code