#!/bin/bash
# =============================================================================
# sld_watchdog.sh - Monitors SLD service health and restarts if needed
#
# Probes the SLD endpoint over HTTP(S). If the endpoint does not answer with
# 200 or 401, the backing service is restarted and the endpoint is probed
# again after a short wait.
#
# Optional environment / hana.conf overrides:
#   SLD_URL            - endpoint to probe
#                        (default: https://localhost:40000/sld/sld0100.svc)
#   SLD_TIMEOUT        - curl connect/total timeout in seconds (default: 5)
#   SLD_SERVICE_NAME   - service unit to restart (default: sapb1servertools)
#   SLD_RECOVERY_WAIT  - seconds to wait after a restart before re-probing
#                        (default: 15)
#
# Exit status: 0 if the service is healthy (or recovered, or another instance
# holds the lock); 1 if the restart failed, the service did not recover, or
# the helper library could not be loaded.
# =============================================================================

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
SCRIPT_NAME="sld_watchdog"

# Source configuration and library.
# The configuration only supplies overrides for values that have defaults
# below, so a missing file is reported but is not fatal.
if [[ -r "$SCRIPT_DIR/hana.conf" ]]; then
  # shellcheck source=/dev/null
  source "$SCRIPT_DIR/hana.conf"
else
  printf '%s: warning: cannot read %s\n' "$SCRIPT_NAME" "$SCRIPT_DIR/hana.conf" >&2
fi

if [[ -r "$SCRIPT_DIR/hana_lib.sh" ]]; then
  # shellcheck source=/dev/null
  source "$SCRIPT_DIR/hana_lib.sh"
fi

# The helpers below are expected to come from hana_lib.sh. Without them the
# lock check would fail with "command not found" and the script would exit 0
# without monitoring anything, so their absence is treated as a hard error.
for required_fn in acquire_lock release_lock log_message send_alert; do
  if ! declare -F "$required_fn" > /dev/null; then
    printf '%s: required function %s is not defined (check %s)\n' \
      "$SCRIPT_NAME" "$required_fn" "$SCRIPT_DIR/hana_lib.sh" >&2
    exit 1
  fi
done
unset required_fn

# SLD-specific configuration
SLD_URL="${SLD_URL:-https://localhost:40000/sld/sld0100.svc}"
SLD_TIMEOUT="${SLD_TIMEOUT:-5}"
SLD_SERVICE_NAME="${SLD_SERVICE_NAME:-sapb1servertools}"
SLD_RECOVERY_WAIT="${SLD_RECOVERY_WAIT:-15}"

# Acquire lock; if it cannot be acquired, exit quietly with success.
if ! acquire_lock "$SCRIPT_NAME"; then
  exit 0
fi
trap 'release_lock "$SCRIPT_NAME"' EXIT

#######################################
# Probe the SLD endpoint once.
# Globals:   SLD_URL, SLD_TIMEOUT (read)
# Outputs:   the HTTP status code on stdout, or "0" for connection errors
#######################################
check_sld_health() {
  local http_status
  # -k: certificate verification is skipped, as in the original probe.
  # -m / --connect-timeout: bound both the whole request and the connect phase.
  http_status=$(curl -k -s -o /dev/null -w "%{http_code}" \
    -m "$SLD_TIMEOUT" --connect-timeout "$SLD_TIMEOUT" \
    "$SLD_URL" 2> /dev/null)

  # Handle curl errors (curl reports 000 when no HTTP response was received)
  if [[ -z "$http_status" || "$http_status" == "000" ]]; then
    echo "0"
  else
    echo "$http_status"
  fi
}

#######################################
# Decide whether an HTTP status means the service is alive.
# 200 OK or 401 Unauthorized indicate the Tomcat layer is actively processing.
# Arguments: $1 - HTTP status code as printed by check_sld_health
# Returns:   0 if healthy, 1 otherwise
#######################################
is_healthy_status() {
  [[ "$1" == 200 || "$1" == 401 ]]
}

#######################################
# Restart the SLD service.
# Uses systemctl when available, otherwise falls back to the service command.
# Globals:   SCRIPT_NAME, SLD_SERVICE_NAME (read)
# Returns:   0 if the restart command succeeded, 1 otherwise
#######################################
restart_sld_service() {
  log_message "$SCRIPT_NAME" "Attempting to restart SLD service..."

  local -a restart_cmd
  if command -v systemctl &> /dev/null; then
    restart_cmd=(systemctl restart "$SLD_SERVICE_NAME")
  else
    log_message "$SCRIPT_NAME" "systemctl not available, trying alternative restart methods"
    restart_cmd=(service "$SLD_SERVICE_NAME" restart)
  fi

  # Capture the exit code explicitly so it can be reported in the log.
  local restart_status=0
  "${restart_cmd[@]}" 2>&1 || restart_status=$?

  if ((restart_status == 0)); then
    log_message "$SCRIPT_NAME" "Service restart command executed successfully"
    return 0
  fi

  log_message "$SCRIPT_NAME" "Service restart failed with exit code ${restart_status}"
  return 1
}

#######################################
# Main monitoring logic: probe, restart on failure, re-probe.
# Globals:   SCRIPT_NAME, SLD_URL, SLD_RECOVERY_WAIT (read)
# Returns:   0 if the service is healthy or recovered; 1 if the restart
#            failed or the service did not recover
#######################################
main() {
  log_message "$SCRIPT_NAME" "Checking SLD service health at ${SLD_URL}..."

  local http_status
  http_status=$(check_sld_health)

  if is_healthy_status "$http_status"; then
    log_message "$SCRIPT_NAME" "[OK] SLD service healthy (HTTP Status: $http_status)"
    return 0
  fi

  # Service is down or unresponsive
  local status_detail
  if [[ "$http_status" == "0" ]]; then
    status_detail="Connection failed or timeout"
  else
    status_detail="HTTP Status: ${http_status}"
  fi

  log_message "$SCRIPT_NAME" "[ALERT] SLD dead or hung (${status_detail}). Restarting service..."

  # Send notification
  send_alert "$SCRIPT_NAME" "SLD Service Critical" "SLD service is down (${status_detail}). Restarting ${SLD_URL}"

  # Restart the service
  if ! restart_sld_service; then
    log_message "$SCRIPT_NAME" "[CRITICAL] Failed to restart SLD service"
    send_alert "$SCRIPT_NAME" "SLD Service Critical" "Failed to restart SLD service"
    return 1
  fi

  # Allow service to spin up, then log recovery status
  log_message "$SCRIPT_NAME" "Waiting ${SLD_RECOVERY_WAIT} seconds for service to restart..."
  sleep "$SLD_RECOVERY_WAIT"

  local recovery_status
  recovery_status=$(check_sld_health)

  if is_healthy_status "$recovery_status"; then
    log_message "$SCRIPT_NAME" "[RECOVERY] SLD service recovered successfully (HTTP Status: $recovery_status)"
    return 0
  fi

  local recovery_detail
  if [[ "$recovery_status" == "0" ]]; then
    recovery_detail="Connection failed after restart"
  else
    recovery_detail="HTTP Status: $recovery_status"
  fi

  log_message "$SCRIPT_NAME" "[CRITICAL] SLD service failed to recover after restart (${recovery_detail})"
  send_alert "$SCRIPT_NAME" "SLD Service Critical" "SLD service FAILED to recover after restart (${recovery_detail})"
  return 1
}

# Run main function
main
exit_code=$?

log_message "$SCRIPT_NAME" "SLD watchdog check complete."
exit "$exit_code"