- Replace state-based notifications with direct alert functions
- Remove auto-cleanup functionality from disk monitoring and configuration
- Simplify lock acquisition/release across all monitoring scripts
- Add execute_hana_sql helper functions for consistent SQL execution
- Remove state file tracking in favor of direct file operations
- Standardize error handling with exit codes on critical failures
- Clean up hana.conf by removing unused auto-delete directory settings
123 lines
4.0 KiB
Bash
123 lines
4.0 KiB
Bash
#!/bin/bash
# =============================================================================
# sld_watchdog.sh - Monitors SLD service health and restarts if needed
# =============================================================================

# Resolve the directory that holds this script so the sibling configuration
# and library files are found regardless of the caller's working directory.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
SCRIPT_NAME="sld_watchdog"

# Source configuration and library.
# Abort with a non-zero status when either file is unreadable: without them
# the helper functions used below are undefined, and the failing acquire_lock
# call would otherwise take the "could not get the lock" branch and end the
# run with exit 0, hiding the broken installation.
for required_file in "$SCRIPT_DIR/hana.conf" "$SCRIPT_DIR/hana_lib.sh"; do
  if [[ ! -r "$required_file" ]]; then
    printf '%s: cannot read required file: %s\n' \
      "$SCRIPT_NAME" "$required_file" >&2
    exit 1
  fi
  # shellcheck source=/dev/null
  source "$required_file"
done
unset required_file

# SLD-specific configuration (values already set by the environment or by
# hana.conf take precedence over these defaults).
SLD_URL="${SLD_URL:-https://localhost:40000/sld/sld0100.svc}"
SLD_TIMEOUT="${SLD_TIMEOUT:-5}"

# Acquire lock. A failed acquisition ends the run quietly with status 0
# (presumably because another instance holds the lock - confirm against
# acquire_lock in hana_lib.sh).
if ! acquire_lock "$SCRIPT_NAME"; then
  exit 0
fi
# Release the lock on every exit path from here on.
trap 'release_lock "$SCRIPT_NAME"' EXIT

#######################################
# Probe the SLD endpoint and print the HTTP status code it answered with.
# Globals:   SLD_URL (read, default URL), SLD_TIMEOUT (read, default timeout)
# Arguments: $1 - URL to probe (optional; defaults to $SLD_URL)
#            $2 - timeout in seconds, used for both the connect phase and the
#                 whole request (optional; defaults to $SLD_TIMEOUT, then 5)
# Outputs:   the three-digit HTTP status code on stdout, or "0" when curl
#            produced no usable status (connection failure, timeout, ...)
#######################################
check_sld_health() {
  local url="${1:-${SLD_URL:-}}"
  local timeout="${2:-${SLD_TIMEOUT:-5}}"
  local http_status

  # -k: accept untrusted TLS certificates; -s: no progress output;
  # -o /dev/null: discard the body; -w: print only the status code.
  http_status=$(curl -k -s -o /dev/null -w '%{http_code}' \
    -m "$timeout" --connect-timeout "$timeout" "$url" 2>/dev/null)

  # curl prints 000 when no HTTP response was received. Treat that, empty
  # output, and anything that is not a three-digit code as "no status".
  if [[ "$http_status" =~ ^[0-9]{3}$ && "$http_status" != "000" ]]; then
    printf '%s\n' "$http_status"
  else
    printf '0\n'
  fi
}

#######################################
# Restart the SLD service, preferring systemctl and falling back to the
# legacy "service" command when systemctl is not available.
# Globals:   SCRIPT_NAME (read, log tag),
#            SLD_SERVICE_NAME (read, optional default unit name)
# Arguments: $1 - service/unit name (optional; defaults to
#                 $SLD_SERVICE_NAME, then "sapb1servertools")
# Outputs:   whatever the restart command prints (stderr merged into stdout);
#            progress and result are reported through log_message
# Returns:   0 when the restart command succeeded; otherwise 1 on the
#            systemctl path, or the raw exit status of "service" on the
#            fallback path
#######################################
restart_sld_service() {
  local service_name="${1:-${SLD_SERVICE_NAME:-sapb1servertools}}"
  local restart_status=0
  # The systemctl path collapses every failure to 1.
  local failure_status=1

  log_message "$SCRIPT_NAME" "Attempting to restart SLD service..."

  # Try systemctl first
  if command -v systemctl &> /dev/null; then
    systemctl restart "$service_name" 2>&1 || restart_status=$?
  else
    log_message "$SCRIPT_NAME" "systemctl not available, trying alternative restart methods"
    # Fallback: try service command
    service "$service_name" restart 2>&1 || restart_status=$?
    # The fallback path propagates the command's own exit status.
    failure_status=$restart_status
  fi

  if (( restart_status == 0 )); then
    log_message "$SCRIPT_NAME" "Service restart command executed successfully"
    return 0
  fi

  log_message "$SCRIPT_NAME" "Service restart failed with exit code ${restart_status}"
  return "$failure_status"
}

# Succeeds when the given HTTP status means the SLD endpoint is answering.
# 200 OK or 401 Unauthorized indicate the Tomcat layer is actively processing.
_sld_status_is_healthy() {
  [[ "$1" == 200 || "$1" == 401 ]]
}

# Prints a human-readable description of a probe result.
# $1 - status as printed by check_sld_health; $2 - text to use for status "0".
_sld_status_detail() {
  if [[ "$1" == "0" ]]; then
    printf '%s\n' "$2"
  else
    printf 'HTTP Status: %s\n' "$1"
  fi
}

#######################################
# Main monitoring logic: probe the SLD endpoint once; when it is not healthy,
# send an alert, restart the service, wait, and probe again.
# Globals:   SCRIPT_NAME, SLD_URL (read),
#            SLD_RESTART_WAIT (read, optional; seconds to wait after the
#            restart before re-probing, default 15)
# Arguments: none
# Returns:   0 when the service is healthy or recovered after the restart;
#            1 when the restart command failed or the service is still
#            unhealthy after the wait
#######################################
main() {
  local wait_seconds="${SLD_RESTART_WAIT:-15}"
  local http_status status_detail recovery_status recovery_detail

  # Fall back to the default when the override is not a plain integer, so
  # sleep is never called with an invalid argument.
  if [[ ! "$wait_seconds" =~ ^[0-9]+$ ]]; then
    wait_seconds=15
  fi

  log_message "$SCRIPT_NAME" "Checking SLD service health at ${SLD_URL}..."
  http_status=$(check_sld_health)

  if _sld_status_is_healthy "$http_status"; then
    log_message "$SCRIPT_NAME" "[OK] SLD service healthy (HTTP Status: $http_status)"
    return 0
  fi

  # Service is down or unresponsive
  status_detail=$(_sld_status_detail "$http_status" "Connection failed or timeout")
  log_message "$SCRIPT_NAME" "[ALERT] SLD dead or hung (${status_detail}). Restarting service..."

  # Send notification
  send_alert "$SCRIPT_NAME" "SLD Service Critical" "SLD service is down (${status_detail}). Restarting ${SLD_URL}"

  # Restart the service
  if ! restart_sld_service; then
    log_message "$SCRIPT_NAME" "[CRITICAL] Failed to restart SLD service"
    send_alert "$SCRIPT_NAME" "SLD Service Critical" "Failed to restart SLD service"
    return 1
  fi

  # Allow service to spin up, then log recovery status
  log_message "$SCRIPT_NAME" "Waiting ${wait_seconds} seconds for service to restart..."
  sleep "$wait_seconds"

  recovery_status=$(check_sld_health)
  if _sld_status_is_healthy "$recovery_status"; then
    log_message "$SCRIPT_NAME" "[RECOVERY] SLD service recovered successfully (HTTP Status: $recovery_status)"
    return 0
  fi

  recovery_detail=$(_sld_status_detail "$recovery_status" "Connection failed after restart")
  log_message "$SCRIPT_NAME" "[CRITICAL] SLD service failed to recover after restart (${recovery_detail})"
  send_alert "$SCRIPT_NAME" "SLD Service Critical" "SLD service FAILED to recover after restart (${recovery_detail})"
  return 1
}

# Run main function, always log completion, then exit with main's status so
# the caller (e.g. cron or a wrapper) can see whether the check succeeded.
exit_code=0
main "$@" || exit_code=$?

log_message "$SCRIPT_NAME" "SLD watchdog check complete."
exit "$exit_code"