Compare commits

...

4 Commits

Author SHA1 Message Date
e653ff7b9a fix(monitoring): allow sapcontrol exit code 3
The sapcontrol GetProcessList call returns exit code 3 on successful
execution in this setup. Previously, only exit code 0 was accepted, causing
false error alerts and script termination when the processes were healthy.
The status check now expects exit code 3 instead of 0.
2026-03-12 22:35:54 +01:00
f364996563 refactor(monitoring): parse process list via awk
Append an awk step to the grep pipelines. This ensures only the
first comma-separated field of each line (the process name) is
captured instead of the full status line.
2026-03-12 22:32:18 +01:00
7495ebcd78 refactor(monitoring): remove hardcoded tool paths
Remove SAPCONTROL_PATH and HDBSQL_PATH variables from configuration.
Update scripts to rely on the system PATH environment variable when
executing as the <sid>adm user. Remove redundant existence checks for
these commands.
2026-03-12 22:24:02 +01:00
0beef6fa48 refactor(monitoring): simplify monitoring scripts and remove state tracking
- Remove consecutive breach tracking for statement queue (alerts now fire immediately on the first breach)
- Consolidate script initialization into init_script() function
- Remove unused helper functions (send_ok, run_as_hana_user, get_mount_point)
- Flatten sld_watchdog.sh structure by removing main() wrapper
- Remove state directory and lock directory configuration from hana.conf
- Merge duplicate log/alert pairs into single alert messages that include threshold values

This continues the simplification effort from previous commits by removing stateful tracking mechanisms and streamlining the monitoring logic for easier maintenance.
2026-03-12 22:18:29 +01:00
8 changed files with 89 additions and 183 deletions

View File

@@ -17,8 +17,7 @@ HANA_INSTANCE_NR="00"
HANA_USER_KEY="CRONKEY" HANA_USER_KEY="CRONKEY"
# --- Paths --- # --- Paths ---
SAPCONTROL_PATH="/usr/sap/hostctrl/exe/sapcontrol" # Commands are executed as <sid>adm user without full paths
HDBSQL_PATH="/usr/sap/HDB/HDB${HANA_INSTANCE_NR}/exe/hdbsql"
# --- Monitoring Directories --- # --- Monitoring Directories ---
DIRECTORIES_TO_MONITOR=( DIRECTORIES_TO_MONITOR=(
@@ -33,7 +32,6 @@ DISK_USAGE_THRESHOLD=85
TRUNCATED_PERCENTAGE_THRESHOLD=50 TRUNCATED_PERCENTAGE_THRESHOLD=50
FREE_PERCENTAGE_THRESHOLD=10 FREE_PERCENTAGE_THRESHOLD=10
STATEMENT_QUEUE_THRESHOLD=10 STATEMENT_QUEUE_THRESHOLD=10
STATEMENT_QUEUE_CONSECUTIVE_RUNS=3
BACKUP_THRESHOLD_HOURS=32 BACKUP_THRESHOLD_HOURS=32
# --- Notification Configuration --- # --- Notification Configuration ---
@@ -45,10 +43,3 @@ COMPANY_NAME="My Company"
LOG_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd)" LOG_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"
LOG_FILE="${LOG_DIR}/hana_monitor.log" LOG_FILE="${LOG_DIR}/hana_monitor.log"
# --- State Directory ---
STATE_DIR="${LOG_DIR}/monitor_state"
mkdir -p "${STATE_DIR}"
# --- Lock Directory ---
LOCK_DIR="/tmp"

View File

@@ -19,13 +19,6 @@ trap 'release_lock "$SCRIPT_NAME"' EXIT
log_message "$SCRIPT_NAME" "Starting backup status check..." log_message "$SCRIPT_NAME" "Starting backup status check..."
# Check if hdbsql is available
if [ ! -x "$HDBSQL_PATH" ]; then
log_message "$SCRIPT_NAME" "ERROR: hdbsql not found or not executable at ${HDBSQL_PATH}"
send_alert "$SCRIPT_NAME" "HANA Monitor Error" "hdbsql not found or not executable at ${HDBSQL_PATH}"
exit 1
fi
# SQL Query for last successful backup # SQL Query for last successful backup
BACKUP_SQL="SELECT TOP 1 SYS_START_TIME FROM M_BACKUP_CATALOG WHERE ENTRY_TYPE_NAME = 'complete data backup' AND STATE_NAME = 'successful' ORDER BY SYS_START_TIME DESC" BACKUP_SQL="SELECT TOP 1 SYS_START_TIME FROM M_BACKUP_CATALOG WHERE ENTRY_TYPE_NAME = 'complete data backup' AND STATE_NAME = 'successful' ORDER BY SYS_START_TIME DESC"

View File

@@ -1,7 +1,7 @@
#!/bin/bash #!/bin/bash
# ============================================================================= # =============================================================================
# SAP HANA Disk Space Monitoring Script # SAP HANA Disk Space Monitoring Script
# Checks disk usage for configured directories with auto-cleanup capability # Checks disk usage for configured directories
# ============================================================================= # =============================================================================
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd)" SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"

View File

@@ -3,13 +3,22 @@
# SAP HANA Monitoring Library - Shared Functions # SAP HANA Monitoring Library - Shared Functions
# ============================================================================= # =============================================================================
# Logging function with script name prefix # Initialize script with common setup
# Usage: log_message "SCRIPT_NAME" "message" # Usage: init_script "SCRIPT_NAME"
log_message() { # Sets up: SCRIPT_DIR, SCRIPT_NAME, LOG_FILE, LOCK_DIR
local script_name="$1" init_script() {
local message="$2" SCRIPT_NAME="$1"
local timestamp=$(date "+%Y-%m-%d %H:%M:%S") SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"
echo "[${timestamp}] [${script_name}] ${message}" | tee -a "${LOG_FILE}"
# Load configuration
source "${SCRIPT_DIR}/hana.conf"
# Setup logging
LOG_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"
LOG_FILE="${LOG_DIR}/hana_monitor.log"
# Setup lock directory
LOCK_DIR="/tmp"
} }
# Acquire lock for script execution # Acquire lock for script execution
@@ -38,6 +47,15 @@ release_lock() {
fi fi
} }
# Logging function with script name prefix
# Usage: log_message "SCRIPT_NAME" "message"
log_message() {
local script_name="$1"
local message="$2"
local timestamp=$(date "+%Y-%m-%d %H:%M:%S")
echo "[${timestamp}] [${script_name}] ${message}" | tee -a "${LOG_FILE}"
}
# Send notification via ntfy # Send notification via ntfy
# Usage: send_notification "TITLE" "MESSAGE" # Usage: send_notification "TITLE" "MESSAGE"
send_notification() { send_notification() {
@@ -64,23 +82,6 @@ send_alert() {
log_message "$script_name" "ALERT: ${message}" log_message "$script_name" "ALERT: ${message}"
} }
# Send OK notification (state change from alert to normal)
# Usage: send_ok "SCRIPT_NAME" "TITLE_PREFIX" "MESSAGE"
send_ok() {
local script_name="$1"
local title_prefix="$2"
local message="$3"
send_notification "${title_prefix} Resolved" "✅ Resolved: ${message}"
log_message "$script_name" "RESOLVED: ${message}"
}
# Run command as HANA user using su
# Usage: run_as_hana_user "COMMAND"
run_as_hana_user() {
local command="$1"
su - "$HANA_USER" -c "$command"
}
# Execute SQL query as HANA user # Execute SQL query as HANA user
# Usage: execute_hana_sql "SQL_QUERY" # Usage: execute_hana_sql "SQL_QUERY"
# Returns: SQL output on stdout, returns 0 on success, 1 on failure # Returns: SQL output on stdout, returns 0 on success, 1 on failure
@@ -88,7 +89,7 @@ execute_hana_sql() {
local sql_query="$1" local sql_query="$1"
local output local output
output=$(su - "$HANA_USER" -c "$HDBSQL_PATH -U $HANA_USER_KEY -j -a -x \"$sql_query\"" 2>&1) output=$(su - "$HANA_USER" -c "hdbsql -U $HANA_USER_KEY -j -a -x \"$sql_query\"" 2>&1)
local sql_status=$? local sql_status=$?
if [ $sql_status -ne 0 ]; then if [ $sql_status -ne 0 ]; then
@@ -131,19 +132,3 @@ get_disk_usage_percentage() {
fi fi
df "$dir" 2>/dev/null | awk 'NR==2 {gsub(/%/,"",$5); print $5}' df "$dir" 2>/dev/null | awk 'NR==2 {gsub(/%/,"",$5); print $5}'
} }
# Get mount point for a directory
# Usage: get_mount_point "/path/to/dir"
# Returns: Mount point path
get_mount_point() {
local dir="$1"
df "$dir" 2>/dev/null | awk 'NR==2 {print $NF}'
}
# Get available disk space in KB for a directory
# Usage: get_available_space_kb "/path/to/dir"
# Returns: Available space in KB
get_available_space_kb() {
local dir="$1"
df -k "$dir" 2>/dev/null | awk 'NR==2 {print $4}'
}

View File

@@ -19,13 +19,6 @@ trap 'release_lock "$SCRIPT_NAME"' EXIT
log_message "$SCRIPT_NAME" "Starting log segment check..." log_message "$SCRIPT_NAME" "Starting log segment check..."
# Check if hdbsql is available
if [ ! -x "$HDBSQL_PATH" ]; then
log_message "$SCRIPT_NAME" "ERROR: hdbsql not found or not executable at ${HDBSQL_PATH}"
send_alert "$SCRIPT_NAME" "HANA Monitor Error" "hdbsql not found or not executable at ${HDBSQL_PATH}"
exit 1
fi
# SQL Query for log segments # SQL Query for log segments
SQL_QUERY="SELECT b.host, b.service_name, a.state, count(*) FROM PUBLIC.M_LOG_SEGMENTS a JOIN PUBLIC.M_SERVICES b ON (a.host = b.host AND a.port = b.port) GROUP BY b.host, b.service_name, a.state;" SQL_QUERY="SELECT b.host, b.service_name, a.state, count(*) FROM PUBLIC.M_LOG_SEGMENTS a JOIN PUBLIC.M_SERVICES b ON (a.host = b.host AND a.port = b.port) GROUP BY b.host, b.service_name, a.state;"
@@ -72,25 +65,22 @@ log_message "$SCRIPT_NAME" "Truncated Segments: ${truncated_segments}"
log_message "$SCRIPT_NAME" "Free Segments: ${free_segments}" log_message "$SCRIPT_NAME" "Free Segments: ${free_segments}"
if [ $total_segments -eq 0 ]; then if [ $total_segments -eq 0 ]; then
log_message "$SCRIPT_NAME" "WARNING: No log segments found. Skipping percentage checks." log_message "$SCRIPT_NAME" "WARNING: No log segments found."
send_alert "$SCRIPT_NAME" "HANA Log Segment Warning" "No log segments found." send_alert "$SCRIPT_NAME" "HANA Log Segment Warning" "No log segments found."
exit 1 exit 1
fi fi
# Calculate truncated percentage with integer arithmetic # Calculate percentages
truncated_percentage=$((truncated_segments * 100 / total_segments)) truncated_percentage=$((truncated_segments * 100 / total_segments))
if [ $truncated_percentage -gt $TRUNCATED_PERCENTAGE_THRESHOLD ]; then
log_message "$SCRIPT_NAME" "ALERT: ${truncated_percentage}% of log segments are 'Truncated'."
send_alert "$SCRIPT_NAME" "HANA Log Segment" "${truncated_percentage}% of HANA log segments are in 'Truncated' state."
fi
# Calculate free percentage with integer arithmetic
free_percentage=$((free_segments * 100 / total_segments)) free_percentage=$((free_segments * 100 / total_segments))
# Check thresholds and alert
if [ $truncated_percentage -gt $TRUNCATED_PERCENTAGE_THRESHOLD ]; then
send_alert "$SCRIPT_NAME" "HANA Log Segment" "${truncated_percentage}% of log segments are 'Truncated' (threshold: ${TRUNCATED_PERCENTAGE_THRESHOLD}%)."
fi
if [ $free_percentage -lt $FREE_PERCENTAGE_THRESHOLD ]; then if [ $free_percentage -lt $FREE_PERCENTAGE_THRESHOLD ]; then
log_message "$SCRIPT_NAME" "ALERT: Only ${free_percentage}% of log segments are 'Free'." send_alert "$SCRIPT_NAME" "HANA Log Segment" "Only ${free_percentage}% of log segments are 'Free' (threshold: ${FREE_PERCENTAGE_THRESHOLD}%)."
send_alert "$SCRIPT_NAME" "HANA Log Segment" "Only ${free_percentage}% of HANA log segments are in 'Free' state."
fi fi
log_message "$SCRIPT_NAME" "Log segment check complete." log_message "$SCRIPT_NAME" "Log segment check complete."

View File

@@ -19,25 +19,18 @@ trap 'release_lock "$SCRIPT_NAME"' EXIT
log_message "$SCRIPT_NAME" "Starting HANA process status check..." log_message "$SCRIPT_NAME" "Starting HANA process status check..."
# Check if sapcontrol is available
if [ ! -x "$SAPCONTROL_PATH" ]; then
log_message "$SCRIPT_NAME" "ERROR: sapcontrol not found or not executable at ${SAPCONTROL_PATH}"
send_alert "$SCRIPT_NAME" "HANA Monitor Error" "sapcontrol not found or not executable at ${SAPCONTROL_PATH}"
exit 1
fi
# Get process list # Get process list
process_list=$(su - "$HANA_USER" -c "${SAPCONTROL_PATH} -nr ${HANA_INSTANCE_NR} -function GetProcessList" 2>&1) process_list=$(su - "$HANA_USER" -c "sapcontrol -nr ${HANA_INSTANCE_NR} -function GetProcessList" 2>&1)
sapcontrol_status=$? sapcontrol_status=$?
if [ $sapcontrol_status -ne 0 ]; then if [ $sapcontrol_status -ne 3 ]; then
log_message "$SCRIPT_NAME" "ERROR: sapcontrol command failed with exit code ${sapcontrol_status}" log_message "$SCRIPT_NAME" "ERROR: sapcontrol command failed with exit code ${sapcontrol_status}"
send_alert "$SCRIPT_NAME" "HANA Monitor Error" "sapcontrol command failed. Exit code: ${sapcontrol_status}" send_alert "$SCRIPT_NAME" "HANA Monitor Error" "sapcontrol command failed. Exit code: ${sapcontrol_status}"
exit 1 exit 1
fi fi
# Check for non-GREEN processes (skip header lines) # Check for non-GREEN processes (skip header lines)
non_green_processes=$(echo "$process_list" | tail -n +6 | grep -v 'GREEN' | grep -v '^$') non_green_processes=$(echo "$process_list" | tail -n +6 | grep -v 'GREEN' | grep -v '^$' | awk -F', ' '{print $1}')
if [ -n "$non_green_processes" ]; then if [ -n "$non_green_processes" ]; then
log_message "$SCRIPT_NAME" "ALERT: One or more HANA processes are not running!" log_message "$SCRIPT_NAME" "ALERT: One or more HANA processes are not running!"
@@ -47,7 +40,7 @@ if [ -n "$non_green_processes" ]; then
fi fi
# Verify we actually got process data # Verify we actually got process data
green_processes=$(echo "$process_list" | tail -n +6 | grep 'GREEN') green_processes=$(echo "$process_list" | tail -n +6 | grep 'GREEN' | awk -F', ' '{print $1}')
if [ -z "$green_processes" ]; then if [ -z "$green_processes" ]; then
log_message "$SCRIPT_NAME" "WARNING: No process data found. SAP HANA may not be running." log_message "$SCRIPT_NAME" "WARNING: No process data found. SAP HANA may not be running."
send_alert "$SCRIPT_NAME" "HANA Process" "No process data found. SAP HANA may not be running." send_alert "$SCRIPT_NAME" "HANA Process" "No process data found. SAP HANA may not be running."

View File

@@ -19,18 +19,11 @@ trap 'release_lock "$SCRIPT_NAME"' EXIT
log_message "$SCRIPT_NAME" "Starting statement queue check..." log_message "$SCRIPT_NAME" "Starting statement queue check..."
# Check if hdbsql is available
if [ ! -x "$HDBSQL_PATH" ]; then
log_message "$SCRIPT_NAME" "ERROR: hdbsql not found or not executable at ${HDBSQL_PATH}"
send_alert "$SCRIPT_NAME" "HANA Monitor Error" "hdbsql not found or not executable at ${HDBSQL_PATH}"
exit 1
fi
# SQL Query for statement queue # SQL Query for statement queue
STATEMENT_QUEUE_SQL="SELECT COUNT(*) FROM M_SERVICE_THREADS WHERE THREAD_TYPE = 'SqlExecutor' AND THREAD_STATE = 'Queueing';" STATEMENT_QUEUE_SQL="SELECT COUNT(*) FROM M_SERVICE_THREADS WHERE THREAD_TYPE = 'SqlExecutor' AND THREAD_STATE = 'Queueing';"
# Execute SQL query # Execute SQL query
queue_result=$(execute_hana_sql_query "$STATEMENT_QUEUE_SQL") queue_count=$(execute_hana_sql_query "$STATEMENT_QUEUE_SQL")
sql_status=$? sql_status=$?
if [ $sql_status -ne 0 ]; then if [ $sql_status -ne 0 ]; then
@@ -48,30 +41,11 @@ fi
log_message "$SCRIPT_NAME" "Current statement queue length: ${queue_count}" log_message "$SCRIPT_NAME" "Current statement queue length: ${queue_count}"
# Get breach count from state file # Alert immediately if queue exceeds threshold
breach_count_file="${STATE_DIR}/statement_queue_breach_count"
breach_count=0
if [ -f "$breach_count_file" ]; then
breach_count=$(cat "$breach_count_file")
fi
if [ "$queue_count" -gt "$STATEMENT_QUEUE_THRESHOLD" ]; then if [ "$queue_count" -gt "$STATEMENT_QUEUE_THRESHOLD" ]; then
breach_count=$((breach_count + 1)) send_alert "$SCRIPT_NAME" "HANA Statement Queue" "Statement queue count is ${queue_count}, which exceeds threshold of ${STATEMENT_QUEUE_THRESHOLD}."
log_message "$SCRIPT_NAME" "Statement queue is above threshold (${queue_count} > ${STATEMENT_QUEUE_THRESHOLD}). Consecutive breach count: ${breach_count}/${STATEMENT_QUEUE_CONSECUTIVE_RUNS}."
else
if [ "$breach_count" -gt 0 ]; then
log_message "$SCRIPT_NAME" "Statement queue returned to normal. Resetting breach count from ${breach_count} to 0."
fi
breach_count=0
fi
echo "$breach_count" > "$breach_count_file"
if [ "$breach_count" -ge "$STATEMENT_QUEUE_CONSECUTIVE_RUNS" ]; then
message="Statement queue has been over ${STATEMENT_QUEUE_THRESHOLD} for ${breach_count} checks. Current count: ${queue_count}."
send_alert "$SCRIPT_NAME" "HANA Statement Queue" "$message"
exit 1 exit 1
else
log_message "$SCRIPT_NAME" "Statement queue is normal. Current count: ${queue_count}."
fi fi
log_message "$SCRIPT_NAME" "Statement queue is normal. Current count: ${queue_count}."
log_message "$SCRIPT_NAME" "Statement queue check complete." log_message "$SCRIPT_NAME" "Statement queue check complete."

View File

@@ -21,12 +21,10 @@ fi
trap 'release_lock "$SCRIPT_NAME"' EXIT trap 'release_lock "$SCRIPT_NAME"' EXIT
# Function to check SLD health # Function to check SLD health
# Returns HTTP status code or "0" for connection errors
check_sld_health() { check_sld_health() {
local http_status local http_status
http_status=$(curl -k -s -o /dev/null -w "%{http_code}" -m "$SLD_TIMEOUT" --connect-timeout "$SLD_TIMEOUT" "$SLD_URL" 2>/dev/null) http_status=$(curl -k -s -o /dev/null -w "%{http_code}" -m "$SLD_TIMEOUT" --connect-timeout "$SLD_TIMEOUT" "$SLD_URL" 2>/dev/null)
# Handle curl errors (returns 000 on connection failure)
if [ -z "$http_status" ] || [ "$http_status" == "000" ]; then if [ -z "$http_status" ] || [ "$http_status" == "000" ]; then
echo "0" echo "0"
else else
@@ -38,7 +36,6 @@ check_sld_health() {
restart_sld_service() { restart_sld_service() {
log_message "$SCRIPT_NAME" "Attempting to restart SLD service..." log_message "$SCRIPT_NAME" "Attempting to restart SLD service..."
# Try systemctl first
if command -v systemctl &> /dev/null; then if command -v systemctl &> /dev/null; then
systemctl restart sapb1servertools 2>&1 systemctl restart sapb1servertools 2>&1
local restart_status=$? local restart_status=$?
@@ -50,73 +47,56 @@ restart_sld_service() {
return 1 return 1
fi fi
else else
log_message "$SCRIPT_NAME" "systemctl not available, trying alternative restart methods" log_message "$SCRIPT_NAME" "systemctl not available, trying service command"
# Fallback: try service command
service sapb1servertools restart 2>&1 service sapb1servertools restart 2>&1
return $? return $?
fi fi
} }
# Main monitoring logic log_message "$SCRIPT_NAME" "Checking SLD service health at ${SLD_URL}..."
main() {
log_message "$SCRIPT_NAME" "Checking SLD service health at ${SLD_URL}..."
local http_status
http_status=$(check_sld_health)
# 200 OK or 401 Unauthorized indicate the Tomcat layer is actively processing
if [[ $http_status == 200 || $http_status == 401 ]]; then
log_message "$SCRIPT_NAME" "[OK] SLD service healthy (HTTP Status: $http_status)"
return 0
fi
# Service is down or unresponsive
local status_detail
if [ "$http_status" == "0" ]; then
status_detail="Connection failed or timeout"
else
status_detail="HTTP Status: ${http_status}"
fi
log_message "$SCRIPT_NAME" "[ALERT] SLD dead or hung (${status_detail}). Restarting service..."
# Send notification
send_alert "$SCRIPT_NAME" "SLD Service Critical" "SLD service is down (${status_detail}). Restarting ${SLD_URL}"
# Restart the service
if ! restart_sld_service; then
log_message "$SCRIPT_NAME" "[CRITICAL] Failed to restart SLD service"
send_alert "$SCRIPT_NAME" "SLD Service Critical" "Failed to restart SLD service"
return 1
fi
# Allow service to spin up, then log recovery status
log_message "$SCRIPT_NAME" "Waiting 15 seconds for service to restart..."
sleep 15
local recovery_status
recovery_status=$(check_sld_health)
if [[ $recovery_status == 200 || $recovery_status == 401 ]]; then
log_message "$SCRIPT_NAME" "[RECOVERY] SLD service recovered successfully (HTTP Status: $recovery_status)"
else
local recovery_detail
if [ "$recovery_status" == "0" ]; then
recovery_detail="Connection failed after restart"
else
recovery_detail="HTTP Status: $recovery_status"
fi
log_message "$SCRIPT_NAME" "[CRITICAL] SLD service failed to recover after restart (${recovery_detail})"
send_alert "$SCRIPT_NAME" "SLD Service Critical" "SLD service FAILED to recover after restart (${recovery_detail})"
return 1
fi
return 0
}
# Run main function http_status=$(check_sld_health)
main
exit_code=$? # 200 OK or 401 Unauthorized indicate the Tomcat layer is actively processing
if [[ $http_status == 200 || $http_status == 401 ]]; then
log_message "$SCRIPT_NAME" "[OK] SLD service healthy (HTTP Status: $http_status)"
exit 0
fi
# Service is down or unresponsive
if [ "$http_status" == "0" ]; then
status_detail="Connection failed or timeout"
else
status_detail="HTTP Status: ${http_status}"
fi
log_message "$SCRIPT_NAME" "[ALERT] SLD dead or hung (${status_detail}). Restarting service..."
send_alert "$SCRIPT_NAME" "SLD Service Critical" "SLD service is down (${status_detail}). Restarting ${SLD_URL}"
# Restart the service
if ! restart_sld_service; then
log_message "$SCRIPT_NAME" "[CRITICAL] Failed to restart SLD service"
send_alert "$SCRIPT_NAME" "SLD Service Critical" "Failed to restart SLD service"
exit 1
fi
# Allow service to spin up, then log recovery status
log_message "$SCRIPT_NAME" "Waiting 15 seconds for service to restart..."
sleep 15
recovery_status=$(check_sld_health)
if [[ $recovery_status == 200 || $recovery_status == 401 ]]; then
log_message "$SCRIPT_NAME" "[RECOVERY] SLD service recovered successfully (HTTP Status: $recovery_status)"
else
if [ "$recovery_status" == "0" ]; then
recovery_detail="Connection failed after restart"
else
recovery_detail="HTTP Status: $recovery_status"
fi
log_message "$SCRIPT_NAME" "[CRITICAL] SLD service failed to recover after restart (${recovery_detail})"
send_alert "$SCRIPT_NAME" "SLD Service Critical" "SLD service FAILED to recover after restart (${recovery_detail})"
exit 1
fi
log_message "$SCRIPT_NAME" "SLD watchdog check complete." log_message "$SCRIPT_NAME" "SLD watchdog check complete."
exit $exit_code