refactor(monitoring): simplify monitoring scripts and remove state tracking

- Remove consecutive-breach tracking for the statement queue (alerts now fire immediately)
- Consolidate script initialization into init_script() function
- Remove unused helper functions (send_ok, run_as_hana_user, get_mount_point)
- Flatten sld_watchdog.sh structure by removing main() wrapper
- Remove state directory and lock directory configuration from hana.conf
- Simplify alert messages to include threshold values

This continues the simplification effort from previous commits by removing stateful tracking mechanisms and streamlining the monitoring logic for easier maintenance.
This commit is contained in:
2026-03-12 22:18:29 +01:00
parent cf5b81889d
commit 0beef6fa48
6 changed files with 83 additions and 148 deletions

View File

@@ -21,12 +21,10 @@ fi
trap 'release_lock "$SCRIPT_NAME"' EXIT
# Function to check SLD health
# Returns HTTP status code or "0" for connection errors
check_sld_health() {
local http_status
http_status=$(curl -k -s -o /dev/null -w "%{http_code}" -m "$SLD_TIMEOUT" --connect-timeout "$SLD_TIMEOUT" "$SLD_URL" 2>/dev/null)
# Handle curl errors (returns 000 on connection failure)
if [ -z "$http_status" ] || [ "$http_status" == "000" ]; then
echo "0"
else
@@ -38,7 +36,6 @@ check_sld_health() {
restart_sld_service() {
log_message "$SCRIPT_NAME" "Attempting to restart SLD service..."
# Try systemctl first
if command -v systemctl &> /dev/null; then
systemctl restart sapb1servertools 2>&1
local restart_status=$?
@@ -50,73 +47,56 @@ restart_sld_service() {
return 1
fi
else
log_message "$SCRIPT_NAME" "systemctl not available, trying alternative restart methods"
# Fallback: try service command
log_message "$SCRIPT_NAME" "systemctl not available, trying service command"
service sapb1servertools restart 2>&1
return $?
fi
}
# Main monitoring logic
# Single-pass SLD watchdog: probe the endpoint and, if it does not answer
# with an accepted status, alert, restart the service, and probe once more.
# Globals:  SCRIPT_NAME, SLD_URL (read)
# Returns:  0 if SLD is healthy, or healthy again after the restart;
#           1 if the restart fails or SLD is still unhealthy afterwards.
main() {
  log_message "$SCRIPT_NAME" "Checking SLD service health at ${SLD_URL}..."

  local probe_code
  probe_code=$(check_sld_health)

  # 200 and 401 both count as healthy: either one means the Tomcat layer answered.
  case "$probe_code" in
    200 | 401)
      log_message "$SCRIPT_NAME" "[OK] SLD service healthy (HTTP Status: $probe_code)"
      return 0
      ;;
  esac

  # Anything else is treated as down; "0" is the probe's marker for a
  # connection error, every other value is reported as an HTTP status.
  local failure_note
  if [[ "$probe_code" != "0" ]]; then
    failure_note="HTTP Status: ${probe_code}"
  else
    failure_note="Connection failed or timeout"
  fi

  log_message "$SCRIPT_NAME" "[ALERT] SLD dead or hung (${failure_note}). Restarting service..."
  send_alert "$SCRIPT_NAME" "SLD Service Critical" "SLD service is down (${failure_note}). Restarting ${SLD_URL}"

  # A failed restart is terminal for this run: report it and stop.
  restart_sld_service || {
    log_message "$SCRIPT_NAME" "[CRITICAL] Failed to restart SLD service"
    send_alert "$SCRIPT_NAME" "SLD Service Critical" "Failed to restart SLD service"
    return 1
  }

  # Give the service time to come up before judging the restart.
  log_message "$SCRIPT_NAME" "Waiting 15 seconds for service to restart..."
  sleep 15

  local post_restart_code
  post_restart_code=$(check_sld_health)

  case "$post_restart_code" in
    200 | 401)
      log_message "$SCRIPT_NAME" "[RECOVERY] SLD service recovered successfully (HTTP Status: $post_restart_code)"
      ;;
    *)
      local post_restart_note
      if [[ "$post_restart_code" != "0" ]]; then
        post_restart_note="HTTP Status: $post_restart_code"
      else
        post_restart_note="Connection failed after restart"
      fi
      log_message "$SCRIPT_NAME" "[CRITICAL] SLD service failed to recover after restart (${post_restart_note})"
      send_alert "$SCRIPT_NAME" "SLD Service Critical" "SLD service FAILED to recover after restart (${post_restart_note})"
      return 1
      ;;
  esac

  return 0
}
# Top-level watchdog flow: probe SLD once; if it is not healthy, alert,
# restart the service, wait, and probe again. Exits 0 when healthy and 1 when
# the restart fails or the service is still unhealthy afterwards.
# NOTE(review): this span looks like a rendered diff with removed and added
# lines interleaved (the `main` call and `exit_code` lines sit next to the
# inlined logic) — confirm statement order against the actual file.
log_message "$SCRIPT_NAME" "Checking SLD service health at ${SLD_URL}..."
# Run main function
main
# Capture main's exit status; it is only consumed by the final `exit` below.
exit_code=$?
# check_sld_health prints the HTTP status code, or "0" on connection errors.
http_status=$(check_sld_health)
# 200 OK or 401 Unauthorized indicate the Tomcat layer is actively processing
if [[ $http_status == 200 || $http_status == 401 ]]; then
log_message "$SCRIPT_NAME" "[OK] SLD service healthy (HTTP Status: $http_status)"
exit 0
fi
# Service is down or unresponsive
# Build a human-readable reason: "0" means the probe could not connect,
# anything else is reported as the raw HTTP status.
if [ "$http_status" == "0" ]; then
status_detail="Connection failed or timeout"
else
status_detail="HTTP Status: ${http_status}"
fi
# Log and notify before attempting the restart.
log_message "$SCRIPT_NAME" "[ALERT] SLD dead or hung (${status_detail}). Restarting service..."
send_alert "$SCRIPT_NAME" "SLD Service Critical" "SLD service is down (${status_detail}). Restarting ${SLD_URL}"
# Restart the service
# A failed restart is reported and ends the run with status 1.
if ! restart_sld_service; then
log_message "$SCRIPT_NAME" "[CRITICAL] Failed to restart SLD service"
send_alert "$SCRIPT_NAME" "SLD Service Critical" "Failed to restart SLD service"
exit 1
fi
# Allow service to spin up, then log recovery status
log_message "$SCRIPT_NAME" "Waiting 15 seconds for service to restart..."
sleep 15
# Second probe, judged by the same 200/401 rule as the first.
recovery_status=$(check_sld_health)
if [[ $recovery_status == 200 || $recovery_status == 401 ]]; then
log_message "$SCRIPT_NAME" "[RECOVERY] SLD service recovered successfully (HTTP Status: $recovery_status)"
else
# Still unhealthy: describe why, log, alert, and exit non-zero.
if [ "$recovery_status" == "0" ]; then
recovery_detail="Connection failed after restart"
else
recovery_detail="HTTP Status: $recovery_status"
fi
log_message "$SCRIPT_NAME" "[CRITICAL] SLD service failed to recover after restart (${recovery_detail})"
send_alert "$SCRIPT_NAME" "SLD Service Critical" "SLD service FAILED to recover after restart (${recovery_detail})"
exit 1
fi
# Reached only on the recovery path; the healthy and failure paths exit earlier.
log_message "$SCRIPT_NAME" "SLD watchdog check complete."
# NOTE(review): exit_code is assigned only by the `exit_code=$?` line after the
# `main` call. If that line is dropped together with main(), this unquoted
# expansion is empty and becomes a bare `exit` (status of the last command) —
# confirm whether an explicit `exit 0` is intended.
exit $exit_code