refactor(monitoring): simplify monitoring scripts and remove state tracking
- Remove consecutive breach tracking for statement queue (immediate alerts)
- Consolidate script initialization into init_script() function
- Remove unused helper functions (send_ok, run_as_hana_user, get_mount_point)
- Flatten sld_watchdog.sh structure by removing main() wrapper
- Remove state directory and lock directory configuration from hana.conf
- Simplify alert messages to include threshold values

This continues the simplification effort from previous commits by removing stateful tracking mechanisms and streamlining the monitoring logic for easier maintenance.
This commit is contained in:
110
sld_watchdog.sh
110
sld_watchdog.sh
@@ -21,12 +21,10 @@ fi
|
||||
trap 'release_lock "$SCRIPT_NAME"' EXIT
|
||||
|
||||
# Function to check SLD health
|
||||
# Returns HTTP status code or "0" for connection errors
|
||||
check_sld_health() {
|
||||
local http_status
|
||||
http_status=$(curl -k -s -o /dev/null -w "%{http_code}" -m "$SLD_TIMEOUT" --connect-timeout "$SLD_TIMEOUT" "$SLD_URL" 2>/dev/null)
|
||||
|
||||
# Handle curl errors (returns 000 on connection failure)
|
||||
if [ -z "$http_status" ] || [ "$http_status" == "000" ]; then
|
||||
echo "0"
|
||||
else
|
||||
@@ -38,7 +36,6 @@ check_sld_health() {
|
||||
restart_sld_service() {
|
||||
log_message "$SCRIPT_NAME" "Attempting to restart SLD service..."
|
||||
|
||||
# Try systemctl first
|
||||
if command -v systemctl &> /dev/null; then
|
||||
systemctl restart sapb1servertools 2>&1
|
||||
local restart_status=$?
|
||||
@@ -50,73 +47,56 @@ restart_sld_service() {
|
||||
return 1
|
||||
fi
|
||||
else
|
||||
log_message "$SCRIPT_NAME" "systemctl not available, trying alternative restart methods"
|
||||
# Fallback: try service command
|
||||
log_message "$SCRIPT_NAME" "systemctl not available, trying service command"
|
||||
service sapb1servertools restart 2>&1
|
||||
return $?
|
||||
fi
|
||||
}
|
||||
|
||||
# Main monitoring logic
|
||||
main() {
|
||||
log_message "$SCRIPT_NAME" "Checking SLD service health at ${SLD_URL}..."
|
||||
|
||||
local http_status
|
||||
http_status=$(check_sld_health)
|
||||
|
||||
# 200 OK or 401 Unauthorized indicate the Tomcat layer is actively processing
|
||||
if [[ $http_status == 200 || $http_status == 401 ]]; then
|
||||
log_message "$SCRIPT_NAME" "[OK] SLD service healthy (HTTP Status: $http_status)"
|
||||
return 0
|
||||
fi
|
||||
|
||||
# Service is down or unresponsive
|
||||
local status_detail
|
||||
if [ "$http_status" == "0" ]; then
|
||||
status_detail="Connection failed or timeout"
|
||||
else
|
||||
status_detail="HTTP Status: ${http_status}"
|
||||
fi
|
||||
|
||||
log_message "$SCRIPT_NAME" "[ALERT] SLD dead or hung (${status_detail}). Restarting service..."
|
||||
|
||||
# Send notification
|
||||
send_alert "$SCRIPT_NAME" "SLD Service Critical" "SLD service is down (${status_detail}). Restarting ${SLD_URL}"
|
||||
|
||||
# Restart the service
|
||||
if ! restart_sld_service; then
|
||||
log_message "$SCRIPT_NAME" "[CRITICAL] Failed to restart SLD service"
|
||||
send_alert "$SCRIPT_NAME" "SLD Service Critical" "Failed to restart SLD service"
|
||||
return 1
|
||||
fi
|
||||
|
||||
# Allow service to spin up, then log recovery status
|
||||
log_message "$SCRIPT_NAME" "Waiting 15 seconds for service to restart..."
|
||||
sleep 15
|
||||
|
||||
local recovery_status
|
||||
recovery_status=$(check_sld_health)
|
||||
|
||||
if [[ $recovery_status == 200 || $recovery_status == 401 ]]; then
|
||||
log_message "$SCRIPT_NAME" "[RECOVERY] SLD service recovered successfully (HTTP Status: $recovery_status)"
|
||||
else
|
||||
local recovery_detail
|
||||
if [ "$recovery_status" == "0" ]; then
|
||||
recovery_detail="Connection failed after restart"
|
||||
else
|
||||
recovery_detail="HTTP Status: $recovery_status"
|
||||
fi
|
||||
log_message "$SCRIPT_NAME" "[CRITICAL] SLD service failed to recover after restart (${recovery_detail})"
|
||||
send_alert "$SCRIPT_NAME" "SLD Service Critical" "SLD service FAILED to recover after restart (${recovery_detail})"
|
||||
return 1
|
||||
fi
|
||||
|
||||
return 0
|
||||
}
|
||||
log_message "$SCRIPT_NAME" "Checking SLD service health at ${SLD_URL}..."
|
||||
|
||||
# Run main function
|
||||
main
|
||||
exit_code=$?
|
||||
http_status=$(check_sld_health)
|
||||
|
||||
# 200 OK or 401 Unauthorized indicate the Tomcat layer is actively processing
|
||||
if [[ $http_status == 200 || $http_status == 401 ]]; then
|
||||
log_message "$SCRIPT_NAME" "[OK] SLD service healthy (HTTP Status: $http_status)"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
# Service is down or unresponsive
|
||||
if [ "$http_status" == "0" ]; then
|
||||
status_detail="Connection failed or timeout"
|
||||
else
|
||||
status_detail="HTTP Status: ${http_status}"
|
||||
fi
|
||||
|
||||
log_message "$SCRIPT_NAME" "[ALERT] SLD dead or hung (${status_detail}). Restarting service..."
|
||||
send_alert "$SCRIPT_NAME" "SLD Service Critical" "SLD service is down (${status_detail}). Restarting ${SLD_URL}"
|
||||
|
||||
# Restart the service
|
||||
if ! restart_sld_service; then
|
||||
log_message "$SCRIPT_NAME" "[CRITICAL] Failed to restart SLD service"
|
||||
send_alert "$SCRIPT_NAME" "SLD Service Critical" "Failed to restart SLD service"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Allow service to spin up, then log recovery status
|
||||
log_message "$SCRIPT_NAME" "Waiting 15 seconds for service to restart..."
|
||||
sleep 15
|
||||
|
||||
recovery_status=$(check_sld_health)
|
||||
|
||||
if [[ $recovery_status == 200 || $recovery_status == 401 ]]; then
|
||||
log_message "$SCRIPT_NAME" "[RECOVERY] SLD service recovered successfully (HTTP Status: $recovery_status)"
|
||||
else
|
||||
if [ "$recovery_status" == "0" ]; then
|
||||
recovery_detail="Connection failed after restart"
|
||||
else
|
||||
recovery_detail="HTTP Status: $recovery_status"
|
||||
fi
|
||||
log_message "$SCRIPT_NAME" "[CRITICAL] SLD service failed to recover after restart (${recovery_detail})"
|
||||
send_alert "$SCRIPT_NAME" "SLD Service Critical" "SLD service FAILED to recover after restart (${recovery_detail})"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
log_message "$SCRIPT_NAME" "SLD watchdog check complete."
|
||||
exit $exit_code
|
||||
|
||||
Reference in New Issue
Block a user