refactor(monitoring): simplify notification system and remove auto-cleanup
- Replace state-based notifications with direct alert functions
- Remove auto-cleanup functionality from disk monitoring and configuration
- Simplify lock acquisition/release across all monitoring scripts
- Add execute_hana_sql helper functions for consistent SQL execution
- Remove state file tracking in favor of direct file operations
- Standardize error handling with exit codes on critical failures
- Clean up hana.conf by removing unused auto-delete directory settings
This commit is contained in:
@@ -1,10 +1,8 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# =============================================================================
|
||||
# sld_watchdog.sh - Monitors SLD service health and restarts if needed
|
||||
# Optimized for better error handling and reliability
|
||||
#
|
||||
# =============================================================================
|
||||
|
||||
# Get script directory and name
|
||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
SCRIPT_NAME="sld_watchdog"
|
||||
|
||||
@@ -15,14 +13,12 @@ source "$SCRIPT_DIR/hana_lib.sh"
|
||||
# SLD-specific configuration
|
||||
SLD_URL="${SLD_URL:-https://localhost:40000/sld/sld0100.svc}"
|
||||
SLD_TIMEOUT="${SLD_TIMEOUT:-5}"
|
||||
SLD_LOCK_FILE="/tmp/hana_sld_watchdog.lock"
|
||||
|
||||
# Acquire lock using library function
|
||||
LOCK_FILE=$(acquire_lock "$SCRIPT_NAME")
|
||||
if [ $? -ne 0 ]; then
|
||||
# Acquire lock
|
||||
if ! acquire_lock "$SCRIPT_NAME"; then
|
||||
exit 0
|
||||
fi
|
||||
trap 'release_lock "$LOCK_FILE"' EXIT
|
||||
trap 'release_lock "$SCRIPT_NAME"' EXIT
|
||||
|
||||
# Function to check SLD health
|
||||
# Returns HTTP status code or "0" for connection errors
|
||||
@@ -71,7 +67,6 @@ main() {
|
||||
# 200 OK or 401 Unauthorized indicate the Tomcat layer is actively processing
|
||||
if [[ $http_status == 200 || $http_status == 401 ]]; then
|
||||
log_message "$SCRIPT_NAME" "[OK] SLD service healthy (HTTP Status: $http_status)"
|
||||
send_notification_if_changed "$SCRIPT_NAME" "sld_status" "SLD Service" "SLD service is healthy (HTTP: $http_status)" "false" "OK"
|
||||
return 0
|
||||
fi
|
||||
|
||||
@@ -86,14 +81,12 @@ main() {
|
||||
log_message "$SCRIPT_NAME" "[ALERT] SLD dead or hung (${status_detail}). Restarting service..."
|
||||
|
||||
# Send notification
|
||||
send_notification_if_changed "$SCRIPT_NAME" "sld_down" "SLD Service Critical" \
|
||||
"SLD service is down (${status_detail}). Restarting ${SLD_URL}" "true" "SLD_DOWN"
|
||||
send_alert "$SCRIPT_NAME" "SLD Service Critical" "SLD service is down (${status_detail}). Restarting ${SLD_URL}"
|
||||
|
||||
# Restart the service
|
||||
if ! restart_sld_service; then
|
||||
log_message "$SCRIPT_NAME" "[CRITICAL] Failed to restart SLD service"
|
||||
send_notification_if_changed "$SCRIPT_NAME" "sld_restart_failed" "SLD Service Critical" \
|
||||
"Failed to restart SLD service" "true" "RESTART_FAILED"
|
||||
send_alert "$SCRIPT_NAME" "SLD Service Critical" "Failed to restart SLD service"
|
||||
return 1
|
||||
fi
|
||||
|
||||
@@ -106,10 +99,6 @@ main() {
|
||||
|
||||
if [[ $recovery_status == 200 || $recovery_status == 401 ]]; then
|
||||
log_message "$SCRIPT_NAME" "[RECOVERY] SLD service recovered successfully (HTTP Status: $recovery_status)"
|
||||
send_notification_if_changed "$SCRIPT_NAME" "sld_down" "SLD Service" \
|
||||
"SLD service recovered (HTTP: $recovery_status)" "false" "OK"
|
||||
send_notification_if_changed "$SCRIPT_NAME" "sld_restart_failed" "SLD Service" \
|
||||
"Service recovered successfully" "false" "OK"
|
||||
else
|
||||
local recovery_detail
|
||||
if [ "$recovery_status" == "0" ]; then
|
||||
@@ -118,8 +107,7 @@ main() {
|
||||
recovery_detail="HTTP Status: $recovery_status"
|
||||
fi
|
||||
log_message "$SCRIPT_NAME" "[CRITICAL] SLD service failed to recover after restart (${recovery_detail})"
|
||||
send_notification_if_changed "$SCRIPT_NAME" "sld_down" "SLD Service Critical" \
|
||||
"SLD service FAILED to recover after restart (${recovery_detail})" "true" "RECOVERY_FAILED"
|
||||
send_alert "$SCRIPT_NAME" "SLD Service Critical" "SLD service FAILED to recover after restart (${recovery_detail})"
|
||||
return 1
|
||||
fi
|
||||
|
||||
|
||||
Reference in New Issue
Block a user