From 0beef6fa483183947768bda92a1bd51b9c7f1d87 Mon Sep 17 00:00:00 2001 From: Tomi Eckert Date: Thu, 12 Mar 2026 22:18:29 +0100 Subject: [PATCH] refactor(monitoring): simplify monitoring scripts and remove state tracking - Remove consecutive breach tracking for statement queue (immediate alerts) - Consolidate script initialization into init_script() function - Remove unused helper functions (send_ok, run_as_hana_user, get_mount_point, get_available_space_kb) - Flatten sld_watchdog.sh structure by removing main() wrapper - Remove state directory and lock directory configuration from hana.conf - Simplify alert messages to include threshold values This continues the simplification effort from previous commits by removing stateful tracking mechanisms and streamlining the monitoring logic for easier maintenance. --- hana.conf | 8 ---- hana_disk.sh | 2 +- hana_lib.sh | 65 ++++++++++--------------- hana_log_segments.sh | 19 ++++---- hana_queue.sh | 27 ++--------- sld_watchdog.sh | 110 ++++++++++++++++++------------------------- 6 files changed, 83 insertions(+), 148 deletions(-) diff --git a/hana.conf b/hana.conf index 4bd0a94..2c097da 100644 --- a/hana.conf +++ b/hana.conf @@ -33,7 +33,6 @@ DISK_USAGE_THRESHOLD=85 TRUNCATED_PERCENTAGE_THRESHOLD=50 FREE_PERCENTAGE_THRESHOLD=10 STATEMENT_QUEUE_THRESHOLD=10 -STATEMENT_QUEUE_CONSECUTIVE_RUNS=3 BACKUP_THRESHOLD_HOURS=32 # --- Notification Configuration --- @@ -45,10 +44,3 @@ COMPANY_NAME="My Company" LOG_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd)" LOG_FILE="${LOG_DIR}/hana_monitor.log" -# --- State Directory --- -STATE_DIR="${LOG_DIR}/monitor_state" -mkdir -p "${STATE_DIR}" - -# --- Lock Directory --- -LOCK_DIR="/tmp" - diff --git a/hana_disk.sh b/hana_disk.sh index b74e777..c4bec9d 100644 --- a/hana_disk.sh +++ b/hana_disk.sh @@ -1,7 +1,7 @@ #!/bin/bash # ============================================================================= # SAP HANA Disk Space Monitoring Script -# Checks disk usage for configured directories with 
auto-cleanup capability +# Checks disk usage for configured directories # ============================================================================= SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd)" diff --git a/hana_lib.sh b/hana_lib.sh index 3c9b3cb..a1094bd 100644 --- a/hana_lib.sh +++ b/hana_lib.sh @@ -3,13 +3,22 @@ # SAP HANA Monitoring Library - Shared Functions # ============================================================================= -# Logging function with script name prefix -# Usage: log_message "SCRIPT_NAME" "message" -log_message() { - local script_name="$1" - local message="$2" - local timestamp=$(date "+%Y-%m-%d %H:%M:%S") - echo "[${timestamp}] [${script_name}] ${message}" | tee -a "${LOG_FILE}" +# Initialize script with common setup +# Usage: init_script "SCRIPT_NAME" +# Sets up: SCRIPT_DIR, SCRIPT_NAME, LOG_FILE, LOCK_DIR +init_script() { + SCRIPT_NAME="$1" + SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd)" + + # Load configuration + source "${SCRIPT_DIR}/hana.conf" + + # Setup logging + LOG_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd)" + LOG_FILE="${LOG_DIR}/hana_monitor.log" + + # Setup lock directory + LOCK_DIR="/tmp" } # Acquire lock for script execution @@ -38,6 +47,15 @@ release_lock() { fi } +# Logging function with script name prefix +# Usage: log_message "SCRIPT_NAME" "message" +log_message() { + local script_name="$1" + local message="$2" + local timestamp=$(date "+%Y-%m-%d %H:%M:%S") + echo "[${timestamp}] [${script_name}] ${message}" | tee -a "${LOG_FILE}" +} + # Send notification via ntfy # Usage: send_notification "TITLE" "MESSAGE" send_notification() { @@ -64,23 +82,6 @@ send_alert() { log_message "$script_name" "ALERT: ${message}" } -# Send OK notification (state change from alert to normal) -# Usage: send_ok "SCRIPT_NAME" "TITLE_PREFIX" "MESSAGE" -send_ok() { - local script_name="$1" - local title_prefix="$2" - local message="$3" - send_notification 
"${title_prefix} Resolved" "✅ Resolved: ${message}" - log_message "$script_name" "RESOLVED: ${message}" -} - -# Run command as HANA user using su -# Usage: run_as_hana_user "COMMAND" -run_as_hana_user() { - local command="$1" - su - "$HANA_USER" -c "$command" -} - # Execute SQL query as HANA user # Usage: execute_hana_sql "SQL_QUERY" # Returns: SQL output on stdout, returns 0 on success, 1 on failure @@ -131,19 +132,3 @@ get_disk_usage_percentage() { fi df "$dir" 2>/dev/null | awk 'NR==2 {gsub(/%/,"",$5); print $5}' } - -# Get mount point for a directory -# Usage: get_mount_point "/path/to/dir" -# Returns: Mount point path -get_mount_point() { - local dir="$1" - df "$dir" 2>/dev/null | awk 'NR==2 {print $NF}' -} - -# Get available disk space in KB for a directory -# Usage: get_available_space_kb "/path/to/dir" -# Returns: Available space in KB -get_available_space_kb() { - local dir="$1" - df -k "$dir" 2>/dev/null | awk 'NR==2 {print $4}' -} diff --git a/hana_log_segments.sh b/hana_log_segments.sh index e5604d7..f6b6308 100644 --- a/hana_log_segments.sh +++ b/hana_log_segments.sh @@ -72,25 +72,22 @@ log_message "$SCRIPT_NAME" "Truncated Segments: ${truncated_segments}" log_message "$SCRIPT_NAME" "Free Segments: ${free_segments}" if [ $total_segments -eq 0 ]; then - log_message "$SCRIPT_NAME" "WARNING: No log segments found. Skipping percentage checks." + log_message "$SCRIPT_NAME" "WARNING: No log segments found." send_alert "$SCRIPT_NAME" "HANA Log Segment Warning" "No log segments found." exit 1 fi -# Calculate truncated percentage with integer arithmetic +# Calculate percentages truncated_percentage=$((truncated_segments * 100 / total_segments)) - -if [ $truncated_percentage -gt $TRUNCATED_PERCENTAGE_THRESHOLD ]; then - log_message "$SCRIPT_NAME" "ALERT: ${truncated_percentage}% of log segments are 'Truncated'." - send_alert "$SCRIPT_NAME" "HANA Log Segment" "${truncated_percentage}% of HANA log segments are in 'Truncated' state." 
-fi - -# Calculate free percentage with integer arithmetic free_percentage=$((free_segments * 100 / total_segments)) +# Check thresholds and alert +if [ $truncated_percentage -gt $TRUNCATED_PERCENTAGE_THRESHOLD ]; then + send_alert "$SCRIPT_NAME" "HANA Log Segment" "${truncated_percentage}% of log segments are 'Truncated' (threshold: ${TRUNCATED_PERCENTAGE_THRESHOLD}%)." +fi + if [ $free_percentage -lt $FREE_PERCENTAGE_THRESHOLD ]; then - log_message "$SCRIPT_NAME" "ALERT: Only ${free_percentage}% of log segments are 'Free'." - send_alert "$SCRIPT_NAME" "HANA Log Segment" "Only ${free_percentage}% of HANA log segments are in 'Free' state." + send_alert "$SCRIPT_NAME" "HANA Log Segment" "Only ${free_percentage}% of log segments are 'Free' (threshold: ${FREE_PERCENTAGE_THRESHOLD}%)." fi log_message "$SCRIPT_NAME" "Log segment check complete." diff --git a/hana_queue.sh b/hana_queue.sh index d4950db..49196b0 100644 --- a/hana_queue.sh +++ b/hana_queue.sh @@ -30,7 +30,7 @@ fi STATEMENT_QUEUE_SQL="SELECT COUNT(*) FROM M_SERVICE_THREADS WHERE THREAD_TYPE = 'SqlExecutor' AND THREAD_STATE = 'Queueing';" # Execute SQL query -queue_result=$(execute_hana_sql_query "$STATEMENT_QUEUE_SQL") +queue_count=$(execute_hana_sql_query "$STATEMENT_QUEUE_SQL") sql_status=$? if [ $sql_status -ne 0 ]; then @@ -48,30 +48,11 @@ fi log_message "$SCRIPT_NAME" "Current statement queue length: ${queue_count}" -# Get breach count from state file -breach_count_file="${STATE_DIR}/statement_queue_breach_count" -breach_count=0 -if [ -f "$breach_count_file" ]; then - breach_count=$(cat "$breach_count_file") -fi - +# Alert immediately if queue exceeds threshold if [ "$queue_count" -gt "$STATEMENT_QUEUE_THRESHOLD" ]; then - breach_count=$((breach_count + 1)) - log_message "$SCRIPT_NAME" "Statement queue is above threshold (${queue_count} > ${STATEMENT_QUEUE_THRESHOLD}). Consecutive breach count: ${breach_count}/${STATEMENT_QUEUE_CONSECUTIVE_RUNS}." 
-else - if [ "$breach_count" -gt 0 ]; then - log_message "$SCRIPT_NAME" "Statement queue returned to normal. Resetting breach count from ${breach_count} to 0." - fi - breach_count=0 -fi -echo "$breach_count" > "$breach_count_file" - -if [ "$breach_count" -ge "$STATEMENT_QUEUE_CONSECUTIVE_RUNS" ]; then - message="Statement queue has been over ${STATEMENT_QUEUE_THRESHOLD} for ${breach_count} checks. Current count: ${queue_count}." - send_alert "$SCRIPT_NAME" "HANA Statement Queue" "$message" + send_alert "$SCRIPT_NAME" "HANA Statement Queue" "Statement queue count is ${queue_count}, which exceeds threshold of ${STATEMENT_QUEUE_THRESHOLD}." exit 1 -else - log_message "$SCRIPT_NAME" "Statement queue is normal. Current count: ${queue_count}." fi +log_message "$SCRIPT_NAME" "Statement queue is normal. Current count: ${queue_count}." log_message "$SCRIPT_NAME" "Statement queue check complete." diff --git a/sld_watchdog.sh b/sld_watchdog.sh index 0d6943f..ac54e96 100644 --- a/sld_watchdog.sh +++ b/sld_watchdog.sh @@ -21,12 +21,10 @@ fi trap 'release_lock "$SCRIPT_NAME"' EXIT # Function to check SLD health -# Returns HTTP status code or "0" for connection errors check_sld_health() { local http_status http_status=$(curl -k -s -o /dev/null -w "%{http_code}" -m "$SLD_TIMEOUT" --connect-timeout "$SLD_TIMEOUT" "$SLD_URL" 2>/dev/null) - # Handle curl errors (returns 000 on connection failure) if [ -z "$http_status" ] || [ "$http_status" == "000" ]; then echo "0" else @@ -38,7 +36,6 @@ check_sld_health() { restart_sld_service() { log_message "$SCRIPT_NAME" "Attempting to restart SLD service..." - # Try systemctl first if command -v systemctl &> /dev/null; then systemctl restart sapb1servertools 2>&1 local restart_status=$? 
@@ -50,73 +47,56 @@ restart_sld_service() { return 1 fi else - log_message "$SCRIPT_NAME" "systemctl not available, trying alternative restart methods" - # Fallback: try service command + log_message "$SCRIPT_NAME" "systemctl not available, trying service command" service sapb1servertools restart 2>&1 return $? fi } -# Main monitoring logic -main() { - log_message "$SCRIPT_NAME" "Checking SLD service health at ${SLD_URL}..." - - local http_status - http_status=$(check_sld_health) - - # 200 OK or 401 Unauthorized indicate the Tomcat layer is actively processing - if [[ $http_status == 200 || $http_status == 401 ]]; then - log_message "$SCRIPT_NAME" "[OK] SLD service healthy (HTTP Status: $http_status)" - return 0 - fi - - # Service is down or unresponsive - local status_detail - if [ "$http_status" == "0" ]; then - status_detail="Connection failed or timeout" - else - status_detail="HTTP Status: ${http_status}" - fi - - log_message "$SCRIPT_NAME" "[ALERT] SLD dead or hung (${status_detail}). Restarting service..." - - # Send notification - send_alert "$SCRIPT_NAME" "SLD Service Critical" "SLD service is down (${status_detail}). Restarting ${SLD_URL}" - - # Restart the service - if ! restart_sld_service; then - log_message "$SCRIPT_NAME" "[CRITICAL] Failed to restart SLD service" - send_alert "$SCRIPT_NAME" "SLD Service Critical" "Failed to restart SLD service" - return 1 - fi - - # Allow service to spin up, then log recovery status - log_message "$SCRIPT_NAME" "Waiting 15 seconds for service to restart..." 
- sleep 15 - - local recovery_status - recovery_status=$(check_sld_health) - - if [[ $recovery_status == 200 || $recovery_status == 401 ]]; then - log_message "$SCRIPT_NAME" "[RECOVERY] SLD service recovered successfully (HTTP Status: $recovery_status)" - else - local recovery_detail - if [ "$recovery_status" == "0" ]; then - recovery_detail="Connection failed after restart" - else - recovery_detail="HTTP Status: $recovery_status" - fi - log_message "$SCRIPT_NAME" "[CRITICAL] SLD service failed to recover after restart (${recovery_detail})" - send_alert "$SCRIPT_NAME" "SLD Service Critical" "SLD service FAILED to recover after restart (${recovery_detail})" - return 1 - fi - - return 0 -} +log_message "$SCRIPT_NAME" "Checking SLD service health at ${SLD_URL}..." -# Run main function -main -exit_code=$? +http_status=$(check_sld_health) + +# 200 OK or 401 Unauthorized indicate the Tomcat layer is actively processing +if [[ $http_status == 200 || $http_status == 401 ]]; then + log_message "$SCRIPT_NAME" "[OK] SLD service healthy (HTTP Status: $http_status)" + exit 0 +fi + +# Service is down or unresponsive +if [ "$http_status" == "0" ]; then + status_detail="Connection failed or timeout" +else + status_detail="HTTP Status: ${http_status}" +fi + +log_message "$SCRIPT_NAME" "[ALERT] SLD dead or hung (${status_detail}). Restarting service..." +send_alert "$SCRIPT_NAME" "SLD Service Critical" "SLD service is down (${status_detail}). Restarting ${SLD_URL}" + +# Restart the service +if ! restart_sld_service; then + log_message "$SCRIPT_NAME" "[CRITICAL] Failed to restart SLD service" + send_alert "$SCRIPT_NAME" "SLD Service Critical" "Failed to restart SLD service" + exit 1 +fi + +# Allow service to spin up, then log recovery status +log_message "$SCRIPT_NAME" "Waiting 15 seconds for service to restart..." 
+sleep 15 + +recovery_status=$(check_sld_health) + +if [[ $recovery_status == 200 || $recovery_status == 401 ]]; then + log_message "$SCRIPT_NAME" "[RECOVERY] SLD service recovered successfully (HTTP Status: $recovery_status)" +else + if [ "$recovery_status" == "0" ]; then + recovery_detail="Connection failed after restart" + else + recovery_detail="HTTP Status: $recovery_status" + fi + log_message "$SCRIPT_NAME" "[CRITICAL] SLD service failed to recover after restart (${recovery_detail})" + send_alert "$SCRIPT_NAME" "SLD Service Critical" "SLD service FAILED to recover after restart (${recovery_detail})" + exit 1 +fi log_message "$SCRIPT_NAME" "SLD watchdog check complete." -exit $exit_code