refactor(monitoring): simplify monitoring scripts and remove state tracking

- Remove consecutive-breach tracking for the statement queue (alerts now fire immediately on the first breach)
- Consolidate script initialization into init_script() function
- Remove unused helper functions (send_ok, run_as_hana_user, get_mount_point, get_available_space_kb)
- Flatten sld_watchdog.sh structure by removing main() wrapper
- Remove state directory and lock directory configuration from hana.conf
- Simplify alert messages to include threshold values

This continues the simplification effort from previous commits by removing stateful tracking mechanisms and streamlining the monitoring logic for easier maintenance.
This commit is contained in:
2026-03-12 22:18:29 +01:00
parent cf5b81889d
commit 0beef6fa48
6 changed files with 83 additions and 148 deletions

View File

@@ -33,7 +33,6 @@ DISK_USAGE_THRESHOLD=85
TRUNCATED_PERCENTAGE_THRESHOLD=50
FREE_PERCENTAGE_THRESHOLD=10
STATEMENT_QUEUE_THRESHOLD=10
STATEMENT_QUEUE_CONSECUTIVE_RUNS=3
BACKUP_THRESHOLD_HOURS=32
# --- Notification Configuration ---
@@ -45,10 +44,3 @@ COMPANY_NAME="My Company"
LOG_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"
LOG_FILE="${LOG_DIR}/hana_monitor.log"
# --- State Directory ---
STATE_DIR="${LOG_DIR}/monitor_state"
mkdir -p "${STATE_DIR}"
# --- Lock Directory ---
LOCK_DIR="/tmp"

View File

@@ -1,7 +1,7 @@
#!/bin/bash
# =============================================================================
# SAP HANA Disk Space Monitoring Script
# Checks disk usage for configured directories with auto-cleanup capability
# Checks disk usage for configured directories
# =============================================================================
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"

View File

@@ -3,13 +3,22 @@
# SAP HANA Monitoring Library - Shared Functions
# =============================================================================
# Logging function with script name prefix
# Usage: log_message "SCRIPT_NAME" "message"
log_message() {
local script_name="$1"
local message="$2"
local timestamp=$(date "+%Y-%m-%d %H:%M:%S")
echo "[${timestamp}] [${script_name}] ${message}" | tee -a "${LOG_FILE}"
# Initialize a monitoring script with the common setup.
# Usage: init_script "SCRIPT_NAME"
# Sets up: SCRIPT_NAME, SCRIPT_DIR, LOG_DIR, LOG_FILE, LOCK_DIR
# Returns: 0 on success, 1 if hana.conf is unreadable or fails to load
init_script() {
    SCRIPT_NAME="$1"
    # BASH_SOURCE[0] is the file that defines this function, so all paths
    # below are resolved relative to that file's directory.
    SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"

    # Load configuration. A failure is remembered and reported through the
    # return status instead of being silently ignored.
    local _init_conf="${SCRIPT_DIR}/hana.conf"
    local _init_status=0
    if [ -r "$_init_conf" ]; then
        source "$_init_conf" || _init_status=1
    else
        echo "init_script: cannot read configuration file: ${_init_conf}" >&2
        _init_status=1
    fi

    # Setup logging. Assigned after the configuration is loaded, so these
    # values override any LOG_DIR/LOG_FILE set by hana.conf.
    LOG_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"
    LOG_FILE="${LOG_DIR}/hana_monitor.log"

    # Setup lock directory
    LOCK_DIR="/tmp"

    return "$_init_status"
}
# Acquire lock for script execution
@@ -38,6 +47,15 @@ release_lock() {
fi
}
# Logging function with script name prefix
# Usage: log_message "SCRIPT_NAME" "message"
# Output: "[YYYY-MM-DD HH:MM:SS] [SCRIPT_NAME] message" on stdout; the same
#         line is appended to LOG_FILE when that variable is non-empty.
log_message() {
    local script_name="$1"
    local message="$2"
    # Declared separately so the assignment does not mask date's exit status
    local timestamp
    timestamp=$(date "+%Y-%m-%d %H:%M:%S")
    local line="[${timestamp}] [${script_name}] ${message}"

    if [ -n "${LOG_FILE:-}" ]; then
        printf '%s\n' "$line" | tee -a "${LOG_FILE}"
    else
        # No log file configured: still emit the line, without a tee error
        printf '%s\n' "$line"
    fi
}
# Send notification via ntfy
# Usage: send_notification "TITLE" "MESSAGE"
send_notification() {
@@ -64,23 +82,6 @@ send_alert() {
log_message "$script_name" "ALERT: ${message}"
}
# Report that a condition changed from alert back to normal.
# Usage: send_ok "SCRIPT_NAME" "TITLE_PREFIX" "MESSAGE"
send_ok() {
    local origin="$1" heading="$2" detail="$3"

    # Push the notification first, then record the same resolution in the log
    send_notification "${heading} Resolved" "✅ Resolved: ${detail}"
    log_message "${origin}" "RESOLVED: ${detail}"
}
# Execute a command string as the HANA OS user via su.
# Usage: run_as_hana_user "COMMAND"
run_as_hana_user() {
    # The whole command is passed as a single string to su's -c option
    su - "${HANA_USER}" -c "$1"
}
# Execute SQL query as HANA user
# Usage: execute_hana_sql "SQL_QUERY"
# Returns: SQL output on stdout, returns 0 on success, 1 on failure
@@ -131,19 +132,3 @@ get_disk_usage_percentage() {
fi
df "$dir" 2>/dev/null | awk 'NR==2 {gsub(/%/,"",$5); print $5}'
}
# Get mount point for a directory
# Usage: get_mount_point "/path/to/dir"
# Returns: Mount point path on stdout (nothing if df produces no data line)
get_mount_point() {
    local dir="$1"
    # -P selects the POSIX output format, which keeps each filesystem on a
    # single line; without it some df versions wrap long device names and
    # line 2 would then hold only the device name.
    df -P "$dir" 2>/dev/null | awk 'NR==2 {print $NF}'
}
# Get available disk space in KB for a directory
# Usage: get_available_space_kb "/path/to/dir"
# Returns: Available space in KB on stdout (nothing if df produces no data line)
get_available_space_kb() {
    local dir="$1"
    # -P keeps each filesystem on one line (no wrapping of long device
    # names), so the 4th field of line 2 is reliably the "Available" column;
    # -k reports sizes in 1024-byte units.
    df -Pk "$dir" 2>/dev/null | awk 'NR==2 {print $4}'
}

View File

@@ -72,25 +72,22 @@ log_message "$SCRIPT_NAME" "Truncated Segments: ${truncated_segments}"
log_message "$SCRIPT_NAME" "Free Segments: ${free_segments}"
if [ $total_segments -eq 0 ]; then
log_message "$SCRIPT_NAME" "WARNING: No log segments found. Skipping percentage checks."
log_message "$SCRIPT_NAME" "WARNING: No log segments found."
send_alert "$SCRIPT_NAME" "HANA Log Segment Warning" "No log segments found."
exit 1
fi
# Calculate truncated percentage with integer arithmetic
# Calculate percentages
truncated_percentage=$((truncated_segments * 100 / total_segments))
if [ $truncated_percentage -gt $TRUNCATED_PERCENTAGE_THRESHOLD ]; then
log_message "$SCRIPT_NAME" "ALERT: ${truncated_percentage}% of log segments are 'Truncated'."
send_alert "$SCRIPT_NAME" "HANA Log Segment" "${truncated_percentage}% of HANA log segments are in 'Truncated' state."
fi
# Calculate free percentage with integer arithmetic
free_percentage=$((free_segments * 100 / total_segments))
# Check thresholds and alert
if [ $truncated_percentage -gt $TRUNCATED_PERCENTAGE_THRESHOLD ]; then
send_alert "$SCRIPT_NAME" "HANA Log Segment" "${truncated_percentage}% of log segments are 'Truncated' (threshold: ${TRUNCATED_PERCENTAGE_THRESHOLD}%)."
fi
if [ $free_percentage -lt $FREE_PERCENTAGE_THRESHOLD ]; then
log_message "$SCRIPT_NAME" "ALERT: Only ${free_percentage}% of log segments are 'Free'."
send_alert "$SCRIPT_NAME" "HANA Log Segment" "Only ${free_percentage}% of HANA log segments are in 'Free' state."
send_alert "$SCRIPT_NAME" "HANA Log Segment" "Only ${free_percentage}% of log segments are 'Free' (threshold: ${FREE_PERCENTAGE_THRESHOLD}%)."
fi
log_message "$SCRIPT_NAME" "Log segment check complete."

View File

@@ -30,7 +30,7 @@ fi
STATEMENT_QUEUE_SQL="SELECT COUNT(*) FROM M_SERVICE_THREADS WHERE THREAD_TYPE = 'SqlExecutor' AND THREAD_STATE = 'Queueing';"
# Execute SQL query
queue_result=$(execute_hana_sql_query "$STATEMENT_QUEUE_SQL")
queue_count=$(execute_hana_sql_query "$STATEMENT_QUEUE_SQL")
sql_status=$?
if [ $sql_status -ne 0 ]; then
@@ -48,30 +48,11 @@ fi
log_message "$SCRIPT_NAME" "Current statement queue length: ${queue_count}"
# Get breach count from state file
breach_count_file="${STATE_DIR}/statement_queue_breach_count"
breach_count=0
if [ -f "$breach_count_file" ]; then
breach_count=$(cat "$breach_count_file")
fi
# Alert immediately if queue exceeds threshold
if [ "$queue_count" -gt "$STATEMENT_QUEUE_THRESHOLD" ]; then
breach_count=$((breach_count + 1))
log_message "$SCRIPT_NAME" "Statement queue is above threshold (${queue_count} > ${STATEMENT_QUEUE_THRESHOLD}). Consecutive breach count: ${breach_count}/${STATEMENT_QUEUE_CONSECUTIVE_RUNS}."
else
if [ "$breach_count" -gt 0 ]; then
log_message "$SCRIPT_NAME" "Statement queue returned to normal. Resetting breach count from ${breach_count} to 0."
fi
breach_count=0
fi
echo "$breach_count" > "$breach_count_file"
if [ "$breach_count" -ge "$STATEMENT_QUEUE_CONSECUTIVE_RUNS" ]; then
message="Statement queue has been over ${STATEMENT_QUEUE_THRESHOLD} for ${breach_count} checks. Current count: ${queue_count}."
send_alert "$SCRIPT_NAME" "HANA Statement Queue" "$message"
send_alert "$SCRIPT_NAME" "HANA Statement Queue" "Statement queue count is ${queue_count}, which exceeds threshold of ${STATEMENT_QUEUE_THRESHOLD}."
exit 1
else
log_message "$SCRIPT_NAME" "Statement queue is normal. Current count: ${queue_count}."
fi
log_message "$SCRIPT_NAME" "Statement queue is normal. Current count: ${queue_count}."
log_message "$SCRIPT_NAME" "Statement queue check complete."

View File

@@ -21,12 +21,10 @@ fi
trap 'release_lock "$SCRIPT_NAME"' EXIT
# Function to check SLD health
# Returns HTTP status code or "0" for connection errors
check_sld_health() {
local http_status
http_status=$(curl -k -s -o /dev/null -w "%{http_code}" -m "$SLD_TIMEOUT" --connect-timeout "$SLD_TIMEOUT" "$SLD_URL" 2>/dev/null)
# Handle curl errors (returns 000 on connection failure)
if [ -z "$http_status" ] || [ "$http_status" == "000" ]; then
echo "0"
else
@@ -38,7 +36,6 @@ check_sld_health() {
restart_sld_service() {
log_message "$SCRIPT_NAME" "Attempting to restart SLD service..."
# Try systemctl first
if command -v systemctl &> /dev/null; then
systemctl restart sapb1servertools 2>&1
local restart_status=$?
@@ -50,73 +47,56 @@ restart_sld_service() {
return 1
fi
else
log_message "$SCRIPT_NAME" "systemctl not available, trying alternative restart methods"
# Fallback: try service command
log_message "$SCRIPT_NAME" "systemctl not available, trying service command"
service sapb1servertools restart 2>&1
return $?
fi
}
# Main monitoring logic
main() {
log_message "$SCRIPT_NAME" "Checking SLD service health at ${SLD_URL}..."
log_message "$SCRIPT_NAME" "Checking SLD service health at ${SLD_URL}..."
local http_status
http_status=$(check_sld_health)
http_status=$(check_sld_health)
# 200 OK or 401 Unauthorized indicate the Tomcat layer is actively processing
if [[ $http_status == 200 || $http_status == 401 ]]; then
log_message "$SCRIPT_NAME" "[OK] SLD service healthy (HTTP Status: $http_status)"
return 0
fi
# 200 OK or 401 Unauthorized indicate the Tomcat layer is actively processing
if [[ $http_status == 200 || $http_status == 401 ]]; then
log_message "$SCRIPT_NAME" "[OK] SLD service healthy (HTTP Status: $http_status)"
exit 0
fi
# Service is down or unresponsive
local status_detail
if [ "$http_status" == "0" ]; then
status_detail="Connection failed or timeout"
# Service is down or unresponsive
if [ "$http_status" == "0" ]; then
status_detail="Connection failed or timeout"
else
status_detail="HTTP Status: ${http_status}"
fi
log_message "$SCRIPT_NAME" "[ALERT] SLD dead or hung (${status_detail}). Restarting service..."
send_alert "$SCRIPT_NAME" "SLD Service Critical" "SLD service is down (${status_detail}). Restarting ${SLD_URL}"
# Restart the service
if ! restart_sld_service; then
log_message "$SCRIPT_NAME" "[CRITICAL] Failed to restart SLD service"
send_alert "$SCRIPT_NAME" "SLD Service Critical" "Failed to restart SLD service"
exit 1
fi
# Allow service to spin up, then log recovery status
log_message "$SCRIPT_NAME" "Waiting 15 seconds for service to restart..."
sleep 15
recovery_status=$(check_sld_health)
if [[ $recovery_status == 200 || $recovery_status == 401 ]]; then
log_message "$SCRIPT_NAME" "[RECOVERY] SLD service recovered successfully (HTTP Status: $recovery_status)"
else
if [ "$recovery_status" == "0" ]; then
recovery_detail="Connection failed after restart"
else
status_detail="HTTP Status: ${http_status}"
recovery_detail="HTTP Status: $recovery_status"
fi
log_message "$SCRIPT_NAME" "[ALERT] SLD dead or hung (${status_detail}). Restarting service..."
# Send notification
send_alert "$SCRIPT_NAME" "SLD Service Critical" "SLD service is down (${status_detail}). Restarting ${SLD_URL}"
# Restart the service
if ! restart_sld_service; then
log_message "$SCRIPT_NAME" "[CRITICAL] Failed to restart SLD service"
send_alert "$SCRIPT_NAME" "SLD Service Critical" "Failed to restart SLD service"
return 1
fi
# Allow service to spin up, then log recovery status
log_message "$SCRIPT_NAME" "Waiting 15 seconds for service to restart..."
sleep 15
local recovery_status
recovery_status=$(check_sld_health)
if [[ $recovery_status == 200 || $recovery_status == 401 ]]; then
log_message "$SCRIPT_NAME" "[RECOVERY] SLD service recovered successfully (HTTP Status: $recovery_status)"
else
local recovery_detail
if [ "$recovery_status" == "0" ]; then
recovery_detail="Connection failed after restart"
else
recovery_detail="HTTP Status: $recovery_status"
fi
log_message "$SCRIPT_NAME" "[CRITICAL] SLD service failed to recover after restart (${recovery_detail})"
send_alert "$SCRIPT_NAME" "SLD Service Critical" "SLD service FAILED to recover after restart (${recovery_detail})"
return 1
fi
return 0
}
# Run main function
main
exit_code=$?
log_message "$SCRIPT_NAME" "[CRITICAL] SLD service failed to recover after restart (${recovery_detail})"
send_alert "$SCRIPT_NAME" "SLD Service Critical" "SLD service FAILED to recover after restart (${recovery_detail})"
exit 1
fi
log_message "$SCRIPT_NAME" "SLD watchdog check complete."
exit $exit_code