refactor(monitoring): simplify monitoring scripts and remove state tracking
- Remove consecutive breach tracking for statement queue (immediate alerts)
- Consolidate script initialization into init_script() function
- Remove unused helper functions (send_ok, run_as_hana_user, get_mount_point)
- Flatten sld_watchdog.sh structure by removing main() wrapper
- Remove state directory and lock directory configuration from hana.conf
- Simplify alert messages to include threshold values

This continues the simplification effort from previous commits by removing stateful tracking mechanisms and streamlining the monitoring logic for easier maintenance.
This commit is contained in:
@@ -33,7 +33,6 @@ DISK_USAGE_THRESHOLD=85
|
|||||||
TRUNCATED_PERCENTAGE_THRESHOLD=50
|
TRUNCATED_PERCENTAGE_THRESHOLD=50
|
||||||
FREE_PERCENTAGE_THRESHOLD=10
|
FREE_PERCENTAGE_THRESHOLD=10
|
||||||
STATEMENT_QUEUE_THRESHOLD=10
|
STATEMENT_QUEUE_THRESHOLD=10
|
||||||
STATEMENT_QUEUE_CONSECUTIVE_RUNS=3
|
|
||||||
BACKUP_THRESHOLD_HOURS=32
|
BACKUP_THRESHOLD_HOURS=32
|
||||||
|
|
||||||
# --- Notification Configuration ---
|
# --- Notification Configuration ---
|
||||||
@@ -45,10 +44,3 @@ COMPANY_NAME="My Company"
|
|||||||
LOG_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"
|
LOG_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"
|
||||||
LOG_FILE="${LOG_DIR}/hana_monitor.log"
|
LOG_FILE="${LOG_DIR}/hana_monitor.log"
|
||||||
|
|
||||||
# --- State Directory ---
|
|
||||||
STATE_DIR="${LOG_DIR}/monitor_state"
|
|
||||||
mkdir -p "${STATE_DIR}"
|
|
||||||
|
|
||||||
# --- Lock Directory ---
|
|
||||||
LOCK_DIR="/tmp"
|
|
||||||
|
|
||||||
|
|||||||
@@ -1,7 +1,7 @@
|
|||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
# SAP HANA Disk Space Monitoring Script
|
# SAP HANA Disk Space Monitoring Script
|
||||||
# Checks disk usage for configured directories with auto-cleanup capability
|
# Checks disk usage for configured directories
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
|
|
||||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"
|
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"
|
||||||
|
|||||||
65
hana_lib.sh
65
hana_lib.sh
@@ -3,13 +3,22 @@
|
|||||||
# SAP HANA Monitoring Library - Shared Functions
|
# SAP HANA Monitoring Library - Shared Functions
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
|
|
||||||
# Logging function with script name prefix
|
# Initialize script with common setup
|
||||||
# Usage: log_message "SCRIPT_NAME" "message"
|
# Usage: init_script "SCRIPT_NAME"
|
||||||
log_message() {
|
# Sets up: SCRIPT_DIR, SCRIPT_NAME, LOG_FILE, LOCK_DIR
|
||||||
local script_name="$1"
|
init_script() {
|
||||||
local message="$2"
|
SCRIPT_NAME="$1"
|
||||||
local timestamp=$(date "+%Y-%m-%d %H:%M:%S")
|
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"
|
||||||
echo "[${timestamp}] [${script_name}] ${message}" | tee -a "${LOG_FILE}"
|
|
||||||
|
# Load configuration
|
||||||
|
source "${SCRIPT_DIR}/hana.conf"
|
||||||
|
|
||||||
|
# Setup logging
|
||||||
|
LOG_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"
|
||||||
|
LOG_FILE="${LOG_DIR}/hana_monitor.log"
|
||||||
|
|
||||||
|
# Setup lock directory
|
||||||
|
LOCK_DIR="/tmp"
|
||||||
}
|
}
|
||||||
|
|
||||||
# Acquire lock for script execution
|
# Acquire lock for script execution
|
||||||
@@ -38,6 +47,15 @@ release_lock() {
|
|||||||
fi
|
fi
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Logging function with script name prefix
|
||||||
|
# Usage: log_message "SCRIPT_NAME" "message"
|
||||||
|
log_message() {
|
||||||
|
local script_name="$1"
|
||||||
|
local message="$2"
|
||||||
|
local timestamp=$(date "+%Y-%m-%d %H:%M:%S")
|
||||||
|
echo "[${timestamp}] [${script_name}] ${message}" | tee -a "${LOG_FILE}"
|
||||||
|
}
|
||||||
|
|
||||||
# Send notification via ntfy
|
# Send notification via ntfy
|
||||||
# Usage: send_notification "TITLE" "MESSAGE"
|
# Usage: send_notification "TITLE" "MESSAGE"
|
||||||
send_notification() {
|
send_notification() {
|
||||||
@@ -64,23 +82,6 @@ send_alert() {
|
|||||||
log_message "$script_name" "ALERT: ${message}"
|
log_message "$script_name" "ALERT: ${message}"
|
||||||
}
|
}
|
||||||
|
|
||||||
# Send OK notification (state change from alert to normal)
|
|
||||||
# Usage: send_ok "SCRIPT_NAME" "TITLE_PREFIX" "MESSAGE"
|
|
||||||
send_ok() {
|
|
||||||
local script_name="$1"
|
|
||||||
local title_prefix="$2"
|
|
||||||
local message="$3"
|
|
||||||
send_notification "${title_prefix} Resolved" "✅ Resolved: ${message}"
|
|
||||||
log_message "$script_name" "RESOLVED: ${message}"
|
|
||||||
}
|
|
||||||
|
|
||||||
# Run command as HANA user using su
|
|
||||||
# Usage: run_as_hana_user "COMMAND"
|
|
||||||
run_as_hana_user() {
|
|
||||||
local command="$1"
|
|
||||||
su - "$HANA_USER" -c "$command"
|
|
||||||
}
|
|
||||||
|
|
||||||
# Execute SQL query as HANA user
|
# Execute SQL query as HANA user
|
||||||
# Usage: execute_hana_sql "SQL_QUERY"
|
# Usage: execute_hana_sql "SQL_QUERY"
|
||||||
# Returns: SQL output on stdout, returns 0 on success, 1 on failure
|
# Returns: SQL output on stdout, returns 0 on success, 1 on failure
|
||||||
@@ -131,19 +132,3 @@ get_disk_usage_percentage() {
|
|||||||
fi
|
fi
|
||||||
df "$dir" 2>/dev/null | awk 'NR==2 {gsub(/%/,"",$5); print $5}'
|
df "$dir" 2>/dev/null | awk 'NR==2 {gsub(/%/,"",$5); print $5}'
|
||||||
}
|
}
|
||||||
|
|
||||||
# Get mount point for a directory
|
|
||||||
# Usage: get_mount_point "/path/to/dir"
|
|
||||||
# Returns: Mount point path
|
|
||||||
get_mount_point() {
|
|
||||||
local dir="$1"
|
|
||||||
df "$dir" 2>/dev/null | awk 'NR==2 {print $NF}'
|
|
||||||
}
|
|
||||||
|
|
||||||
# Get available disk space in KB for a directory
|
|
||||||
# Usage: get_available_space_kb "/path/to/dir"
|
|
||||||
# Returns: Available space in KB
|
|
||||||
get_available_space_kb() {
|
|
||||||
local dir="$1"
|
|
||||||
df -k "$dir" 2>/dev/null | awk 'NR==2 {print $4}'
|
|
||||||
}
|
|
||||||
|
|||||||
@@ -72,25 +72,22 @@ log_message "$SCRIPT_NAME" "Truncated Segments: ${truncated_segments}"
|
|||||||
log_message "$SCRIPT_NAME" "Free Segments: ${free_segments}"
|
log_message "$SCRIPT_NAME" "Free Segments: ${free_segments}"
|
||||||
|
|
||||||
if [ $total_segments -eq 0 ]; then
|
if [ $total_segments -eq 0 ]; then
|
||||||
log_message "$SCRIPT_NAME" "WARNING: No log segments found. Skipping percentage checks."
|
log_message "$SCRIPT_NAME" "WARNING: No log segments found."
|
||||||
send_alert "$SCRIPT_NAME" "HANA Log Segment Warning" "No log segments found."
|
send_alert "$SCRIPT_NAME" "HANA Log Segment Warning" "No log segments found."
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Calculate truncated percentage with integer arithmetic
|
# Calculate percentages
|
||||||
truncated_percentage=$((truncated_segments * 100 / total_segments))
|
truncated_percentage=$((truncated_segments * 100 / total_segments))
|
||||||
|
|
||||||
if [ $truncated_percentage -gt $TRUNCATED_PERCENTAGE_THRESHOLD ]; then
|
|
||||||
log_message "$SCRIPT_NAME" "ALERT: ${truncated_percentage}% of log segments are 'Truncated'."
|
|
||||||
send_alert "$SCRIPT_NAME" "HANA Log Segment" "${truncated_percentage}% of HANA log segments are in 'Truncated' state."
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Calculate free percentage with integer arithmetic
|
|
||||||
free_percentage=$((free_segments * 100 / total_segments))
|
free_percentage=$((free_segments * 100 / total_segments))
|
||||||
|
|
||||||
|
# Check thresholds and alert
|
||||||
|
if [ $truncated_percentage -gt $TRUNCATED_PERCENTAGE_THRESHOLD ]; then
|
||||||
|
send_alert "$SCRIPT_NAME" "HANA Log Segment" "${truncated_percentage}% of log segments are 'Truncated' (threshold: ${TRUNCATED_PERCENTAGE_THRESHOLD}%)."
|
||||||
|
fi
|
||||||
|
|
||||||
if [ $free_percentage -lt $FREE_PERCENTAGE_THRESHOLD ]; then
|
if [ $free_percentage -lt $FREE_PERCENTAGE_THRESHOLD ]; then
|
||||||
log_message "$SCRIPT_NAME" "ALERT: Only ${free_percentage}% of log segments are 'Free'."
|
send_alert "$SCRIPT_NAME" "HANA Log Segment" "Only ${free_percentage}% of log segments are 'Free' (threshold: ${FREE_PERCENTAGE_THRESHOLD}%)."
|
||||||
send_alert "$SCRIPT_NAME" "HANA Log Segment" "Only ${free_percentage}% of HANA log segments are in 'Free' state."
|
|
||||||
fi
|
fi
|
||||||
|
|
||||||
log_message "$SCRIPT_NAME" "Log segment check complete."
|
log_message "$SCRIPT_NAME" "Log segment check complete."
|
||||||
|
|||||||
@@ -30,7 +30,7 @@ fi
|
|||||||
STATEMENT_QUEUE_SQL="SELECT COUNT(*) FROM M_SERVICE_THREADS WHERE THREAD_TYPE = 'SqlExecutor' AND THREAD_STATE = 'Queueing';"
|
STATEMENT_QUEUE_SQL="SELECT COUNT(*) FROM M_SERVICE_THREADS WHERE THREAD_TYPE = 'SqlExecutor' AND THREAD_STATE = 'Queueing';"
|
||||||
|
|
||||||
# Execute SQL query
|
# Execute SQL query
|
||||||
queue_result=$(execute_hana_sql_query "$STATEMENT_QUEUE_SQL")
|
queue_count=$(execute_hana_sql_query "$STATEMENT_QUEUE_SQL")
|
||||||
sql_status=$?
|
sql_status=$?
|
||||||
|
|
||||||
if [ $sql_status -ne 0 ]; then
|
if [ $sql_status -ne 0 ]; then
|
||||||
@@ -48,30 +48,11 @@ fi
|
|||||||
|
|
||||||
log_message "$SCRIPT_NAME" "Current statement queue length: ${queue_count}"
|
log_message "$SCRIPT_NAME" "Current statement queue length: ${queue_count}"
|
||||||
|
|
||||||
# Get breach count from state file
|
# Alert immediately if queue exceeds threshold
|
||||||
breach_count_file="${STATE_DIR}/statement_queue_breach_count"
|
|
||||||
breach_count=0
|
|
||||||
if [ -f "$breach_count_file" ]; then
|
|
||||||
breach_count=$(cat "$breach_count_file")
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [ "$queue_count" -gt "$STATEMENT_QUEUE_THRESHOLD" ]; then
|
if [ "$queue_count" -gt "$STATEMENT_QUEUE_THRESHOLD" ]; then
|
||||||
breach_count=$((breach_count + 1))
|
send_alert "$SCRIPT_NAME" "HANA Statement Queue" "Statement queue count is ${queue_count}, which exceeds threshold of ${STATEMENT_QUEUE_THRESHOLD}."
|
||||||
log_message "$SCRIPT_NAME" "Statement queue is above threshold (${queue_count} > ${STATEMENT_QUEUE_THRESHOLD}). Consecutive breach count: ${breach_count}/${STATEMENT_QUEUE_CONSECUTIVE_RUNS}."
|
|
||||||
else
|
|
||||||
if [ "$breach_count" -gt 0 ]; then
|
|
||||||
log_message "$SCRIPT_NAME" "Statement queue returned to normal. Resetting breach count from ${breach_count} to 0."
|
|
||||||
fi
|
|
||||||
breach_count=0
|
|
||||||
fi
|
|
||||||
echo "$breach_count" > "$breach_count_file"
|
|
||||||
|
|
||||||
if [ "$breach_count" -ge "$STATEMENT_QUEUE_CONSECUTIVE_RUNS" ]; then
|
|
||||||
message="Statement queue has been over ${STATEMENT_QUEUE_THRESHOLD} for ${breach_count} checks. Current count: ${queue_count}."
|
|
||||||
send_alert "$SCRIPT_NAME" "HANA Statement Queue" "$message"
|
|
||||||
exit 1
|
exit 1
|
||||||
else
|
|
||||||
log_message "$SCRIPT_NAME" "Statement queue is normal. Current count: ${queue_count}."
|
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
log_message "$SCRIPT_NAME" "Statement queue is normal. Current count: ${queue_count}."
|
||||||
log_message "$SCRIPT_NAME" "Statement queue check complete."
|
log_message "$SCRIPT_NAME" "Statement queue check complete."
|
||||||
|
|||||||
@@ -21,12 +21,10 @@ fi
|
|||||||
trap 'release_lock "$SCRIPT_NAME"' EXIT
|
trap 'release_lock "$SCRIPT_NAME"' EXIT
|
||||||
|
|
||||||
# Function to check SLD health
|
# Function to check SLD health
|
||||||
# Returns HTTP status code or "0" for connection errors
|
|
||||||
check_sld_health() {
|
check_sld_health() {
|
||||||
local http_status
|
local http_status
|
||||||
http_status=$(curl -k -s -o /dev/null -w "%{http_code}" -m "$SLD_TIMEOUT" --connect-timeout "$SLD_TIMEOUT" "$SLD_URL" 2>/dev/null)
|
http_status=$(curl -k -s -o /dev/null -w "%{http_code}" -m "$SLD_TIMEOUT" --connect-timeout "$SLD_TIMEOUT" "$SLD_URL" 2>/dev/null)
|
||||||
|
|
||||||
# Handle curl errors (returns 000 on connection failure)
|
|
||||||
if [ -z "$http_status" ] || [ "$http_status" == "000" ]; then
|
if [ -z "$http_status" ] || [ "$http_status" == "000" ]; then
|
||||||
echo "0"
|
echo "0"
|
||||||
else
|
else
|
||||||
@@ -38,7 +36,6 @@ check_sld_health() {
|
|||||||
restart_sld_service() {
|
restart_sld_service() {
|
||||||
log_message "$SCRIPT_NAME" "Attempting to restart SLD service..."
|
log_message "$SCRIPT_NAME" "Attempting to restart SLD service..."
|
||||||
|
|
||||||
# Try systemctl first
|
|
||||||
if command -v systemctl &> /dev/null; then
|
if command -v systemctl &> /dev/null; then
|
||||||
systemctl restart sapb1servertools 2>&1
|
systemctl restart sapb1servertools 2>&1
|
||||||
local restart_status=$?
|
local restart_status=$?
|
||||||
@@ -50,57 +47,48 @@ restart_sld_service() {
|
|||||||
return 1
|
return 1
|
||||||
fi
|
fi
|
||||||
else
|
else
|
||||||
log_message "$SCRIPT_NAME" "systemctl not available, trying alternative restart methods"
|
log_message "$SCRIPT_NAME" "systemctl not available, trying service command"
|
||||||
# Fallback: try service command
|
|
||||||
service sapb1servertools restart 2>&1
|
service sapb1servertools restart 2>&1
|
||||||
return $?
|
return $?
|
||||||
fi
|
fi
|
||||||
}
|
}
|
||||||
|
|
||||||
# Main monitoring logic
|
log_message "$SCRIPT_NAME" "Checking SLD service health at ${SLD_URL}..."
|
||||||
main() {
|
|
||||||
log_message "$SCRIPT_NAME" "Checking SLD service health at ${SLD_URL}..."
|
|
||||||
|
|
||||||
local http_status
|
http_status=$(check_sld_health)
|
||||||
http_status=$(check_sld_health)
|
|
||||||
|
|
||||||
# 200 OK or 401 Unauthorized indicate the Tomcat layer is actively processing
|
# 200 OK or 401 Unauthorized indicate the Tomcat layer is actively processing
|
||||||
if [[ $http_status == 200 || $http_status == 401 ]]; then
|
if [[ $http_status == 200 || $http_status == 401 ]]; then
|
||||||
log_message "$SCRIPT_NAME" "[OK] SLD service healthy (HTTP Status: $http_status)"
|
log_message "$SCRIPT_NAME" "[OK] SLD service healthy (HTTP Status: $http_status)"
|
||||||
return 0
|
exit 0
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Service is down or unresponsive
|
# Service is down or unresponsive
|
||||||
local status_detail
|
if [ "$http_status" == "0" ]; then
|
||||||
if [ "$http_status" == "0" ]; then
|
|
||||||
status_detail="Connection failed or timeout"
|
status_detail="Connection failed or timeout"
|
||||||
else
|
else
|
||||||
status_detail="HTTP Status: ${http_status}"
|
status_detail="HTTP Status: ${http_status}"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
log_message "$SCRIPT_NAME" "[ALERT] SLD dead or hung (${status_detail}). Restarting service..."
|
log_message "$SCRIPT_NAME" "[ALERT] SLD dead or hung (${status_detail}). Restarting service..."
|
||||||
|
send_alert "$SCRIPT_NAME" "SLD Service Critical" "SLD service is down (${status_detail}). Restarting ${SLD_URL}"
|
||||||
|
|
||||||
# Send notification
|
# Restart the service
|
||||||
send_alert "$SCRIPT_NAME" "SLD Service Critical" "SLD service is down (${status_detail}). Restarting ${SLD_URL}"
|
if ! restart_sld_service; then
|
||||||
|
|
||||||
# Restart the service
|
|
||||||
if ! restart_sld_service; then
|
|
||||||
log_message "$SCRIPT_NAME" "[CRITICAL] Failed to restart SLD service"
|
log_message "$SCRIPT_NAME" "[CRITICAL] Failed to restart SLD service"
|
||||||
send_alert "$SCRIPT_NAME" "SLD Service Critical" "Failed to restart SLD service"
|
send_alert "$SCRIPT_NAME" "SLD Service Critical" "Failed to restart SLD service"
|
||||||
return 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Allow service to spin up, then log recovery status
|
# Allow service to spin up, then log recovery status
|
||||||
log_message "$SCRIPT_NAME" "Waiting 15 seconds for service to restart..."
|
log_message "$SCRIPT_NAME" "Waiting 15 seconds for service to restart..."
|
||||||
sleep 15
|
sleep 15
|
||||||
|
|
||||||
local recovery_status
|
recovery_status=$(check_sld_health)
|
||||||
recovery_status=$(check_sld_health)
|
|
||||||
|
|
||||||
if [[ $recovery_status == 200 || $recovery_status == 401 ]]; then
|
if [[ $recovery_status == 200 || $recovery_status == 401 ]]; then
|
||||||
log_message "$SCRIPT_NAME" "[RECOVERY] SLD service recovered successfully (HTTP Status: $recovery_status)"
|
log_message "$SCRIPT_NAME" "[RECOVERY] SLD service recovered successfully (HTTP Status: $recovery_status)"
|
||||||
else
|
else
|
||||||
local recovery_detail
|
|
||||||
if [ "$recovery_status" == "0" ]; then
|
if [ "$recovery_status" == "0" ]; then
|
||||||
recovery_detail="Connection failed after restart"
|
recovery_detail="Connection failed after restart"
|
||||||
else
|
else
|
||||||
@@ -108,15 +96,7 @@ main() {
|
|||||||
fi
|
fi
|
||||||
log_message "$SCRIPT_NAME" "[CRITICAL] SLD service failed to recover after restart (${recovery_detail})"
|
log_message "$SCRIPT_NAME" "[CRITICAL] SLD service failed to recover after restart (${recovery_detail})"
|
||||||
send_alert "$SCRIPT_NAME" "SLD Service Critical" "SLD service FAILED to recover after restart (${recovery_detail})"
|
send_alert "$SCRIPT_NAME" "SLD Service Critical" "SLD service FAILED to recover after restart (${recovery_detail})"
|
||||||
return 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
return 0
|
|
||||||
}
|
|
||||||
|
|
||||||
# Run main function
|
|
||||||
main
|
|
||||||
exit_code=$?
|
|
||||||
|
|
||||||
log_message "$SCRIPT_NAME" "SLD watchdog check complete."
|
log_message "$SCRIPT_NAME" "SLD watchdog check complete."
|
||||||
exit $exit_code
|
|
||||||
|
|||||||
Reference in New Issue
Block a user