refactor(monitoring): simplify notification system and remove auto-cleanup

- Replace state-based notifications with direct alert functions
- Remove auto-cleanup functionality from disk monitoring and configuration
- Simplify lock acquisition/release across all monitoring scripts
- Add execute_hana_sql helper functions for consistent SQL execution
- Remove state file tracking in favor of direct file operations
- Standardize error handling with exit codes on critical failures
- Clean up hana.conf by removing unused auto-delete directory settings
2026-03-12 21:52:49 +01:00
parent 5a92bc4e93
commit cf5b81889d
8 changed files with 183 additions and 391 deletions
--- a/hana.conf
+++ b/hana.conf
@@ -28,29 +28,6 @@ DIRECTORIES_TO_MONITOR=(
    "/usr/sap"
 )
 # --- Directories for Auto-Delete ---
 # These directories will be automatically cleaned when disk space is low
 # Format: "mount_point:directory_path"
 # The script will check if a monitored directory is on the same mount point
 # as an auto-delete directory and can clean it to free up space
 # NOTE: Do NOT include HANA system logs like /hana/log - only include
 # directories with safe-to-delete files like backup logs, temp files, etc.
 DIRS_FOR_AUTODELETE=(
    "/hana/shared:/hana/shared/backup/schema/DB_NDB"
    "/hana/shared:/hana/shared/backup/schema/SYSTEMDB"
    "/hana/data:/hana/data/temp"
    "/usr/sap:/usr/sap/trans/log"
    "/usr/sap:/usr/sap/hostctrl/work/log"
 )
 # --- Disk Auto-Delete Configuration ---
 # Minimum free space percentage to maintain after cleanup
 MIN_FREE_SPACE_AFTER_CLEANUP=5
 # Maximum age of files to delete (in days)
 MAX_FILE_AGE_DAYS=7
 # Enable automatic cleanup when disk usage exceeds threshold
 AUTO_CLEANUP_ENABLED=true
 # --- Thresholds ---
 DISK_USAGE_THRESHOLD=85
 TRUNCATED_PERCENTAGE_THRESHOLD=50
--- a/hana_backup.sh
+++ b/hana_backup.sh
@@ -12,31 +12,30 @@ source "${SCRIPT_DIR}/hana.conf"
 source "${SCRIPT_DIR}/hana_lib.sh"
 # Acquire lock
-LOCK_FILE=$(acquire_lock "$SCRIPT_NAME")
+if ! acquire_lock "$SCRIPT_NAME"; then
 if [ $? -ne 0 ]; then
    exit 1
 fi
-trap 'release_lock "$LOCK_FILE"' EXIT
+trap 'release_lock "$SCRIPT_NAME"' EXIT
 log_message "$SCRIPT_NAME" "Starting backup status check..."
 # Check if hdbsql is available
 if [ ! -x "$HDBSQL_PATH" ]; then
    log_message "$SCRIPT_NAME" "ERROR: hdbsql not found or not executable at ${HDBSQL_PATH}"
-    send_notification_if_changed "$SCRIPT_NAME" "hana_hdbsql_path_backup" "HANA Monitor Error" "hdbsql not found or not executable at ${HDBSQL_PATH}" "true" "HDBSQL_ERROR"
+    send_alert "$SCRIPT_NAME" "HANA Monitor Error" "hdbsql not found or not executable at ${HDBSQL_PATH}"
    exit 1
 fi
 # SQL Query for last successful backup
 BACKUP_SQL="SELECT TOP 1 SYS_START_TIME FROM M_BACKUP_CATALOG WHERE ENTRY_TYPE_NAME = 'complete data backup' AND STATE_NAME = 'successful' ORDER BY SYS_START_TIME DESC"
-# Execute SQL query as HANA user with improved error handling
+# Execute SQL query
-backup_result=$(su - "$HANA_USER" -c "$HDBSQL_PATH -U $HANA_USER_KEY -j -a -x \"$BACKUP_SQL\"" 2>&1)
+backup_result=$(execute_hana_sql "$BACKUP_SQL")
 sql_status=$?
 if [ $sql_status -ne 0 ]; then
    log_message "$SCRIPT_NAME" "ERROR: Failed to execute backup query. Exit code: ${sql_status}"
-    send_notification_if_changed "$SCRIPT_NAME" "hana_backup_query_error" "HANA Backup Error" "Failed to execute backup query. Exit code: ${sql_status}" "true" "QUERY_ERROR"
+    send_alert "$SCRIPT_NAME" "HANA Backup Error" "Failed to execute backup query. Exit code: ${sql_status}"
    exit 1
 fi
@@ -45,18 +44,15 @@ last_backup_date=$(echo "$backup_result" | tr -d '"' | sed 's/\..*//')
 if [[ -z "$last_backup_date" || "$last_backup_date" == *"error"* || "$last_backup_date" == *"Error"* ]]; then
    message="No successful complete data backup found for ${COMPANY_NAME} HANA."
    log_message "$SCRIPT_NAME" "CRITICAL: ${message}"
-    send_notification_if_changed "$SCRIPT_NAME" "hana_backup_status" "HANA Backup" "$message" "true" "NO_BACKUP"
+    send_alert "$SCRIPT_NAME" "HANA Backup" "$message"
    exit 1
 fi
 # Clear any previous query error state
 send_notification_if_changed "$SCRIPT_NAME" "hana_backup_query_error" "HANA Backup" "Backup query successful." "false" "OK"
 # Calculate backup age
 last_backup_epoch=$(date -d "$last_backup_date" +%s 2>/dev/null)
 if [ $? -ne 0 ]; then
    log_message "$SCRIPT_NAME" "ERROR: Failed to parse backup date: ${last_backup_date}"
-    send_notification_if_changed "$SCRIPT_NAME" "hana_backup_status" "HANA Backup" "Failed to parse backup date: ${last_backup_date}" "true" "DATE_PARSE_ERROR"
+    send_alert "$SCRIPT_NAME" "HANA Backup" "Failed to parse backup date: ${last_backup_date}"
    exit 1
 fi
@@ -73,11 +69,11 @@ fi
 if [ $age_seconds -gt $threshold_seconds ]; then
    message="Last successful HANA backup for ${COMPANY_NAME} is ${age_hours} hours old, which exceeds the threshold of ${BACKUP_THRESHOLD_HOURS} hours. Last backup was on: ${last_backup_date}."
    log_message "$SCRIPT_NAME" "CRITICAL: ${message}"
-    send_notification_if_changed "$SCRIPT_NAME" "hana_backup_status" "HANA Backup" "$message" "true" "${age_hours}h"
+    send_alert "$SCRIPT_NAME" "HANA Backup" "$message"
    exit 1
 else
    message="Last successful backup is ${age_hours} hours old (Threshold: ${BACKUP_THRESHOLD_HOURS} hours)."
    log_message "$SCRIPT_NAME" "SUCCESS: ${message}"
    send_notification_if_changed "$SCRIPT_NAME" "hana_backup_status" "HANA Backup" "$message" "false" "OK"
 fi
 log_message "$SCRIPT_NAME" "Backup check complete."
--- a/hana_disk.sh
+++ b/hana_disk.sh
@@ -12,18 +12,16 @@ source "${SCRIPT_DIR}/hana.conf"
 source "${SCRIPT_DIR}/hana_lib.sh"
 # Acquire lock
-LOCK_FILE=$(acquire_lock "$SCRIPT_NAME")
+if ! acquire_lock "$SCRIPT_NAME"; then
 if [ $? -ne 0 ]; then
    exit 1
 fi
-trap 'release_lock "$LOCK_FILE"' EXIT
+trap 'release_lock "$SCRIPT_NAME"' EXIT
 log_message "$SCRIPT_NAME" "Starting disk usage check..."
 # Track overall status
 ALERT_COUNT=0
 TOTAL_DIRS=0
 CLEANUP_PERFORMED=0
 for dir in "${DIRECTORIES_TO_MONITOR[@]}"; do
    TOTAL_DIRS=$((TOTAL_DIRS + 1))
@@ -31,7 +29,7 @@ for dir in "${DIRECTORIES_TO_MONITOR[@]}"; do
    # Check if directory exists
    if [ ! -d "$dir" ]; then
        log_message "$SCRIPT_NAME" "WARNING: Directory '$dir' not found. Skipping."
-        send_notification_if_changed "$SCRIPT_NAME" "disk_dir_not_found_${dir//\//_}" "HANA Disk Warning" "Directory '$dir' not found." "true" "DIR_NOT_FOUND"
+        send_alert "$SCRIPT_NAME" "HANA Disk Warning" "Directory '$dir' not found."
        ALERT_COUNT=$((ALERT_COUNT + 1))
        continue
    fi
@@ -49,33 +47,15 @@ for dir in "${DIRECTORIES_TO_MONITOR[@]}"; do
    # Check if usage exceeds threshold
    if [ "$usage" -gt "$DISK_USAGE_THRESHOLD" ]; then
        log_message "$SCRIPT_NAME" "ALERT: ${dir} usage is at ${usage}% which is above the ${DISK_USAGE_THRESHOLD}% threshold."
-        
+        send_alert "$SCRIPT_NAME" "HANA Disk" "Disk usage for ${dir} is at ${usage}% (threshold: ${DISK_USAGE_THRESHOLD}%)."
        # Attempt auto-cleanup if enabled
        if [ "$AUTO_CLEANUP_ENABLED" == "true" ]; then
            log_message "$SCRIPT_NAME" "Attempting auto-cleanup for '${dir}'..."
            mount_point=$(get_mount_point "$dir")
            if auto_cleanup "$mount_point" "$MIN_FREE_SPACE_AFTER_CLEANUP"; then
                CLEANUP_PERFORMED=$((CLEANUP_PERFORMED + 1))
                new_usage=$(get_disk_usage_percentage "$dir")
                log_message "$SCRIPT_NAME" "After cleanup, ${dir} usage is at ${new_usage}%"
                usage=$new_usage
            else
                log_message "$SCRIPT_NAME" "Auto-cleanup failed or no files to clean for '${dir}'"
            fi
        fi
        # Send notification with final usage after cleanup attempt
        send_notification_if_changed "$SCRIPT_NAME" "disk_usage_${dir//\//_}" "HANA Disk" "Disk usage for ${dir} is at ${usage}%." "true" "${usage}%"
        ALERT_COUNT=$((ALERT_COUNT + 1))
    else
-        # Send OK notification only if state changed from alert
+        log_message "$SCRIPT_NAME" "OK: ${dir} usage is at ${usage}% (below threshold)."
        send_notification_if_changed "$SCRIPT_NAME" "disk_usage_${dir//\//_}" "HANA Disk" "Disk usage for ${dir} is at ${usage}% (below threshold)." "false" "OK"
    fi
 done
 # Summary logging
-log_message "$SCRIPT_NAME" "Disk check complete. Total: ${TOTAL_DIRS} dirs, ${ALERT_COUNT} alerts, ${CLEANUP_PERFORMED} cleanups performed."
+log_message "$SCRIPT_NAME" "Disk check complete. Total: ${TOTAL_DIRS} dirs, ${ALERT_COUNT} alerts."
 # Exit with status based on alerts
 if [ "$ALERT_COUNT" -gt 0 ]; then
--- a/hana_lib.sh
+++ b/hana_lib.sh
@@ -25,81 +25,53 @@ acquire_lock() {
    fi
    touch "$lock_file"
    echo "$lock_file"
    return 0
 }
 # Release lock
-# Usage: release_lock "LOCK_FILE"
+# Usage: release_lock "SCRIPT_NAME"
 release_lock() {
-    local lock_file="$1"
+    local script_name="$1"
-    if [ -n "$lock_file" ] && [ -f "$lock_file" ]; then
+    local lock_file="${LOCK_DIR}/hana_${script_name}.lock"
    if [ -f "$lock_file" ]; then
        rm -f "$lock_file"
    fi
 }
-# Get state value
+# Send notification via ntfy
-# Usage: get_state "KEY"
+# Usage: send_notification "TITLE" "MESSAGE"
-get_state() {
+send_notification() {
-    local key="$1"
+    local title="$1"
-    if [ -f "${STATE_DIR}/${key}.state" ]; then
+    local message="$2"
        cat "${STATE_DIR}/${key}.state"
    else
        echo ""
    fi
 }
 # Set state value
 # Usage: set_state "KEY" "VALUE"
 set_state() {
    local key="$1"
    local value="$2"
    echo "$value" > "${STATE_DIR}/${key}.state"
 }
 # Send notification if state changed
 # Usage: send_notification_if_changed "SCRIPT_NAME" "ALERT_KEY" "TITLE_PREFIX" "MESSAGE" "IS_ALERT" "CURRENT_VALUE"
 send_notification_if_changed() {
    local script_name="$1"
    local alert_key="$2"
    local title_prefix="$3"
    local current_message="$4"
    local is_alert_condition="$5"
    local current_value="$6"
    local hostname=$(hostname)
-    
+    local full_message="[${COMPANY_NAME} | ${hostname}] ${message}"
    local previous_value=$(get_state "$alert_key")
    if [ "$current_value" != "$previous_value" ]; then
        local full_title=""
        local full_message=""
        if [ "$is_alert_condition" == "true" ]; then
            full_title="${title_prefix} Alert"
            full_message="🚨 Critical: ${current_message}"
            log_message "$script_name" "ALERT: ${full_message}"
        else
            if [ -n "$previous_value" ] && [ "$previous_value" != "OK" ]; then
                full_title="${title_prefix} Resolved"
                full_message="✅ Resolved: ${current_message}"
                log_message "$script_name" "RESOLVED: ${full_message}"
            else
                set_state "$alert_key" "$current_value"
                return
            fi
        fi
        local final_message="[${COMPANY_NAME} | ${hostname}] ${full_message}"
    if [ -n "$NTFY_TOKEN" ] && [ -n "$NTFY_TOPIC_URL" ]; then
-            curl -H "Authorization: Bearer ${NTFY_TOKEN}" -H "Title: ${full_title}" -d "${final_message}" "${NTFY_TOPIC_URL}" > /dev/null 2>&1
+        curl -H "Authorization: Bearer ${NTFY_TOKEN}" -H "Title: ${title}" -d "${full_message}" "${NTFY_TOPIC_URL}" > /dev/null 2>&1
-            log_message "$script_name" "Notification sent: ${full_title}"
+        log_message "NOTIFY" "Notification sent: ${title}"
    else
-            log_message "$script_name" "Ntfy not configured, skipping notification"
+        log_message "NOTIFY" "Ntfy not configured, skipping notification"
    fi
 }
-        set_state "$alert_key" "$current_value"
+# Send alert notification
-    fi
+# Usage: send_alert "SCRIPT_NAME" "TITLE_PREFIX" "MESSAGE"
 send_alert() {
    # Arguments: $1 = script name (log tag), $2 = title prefix, $3 = alert text.
    local script_name="$1" title_prefix="$2" message="$3"
    # Push the notification first, then record the same alert in the log.
    send_notification "${title_prefix} Alert" "🚨 Critical: ${message}"
    log_message "${script_name}" "ALERT: ${message}"
 }
 # Send a "Resolved" notification and log it
 # Usage: send_ok "SCRIPT_NAME" "TITLE_PREFIX" "MESSAGE"
 #   SCRIPT_NAME   - tag passed to log_message
 #   TITLE_PREFIX  - notification title; " Resolved" is appended
 #   MESSAGE       - text sent (prefixed with "✅ Resolved: ") and logged
 send_ok() {
    local script_name="$1" title_prefix="$2" message="$3"
    # Notify first, then write the matching RESOLVED entry to the log.
    send_notification "${title_prefix} Resolved" "✅ Resolved: ${message}"
    log_message "${script_name}" "RESOLVED: ${message}"
 }
 # Run command as HANA user using su
@@ -109,6 +81,45 @@ run_as_hana_user() {
    su - "$HANA_USER" -c "$command"
 }
 # Execute SQL query as HANA user
 # Usage: execute_hana_sql "SQL_QUERY"
 # Globals: HANA_USER, HDBSQL_PATH, HANA_USER_KEY (read)
 # Outputs: the command's combined stdout/stderr - on stdout when it succeeds,
 #          on stderr when it fails
 # Returns: 0 on success, 1 on failure (the real exit code is only logged)
 execute_hana_sql() {
    local sql_query="$1"
    local output
    # NOTE(review): the query is embedded between escaped double quotes in the
    # string handed to "su -c", so a query containing ", $ or backticks would be
    # re-interpreted by the target user's shell. Only pass trusted, literal SQL.
    output=$(su - "$HANA_USER" -c "$HDBSQL_PATH -U $HANA_USER_KEY -j -a -x \"$sql_query\"" 2>&1)
    local sql_status=$?
    if [ $sql_status -ne 0 ]; then
        log_message "SQL" "ERROR: Failed to execute SQL query. Exit code: ${sql_status}"
        # printf instead of echo: output such as "-n" or "-e" must be passed
        # through verbatim rather than being parsed as an echo option.
        printf '%s\n' "$output" >&2
        return 1
    fi
    printf '%s\n' "$output"
    return 0
 }
 # Execute SQL query and return result (for single-value queries)
 # Usage: execute_hana_sql_query "SQL_QUERY"
 # Outputs: the query output with double quotes removed and all runs of
 #          whitespace/newlines collapsed to single spaces, on stdout
 # Returns: 0 on success, 1 when execute_hana_sql fails
 execute_hana_sql_query() {
    local sql_query="$1"
    local output
    output=$(execute_hana_sql "$sql_query") || return 1
    # Clean output: remove quotes and whitespace.
    # Done in-shell rather than through xargs, which aborts on an unmatched
    # apostrophe and strips backslashes from the value.
    local cleaned="${output//\"/}"
    local -a words=()
    local IFS=$' \t\n'
    # read returns non-zero at end of input because no NUL delimiter is found;
    # the array is still filled, so the status is deliberately ignored.
    read -r -d '' -a words <<< "$cleaned" || true
    printf '%s\n' "${words[*]}"
    return 0
 }
 # Get disk usage percentage for a directory
 # Usage: get_disk_usage_percentage "/path/to/dir"
 # Returns: Usage percentage as integer (without % sign)
@@ -136,137 +147,3 @@ get_available_space_kb() {
    local dir="$1"
    df -k "$dir" 2>/dev/null | awk 'NR==2 {print $4}'
 }
 # List the configured auto-delete directories that belong to a mount point
 # Usage: find_autodelete_dirs_on_mount "mount_point"
 # Globals: DIRS_FOR_AUTODELETE (read) - entries of the form "mount:directory"
 # Returns: Space-separated list of existing directories on stdout (may be empty)
 find_autodelete_dirs_on_mount() {
    local mount_point="$1"
    local result=""
    local entry_mount cleanup_dir
    for entry in "${DIRS_FOR_AUTODELETE[@]}"; do
        # Split on the first colon: left part is the mount, the rest is the path.
        entry_mount="${entry%%:*}"
        cleanup_dir="${entry#*:}"
        # Skip entries for other mounts and directories that do not exist.
        if [ "$entry_mount" != "$mount_point" ] || [ ! -d "$cleanup_dir" ]; then
            continue
        fi
        # Prepend a separating space only once something has been collected.
        result="${result:+${result} }${cleanup_dir}"
    done
    echo "$result"
 }
 # Clean old files in a directory
 # Usage: clean_directory_files "/path/to/dir" "max_age_days"
 #   /path/to/dir  - directory searched recursively for regular files
 #   max_age_days  - age limit passed to "find -mtime +N"; defaults to 7
 # Returns: prints "FILES_DELETED:BYTES_FREED" on stdout ("0:0" when the
 #          directory does not exist); progress is reported via log_message
 # NOTE(review): callers capture stdout and split on ':' - this assumes
 # log_message does not write to stdout; confirm.
 clean_directory_files() {
    local cleanup_dir="$1"
    local max_age_days="${2:-7}"
    local files_deleted=0
    local space_freed=0
    # Nothing to do for a missing directory; still emit a parseable result.
    if [ ! -d "$cleanup_dir" ]; then
        log_message "CLEANUP" "Directory '$cleanup_dir' not found. Skipping."
        echo "0:0"
        return
    fi
    # Find and delete old files
    # Paths are read NUL-delimited so names with spaces or newlines are safe.
    while IFS= read -r -d '' file; do
        if [ -f "$file" ]; then
            # Size is taken before deletion; falls back to 0 if stat fails.
            local file_size=$(stat -c%s "$file" 2>/dev/null || echo "0")
            # Count the file and its size only when the removal succeeded.
            rm -f "$file" 2>/dev/null && {
                files_deleted=$((files_deleted + 1))
                space_freed=$((space_freed + file_size))
            }
        fi
    done < <(find "$cleanup_dir" -type f -mtime +$max_age_days -print0 2>/dev/null)
    # Also clean empty directories
    # NOTE(review): the search starts at cleanup_dir itself, so an empty
    # cleanup_dir is presumably removed as well - confirm this is intended.
    find "$cleanup_dir" -type d -empty -delete 2>/dev/null
    log_message "CLEANUP" "Deleted $files_deleted files from '$cleanup_dir', freed $((space_freed / 1024)) KB"
    echo "${files_deleted}:${space_freed}"
 }
 # Automatic disk cleanup function
 # Usage: auto_cleanup "mount_point" "target_free_percentage"
 # Globals: AUTO_CLEANUP_ENABLED, MAX_FILE_AGE_DAYS (read)
 # Returns: 0 if at least one byte was freed; 1 if cleanup is disabled, no
 #          auto-delete directory matches the mount point, or nothing was freed
 auto_cleanup() {
    local mount_point="$1"
    # NOTE(review): target_free_percentage is stored but never read below.
    local target_free_percentage="${2:-5}"
    # Global kill switch: do nothing unless explicitly enabled.
    if [ "$AUTO_CLEANUP_ENABLED" != "true" ]; then
        log_message "CLEANUP" "Auto-cleanup is disabled. Skipping."
        return 1
    fi
    # Space-separated list of directories registered for this mount point.
    local cleanup_dirs=$(find_autodelete_dirs_on_mount "$mount_point")
    if [ -z "$cleanup_dirs" ]; then
        log_message "CLEANUP" "No auto-delete directories configured for mount point '$mount_point'. Skipping cleanup."
        return 1
    fi
    log_message "CLEANUP" "Starting auto-cleanup for mount point '$mount_point'. Directories: $cleanup_dirs"
    local total_freed=0
    local total_files=0
    # The list is split on whitespace on purpose; directory paths containing
    # spaces would therefore be broken into several words.
    for cleanup_dir in $cleanup_dirs; do
        # clean_directory_files prints "files:bytes"; split it on the colon.
        local result=$(clean_directory_files "$cleanup_dir" "$MAX_FILE_AGE_DAYS")
        local files="${result%%:*}"
        local freed="${result#*:}"
        total_files=$((total_files + files))
        total_freed=$((total_freed + freed))
    done
    log_message "CLEANUP" "Cleanup complete. Total files deleted: $total_files, Total space freed: $((total_freed / 1024)) KB"
    # Success is defined purely as "some space was freed".
    if [ $total_freed -gt 0 ]; then
        return 0
    else
        return 1
    fi
 }
 # Check disk space and perform auto-cleanup if needed
 # Usage: check_and_cleanup_disk "directory" "threshold"
 #   directory  - path whose usage is checked
 #   threshold  - usage percentage above which cleanup is attempted; default 85
 # Globals: MIN_FREE_SPACE_AFTER_CLEANUP (read)
 # Returns: 0 if OK or cleanup successful, 1 if critical and cleanup failed
 #          (also 1 when the usage is empty or 0)
 check_and_cleanup_disk() {
    local dir="$1"
    local threshold="${2:-85}"
    local usage=$(get_disk_usage_percentage "$dir")
    local mount_point=$(get_mount_point "$dir")
    # An empty or zero usage value is treated as "could not be determined".
    # NOTE(review): this also rejects a genuinely 0%-used filesystem - confirm.
    if [ -z "$usage" ] || [ "$usage" -eq 0 ]; then
        log_message "CLEANUP" "Could not determine disk usage for '$dir'. Skipping."
        return 1
    fi
    if [ "$usage" -gt "$threshold" ]; then
        log_message "CLEANUP" "Disk usage ${usage}% exceeds threshold ${threshold}% for '$dir'. Attempting cleanup..."
        if auto_cleanup "$mount_point" "$MIN_FREE_SPACE_AFTER_CLEANUP"; then
            # Re-measure after cleanup to report the new usage.
            local new_usage=$(get_disk_usage_percentage "$dir")
            log_message "CLEANUP" "After cleanup, disk usage is ${new_usage}%"
            # Both branches return 0: a cleanup that freed space counts as
            # success even if usage stays above the threshold (it is only logged).
            if [ "$new_usage" -le "$threshold" ]; then
                return 0
            else
                log_message "CLEANUP" "Cleanup completed but usage ${new_usage}% still above threshold"
                return 0
            fi
        else
            log_message "CLEANUP" "Cleanup failed or no files to clean for '$dir'"
            return 1
        fi
    fi
    # Usage is at or below the threshold: nothing to do.
    return 0
 }
--- a/hana_log_segments.sh
+++ b/hana_log_segments.sh
@@ -12,32 +12,30 @@ source "${SCRIPT_DIR}/hana.conf"
 source "${SCRIPT_DIR}/hana_lib.sh"
 # Acquire lock
-LOCK_FILE=$(acquire_lock "$SCRIPT_NAME")
+if ! acquire_lock "$SCRIPT_NAME"; then
 if [ $? -ne 0 ]; then
    exit 1
 fi
-trap 'release_lock "$LOCK_FILE"' EXIT
+trap 'release_lock "$SCRIPT_NAME"' EXIT
 log_message "$SCRIPT_NAME" "Starting log segment check..."
 # SQL Query for log segments
 SQL_QUERY="SELECT b.host, b.service_name, a.state, count(*) FROM PUBLIC.M_LOG_SEGMENTS a JOIN PUBLIC.M_SERVICES b ON (a.host = b.host AND a.port = b.port) GROUP BY b.host, b.service_name, a.state;"
 # Check if hdbsql is available
 if [ ! -x "$HDBSQL_PATH" ]; then
    log_message "$SCRIPT_NAME" "ERROR: hdbsql not found or not executable at ${HDBSQL_PATH}"
-    send_notification_if_changed "$SCRIPT_NAME" "hana_hdbsql_path" "HANA Monitor Error" "hdbsql not found or not executable at ${HDBSQL_PATH}" "true" "HDBSQL_ERROR"
+    send_alert "$SCRIPT_NAME" "HANA Monitor Error" "hdbsql not found or not executable at ${HDBSQL_PATH}"
    exit 1
 fi
-# Execute SQL query as HANA user with improved error handling
+# SQL Query for log segments
-readarray -t sql_output < <(su - "$HANA_USER" -c "$HDBSQL_PATH -U $HANA_USER_KEY -c \";\" \"$SQL_QUERY\"" 2>&1)
+SQL_QUERY="SELECT b.host, b.service_name, a.state, count(*) FROM PUBLIC.M_LOG_SEGMENTS a JOIN PUBLIC.M_SERVICES b ON (a.host = b.host AND a.port = b.port) GROUP BY b.host, b.service_name, a.state;"
 # Execute SQL query
 sql_output=$(execute_hana_sql "$SQL_QUERY")
 sql_status=$?
 if [ $sql_status -ne 0 ]; then
-    error_message=$(printf '%s\n' "${sql_output[@]}")
+    log_message "$SCRIPT_NAME" "ERROR: The hdbsql command failed."
-    log_message "$SCRIPT_NAME" "ERROR: The hdbsql command failed. Details: ${error_message}"
+    send_alert "$SCRIPT_NAME" "HANA Monitor Error" "The hdbsql command failed."
    send_notification_if_changed "$SCRIPT_NAME" "hana_hdbsql_command" "HANA Monitor Error" "The hdbsql command failed. Details: ${error_message}" "true" "HDBSQL_COMMAND_FAILED"
    exit 1
 fi
@@ -46,15 +44,15 @@ total_segments=0
 truncated_segments=0
 free_segments=0
-for line in "${sql_output[@]}"; do
+while IFS= read -r line; do
    # Skip empty lines and header
    if [[ -z "$line" || "$line" == *"STATE"* || "$line" == *"host"* ]]; then
        continue
    fi
    cleaned_line=$(echo "$line" | tr -d '"')
-    state=$(echo "$cleaned_line" | awk -F',' '{print $3}' | xargs)  # Trim whitespace
+    state=$(echo "$cleaned_line" | awk -F',' '{print $3}' | xargs)
-    count=$(echo "$cleaned_line" | awk -F',' '{print $4}' | xargs)  # Trim whitespace
+    count=$(echo "$cleaned_line" | awk -F',' '{print $4}' | xargs)
    # Validate count is a number
    if ! [[ "$count" =~ ^[0-9]+$ ]]; then
@@ -67,7 +65,7 @@ for line in "${sql_output[@]}"; do
    elif [[ "$state" == "Free" ]]; then
        free_segments=$((free_segments + count))
    fi
-done
+done <<< "$sql_output"
 log_message "$SCRIPT_NAME" "Total Segments: ${total_segments}"
 log_message "$SCRIPT_NAME" "Truncated Segments: ${truncated_segments}"
@@ -75,37 +73,24 @@ log_message "$SCRIPT_NAME" "Free Segments: ${free_segments}"
 if [ $total_segments -eq 0 ]; then
    log_message "$SCRIPT_NAME" "WARNING: No log segments found. Skipping percentage checks."
-    send_notification_if_changed "$SCRIPT_NAME" "hana_log_segments_total" "HANA Log Segment Warning" "No log segments found. Skipping percentage checks." "true" "NO_LOG_SEGMENTS"
+    send_alert "$SCRIPT_NAME" "HANA Log Segment Warning" "No log segments found."
-else
+    exit 1
-    send_notification_if_changed "$SCRIPT_NAME" "hana_log_segments_total" "HANA Log Segment" "Log segments found." "false" "OK"
+fi
 # Calculate truncated percentage with integer arithmetic
    if [ $total_segments -gt 0 ]; then
 truncated_percentage=$((truncated_segments * 100 / total_segments))
    else
        truncated_percentage=0
    fi
 if [ $truncated_percentage -gt $TRUNCATED_PERCENTAGE_THRESHOLD ]; then
    log_message "$SCRIPT_NAME" "ALERT: ${truncated_percentage}% of log segments are 'Truncated'."
-        send_notification_if_changed "$SCRIPT_NAME" "hana_log_truncated" "HANA Log Segment" "${truncated_percentage}% of HANA log segments are in 'Truncated' state." "true" "${truncated_percentage}%"
+    send_alert "$SCRIPT_NAME" "HANA Log Segment" "${truncated_percentage}% of HANA log segments are in 'Truncated' state."
    else
        send_notification_if_changed "$SCRIPT_NAME" "hana_log_truncated" "HANA Log Segment" "${truncated_percentage}% of HANA log segments are in 'Truncated' state (below threshold)." "false" "OK"
 fi
 # Calculate free percentage with integer arithmetic
    if [ $total_segments -gt 0 ]; then
 free_percentage=$((free_segments * 100 / total_segments))
    else
        free_percentage=0
    fi
 if [ $free_percentage -lt $FREE_PERCENTAGE_THRESHOLD ]; then
    log_message "$SCRIPT_NAME" "ALERT: Only ${free_percentage}% of log segments are 'Free'."
-        send_notification_if_changed "$SCRIPT_NAME" "hana_log_free" "HANA Log Segment" "Only ${free_percentage}% of HANA log segments are in 'Free' state." "true" "${free_percentage}%"
+    send_alert "$SCRIPT_NAME" "HANA Log Segment" "Only ${free_percentage}% of HANA log segments are in 'Free' state."
    else
        send_notification_if_changed "$SCRIPT_NAME" "hana_log_free" "HANA Log Segment" "Only ${free_percentage}% of HANA log segments are in 'Free' state (above threshold)." "false" "OK"
    fi
 fi
 log_message "$SCRIPT_NAME" "Log segment check complete."
--- a/hana_processes.sh
+++ b/hana_processes.sh
@@ -12,53 +12,47 @@ source "${SCRIPT_DIR}/hana.conf"
 source "${SCRIPT_DIR}/hana_lib.sh"
 # Acquire lock
-LOCK_FILE=$(acquire_lock "$SCRIPT_NAME")
+if ! acquire_lock "$SCRIPT_NAME"; then
 if [ $? -ne 0 ]; then
    exit 1
 fi
-trap 'release_lock "$LOCK_FILE"' EXIT
+trap 'release_lock "$SCRIPT_NAME"' EXIT
 log_message "$SCRIPT_NAME" "Starting HANA process status check..."
 # Check if sapcontrol is available
 if [ ! -x "$SAPCONTROL_PATH" ]; then
    log_message "$SCRIPT_NAME" "ERROR: sapcontrol not found or not executable at ${SAPCONTROL_PATH}"
-    send_notification_if_changed "$SCRIPT_NAME" "hana_sapcontrol_path" "HANA Monitor Error" "sapcontrol not found or not executable at ${SAPCONTROL_PATH}" "true" "SAPCONTROL_ERROR"
+    send_alert "$SCRIPT_NAME" "HANA Monitor Error" "sapcontrol not found or not executable at ${SAPCONTROL_PATH}"
    exit 1
 fi
-# Get process list with improved error handling
+# Get process list
 process_list=$(su - "$HANA_USER" -c "${SAPCONTROL_PATH} -nr ${HANA_INSTANCE_NR} -function GetProcessList" 2>&1)
 sapcontrol_status=$?
 if [ $sapcontrol_status -ne 0 ]; then
    log_message "$SCRIPT_NAME" "ERROR: sapcontrol command failed with exit code ${sapcontrol_status}"
-    send_notification_if_changed "$SCRIPT_NAME" "hana_sapcontrol_command" "HANA Monitor Error" "sapcontrol command failed. Exit code: ${sapcontrol_status}" "true" "SAPCONTROL_COMMAND_FAILED"
+    send_alert "$SCRIPT_NAME" "HANA Monitor Error" "sapcontrol command failed. Exit code: ${sapcontrol_status}"
    exit 1
 fi
 # Clear any previous sapcontrol error state
 send_notification_if_changed "$SCRIPT_NAME" "hana_sapcontrol_command" "HANA Process" "sapcontrol command successful." "false" "OK"
 # Check for non-GREEN processes (skip header lines)
 non_green_processes=$(echo "$process_list" | tail -n +6 | grep -v 'GREEN' | grep -v '^$')
 if [ -n "$non_green_processes" ]; then
    log_message "$SCRIPT_NAME" "ALERT: One or more HANA processes are not running!"
    log_message "$SCRIPT_NAME" "Problem processes: ${non_green_processes}"
-    send_notification_if_changed "$SCRIPT_NAME" "hana_processes" "HANA Process" "One or more HANA processes are not GREEN. Problem processes: ${non_green_processes}" "true" "PROCESS_ALERT:${non_green_processes}"
+    send_alert "$SCRIPT_NAME" "HANA Process" "One or more HANA processes are not GREEN. Problem processes: ${non_green_processes}"
    exit 1
-else
+fi
 # Verify we actually got process data
 green_processes=$(echo "$process_list" | tail -n +6 | grep 'GREEN')
 if [ -z "$green_processes" ]; then
    log_message "$SCRIPT_NAME" "WARNING: No process data found. SAP HANA may not be running."
-        send_notification_if_changed "$SCRIPT_NAME" "hana_processes" "HANA Process" "No process data found. SAP HANA may not be running." "true" "NO_PROCESS_DATA"
+    send_alert "$SCRIPT_NAME" "HANA Process" "No process data found. SAP HANA may not be running."
    exit 1
 fi
    send_notification_if_changed "$SCRIPT_NAME" "hana_processes" "HANA Process" "All HANA processes are GREEN." "false" "OK"
 log_message "$SCRIPT_NAME" "SUCCESS: All HANA processes are GREEN."
 fi
 log_message "$SCRIPT_NAME" "Process check complete."
--- a/hana_queue.sh
+++ b/hana_queue.sh
@@ -12,52 +12,48 @@ source "${SCRIPT_DIR}/hana.conf"
 source "${SCRIPT_DIR}/hana_lib.sh"
 # Acquire lock
-LOCK_FILE=$(acquire_lock "$SCRIPT_NAME")
+if ! acquire_lock "$SCRIPT_NAME"; then
 if [ $? -ne 0 ]; then
    exit 1
 fi
-trap 'release_lock "$LOCK_FILE"' EXIT
+trap 'release_lock "$SCRIPT_NAME"' EXIT
 log_message "$SCRIPT_NAME" "Starting statement queue check..."
 # SQL Query for statement queue
 STATEMENT_QUEUE_SQL="SELECT COUNT(*) FROM M_SERVICE_THREADS WHERE THREAD_TYPE = 'SqlExecutor' AND THREAD_STATE = 'Queueing';"
 # Check if hdbsql is available
 if [ ! -x "$HDBSQL_PATH" ]; then
    log_message "$SCRIPT_NAME" "ERROR: hdbsql not found or not executable at ${HDBSQL_PATH}"
-    send_notification_if_changed "$SCRIPT_NAME" "hana_hdbsql_path_queue" "HANA Monitor Error" "hdbsql not found or not executable at ${HDBSQL_PATH}" "true" "HDBSQL_ERROR"
+    send_alert "$SCRIPT_NAME" "HANA Monitor Error" "hdbsql not found or not executable at ${HDBSQL_PATH}"
    exit 1
 fi
-# Execute SQL query as HANA user with improved error handling
+# SQL Query for statement queue
-queue_result=$(su - "$HANA_USER" -c "$HDBSQL_PATH -U $HANA_USER_KEY -j -a -x \"$STATEMENT_QUEUE_SQL\"" 2>&1)
+STATEMENT_QUEUE_SQL="SELECT COUNT(*) FROM M_SERVICE_THREADS WHERE THREAD_TYPE = 'SqlExecutor' AND THREAD_STATE = 'Queueing';"
 # Execute SQL query
 queue_result=$(execute_hana_sql_query "$STATEMENT_QUEUE_SQL")
 sql_status=$?
 if [ $sql_status -ne 0 ]; then
-    log_message "$SCRIPT_NAME" "ERROR: Failed to execute queue query. Exit code: ${sql_status}"
+    log_message "$SCRIPT_NAME" "ERROR: Failed to execute queue query."
-    send_notification_if_changed "$SCRIPT_NAME" "hana_queue_query_error" "HANA Queue Error" "Failed to execute queue query. Exit code: ${sql_status}" "true" "QUERY_ERROR"
+    send_alert "$SCRIPT_NAME" "HANA Queue Error" "Failed to execute queue query."
    exit 1
 fi
 # Clear any previous query error state
 send_notification_if_changed "$SCRIPT_NAME" "hana_queue_query_error" "HANA Statement Queue" "Queue query successful." "false" "OK"
 # Parse queue count
 queue_count=$(echo "$queue_result" | tr -d '"' | xargs)
 # Validate queue count is a number
 if ! [[ "$queue_count" =~ ^[0-9]+$ ]]; then
-    log_message "$SCRIPT_NAME" "WARNING: Could not retrieve HANA statement queue count. Got: '${queue_count}'. Skipping check."
+    log_message "$SCRIPT_NAME" "WARNING: Could not retrieve HANA statement queue count. Got: '${queue_count}'."
-    send_notification_if_changed "$SCRIPT_NAME" "hana_statement_queue_check_fail" "HANA Monitor Warning" "Could not retrieve statement queue count. Got: '${queue_count}'" "true" "QUEUE_CHECK_FAIL"
+    send_alert "$SCRIPT_NAME" "HANA Monitor Warning" "Could not retrieve statement queue count."
-else
+    exit 1
-    # Clear any previous check failure state
+fi
-    send_notification_if_changed "$SCRIPT_NAME" "hana_statement_queue_check_fail" "HANA Monitor Warning" "Statement queue check is working." "false" "OK"
+
 log_message "$SCRIPT_NAME" "Current statement queue length: ${queue_count}"
-    # Get breach count from state
+# Get breach count from state file
-    breach_count=$(get_state "statement_queue_breach_count")
+breach_count_file="${STATE_DIR}/statement_queue_breach_count"
-    breach_count=${breach_count:-0}
+breach_count=0
 if [ -f "$breach_count_file" ]; then
    breach_count=$(cat "$breach_count_file")
 fi
 if [ "$queue_count" -gt "$STATEMENT_QUEUE_THRESHOLD" ]; then
    breach_count=$((breach_count + 1))
@@ -68,15 +64,14 @@ else
    fi
    breach_count=0
 fi
-    set_state "statement_queue_breach_count" "$breach_count"
+echo "$breach_count" > "$breach_count_file"
 if [ "$breach_count" -ge "$STATEMENT_QUEUE_CONSECUTIVE_RUNS" ]; then
    message="Statement queue has been over ${STATEMENT_QUEUE_THRESHOLD} for ${breach_count} checks. Current count: ${queue_count}."
-        send_notification_if_changed "$SCRIPT_NAME" "hana_statement_queue_status" "HANA Statement Queue" "$message" "true" "ALERT:${queue_count}"
+    send_alert "$SCRIPT_NAME" "HANA Statement Queue" "$message"
    exit 1
 else
-        message="Statement queue is normal. Current count: ${queue_count}."
+    log_message "$SCRIPT_NAME" "Statement queue is normal. Current count: ${queue_count}."
        send_notification_if_changed "$SCRIPT_NAME" "hana_statement_queue_status" "HANA Statement Queue" "$message" "false" "OK"
    fi
 fi
 log_message "$SCRIPT_NAME" "Statement queue check complete."
--- a/sld_watchdog.sh
+++ b/sld_watchdog.sh
@@ -1,10 +1,8 @@
 #!/bin/bash
-#
+# =============================================================================
 # sld_watchdog.sh - Monitors SLD service health and restarts if needed
-# Optimized for better error handling and reliability
+# =============================================================================
 #
 # Get script directory and name
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 SCRIPT_NAME="sld_watchdog"
@@ -15,14 +13,12 @@ source "$SCRIPT_DIR/hana_lib.sh"
 # SLD-specific configuration
 SLD_URL="${SLD_URL:-https://localhost:40000/sld/sld0100.svc}"
 SLD_TIMEOUT="${SLD_TIMEOUT:-5}"
 SLD_LOCK_FILE="/tmp/hana_sld_watchdog.lock"
-# Acquire lock using library function
+# Acquire lock
-LOCK_FILE=$(acquire_lock "$SCRIPT_NAME")
+if ! acquire_lock "$SCRIPT_NAME"; then
 if [ $? -ne 0 ]; then
    exit 0
 fi
-trap 'release_lock "$LOCK_FILE"' EXIT
+trap 'release_lock "$SCRIPT_NAME"' EXIT
 # Function to check SLD health
 # Returns HTTP status code or "0" for connection errors
@@ -71,7 +67,6 @@ main() {
    # 200 OK or 401 Unauthorized indicate the Tomcat layer is actively processing
    if [[ $http_status == 200 || $http_status == 401 ]]; then
        log_message "$SCRIPT_NAME" "[OK] SLD service healthy (HTTP Status: $http_status)"
        send_notification_if_changed "$SCRIPT_NAME" "sld_status" "SLD Service" "SLD service is healthy (HTTP: $http_status)" "false" "OK"
        return 0
    fi
@@ -86,14 +81,12 @@ main() {
    log_message "$SCRIPT_NAME" "[ALERT] SLD dead or hung (${status_detail}). Restarting service..."
    # Send notification
-    send_notification_if_changed "$SCRIPT_NAME" "sld_down" "SLD Service Critical" \
+    send_alert "$SCRIPT_NAME" "SLD Service Critical" "SLD service is down (${status_detail}). Restarting ${SLD_URL}"
        "SLD service is down (${status_detail}). Restarting ${SLD_URL}" "true" "SLD_DOWN"
    # Restart the service
    if ! restart_sld_service; then
        log_message "$SCRIPT_NAME" "[CRITICAL] Failed to restart SLD service"
-        send_notification_if_changed "$SCRIPT_NAME" "sld_restart_failed" "SLD Service Critical" \
+        send_alert "$SCRIPT_NAME" "SLD Service Critical" "Failed to restart SLD service"
            "Failed to restart SLD service" "true" "RESTART_FAILED"
        return 1
    fi
@@ -106,10 +99,6 @@ main() {
    if [[ $recovery_status == 200 || $recovery_status == 401 ]]; then
        log_message "$SCRIPT_NAME" "[RECOVERY] SLD service recovered successfully (HTTP Status: $recovery_status)"
        send_notification_if_changed "$SCRIPT_NAME" "sld_down" "SLD Service" \
            "SLD service recovered (HTTP: $recovery_status)" "false" "OK"
        send_notification_if_changed "$SCRIPT_NAME" "sld_restart_failed" "SLD Service" \
            "Service recovered successfully" "false" "OK"
    else
        local recovery_detail
        if [ "$recovery_status" == "0" ]; then
@@ -118,8 +107,7 @@ main() {
            recovery_detail="HTTP Status: $recovery_status"
        fi
        log_message "$SCRIPT_NAME" "[CRITICAL] SLD service failed to recover after restart (${recovery_detail})"
-        send_notification_if_changed "$SCRIPT_NAME" "sld_down" "SLD Service Critical" \
+        send_alert "$SCRIPT_NAME" "SLD Service Critical" "SLD service FAILED to recover after restart (${recovery_detail})"
            "SLD service FAILED to recover after restart (${recovery_detail})" "true" "RECOVERY_FAILED"
        return 1
    fi