refactor(monitoring): simplify notification system and remove auto-cleanup

- Replace state-based notifications with direct alert functions
- Remove auto-cleanup functionality from disk monitoring and configuration
- Simplify lock acquisition/release across all monitoring scripts
- Add execute_hana_sql helper functions for consistent SQL execution
- Remove state file tracking in favor of direct file operations
- Standardize error handling with exit codes on critical failures
- Clean up hana.conf by removing unused auto-delete directory settings
This commit is contained in:
2026-03-12 21:52:49 +01:00
parent 5a92bc4e93
commit cf5b81889d
8 changed files with 183 additions and 391 deletions

View File

@@ -25,83 +25,55 @@ acquire_lock() {
fi
touch "$lock_file"
echo "$lock_file"
return 0
}
# Release lock
# Usage: release_lock "LOCK_FILE"
# Usage: release_lock "SCRIPT_NAME"
# NOTE(review): the body below appears to interleave two revisions of this
# function: one that treats $1 as the lock file path, and one that treats $1
# as a script name and derives "${LOCK_DIR}/hana_<name>.lock" from it. As
# written there are two `if` openers but only one `fi`, so this text does not
# parse as-is -- confirm which revision is the intended one.
release_lock() {
# Presumably the older variant: $1 is the lock file path itself.
local lock_file="$1"
if [ -n "$lock_file" ] && [ -f "$lock_file" ]; then
# Presumably the newer variant: $1 is a script name; the lock path is built
# from LOCK_DIR and that name.
local script_name="$1"
local lock_file="${LOCK_DIR}/hana_${script_name}.lock"
if [ -f "$lock_file" ]; then
# Remove the lock file only when it exists as a regular file.
rm -f "$lock_file"
fi
}
# Read a stored state value.
# Usage: get_state "KEY"
# Globals: STATE_DIR (read)
# Output: contents of ${STATE_DIR}/KEY.state on stdout, or an empty line when
#         that file does not exist.
get_state() {
  local state_key="$1"
  local state_path="${STATE_DIR}/${state_key}.state"

  # Nothing recorded for this key: emit an empty line so a command
  # substitution in the caller yields "".
  [ -f "$state_path" ] || { echo ""; return; }

  cat "$state_path"
}
# Persist a state value.
# Usage: set_state "KEY" "VALUE"
# Globals: STATE_DIR (read)
# Effect: writes VALUE followed by a newline to ${STATE_DIR}/KEY.state,
#         replacing any previous content of that file.
set_state() {
  local key="$1"
  local value="$2"
  # printf rather than echo: echo would interpret values such as "-n" or "-e"
  # as options and store nothing instead of the value itself.
  printf '%s\n' "$value" > "${STATE_DIR}/${key}.state"
}
# NOTE(review): this span appears to interleave two revisions of the
# notification code: the state-tracking send_notification_if_changed and the
# simpler send_notification. As written, send_notification() is opened inside
# send_notification_if_changed(), one `if` has no matching `fi`, and only one
# closing brace is present, so the text does not parse as-is. The comments
# below describe each fragment separately; confirm which revision is intended.
# Send notification if state changed
# Usage: send_notification_if_changed "SCRIPT_NAME" "ALERT_KEY" "TITLE_PREFIX" "MESSAGE" "IS_ALERT" "CURRENT_VALUE"
send_notification_if_changed() {
local script_name="$1"
local alert_key="$2"
local title_prefix="$3"
local current_message="$4"
local is_alert_condition="$5"
local current_value="$6"
# Send notification via ntfy
# Usage: send_notification "TITLE" "MESSAGE"
send_notification() {
local title="$1"
local message="$2"
local hostname=$(hostname)
# Prefix the message with the company name and the local host name.
local full_message="[${COMPANY_NAME} | ${hostname}] ${message}"
# State-tracking fragment: compare the stored value for alert_key with the
# current one and only proceed when they differ.
local previous_value=$(get_state "$alert_key")
if [ "$current_value" != "$previous_value" ]; then
local full_title=""
local full_message=""
if [ "$is_alert_condition" == "true" ]; then
full_title="${title_prefix} Alert"
full_message="🚨 Critical: ${current_message}"
log_message "$script_name" "ALERT: ${full_message}"
else
# A "Resolved" message is built only when a previous value exists and it
# was not "OK"; otherwise the new value is stored and nothing is sent.
if [ -n "$previous_value" ] && [ "$previous_value" != "OK" ]; then
full_title="${title_prefix} Resolved"
full_message="✅ Resolved: ${current_message}"
log_message "$script_name" "RESOLVED: ${full_message}"
else
set_state "$alert_key" "$current_value"
return
fi
fi
local final_message="[${COMPANY_NAME} | ${hostname}] ${full_message}"
# The POST happens only when both NTFY_TOKEN and NTFY_TOPIC_URL are non-empty;
# curl output is discarded and its exit status is not checked.
if [ -n "$NTFY_TOKEN" ] && [ -n "$NTFY_TOPIC_URL" ]; then
curl -H "Authorization: Bearer ${NTFY_TOKEN}" -H "Title: ${full_title}" -d "${final_message}" "${NTFY_TOPIC_URL}" > /dev/null 2>&1
log_message "$script_name" "Notification sent: ${full_title}"
else
log_message "$script_name" "Ntfy not configured, skipping notification"
fi
# Record the value that was just reported.
set_state "$alert_key" "$current_value"
# Direct-send fragment: same ntfy POST, using the title/full_message built at
# the top of send_notification and logging under the fixed "NOTIFY" tag.
if [ -n "$NTFY_TOKEN" ] && [ -n "$NTFY_TOPIC_URL" ]; then
curl -H "Authorization: Bearer ${NTFY_TOKEN}" -H "Title: ${title}" -d "${full_message}" "${NTFY_TOPIC_URL}" > /dev/null 2>&1
log_message "NOTIFY" "Notification sent: ${title}"
else
log_message "NOTIFY" "Ntfy not configured, skipping notification"
fi
}
# Raise an alert: push a notification, then record it in the log.
# Usage: send_alert "SCRIPT_NAME" "TITLE_PREFIX" "MESSAGE"
#   SCRIPT_NAME  - tag passed to log_message
#   TITLE_PREFIX - notification title; " Alert" is appended
#   MESSAGE      - alert text; prefixed with "🚨 Critical: " in the notification
send_alert() {
  local caller="$1" prefix="$2" body="$3"
  local alert_title="${prefix} Alert"
  local alert_text="🚨 Critical: ${body}"

  send_notification "$alert_title" "$alert_text"
  log_message "$caller" "ALERT: ${body}"
}
# Announce recovery: push a "Resolved" notification, then record it in the log.
# Usage: send_ok "SCRIPT_NAME" "TITLE_PREFIX" "MESSAGE"
#   SCRIPT_NAME  - tag passed to log_message
#   TITLE_PREFIX - notification title; " Resolved" is appended
#   MESSAGE      - text; prefixed with "✅ Resolved: " in the notification
send_ok() {
  local caller="$1" prefix="$2" body="$3"
  local ok_title="${prefix} Resolved"
  local ok_text="✅ Resolved: ${body}"

  send_notification "$ok_title" "$ok_text"
  log_message "$caller" "RESOLVED: ${body}"
}
# Run command as HANA user using su
# Usage: run_as_hana_user "COMMAND"
run_as_hana_user() {
@@ -109,6 +81,45 @@ run_as_hana_user() {
su - "$HANA_USER" -c "$command"
}
# Run a SQL statement through hdbsql as the HANA OS user.
# Usage: execute_hana_sql "SQL_QUERY"
# Globals: HANA_USER, HDBSQL_PATH, HANA_USER_KEY (read)
# Output:  on success, the command's combined stdout/stderr on stdout;
#          on failure, that same text on stderr plus a log entry.
# Returns: 0 on success, 1 on failure
# Note: the query is embedded inside double quotes in the command string
# handed to `su -c`, so a query containing `"`, `$` or backticks must already
# be escaped by the caller.
execute_hana_sql() {
  local query="$1"
  local cmd_output

  # stderr is folded into the capture so it can be re-emitted below.
  cmd_output=$(su - "$HANA_USER" -c "$HDBSQL_PATH -U $HANA_USER_KEY -j -a -x \"$query\"" 2>&1)
  local rc=$?

  if [ "$rc" -eq 0 ]; then
    echo "$cmd_output"
    return 0
  fi

  log_message "SQL" "ERROR: Failed to execute SQL query. Exit code: ${rc}"
  echo "$cmd_output" >&2
  return 1
}
# Execute a SQL query and print its cleaned-up result (for single-value queries).
# Usage: execute_hana_sql_query "SQL_QUERY"
# Output:  the result of execute_hana_sql with every double quote removed,
#          leading/trailing whitespace trimmed, and any run of whitespace
#          (including newlines) collapsed to a single space.
# Returns: 0 on success, 1 when execute_hana_sql fails (nothing is printed)
execute_hana_sql_query() {
  local sql_query="$1"
  local output

  if ! output=$(execute_hana_sql "$sql_query"); then
    return 1
  fi

  # Split on whitespace with the shell instead of piping through xargs:
  # xargs treats apostrophes and backslashes as quoting characters, so a value
  # such as O'Brien made it abort and this function printed nothing while
  # still returning 0.
  local IFS=$' \t\n'
  local -a words=()
  # read returns non-zero at end of input even though the array is filled.
  read -r -d '' -a words <<< "${output//\"/}" || true

  # "${words[*]}" joins on the first IFS character, i.e. a single space.
  printf '%s\n' "${words[*]}"
  return 0
}
# Get disk usage percentage for a directory
# Usage: get_disk_usage_percentage "/path/to/dir"
# Returns: Usage percentage as integer (without % sign)
@@ -136,137 +147,3 @@ get_available_space_kb() {
local dir="$1"
df -k "$dir" 2>/dev/null | awk 'NR==2 {print $4}'
}
# Find directories for auto-delete on the same mount point.
# Usage: find_autodelete_dirs_on_mount "mount_point"
# Globals: DIRS_FOR_AUTODELETE (read) - array of "MOUNT_POINT:DIRECTORY" entries
# Output:  space-separated list of the configured directories whose mount
#          point equals the argument and which currently exist; an empty line
#          when none match.
find_autodelete_dirs_on_mount() {
  local mount_point="$1"
  local result=""
  # The loop variable is declared local so that calling this function directly
  # (not in a subshell) cannot overwrite a caller's variable named `entry`.
  local entry entry_mount cleanup_dir

  for entry in "${DIRS_FOR_AUTODELETE[@]}"; do
    entry_mount="${entry%%:*}"   # text before the first ':'
    cleanup_dir="${entry#*:}"    # text after the first ':'
    if [ "$entry_mount" == "$mount_point" ] && [ -d "$cleanup_dir" ]; then
      # Prepend the separator only when something has already been collected.
      result="${result:+$result }$cleanup_dir"
    fi
  done

  echo "$result"
}
# Delete old files below a directory, then prune empty subdirectories.
# Usage: clean_directory_files "/path/to/dir" "max_age_days"
#   $1 - directory to clean
#   $2 - age limit in days, used as `find -mtime +N` (default: 7)
# Output: "FILES_DELETED:BYTES_FREED" on stdout; "0:0" when the directory
#         does not exist. A summary line is also sent to log_message.
clean_directory_files() {
  local cleanup_dir="$1"
  local max_age_days="${2:-7}"
  local files_deleted=0
  local space_freed=0
  # Declared local so the read loop cannot clobber a caller's `file` variable.
  local file file_size

  if [ ! -d "$cleanup_dir" ]; then
    log_message "CLEANUP" "Directory '$cleanup_dir' not found. Skipping."
    echo "0:0"
    return
  fi

  # Find and delete old files (NUL-delimited so any file name is safe).
  while IFS= read -r -d '' file; do
    [ -f "$file" ] || continue
    # Size is taken before removal; falls back to 0 when stat fails.
    file_size=$(stat -c%s -- "$file" 2>/dev/null || echo "0")
    if rm -f -- "$file" 2>/dev/null; then
      files_deleted=$((files_deleted + 1))
      space_freed=$((space_freed + file_size))
    fi
  done < <(find "$cleanup_dir" -type f -mtime "+${max_age_days}" -print0 2>/dev/null)

  # Also clean empty directories. -mindepth 1 keeps the cleanup directory
  # itself: without it, find removes "$cleanup_dir" too once it is empty.
  find "$cleanup_dir" -mindepth 1 -type d -empty -delete 2>/dev/null

  log_message "CLEANUP" "Deleted $files_deleted files from '$cleanup_dir', freed $((space_freed / 1024)) KB"
  echo "${files_deleted}:${space_freed}"
}
# Automatic disk cleanup for one mount point.
# Usage: auto_cleanup "mount_point" "target_free_percentage"
#   $1 - mount point whose configured auto-delete directories are cleaned
#   $2 - accepted for interface compatibility; not used by the cleanup itself
# Globals: AUTO_CLEANUP_ENABLED, MAX_FILE_AGE_DAYS (read)
# Returns: 0 when at least one byte was freed; 1 when cleanup is disabled,
#          no directory is configured for the mount point, or nothing was freed.
auto_cleanup() {
  local mount_point="$1"

  if [ "$AUTO_CLEANUP_ENABLED" != "true" ]; then
    log_message "CLEANUP" "Auto-cleanup is disabled. Skipping."
    return 1
  fi

  local cleanup_dirs
  cleanup_dirs=$(find_autodelete_dirs_on_mount "$mount_point")

  if [ -z "$cleanup_dirs" ]; then
    log_message "CLEANUP" "No auto-delete directories configured for mount point '$mount_point'. Skipping cleanup."
    return 1
  fi

  log_message "CLEANUP" "Starting auto-cleanup for mount point '$mount_point'. Directories: $cleanup_dirs"

  # Split the whitespace-separated list into an array. Unlike an unquoted
  # `for x in $list`, read -a does not glob-expand the entries.
  local -a dir_list=()
  # read returns non-zero at end of input even though the array is filled.
  read -r -d '' -a dir_list <<< "$cleanup_dirs" || true

  local total_freed=0
  local total_files=0
  # Loop variables are local so a direct call cannot clobber caller globals.
  local cleanup_dir result files freed

  for cleanup_dir in "${dir_list[@]}"; do
    # clean_directory_files reports "FILES:BYTES".
    result=$(clean_directory_files "$cleanup_dir" "$MAX_FILE_AGE_DAYS")
    files="${result%%:*}"
    freed="${result#*:}"
    total_files=$((total_files + files))
    total_freed=$((total_freed + freed))
  done

  log_message "CLEANUP" "Cleanup complete. Total files deleted: $total_files, Total space freed: $((total_freed / 1024)) KB"

  if [ "$total_freed" -gt 0 ]; then
    return 0
  else
    return 1
  fi
}
# Check a directory's disk usage and trigger auto-cleanup when it is too high.
# Usage: check_and_cleanup_disk "directory" "threshold"
#   $1 - directory whose filesystem usage is checked
#   $2 - usage percentage above which cleanup is attempted (default: 85)
# Globals: MIN_FREE_SPACE_AFTER_CLEANUP (read, forwarded to auto_cleanup)
# Returns: 0 when usage is at or below the threshold, or when auto_cleanup
#          succeeded (even if usage is still above the threshold afterwards);
#          1 when usage is empty or 0, or when auto_cleanup failed.
check_and_cleanup_disk() {
  local dir="$1"
  local limit="${2:-85}"
  local current_pct="$(get_disk_usage_percentage "$dir")"
  local mnt="$(get_mount_point "$dir")"

  # An empty or zero reading is treated as "usage unknown".
  if [ -z "$current_pct" ] || [ "$current_pct" -eq 0 ]; then
    log_message "CLEANUP" "Could not determine disk usage for '$dir'. Skipping."
    return 1
  fi

  # Within limits: nothing to do.
  if ! [ "$current_pct" -gt "$limit" ]; then
    return 0
  fi

  log_message "CLEANUP" "Disk usage ${current_pct}% exceeds threshold ${limit}% for '$dir'. Attempting cleanup..."

  if ! auto_cleanup "$mnt" "$MIN_FREE_SPACE_AFTER_CLEANUP"; then
    log_message "CLEANUP" "Cleanup failed or no files to clean for '$dir'"
    return 1
  fi

  local after_pct="$(get_disk_usage_percentage "$dir")"
  log_message "CLEANUP" "After cleanup, disk usage is ${after_pct}%"

  # Still above the limit after a successful cleanup: log it, but report
  # success all the same.
  if ! [ "$after_pct" -le "$limit" ]; then
    log_message "CLEANUP" "Cleanup completed but usage ${after_pct}% still above threshold"
  fi
  return 0
}