refactor(monitoring): simplify notification system and remove auto-cleanup

- Replace state-based notifications with direct alert functions
- Remove auto-cleanup functionality from disk monitoring and configuration
- Simplify lock acquisition/release across all monitoring scripts
- Add execute_hana_sql helper functions for consistent SQL execution
- Remove state file tracking in favor of direct file operations
- Standardize error handling with exit codes on critical failures
- Clean up hana.conf by removing unused auto-delete directory settings
This commit is contained in:
2026-03-12 21:52:49 +01:00
parent 5a92bc4e93
commit cf5b81889d
8 changed files with 183 additions and 391 deletions

View File

@@ -12,71 +12,66 @@ source "${SCRIPT_DIR}/hana.conf"
source "${SCRIPT_DIR}/hana_lib.sh"
# Acquire lock
LOCK_FILE=$(acquire_lock "$SCRIPT_NAME")
if [ $? -ne 0 ]; then
if ! acquire_lock "$SCRIPT_NAME"; then
exit 1
fi
trap 'release_lock "$LOCK_FILE"' EXIT
trap 'release_lock "$SCRIPT_NAME"' EXIT
log_message "$SCRIPT_NAME" "Starting statement queue check..."
# SQL Query for statement queue
STATEMENT_QUEUE_SQL="SELECT COUNT(*) FROM M_SERVICE_THREADS WHERE THREAD_TYPE = 'SqlExecutor' AND THREAD_STATE = 'Queueing';"
# Check if hdbsql is available
if [ ! -x "$HDBSQL_PATH" ]; then
log_message "$SCRIPT_NAME" "ERROR: hdbsql not found or not executable at ${HDBSQL_PATH}"
send_notification_if_changed "$SCRIPT_NAME" "hana_hdbsql_path_queue" "HANA Monitor Error" "hdbsql not found or not executable at ${HDBSQL_PATH}" "true" "HDBSQL_ERROR"
send_alert "$SCRIPT_NAME" "HANA Monitor Error" "hdbsql not found or not executable at ${HDBSQL_PATH}"
exit 1
fi
# Execute SQL query as HANA user with improved error handling
queue_result=$(su - "$HANA_USER" -c "$HDBSQL_PATH -U $HANA_USER_KEY -j -a -x \"$STATEMENT_QUEUE_SQL\"" 2>&1)
# SQL Query for statement queue
STATEMENT_QUEUE_SQL="SELECT COUNT(*) FROM M_SERVICE_THREADS WHERE THREAD_TYPE = 'SqlExecutor' AND THREAD_STATE = 'Queueing';"
# Execute SQL query
queue_result=$(execute_hana_sql_query "$STATEMENT_QUEUE_SQL")
sql_status=$?
if [ $sql_status -ne 0 ]; then
log_message "$SCRIPT_NAME" "ERROR: Failed to execute queue query. Exit code: ${sql_status}"
send_notification_if_changed "$SCRIPT_NAME" "hana_queue_query_error" "HANA Queue Error" "Failed to execute queue query. Exit code: ${sql_status}" "true" "QUERY_ERROR"
log_message "$SCRIPT_NAME" "ERROR: Failed to execute queue query."
send_alert "$SCRIPT_NAME" "HANA Queue Error" "Failed to execute queue query."
exit 1
fi
# Clear any previous query error state
send_notification_if_changed "$SCRIPT_NAME" "hana_queue_query_error" "HANA Statement Queue" "Queue query successful." "false" "OK"
# Parse queue count
queue_count=$(echo "$queue_result" | tr -d '"' | xargs)
# Validate queue count is a number
if ! [[ "$queue_count" =~ ^[0-9]+$ ]]; then
log_message "$SCRIPT_NAME" "WARNING: Could not retrieve HANA statement queue count. Got: '${queue_count}'. Skipping check."
send_notification_if_changed "$SCRIPT_NAME" "hana_statement_queue_check_fail" "HANA Monitor Warning" "Could not retrieve statement queue count. Got: '${queue_count}'" "true" "QUEUE_CHECK_FAIL"
log_message "$SCRIPT_NAME" "WARNING: Could not retrieve HANA statement queue count. Got: '${queue_count}'."
send_alert "$SCRIPT_NAME" "HANA Monitor Warning" "Could not retrieve statement queue count."
exit 1
fi
log_message "$SCRIPT_NAME" "Current statement queue length: ${queue_count}"
# Get breach count from state file
breach_count_file="${STATE_DIR}/statement_queue_breach_count"
breach_count=0
if [ -f "$breach_count_file" ]; then
breach_count=$(cat "$breach_count_file")
fi
if [ "$queue_count" -gt "$STATEMENT_QUEUE_THRESHOLD" ]; then
breach_count=$((breach_count + 1))
log_message "$SCRIPT_NAME" "Statement queue is above threshold (${queue_count} > ${STATEMENT_QUEUE_THRESHOLD}). Consecutive breach count: ${breach_count}/${STATEMENT_QUEUE_CONSECUTIVE_RUNS}."
else
# Clear any previous check failure state
send_notification_if_changed "$SCRIPT_NAME" "hana_statement_queue_check_fail" "HANA Monitor Warning" "Statement queue check is working." "false" "OK"
log_message "$SCRIPT_NAME" "Current statement queue length: ${queue_count}"
# Get breach count from state
breach_count=$(get_state "statement_queue_breach_count")
breach_count=${breach_count:-0}
if [ "$queue_count" -gt "$STATEMENT_QUEUE_THRESHOLD" ]; then
breach_count=$((breach_count + 1))
log_message "$SCRIPT_NAME" "Statement queue is above threshold (${queue_count} > ${STATEMENT_QUEUE_THRESHOLD}). Consecutive breach count: ${breach_count}/${STATEMENT_QUEUE_CONSECUTIVE_RUNS}."
else
if [ "$breach_count" -gt 0 ]; then
log_message "$SCRIPT_NAME" "Statement queue returned to normal. Resetting breach count from ${breach_count} to 0."
fi
breach_count=0
if [ "$breach_count" -gt 0 ]; then
log_message "$SCRIPT_NAME" "Statement queue returned to normal. Resetting breach count from ${breach_count} to 0."
fi
set_state "statement_queue_breach_count" "$breach_count"
breach_count=0
fi
echo "$breach_count" > "$breach_count_file"
if [ "$breach_count" -ge "$STATEMENT_QUEUE_CONSECUTIVE_RUNS" ]; then
message="Statement queue has been over ${STATEMENT_QUEUE_THRESHOLD} for ${breach_count} checks. Current count: ${queue_count}."
send_notification_if_changed "$SCRIPT_NAME" "hana_statement_queue_status" "HANA Statement Queue" "$message" "true" "ALERT:${queue_count}"
else
message="Statement queue is normal. Current count: ${queue_count}."
send_notification_if_changed "$SCRIPT_NAME" "hana_statement_queue_status" "HANA Statement Queue" "$message" "false" "OK"
fi
if [ "$breach_count" -ge "$STATEMENT_QUEUE_CONSECUTIVE_RUNS" ]; then
message="Statement queue has been over ${STATEMENT_QUEUE_THRESHOLD} for ${breach_count} checks. Current count: ${queue_count}."
send_alert "$SCRIPT_NAME" "HANA Statement Queue" "$message"
exit 1
else
log_message "$SCRIPT_NAME" "Statement queue is normal. Current count: ${queue_count}."
fi
log_message "$SCRIPT_NAME" "Statement queue check complete."