refactor(monitoring): simplify notification system and remove auto-cleanup
- Replace state-based notifications with direct alert functions
- Remove auto-cleanup functionality from disk monitoring and configuration
- Simplify lock acquisition/release across all monitoring scripts
- Add execute_hana_sql helper functions for consistent SQL execution
- Remove state file tracking in favor of direct file operations
- Standardize error handling with exit codes on critical failures
- Clean up hana.conf by removing unused auto-delete directory settings
This commit is contained in:
@@ -12,71 +12,66 @@ source "${SCRIPT_DIR}/hana.conf"
|
||||
source "${SCRIPT_DIR}/hana_lib.sh"
|
||||
|
||||
# Acquire lock
|
||||
LOCK_FILE=$(acquire_lock "$SCRIPT_NAME")
|
||||
if [ $? -ne 0 ]; then
|
||||
if ! acquire_lock "$SCRIPT_NAME"; then
|
||||
exit 1
|
||||
fi
|
||||
trap 'release_lock "$LOCK_FILE"' EXIT
|
||||
trap 'release_lock "$SCRIPT_NAME"' EXIT
|
||||
|
||||
log_message "$SCRIPT_NAME" "Starting statement queue check..."
|
||||
|
||||
# SQL Query for statement queue
|
||||
STATEMENT_QUEUE_SQL="SELECT COUNT(*) FROM M_SERVICE_THREADS WHERE THREAD_TYPE = 'SqlExecutor' AND THREAD_STATE = 'Queueing';"
|
||||
|
||||
# Check if hdbsql is available
|
||||
if [ ! -x "$HDBSQL_PATH" ]; then
|
||||
log_message "$SCRIPT_NAME" "ERROR: hdbsql not found or not executable at ${HDBSQL_PATH}"
|
||||
send_notification_if_changed "$SCRIPT_NAME" "hana_hdbsql_path_queue" "HANA Monitor Error" "hdbsql not found or not executable at ${HDBSQL_PATH}" "true" "HDBSQL_ERROR"
|
||||
send_alert "$SCRIPT_NAME" "HANA Monitor Error" "hdbsql not found or not executable at ${HDBSQL_PATH}"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Execute SQL query as HANA user with improved error handling
|
||||
queue_result=$(su - "$HANA_USER" -c "$HDBSQL_PATH -U $HANA_USER_KEY -j -a -x \"$STATEMENT_QUEUE_SQL\"" 2>&1)
|
||||
# SQL Query for statement queue
|
||||
STATEMENT_QUEUE_SQL="SELECT COUNT(*) FROM M_SERVICE_THREADS WHERE THREAD_TYPE = 'SqlExecutor' AND THREAD_STATE = 'Queueing';"
|
||||
|
||||
# Execute SQL query
|
||||
queue_result=$(execute_hana_sql_query "$STATEMENT_QUEUE_SQL")
|
||||
sql_status=$?
|
||||
|
||||
if [ $sql_status -ne 0 ]; then
|
||||
log_message "$SCRIPT_NAME" "ERROR: Failed to execute queue query. Exit code: ${sql_status}"
|
||||
send_notification_if_changed "$SCRIPT_NAME" "hana_queue_query_error" "HANA Queue Error" "Failed to execute queue query. Exit code: ${sql_status}" "true" "QUERY_ERROR"
|
||||
log_message "$SCRIPT_NAME" "ERROR: Failed to execute queue query."
|
||||
send_alert "$SCRIPT_NAME" "HANA Queue Error" "Failed to execute queue query."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Clear any previous query error state
|
||||
send_notification_if_changed "$SCRIPT_NAME" "hana_queue_query_error" "HANA Statement Queue" "Queue query successful." "false" "OK"
|
||||
|
||||
# Parse queue count
|
||||
queue_count=$(echo "$queue_result" | tr -d '"' | xargs)
|
||||
|
||||
# Validate queue count is a number
|
||||
if ! [[ "$queue_count" =~ ^[0-9]+$ ]]; then
|
||||
log_message "$SCRIPT_NAME" "WARNING: Could not retrieve HANA statement queue count. Got: '${queue_count}'. Skipping check."
|
||||
send_notification_if_changed "$SCRIPT_NAME" "hana_statement_queue_check_fail" "HANA Monitor Warning" "Could not retrieve statement queue count. Got: '${queue_count}'" "true" "QUEUE_CHECK_FAIL"
|
||||
log_message "$SCRIPT_NAME" "WARNING: Could not retrieve HANA statement queue count. Got: '${queue_count}'."
|
||||
send_alert "$SCRIPT_NAME" "HANA Monitor Warning" "Could not retrieve statement queue count."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
log_message "$SCRIPT_NAME" "Current statement queue length: ${queue_count}"
|
||||
|
||||
# Get breach count from state file
|
||||
breach_count_file="${STATE_DIR}/statement_queue_breach_count"
|
||||
breach_count=0
|
||||
if [ -f "$breach_count_file" ]; then
|
||||
breach_count=$(cat "$breach_count_file")
|
||||
fi
|
||||
|
||||
if [ "$queue_count" -gt "$STATEMENT_QUEUE_THRESHOLD" ]; then
|
||||
breach_count=$((breach_count + 1))
|
||||
log_message "$SCRIPT_NAME" "Statement queue is above threshold (${queue_count} > ${STATEMENT_QUEUE_THRESHOLD}). Consecutive breach count: ${breach_count}/${STATEMENT_QUEUE_CONSECUTIVE_RUNS}."
|
||||
else
|
||||
# Clear any previous check failure state
|
||||
send_notification_if_changed "$SCRIPT_NAME" "hana_statement_queue_check_fail" "HANA Monitor Warning" "Statement queue check is working." "false" "OK"
|
||||
log_message "$SCRIPT_NAME" "Current statement queue length: ${queue_count}"
|
||||
|
||||
# Get breach count from state
|
||||
breach_count=$(get_state "statement_queue_breach_count")
|
||||
breach_count=${breach_count:-0}
|
||||
|
||||
if [ "$queue_count" -gt "$STATEMENT_QUEUE_THRESHOLD" ]; then
|
||||
breach_count=$((breach_count + 1))
|
||||
log_message "$SCRIPT_NAME" "Statement queue is above threshold (${queue_count} > ${STATEMENT_QUEUE_THRESHOLD}). Consecutive breach count: ${breach_count}/${STATEMENT_QUEUE_CONSECUTIVE_RUNS}."
|
||||
else
|
||||
if [ "$breach_count" -gt 0 ]; then
|
||||
log_message "$SCRIPT_NAME" "Statement queue returned to normal. Resetting breach count from ${breach_count} to 0."
|
||||
fi
|
||||
breach_count=0
|
||||
if [ "$breach_count" -gt 0 ]; then
|
||||
log_message "$SCRIPT_NAME" "Statement queue returned to normal. Resetting breach count from ${breach_count} to 0."
|
||||
fi
|
||||
set_state "statement_queue_breach_count" "$breach_count"
|
||||
breach_count=0
|
||||
fi
|
||||
echo "$breach_count" > "$breach_count_file"
|
||||
|
||||
if [ "$breach_count" -ge "$STATEMENT_QUEUE_CONSECUTIVE_RUNS" ]; then
|
||||
message="Statement queue has been over ${STATEMENT_QUEUE_THRESHOLD} for ${breach_count} checks. Current count: ${queue_count}."
|
||||
send_notification_if_changed "$SCRIPT_NAME" "hana_statement_queue_status" "HANA Statement Queue" "$message" "true" "ALERT:${queue_count}"
|
||||
else
|
||||
message="Statement queue is normal. Current count: ${queue_count}."
|
||||
send_notification_if_changed "$SCRIPT_NAME" "hana_statement_queue_status" "HANA Statement Queue" "$message" "false" "OK"
|
||||
fi
|
||||
if [ "$breach_count" -ge "$STATEMENT_QUEUE_CONSECUTIVE_RUNS" ]; then
|
||||
message="Statement queue has been over ${STATEMENT_QUEUE_THRESHOLD} for ${breach_count} checks. Current count: ${queue_count}."
|
||||
send_alert "$SCRIPT_NAME" "HANA Statement Queue" "$message"
|
||||
exit 1
|
||||
else
|
||||
log_message "$SCRIPT_NAME" "Statement queue is normal. Current count: ${queue_count}."
|
||||
fi
|
||||
|
||||
log_message "$SCRIPT_NAME" "Statement queue check complete."
|
||||
|
||||
Reference in New Issue
Block a user