diff --git a/hana.conf b/hana.conf index 8ce7d96..4bd0a94 100644 --- a/hana.conf +++ b/hana.conf @@ -28,29 +28,6 @@ DIRECTORIES_TO_MONITOR=( "/usr/sap" ) -# --- Directories for Auto-Delete --- -# These directories will be automatically cleaned when disk space is low -# Format: "mount_point:directory_path" -# The script will check if a monitored directory is on the same mount point -# as an auto-delete directory and can clean it to free up space -# NOTE: Do NOT include HANA system logs like /hana/log - only include -# directories with safe-to-delete files like backup logs, temp files, etc. -DIRS_FOR_AUTODELETE=( - "/hana/shared:/hana/shared/backup/schema/DB_NDB" - "/hana/shared:/hana/shared/backup/schema/SYSTEMDB" - "/hana/data:/hana/data/temp" - "/usr/sap:/usr/sap/trans/log" - "/usr/sap:/usr/sap/hostctrl/work/log" -) - -# --- Disk Auto-Delete Configuration --- -# Minimum free space percentage to maintain after cleanup -MIN_FREE_SPACE_AFTER_CLEANUP=5 -# Maximum age of files to delete (in days) -MAX_FILE_AGE_DAYS=7 -# Enable automatic cleanup when disk usage exceeds threshold -AUTO_CLEANUP_ENABLED=true - # --- Thresholds --- DISK_USAGE_THRESHOLD=85 TRUNCATED_PERCENTAGE_THRESHOLD=50 diff --git a/hana_backup.sh b/hana_backup.sh index 5ce28a4..d32e907 100644 --- a/hana_backup.sh +++ b/hana_backup.sh @@ -12,31 +12,30 @@ source "${SCRIPT_DIR}/hana.conf" source "${SCRIPT_DIR}/hana_lib.sh" # Acquire lock -LOCK_FILE=$(acquire_lock "$SCRIPT_NAME") -if [ $? -ne 0 ]; then +if ! acquire_lock "$SCRIPT_NAME"; then exit 1 fi -trap 'release_lock "$LOCK_FILE"' EXIT +trap 'release_lock "$SCRIPT_NAME"' EXIT log_message "$SCRIPT_NAME" "Starting backup status check..." # Check if hdbsql is available if [ ! 
-x "$HDBSQL_PATH" ]; then log_message "$SCRIPT_NAME" "ERROR: hdbsql not found or not executable at ${HDBSQL_PATH}" - send_notification_if_changed "$SCRIPT_NAME" "hana_hdbsql_path_backup" "HANA Monitor Error" "hdbsql not found or not executable at ${HDBSQL_PATH}" "true" "HDBSQL_ERROR" + send_alert "$SCRIPT_NAME" "HANA Monitor Error" "hdbsql not found or not executable at ${HDBSQL_PATH}" exit 1 fi # SQL Query for last successful backup BACKUP_SQL="SELECT TOP 1 SYS_START_TIME FROM M_BACKUP_CATALOG WHERE ENTRY_TYPE_NAME = 'complete data backup' AND STATE_NAME = 'successful' ORDER BY SYS_START_TIME DESC" -# Execute SQL query as HANA user with improved error handling -backup_result=$(su - "$HANA_USER" -c "$HDBSQL_PATH -U $HANA_USER_KEY -j -a -x \"$BACKUP_SQL\"" 2>&1) +# Execute SQL query +backup_result=$(execute_hana_sql "$BACKUP_SQL") sql_status=$? if [ $sql_status -ne 0 ]; then log_message "$SCRIPT_NAME" "ERROR: Failed to execute backup query. Exit code: ${sql_status}" - send_notification_if_changed "$SCRIPT_NAME" "hana_backup_query_error" "HANA Backup Error" "Failed to execute backup query. Exit code: ${sql_status}" "true" "QUERY_ERROR" + send_alert "$SCRIPT_NAME" "HANA Backup Error" "Failed to execute backup query. Exit code: ${sql_status}" exit 1 fi @@ -45,18 +44,15 @@ last_backup_date=$(echo "$backup_result" | tr -d '"' | sed 's/\..*//') if [[ -z "$last_backup_date" || "$last_backup_date" == *"error"* || "$last_backup_date" == *"Error"* ]]; then message="No successful complete data backup found for ${COMPANY_NAME} HANA." log_message "$SCRIPT_NAME" "CRITICAL: ${message}" - send_notification_if_changed "$SCRIPT_NAME" "hana_backup_status" "HANA Backup" "$message" "true" "NO_BACKUP" + send_alert "$SCRIPT_NAME" "HANA Backup" "$message" exit 1 fi -# Clear any previous query error state -send_notification_if_changed "$SCRIPT_NAME" "hana_backup_query_error" "HANA Backup" "Backup query successful." 
"false" "OK" - # Calculate backup age last_backup_epoch=$(date -d "$last_backup_date" +%s 2>/dev/null) if [ $? -ne 0 ]; then log_message "$SCRIPT_NAME" "ERROR: Failed to parse backup date: ${last_backup_date}" - send_notification_if_changed "$SCRIPT_NAME" "hana_backup_status" "HANA Backup" "Failed to parse backup date: ${last_backup_date}" "true" "DATE_PARSE_ERROR" + send_alert "$SCRIPT_NAME" "HANA Backup" "Failed to parse backup date: ${last_backup_date}" exit 1 fi @@ -73,11 +69,11 @@ fi if [ $age_seconds -gt $threshold_seconds ]; then message="Last successful HANA backup for ${COMPANY_NAME} is ${age_hours} hours old, which exceeds the threshold of ${BACKUP_THRESHOLD_HOURS} hours. Last backup was on: ${last_backup_date}." log_message "$SCRIPT_NAME" "CRITICAL: ${message}" - send_notification_if_changed "$SCRIPT_NAME" "hana_backup_status" "HANA Backup" "$message" "true" "${age_hours}h" + send_alert "$SCRIPT_NAME" "HANA Backup" "$message" + exit 1 else message="Last successful backup is ${age_hours} hours old (Threshold: ${BACKUP_THRESHOLD_HOURS} hours)." log_message "$SCRIPT_NAME" "SUCCESS: ${message}" - send_notification_if_changed "$SCRIPT_NAME" "hana_backup_status" "HANA Backup" "$message" "false" "OK" fi log_message "$SCRIPT_NAME" "Backup check complete." diff --git a/hana_disk.sh b/hana_disk.sh index 3d87646..b74e777 100644 --- a/hana_disk.sh +++ b/hana_disk.sh @@ -12,18 +12,16 @@ source "${SCRIPT_DIR}/hana.conf" source "${SCRIPT_DIR}/hana_lib.sh" # Acquire lock -LOCK_FILE=$(acquire_lock "$SCRIPT_NAME") -if [ $? -ne 0 ]; then +if ! acquire_lock "$SCRIPT_NAME"; then exit 1 fi -trap 'release_lock "$LOCK_FILE"' EXIT +trap 'release_lock "$SCRIPT_NAME"' EXIT log_message "$SCRIPT_NAME" "Starting disk usage check..." # Track overall status ALERT_COUNT=0 TOTAL_DIRS=0 -CLEANUP_PERFORMED=0 for dir in "${DIRECTORIES_TO_MONITOR[@]}"; do TOTAL_DIRS=$((TOTAL_DIRS + 1)) @@ -31,7 +29,7 @@ for dir in "${DIRECTORIES_TO_MONITOR[@]}"; do # Check if directory exists if [ ! 
-d "$dir" ]; then log_message "$SCRIPT_NAME" "WARNING: Directory '$dir' not found. Skipping." - send_notification_if_changed "$SCRIPT_NAME" "disk_dir_not_found_${dir//\//_}" "HANA Disk Warning" "Directory '$dir' not found." "true" "DIR_NOT_FOUND" + send_alert "$SCRIPT_NAME" "HANA Disk Warning" "Directory '$dir' not found." ALERT_COUNT=$((ALERT_COUNT + 1)) continue fi @@ -49,33 +47,15 @@ for dir in "${DIRECTORIES_TO_MONITOR[@]}"; do # Check if usage exceeds threshold if [ "$usage" -gt "$DISK_USAGE_THRESHOLD" ]; then log_message "$SCRIPT_NAME" "ALERT: ${dir} usage is at ${usage}% which is above the ${DISK_USAGE_THRESHOLD}% threshold." - - # Attempt auto-cleanup if enabled - if [ "$AUTO_CLEANUP_ENABLED" == "true" ]; then - log_message "$SCRIPT_NAME" "Attempting auto-cleanup for '${dir}'..." - mount_point=$(get_mount_point "$dir") - - if auto_cleanup "$mount_point" "$MIN_FREE_SPACE_AFTER_CLEANUP"; then - CLEANUP_PERFORMED=$((CLEANUP_PERFORMED + 1)) - new_usage=$(get_disk_usage_percentage "$dir") - log_message "$SCRIPT_NAME" "After cleanup, ${dir} usage is at ${new_usage}%" - usage=$new_usage - else - log_message "$SCRIPT_NAME" "Auto-cleanup failed or no files to clean for '${dir}'" - fi - fi - - # Send notification with final usage after cleanup attempt - send_notification_if_changed "$SCRIPT_NAME" "disk_usage_${dir//\//_}" "HANA Disk" "Disk usage for ${dir} is at ${usage}%." "true" "${usage}%" + send_alert "$SCRIPT_NAME" "HANA Disk" "Disk usage for ${dir} is at ${usage}% (threshold: ${DISK_USAGE_THRESHOLD}%)." ALERT_COUNT=$((ALERT_COUNT + 1)) else - # Send OK notification only if state changed from alert - send_notification_if_changed "$SCRIPT_NAME" "disk_usage_${dir//\//_}" "HANA Disk" "Disk usage for ${dir} is at ${usage}% (below threshold)." "false" "OK" + log_message "$SCRIPT_NAME" "OK: ${dir} usage is at ${usage}% (below threshold)." fi done # Summary logging -log_message "$SCRIPT_NAME" "Disk check complete. 
Total: ${TOTAL_DIRS} dirs, ${ALERT_COUNT} alerts, ${CLEANUP_PERFORMED} cleanups performed." +log_message "$SCRIPT_NAME" "Disk check complete. Total: ${TOTAL_DIRS} dirs, ${ALERT_COUNT} alerts." # Exit with status based on alerts if [ "$ALERT_COUNT" -gt 0 ]; then diff --git a/hana_lib.sh b/hana_lib.sh index 21d79a9..3c9b3cb 100644 --- a/hana_lib.sh +++ b/hana_lib.sh @@ -25,83 +25,55 @@ acquire_lock() { fi touch "$lock_file" - echo "$lock_file" return 0 } # Release lock -# Usage: release_lock "LOCK_FILE" +# Usage: release_lock "SCRIPT_NAME" release_lock() { - local lock_file="$1" - if [ -n "$lock_file" ] && [ -f "$lock_file" ]; then + local script_name="$1" + local lock_file="${LOCK_DIR}/hana_${script_name}.lock" + if [ -f "$lock_file" ]; then rm -f "$lock_file" fi } -# Get state value -# Usage: get_state "KEY" -get_state() { - local key="$1" - if [ -f "${STATE_DIR}/${key}.state" ]; then - cat "${STATE_DIR}/${key}.state" - else - echo "" - fi -} - -# Set state value -# Usage: set_state "KEY" "VALUE" -set_state() { - local key="$1" - local value="$2" - echo "$value" > "${STATE_DIR}/${key}.state" -} - -# Send notification if state changed -# Usage: send_notification_if_changed "SCRIPT_NAME" "ALERT_KEY" "TITLE_PREFIX" "MESSAGE" "IS_ALERT" "CURRENT_VALUE" -send_notification_if_changed() { - local script_name="$1" - local alert_key="$2" - local title_prefix="$3" - local current_message="$4" - local is_alert_condition="$5" - local current_value="$6" +# Send notification via ntfy +# Usage: send_notification "TITLE" "MESSAGE" +send_notification() { + local title="$1" + local message="$2" local hostname=$(hostname) + local full_message="[${COMPANY_NAME} | ${hostname}] ${message}" - local previous_value=$(get_state "$alert_key") - - if [ "$current_value" != "$previous_value" ]; then - local full_title="" - local full_message="" - - if [ "$is_alert_condition" == "true" ]; then - full_title="${title_prefix} Alert" - full_message="🚨 Critical: ${current_message}" - log_message 
"$script_name" "ALERT: ${full_message}" - else - if [ -n "$previous_value" ] && [ "$previous_value" != "OK" ]; then - full_title="${title_prefix} Resolved" - full_message="✅ Resolved: ${current_message}" - log_message "$script_name" "RESOLVED: ${full_message}" - else - set_state "$alert_key" "$current_value" - return - fi - fi - - local final_message="[${COMPANY_NAME} | ${hostname}] ${full_message}" - - if [ -n "$NTFY_TOKEN" ] && [ -n "$NTFY_TOPIC_URL" ]; then - curl -H "Authorization: Bearer ${NTFY_TOKEN}" -H "Title: ${full_title}" -d "${final_message}" "${NTFY_TOPIC_URL}" > /dev/null 2>&1 - log_message "$script_name" "Notification sent: ${full_title}" - else - log_message "$script_name" "Ntfy not configured, skipping notification" - fi - - set_state "$alert_key" "$current_value" + if [ -n "$NTFY_TOKEN" ] && [ -n "$NTFY_TOPIC_URL" ]; then + curl -H "Authorization: Bearer ${NTFY_TOKEN}" -H "Title: ${title}" -d "${full_message}" "${NTFY_TOPIC_URL}" > /dev/null 2>&1 + log_message "NOTIFY" "Notification sent: ${title}" + else + log_message "NOTIFY" "Ntfy not configured, skipping notification" fi } +# Send alert notification +# Usage: send_alert "SCRIPT_NAME" "TITLE_PREFIX" "MESSAGE" +send_alert() { + local script_name="$1" + local title_prefix="$2" + local message="$3" + send_notification "${title_prefix} Alert" "🚨 Critical: ${message}" + log_message "$script_name" "ALERT: ${message}" +} + +# Send OK notification (state change from alert to normal) +# Usage: send_ok "SCRIPT_NAME" "TITLE_PREFIX" "MESSAGE" +send_ok() { + local script_name="$1" + local title_prefix="$2" + local message="$3" + send_notification "${title_prefix} Resolved" "✅ Resolved: ${message}" + log_message "$script_name" "RESOLVED: ${message}" +} + # Run command as HANA user using su # Usage: run_as_hana_user "COMMAND" run_as_hana_user() { @@ -109,6 +81,45 @@ run_as_hana_user() { su - "$HANA_USER" -c "$command" } +# Execute SQL query as HANA user +# Usage: execute_hana_sql "SQL_QUERY" +# Returns: SQL 
output on stdout, returns 0 on success, 1 on failure +execute_hana_sql() { + local sql_query="$1" + local output + + output=$(su - "$HANA_USER" -c "$HDBSQL_PATH -U $HANA_USER_KEY -j -a -x \"$sql_query\"" 2>&1) + local sql_status=$? + + if [ $sql_status -ne 0 ]; then + log_message "SQL" "ERROR: Failed to execute SQL query. Exit code: ${sql_status}" + echo "$output" >&2 + return 1 + fi + + echo "$output" + return 0 +} + +# Execute SQL query and return result (for single-value queries) +# Usage: execute_hana_sql_query "SQL_QUERY" +# Returns: Query result on stdout, returns 0 on success, 1 on failure +execute_hana_sql_query() { + local sql_query="$1" + local output + + output=$(execute_hana_sql "$sql_query") + local sql_status=$? + + if [ $sql_status -ne 0 ]; then + return 1 + fi + + # Clean output: remove quotes and whitespace + echo "$output" | tr -d '"' | xargs + return 0 +} + # Get disk usage percentage for a directory # Usage: get_disk_usage_percentage "/path/to/dir" # Returns: Usage percentage as integer (without % sign) @@ -136,137 +147,3 @@ get_available_space_kb() { local dir="$1" df -k "$dir" 2>/dev/null | awk 'NR==2 {print $4}' } - -# Find directories for auto-delete on the same mount point -# Usage: find_autodelete_dirs_on_mount "mount_point" -# Returns: Space-separated list of directories to clean -find_autodelete_dirs_on_mount() { - local mount_point="$1" - local result="" - - for entry in "${DIRS_FOR_AUTODELETE[@]}"; do - local entry_mount="${entry%%:*}" - local cleanup_dir="${entry#*:}" - - if [ "$entry_mount" == "$mount_point" ] && [ -d "$cleanup_dir" ]; then - if [ -n "$result" ]; then - result="$result $cleanup_dir" - else - result="$cleanup_dir" - fi - fi - done - - echo "$result" -} - -# Clean old files in a directory -# Usage: clean_directory_files "/path/to/dir" "max_age_days" -# Returns: Number of files deleted and space freed -clean_directory_files() { - local cleanup_dir="$1" - local max_age_days="${2:-7}" - local files_deleted=0 - local 
space_freed=0 - - if [ ! -d "$cleanup_dir" ]; then - log_message "CLEANUP" "Directory '$cleanup_dir' not found. Skipping." - echo "0:0" - return - fi - - # Find and delete old files - while IFS= read -r -d '' file; do - if [ -f "$file" ]; then - local file_size=$(stat -c%s "$file" 2>/dev/null || echo "0") - rm -f "$file" 2>/dev/null && { - files_deleted=$((files_deleted + 1)) - space_freed=$((space_freed + file_size)) - } - fi - done < <(find "$cleanup_dir" -type f -mtime +$max_age_days -print0 2>/dev/null) - - # Also clean empty directories - find "$cleanup_dir" -type d -empty -delete 2>/dev/null - - log_message "CLEANUP" "Deleted $files_deleted files from '$cleanup_dir', freed $((space_freed / 1024)) KB" - echo "${files_deleted}:${space_freed}" -} - -# Automatic disk cleanup function -# Usage: auto_cleanup "mount_point" "target_free_percentage" -# Returns: 0 if cleanup successful, 1 if failed or not needed -auto_cleanup() { - local mount_point="$1" - local target_free_percentage="${2:-5}" - - if [ "$AUTO_CLEANUP_ENABLED" != "true" ]; then - log_message "CLEANUP" "Auto-cleanup is disabled. Skipping." - return 1 - fi - - local cleanup_dirs=$(find_autodelete_dirs_on_mount "$mount_point") - - if [ -z "$cleanup_dirs" ]; then - log_message "CLEANUP" "No auto-delete directories configured for mount point '$mount_point'. Skipping cleanup." - return 1 - fi - - log_message "CLEANUP" "Starting auto-cleanup for mount point '$mount_point'. Directories: $cleanup_dirs" - - local total_freed=0 - local total_files=0 - - for cleanup_dir in $cleanup_dirs; do - local result=$(clean_directory_files "$cleanup_dir" "$MAX_FILE_AGE_DAYS") - local files="${result%%:*}" - local freed="${result#*:}" - total_files=$((total_files + files)) - total_freed=$((total_freed + freed)) - done - - log_message "CLEANUP" "Cleanup complete. 
Total files deleted: $total_files, Total space freed: $((total_freed / 1024)) KB" - - if [ $total_freed -gt 0 ]; then - return 0 - else - return 1 - fi -} - -# Check disk space and perform auto-cleanup if needed -# Usage: check_and_cleanup_disk "directory" "threshold" -# Returns: 0 if OK or cleanup successful, 1 if critical and cleanup failed -check_and_cleanup_disk() { - local dir="$1" - local threshold="${2:-85}" - - local usage=$(get_disk_usage_percentage "$dir") - local mount_point=$(get_mount_point "$dir") - - if [ -z "$usage" ] || [ "$usage" -eq 0 ]; then - log_message "CLEANUP" "Could not determine disk usage for '$dir'. Skipping." - return 1 - fi - - if [ "$usage" -gt "$threshold" ]; then - log_message "CLEANUP" "Disk usage ${usage}% exceeds threshold ${threshold}% for '$dir'. Attempting cleanup..." - - if auto_cleanup "$mount_point" "$MIN_FREE_SPACE_AFTER_CLEANUP"; then - local new_usage=$(get_disk_usage_percentage "$dir") - log_message "CLEANUP" "After cleanup, disk usage is ${new_usage}%" - - if [ "$new_usage" -le "$threshold" ]; then - return 0 - else - log_message "CLEANUP" "Cleanup completed but usage ${new_usage}% still above threshold" - return 0 - fi - else - log_message "CLEANUP" "Cleanup failed or no files to clean for '$dir'" - return 1 - fi - fi - - return 0 -} diff --git a/hana_log_segments.sh b/hana_log_segments.sh index c74678f..e5604d7 100644 --- a/hana_log_segments.sh +++ b/hana_log_segments.sh @@ -12,32 +12,30 @@ source "${SCRIPT_DIR}/hana.conf" source "${SCRIPT_DIR}/hana_lib.sh" # Acquire lock -LOCK_FILE=$(acquire_lock "$SCRIPT_NAME") -if [ $? -ne 0 ]; then +if ! acquire_lock "$SCRIPT_NAME"; then exit 1 fi -trap 'release_lock "$LOCK_FILE"' EXIT +trap 'release_lock "$SCRIPT_NAME"' EXIT log_message "$SCRIPT_NAME" "Starting log segment check..." 
-# SQL Query for log segments -SQL_QUERY="SELECT b.host, b.service_name, a.state, count(*) FROM PUBLIC.M_LOG_SEGMENTS a JOIN PUBLIC.M_SERVICES b ON (a.host = b.host AND a.port = b.port) GROUP BY b.host, b.service_name, a.state;" - # Check if hdbsql is available if [ ! -x "$HDBSQL_PATH" ]; then log_message "$SCRIPT_NAME" "ERROR: hdbsql not found or not executable at ${HDBSQL_PATH}" - send_notification_if_changed "$SCRIPT_NAME" "hana_hdbsql_path" "HANA Monitor Error" "hdbsql not found or not executable at ${HDBSQL_PATH}" "true" "HDBSQL_ERROR" + send_alert "$SCRIPT_NAME" "HANA Monitor Error" "hdbsql not found or not executable at ${HDBSQL_PATH}" exit 1 fi -# Execute SQL query as HANA user with improved error handling -readarray -t sql_output < <(su - "$HANA_USER" -c "$HDBSQL_PATH -U $HANA_USER_KEY -c \";\" \"$SQL_QUERY\"" 2>&1) +# SQL Query for log segments +SQL_QUERY="SELECT b.host, b.service_name, a.state, count(*) FROM PUBLIC.M_LOG_SEGMENTS a JOIN PUBLIC.M_SERVICES b ON (a.host = b.host AND a.port = b.port) GROUP BY b.host, b.service_name, a.state;" + +# Execute SQL query +sql_output=$(execute_hana_sql "$SQL_QUERY") sql_status=$? if [ $sql_status -ne 0 ]; then - error_message=$(printf '%s\n' "${sql_output[@]}") - log_message "$SCRIPT_NAME" "ERROR: The hdbsql command failed. Details: ${error_message}" - send_notification_if_changed "$SCRIPT_NAME" "hana_hdbsql_command" "HANA Monitor Error" "The hdbsql command failed. Details: ${error_message}" "true" "HDBSQL_COMMAND_FAILED" + log_message "$SCRIPT_NAME" "ERROR: The hdbsql command failed." + send_alert "$SCRIPT_NAME" "HANA Monitor Error" "The hdbsql command failed." 
exit 1 fi @@ -46,15 +44,15 @@ total_segments=0 truncated_segments=0 free_segments=0 -for line in "${sql_output[@]}"; do +while IFS= read -r line; do # Skip empty lines and header if [[ -z "$line" || "$line" == *"STATE"* || "$line" == *"host"* ]]; then continue fi cleaned_line=$(echo "$line" | tr -d '"') - state=$(echo "$cleaned_line" | awk -F',' '{print $3}' | xargs) # Trim whitespace - count=$(echo "$cleaned_line" | awk -F',' '{print $4}' | xargs) # Trim whitespace + state=$(echo "$cleaned_line" | awk -F',' '{print $3}' | xargs) + count=$(echo "$cleaned_line" | awk -F',' '{print $4}' | xargs) # Validate count is a number if ! [[ "$count" =~ ^[0-9]+$ ]]; then @@ -67,7 +65,7 @@ for line in "${sql_output[@]}"; do elif [[ "$state" == "Free" ]]; then free_segments=$((free_segments + count)) fi -done +done <<< "$sql_output" log_message "$SCRIPT_NAME" "Total Segments: ${total_segments}" log_message "$SCRIPT_NAME" "Truncated Segments: ${truncated_segments}" @@ -75,37 +73,24 @@ log_message "$SCRIPT_NAME" "Free Segments: ${free_segments}" if [ $total_segments -eq 0 ]; then log_message "$SCRIPT_NAME" "WARNING: No log segments found. Skipping percentage checks." - send_notification_if_changed "$SCRIPT_NAME" "hana_log_segments_total" "HANA Log Segment Warning" "No log segments found. Skipping percentage checks." "true" "NO_LOG_SEGMENTS" -else - send_notification_if_changed "$SCRIPT_NAME" "hana_log_segments_total" "HANA Log Segment" "Log segments found." "false" "OK" - - # Calculate truncated percentage with integer arithmetic - if [ $total_segments -gt 0 ]; then - truncated_percentage=$((truncated_segments * 100 / total_segments)) - else - truncated_percentage=0 - fi - - if [ $truncated_percentage -gt $TRUNCATED_PERCENTAGE_THRESHOLD ]; then - log_message "$SCRIPT_NAME" "ALERT: ${truncated_percentage}% of log segments are 'Truncated'." 
- send_notification_if_changed "$SCRIPT_NAME" "hana_log_truncated" "HANA Log Segment" "${truncated_percentage}% of HANA log segments are in 'Truncated' state." "true" "${truncated_percentage}%" - else - send_notification_if_changed "$SCRIPT_NAME" "hana_log_truncated" "HANA Log Segment" "${truncated_percentage}% of HANA log segments are in 'Truncated' state (below threshold)." "false" "OK" - fi - - # Calculate free percentage with integer arithmetic - if [ $total_segments -gt 0 ]; then - free_percentage=$((free_segments * 100 / total_segments)) - else - free_percentage=0 - fi - - if [ $free_percentage -lt $FREE_PERCENTAGE_THRESHOLD ]; then - log_message "$SCRIPT_NAME" "ALERT: Only ${free_percentage}% of log segments are 'Free'." - send_notification_if_changed "$SCRIPT_NAME" "hana_log_free" "HANA Log Segment" "Only ${free_percentage}% of HANA log segments are in 'Free' state." "true" "${free_percentage}%" - else - send_notification_if_changed "$SCRIPT_NAME" "hana_log_free" "HANA Log Segment" "Only ${free_percentage}% of HANA log segments are in 'Free' state (above threshold)." "false" "OK" - fi + send_alert "$SCRIPT_NAME" "HANA Log Segment Warning" "No log segments found." + exit 1 +fi + +# Calculate truncated percentage with integer arithmetic +truncated_percentage=$((truncated_segments * 100 / total_segments)) + +if [ $truncated_percentage -gt $TRUNCATED_PERCENTAGE_THRESHOLD ]; then + log_message "$SCRIPT_NAME" "ALERT: ${truncated_percentage}% of log segments are 'Truncated'." + send_alert "$SCRIPT_NAME" "HANA Log Segment" "${truncated_percentage}% of HANA log segments are in 'Truncated' state." +fi + +# Calculate free percentage with integer arithmetic +free_percentage=$((free_segments * 100 / total_segments)) + +if [ $free_percentage -lt $FREE_PERCENTAGE_THRESHOLD ]; then + log_message "$SCRIPT_NAME" "ALERT: Only ${free_percentage}% of log segments are 'Free'." 
+ send_alert "$SCRIPT_NAME" "HANA Log Segment" "Only ${free_percentage}% of HANA log segments are in 'Free' state." fi log_message "$SCRIPT_NAME" "Log segment check complete." diff --git a/hana_processes.sh b/hana_processes.sh index 39a07de..51067f8 100644 --- a/hana_processes.sh +++ b/hana_processes.sh @@ -12,53 +12,47 @@ source "${SCRIPT_DIR}/hana.conf" source "${SCRIPT_DIR}/hana_lib.sh" # Acquire lock -LOCK_FILE=$(acquire_lock "$SCRIPT_NAME") -if [ $? -ne 0 ]; then +if ! acquire_lock "$SCRIPT_NAME"; then exit 1 fi -trap 'release_lock "$LOCK_FILE"' EXIT +trap 'release_lock "$SCRIPT_NAME"' EXIT log_message "$SCRIPT_NAME" "Starting HANA process status check..." # Check if sapcontrol is available if [ ! -x "$SAPCONTROL_PATH" ]; then log_message "$SCRIPT_NAME" "ERROR: sapcontrol not found or not executable at ${SAPCONTROL_PATH}" - send_notification_if_changed "$SCRIPT_NAME" "hana_sapcontrol_path" "HANA Monitor Error" "sapcontrol not found or not executable at ${SAPCONTROL_PATH}" "true" "SAPCONTROL_ERROR" + send_alert "$SCRIPT_NAME" "HANA Monitor Error" "sapcontrol not found or not executable at ${SAPCONTROL_PATH}" exit 1 fi -# Get process list with improved error handling +# Get process list process_list=$(su - "$HANA_USER" -c "${SAPCONTROL_PATH} -nr ${HANA_INSTANCE_NR} -function GetProcessList" 2>&1) sapcontrol_status=$? if [ $sapcontrol_status -ne 0 ]; then log_message "$SCRIPT_NAME" "ERROR: sapcontrol command failed with exit code ${sapcontrol_status}" - send_notification_if_changed "$SCRIPT_NAME" "hana_sapcontrol_command" "HANA Monitor Error" "sapcontrol command failed. Exit code: ${sapcontrol_status}" "true" "SAPCONTROL_COMMAND_FAILED" + send_alert "$SCRIPT_NAME" "HANA Monitor Error" "sapcontrol command failed. Exit code: ${sapcontrol_status}" exit 1 fi -# Clear any previous sapcontrol error state -send_notification_if_changed "$SCRIPT_NAME" "hana_sapcontrol_command" "HANA Process" "sapcontrol command successful." 
"false" "OK" - # Check for non-GREEN processes (skip header lines) non_green_processes=$(echo "$process_list" | tail -n +6 | grep -v 'GREEN' | grep -v '^$') if [ -n "$non_green_processes" ]; then log_message "$SCRIPT_NAME" "ALERT: One or more HANA processes are not running!" log_message "$SCRIPT_NAME" "Problem processes: ${non_green_processes}" - send_notification_if_changed "$SCRIPT_NAME" "hana_processes" "HANA Process" "One or more HANA processes are not GREEN. Problem processes: ${non_green_processes}" "true" "PROCESS_ALERT:${non_green_processes}" + send_alert "$SCRIPT_NAME" "HANA Process" "One or more HANA processes are not GREEN. Problem processes: ${non_green_processes}" exit 1 -else - # Verify we actually got process data - green_processes=$(echo "$process_list" | tail -n +6 | grep 'GREEN') - if [ -z "$green_processes" ]; then - log_message "$SCRIPT_NAME" "WARNING: No process data found. SAP HANA may not be running." - send_notification_if_changed "$SCRIPT_NAME" "hana_processes" "HANA Process" "No process data found. SAP HANA may not be running." "true" "NO_PROCESS_DATA" - exit 1 - fi - - send_notification_if_changed "$SCRIPT_NAME" "hana_processes" "HANA Process" "All HANA processes are GREEN." "false" "OK" - log_message "$SCRIPT_NAME" "SUCCESS: All HANA processes are GREEN." fi +# Verify we actually got process data +green_processes=$(echo "$process_list" | tail -n +6 | grep 'GREEN') +if [ -z "$green_processes" ]; then + log_message "$SCRIPT_NAME" "WARNING: No process data found. SAP HANA may not be running." + send_alert "$SCRIPT_NAME" "HANA Process" "No process data found. SAP HANA may not be running." + exit 1 +fi + +log_message "$SCRIPT_NAME" "SUCCESS: All HANA processes are GREEN." log_message "$SCRIPT_NAME" "Process check complete." 
diff --git a/hana_queue.sh b/hana_queue.sh index 6e28ef1..d4950db 100644 --- a/hana_queue.sh +++ b/hana_queue.sh @@ -12,71 +12,66 @@ source "${SCRIPT_DIR}/hana.conf" source "${SCRIPT_DIR}/hana_lib.sh" # Acquire lock -LOCK_FILE=$(acquire_lock "$SCRIPT_NAME") -if [ $? -ne 0 ]; then +if ! acquire_lock "$SCRIPT_NAME"; then exit 1 fi -trap 'release_lock "$LOCK_FILE"' EXIT +trap 'release_lock "$SCRIPT_NAME"' EXIT log_message "$SCRIPT_NAME" "Starting statement queue check..." -# SQL Query for statement queue -STATEMENT_QUEUE_SQL="SELECT COUNT(*) FROM M_SERVICE_THREADS WHERE THREAD_TYPE = 'SqlExecutor' AND THREAD_STATE = 'Queueing';" - # Check if hdbsql is available if [ ! -x "$HDBSQL_PATH" ]; then log_message "$SCRIPT_NAME" "ERROR: hdbsql not found or not executable at ${HDBSQL_PATH}" - send_notification_if_changed "$SCRIPT_NAME" "hana_hdbsql_path_queue" "HANA Monitor Error" "hdbsql not found or not executable at ${HDBSQL_PATH}" "true" "HDBSQL_ERROR" + send_alert "$SCRIPT_NAME" "HANA Monitor Error" "hdbsql not found or not executable at ${HDBSQL_PATH}" exit 1 fi -# Execute SQL query as HANA user with improved error handling -queue_result=$(su - "$HANA_USER" -c "$HDBSQL_PATH -U $HANA_USER_KEY -j -a -x \"$STATEMENT_QUEUE_SQL\"" 2>&1) +# SQL Query for statement queue +STATEMENT_QUEUE_SQL="SELECT COUNT(*) FROM M_SERVICE_THREADS WHERE THREAD_TYPE = 'SqlExecutor' AND THREAD_STATE = 'Queueing';" + +# Execute SQL query (execute_hana_sql_query already strips quotes and whitespace, so the result is the bare count) +queue_count=$(execute_hana_sql_query "$STATEMENT_QUEUE_SQL") sql_status=$? if [ $sql_status -ne 0 ]; then - log_message "$SCRIPT_NAME" "ERROR: Failed to execute queue query. Exit code: ${sql_status}" - send_notification_if_changed "$SCRIPT_NAME" "hana_queue_query_error" "HANA Queue Error" "Failed to execute queue query. Exit code: ${sql_status}" "true" "QUERY_ERROR" + log_message "$SCRIPT_NAME" "ERROR: Failed to execute queue query." + send_alert "$SCRIPT_NAME" "HANA Queue Error" "Failed to execute queue query."
exit 1 fi -# Clear any previous query error state -send_notification_if_changed "$SCRIPT_NAME" "hana_queue_query_error" "HANA Statement Queue" "Queue query successful." "false" "OK" - -# Parse queue count -queue_count=$(echo "$queue_result" | tr -d '"' | xargs) - # Validate queue count is a number if ! [[ "$queue_count" =~ ^[0-9]+$ ]]; then - log_message "$SCRIPT_NAME" "WARNING: Could not retrieve HANA statement queue count. Got: '${queue_count}'. Skipping check." - send_notification_if_changed "$SCRIPT_NAME" "hana_statement_queue_check_fail" "HANA Monitor Warning" "Could not retrieve statement queue count. Got: '${queue_count}'" "true" "QUEUE_CHECK_FAIL" + log_message "$SCRIPT_NAME" "WARNING: Could not retrieve HANA statement queue count. Got: '${queue_count}'." + send_alert "$SCRIPT_NAME" "HANA Monitor Warning" "Could not retrieve statement queue count." + exit 1 +fi + +log_message "$SCRIPT_NAME" "Current statement queue length: ${queue_count}" + +# Get breach count from state file (missing, empty, or non-numeric content counts as 0) +breach_count_file="${STATE_DIR}/statement_queue_breach_count" +breach_count=$(cat "$breach_count_file" 2>/dev/null) +if ! [[ "$breach_count" =~ ^[0-9]+$ ]]; then + breach_count=0 +fi + +if [ "$queue_count" -gt "$STATEMENT_QUEUE_THRESHOLD" ]; then + breach_count=$((breach_count + 1)) + log_message "$SCRIPT_NAME" "Statement queue is above threshold (${queue_count} > ${STATEMENT_QUEUE_THRESHOLD}). Consecutive breach count: ${breach_count}/${STATEMENT_QUEUE_CONSECUTIVE_RUNS}." else - # Clear any previous check failure state - send_notification_if_changed "$SCRIPT_NAME" "hana_statement_queue_check_fail" "HANA Monitor Warning" "Statement queue check is working."
"false" "OK" - log_message "$SCRIPT_NAME" "Current statement queue length: ${queue_count}" - - # Get breach count from state - breach_count=$(get_state "statement_queue_breach_count") - breach_count=${breach_count:-0} - - if [ "$queue_count" -gt "$STATEMENT_QUEUE_THRESHOLD" ]; then - breach_count=$((breach_count + 1)) - log_message "$SCRIPT_NAME" "Statement queue is above threshold (${queue_count} > ${STATEMENT_QUEUE_THRESHOLD}). Consecutive breach count: ${breach_count}/${STATEMENT_QUEUE_CONSECUTIVE_RUNS}." - else - if [ "$breach_count" -gt 0 ]; then - log_message "$SCRIPT_NAME" "Statement queue returned to normal. Resetting breach count from ${breach_count} to 0." - fi - breach_count=0 + if [ "$breach_count" -gt 0 ]; then + log_message "$SCRIPT_NAME" "Statement queue returned to normal. Resetting breach count from ${breach_count} to 0." fi - set_state "statement_queue_breach_count" "$breach_count" + breach_count=0 +fi +echo "$breach_count" > "$breach_count_file" - if [ "$breach_count" -ge "$STATEMENT_QUEUE_CONSECUTIVE_RUNS" ]; then - message="Statement queue has been over ${STATEMENT_QUEUE_THRESHOLD} for ${breach_count} checks. Current count: ${queue_count}." - send_notification_if_changed "$SCRIPT_NAME" "hana_statement_queue_status" "HANA Statement Queue" "$message" "true" "ALERT:${queue_count}" - else - message="Statement queue is normal. Current count: ${queue_count}." - send_notification_if_changed "$SCRIPT_NAME" "hana_statement_queue_status" "HANA Statement Queue" "$message" "false" "OK" - fi +if [ "$breach_count" -ge "$STATEMENT_QUEUE_CONSECUTIVE_RUNS" ]; then + message="Statement queue has been over ${STATEMENT_QUEUE_THRESHOLD} for ${breach_count} checks. Current count: ${queue_count}." + send_alert "$SCRIPT_NAME" "HANA Statement Queue" "$message" + exit 1 +else + log_message "$SCRIPT_NAME" "Statement queue is normal. Current count: ${queue_count}." fi log_message "$SCRIPT_NAME" "Statement queue check complete." 
diff --git a/sld_watchdog.sh b/sld_watchdog.sh index b4da1a8..0d6943f 100644 --- a/sld_watchdog.sh +++ b/sld_watchdog.sh @@ -1,10 +1,8 @@ #!/bin/bash -# +# ============================================================================= # sld_watchdog.sh - Monitors SLD service health and restarts if needed -# Optimized for better error handling and reliability -# +# ============================================================================= -# Get script directory and name SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" SCRIPT_NAME="sld_watchdog" @@ -15,14 +13,12 @@ source "$SCRIPT_DIR/hana_lib.sh" # SLD-specific configuration SLD_URL="${SLD_URL:-https://localhost:40000/sld/sld0100.svc}" SLD_TIMEOUT="${SLD_TIMEOUT:-5}" -SLD_LOCK_FILE="/tmp/hana_sld_watchdog.lock" -# Acquire lock using library function -LOCK_FILE=$(acquire_lock "$SCRIPT_NAME") -if [ $? -ne 0 ]; then +# Acquire lock +if ! acquire_lock "$SCRIPT_NAME"; then exit 0 fi -trap 'release_lock "$LOCK_FILE"' EXIT +trap 'release_lock "$SCRIPT_NAME"' EXIT # Function to check SLD health # Returns HTTP status code or "0" for connection errors @@ -71,7 +67,6 @@ main() { # 200 OK or 401 Unauthorized indicate the Tomcat layer is actively processing if [[ $http_status == 200 || $http_status == 401 ]]; then log_message "$SCRIPT_NAME" "[OK] SLD service healthy (HTTP Status: $http_status)" - send_notification_if_changed "$SCRIPT_NAME" "sld_status" "SLD Service" "SLD service is healthy (HTTP: $http_status)" "false" "OK" return 0 fi @@ -86,14 +81,12 @@ main() { log_message "$SCRIPT_NAME" "[ALERT] SLD dead or hung (${status_detail}). Restarting service..." # Send notification - send_notification_if_changed "$SCRIPT_NAME" "sld_down" "SLD Service Critical" \ - "SLD service is down (${status_detail}). Restarting ${SLD_URL}" "true" "SLD_DOWN" + send_alert "$SCRIPT_NAME" "SLD Service Critical" "SLD service is down (${status_detail}). Restarting ${SLD_URL}" # Restart the service if ! 
restart_sld_service; then log_message "$SCRIPT_NAME" "[CRITICAL] Failed to restart SLD service" - send_notification_if_changed "$SCRIPT_NAME" "sld_restart_failed" "SLD Service Critical" \ - "Failed to restart SLD service" "true" "RESTART_FAILED" + send_alert "$SCRIPT_NAME" "SLD Service Critical" "Failed to restart SLD service" return 1 fi @@ -106,10 +99,6 @@ main() { if [[ $recovery_status == 200 || $recovery_status == 401 ]]; then log_message "$SCRIPT_NAME" "[RECOVERY] SLD service recovered successfully (HTTP Status: $recovery_status)" - send_notification_if_changed "$SCRIPT_NAME" "sld_down" "SLD Service" \ - "SLD service recovered (HTTP: $recovery_status)" "false" "OK" - send_notification_if_changed "$SCRIPT_NAME" "sld_restart_failed" "SLD Service" \ - "Service recovered successfully" "false" "OK" else local recovery_detail if [ "$recovery_status" == "0" ]; then @@ -118,8 +107,7 @@ main() { recovery_detail="HTTP Status: $recovery_status" fi log_message "$SCRIPT_NAME" "[CRITICAL] SLD service failed to recover after restart (${recovery_detail})" - send_notification_if_changed "$SCRIPT_NAME" "sld_down" "SLD Service Critical" \ - "SLD service FAILED to recover after restart (${recovery_detail})" "true" "RECOVERY_FAILED" + send_alert "$SCRIPT_NAME" "SLD Service Critical" "SLD service FAILED to recover after restart (${recovery_detail})" return 1 fi