#!/bin/bash
# =============================================================================
# SAP HANA Common Configuration
#
# Sourced by the monitoring scripts. It defines settings only; the single side
# effect is the creation of STATE_DIR at the bottom of the file.
# =============================================================================

# --- HANA Instance Configuration ---
# HANA SID (e.g., "NDB") - used to derive the HANA Linux user (<sid>adm)
HANA_SID="NDB"

# Derived HANA Linux user: lower-cased SID followed by "adm" (here "ndbadm")
HANA_USER="${HANA_SID,,}adm"

# HANA Instance Number (e.g., "00")
HANA_INSTANCE_NR="00"

# HANA User Key for hdbsql (hdbuserstore key)
HANA_USER_KEY="CRONKEY"

# --- Paths ---
SAPCONTROL_PATH="/usr/sap/hostctrl/exe/sapcontrol"
# Derived from HANA_SID. The SID segment used to be hard-coded as "HDB", which
# contradicted HANA_SID="NDB" above.
# NOTE(review): confirm this matches the installation layout on the target host.
HDBSQL_PATH="/usr/sap/${HANA_SID}/HDB${HANA_INSTANCE_NR}/exe/hdbsql"

# --- Monitoring Directories ---
DIRECTORIES_TO_MONITOR=(
    "/hana/shared"
    "/hana/log"
    "/hana/data"
    "/usr/sap"
)

# --- Log Directories for Auto-Cleanup ---
# These directories will be automatically cleaned when disk space is low
# Format: "mount_point:log_directory_path"
# The script will check if a monitored directory is on the same mount point
# as a log directory and can clean the log directory to free up space
#
# NOTE(review): the cleanup deletes every regular file older than
# MAX_LOG_FILE_AGE_DAYS below these paths, whatever its name. /hana/log
# presumably holds database redo log segments rather than text logs - confirm
# that this entry is really intended before relying on auto-cleanup.
LOG_DIRS_FOR_CLEANUP=(
    "/hana/log:/hana/log"
    "/usr/sap:/usr/sap/trans/log"
    "/usr/sap:/usr/sap/hostctrl/work/log"
)

# --- Disk Cleanup Configuration ---
# Minimum free space percentage to maintain after cleanup
MIN_FREE_SPACE_AFTER_CLEANUP=5
# Maximum age of log files to delete (in days)
MAX_LOG_FILE_AGE_DAYS=7
# Enable automatic cleanup when disk usage exceeds threshold
AUTO_CLEANUP_ENABLED=true

# --- Thresholds ---
DISK_USAGE_THRESHOLD=85
TRUNCATED_PERCENTAGE_THRESHOLD=50
FREE_PERCENTAGE_THRESHOLD=10
STATEMENT_QUEUE_THRESHOLD=10
STATEMENT_QUEUE_CONSECUTIVE_RUNS=3
BACKUP_THRESHOLD_HOURS=32

# --- Notification Configuration ---
# Notifications are skipped while either NTFY value is empty.
NTFY_TOKEN=""
NTFY_TOPIC_URL=""
COMPANY_NAME="My Company"

# --- Logging Configuration ---
# The log file lives next to this configuration file.
LOG_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"
LOG_FILE="${LOG_DIR}/hana_monitor.log"

# --- State Directory ---
STATE_DIR="${LOG_DIR}/monitor_state"
# Make a failure visible: without this directory no alert state can be stored.
mkdir -p "${STATE_DIR}" \
    || echo "ERROR: cannot create state directory '${STATE_DIR}'" >&2

# --- Lock Directory ---
LOCK_DIR="/tmp"
#!/bin/bash
# =============================================================================
# SAP HANA Backup Status Monitoring Script
# Checks last successful backup age
#
# Alerts through send_notification_if_changed when the newest successful
# complete data backup is older than BACKUP_THRESHOLD_HOURS, or when the
# backup catalog cannot be queried or its timestamp cannot be parsed.
#
# Exit status: 1 when the lock is taken, hdbsql is missing, the query fails,
# no backup is found or the timestamp is unparsable; 0 otherwise (including
# the "backup too old" case, which is reported by notification only).
# =============================================================================

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"
SCRIPT_NAME="hana_backup"

# Load configuration
source "${SCRIPT_DIR}/hana.conf"
source "${SCRIPT_DIR}/hana_lib.sh"

# Acquire lock. The EXIT trap is installed only after success, so an instance
# that failed to get the lock never removes the lock of the running one.
if ! LOCK_FILE=$(acquire_lock "$SCRIPT_NAME"); then
    exit 1
fi
trap 'release_lock "$LOCK_FILE"' EXIT

log_message "$SCRIPT_NAME" "Starting backup status check..."

# Check if hdbsql is available
if [ ! -x "$HDBSQL_PATH" ]; then
    log_message "$SCRIPT_NAME" "ERROR: hdbsql not found or not executable at ${HDBSQL_PATH}"
    send_notification_if_changed "$SCRIPT_NAME" "hana_hdbsql_path_backup" "HANA Monitor Error" "hdbsql not found or not executable at ${HDBSQL_PATH}" "true" "HDBSQL_ERROR"
    exit 1
fi

# SQL Query for last successful backup
BACKUP_SQL="SELECT TOP 1 SYS_START_TIME FROM M_BACKUP_CATALOG WHERE ENTRY_TYPE_NAME = 'complete data backup' AND STATE_NAME = 'successful' ORDER BY SYS_START_TIME DESC"

# Execute SQL query as HANA user; stderr is merged so that error text ends up
# in backup_result as well.
backup_result=$(su - "$HANA_USER" -c "$HDBSQL_PATH -U $HANA_USER_KEY -j -a -x \"$BACKUP_SQL\"" 2>&1)
sql_status=$?

if [ "$sql_status" -ne 0 ]; then
    log_message "$SCRIPT_NAME" "ERROR: Failed to execute backup query. Exit code: ${sql_status}"
    send_notification_if_changed "$SCRIPT_NAME" "hana_backup_query_error" "HANA Backup Error" "Failed to execute backup query. Exit code: ${sql_status}" "true" "QUERY_ERROR"
    exit 1
fi

# The query itself worked, so clear any previous query error state right away.
# Doing it here (not after the result checks) keeps an early "no backup" exit
# from leaving a stale QUERY_ERROR state behind.
send_notification_if_changed "$SCRIPT_NAME" "hana_backup_query_error" "HANA Backup" "Backup query successful." "false" "OK"

# Strip the quotes and the fractional seconds from the timestamp.
last_backup_date=$(echo "$backup_result" | tr -d '"' | sed 's/\..*//')

if [[ -z "$last_backup_date" || "$last_backup_date" == *"error"* || "$last_backup_date" == *"Error"* ]]; then
    message="No successful complete data backup found for ${COMPANY_NAME} HANA."
    log_message "$SCRIPT_NAME" "CRITICAL: ${message}"
    send_notification_if_changed "$SCRIPT_NAME" "hana_backup_status" "HANA Backup" "$message" "true" "NO_BACKUP"
    exit 1
fi

# Calculate backup age
if ! last_backup_epoch=$(date -d "$last_backup_date" +%s 2>/dev/null); then
    log_message "$SCRIPT_NAME" "ERROR: Failed to parse backup date: ${last_backup_date}"
    send_notification_if_changed "$SCRIPT_NAME" "hana_backup_status" "HANA Backup" "Failed to parse backup date: ${last_backup_date}" "true" "DATE_PARSE_ERROR"
    exit 1
fi

current_epoch=$(date +%s)
threshold_seconds=$((BACKUP_THRESHOLD_HOURS * 3600))
age_seconds=$((current_epoch - last_backup_epoch))
age_hours=$((age_seconds / 3600))

if [ "$age_seconds" -lt 0 ]; then
    log_message "$SCRIPT_NAME" "WARNING: Backup timestamp is in the future. Possible clock skew."
    age_hours=0
fi

if [ "$age_seconds" -gt "$threshold_seconds" ]; then
    message="Last successful HANA backup for ${COMPANY_NAME} is ${age_hours} hours old, which exceeds the threshold of ${BACKUP_THRESHOLD_HOURS} hours. Last backup was on: ${last_backup_date}."
    log_message "$SCRIPT_NAME" "CRITICAL: ${message}"
    # The state value carries the age in hours, so a new alert goes out each
    # time the hour count changes while the backup stays overdue.
    send_notification_if_changed "$SCRIPT_NAME" "hana_backup_status" "HANA Backup" "$message" "true" "${age_hours}h"
else
    message="Last successful backup is ${age_hours} hours old (Threshold: ${BACKUP_THRESHOLD_HOURS} hours)."
    log_message "$SCRIPT_NAME" "SUCCESS: ${message}"
    send_notification_if_changed "$SCRIPT_NAME" "hana_backup_status" "HANA Backup" "$message" "false" "OK"
fi

log_message "$SCRIPT_NAME" "Backup check complete."
#!/bin/bash
# =============================================================================
# SAP HANA Disk Space Monitoring Script
# Checks disk usage for configured directories with auto-cleanup capability
#
# For every entry of DIRECTORIES_TO_MONITOR the usage percentage is compared
# with DISK_USAGE_THRESHOLD. Above the threshold an optional cleanup is
# attempted and an alert is sent with the usage measured afterwards.
#
# Exit status: 1 when the lock is taken or at least one alert was raised,
# 0 otherwise.
# =============================================================================

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"
SCRIPT_NAME="hana_disk"

# Load configuration
source "${SCRIPT_DIR}/hana.conf"
source "${SCRIPT_DIR}/hana_lib.sh"

# Acquire lock; the trap is installed only once the lock is really ours.
if ! LOCK_FILE=$(acquire_lock "$SCRIPT_NAME"); then
    exit 1
fi
trap 'release_lock "$LOCK_FILE"' EXIT

log_message "$SCRIPT_NAME" "Starting disk usage check..."

# Track overall status
ALERT_COUNT=0
TOTAL_DIRS=0
CLEANUP_PERFORMED=0

for dir in "${DIRECTORIES_TO_MONITOR[@]}"; do
    TOTAL_DIRS=$((TOTAL_DIRS + 1))
    # State keys must be usable as file names: replace every "/" with "_".
    dir_key="${dir//\//_}"

    # Check if directory exists
    if [ ! -d "$dir" ]; then
        log_message "$SCRIPT_NAME" "WARNING: Directory '$dir' not found. Skipping."
        send_notification_if_changed "$SCRIPT_NAME" "disk_dir_not_found_${dir_key}" "HANA Disk Warning" "Directory '$dir' not found." "true" "DIR_NOT_FOUND"
        ALERT_COUNT=$((ALERT_COUNT + 1))
        continue
    fi

    # The directory is present: resolve a "not found" alert from an earlier
    # run (previously that state was never cleared).
    send_notification_if_changed "$SCRIPT_NAME" "disk_dir_not_found_${dir_key}" "HANA Disk" "Directory '$dir' is available." "false" "OK"

    # Get disk usage percentage
    usage=$(get_disk_usage_percentage "$dir")

    # Anything that is not a plain number (empty output, "-") cannot be
    # compared below; 0 is what the helper reports for "unknown".
    if [[ ! "$usage" =~ ^[0-9]+$ ]] || [ "$usage" -eq 0 ]; then
        log_message "$SCRIPT_NAME" "WARNING: Could not determine disk usage for '$dir'. Skipping."
        continue
    fi

    log_message "$SCRIPT_NAME" "Directory ${dir} is at ${usage}%"

    # Check if usage exceeds threshold
    if [ "$usage" -gt "$DISK_USAGE_THRESHOLD" ]; then
        log_message "$SCRIPT_NAME" "ALERT: ${dir} usage is at ${usage}% which is above the ${DISK_USAGE_THRESHOLD}% threshold."

        # Attempt auto-cleanup if enabled
        if [ "$AUTO_CLEANUP_ENABLED" == "true" ]; then
            log_message "$SCRIPT_NAME" "Attempting auto-cleanup for '${dir}'..."
            mount_point=$(get_mount_point "$dir")

            if auto_cleanup "$mount_point" "$MIN_FREE_SPACE_AFTER_CLEANUP"; then
                CLEANUP_PERFORMED=$((CLEANUP_PERFORMED + 1))
                new_usage=$(get_disk_usage_percentage "$dir")
                log_message "$SCRIPT_NAME" "After cleanup, ${dir} usage is at ${new_usage}%"
                # Keep the earlier figure if the second measurement failed.
                if [[ "$new_usage" =~ ^[0-9]+$ ]]; then
                    usage=$new_usage
                fi
            else
                log_message "$SCRIPT_NAME" "Auto-cleanup failed or no files to clean for '${dir}'"
            fi
        fi

        # Send notification with final usage after cleanup attempt
        send_notification_if_changed "$SCRIPT_NAME" "disk_usage_${dir_key}" "HANA Disk" "Disk usage for ${dir} is at ${usage}%." "true" "${usage}%"
        ALERT_COUNT=$((ALERT_COUNT + 1))
    else
        # Send OK notification only if state changed from alert
        send_notification_if_changed "$SCRIPT_NAME" "disk_usage_${dir_key}" "HANA Disk" "Disk usage for ${dir} is at ${usage}% (below threshold)." "false" "OK"
    fi
done

# Summary logging
log_message "$SCRIPT_NAME" "Disk check complete. Total: ${TOTAL_DIRS} dirs, ${ALERT_COUNT} alerts, ${CLEANUP_PERFORMED} cleanups performed."

# Exit with status based on alerts
if [ "$ALERT_COUNT" -gt 0 ]; then
    exit 1
fi
exit 0
#!/bin/bash
# =============================================================================
# SAP HANA Monitoring Library - Shared Functions
#
# Meant to be sourced after hana.conf. The functions read these globals:
# LOG_FILE, LOCK_DIR, STATE_DIR, COMPANY_NAME, NTFY_TOKEN, NTFY_TOPIC_URL,
# HANA_USER, LOG_DIRS_FOR_CLEANUP, MAX_LOG_FILE_AGE_DAYS,
# MIN_FREE_SPACE_AFTER_CLEANUP and AUTO_CLEANUP_ENABLED.
# =============================================================================

# Logging function with script name prefix
# Usage: log_message "SCRIPT_NAME" "message"
# Outputs: the formatted line on stdout and appended to LOG_FILE.
# Note: functions whose stdout is a return value must redirect this output
# (log_message ... >&2), otherwise the log line pollutes their result.
log_message() {
    local script_name="$1"
    local message="$2"
    local timestamp
    timestamp=$(date "+%Y-%m-%d %H:%M:%S")
    printf '[%s] [%s] %s\n' "$timestamp" "$script_name" "$message" | tee -a "${LOG_FILE}"
}

# Acquire lock for script execution
# Usage: LOCK_FILE=$(acquire_lock "SCRIPT_NAME")
# Outputs: the lock file path on stdout (success only); diagnostics on stderr.
# Returns: 0 on success, 1 on failure (already running)
#
# The lock file stores the PID of the owning script. A lock whose PID no
# longer exists is treated as stale and taken over; a lock without a readable
# PID (e.g. an empty file) is treated as held.
acquire_lock() {
    local script_name="$1"
    local lock_file="${LOCK_DIR}/hana_${script_name}.lock"
    local owner_pid=""

    # With noclobber the redirection fails if the file already exists, which
    # makes "check and create" one atomic step instead of a test-then-touch.
    if ( set -o noclobber; echo "$$" > "$lock_file" ) 2>/dev/null; then
        echo "$lock_file"
        return 0
    fi

    if [ -f "$lock_file" ]; then
        read -r owner_pid 2>/dev/null < "$lock_file" || true
        # kill -0 also fails for a live process of another user, hence the
        # additional /proc check before declaring the owner dead.
        if [[ "$owner_pid" =~ ^[0-9]+$ ]] \
            && ! kill -0 "$owner_pid" 2>/dev/null \
            && [ ! -e "/proc/${owner_pid}" ]; then
            log_message "$script_name" "Removing stale lock left by PID ${owner_pid}." >&2
            rm -f -- "$lock_file"
            if ( set -o noclobber; echo "$$" > "$lock_file" ) 2>/dev/null; then
                echo "$lock_file"
                return 0
            fi
        fi
    fi

    # stderr, because stdout is captured by the caller as the lock path.
    log_message "$script_name" "Script is already running. Exiting." >&2
    return 1
}

# Release lock
# Usage: release_lock "LOCK_FILE"
release_lock() {
    local lock_file="$1"
    if [ -n "$lock_file" ] && [ -f "$lock_file" ]; then
        rm -f -- "$lock_file"
    fi
}

# Get state value
# Usage: get_state "KEY"
# Outputs: the stored value, or an empty line when no state exists yet.
get_state() {
    local key="$1"
    if [ -f "${STATE_DIR}/${key}.state" ]; then
        cat "${STATE_DIR}/${key}.state"
    else
        echo ""
    fi
}

# Set state value
# Usage: set_state "KEY" "VALUE"
set_state() {
    local key="$1"
    local value="$2"
    printf '%s\n' "$value" > "${STATE_DIR}/${key}.state"
}

# Send notification if state changed
# Usage: send_notification_if_changed "SCRIPT_NAME" "ALERT_KEY" "TITLE_PREFIX" "MESSAGE" "IS_ALERT" "CURRENT_VALUE"
#
# Compares CURRENT_VALUE with the value stored under ALERT_KEY:
#   - unchanged                         -> nothing happens
#   - changed and IS_ALERT is "true"    -> "Alert" notification
#   - changed, not an alert, and the previous value was neither empty nor
#     "OK"                              -> "Resolved" notification
#   - otherwise                         -> the state is stored silently
# Returns: 0 normally; 1 when the notification could not be delivered. In that
# case the stored state is left untouched so that the next run tries again.
send_notification_if_changed() {
    local script_name="$1"
    local alert_key="$2"
    local title_prefix="$3"
    local current_message="$4"
    local is_alert_condition="$5"
    local current_value="$6"
    local host previous_value full_title full_message final_message

    previous_value=$(get_state "$alert_key")

    if [ "$current_value" == "$previous_value" ]; then
        return 0
    fi

    if [ "$is_alert_condition" == "true" ]; then
        full_title="${title_prefix} Alert"
        full_message="🚨 Critical: ${current_message}"
        log_message "$script_name" "ALERT: ${full_message}"
    elif [ -n "$previous_value" ] && [ "$previous_value" != "OK" ]; then
        full_title="${title_prefix} Resolved"
        full_message="✅ Resolved: ${current_message}"
        log_message "$script_name" "RESOLVED: ${full_message}"
    else
        # First run or OK -> OK: remember the value, nobody needs to be told.
        set_state "$alert_key" "$current_value"
        return 0
    fi

    host=$(hostname)
    final_message="[${COMPANY_NAME} | ${host}] ${full_message}"

    if [ -n "$NTFY_TOKEN" ] && [ -n "$NTFY_TOPIC_URL" ]; then
        # -f turns HTTP errors into a non-zero exit; the timeouts keep a hung
        # server from blocking the script while it holds its lock.
        if curl -fsS --connect-timeout 10 --max-time 30 \
            -H "Authorization: Bearer ${NTFY_TOKEN}" \
            -H "Title: ${full_title}" \
            -d "${final_message}" "${NTFY_TOPIC_URL}" > /dev/null 2>&1; then
            log_message "$script_name" "Notification sent: ${full_title}"
        else
            log_message "$script_name" "ERROR: Failed to deliver notification: ${full_title}"
            return 1
        fi
    else
        log_message "$script_name" "Ntfy not configured, skipping notification"
    fi

    set_state "$alert_key" "$current_value"
}

# Run command as HANA user using su
# Usage: run_as_hana_user "COMMAND"
run_as_hana_user() {
    local command="$1"
    su - "$HANA_USER" -c "$command"
}

# Get disk usage percentage for a directory
# Usage: get_disk_usage_percentage "/path/to/dir"
# Returns: Usage percentage as integer (without % sign); "0" when the
# directory does not exist; empty output when df fails.
get_disk_usage_percentage() {
    local dir="$1"
    if [ ! -d "$dir" ]; then
        echo "0"
        return
    fi
    # -P guarantees one record per file system even for long device names,
    # which the NR==2 selection relies on.
    df -P "$dir" 2>/dev/null | awk 'NR==2 {gsub(/%/,"",$5); print $5}'
}

# Get mount point for a directory
# Usage: get_mount_point "/path/to/dir"
# Returns: Mount point path
get_mount_point() {
    local dir="$1"
    # The mount point is everything from column 6 on, so that a mount point
    # containing spaces is returned in full.
    df -P "$dir" 2>/dev/null \
        | awk 'NR==2 { mp = $6; for (i = 7; i <= NF; i++) mp = mp " " $i; print mp }'
}

# Get available disk space in KB for a directory
# Usage: get_available_space_kb "/path/to/dir"
# Returns: Available space in KB
get_available_space_kb() {
    local dir="$1"
    df -Pk "$dir" 2>/dev/null | awk 'NR==2 {print $4}'
}

# Find log directories on the same mount point
# Usage: find_log_dirs_on_mount "mount_point"
# Returns: Space-separated list of the existing log directories whose
# LOG_DIRS_FOR_CLEANUP entry names exactly this mount point.
find_log_dirs_on_mount() {
    local mount_point="$1"
    local result=""
    local log_entry entry_mount log_dir

    for log_entry in "${LOG_DIRS_FOR_CLEANUP[@]}"; do
        entry_mount="${log_entry%%:*}"
        log_dir="${log_entry#*:}"

        if [ "$entry_mount" == "$mount_point" ] && [ -d "$log_dir" ]; then
            result="${result:+$result }$log_dir"
        fi
    done

    echo "$result"
}

# Clean old log files in a directory
# Usage: clean_log_files "/path/to/log/dir" "max_age_days"
# Outputs: exactly one stdout line "<files_deleted>:<bytes_freed>"; log
# messages go to stderr (and LOG_FILE) so they cannot corrupt that result.
clean_log_files() {
    local log_dir="$1"
    local max_age_days="${2:-7}"
    local files_deleted=0
    local space_freed=0
    local file file_size

    if [ ! -d "$log_dir" ]; then
        log_message "CLEANUP" "Log directory '$log_dir' not found. Skipping." >&2
        echo "0:0"
        return
    fi

    # Find and delete old files; NUL-delimited to survive unusual file names.
    while IFS= read -r -d '' file; do
        if [ -f "$file" ]; then
            file_size=$(stat -c%s "$file" 2>/dev/null) || file_size=0
            if rm -f -- "$file" 2>/dev/null; then
                files_deleted=$((files_deleted + 1))
                space_freed=$((space_freed + file_size))
            fi
        fi
    done < <(find "$log_dir" -type f -mtime +"$max_age_days" -print0 2>/dev/null)

    # Also clean empty sub-directories. -mindepth 1 protects the log
    # directory itself from being removed once it has been emptied.
    find "$log_dir" -mindepth 1 -type d -empty -delete 2>/dev/null

    log_message "CLEANUP" "Deleted $files_deleted files from '$log_dir', freed $((space_freed / 1024)) KB" >&2
    echo "${files_deleted}:${space_freed}"
}

# Automatic disk cleanup function
# Usage: auto_cleanup "mount_point" "target_free_percentage"
# Returns: 0 if at least one byte was freed, 1 if disabled, nothing is
# configured for the mount point, or nothing could be freed.
auto_cleanup() {
    local mount_point="$1"
    # Accepted for interface compatibility only: the cleanup is purely
    # age-based (MAX_LOG_FILE_AGE_DAYS) and does not consult this value.
    local target_free_percentage="${2:-5}"
    local log_dirs log_dir result files freed
    local -a dir_list=()
    local total_freed=0
    local total_files=0

    if [ "$AUTO_CLEANUP_ENABLED" != "true" ]; then
        log_message "CLEANUP" "Auto-cleanup is disabled. Skipping."
        return 1
    fi

    log_dirs=$(find_log_dirs_on_mount "$mount_point")

    if [ -z "$log_dirs" ]; then
        log_message "CLEANUP" "No log directories configured for mount point '$mount_point'. Skipping cleanup."
        return 1
    fi

    log_message "CLEANUP" "Starting auto-cleanup for mount point '$mount_point'. Log dirs: $log_dirs"

    # Split the space-separated list without exposing it to glob expansion.
    read -r -a dir_list <<< "$log_dirs"

    for log_dir in "${dir_list[@]}"; do
        # Only the last stdout line is the "files:bytes" result. Parsing the
        # whole output used to pick up the log line's timestamp instead.
        result=$(clean_log_files "$log_dir" "$MAX_LOG_FILE_AGE_DAYS" | tail -n 1)
        files="${result%%:*}"
        freed="${result##*:}"
        [[ "$files" =~ ^[0-9]+$ ]] || files=0
        [[ "$freed" =~ ^[0-9]+$ ]] || freed=0
        total_files=$((total_files + files))
        total_freed=$((total_freed + freed))
    done

    log_message "CLEANUP" "Cleanup complete. Total files deleted: $total_files, Total space freed: $((total_freed / 1024)) KB"

    if [ "$total_freed" -gt 0 ]; then
        return 0
    fi
    return 1
}

# Check disk space and perform auto-cleanup if needed
# Usage: check_and_cleanup_disk "directory" "threshold"
# Returns: 0 if usage is within the threshold or a cleanup freed space (even
# when usage is still above the threshold afterwards); 1 if usage cannot be
# determined or the cleanup freed nothing.
check_and_cleanup_disk() {
    local dir="$1"
    local threshold="${2:-85}"
    local usage mount_point new_usage

    usage=$(get_disk_usage_percentage "$dir")
    mount_point=$(get_mount_point "$dir")

    if [[ ! "$usage" =~ ^[0-9]+$ ]] || [ "$usage" -eq 0 ]; then
        log_message "CLEANUP" "Could not determine disk usage for '$dir'. Skipping."
        return 1
    fi

    if [ "$usage" -le "$threshold" ]; then
        return 0
    fi

    log_message "CLEANUP" "Disk usage ${usage}% exceeds threshold ${threshold}% for '$dir'. Attempting cleanup..."

    if ! auto_cleanup "$mount_point" "$MIN_FREE_SPACE_AFTER_CLEANUP"; then
        log_message "CLEANUP" "Cleanup failed or no files to clean for '$dir'"
        return 1
    fi

    new_usage=$(get_disk_usage_percentage "$dir")
    log_message "CLEANUP" "After cleanup, disk usage is ${new_usage}%"

    if [[ "$new_usage" =~ ^[0-9]+$ ]] && [ "$new_usage" -gt "$threshold" ]; then
        log_message "CLEANUP" "Cleanup completed but usage ${new_usage}% still above threshold"
    fi
    return 0
}
#!/bin/bash
# =============================================================================
# SAP HANA Log Segment Monitoring Script
# Checks log segment states (Truncated, Free)
#
# Counts the log segments per state and alerts when the share of 'Truncated'
# segments exceeds TRUNCATED_PERCENTAGE_THRESHOLD or the share of 'Free'
# segments drops below FREE_PERCENTAGE_THRESHOLD.
#
# Exit status: 1 when the lock is taken, hdbsql is missing or the query
# fails; 0 otherwise.
# =============================================================================

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"
SCRIPT_NAME="hana_log_segments"

# Load configuration
source "${SCRIPT_DIR}/hana.conf"
source "${SCRIPT_DIR}/hana_lib.sh"

# Acquire lock; the trap is installed only once the lock is really ours.
if ! LOCK_FILE=$(acquire_lock "$SCRIPT_NAME"); then
    exit 1
fi
trap 'release_lock "$LOCK_FILE"' EXIT

log_message "$SCRIPT_NAME" "Starting log segment check..."

# SQL Query for log segments
SQL_QUERY="SELECT b.host, b.service_name, a.state, count(*) FROM PUBLIC.M_LOG_SEGMENTS a JOIN PUBLIC.M_SERVICES b ON (a.host = b.host AND a.port = b.port) GROUP BY b.host, b.service_name, a.state;"

# Check if hdbsql is available
if [ ! -x "$HDBSQL_PATH" ]; then
    log_message "$SCRIPT_NAME" "ERROR: hdbsql not found or not executable at ${HDBSQL_PATH}"
    send_notification_if_changed "$SCRIPT_NAME" "hana_hdbsql_path" "HANA Monitor Error" "hdbsql not found or not executable at ${HDBSQL_PATH}" "true" "HDBSQL_ERROR"
    exit 1
fi

# Execute SQL query as HANA user. The output is captured with $(...) because
# the exit status of a command inside <(...) is not available afterwards:
# "readarray < <(cmd); status=$?" only ever reports readarray's own status.
sql_raw=$(su - "$HANA_USER" -c "$HDBSQL_PATH -U $HANA_USER_KEY -c \";\" \"$SQL_QUERY\"" 2>&1)
sql_status=$?

if [ "$sql_status" -ne 0 ]; then
    log_message "$SCRIPT_NAME" "ERROR: The hdbsql command failed. Details: ${sql_raw}"
    send_notification_if_changed "$SCRIPT_NAME" "hana_hdbsql_command" "HANA Monitor Error" "The hdbsql command failed. Details: ${sql_raw}" "true" "HDBSQL_COMMAND_FAILED"
    exit 1
fi

readarray -t sql_output <<< "$sql_raw"

# Parse SQL output
total_segments=0
truncated_segments=0
free_segments=0

for line in "${sql_output[@]}"; do
    [[ -n "$line" ]] || continue

    # Rows are expected as: host,service_name,state,count (values may be
    # quoted). Plain "read" with the default IFS trims surrounding blanks.
    cleaned_line=${line//\"/}
    IFS=',' read -r host_field service_field state count rest <<< "$cleaned_line"
    read -r state <<< "$state"
    read -r count <<< "$count"

    # Only rows with a numeric count are data. This skips the header and any
    # summary line. The header is no longer matched by the word "host",
    # which also discarded data rows whose host name contains "host".
    if ! [[ "$count" =~ ^[0-9]+$ ]]; then
        continue
    fi

    total_segments=$((total_segments + count))
    if [[ "$state" == "Truncated" ]]; then
        truncated_segments=$((truncated_segments + count))
    elif [[ "$state" == "Free" ]]; then
        free_segments=$((free_segments + count))
    fi
done

log_message "$SCRIPT_NAME" "Total Segments: ${total_segments}"
log_message "$SCRIPT_NAME" "Truncated Segments: ${truncated_segments}"
log_message "$SCRIPT_NAME" "Free Segments: ${free_segments}"

if [ "$total_segments" -eq 0 ]; then
    log_message "$SCRIPT_NAME" "WARNING: No log segments found. Skipping percentage checks."
    send_notification_if_changed "$SCRIPT_NAME" "hana_log_segments_total" "HANA Log Segment Warning" "No log segments found. Skipping percentage checks." "true" "NO_LOG_SEGMENTS"
else
    send_notification_if_changed "$SCRIPT_NAME" "hana_log_segments_total" "HANA Log Segment" "Log segments found." "false" "OK"

    # Integer percentages; total_segments is known to be > 0 in this branch.
    truncated_percentage=$((truncated_segments * 100 / total_segments))
    free_percentage=$((free_segments * 100 / total_segments))

    if [ "$truncated_percentage" -gt "$TRUNCATED_PERCENTAGE_THRESHOLD" ]; then
        log_message "$SCRIPT_NAME" "ALERT: ${truncated_percentage}% of log segments are 'Truncated'."
        send_notification_if_changed "$SCRIPT_NAME" "hana_log_truncated" "HANA Log Segment" "${truncated_percentage}% of HANA log segments are in 'Truncated' state." "true" "${truncated_percentage}%"
    else
        send_notification_if_changed "$SCRIPT_NAME" "hana_log_truncated" "HANA Log Segment" "${truncated_percentage}% of HANA log segments are in 'Truncated' state (below threshold)." "false" "OK"
    fi

    if [ "$free_percentage" -lt "$FREE_PERCENTAGE_THRESHOLD" ]; then
        log_message "$SCRIPT_NAME" "ALERT: Only ${free_percentage}% of log segments are 'Free'."
        send_notification_if_changed "$SCRIPT_NAME" "hana_log_free" "HANA Log Segment" "Only ${free_percentage}% of HANA log segments are in 'Free' state." "true" "${free_percentage}%"
    else
        send_notification_if_changed "$SCRIPT_NAME" "hana_log_free" "HANA Log Segment" "Only ${free_percentage}% of HANA log segments are in 'Free' state (above threshold)." "false" "OK"
    fi
fi

log_message "$SCRIPT_NAME" "Log segment check complete."
#!/bin/bash
# =============================================================================
# SAP HANA Process Monitoring Script
# Checks if all HANA processes are in GREEN state
#
# Exit status: 1 when the lock is taken, sapcontrol is missing or fails, a
# process is not GREEN, or no process data is returned; 0 otherwise.
# =============================================================================

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"
SCRIPT_NAME="hana_processes"

# Load configuration
source "${SCRIPT_DIR}/hana.conf"
source "${SCRIPT_DIR}/hana_lib.sh"

# Acquire lock; the trap is installed only once the lock is really ours.
if ! LOCK_FILE=$(acquire_lock "$SCRIPT_NAME"); then
    exit 1
fi
trap 'release_lock "$LOCK_FILE"' EXIT

log_message "$SCRIPT_NAME" "Starting HANA process status check..."

# Check if sapcontrol is available
if [ ! -x "$SAPCONTROL_PATH" ]; then
    log_message "$SCRIPT_NAME" "ERROR: sapcontrol not found or not executable at ${SAPCONTROL_PATH}"
    send_notification_if_changed "$SCRIPT_NAME" "hana_sapcontrol_path" "HANA Monitor Error" "sapcontrol not found or not executable at ${SAPCONTROL_PATH}" "true" "SAPCONTROL_ERROR"
    exit 1
fi

# Get process list
process_list=$(su - "$HANA_USER" -c "${SAPCONTROL_PATH} -nr ${HANA_INSTANCE_NR} -function GetProcessList" 2>&1)
sapcontrol_status=$?

# For GetProcessList, sapcontrol reports the overall state through its exit
# code: 0 = call succeeded, 3 = all processes running correctly, 4 = all
# processes stopped. All three mean "the call worked"; the process table is
# evaluated below. Treating 3 as a failure made every healthy system alert.
case "$sapcontrol_status" in
    0 | 3 | 4) ;;
    *)
        log_message "$SCRIPT_NAME" "ERROR: sapcontrol command failed with exit code ${sapcontrol_status}"
        send_notification_if_changed "$SCRIPT_NAME" "hana_sapcontrol_command" "HANA Monitor Error" "sapcontrol command failed. Exit code: ${sapcontrol_status}" "true" "SAPCONTROL_COMMAND_FAILED"
        exit 1
        ;;
esac

# Clear any previous sapcontrol error state
send_notification_if_changed "$SCRIPT_NAME" "hana_sapcontrol_command" "HANA Process" "sapcontrol command successful." "false" "OK"

# Process rows start at line 6 of the output (the lines before are header).
process_rows=$(echo "$process_list" | tail -n +6)

# Check for non-GREEN processes
non_green_processes=$(echo "$process_rows" | grep -v 'GREEN' | grep -v '^$')

if [ -n "$non_green_processes" ]; then
    # Build a stable state value from the first and third column only.
    # Assumes comma-separated columns "name, description, dispstatus, ..."
    # - TODO confirm against real sapcontrol output. Using the full rows as
    # state value re-sent the alert whenever a time column changed.
    process_state=$(echo "$non_green_processes" \
        | awk -F',' '{ gsub(/^ +| +$/, "", $1); gsub(/^ +| +$/, "", $3); printf "%s=%s;", $1, $3 }')

    log_message "$SCRIPT_NAME" "ALERT: One or more HANA processes are not running!"
    log_message "$SCRIPT_NAME" "Problem processes: ${non_green_processes}"
    send_notification_if_changed "$SCRIPT_NAME" "hana_processes" "HANA Process" "One or more HANA processes are not GREEN. Problem processes: ${non_green_processes}" "true" "PROCESS_ALERT:${process_state}"
    exit 1
fi

# Verify we actually got process data
green_processes=$(echo "$process_rows" | grep 'GREEN')
if [ -z "$green_processes" ]; then
    log_message "$SCRIPT_NAME" "WARNING: No process data found. SAP HANA may not be running."
    send_notification_if_changed "$SCRIPT_NAME" "hana_processes" "HANA Process" "No process data found. SAP HANA may not be running." "true" "NO_PROCESS_DATA"
    exit 1
fi

send_notification_if_changed "$SCRIPT_NAME" "hana_processes" "HANA Process" "All HANA processes are GREEN." "false" "OK"
log_message "$SCRIPT_NAME" "SUCCESS: All HANA processes are GREEN."

log_message "$SCRIPT_NAME" "Process check complete."
#!/bin/bash
# =============================================================================
# SAP HANA Statement Queue Monitoring Script
# Checks for queued SQL statements
#
# An alert is raised only after the queue length exceeded
# STATEMENT_QUEUE_THRESHOLD in STATEMENT_QUEUE_CONSECUTIVE_RUNS consecutive
# runs; the run counter is persisted between runs with set_state/get_state.
#
# Exit status: 1 when the lock is taken, hdbsql is missing or the query
# fails; 0 otherwise.
# =============================================================================

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"
SCRIPT_NAME="hana_queue"

# Load configuration
source "${SCRIPT_DIR}/hana.conf"
source "${SCRIPT_DIR}/hana_lib.sh"

# Acquire lock; the trap is installed only once the lock is really ours.
if ! LOCK_FILE=$(acquire_lock "$SCRIPT_NAME"); then
    exit 1
fi
trap 'release_lock "$LOCK_FILE"' EXIT

log_message "$SCRIPT_NAME" "Starting statement queue check..."

# SQL Query for statement queue
STATEMENT_QUEUE_SQL="SELECT COUNT(*) FROM M_SERVICE_THREADS WHERE THREAD_TYPE = 'SqlExecutor' AND THREAD_STATE = 'Queueing';"

# Check if hdbsql is available
if [ ! -x "$HDBSQL_PATH" ]; then
    log_message "$SCRIPT_NAME" "ERROR: hdbsql not found or not executable at ${HDBSQL_PATH}"
    send_notification_if_changed "$SCRIPT_NAME" "hana_hdbsql_path_queue" "HANA Monitor Error" "hdbsql not found or not executable at ${HDBSQL_PATH}" "true" "HDBSQL_ERROR"
    exit 1
fi

# Execute SQL query as HANA user
queue_result=$(su - "$HANA_USER" -c "$HDBSQL_PATH -U $HANA_USER_KEY -j -a -x \"$STATEMENT_QUEUE_SQL\"" 2>&1)
sql_status=$?

if [ "$sql_status" -ne 0 ]; then
    log_message "$SCRIPT_NAME" "ERROR: Failed to execute queue query. Exit code: ${sql_status}"
    send_notification_if_changed "$SCRIPT_NAME" "hana_queue_query_error" "HANA Queue Error" "Failed to execute queue query. Exit code: ${sql_status}" "true" "QUERY_ERROR"
    exit 1
fi

# Clear any previous query error state
send_notification_if_changed "$SCRIPT_NAME" "hana_queue_query_error" "HANA Statement Queue" "Queue query successful." "false" "OK"

# Parse queue count (strip quotes, trim whitespace)
queue_count=$(echo "$queue_result" | tr -d '"' | xargs)

# Validate queue count is a number
if ! [[ "$queue_count" =~ ^[0-9]+$ ]]; then
    log_message "$SCRIPT_NAME" "WARNING: Could not retrieve HANA statement queue count. Got: '${queue_count}'. Skipping check."
    send_notification_if_changed "$SCRIPT_NAME" "hana_statement_queue_check_fail" "HANA Monitor Warning" "Could not retrieve statement queue count. Got: '${queue_count}'" "true" "QUEUE_CHECK_FAIL"
else
    # Clear any previous check failure state
    send_notification_if_changed "$SCRIPT_NAME" "hana_statement_queue_check_fail" "HANA Monitor Warning" "Statement queue check is working." "false" "OK"
    log_message "$SCRIPT_NAME" "Current statement queue length: ${queue_count}"

    # Get breach count from state. A missing or damaged state file must not
    # break the numeric comparisons below, so anything non-numeric restarts
    # the count at 0.
    breach_count=$(get_state "statement_queue_breach_count")
    if ! [[ "$breach_count" =~ ^[0-9]+$ ]]; then
        breach_count=0
    fi

    if [ "$queue_count" -gt "$STATEMENT_QUEUE_THRESHOLD" ]; then
        breach_count=$((breach_count + 1))
        log_message "$SCRIPT_NAME" "Statement queue is above threshold (${queue_count} > ${STATEMENT_QUEUE_THRESHOLD}). Consecutive breach count: ${breach_count}/${STATEMENT_QUEUE_CONSECUTIVE_RUNS}."
    else
        if [ "$breach_count" -gt 0 ]; then
            log_message "$SCRIPT_NAME" "Statement queue returned to normal. Resetting breach count from ${breach_count} to 0."
        fi
        breach_count=0
    fi
    set_state "statement_queue_breach_count" "$breach_count"

    if [ "$breach_count" -ge "$STATEMENT_QUEUE_CONSECUTIVE_RUNS" ]; then
        message="Statement queue has been over ${STATEMENT_QUEUE_THRESHOLD} for ${breach_count} checks. Current count: ${queue_count}."
        send_notification_if_changed "$SCRIPT_NAME" "hana_statement_queue_status" "HANA Statement Queue" "$message" "true" "ALERT:${queue_count}"
    else
        message="Statement queue is normal. Current count: ${queue_count}."
        send_notification_if_changed "$SCRIPT_NAME" "hana_statement_queue_status" "HANA Statement Queue" "$message" "false" "OK"
    fi
fi

log_message "$SCRIPT_NAME" "Statement queue check complete."
#!/bin/bash
#
# sld_watchdog.sh - Monitors SLD service health and restarts if needed
#
# Probes SLD_URL over HTTP. HTTP 200 and 401 count as healthy; anything else
# triggers a restart of the sapb1servertools service followed by a second
# probe after 15 seconds.
#
# Environment overrides: SLD_URL, SLD_TIMEOUT (seconds).
# Exit status: 0 when healthy, recovered, or another instance holds the lock;
# 1 when the restart failed or the service did not recover.
#

# Get script directory and name
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
SCRIPT_NAME="sld_watchdog"

# Source configuration and library
source "$SCRIPT_DIR/hana.conf"
source "$SCRIPT_DIR/hana_lib.sh"

# SLD-specific configuration
SLD_URL="${SLD_URL:-https://localhost:40000/sld/sld0100.svc}"
SLD_TIMEOUT="${SLD_TIMEOUT:-5}"

# Acquire lock using library function (the lock path is chosen by
# acquire_lock; the former SLD_LOCK_FILE variable was never used).
# A running instance is not an error for a watchdog, hence exit 0.
if ! LOCK_FILE=$(acquire_lock "$SCRIPT_NAME"); then
    exit 0
fi
trap 'release_lock "$LOCK_FILE"' EXIT

# Function to check SLD health
# Outputs: the HTTP status code, or "0" for connection errors/timeouts
check_sld_health() {
    local http_status
    # -k: the certificate is not verified (the default URL is localhost).
    http_status=$(curl -k -s -o /dev/null -w "%{http_code}" -m "$SLD_TIMEOUT" --connect-timeout "$SLD_TIMEOUT" "$SLD_URL" 2>/dev/null)

    # Handle curl errors (returns 000 on connection failure)
    if [ -z "$http_status" ] || [ "$http_status" == "000" ]; then
        echo "0"
    else
        echo "$http_status"
    fi
}

# Function to restart SLD service
# Returns: 0 when the restart command succeeded, non-zero otherwise
restart_sld_service() {
    local restart_status

    log_message "$SCRIPT_NAME" "Attempting to restart SLD service..."

    # Try systemctl first
    if command -v systemctl &> /dev/null; then
        systemctl restart sapb1servertools 2>&1
        restart_status=$?
        if [ "$restart_status" -eq 0 ]; then
            log_message "$SCRIPT_NAME" "Service restart command executed successfully"
            return 0
        fi
        log_message "$SCRIPT_NAME" "Service restart failed with exit code ${restart_status}"
        return 1
    fi

    log_message "$SCRIPT_NAME" "systemctl not available, trying alternative restart methods"
    # Fallback: try service command
    service sapb1servertools restart 2>&1
}

# Main monitoring logic
# Returns: 0 when the service is healthy or recovered, 1 otherwise
main() {
    local http_status status_detail recovery_status recovery_detail

    log_message "$SCRIPT_NAME" "Checking SLD service health at ${SLD_URL}..."

    http_status=$(check_sld_health)

    # 200 OK or 401 Unauthorized indicate the Tomcat layer is actively processing
    if [[ $http_status == 200 || $http_status == 401 ]]; then
        log_message "$SCRIPT_NAME" "[OK] SLD service healthy (HTTP Status: $http_status)"
        send_notification_if_changed "$SCRIPT_NAME" "sld_status" "SLD Service" "SLD service is healthy (HTTP: $http_status)" "false" "OK"
        # Also resolve alerts left over from a run whose restart or recovery
        # failed; without this no "Resolved" message was ever sent when the
        # service came back on its own.
        send_notification_if_changed "$SCRIPT_NAME" "sld_down" "SLD Service" \
            "SLD service is healthy (HTTP: $http_status)" "false" "OK"
        send_notification_if_changed "$SCRIPT_NAME" "sld_restart_failed" "SLD Service" \
            "SLD service is healthy (HTTP: $http_status)" "false" "OK"
        return 0
    fi

    # Service is down or unresponsive
    if [ "$http_status" == "0" ]; then
        status_detail="Connection failed or timeout"
    else
        status_detail="HTTP Status: ${http_status}"
    fi

    log_message "$SCRIPT_NAME" "[ALERT] SLD dead or hung (${status_detail}). Restarting service..."

    # Send notification
    send_notification_if_changed "$SCRIPT_NAME" "sld_down" "SLD Service Critical" \
        "SLD service is down (${status_detail}). Restarting ${SLD_URL}" "true" "SLD_DOWN"

    # Restart the service
    if ! restart_sld_service; then
        log_message "$SCRIPT_NAME" "[CRITICAL] Failed to restart SLD service"
        send_notification_if_changed "$SCRIPT_NAME" "sld_restart_failed" "SLD Service Critical" \
            "Failed to restart SLD service" "true" "RESTART_FAILED"
        return 1
    fi

    # Allow service to spin up, then log recovery status
    log_message "$SCRIPT_NAME" "Waiting 15 seconds for service to restart..."
    sleep 15

    recovery_status=$(check_sld_health)

    if [[ $recovery_status == 200 || $recovery_status == 401 ]]; then
        log_message "$SCRIPT_NAME" "[RECOVERY] SLD service recovered successfully (HTTP Status: $recovery_status)"
        send_notification_if_changed "$SCRIPT_NAME" "sld_down" "SLD Service" \
            "SLD service recovered (HTTP: $recovery_status)" "false" "OK"
        send_notification_if_changed "$SCRIPT_NAME" "sld_restart_failed" "SLD Service" \
            "Service recovered successfully" "false" "OK"
        return 0
    fi

    if [ "$recovery_status" == "0" ]; then
        recovery_detail="Connection failed after restart"
    else
        recovery_detail="HTTP Status: $recovery_status"
    fi
    log_message "$SCRIPT_NAME" "[CRITICAL] SLD service failed to recover after restart (${recovery_detail})"
    send_notification_if_changed "$SCRIPT_NAME" "sld_down" "SLD Service Critical" \
        "SLD service FAILED to recover after restart (${recovery_detail})" "true" "RECOVERY_FAILED"
    return 1
}

# Run main function
main
exit_code=$?

log_message "$SCRIPT_NAME" "SLD watchdog check complete."
exit $exit_code