initial commit

This commit is contained in:
2026-03-12 20:12:20 +01:00
commit e7c4142294
9 changed files with 904 additions and 0 deletions

1
.gitignore vendored Normal file
View File

@@ -0,0 +1 @@
.anchor

73
hana.conf Normal file
View File

@@ -0,0 +1,73 @@
#!/bin/bash
# =============================================================================
# SAP HANA Common Configuration
#
# Sourced by the monitoring scripts in this directory. Sourcing it has one
# side effect: the state directory (STATE_DIR) is created if it is missing.
# =============================================================================

# --- HANA Instance Configuration ---
# HANA SID (e.g., "NDB") - used to derive the HANA user (<sid>adm)
HANA_SID="NDB"
# Derived HANA Linux user (automatically computed from HANA_SID)
HANA_USER="$(echo "$HANA_SID" | tr '[:upper:]' '[:lower:]')adm"
# HANA Instance Number (e.g., "00")
HANA_INSTANCE_NR="00"
# HANA User Key for hdbsql (hdbuserstore key)
HANA_USER_KEY="CRONKEY"

# --- Paths ---
SAPCONTROL_PATH="/usr/sap/hostctrl/exe/sapcontrol"
# Derived from HANA_SID so the path follows the configured instance instead of
# a hard-coded "HDB" SID that disagrees with HANA_SID above.
HDBSQL_PATH="/usr/sap/${HANA_SID}/HDB${HANA_INSTANCE_NR}/exe/hdbsql"

# --- Monitoring Directories ---
DIRECTORIES_TO_MONITOR=(
  "/hana/shared"
  "/hana/log"
  "/hana/data"
  "/usr/sap"
)

# --- Log Directories for Auto-Cleanup ---
# These directories will be automatically cleaned when disk space is low
# Format: "mount_point:log_directory_path"
# The script will check if a monitored directory is on the same mount point
# as a log directory and can clean the log directory to free up space
LOG_DIRS_FOR_CLEANUP=(
  # WARNING: /hana/log is the HANA redo log volume, not a text-log directory.
  # The cleanup deletes every file older than MAX_LOG_FILE_AGE_DAYS, which
  # would remove log segments the database still owns. Keep it disabled unless
  # you point it at a directory that holds only disposable files.
  # "/hana/log:/hana/log"
  "/usr/sap:/usr/sap/trans/log"
  "/usr/sap:/usr/sap/hostctrl/work/log"
)

# --- Disk Cleanup Configuration ---
# Minimum free space percentage to maintain after cleanup
MIN_FREE_SPACE_AFTER_CLEANUP=5
# Maximum age of log files to delete (in days)
MAX_LOG_FILE_AGE_DAYS=7
# Enable automatic cleanup when disk usage exceeds threshold
AUTO_CLEANUP_ENABLED=true

# --- Thresholds ---
DISK_USAGE_THRESHOLD=85
TRUNCATED_PERCENTAGE_THRESHOLD=50
FREE_PERCENTAGE_THRESHOLD=10
STATEMENT_QUEUE_THRESHOLD=10
STATEMENT_QUEUE_CONSECUTIVE_RUNS=3
BACKUP_THRESHOLD_HOURS=32

# --- Notification Configuration ---
# Both values must be non-empty for notifications to be sent.
NTFY_TOKEN=""
NTFY_TOPIC_URL=""
COMPANY_NAME="My Company"

# --- Logging Configuration ---
# The log file lives next to this configuration file.
LOG_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"
LOG_FILE="${LOG_DIR}/hana_monitor.log"

# --- State Directory ---
STATE_DIR="${LOG_DIR}/monitor_state"
mkdir -p "${STATE_DIR}"

# --- Lock Directory ---
LOCK_DIR="/tmp"

83
hana_backup.sh Normal file
View File

@@ -0,0 +1,83 @@
#!/bin/bash
# =============================================================================
# SAP HANA Backup Status Monitoring Script
#
# Queries M_BACKUP_CATALOG for the newest successful complete data backup and
# raises an alert when it is older than BACKUP_THRESHOLD_HOURS, when no such
# backup is returned, or when the query itself cannot be run.
# =============================================================================

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"
SCRIPT_NAME="hana_backup"

# Shared settings and helper functions
. "${SCRIPT_DIR}/hana.conf"
. "${SCRIPT_DIR}/hana_lib.sh"

# Single-instance guard; the EXIT trap drops the lock on every exit path below
if ! LOCK_FILE=$(acquire_lock "$SCRIPT_NAME"); then
  exit 1
fi
trap 'release_lock "$LOCK_FILE"' EXIT

log_message "$SCRIPT_NAME" "Starting backup status check..."

# Nothing can be checked without a usable hdbsql binary
if [[ ! -x "$HDBSQL_PATH" ]]; then
  log_message "$SCRIPT_NAME" "ERROR: hdbsql not found or not executable at ${HDBSQL_PATH}"
  send_notification_if_changed "$SCRIPT_NAME" "hana_hdbsql_path_backup" "HANA Monitor Error" "hdbsql not found or not executable at ${HDBSQL_PATH}" "true" "HDBSQL_ERROR"
  exit 1
fi

# Newest successful complete data backup
BACKUP_SQL="SELECT TOP 1 SYS_START_TIME FROM M_BACKUP_CATALOG WHERE ENTRY_TYPE_NAME = 'complete data backup' AND STATE_NAME = 'successful' ORDER BY SYS_START_TIME DESC"

# Run the query as the HANA OS user; stderr is folded into the captured output
query_output=$(su - "$HANA_USER" -c "$HDBSQL_PATH -U $HANA_USER_KEY -j -a -x \"$BACKUP_SQL\"" 2>&1)
query_rc=$?
if [[ "$query_rc" -ne 0 ]]; then
  log_message "$SCRIPT_NAME" "ERROR: Failed to execute backup query. Exit code: ${query_rc}"
  send_notification_if_changed "$SCRIPT_NAME" "hana_backup_query_error" "HANA Backup Error" "Failed to execute backup query. Exit code: ${query_rc}" "true" "QUERY_ERROR"
  exit 1
fi

# Drop the quotes and everything from the first '.' (fractional seconds)
backup_stamp=$(tr -d '"' <<<"$query_output" | sed 's/\..*//')
case "$backup_stamp" in
  "" | *error* | *Error*)
    alert_text="No successful complete data backup found for ${COMPANY_NAME} HANA."
    log_message "$SCRIPT_NAME" "CRITICAL: ${alert_text}"
    send_notification_if_changed "$SCRIPT_NAME" "hana_backup_status" "HANA Backup" "$alert_text" "true" "NO_BACKUP"
    exit 1
    ;;
esac

# The query worked, so any earlier query-error alert is resolved
send_notification_if_changed "$SCRIPT_NAME" "hana_backup_query_error" "HANA Backup" "Backup query successful." "false" "OK"

# Convert the backup timestamp to epoch seconds
if ! backup_epoch=$(date -d "$backup_stamp" +%s 2>/dev/null); then
  log_message "$SCRIPT_NAME" "ERROR: Failed to parse backup date: ${backup_stamp}"
  send_notification_if_changed "$SCRIPT_NAME" "hana_backup_status" "HANA Backup" "Failed to parse backup date: ${backup_stamp}" "true" "DATE_PARSE_ERROR"
  exit 1
fi

now_epoch=$(date +%s)
limit_seconds=$((BACKUP_THRESHOLD_HOURS * 3600))
age_seconds=$((now_epoch - backup_epoch))
age_hours=$((age_seconds / 3600))

# A negative age means the backup timestamp lies ahead of the local clock
if ((age_seconds < 0)); then
  log_message "$SCRIPT_NAME" "WARNING: Backup timestamp is in the future. Possible clock skew."
  age_hours=0
fi

if ((age_seconds > limit_seconds)); then
  status_text="Last successful HANA backup for ${COMPANY_NAME} is ${age_hours} hours old, which exceeds the threshold of ${BACKUP_THRESHOLD_HOURS} hours. Last backup was on: ${backup_stamp}."
  log_message "$SCRIPT_NAME" "CRITICAL: ${status_text}"
  send_notification_if_changed "$SCRIPT_NAME" "hana_backup_status" "HANA Backup" "$status_text" "true" "${age_hours}h"
else
  status_text="Last successful backup is ${age_hours} hours old (Threshold: ${BACKUP_THRESHOLD_HOURS} hours)."
  log_message "$SCRIPT_NAME" "SUCCESS: ${status_text}"
  send_notification_if_changed "$SCRIPT_NAME" "hana_backup_status" "HANA Backup" "$status_text" "false" "OK"
fi

log_message "$SCRIPT_NAME" "Backup check complete."

84
hana_disk.sh Normal file
View File

@@ -0,0 +1,84 @@
#!/bin/bash
# =============================================================================
# SAP HANA Disk Space Monitoring Script
#
# For every entry in DIRECTORIES_TO_MONITOR: reads the usage percentage,
# optionally runs auto_cleanup on the directory's mount point when usage is
# above DISK_USAGE_THRESHOLD, and reports the result through
# send_notification_if_changed.
#
# Exit status: 0 when no alert was raised, 1 otherwise (or when the lock could
# not be acquired).
# =============================================================================
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"
SCRIPT_NAME="hana_disk"

# Load configuration
source "${SCRIPT_DIR}/hana.conf"
source "${SCRIPT_DIR}/hana_lib.sh"

# Acquire lock
LOCK_FILE=$(acquire_lock "$SCRIPT_NAME")
if [ $? -ne 0 ]; then
  exit 1
fi
trap 'release_lock "$LOCK_FILE"' EXIT

log_message "$SCRIPT_NAME" "Starting disk usage check..."

# Track overall status
ALERT_COUNT=0
TOTAL_DIRS=0
CLEANUP_PERFORMED=0

for dir in "${DIRECTORIES_TO_MONITOR[@]}"; do
  TOTAL_DIRS=$((TOTAL_DIRS + 1))

  # State keys embed the path with '/' replaced by '_'
  dir_key="${dir//\//_}"

  # Check if directory exists
  if [ ! -d "$dir" ]; then
    log_message "$SCRIPT_NAME" "WARNING: Directory '$dir' not found. Skipping."
    send_notification_if_changed "$SCRIPT_NAME" "disk_dir_not_found_${dir_key}" "HANA Disk Warning" "Directory '$dir' not found." "true" "DIR_NOT_FOUND"
    ALERT_COUNT=$((ALERT_COUNT + 1))
    continue
  fi
  # The directory exists: clear an earlier DIR_NOT_FOUND state so that a later
  # disappearance is reported again instead of being seen as "unchanged".
  send_notification_if_changed "$SCRIPT_NAME" "disk_dir_not_found_${dir_key}" "HANA Disk Warning" "Directory '$dir' is available again." "false" "OK"

  # Get disk usage percentage. Anything that is not a plain integer (empty
  # output, "-" for pseudo file systems, ...) must not reach the numeric tests
  # below, where it would error out and be treated as "below threshold".
  usage=$(get_disk_usage_percentage "$dir")
  if ! [[ "$usage" =~ ^[0-9]+$ ]] || [ "$usage" -eq 0 ]; then
    log_message "$SCRIPT_NAME" "WARNING: Could not determine disk usage for '$dir'. Skipping."
    continue
  fi

  log_message "$SCRIPT_NAME" "Directory ${dir} is at ${usage}%"

  # Check if usage exceeds threshold
  if [ "$usage" -gt "$DISK_USAGE_THRESHOLD" ]; then
    log_message "$SCRIPT_NAME" "ALERT: ${dir} usage is at ${usage}% which is above the ${DISK_USAGE_THRESHOLD}% threshold."

    # Attempt auto-cleanup if enabled
    if [ "$AUTO_CLEANUP_ENABLED" == "true" ]; then
      log_message "$SCRIPT_NAME" "Attempting auto-cleanup for '${dir}'..."
      mount_point=$(get_mount_point "$dir")
      if auto_cleanup "$mount_point" "$MIN_FREE_SPACE_AFTER_CLEANUP"; then
        CLEANUP_PERFORMED=$((CLEANUP_PERFORMED + 1))
        new_usage=$(get_disk_usage_percentage "$dir")
        if [[ "$new_usage" =~ ^[0-9]+$ ]]; then
          log_message "$SCRIPT_NAME" "After cleanup, ${dir} usage is at ${new_usage}%"
          usage=$new_usage
        else
          # Keep reporting the pre-cleanup figure rather than an empty value
          log_message "$SCRIPT_NAME" "WARNING: Could not determine disk usage for '$dir' after cleanup."
        fi
      else
        log_message "$SCRIPT_NAME" "Auto-cleanup failed or no files to clean for '${dir}'"
      fi
    fi

    # Send notification with final usage after cleanup attempt
    send_notification_if_changed "$SCRIPT_NAME" "disk_usage_${dir_key}" "HANA Disk" "Disk usage for ${dir} is at ${usage}%." "true" "${usage}%"
    ALERT_COUNT=$((ALERT_COUNT + 1))
  else
    # Send OK notification only if state changed from alert
    send_notification_if_changed "$SCRIPT_NAME" "disk_usage_${dir_key}" "HANA Disk" "Disk usage for ${dir} is at ${usage}% (below threshold)." "false" "OK"
  fi
done

# Summary logging
log_message "$SCRIPT_NAME" "Disk check complete. Total: ${TOTAL_DIRS} dirs, ${ALERT_COUNT} alerts, ${CLEANUP_PERFORMED} cleanups performed."

# Exit with status based on alerts
if [ "$ALERT_COUNT" -gt 0 ]; then
  exit 1
fi
exit 0

272
hana_lib.sh Normal file
View File

@@ -0,0 +1,272 @@
#!/bin/bash
# =============================================================================
# SAP HANA Monitoring Library - Shared Functions
# =============================================================================
# Emit one timestamped log line, tagged with the calling script's name, to
# stdout and append the same line to LOG_FILE.
# Usage: log_message "SCRIPT_NAME" "message"
log_message() {
  local tag="$1"
  local text="$2"
  local now
  now=$(date "+%Y-%m-%d %H:%M:%S")
  printf '[%s] [%s] %s\n' "$now" "$tag" "$text" | tee -a "${LOG_FILE}"
}
# Acquire the per-script lock file ${LOCK_DIR}/hana_<SCRIPT_NAME>.lock.
# Usage: LOCK_FILE=$(acquire_lock "SCRIPT_NAME")
# Outputs: the lock file path on stdout (and nothing else) on success;
#          diagnostics go to the log file and to stderr so they are not
#          swallowed by the caller's command substitution.
# Returns: 0 on success, 1 on failure (already running or lock not creatable)
#
# The lock file stores the PID of the owning script. A lock whose recorded PID
# no longer exists is treated as stale and replaced; a lock without a numeric
# PID (e.g. created by an older version of this function) is treated as held.
# Creation uses noclobber so two runs starting together cannot both succeed.
# Caveat: if a stale PID has been reused by an unrelated process, the lock is
# still considered held until that process exits.
acquire_lock() {
  local script_name="$1"
  local lock_file="${LOCK_DIR}/hana_${script_name}.lock"
  local holder_pid=""

  if [[ -e "$lock_file" ]]; then
    { IFS= read -r holder_pid < "$lock_file"; } 2>/dev/null
    # kill -0 fails for a live process owned by another user as well, hence
    # the additional /proc check before declaring the holder dead.
    if [[ "$holder_pid" =~ ^[0-9]+$ ]] \
      && ! kill -0 "$holder_pid" 2>/dev/null \
      && [[ ! -e "/proc/${holder_pid}" ]]; then
      log_message "$script_name" "Removing stale lock left by PID ${holder_pid}." >&2
      rm -f -- "$lock_file"
    else
      log_message "$script_name" "Script is already running. Exiting." >&2
      return 1
    fi
  fi

  # $$ is the PID of the main script even inside $(...), which is what the
  # staleness check of a later run needs to look at.
  if ! (set -o noclobber; printf '%s\n' "$$" > "$lock_file") 2>/dev/null; then
    log_message "$script_name" "Could not create lock file '${lock_file}' (already running or directory not writable). Exiting." >&2
    return 1
  fi

  printf '%s\n' "$lock_file"
  return 0
}
# Remove a lock file previously handed out by acquire_lock.
# Usage: release_lock "LOCK_FILE"
# Does nothing when the argument is empty or is not a regular file.
release_lock() {
  local path="$1"
  if [[ -z "$path" || ! -f "$path" ]]; then
    return 0
  fi
  rm -f "$path"
}
# Print the value stored for KEY in ${STATE_DIR}/KEY.state, or an empty line
# when no such state file exists.
# Usage: get_state "KEY"
get_state() {
  local state_file="${STATE_DIR}/${1}.state"
  if [[ ! -f "$state_file" ]]; then
    echo ""
    return
  fi
  cat "$state_file"
}
# Store VALUE for KEY in ${STATE_DIR}/KEY.state, replacing any previous value.
# Usage: set_state "KEY" "VALUE"
# The value is written with printf rather than echo so that values which look
# like echo options ("-n", "-e", ...) are stored literally instead of being
# consumed as flags.
set_state() {
  local key="$1"
  local value="$2"
  printf '%s\n' "$value" > "${STATE_DIR}/${key}.state"
}
# Send an ntfy notification when the value tracked under ALERT_KEY changes.
# Usage: send_notification_if_changed "SCRIPT_NAME" "ALERT_KEY" "TITLE_PREFIX" "MESSAGE" "IS_ALERT" "CURRENT_VALUE"
#
# Behaviour:
#   - CURRENT_VALUE equal to the stored value: nothing happens.
#   - IS_ALERT == "true": an "<TITLE_PREFIX> Alert" notification is sent.
#   - otherwise, if the stored value is non-empty and not "OK": a
#     "<TITLE_PREFIX> Resolved" notification is sent.
#   - otherwise (first run, or OK -> other non-alert value): the value is
#     stored silently.
# The new value is stored after a successful delivery, or when ntfy is not
# configured (NTFY_TOKEN / NTFY_TOPIC_URL empty). When delivery fails the
# stored value is left untouched so the next run retries the notification.
#
# Optional environment: NTFY_CONNECT_TIMEOUT (default 10) and NTFY_MAX_TIME
# (default 30), in seconds. They bound the curl call so that an unreachable
# ntfy server cannot keep the calling script (and its lock) hanging.
# Returns: 1 when a notification was due but could not be delivered, else 0.
send_notification_if_changed() {
  local script_name="$1"
  local alert_key="$2"
  local title_prefix="$3"
  local current_message="$4"
  local is_alert_condition="$5"
  local current_value="$6"
  local host_name previous_value full_title full_message final_message
  local curl_status

  previous_value=$(get_state "$alert_key")
  if [[ "$current_value" == "$previous_value" ]]; then
    return 0
  fi

  if [[ "$is_alert_condition" == "true" ]]; then
    full_title="${title_prefix} Alert"
    full_message="🚨 Critical: ${current_message}"
    log_message "$script_name" "ALERT: ${full_message}"
  elif [[ -n "$previous_value" && "$previous_value" != "OK" ]]; then
    full_title="${title_prefix} Resolved"
    full_message="✅ Resolved: ${current_message}"
    log_message "$script_name" "RESOLVED: ${full_message}"
  else
    # Nothing to announce; just remember the value.
    set_state "$alert_key" "$current_value"
    return 0
  fi

  host_name=$(hostname)
  final_message="[${COMPANY_NAME} | ${host_name}] ${full_message}"

  if [[ -n "$NTFY_TOKEN" && -n "$NTFY_TOPIC_URL" ]]; then
    # --fail turns HTTP errors (e.g. 401/403 from a bad token) into a non-zero
    # exit status instead of a silent "success".
    if curl --fail --silent --show-error \
      --connect-timeout "${NTFY_CONNECT_TIMEOUT:-10}" \
      --max-time "${NTFY_MAX_TIME:-30}" \
      -H "Authorization: Bearer ${NTFY_TOKEN}" \
      -H "Title: ${full_title}" \
      -d "${final_message}" \
      "${NTFY_TOPIC_URL}" > /dev/null 2>&1; then
      log_message "$script_name" "Notification sent: ${full_title}"
    else
      curl_status=$?
      log_message "$script_name" "ERROR: Failed to send notification '${full_title}' (curl exit code ${curl_status}). Will retry on next run."
      return 1
    fi
  else
    log_message "$script_name" "Ntfy not configured, skipping notification"
  fi

  set_state "$alert_key" "$current_value"
  return 0
}
# Execute a shell command line in a login shell of the HANA OS user
# (HANA_USER) via su.
# Usage: run_as_hana_user "COMMAND"
# Returns: the exit status of su.
run_as_hana_user() {
  su - "$HANA_USER" -c "$1"
}
# Get disk usage percentage for a directory
# Usage: get_disk_usage_percentage "/path/to/dir"
# Returns: Usage percentage as integer (without % sign) on stdout;
#          "0" when the directory does not exist.
# df is run with -P (POSIX output format) so that each file system is printed
# on exactly one line; without it a long device name can wrap onto its own
# line, and the second line would then hold no usage column.
get_disk_usage_percentage() {
  local dir="$1"
  if [[ ! -d "$dir" ]]; then
    echo "0"
    return
  fi
  df -P "$dir" 2>/dev/null | awk 'NR==2 {gsub(/%/,"",$5); print $5}'
}
# Get mount point for a directory
# Usage: get_mount_point "/path/to/dir"
# Returns: Mount point path on stdout (last column of the df data line)
# df is run with -P (POSIX output format) so the data is always on the second
# line; without it a long device name can wrap and the second line would
# contain only the device name.
# NOTE(review): a mount point containing spaces yields only its last word.
get_mount_point() {
  local dir="$1"
  df -P "$dir" 2>/dev/null | awk 'NR==2 {print $NF}'
}
# Get available disk space in KB for a directory
# Usage: get_available_space_kb "/path/to/dir"
# Returns: Available space in KB on stdout (fourth column of the df data line)
# -P forces the POSIX one-line-per-file-system layout so the value is read
# from the right line even when the device name is long; -k selects 1024-byte
# units.
get_available_space_kb() {
  local dir="$1"
  df -Pk "$dir" 2>/dev/null | awk 'NR==2 {print $4}'
}
# Find log directories on the same mount point
# Usage: find_log_dirs_on_mount "mount_point"
# Returns: Space-separated list (on stdout) of the existing directories from
#          LOG_DIRS_FOR_CLEANUP whose "mount_point:" prefix equals the given
#          mount point; an empty line when none match.
# All working variables are local, so calling this function directly (not in
# a subshell) does not overwrite same-named variables of the caller.
# NOTE: because the result is space-separated, configured directories must
# not contain spaces.
find_log_dirs_on_mount() {
  local mount_point="$1"
  local log_entry entry_mount log_dir
  local -a matches=()

  for log_entry in "${LOG_DIRS_FOR_CLEANUP[@]}"; do
    entry_mount="${log_entry%%:*}"   # text before the first ':'
    log_dir="${log_entry#*:}"        # text after the first ':'
    if [[ "$entry_mount" == "$mount_point" && -d "$log_dir" ]]; then
      matches+=("$log_dir")
    fi
  done

  # Join with a single space regardless of the caller's IFS
  local IFS=' '
  printf '%s\n' "${matches[*]}"
}
# Clean old log files in a directory
# Usage: result=$(clean_log_files "/path/to/log/dir" "max_age_days")
# Outputs: exactly one line "<files_deleted>:<bytes_freed>" on stdout.
#          Log messages are written to the log file and to stderr so they do
#          not end up in the value captured by the caller.
#
# Deletes regular files below the directory that match find's
# "-mtime +max_age_days" (default 7), then removes empty sub-directories.
# The log directory itself is never removed, even when it ends up empty.
clean_log_files() {
  local log_dir="$1"
  local max_age_days="${2:-7}"
  local files_deleted=0
  local space_freed=0
  local file file_size

  if [[ ! -d "$log_dir" ]]; then
    log_message "CLEANUP" "Log directory '$log_dir' not found. Skipping." >&2
    echo "0:0"
    return
  fi

  # Find and delete old log files (NUL-delimited to survive odd file names)
  while IFS= read -r -d '' file; do
    [[ -f "$file" ]] || continue
    file_size=$(stat -c%s "$file" 2>/dev/null)
    [[ "$file_size" =~ ^[0-9]+$ ]] || file_size=0
    if rm -f -- "$file" 2>/dev/null; then
      files_deleted=$((files_deleted + 1))
      space_freed=$((space_freed + file_size))
    fi
  done < <(find "$log_dir" -type f -mtime "+${max_age_days}" -print0 2>/dev/null)

  # Also clean empty directories; -mindepth 1 protects $log_dir itself
  find "$log_dir" -mindepth 1 -type d -empty -delete 2>/dev/null

  log_message "CLEANUP" "Deleted $files_deleted files from '$log_dir', freed $((space_freed / 1024)) KB" >&2
  echo "${files_deleted}:${space_freed}"
}
# Automatic disk cleanup function
# Usage: auto_cleanup "mount_point" "target_free_percentage"
# Returns: 0 if at least one byte was freed, 1 otherwise (cleanup disabled,
#          no log directory configured for the mount point, or nothing freed)
#
# Runs clean_log_files (with MAX_LOG_FILE_AGE_DAYS) on every directory that
# find_log_dirs_on_mount reports for the mount point and sums the results.
# Only the LAST line of each clean_log_files result is parsed as
# "<files>:<bytes>": log_message also prints to stdout, so log lines (whose
# timestamps contain ':') may precede the summary in the captured text.
# target_free_percentage is accepted for interface compatibility; the visible
# code does not use it to decide anything.
auto_cleanup() {
  local mount_point="$1"
  # shellcheck disable=SC2034  # kept for callers; currently unused
  local target_free_percentage="${2:-5}"
  local log_dirs log_dir result summary files freed
  local total_freed=0
  local total_files=0

  if [[ "$AUTO_CLEANUP_ENABLED" != "true" ]]; then
    log_message "CLEANUP" "Auto-cleanup is disabled. Skipping."
    return 1
  fi

  log_dirs=$(find_log_dirs_on_mount "$mount_point")
  if [[ -z "$log_dirs" ]]; then
    log_message "CLEANUP" "No log directories configured for mount point '$mount_point'. Skipping cleanup."
    return 1
  fi

  log_message "CLEANUP" "Starting auto-cleanup for mount point '$mount_point'. Log dirs: $log_dirs"

  # Intentionally unquoted: find_log_dirs_on_mount returns a space-separated list
  for log_dir in $log_dirs; do
    result=$(clean_log_files "$log_dir" "$MAX_LOG_FILE_AGE_DAYS")
    summary="${result##*$'\n'}"
    files="${summary%%:*}"
    freed="${summary#*:}"
    if ! [[ "$files" =~ ^[0-9]+$ && "$freed" =~ ^[0-9]+$ ]]; then
      log_message "CLEANUP" "WARNING: Unexpected cleanup result for '$log_dir': '${summary}'. Ignoring."
      continue
    fi
    total_files=$((total_files + files))
    total_freed=$((total_freed + freed))
  done

  log_message "CLEANUP" "Cleanup complete. Total files deleted: $total_files, Total space freed: $((total_freed / 1024)) KB"

  if [[ "$total_freed" -gt 0 ]]; then
    return 0
  fi
  return 1
}
# Check disk space and perform auto-cleanup if needed
# Usage: check_and_cleanup_disk "directory" "threshold"
# Returns: 0 if usage is at or below the threshold, or if auto_cleanup
#          succeeded (even when usage is still above the threshold afterwards);
#          1 if usage could not be determined or auto_cleanup failed.
#
# The usage value is validated as a plain integer before any numeric test.
# A non-numeric value (e.g. "-") would otherwise make the tests error out and
# the function fall through to "return 0", reporting a healthy disk.
# A usage of 0 is treated as "unknown" because get_disk_usage_percentage
# prints 0 for a missing directory.
check_and_cleanup_disk() {
  local dir="$1"
  local threshold="${2:-85}"
  local usage mount_point new_usage

  usage=$(get_disk_usage_percentage "$dir")
  mount_point=$(get_mount_point "$dir")

  if ! [[ "$usage" =~ ^[0-9]+$ ]] || [ "$usage" -eq 0 ]; then
    log_message "CLEANUP" "Could not determine disk usage for '$dir'. Skipping."
    return 1
  fi

  if [ "$usage" -le "$threshold" ]; then
    return 0
  fi

  log_message "CLEANUP" "Disk usage ${usage}% exceeds threshold ${threshold}% for '$dir'. Attempting cleanup..."
  if ! auto_cleanup "$mount_point" "$MIN_FREE_SPACE_AFTER_CLEANUP"; then
    log_message "CLEANUP" "Cleanup failed or no files to clean for '$dir'"
    return 1
  fi

  new_usage=$(get_disk_usage_percentage "$dir")
  log_message "CLEANUP" "After cleanup, disk usage is ${new_usage}%"
  if [[ "$new_usage" =~ ^[0-9]+$ ]] && [ "$new_usage" -gt "$threshold" ]; then
    log_message "CLEANUP" "Cleanup completed but usage ${new_usage}% still above threshold"
  fi
  return 0
}

111
hana_log_segments.sh Normal file
View File

@@ -0,0 +1,111 @@
#!/bin/bash
# =============================================================================
# SAP HANA Log Segment Monitoring Script
#
# Counts log segments per state (from M_LOG_SEGMENTS joined with M_SERVICES)
# and alerts when the share of 'Truncated' segments is above
# TRUNCATED_PERCENTAGE_THRESHOLD or the share of 'Free' segments is below
# FREE_PERCENTAGE_THRESHOLD.
# =============================================================================
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"
SCRIPT_NAME="hana_log_segments"

# Load configuration
source "${SCRIPT_DIR}/hana.conf"
source "${SCRIPT_DIR}/hana_lib.sh"

# Acquire lock
LOCK_FILE=$(acquire_lock "$SCRIPT_NAME")
if [ $? -ne 0 ]; then
  exit 1
fi
trap 'release_lock "$LOCK_FILE"' EXIT

log_message "$SCRIPT_NAME" "Starting log segment check..."

# SQL Query for log segments
SQL_QUERY="SELECT b.host, b.service_name, a.state, count(*) FROM PUBLIC.M_LOG_SEGMENTS a JOIN PUBLIC.M_SERVICES b ON (a.host = b.host AND a.port = b.port) GROUP BY b.host, b.service_name, a.state;"

# Check if hdbsql is available
if [ ! -x "$HDBSQL_PATH" ]; then
  log_message "$SCRIPT_NAME" "ERROR: hdbsql not found or not executable at ${HDBSQL_PATH}"
  send_notification_if_changed "$SCRIPT_NAME" "hana_hdbsql_path" "HANA Monitor Error" "hdbsql not found or not executable at ${HDBSQL_PATH}" "true" "HDBSQL_ERROR"
  exit 1
fi

# Execute SQL query as HANA user.
# The output is captured with a command substitution so that $? really is the
# exit status of the su/hdbsql command. (After "readarray < <(cmd)", $? is the
# status of readarray, so a failing hdbsql would go unnoticed.)
sql_raw=$(su - "$HANA_USER" -c "$HDBSQL_PATH -U $HANA_USER_KEY -c \";\" \"$SQL_QUERY\"" 2>&1)
sql_status=$?
if [ $sql_status -ne 0 ]; then
  error_message="$sql_raw"
  log_message "$SCRIPT_NAME" "ERROR: The hdbsql command failed. Details: ${error_message}"
  send_notification_if_changed "$SCRIPT_NAME" "hana_hdbsql_command" "HANA Monitor Error" "The hdbsql command failed. Details: ${error_message}" "true" "HDBSQL_COMMAND_FAILED"
  exit 1
fi
# Clear any previous command error state so a later failure is reported again
send_notification_if_changed "$SCRIPT_NAME" "hana_hdbsql_command" "HANA Monitor Error" "The hdbsql command is working again." "false" "OK"

readarray -t sql_output <<<"$sql_raw"

# Parse SQL output
total_segments=0
truncated_segments=0
free_segments=0

for line in "${sql_output[@]}"; do
  # Skip empty lines
  if [[ -z "$line" ]]; then
    continue
  fi

  cleaned_line=$(echo "$line" | tr -d '"')
  state=$(echo "$cleaned_line" | awk -F',' '{print $3}' | xargs) # Trim whitespace
  count=$(echo "$cleaned_line" | awk -F',' '{print $4}' | xargs) # Trim whitespace

  # Only rows whose 4th column is a plain number are data rows. This is also
  # what filters out the header and footer lines; matching on the substring
  # "host" instead would discard every data row of a server whose host name
  # contains "host".
  if ! [[ "$count" =~ ^[0-9]+$ ]]; then
    continue
  fi

  total_segments=$((total_segments + count))
  if [[ "$state" == "Truncated" ]]; then
    truncated_segments=$((truncated_segments + count))
  elif [[ "$state" == "Free" ]]; then
    free_segments=$((free_segments + count))
  fi
done

log_message "$SCRIPT_NAME" "Total Segments: ${total_segments}"
log_message "$SCRIPT_NAME" "Truncated Segments: ${truncated_segments}"
log_message "$SCRIPT_NAME" "Free Segments: ${free_segments}"

if [ $total_segments -eq 0 ]; then
  log_message "$SCRIPT_NAME" "WARNING: No log segments found. Skipping percentage checks."
  send_notification_if_changed "$SCRIPT_NAME" "hana_log_segments_total" "HANA Log Segment Warning" "No log segments found. Skipping percentage checks." "true" "NO_LOG_SEGMENTS"
else
  send_notification_if_changed "$SCRIPT_NAME" "hana_log_segments_total" "HANA Log Segment" "Log segments found." "false" "OK"

  # Percentages use integer arithmetic (rounded down); total_segments is
  # known to be non-zero in this branch.
  truncated_percentage=$((truncated_segments * 100 / total_segments))
  if [ $truncated_percentage -gt $TRUNCATED_PERCENTAGE_THRESHOLD ]; then
    log_message "$SCRIPT_NAME" "ALERT: ${truncated_percentage}% of log segments are 'Truncated'."
    send_notification_if_changed "$SCRIPT_NAME" "hana_log_truncated" "HANA Log Segment" "${truncated_percentage}% of HANA log segments are in 'Truncated' state." "true" "${truncated_percentage}%"
  else
    send_notification_if_changed "$SCRIPT_NAME" "hana_log_truncated" "HANA Log Segment" "${truncated_percentage}% of HANA log segments are in 'Truncated' state (below threshold)." "false" "OK"
  fi

  free_percentage=$((free_segments * 100 / total_segments))
  if [ $free_percentage -lt $FREE_PERCENTAGE_THRESHOLD ]; then
    log_message "$SCRIPT_NAME" "ALERT: Only ${free_percentage}% of log segments are 'Free'."
    send_notification_if_changed "$SCRIPT_NAME" "hana_log_free" "HANA Log Segment" "Only ${free_percentage}% of HANA log segments are in 'Free' state." "true" "${free_percentage}%"
  else
    send_notification_if_changed "$SCRIPT_NAME" "hana_log_free" "HANA Log Segment" "Only ${free_percentage}% of HANA log segments are in 'Free' state (above threshold)." "false" "OK"
  fi
fi

log_message "$SCRIPT_NAME" "Log segment check complete."

64
hana_processes.sh Normal file
View File

@@ -0,0 +1,64 @@
#!/bin/bash
# =============================================================================
# SAP HANA Process Monitoring Script
#
# Runs "sapcontrol -function GetProcessList" as the HANA user and alerts when
# any listed process is not GREEN, or when no process data is returned.
# =============================================================================
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"
SCRIPT_NAME="hana_processes"

# Load configuration
source "${SCRIPT_DIR}/hana.conf"
source "${SCRIPT_DIR}/hana_lib.sh"

# Acquire lock
LOCK_FILE=$(acquire_lock "$SCRIPT_NAME")
if [ $? -ne 0 ]; then
  exit 1
fi
trap 'release_lock "$LOCK_FILE"' EXIT

log_message "$SCRIPT_NAME" "Starting HANA process status check..."

# Check if sapcontrol is available
if [ ! -x "$SAPCONTROL_PATH" ]; then
  log_message "$SCRIPT_NAME" "ERROR: sapcontrol not found or not executable at ${SAPCONTROL_PATH}"
  send_notification_if_changed "$SCRIPT_NAME" "hana_sapcontrol_path" "HANA Monitor Error" "sapcontrol not found or not executable at ${SAPCONTROL_PATH}" "true" "SAPCONTROL_ERROR"
  exit 1
fi

# Get process list
process_list=$(su - "$HANA_USER" -c "${SAPCONTROL_PATH} -nr ${HANA_INSTANCE_NR} -function GetProcessList" 2>&1)
sapcontrol_status=$?

# sapcontrol exit codes for GetProcessList:
#   0 = call successful
#   3 = call successful, all processes running correctly
#   4 = call successful, all processes stopped
# Anything else (1 = call failed, 2 = timeout, ...) means the process list
# could not be obtained. Treating every non-zero code as a failure would flag
# a perfectly healthy instance (exit code 3) as a sapcontrol error.
case "$sapcontrol_status" in
  0 | 3 | 4) ;;
  *)
    log_message "$SCRIPT_NAME" "ERROR: sapcontrol command failed with exit code ${sapcontrol_status}"
    send_notification_if_changed "$SCRIPT_NAME" "hana_sapcontrol_command" "HANA Monitor Error" "sapcontrol command failed. Exit code: ${sapcontrol_status}" "true" "SAPCONTROL_COMMAND_FAILED"
    exit 1
    ;;
esac

# Clear any previous sapcontrol error state
send_notification_if_changed "$SCRIPT_NAME" "hana_sapcontrol_command" "HANA Process" "sapcontrol command successful." "false" "OK"

# Check for non-GREEN processes (skip header lines)
non_green_processes=$(echo "$process_list" | tail -n +6 | grep -v 'GREEN' | grep -v '^$')

if [ -n "$non_green_processes" ]; then
  log_message "$SCRIPT_NAME" "ALERT: One or more HANA processes are not running!"
  log_message "$SCRIPT_NAME" "Problem processes: ${non_green_processes}"

  # Build the change-detection value from "name=dispstatus" pairs only
  # (columns 1 and 3 of each comma-separated row). The raw rows also carry the
  # elapsed time, which differs on every run and would therefore trigger a
  # fresh notification each time the check runs.
  problem_signature=$(awk -F', *' 'NF >= 3 { printf "%s%s=%s", sep, $1, $3; sep = "," }' <<<"$non_green_processes")

  send_notification_if_changed "$SCRIPT_NAME" "hana_processes" "HANA Process" "One or more HANA processes are not GREEN. Problem processes: ${non_green_processes}" "true" "PROCESS_ALERT:${problem_signature}"
  exit 1
else
  # Verify we actually got process data
  green_processes=$(echo "$process_list" | tail -n +6 | grep 'GREEN')
  if [ -z "$green_processes" ]; then
    log_message "$SCRIPT_NAME" "WARNING: No process data found. SAP HANA may not be running."
    send_notification_if_changed "$SCRIPT_NAME" "hana_processes" "HANA Process" "No process data found. SAP HANA may not be running." "true" "NO_PROCESS_DATA"
    exit 1
  fi

  send_notification_if_changed "$SCRIPT_NAME" "hana_processes" "HANA Process" "All HANA processes are GREEN." "false" "OK"
  log_message "$SCRIPT_NAME" "SUCCESS: All HANA processes are GREEN."
fi

log_message "$SCRIPT_NAME" "Process check complete."

82
hana_queue.sh Normal file
View File

@@ -0,0 +1,82 @@
#!/bin/bash
# =============================================================================
# SAP HANA Statement Queue Monitoring Script
#
# Counts SqlExecutor threads in state 'Queueing' and alerts once the count has
# been above STATEMENT_QUEUE_THRESHOLD for STATEMENT_QUEUE_CONSECUTIVE_RUNS
# consecutive runs. The consecutive-run counter is kept in the state directory
# between runs.
# =============================================================================

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"
SCRIPT_NAME="hana_queue"

# Shared settings and helper functions
. "${SCRIPT_DIR}/hana.conf"
. "${SCRIPT_DIR}/hana_lib.sh"

# Single-instance guard; the EXIT trap drops the lock on every exit path below
if ! LOCK_FILE=$(acquire_lock "$SCRIPT_NAME"); then
  exit 1
fi
trap 'release_lock "$LOCK_FILE"' EXIT

log_message "$SCRIPT_NAME" "Starting statement queue check..."

# Number of SQL executor threads currently waiting in the queue
STATEMENT_QUEUE_SQL="SELECT COUNT(*) FROM M_SERVICE_THREADS WHERE THREAD_TYPE = 'SqlExecutor' AND THREAD_STATE = 'Queueing';"

# Nothing can be checked without a usable hdbsql binary
if [[ ! -x "$HDBSQL_PATH" ]]; then
  log_message "$SCRIPT_NAME" "ERROR: hdbsql not found or not executable at ${HDBSQL_PATH}"
  send_notification_if_changed "$SCRIPT_NAME" "hana_hdbsql_path_queue" "HANA Monitor Error" "hdbsql not found or not executable at ${HDBSQL_PATH}" "true" "HDBSQL_ERROR"
  exit 1
fi

# Run the query as the HANA OS user; stderr is folded into the captured output
query_output=$(su - "$HANA_USER" -c "$HDBSQL_PATH -U $HANA_USER_KEY -j -a -x \"$STATEMENT_QUEUE_SQL\"" 2>&1)
query_rc=$?
if [[ "$query_rc" -ne 0 ]]; then
  log_message "$SCRIPT_NAME" "ERROR: Failed to execute queue query. Exit code: ${query_rc}"
  send_notification_if_changed "$SCRIPT_NAME" "hana_queue_query_error" "HANA Queue Error" "Failed to execute queue query. Exit code: ${query_rc}" "true" "QUERY_ERROR"
  exit 1
fi

# The query worked, so any earlier query-error alert is resolved
send_notification_if_changed "$SCRIPT_NAME" "hana_queue_query_error" "HANA Statement Queue" "Queue query successful." "false" "OK"

# Strip quotes; xargs trims and collapses surrounding whitespace
queued=$(tr -d '"' <<<"$query_output" | xargs)

if [[ "$queued" =~ ^[0-9]+$ ]]; then
  # A usable number came back: clear any earlier "check failed" state
  send_notification_if_changed "$SCRIPT_NAME" "hana_statement_queue_check_fail" "HANA Monitor Warning" "Statement queue check is working." "false" "OK"
  log_message "$SCRIPT_NAME" "Current statement queue length: ${queued}"

  # Consecutive-breach counter carried over from the previous run (0 if none)
  breaches=$(get_state "statement_queue_breach_count")
  breaches=${breaches:-0}

  if [ "$queued" -gt "$STATEMENT_QUEUE_THRESHOLD" ]; then
    breaches=$((breaches + 1))
    log_message "$SCRIPT_NAME" "Statement queue is above threshold (${queued} > ${STATEMENT_QUEUE_THRESHOLD}). Consecutive breach count: ${breaches}/${STATEMENT_QUEUE_CONSECUTIVE_RUNS}."
  else
    if [ "$breaches" -gt 0 ]; then
      log_message "$SCRIPT_NAME" "Statement queue returned to normal. Resetting breach count from ${breaches} to 0."
    fi
    breaches=0
  fi
  set_state "statement_queue_breach_count" "$breaches"

  # Alert only after enough consecutive breaches
  if [ "$breaches" -ge "$STATEMENT_QUEUE_CONSECUTIVE_RUNS" ]; then
    status_text="Statement queue has been over ${STATEMENT_QUEUE_THRESHOLD} for ${breaches} checks. Current count: ${queued}."
    send_notification_if_changed "$SCRIPT_NAME" "hana_statement_queue_status" "HANA Statement Queue" "$status_text" "true" "ALERT:${queued}"
  else
    status_text="Statement queue is normal. Current count: ${queued}."
    send_notification_if_changed "$SCRIPT_NAME" "hana_statement_queue_status" "HANA Statement Queue" "$status_text" "false" "OK"
  fi
else
  # The query ran but did not return a plain number
  log_message "$SCRIPT_NAME" "WARNING: Could not retrieve HANA statement queue count. Got: '${queued}'. Skipping check."
  send_notification_if_changed "$SCRIPT_NAME" "hana_statement_queue_check_fail" "HANA Monitor Warning" "Could not retrieve statement queue count. Got: '${queued}'" "true" "QUEUE_CHECK_FAIL"
fi

log_message "$SCRIPT_NAME" "Statement queue check complete."

134
sld_watchdog.sh Normal file
View File

@@ -0,0 +1,134 @@
#!/bin/bash
#
# sld_watchdog.sh - Monitors SLD service health and restarts if needed
#
# Probes SLD_URL over HTTPS and restarts the sapb1servertools service when the
# endpoint does not answer with HTTP 200 or 401.
#

# Directory this script lives in, and the name used for logs, lock and state
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
SCRIPT_NAME="sld_watchdog"

# Shared settings and helper functions
. "$SCRIPT_DIR/hana.conf"
. "$SCRIPT_DIR/hana_lib.sh"

# SLD-specific settings; a non-empty value already present in the environment
# (or set by hana.conf) takes precedence over the default
: "${SLD_URL:=https://localhost:40000/sld/sld0100.svc}"
: "${SLD_TIMEOUT:=5}"
# NOTE(review): not referenced anywhere in the visible part of this script;
# the lock actually used is the one returned by acquire_lock below.
SLD_LOCK_FILE="/tmp/hana_sld_watchdog.lock"

# Single-instance guard. Unlike the other monitors, this script exits with
# status 0 when the lock cannot be acquired.
if ! LOCK_FILE=$(acquire_lock "$SCRIPT_NAME"); then
  exit 0
fi
trap 'release_lock "$LOCK_FILE"' EXIT
# Probe SLD_URL once and print the HTTP status code.
# Prints "0" when curl yields no status or "000" (no HTTP response received).
# Certificate verification is disabled (-k); SLD_TIMEOUT is used both as the
# connect timeout and as the overall time limit.
check_sld_health() {
  local code
  code=$(curl -k -s -o /dev/null -w "%{http_code}" -m "$SLD_TIMEOUT" --connect-timeout "$SLD_TIMEOUT" "$SLD_URL" 2>/dev/null)

  case "$code" in
    "" | 000) echo "0" ;;
    *) echo "$code" ;;
  esac
}
# Restart the sapb1servertools service.
# Uses systemctl when it is available, otherwise falls back to the "service"
# command.
# Returns: 0 when the systemctl restart succeeded, 1 when it failed; in the
#          fallback case, the exit status of the service command.
restart_sld_service() {
  log_message "$SCRIPT_NAME" "Attempting to restart SLD service..."

  # No systemctl on this host: use the service wrapper instead
  if ! command -v systemctl &> /dev/null; then
    log_message "$SCRIPT_NAME" "systemctl not available, trying alternative restart methods"
    service sapb1servertools restart 2>&1
    return $?
  fi

  local rc=0
  systemctl restart sapb1servertools 2>&1 || rc=$?
  if [ "$rc" -ne 0 ]; then
    log_message "$SCRIPT_NAME" "Service restart failed with exit code ${rc}"
    return 1
  fi

  log_message "$SCRIPT_NAME" "Service restart command executed successfully"
  return 0
}
# Main monitoring logic
# Checks the SLD endpoint; if it is not healthy, sends an alert, restarts the
# service, waits 15 seconds and checks again.
# Returns: 0 when the service is healthy (initially or after the restart),
#          1 when the restart failed or the service did not recover.
#
# State keys used with send_notification_if_changed:
#   sld_status          - set to OK on every healthy check
#   sld_down            - SLD_DOWN / RECOVERY_FAILED while the service is down
#   sld_restart_failed  - RESTART_FAILED when the restart command failed
# The healthy path resets sld_down and sld_restart_failed as well. Without
# that, a service that comes back on a later run would leave sld_down at
# SLD_DOWN, and the next outage would be seen as "unchanged" and not reported.
main() {
  log_message "$SCRIPT_NAME" "Checking SLD service health at ${SLD_URL}..."

  local http_status
  http_status=$(check_sld_health)

  # 200 OK or 401 Unauthorized indicate the Tomcat layer is actively processing
  if [[ $http_status == 200 || $http_status == 401 ]]; then
    log_message "$SCRIPT_NAME" "[OK] SLD service healthy (HTTP Status: $http_status)"
    send_notification_if_changed "$SCRIPT_NAME" "sld_status" "SLD Service" "SLD service is healthy (HTTP: $http_status)" "false" "OK"
    # Resolve alerts left over from an earlier run
    send_notification_if_changed "$SCRIPT_NAME" "sld_down" "SLD Service" \
      "SLD service is healthy (HTTP: $http_status)" "false" "OK"
    send_notification_if_changed "$SCRIPT_NAME" "sld_restart_failed" "SLD Service" \
      "Service recovered successfully" "false" "OK"
    return 0
  fi

  # Service is down or unresponsive
  local status_detail
  if [ "$http_status" == "0" ]; then
    status_detail="Connection failed or timeout"
  else
    status_detail="HTTP Status: ${http_status}"
  fi
  log_message "$SCRIPT_NAME" "[ALERT] SLD dead or hung (${status_detail}). Restarting service..."

  # Send notification
  send_notification_if_changed "$SCRIPT_NAME" "sld_down" "SLD Service Critical" \
    "SLD service is down (${status_detail}). Restarting ${SLD_URL}" "true" "SLD_DOWN"

  # Restart the service
  if ! restart_sld_service; then
    log_message "$SCRIPT_NAME" "[CRITICAL] Failed to restart SLD service"
    send_notification_if_changed "$SCRIPT_NAME" "sld_restart_failed" "SLD Service Critical" \
      "Failed to restart SLD service" "true" "RESTART_FAILED"
    return 1
  fi

  # Allow service to spin up, then log recovery status
  log_message "$SCRIPT_NAME" "Waiting 15 seconds for service to restart..."
  sleep 15

  local recovery_status
  recovery_status=$(check_sld_health)
  if [[ $recovery_status == 200 || $recovery_status == 401 ]]; then
    log_message "$SCRIPT_NAME" "[RECOVERY] SLD service recovered successfully (HTTP Status: $recovery_status)"
    send_notification_if_changed "$SCRIPT_NAME" "sld_down" "SLD Service" \
      "SLD service recovered (HTTP: $recovery_status)" "false" "OK"
    send_notification_if_changed "$SCRIPT_NAME" "sld_restart_failed" "SLD Service" \
      "Service recovered successfully" "false" "OK"
    return 0
  fi

  local recovery_detail
  if [ "$recovery_status" == "0" ]; then
    recovery_detail="Connection failed after restart"
  else
    recovery_detail="HTTP Status: $recovery_status"
  fi
  log_message "$SCRIPT_NAME" "[CRITICAL] SLD service failed to recover after restart (${recovery_detail})"
  send_notification_if_changed "$SCRIPT_NAME" "sld_down" "SLD Service Critical" \
    "SLD service FAILED to recover after restart (${recovery_detail})" "true" "RECOVERY_FAILED"
  return 1
}
# Entry point: run the check, log completion, and exit with main's status
main
watchdog_rc=$?
log_message "$SCRIPT_NAME" "SLD watchdog check complete."
exit "$watchdog_rc"