Compare commits
4 Commits
e7c4142294
...
cf5b81889d
| Author | SHA1 | Date | |
|---|---|---|---|
| cf5b81889d | |||
| 5a92bc4e93 | |||
| 8e0ba1878f | |||
| a5553067b6 |
19
hana.conf
19
hana.conf
@@ -28,25 +28,6 @@ DIRECTORIES_TO_MONITOR=(
|
|||||||
"/usr/sap"
|
"/usr/sap"
|
||||||
)
|
)
|
||||||
|
|
||||||
# --- Log Directories for Auto-Cleanup ---
|
|
||||||
# These directories will be automatically cleaned when disk space is low
|
|
||||||
# Format: "mount_point:log_directory_path"
|
|
||||||
# The script will check if a monitored directory is on the same mount point
|
|
||||||
# as a log directory and can clean the log directory to free up space
|
|
||||||
LOG_DIRS_FOR_CLEANUP=(
|
|
||||||
"/hana/log:/hana/log"
|
|
||||||
"/usr/sap:/usr/sap/trans/log"
|
|
||||||
"/usr/sap:/usr/sap/hostctrl/work/log"
|
|
||||||
)
|
|
||||||
|
|
||||||
# --- Disk Cleanup Configuration ---
|
|
||||||
# Minimum free space percentage to maintain after cleanup
|
|
||||||
MIN_FREE_SPACE_AFTER_CLEANUP=5
|
|
||||||
# Maximum age of log files to delete (in days)
|
|
||||||
MAX_LOG_FILE_AGE_DAYS=7
|
|
||||||
# Enable automatic cleanup when disk usage exceeds threshold
|
|
||||||
AUTO_CLEANUP_ENABLED=true
|
|
||||||
|
|
||||||
# --- Thresholds ---
|
# --- Thresholds ---
|
||||||
DISK_USAGE_THRESHOLD=85
|
DISK_USAGE_THRESHOLD=85
|
||||||
TRUNCATED_PERCENTAGE_THRESHOLD=50
|
TRUNCATED_PERCENTAGE_THRESHOLD=50
|
||||||
|
|||||||
@@ -12,31 +12,30 @@ source "${SCRIPT_DIR}/hana.conf"
|
|||||||
source "${SCRIPT_DIR}/hana_lib.sh"
|
source "${SCRIPT_DIR}/hana_lib.sh"
|
||||||
|
|
||||||
# Acquire lock
|
# Acquire lock
|
||||||
LOCK_FILE=$(acquire_lock "$SCRIPT_NAME")
|
if ! acquire_lock "$SCRIPT_NAME"; then
|
||||||
if [ $? -ne 0 ]; then
|
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
trap 'release_lock "$LOCK_FILE"' EXIT
|
trap 'release_lock "$SCRIPT_NAME"' EXIT
|
||||||
|
|
||||||
log_message "$SCRIPT_NAME" "Starting backup status check..."
|
log_message "$SCRIPT_NAME" "Starting backup status check..."
|
||||||
|
|
||||||
# Check if hdbsql is available
|
# Check if hdbsql is available
|
||||||
if [ ! -x "$HDBSQL_PATH" ]; then
|
if [ ! -x "$HDBSQL_PATH" ]; then
|
||||||
log_message "$SCRIPT_NAME" "ERROR: hdbsql not found or not executable at ${HDBSQL_PATH}"
|
log_message "$SCRIPT_NAME" "ERROR: hdbsql not found or not executable at ${HDBSQL_PATH}"
|
||||||
send_notification_if_changed "$SCRIPT_NAME" "hana_hdbsql_path_backup" "HANA Monitor Error" "hdbsql not found or not executable at ${HDBSQL_PATH}" "true" "HDBSQL_ERROR"
|
send_alert "$SCRIPT_NAME" "HANA Monitor Error" "hdbsql not found or not executable at ${HDBSQL_PATH}"
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# SQL Query for last successful backup
|
# SQL Query for last successful backup
|
||||||
BACKUP_SQL="SELECT TOP 1 SYS_START_TIME FROM M_BACKUP_CATALOG WHERE ENTRY_TYPE_NAME = 'complete data backup' AND STATE_NAME = 'successful' ORDER BY SYS_START_TIME DESC"
|
BACKUP_SQL="SELECT TOP 1 SYS_START_TIME FROM M_BACKUP_CATALOG WHERE ENTRY_TYPE_NAME = 'complete data backup' AND STATE_NAME = 'successful' ORDER BY SYS_START_TIME DESC"
|
||||||
|
|
||||||
# Execute SQL query as HANA user with improved error handling
|
# Execute SQL query
|
||||||
backup_result=$(su - "$HANA_USER" -c "$HDBSQL_PATH -U $HANA_USER_KEY -j -a -x \"$BACKUP_SQL\"" 2>&1)
|
backup_result=$(execute_hana_sql "$BACKUP_SQL")
|
||||||
sql_status=$?
|
sql_status=$?
|
||||||
|
|
||||||
if [ $sql_status -ne 0 ]; then
|
if [ $sql_status -ne 0 ]; then
|
||||||
log_message "$SCRIPT_NAME" "ERROR: Failed to execute backup query. Exit code: ${sql_status}"
|
log_message "$SCRIPT_NAME" "ERROR: Failed to execute backup query. Exit code: ${sql_status}"
|
||||||
send_notification_if_changed "$SCRIPT_NAME" "hana_backup_query_error" "HANA Backup Error" "Failed to execute backup query. Exit code: ${sql_status}" "true" "QUERY_ERROR"
|
send_alert "$SCRIPT_NAME" "HANA Backup Error" "Failed to execute backup query. Exit code: ${sql_status}"
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
@@ -45,18 +44,15 @@ last_backup_date=$(echo "$backup_result" | tr -d '"' | sed 's/\..*//')
|
|||||||
if [[ -z "$last_backup_date" || "$last_backup_date" == *"error"* || "$last_backup_date" == *"Error"* ]]; then
|
if [[ -z "$last_backup_date" || "$last_backup_date" == *"error"* || "$last_backup_date" == *"Error"* ]]; then
|
||||||
message="No successful complete data backup found for ${COMPANY_NAME} HANA."
|
message="No successful complete data backup found for ${COMPANY_NAME} HANA."
|
||||||
log_message "$SCRIPT_NAME" "CRITICAL: ${message}"
|
log_message "$SCRIPT_NAME" "CRITICAL: ${message}"
|
||||||
send_notification_if_changed "$SCRIPT_NAME" "hana_backup_status" "HANA Backup" "$message" "true" "NO_BACKUP"
|
send_alert "$SCRIPT_NAME" "HANA Backup" "$message"
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Clear any previous query error state
|
|
||||||
send_notification_if_changed "$SCRIPT_NAME" "hana_backup_query_error" "HANA Backup" "Backup query successful." "false" "OK"
|
|
||||||
|
|
||||||
# Calculate backup age
|
# Calculate backup age
|
||||||
last_backup_epoch=$(date -d "$last_backup_date" +%s 2>/dev/null)
|
last_backup_epoch=$(date -d "$last_backup_date" +%s 2>/dev/null)
|
||||||
if [ $? -ne 0 ]; then
|
if [ $? -ne 0 ]; then
|
||||||
log_message "$SCRIPT_NAME" "ERROR: Failed to parse backup date: ${last_backup_date}"
|
log_message "$SCRIPT_NAME" "ERROR: Failed to parse backup date: ${last_backup_date}"
|
||||||
send_notification_if_changed "$SCRIPT_NAME" "hana_backup_status" "HANA Backup" "Failed to parse backup date: ${last_backup_date}" "true" "DATE_PARSE_ERROR"
|
send_alert "$SCRIPT_NAME" "HANA Backup" "Failed to parse backup date: ${last_backup_date}"
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
@@ -73,11 +69,11 @@ fi
|
|||||||
if [ $age_seconds -gt $threshold_seconds ]; then
|
if [ $age_seconds -gt $threshold_seconds ]; then
|
||||||
message="Last successful HANA backup for ${COMPANY_NAME} is ${age_hours} hours old, which exceeds the threshold of ${BACKUP_THRESHOLD_HOURS} hours. Last backup was on: ${last_backup_date}."
|
message="Last successful HANA backup for ${COMPANY_NAME} is ${age_hours} hours old, which exceeds the threshold of ${BACKUP_THRESHOLD_HOURS} hours. Last backup was on: ${last_backup_date}."
|
||||||
log_message "$SCRIPT_NAME" "CRITICAL: ${message}"
|
log_message "$SCRIPT_NAME" "CRITICAL: ${message}"
|
||||||
send_notification_if_changed "$SCRIPT_NAME" "hana_backup_status" "HANA Backup" "$message" "true" "${age_hours}h"
|
send_alert "$SCRIPT_NAME" "HANA Backup" "$message"
|
||||||
|
exit 1
|
||||||
else
|
else
|
||||||
message="Last successful backup is ${age_hours} hours old (Threshold: ${BACKUP_THRESHOLD_HOURS} hours)."
|
message="Last successful backup is ${age_hours} hours old (Threshold: ${BACKUP_THRESHOLD_HOURS} hours)."
|
||||||
log_message "$SCRIPT_NAME" "SUCCESS: ${message}"
|
log_message "$SCRIPT_NAME" "SUCCESS: ${message}"
|
||||||
send_notification_if_changed "$SCRIPT_NAME" "hana_backup_status" "HANA Backup" "$message" "false" "OK"
|
|
||||||
fi
|
fi
|
||||||
|
|
||||||
log_message "$SCRIPT_NAME" "Backup check complete."
|
log_message "$SCRIPT_NAME" "Backup check complete."
|
||||||
|
|||||||
32
hana_disk.sh
32
hana_disk.sh
@@ -12,18 +12,16 @@ source "${SCRIPT_DIR}/hana.conf"
|
|||||||
source "${SCRIPT_DIR}/hana_lib.sh"
|
source "${SCRIPT_DIR}/hana_lib.sh"
|
||||||
|
|
||||||
# Acquire lock
|
# Acquire lock
|
||||||
LOCK_FILE=$(acquire_lock "$SCRIPT_NAME")
|
if ! acquire_lock "$SCRIPT_NAME"; then
|
||||||
if [ $? -ne 0 ]; then
|
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
trap 'release_lock "$LOCK_FILE"' EXIT
|
trap 'release_lock "$SCRIPT_NAME"' EXIT
|
||||||
|
|
||||||
log_message "$SCRIPT_NAME" "Starting disk usage check..."
|
log_message "$SCRIPT_NAME" "Starting disk usage check..."
|
||||||
|
|
||||||
# Track overall status
|
# Track overall status
|
||||||
ALERT_COUNT=0
|
ALERT_COUNT=0
|
||||||
TOTAL_DIRS=0
|
TOTAL_DIRS=0
|
||||||
CLEANUP_PERFORMED=0
|
|
||||||
|
|
||||||
for dir in "${DIRECTORIES_TO_MONITOR[@]}"; do
|
for dir in "${DIRECTORIES_TO_MONITOR[@]}"; do
|
||||||
TOTAL_DIRS=$((TOTAL_DIRS + 1))
|
TOTAL_DIRS=$((TOTAL_DIRS + 1))
|
||||||
@@ -31,7 +29,7 @@ for dir in "${DIRECTORIES_TO_MONITOR[@]}"; do
|
|||||||
# Check if directory exists
|
# Check if directory exists
|
||||||
if [ ! -d "$dir" ]; then
|
if [ ! -d "$dir" ]; then
|
||||||
log_message "$SCRIPT_NAME" "WARNING: Directory '$dir' not found. Skipping."
|
log_message "$SCRIPT_NAME" "WARNING: Directory '$dir' not found. Skipping."
|
||||||
send_notification_if_changed "$SCRIPT_NAME" "disk_dir_not_found_${dir//\//_}" "HANA Disk Warning" "Directory '$dir' not found." "true" "DIR_NOT_FOUND"
|
send_alert "$SCRIPT_NAME" "HANA Disk Warning" "Directory '$dir' not found."
|
||||||
ALERT_COUNT=$((ALERT_COUNT + 1))
|
ALERT_COUNT=$((ALERT_COUNT + 1))
|
||||||
continue
|
continue
|
||||||
fi
|
fi
|
||||||
@@ -49,33 +47,15 @@ for dir in "${DIRECTORIES_TO_MONITOR[@]}"; do
|
|||||||
# Check if usage exceeds threshold
|
# Check if usage exceeds threshold
|
||||||
if [ "$usage" -gt "$DISK_USAGE_THRESHOLD" ]; then
|
if [ "$usage" -gt "$DISK_USAGE_THRESHOLD" ]; then
|
||||||
log_message "$SCRIPT_NAME" "ALERT: ${dir} usage is at ${usage}% which is above the ${DISK_USAGE_THRESHOLD}% threshold."
|
log_message "$SCRIPT_NAME" "ALERT: ${dir} usage is at ${usage}% which is above the ${DISK_USAGE_THRESHOLD}% threshold."
|
||||||
|
send_alert "$SCRIPT_NAME" "HANA Disk" "Disk usage for ${dir} is at ${usage}% (threshold: ${DISK_USAGE_THRESHOLD}%)."
|
||||||
# Attempt auto-cleanup if enabled
|
|
||||||
if [ "$AUTO_CLEANUP_ENABLED" == "true" ]; then
|
|
||||||
log_message "$SCRIPT_NAME" "Attempting auto-cleanup for '${dir}'..."
|
|
||||||
mount_point=$(get_mount_point "$dir")
|
|
||||||
|
|
||||||
if auto_cleanup "$mount_point" "$MIN_FREE_SPACE_AFTER_CLEANUP"; then
|
|
||||||
CLEANUP_PERFORMED=$((CLEANUP_PERFORMED + 1))
|
|
||||||
new_usage=$(get_disk_usage_percentage "$dir")
|
|
||||||
log_message "$SCRIPT_NAME" "After cleanup, ${dir} usage is at ${new_usage}%"
|
|
||||||
usage=$new_usage
|
|
||||||
else
|
|
||||||
log_message "$SCRIPT_NAME" "Auto-cleanup failed or no files to clean for '${dir}'"
|
|
||||||
fi
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Send notification with final usage after cleanup attempt
|
|
||||||
send_notification_if_changed "$SCRIPT_NAME" "disk_usage_${dir//\//_}" "HANA Disk" "Disk usage for ${dir} is at ${usage}%." "true" "${usage}%"
|
|
||||||
ALERT_COUNT=$((ALERT_COUNT + 1))
|
ALERT_COUNT=$((ALERT_COUNT + 1))
|
||||||
else
|
else
|
||||||
# Send OK notification only if state changed from alert
|
log_message "$SCRIPT_NAME" "OK: ${dir} usage is at ${usage}% (below threshold)."
|
||||||
send_notification_if_changed "$SCRIPT_NAME" "disk_usage_${dir//\//_}" "HANA Disk" "Disk usage for ${dir} is at ${usage}% (below threshold)." "false" "OK"
|
|
||||||
fi
|
fi
|
||||||
done
|
done
|
||||||
|
|
||||||
# Summary logging
|
# Summary logging
|
||||||
log_message "$SCRIPT_NAME" "Disk check complete. Total: ${TOTAL_DIRS} dirs, ${ALERT_COUNT} alerts, ${CLEANUP_PERFORMED} cleanups performed."
|
log_message "$SCRIPT_NAME" "Disk check complete. Total: ${TOTAL_DIRS} dirs, ${ALERT_COUNT} alerts."
|
||||||
|
|
||||||
# Exit with status based on alerts
|
# Exit with status based on alerts
|
||||||
if [ "$ALERT_COUNT" -gt 0 ]; then
|
if [ "$ALERT_COUNT" -gt 0 ]; then
|
||||||
|
|||||||
271
hana_lib.sh
271
hana_lib.sh
@@ -25,83 +25,55 @@ acquire_lock() {
|
|||||||
fi
|
fi
|
||||||
|
|
||||||
touch "$lock_file"
|
touch "$lock_file"
|
||||||
echo "$lock_file"
|
|
||||||
return 0
|
return 0
|
||||||
}
|
}
|
||||||
|
|
||||||
# Release lock
|
# Release lock
|
||||||
# Usage: release_lock "LOCK_FILE"
|
# Usage: release_lock "SCRIPT_NAME"
|
||||||
release_lock() {
|
release_lock() {
|
||||||
local lock_file="$1"
|
local script_name="$1"
|
||||||
if [ -n "$lock_file" ] && [ -f "$lock_file" ]; then
|
local lock_file="${LOCK_DIR}/hana_${script_name}.lock"
|
||||||
|
if [ -f "$lock_file" ]; then
|
||||||
rm -f "$lock_file"
|
rm -f "$lock_file"
|
||||||
fi
|
fi
|
||||||
}
|
}
|
||||||
|
|
||||||
# Get state value
|
# Send notification via ntfy
|
||||||
# Usage: get_state "KEY"
|
# Usage: send_notification "TITLE" "MESSAGE"
|
||||||
get_state() {
|
send_notification() {
|
||||||
local key="$1"
|
local title="$1"
|
||||||
if [ -f "${STATE_DIR}/${key}.state" ]; then
|
local message="$2"
|
||||||
cat "${STATE_DIR}/${key}.state"
|
|
||||||
else
|
|
||||||
echo ""
|
|
||||||
fi
|
|
||||||
}
|
|
||||||
|
|
||||||
# Set state value
|
|
||||||
# Usage: set_state "KEY" "VALUE"
|
|
||||||
set_state() {
|
|
||||||
local key="$1"
|
|
||||||
local value="$2"
|
|
||||||
echo "$value" > "${STATE_DIR}/${key}.state"
|
|
||||||
}
|
|
||||||
|
|
||||||
# Send notification if state changed
|
|
||||||
# Usage: send_notification_if_changed "SCRIPT_NAME" "ALERT_KEY" "TITLE_PREFIX" "MESSAGE" "IS_ALERT" "CURRENT_VALUE"
|
|
||||||
send_notification_if_changed() {
|
|
||||||
local script_name="$1"
|
|
||||||
local alert_key="$2"
|
|
||||||
local title_prefix="$3"
|
|
||||||
local current_message="$4"
|
|
||||||
local is_alert_condition="$5"
|
|
||||||
local current_value="$6"
|
|
||||||
local hostname=$(hostname)
|
local hostname=$(hostname)
|
||||||
|
local full_message="[${COMPANY_NAME} | ${hostname}] ${message}"
|
||||||
|
|
||||||
local previous_value=$(get_state "$alert_key")
|
if [ -n "$NTFY_TOKEN" ] && [ -n "$NTFY_TOPIC_URL" ]; then
|
||||||
|
curl -H "Authorization: Bearer ${NTFY_TOKEN}" -H "Title: ${title}" -d "${full_message}" "${NTFY_TOPIC_URL}" > /dev/null 2>&1
|
||||||
if [ "$current_value" != "$previous_value" ]; then
|
log_message "NOTIFY" "Notification sent: ${title}"
|
||||||
local full_title=""
|
else
|
||||||
local full_message=""
|
log_message "NOTIFY" "Ntfy not configured, skipping notification"
|
||||||
|
|
||||||
if [ "$is_alert_condition" == "true" ]; then
|
|
||||||
full_title="${title_prefix} Alert"
|
|
||||||
full_message="🚨 Critical: ${current_message}"
|
|
||||||
log_message "$script_name" "ALERT: ${full_message}"
|
|
||||||
else
|
|
||||||
if [ -n "$previous_value" ] && [ "$previous_value" != "OK" ]; then
|
|
||||||
full_title="${title_prefix} Resolved"
|
|
||||||
full_message="✅ Resolved: ${current_message}"
|
|
||||||
log_message "$script_name" "RESOLVED: ${full_message}"
|
|
||||||
else
|
|
||||||
set_state "$alert_key" "$current_value"
|
|
||||||
return
|
|
||||||
fi
|
|
||||||
fi
|
|
||||||
|
|
||||||
local final_message="[${COMPANY_NAME} | ${hostname}] ${full_message}"
|
|
||||||
|
|
||||||
if [ -n "$NTFY_TOKEN" ] && [ -n "$NTFY_TOPIC_URL" ]; then
|
|
||||||
curl -H "Authorization: Bearer ${NTFY_TOKEN}" -H "Title: ${full_title}" -d "${final_message}" "${NTFY_TOPIC_URL}" > /dev/null 2>&1
|
|
||||||
log_message "$script_name" "Notification sent: ${full_title}"
|
|
||||||
else
|
|
||||||
log_message "$script_name" "Ntfy not configured, skipping notification"
|
|
||||||
fi
|
|
||||||
|
|
||||||
set_state "$alert_key" "$current_value"
|
|
||||||
fi
|
fi
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Send alert notification
|
||||||
|
# Usage: send_alert "SCRIPT_NAME" "TITLE_PREFIX" "MESSAGE"
|
||||||
|
send_alert() {
|
||||||
|
local script_name="$1"
|
||||||
|
local title_prefix="$2"
|
||||||
|
local message="$3"
|
||||||
|
send_notification "${title_prefix} Alert" "🚨 Critical: ${message}"
|
||||||
|
log_message "$script_name" "ALERT: ${message}"
|
||||||
|
}
|
||||||
|
|
||||||
|
# Send OK notification (state change from alert to normal)
|
||||||
|
# Usage: send_ok "SCRIPT_NAME" "TITLE_PREFIX" "MESSAGE"
|
||||||
|
send_ok() {
|
||||||
|
local script_name="$1"
|
||||||
|
local title_prefix="$2"
|
||||||
|
local message="$3"
|
||||||
|
send_notification "${title_prefix} Resolved" "✅ Resolved: ${message}"
|
||||||
|
log_message "$script_name" "RESOLVED: ${message}"
|
||||||
|
}
|
||||||
|
|
||||||
# Run command as HANA user using su
|
# Run command as HANA user using su
|
||||||
# Usage: run_as_hana_user "COMMAND"
|
# Usage: run_as_hana_user "COMMAND"
|
||||||
run_as_hana_user() {
|
run_as_hana_user() {
|
||||||
@@ -109,6 +81,45 @@ run_as_hana_user() {
|
|||||||
su - "$HANA_USER" -c "$command"
|
su - "$HANA_USER" -c "$command"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Execute SQL query as HANA user
|
||||||
|
# Usage: execute_hana_sql "SQL_QUERY"
|
||||||
|
# Returns: SQL output on stdout, returns 0 on success, 1 on failure
|
||||||
|
execute_hana_sql() {
|
||||||
|
local sql_query="$1"
|
||||||
|
local output
|
||||||
|
|
||||||
|
output=$(su - "$HANA_USER" -c "$HDBSQL_PATH -U $HANA_USER_KEY -j -a -x \"$sql_query\"" 2>&1)
|
||||||
|
local sql_status=$?
|
||||||
|
|
||||||
|
if [ $sql_status -ne 0 ]; then
|
||||||
|
log_message "SQL" "ERROR: Failed to execute SQL query. Exit code: ${sql_status}"
|
||||||
|
echo "$output" >&2
|
||||||
|
return 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo "$output"
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
|
||||||
|
# Execute SQL query and return result (for single-value queries)
|
||||||
|
# Usage: execute_hana_sql_query "SQL_QUERY"
|
||||||
|
# Returns: Query result on stdout, returns 0 on success, 1 on failure
|
||||||
|
execute_hana_sql_query() {
|
||||||
|
local sql_query="$1"
|
||||||
|
local output
|
||||||
|
|
||||||
|
output=$(execute_hana_sql "$sql_query")
|
||||||
|
local sql_status=$?
|
||||||
|
|
||||||
|
if [ $sql_status -ne 0 ]; then
|
||||||
|
return 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Clean output: remove quotes and whitespace
|
||||||
|
echo "$output" | tr -d '"' | xargs
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
|
||||||
# Get disk usage percentage for a directory
|
# Get disk usage percentage for a directory
|
||||||
# Usage: get_disk_usage_percentage "/path/to/dir"
|
# Usage: get_disk_usage_percentage "/path/to/dir"
|
||||||
# Returns: Usage percentage as integer (without % sign)
|
# Returns: Usage percentage as integer (without % sign)
|
||||||
@@ -136,137 +147,3 @@ get_available_space_kb() {
|
|||||||
local dir="$1"
|
local dir="$1"
|
||||||
df -k "$dir" 2>/dev/null | awk 'NR==2 {print $4}'
|
df -k "$dir" 2>/dev/null | awk 'NR==2 {print $4}'
|
||||||
}
|
}
|
||||||
|
|
||||||
# Find log directories on the same mount point
|
|
||||||
# Usage: find_log_dirs_on_mount "mount_point"
|
|
||||||
# Returns: Space-separated list of log directories
|
|
||||||
find_log_dirs_on_mount() {
|
|
||||||
local mount_point="$1"
|
|
||||||
local result=""
|
|
||||||
|
|
||||||
for log_entry in "${LOG_DIRS_FOR_CLEANUP[@]}"; do
|
|
||||||
local entry_mount="${log_entry%%:*}"
|
|
||||||
local log_dir="${log_entry#*:}"
|
|
||||||
|
|
||||||
if [ "$entry_mount" == "$mount_point" ] && [ -d "$log_dir" ]; then
|
|
||||||
if [ -n "$result" ]; then
|
|
||||||
result="$result $log_dir"
|
|
||||||
else
|
|
||||||
result="$log_dir"
|
|
||||||
fi
|
|
||||||
fi
|
|
||||||
done
|
|
||||||
|
|
||||||
echo "$result"
|
|
||||||
}
|
|
||||||
|
|
||||||
# Clean old log files in a directory
|
|
||||||
# Usage: clean_log_files "/path/to/log/dir" "max_age_days"
|
|
||||||
# Returns: Number of files deleted and space freed
|
|
||||||
clean_log_files() {
|
|
||||||
local log_dir="$1"
|
|
||||||
local max_age_days="${2:-7}"
|
|
||||||
local files_deleted=0
|
|
||||||
local space_freed=0
|
|
||||||
|
|
||||||
if [ ! -d "$log_dir" ]; then
|
|
||||||
log_message "CLEANUP" "Log directory '$log_dir' not found. Skipping."
|
|
||||||
echo "0:0"
|
|
||||||
return
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Find and delete old log files
|
|
||||||
while IFS= read -r -d '' file; do
|
|
||||||
if [ -f "$file" ]; then
|
|
||||||
local file_size=$(stat -c%s "$file" 2>/dev/null || echo "0")
|
|
||||||
rm -f "$file" 2>/dev/null && {
|
|
||||||
files_deleted=$((files_deleted + 1))
|
|
||||||
space_freed=$((space_freed + file_size))
|
|
||||||
}
|
|
||||||
fi
|
|
||||||
done < <(find "$log_dir" -type f -mtime +$max_age_days -print0 2>/dev/null)
|
|
||||||
|
|
||||||
# Also clean empty directories
|
|
||||||
find "$log_dir" -type d -empty -delete 2>/dev/null
|
|
||||||
|
|
||||||
log_message "CLEANUP" "Deleted $files_deleted files from '$log_dir', freed $((space_freed / 1024)) KB"
|
|
||||||
echo "${files_deleted}:${space_freed}"
|
|
||||||
}
|
|
||||||
|
|
||||||
# Automatic disk cleanup function
|
|
||||||
# Usage: auto_cleanup "mount_point" "target_free_percentage"
|
|
||||||
# Returns: 0 if cleanup successful, 1 if failed or not needed
|
|
||||||
auto_cleanup() {
|
|
||||||
local mount_point="$1"
|
|
||||||
local target_free_percentage="${2:-5}"
|
|
||||||
|
|
||||||
if [ "$AUTO_CLEANUP_ENABLED" != "true" ]; then
|
|
||||||
log_message "CLEANUP" "Auto-cleanup is disabled. Skipping."
|
|
||||||
return 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
local log_dirs=$(find_log_dirs_on_mount "$mount_point")
|
|
||||||
|
|
||||||
if [ -z "$log_dirs" ]; then
|
|
||||||
log_message "CLEANUP" "No log directories configured for mount point '$mount_point'. Skipping cleanup."
|
|
||||||
return 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
log_message "CLEANUP" "Starting auto-cleanup for mount point '$mount_point'. Log dirs: $log_dirs"
|
|
||||||
|
|
||||||
local total_freed=0
|
|
||||||
local total_files=0
|
|
||||||
|
|
||||||
for log_dir in $log_dirs; do
|
|
||||||
local result=$(clean_log_files "$log_dir" "$MAX_LOG_FILE_AGE_DAYS")
|
|
||||||
local files="${result%%:*}"
|
|
||||||
local freed="${result#*:}"
|
|
||||||
total_files=$((total_files + files))
|
|
||||||
total_freed=$((total_freed + freed))
|
|
||||||
done
|
|
||||||
|
|
||||||
log_message "CLEANUP" "Cleanup complete. Total files deleted: $total_files, Total space freed: $((total_freed / 1024)) KB"
|
|
||||||
|
|
||||||
if [ $total_freed -gt 0 ]; then
|
|
||||||
return 0
|
|
||||||
else
|
|
||||||
return 1
|
|
||||||
fi
|
|
||||||
}
|
|
||||||
|
|
||||||
# Check disk space and perform auto-cleanup if needed
|
|
||||||
# Usage: check_and_cleanup_disk "directory" "threshold"
|
|
||||||
# Returns: 0 if OK or cleanup successful, 1 if critical and cleanup failed
|
|
||||||
check_and_cleanup_disk() {
|
|
||||||
local dir="$1"
|
|
||||||
local threshold="${2:-85}"
|
|
||||||
|
|
||||||
local usage=$(get_disk_usage_percentage "$dir")
|
|
||||||
local mount_point=$(get_mount_point "$dir")
|
|
||||||
|
|
||||||
if [ -z "$usage" ] || [ "$usage" -eq 0 ]; then
|
|
||||||
log_message "CLEANUP" "Could not determine disk usage for '$dir'. Skipping."
|
|
||||||
return 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [ "$usage" -gt "$threshold" ]; then
|
|
||||||
log_message "CLEANUP" "Disk usage ${usage}% exceeds threshold ${threshold}% for '$dir'. Attempting cleanup..."
|
|
||||||
|
|
||||||
if auto_cleanup "$mount_point" "$MIN_FREE_SPACE_AFTER_CLEANUP"; then
|
|
||||||
local new_usage=$(get_disk_usage_percentage "$dir")
|
|
||||||
log_message "CLEANUP" "After cleanup, disk usage is ${new_usage}%"
|
|
||||||
|
|
||||||
if [ "$new_usage" -le "$threshold" ]; then
|
|
||||||
return 0
|
|
||||||
else
|
|
||||||
log_message "CLEANUP" "Cleanup completed but usage ${new_usage}% still above threshold"
|
|
||||||
return 0
|
|
||||||
fi
|
|
||||||
else
|
|
||||||
log_message "CLEANUP" "Cleanup failed or no files to clean for '$dir'"
|
|
||||||
return 1
|
|
||||||
fi
|
|
||||||
fi
|
|
||||||
|
|
||||||
return 0
|
|
||||||
}
|
|
||||||
|
|||||||
@@ -12,32 +12,30 @@ source "${SCRIPT_DIR}/hana.conf"
|
|||||||
source "${SCRIPT_DIR}/hana_lib.sh"
|
source "${SCRIPT_DIR}/hana_lib.sh"
|
||||||
|
|
||||||
# Acquire lock
|
# Acquire lock
|
||||||
LOCK_FILE=$(acquire_lock "$SCRIPT_NAME")
|
if ! acquire_lock "$SCRIPT_NAME"; then
|
||||||
if [ $? -ne 0 ]; then
|
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
trap 'release_lock "$LOCK_FILE"' EXIT
|
trap 'release_lock "$SCRIPT_NAME"' EXIT
|
||||||
|
|
||||||
log_message "$SCRIPT_NAME" "Starting log segment check..."
|
log_message "$SCRIPT_NAME" "Starting log segment check..."
|
||||||
|
|
||||||
# SQL Query for log segments
|
|
||||||
SQL_QUERY="SELECT b.host, b.service_name, a.state, count(*) FROM PUBLIC.M_LOG_SEGMENTS a JOIN PUBLIC.M_SERVICES b ON (a.host = b.host AND a.port = b.port) GROUP BY b.host, b.service_name, a.state;"
|
|
||||||
|
|
||||||
# Check if hdbsql is available
|
# Check if hdbsql is available
|
||||||
if [ ! -x "$HDBSQL_PATH" ]; then
|
if [ ! -x "$HDBSQL_PATH" ]; then
|
||||||
log_message "$SCRIPT_NAME" "ERROR: hdbsql not found or not executable at ${HDBSQL_PATH}"
|
log_message "$SCRIPT_NAME" "ERROR: hdbsql not found or not executable at ${HDBSQL_PATH}"
|
||||||
send_notification_if_changed "$SCRIPT_NAME" "hana_hdbsql_path" "HANA Monitor Error" "hdbsql not found or not executable at ${HDBSQL_PATH}" "true" "HDBSQL_ERROR"
|
send_alert "$SCRIPT_NAME" "HANA Monitor Error" "hdbsql not found or not executable at ${HDBSQL_PATH}"
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Execute SQL query as HANA user with improved error handling
|
# SQL Query for log segments
|
||||||
readarray -t sql_output < <(su - "$HANA_USER" -c "$HDBSQL_PATH -U $HANA_USER_KEY -c \";\" \"$SQL_QUERY\"" 2>&1)
|
SQL_QUERY="SELECT b.host, b.service_name, a.state, count(*) FROM PUBLIC.M_LOG_SEGMENTS a JOIN PUBLIC.M_SERVICES b ON (a.host = b.host AND a.port = b.port) GROUP BY b.host, b.service_name, a.state;"
|
||||||
|
|
||||||
|
# Execute SQL query
|
||||||
|
sql_output=$(execute_hana_sql "$SQL_QUERY")
|
||||||
sql_status=$?
|
sql_status=$?
|
||||||
|
|
||||||
if [ $sql_status -ne 0 ]; then
|
if [ $sql_status -ne 0 ]; then
|
||||||
error_message=$(printf '%s\n' "${sql_output[@]}")
|
log_message "$SCRIPT_NAME" "ERROR: The hdbsql command failed."
|
||||||
log_message "$SCRIPT_NAME" "ERROR: The hdbsql command failed. Details: ${error_message}"
|
send_alert "$SCRIPT_NAME" "HANA Monitor Error" "The hdbsql command failed."
|
||||||
send_notification_if_changed "$SCRIPT_NAME" "hana_hdbsql_command" "HANA Monitor Error" "The hdbsql command failed. Details: ${error_message}" "true" "HDBSQL_COMMAND_FAILED"
|
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
@@ -46,15 +44,15 @@ total_segments=0
|
|||||||
truncated_segments=0
|
truncated_segments=0
|
||||||
free_segments=0
|
free_segments=0
|
||||||
|
|
||||||
for line in "${sql_output[@]}"; do
|
while IFS= read -r line; do
|
||||||
# Skip empty lines and header
|
# Skip empty lines and header
|
||||||
if [[ -z "$line" || "$line" == *"STATE"* || "$line" == *"host"* ]]; then
|
if [[ -z "$line" || "$line" == *"STATE"* || "$line" == *"host"* ]]; then
|
||||||
continue
|
continue
|
||||||
fi
|
fi
|
||||||
|
|
||||||
cleaned_line=$(echo "$line" | tr -d '"')
|
cleaned_line=$(echo "$line" | tr -d '"')
|
||||||
state=$(echo "$cleaned_line" | awk -F',' '{print $3}' | xargs) # Trim whitespace
|
state=$(echo "$cleaned_line" | awk -F',' '{print $3}' | xargs)
|
||||||
count=$(echo "$cleaned_line" | awk -F',' '{print $4}' | xargs) # Trim whitespace
|
count=$(echo "$cleaned_line" | awk -F',' '{print $4}' | xargs)
|
||||||
|
|
||||||
# Validate count is a number
|
# Validate count is a number
|
||||||
if ! [[ "$count" =~ ^[0-9]+$ ]]; then
|
if ! [[ "$count" =~ ^[0-9]+$ ]]; then
|
||||||
@@ -67,7 +65,7 @@ for line in "${sql_output[@]}"; do
|
|||||||
elif [[ "$state" == "Free" ]]; then
|
elif [[ "$state" == "Free" ]]; then
|
||||||
free_segments=$((free_segments + count))
|
free_segments=$((free_segments + count))
|
||||||
fi
|
fi
|
||||||
done
|
done <<< "$sql_output"
|
||||||
|
|
||||||
log_message "$SCRIPT_NAME" "Total Segments: ${total_segments}"
|
log_message "$SCRIPT_NAME" "Total Segments: ${total_segments}"
|
||||||
log_message "$SCRIPT_NAME" "Truncated Segments: ${truncated_segments}"
|
log_message "$SCRIPT_NAME" "Truncated Segments: ${truncated_segments}"
|
||||||
@@ -75,37 +73,24 @@ log_message "$SCRIPT_NAME" "Free Segments: ${free_segments}"
|
|||||||
|
|
||||||
if [ $total_segments -eq 0 ]; then
|
if [ $total_segments -eq 0 ]; then
|
||||||
log_message "$SCRIPT_NAME" "WARNING: No log segments found. Skipping percentage checks."
|
log_message "$SCRIPT_NAME" "WARNING: No log segments found. Skipping percentage checks."
|
||||||
send_notification_if_changed "$SCRIPT_NAME" "hana_log_segments_total" "HANA Log Segment Warning" "No log segments found. Skipping percentage checks." "true" "NO_LOG_SEGMENTS"
|
send_alert "$SCRIPT_NAME" "HANA Log Segment Warning" "No log segments found."
|
||||||
else
|
exit 1
|
||||||
send_notification_if_changed "$SCRIPT_NAME" "hana_log_segments_total" "HANA Log Segment" "Log segments found." "false" "OK"
|
fi
|
||||||
|
|
||||||
# Calculate truncated percentage with integer arithmetic
|
# Calculate truncated percentage with integer arithmetic
|
||||||
if [ $total_segments -gt 0 ]; then
|
truncated_percentage=$((truncated_segments * 100 / total_segments))
|
||||||
truncated_percentage=$((truncated_segments * 100 / total_segments))
|
|
||||||
else
|
if [ $truncated_percentage -gt $TRUNCATED_PERCENTAGE_THRESHOLD ]; then
|
||||||
truncated_percentage=0
|
log_message "$SCRIPT_NAME" "ALERT: ${truncated_percentage}% of log segments are 'Truncated'."
|
||||||
fi
|
send_alert "$SCRIPT_NAME" "HANA Log Segment" "${truncated_percentage}% of HANA log segments are in 'Truncated' state."
|
||||||
|
fi
|
||||||
if [ $truncated_percentage -gt $TRUNCATED_PERCENTAGE_THRESHOLD ]; then
|
|
||||||
log_message "$SCRIPT_NAME" "ALERT: ${truncated_percentage}% of log segments are 'Truncated'."
|
# Calculate free percentage with integer arithmetic
|
||||||
send_notification_if_changed "$SCRIPT_NAME" "hana_log_truncated" "HANA Log Segment" "${truncated_percentage}% of HANA log segments are in 'Truncated' state." "true" "${truncated_percentage}%"
|
free_percentage=$((free_segments * 100 / total_segments))
|
||||||
else
|
|
||||||
send_notification_if_changed "$SCRIPT_NAME" "hana_log_truncated" "HANA Log Segment" "${truncated_percentage}% of HANA log segments are in 'Truncated' state (below threshold)." "false" "OK"
|
if [ $free_percentage -lt $FREE_PERCENTAGE_THRESHOLD ]; then
|
||||||
fi
|
log_message "$SCRIPT_NAME" "ALERT: Only ${free_percentage}% of log segments are 'Free'."
|
||||||
|
send_alert "$SCRIPT_NAME" "HANA Log Segment" "Only ${free_percentage}% of HANA log segments are in 'Free' state."
|
||||||
# Calculate free percentage with integer arithmetic
|
|
||||||
if [ $total_segments -gt 0 ]; then
|
|
||||||
free_percentage=$((free_segments * 100 / total_segments))
|
|
||||||
else
|
|
||||||
free_percentage=0
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [ $free_percentage -lt $FREE_PERCENTAGE_THRESHOLD ]; then
|
|
||||||
log_message "$SCRIPT_NAME" "ALERT: Only ${free_percentage}% of log segments are 'Free'."
|
|
||||||
send_notification_if_changed "$SCRIPT_NAME" "hana_log_free" "HANA Log Segment" "Only ${free_percentage}% of HANA log segments are in 'Free' state." "true" "${free_percentage}%"
|
|
||||||
else
|
|
||||||
send_notification_if_changed "$SCRIPT_NAME" "hana_log_free" "HANA Log Segment" "Only ${free_percentage}% of HANA log segments are in 'Free' state (above threshold)." "false" "OK"
|
|
||||||
fi
|
|
||||||
fi
|
fi
|
||||||
|
|
||||||
log_message "$SCRIPT_NAME" "Log segment check complete."
|
log_message "$SCRIPT_NAME" "Log segment check complete."
|
||||||
|
|||||||
@@ -12,53 +12,47 @@ source "${SCRIPT_DIR}/hana.conf"
|
|||||||
source "${SCRIPT_DIR}/hana_lib.sh"
|
source "${SCRIPT_DIR}/hana_lib.sh"
|
||||||
|
|
||||||
# Acquire lock
|
# Acquire lock
|
||||||
LOCK_FILE=$(acquire_lock "$SCRIPT_NAME")
|
if ! acquire_lock "$SCRIPT_NAME"; then
|
||||||
if [ $? -ne 0 ]; then
|
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
trap 'release_lock "$LOCK_FILE"' EXIT
|
trap 'release_lock "$SCRIPT_NAME"' EXIT
|
||||||
|
|
||||||
log_message "$SCRIPT_NAME" "Starting HANA process status check..."
|
log_message "$SCRIPT_NAME" "Starting HANA process status check..."
|
||||||
|
|
||||||
# Check if sapcontrol is available
|
# Check if sapcontrol is available
|
||||||
if [ ! -x "$SAPCONTROL_PATH" ]; then
|
if [ ! -x "$SAPCONTROL_PATH" ]; then
|
||||||
log_message "$SCRIPT_NAME" "ERROR: sapcontrol not found or not executable at ${SAPCONTROL_PATH}"
|
log_message "$SCRIPT_NAME" "ERROR: sapcontrol not found or not executable at ${SAPCONTROL_PATH}"
|
||||||
send_notification_if_changed "$SCRIPT_NAME" "hana_sapcontrol_path" "HANA Monitor Error" "sapcontrol not found or not executable at ${SAPCONTROL_PATH}" "true" "SAPCONTROL_ERROR"
|
send_alert "$SCRIPT_NAME" "HANA Monitor Error" "sapcontrol not found or not executable at ${SAPCONTROL_PATH}"
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Get process list with improved error handling
|
# Get process list
|
||||||
process_list=$(su - "$HANA_USER" -c "${SAPCONTROL_PATH} -nr ${HANA_INSTANCE_NR} -function GetProcessList" 2>&1)
|
process_list=$(su - "$HANA_USER" -c "${SAPCONTROL_PATH} -nr ${HANA_INSTANCE_NR} -function GetProcessList" 2>&1)
|
||||||
sapcontrol_status=$?
|
sapcontrol_status=$?
|
||||||
|
|
||||||
if [ $sapcontrol_status -ne 0 ]; then
|
if [ $sapcontrol_status -ne 0 ]; then
|
||||||
log_message "$SCRIPT_NAME" "ERROR: sapcontrol command failed with exit code ${sapcontrol_status}"
|
log_message "$SCRIPT_NAME" "ERROR: sapcontrol command failed with exit code ${sapcontrol_status}"
|
||||||
send_notification_if_changed "$SCRIPT_NAME" "hana_sapcontrol_command" "HANA Monitor Error" "sapcontrol command failed. Exit code: ${sapcontrol_status}" "true" "SAPCONTROL_COMMAND_FAILED"
|
send_alert "$SCRIPT_NAME" "HANA Monitor Error" "sapcontrol command failed. Exit code: ${sapcontrol_status}"
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Clear any previous sapcontrol error state
|
|
||||||
send_notification_if_changed "$SCRIPT_NAME" "hana_sapcontrol_command" "HANA Process" "sapcontrol command successful." "false" "OK"
|
|
||||||
|
|
||||||
# Check for non-GREEN processes (skip header lines)
|
# Check for non-GREEN processes (skip header lines)
|
||||||
non_green_processes=$(echo "$process_list" | tail -n +6 | grep -v 'GREEN' | grep -v '^$')
|
non_green_processes=$(echo "$process_list" | tail -n +6 | grep -v 'GREEN' | grep -v '^$')
|
||||||
|
|
||||||
if [ -n "$non_green_processes" ]; then
|
if [ -n "$non_green_processes" ]; then
|
||||||
log_message "$SCRIPT_NAME" "ALERT: One or more HANA processes are not running!"
|
log_message "$SCRIPT_NAME" "ALERT: One or more HANA processes are not running!"
|
||||||
log_message "$SCRIPT_NAME" "Problem processes: ${non_green_processes}"
|
log_message "$SCRIPT_NAME" "Problem processes: ${non_green_processes}"
|
||||||
send_notification_if_changed "$SCRIPT_NAME" "hana_processes" "HANA Process" "One or more HANA processes are not GREEN. Problem processes: ${non_green_processes}" "true" "PROCESS_ALERT:${non_green_processes}"
|
send_alert "$SCRIPT_NAME" "HANA Process" "One or more HANA processes are not GREEN. Problem processes: ${non_green_processes}"
|
||||||
exit 1
|
exit 1
|
||||||
else
|
|
||||||
# Verify we actually got process data
|
|
||||||
green_processes=$(echo "$process_list" | tail -n +6 | grep 'GREEN')
|
|
||||||
if [ -z "$green_processes" ]; then
|
|
||||||
log_message "$SCRIPT_NAME" "WARNING: No process data found. SAP HANA may not be running."
|
|
||||||
send_notification_if_changed "$SCRIPT_NAME" "hana_processes" "HANA Process" "No process data found. SAP HANA may not be running." "true" "NO_PROCESS_DATA"
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
send_notification_if_changed "$SCRIPT_NAME" "hana_processes" "HANA Process" "All HANA processes are GREEN." "false" "OK"
|
|
||||||
log_message "$SCRIPT_NAME" "SUCCESS: All HANA processes are GREEN."
|
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
# Verify we actually got process data
|
||||||
|
green_processes=$(echo "$process_list" | tail -n +6 | grep 'GREEN')
|
||||||
|
if [ -z "$green_processes" ]; then
|
||||||
|
log_message "$SCRIPT_NAME" "WARNING: No process data found. SAP HANA may not be running."
|
||||||
|
send_alert "$SCRIPT_NAME" "HANA Process" "No process data found. SAP HANA may not be running."
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
log_message "$SCRIPT_NAME" "SUCCESS: All HANA processes are GREEN."
|
||||||
log_message "$SCRIPT_NAME" "Process check complete."
|
log_message "$SCRIPT_NAME" "Process check complete."
|
||||||
|
|||||||
@@ -12,71 +12,66 @@ source "${SCRIPT_DIR}/hana.conf"
|
|||||||
source "${SCRIPT_DIR}/hana_lib.sh"
|
source "${SCRIPT_DIR}/hana_lib.sh"
|
||||||
|
|
||||||
# Acquire lock
|
# Acquire lock
|
||||||
LOCK_FILE=$(acquire_lock "$SCRIPT_NAME")
|
if ! acquire_lock "$SCRIPT_NAME"; then
|
||||||
if [ $? -ne 0 ]; then
|
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
trap 'release_lock "$LOCK_FILE"' EXIT
|
trap 'release_lock "$SCRIPT_NAME"' EXIT
|
||||||
|
|
||||||
log_message "$SCRIPT_NAME" "Starting statement queue check..."
|
log_message "$SCRIPT_NAME" "Starting statement queue check..."
|
||||||
|
|
||||||
# SQL Query for statement queue
|
|
||||||
STATEMENT_QUEUE_SQL="SELECT COUNT(*) FROM M_SERVICE_THREADS WHERE THREAD_TYPE = 'SqlExecutor' AND THREAD_STATE = 'Queueing';"
|
|
||||||
|
|
||||||
# Check if hdbsql is available
|
# Check if hdbsql is available
|
||||||
if [ ! -x "$HDBSQL_PATH" ]; then
|
if [ ! -x "$HDBSQL_PATH" ]; then
|
||||||
log_message "$SCRIPT_NAME" "ERROR: hdbsql not found or not executable at ${HDBSQL_PATH}"
|
log_message "$SCRIPT_NAME" "ERROR: hdbsql not found or not executable at ${HDBSQL_PATH}"
|
||||||
send_notification_if_changed "$SCRIPT_NAME" "hana_hdbsql_path_queue" "HANA Monitor Error" "hdbsql not found or not executable at ${HDBSQL_PATH}" "true" "HDBSQL_ERROR"
|
send_alert "$SCRIPT_NAME" "HANA Monitor Error" "hdbsql not found or not executable at ${HDBSQL_PATH}"
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Execute SQL query as HANA user with improved error handling
|
# SQL Query for statement queue
|
||||||
queue_result=$(su - "$HANA_USER" -c "$HDBSQL_PATH -U $HANA_USER_KEY -j -a -x \"$STATEMENT_QUEUE_SQL\"" 2>&1)
|
STATEMENT_QUEUE_SQL="SELECT COUNT(*) FROM M_SERVICE_THREADS WHERE THREAD_TYPE = 'SqlExecutor' AND THREAD_STATE = 'Queueing';"
|
||||||
|
|
||||||
|
# Execute SQL query
|
||||||
|
queue_result=$(execute_hana_sql_query "$STATEMENT_QUEUE_SQL")
|
||||||
sql_status=$?
|
sql_status=$?
|
||||||
|
|
||||||
if [ $sql_status -ne 0 ]; then
|
if [ $sql_status -ne 0 ]; then
|
||||||
log_message "$SCRIPT_NAME" "ERROR: Failed to execute queue query. Exit code: ${sql_status}"
|
log_message "$SCRIPT_NAME" "ERROR: Failed to execute queue query."
|
||||||
send_notification_if_changed "$SCRIPT_NAME" "hana_queue_query_error" "HANA Queue Error" "Failed to execute queue query. Exit code: ${sql_status}" "true" "QUERY_ERROR"
|
send_alert "$SCRIPT_NAME" "HANA Queue Error" "Failed to execute queue query."
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Clear any previous query error state
|
|
||||||
send_notification_if_changed "$SCRIPT_NAME" "hana_queue_query_error" "HANA Statement Queue" "Queue query successful." "false" "OK"
|
|
||||||
|
|
||||||
# Parse queue count
|
|
||||||
queue_count=$(echo "$queue_result" | tr -d '"' | xargs)
|
|
||||||
|
|
||||||
# Validate queue count is a number
|
# Validate queue count is a number
|
||||||
if ! [[ "$queue_count" =~ ^[0-9]+$ ]]; then
|
if ! [[ "$queue_count" =~ ^[0-9]+$ ]]; then
|
||||||
log_message "$SCRIPT_NAME" "WARNING: Could not retrieve HANA statement queue count. Got: '${queue_count}'. Skipping check."
|
log_message "$SCRIPT_NAME" "WARNING: Could not retrieve HANA statement queue count. Got: '${queue_count}'."
|
||||||
send_notification_if_changed "$SCRIPT_NAME" "hana_statement_queue_check_fail" "HANA Monitor Warning" "Could not retrieve statement queue count. Got: '${queue_count}'" "true" "QUEUE_CHECK_FAIL"
|
send_alert "$SCRIPT_NAME" "HANA Monitor Warning" "Could not retrieve statement queue count."
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
log_message "$SCRIPT_NAME" "Current statement queue length: ${queue_count}"
|
||||||
|
|
||||||
|
# Get breach count from state file
|
||||||
|
breach_count_file="${STATE_DIR}/statement_queue_breach_count"
|
||||||
|
breach_count=0
|
||||||
|
if [ -f "$breach_count_file" ]; then
|
||||||
|
breach_count=$(cat "$breach_count_file")
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [ "$queue_count" -gt "$STATEMENT_QUEUE_THRESHOLD" ]; then
|
||||||
|
breach_count=$((breach_count + 1))
|
||||||
|
log_message "$SCRIPT_NAME" "Statement queue is above threshold (${queue_count} > ${STATEMENT_QUEUE_THRESHOLD}). Consecutive breach count: ${breach_count}/${STATEMENT_QUEUE_CONSECUTIVE_RUNS}."
|
||||||
else
|
else
|
||||||
# Clear any previous check failure state
|
if [ "$breach_count" -gt 0 ]; then
|
||||||
send_notification_if_changed "$SCRIPT_NAME" "hana_statement_queue_check_fail" "HANA Monitor Warning" "Statement queue check is working." "false" "OK"
|
log_message "$SCRIPT_NAME" "Statement queue returned to normal. Resetting breach count from ${breach_count} to 0."
|
||||||
log_message "$SCRIPT_NAME" "Current statement queue length: ${queue_count}"
|
|
||||||
|
|
||||||
# Get breach count from state
|
|
||||||
breach_count=$(get_state "statement_queue_breach_count")
|
|
||||||
breach_count=${breach_count:-0}
|
|
||||||
|
|
||||||
if [ "$queue_count" -gt "$STATEMENT_QUEUE_THRESHOLD" ]; then
|
|
||||||
breach_count=$((breach_count + 1))
|
|
||||||
log_message "$SCRIPT_NAME" "Statement queue is above threshold (${queue_count} > ${STATEMENT_QUEUE_THRESHOLD}). Consecutive breach count: ${breach_count}/${STATEMENT_QUEUE_CONSECUTIVE_RUNS}."
|
|
||||||
else
|
|
||||||
if [ "$breach_count" -gt 0 ]; then
|
|
||||||
log_message "$SCRIPT_NAME" "Statement queue returned to normal. Resetting breach count from ${breach_count} to 0."
|
|
||||||
fi
|
|
||||||
breach_count=0
|
|
||||||
fi
|
fi
|
||||||
set_state "statement_queue_breach_count" "$breach_count"
|
breach_count=0
|
||||||
|
fi
|
||||||
|
echo "$breach_count" > "$breach_count_file"
|
||||||
|
|
||||||
if [ "$breach_count" -ge "$STATEMENT_QUEUE_CONSECUTIVE_RUNS" ]; then
|
if [ "$breach_count" -ge "$STATEMENT_QUEUE_CONSECUTIVE_RUNS" ]; then
|
||||||
message="Statement queue has been over ${STATEMENT_QUEUE_THRESHOLD} for ${breach_count} checks. Current count: ${queue_count}."
|
message="Statement queue has been over ${STATEMENT_QUEUE_THRESHOLD} for ${breach_count} checks. Current count: ${queue_count}."
|
||||||
send_notification_if_changed "$SCRIPT_NAME" "hana_statement_queue_status" "HANA Statement Queue" "$message" "true" "ALERT:${queue_count}"
|
send_alert "$SCRIPT_NAME" "HANA Statement Queue" "$message"
|
||||||
else
|
exit 1
|
||||||
message="Statement queue is normal. Current count: ${queue_count}."
|
else
|
||||||
send_notification_if_changed "$SCRIPT_NAME" "hana_statement_queue_status" "HANA Statement Queue" "$message" "false" "OK"
|
log_message "$SCRIPT_NAME" "Statement queue is normal. Current count: ${queue_count}."
|
||||||
fi
|
|
||||||
fi
|
fi
|
||||||
|
|
||||||
log_message "$SCRIPT_NAME" "Statement queue check complete."
|
log_message "$SCRIPT_NAME" "Statement queue check complete."
|
||||||
|
|||||||
277
install.sh
Normal file
277
install.sh
Normal file
@@ -0,0 +1,277 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
# =============================================================================
|
||||||
|
# SAP HANA Monitoring Scripts Installer
|
||||||
|
# Downloads and installs HANA monitoring scripts to /root/hana-scripts/
|
||||||
|
# Preserves existing configuration if hana.conf already exists
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
# Stop the installer as soon as an unguarded command fails.
set -e

# --- Installer settings -------------------------------------------------------
# Directory the monitoring scripts are installed into.
INSTALL_DIR="/root/hana-scripts"
# Base URL every file in FILES is fetched from.
REPO_URL="https://git.technopunk.space/tomi/hana-scripts/raw/branch/main"
# Timestamped directory that receives a copy of a previous installation.
BACKUP_DIR="/root/hana-scripts-backup-$(date +%Y%m%d_%H%M%S)"

# --- Files to install (hana.conf first; main() treats it specially) -----------
FILES=("hana.conf" "hana_lib.sh")
FILES+=("hana_disk.sh" "hana_backup.sh" "hana_log_segments.sh")
FILES+=("hana_processes.sh" "hana_queue.sh" "sld_watchdog.sh")

# --- ANSI colour sequences used by the log_* helpers --------------------------
NC='\033[0m' # No Color
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
|
||||||
|
|
||||||
|
# Logging functions
|
||||||
|
# Print an informational message to stdout, prefixed with a blue [INFO] tag.
# Globals:   BLUE, NC (read) - colour escape sequences
# Arguments: $1 - message text, printed verbatim
log_info() {
    # %b expands the escapes stored in the colour variables; %s keeps the
    # message literal, so backslashes in paths or URLs are not interpreted
    # the way `echo -e` would interpret them.
    printf '%b[INFO]%b %s\n' "${BLUE}" "${NC}" "$1"
}
|
||||||
|
|
||||||
|
# Print a success message to stdout, prefixed with a green [SUCCESS] tag.
# Globals:   GREEN, NC (read) - colour escape sequences
# Arguments: $1 - message text, printed verbatim
log_success() {
    # Only the colour variables go through %b; the message itself is emitted
    # with %s so embedded backslashes are never treated as escape sequences.
    printf '%b[SUCCESS]%b %s\n' "${GREEN}" "${NC}" "$1"
}
|
||||||
|
|
||||||
|
# Print a warning message to stdout, prefixed with a yellow [WARNING] tag.
# Globals:   YELLOW, NC (read) - colour escape sequences
# Arguments: $1 - message text, printed verbatim
log_warning() {
    # The colour variables are expanded with %b, the message with %s, so a
    # backslash inside the message is shown as-is instead of being interpreted.
    printf '%b[WARNING]%b %s\n' "${YELLOW}" "${NC}" "$1"
}
|
||||||
|
|
||||||
|
# Print an error message to stderr, prefixed with a red [ERROR] tag.
# Globals:   RED, NC (read) - colour escape sequences
# Arguments: $1 - message text, printed verbatim
log_error() {
    # Errors are diagnostics, so they go to stderr and stay visible when the
    # installer's stdout is redirected. %s keeps the message literal; only the
    # colour variables are escape-expanded via %b.
    printf '%b[ERROR]%b %s\n' "${RED}" "${NC}" "$1" >&2
}
|
||||||
|
|
||||||
|
# Check if running as root
|
||||||
|
# Guard: the installer writes below /root, so it only continues for EUID 0.
# Any other user gets an error message and the script exits with status 1.
check_root() {
    if [ "$EUID" -eq 0 ]; then
        return 0
    fi

    log_error "This script must be run as root"
    exit 1
}
|
||||||
|
|
||||||
|
# Check if curl is available
|
||||||
|
# Guard: every download goes through curl, so bail out early when it is not
# on PATH. Logs an error and exits with status 1 if curl cannot be found.
check_curl() {
    if command -v curl > /dev/null 2>&1; then
        log_info "curl is available"
        return
    fi

    log_error "curl is required but not installed. Please install curl first."
    exit 1
}
|
||||||
|
|
||||||
|
# Create backup of existing installation
|
||||||
|
# Copy an existing installation into BACKUP_DIR before it is overwritten.
# Globals:   INSTALL_DIR (read), BACKUP_DIR (read)
# Outputs:   progress messages through the log_* helpers
# Returns:   0 in all cases that return; a failed tree copy is reported as a
#            warning rather than aborting, so the backup stays best-effort
backup_existing() {
    local copy_ok=1

    # Fresh install: nothing to back up.
    if [ ! -d "$INSTALL_DIR" ]; then
        return 0
    fi

    log_info "Existing installation found at $INSTALL_DIR"
    log_info "Creating backup at $BACKUP_DIR..."

    mkdir -p "$BACKUP_DIR"

    # "dir/." copies the directory's contents including dotfiles and works
    # for an empty directory, so a non-zero status is a real failure rather
    # than an unmatched glob and no longer needs to be silenced.
    if ! cp -a "$INSTALL_DIR"/. "$BACKUP_DIR"/; then
        copy_ok=0
    fi

    # Save the existing hana.conf separately; restore_config looks for it
    # under exactly this name.
    if [ -f "$INSTALL_DIR/hana.conf" ]; then
        cp "$INSTALL_DIR/hana.conf" "$BACKUP_DIR/hana.conf.preserved"
        log_success "Existing hana.conf backed up to $BACKUP_DIR/hana.conf.preserved"
    fi

    # Only claim success when the tree copy actually succeeded.
    if [ "$copy_ok" -eq 1 ]; then
        log_success "Backup completed"
    else
        log_warning "Backup of $INSTALL_DIR is incomplete; check $BACKUP_DIR before relying on it"
    fi
    return 0
}
|
||||||
|
|
||||||
|
# Create installation directory
|
||||||
|
# Ensure INSTALL_DIR exists (parents included); a no-op when it already does.
create_install_dir() {
    local target_dir="$INSTALL_DIR"

    log_info "Creating installation directory: $target_dir"
    mkdir -p "$target_dir"
}
|
||||||
|
|
||||||
|
# Download a file from the repository
|
||||||
|
# Download one file from the repository into INSTALL_DIR.
# Globals:   REPO_URL (read), INSTALL_DIR (read)
# Arguments: $1 - file name relative to both REPO_URL and INSTALL_DIR
# Returns:   0 when the file was fetched and moved into place, 1 otherwise
download_file() {
    local filename="$1"
    local url="${REPO_URL}/${filename}"
    local dest="${INSTALL_DIR}/${filename}"
    # Staging file next to the destination so the final mv is a rename on the
    # same filesystem. INSTALL_DIR is only written by this installer, hence
    # the PID-suffixed name instead of mktemp (which would force mode 0600).
    local partial="${dest}.part.$$"

    log_info "Downloading ${filename}..."

    # Fetch into the staging file first: a failed or interrupted transfer must
    # not leave a truncated script where a working one used to be.
    if curl -sSf -o "$partial" "$url" && mv -f "$partial" "$dest"; then
        log_success "Downloaded ${filename}"
        return 0
    fi

    rm -f "$partial"
    log_error "Failed to download ${filename}"
    return 1
}
|
||||||
|
|
||||||
|
# Restore preserved configuration
|
||||||
|
# Reinstall the operator's previous hana.conf and append settings that only
# exist in the repository's current hana.conf.
# Globals:   BACKUP_DIR, INSTALL_DIR, REPO_URL (read)
# Outputs:   writes ${INSTALL_DIR}/hana.conf; progress via the log_* helpers
# Returns:   0 when a preserved config was restored (merged or as-is),
#            1 when there is no preserved config and the caller should
#              download a fresh one
restore_config() {
    local preserved_config="$BACKUP_DIR/hana.conf.preserved"
    # Destination of the restored config. This must be defined here: the
    # variable of the same name in download_file is local to that function.
    local dest="${INSTALL_DIR}/hana.conf"
    local temp_config="${INSTALL_DIR}/hana.conf.new"
    # NAME=... assignments, optionally prefixed with "export".
    local assign_re='^[[:space:]]*(export[[:space:]]+)?([A-Za-z_][A-Za-z0-9_]*)='
    local comment_re='^[[:space:]]*#'
    local line var_name value
    local in_array=0   # 1 while inside a multi-line NAME=( ... ) block
    local copy_array=0 # 1 when that block belongs to a newly added variable

    if [ ! -f "$preserved_config" ]; then
        return 1
    fi

    log_info "Preserving existing hana.conf..."

    # The operator's settings are always the base of the installed config.
    if ! cp "$preserved_config" "$dest"; then
        log_error "Could not restore ${preserved_config} to ${dest}"
        exit 1
    fi

    # Fetch the current upstream config to look for new settings.
    if ! curl -sSf -o "$temp_config" "${REPO_URL}/hana.conf"; then
        rm -f "$temp_config"
        log_warning "Could not download new hana.conf, keeping existing config"
        return 0
    fi

    if diff -q "$preserved_config" "$temp_config" > /dev/null 2>&1; then
        log_info "No configuration changes detected, keeping existing config"
        rm -f "$temp_config"
        return 0
    fi

    log_warning "Configuration file has updates. Merging changes..."

    # Simple merge: variables missing from the preserved config are appended,
    # existing ones are left untouched. Multi-line array assignments are
    # handled as one unit so their element lines and the closing ")" are
    # either copied together or skipped together.
    while IFS= read -r line || [ -n "$line" ]; do
        if [ "$in_array" -eq 1 ]; then
            if [ "$copy_array" -eq 1 ]; then
                printf '%s\n' "$line" >> "$dest"
            fi
            # Heuristic: the first non-comment line containing ")" closes the
            # array.
            if [[ ! "$line" =~ $comment_re ]] && [[ "$line" == *")"* ]]; then
                in_array=0
                copy_array=0
            fi
            continue
        fi

        # Comments, blank lines and anything that is not an assignment carry
        # no setting of their own.
        if [[ ! "$line" =~ $assign_re ]]; then
            continue
        fi
        var_name="${BASH_REMATCH[2]}"
        value="${line#*=}"

        # NAME=( without a closing paren on the same line opens a block.
        if [[ "$value" == "("* && "$value" != *")"* ]]; then
            in_array=1
        fi

        if grep -Eq "^[[:space:]]*(export[[:space:]]+)?${var_name}=" "$preserved_config"; then
            continue
        fi

        log_info "Adding new configuration: ${var_name}"
        printf '\n%s\n' "$line" >> "$dest"
        copy_array=$in_array
    done < "$temp_config"

    rm -f "$temp_config"
    log_success "Configuration merged successfully"
    return 0
}
|
||||||
|
|
||||||
|
# Set executable permissions
|
||||||
|
# Mark every *.sh file in INSTALL_DIR as executable. Best-effort: chmod
# errors (including an unmatched glob) are discarded and success is always
# reported.
set_permissions() {
    log_info "Setting executable permissions..."

    if ! chmod +x "${INSTALL_DIR}"/*.sh 2>/dev/null; then
        : # deliberately ignored, see header comment
    fi

    log_success "Permissions set"
}
|
||||||
|
|
||||||
|
# Verify installation
|
||||||
|
# Check that every entry of FILES exists as a regular file in INSTALL_DIR.
# Globals:   FILES (read), INSTALL_DIR (read)
# Outputs:   one error line per missing file, plus a final verdict
# Returns:   0 when all files are present, 1 when at least one is missing
verify_installation() {
    # Scoped to this function so the loop cannot clobber a caller's $file.
    local file
    local errors=0

    log_info "Verifying installation..."

    for file in "${FILES[@]}"; do
        if [ ! -f "${INSTALL_DIR}/${file}" ]; then
            log_error "Missing file: ${file}"
            errors=$((errors + 1))
        fi
    done

    if [ "$errors" -eq 0 ]; then
        log_success "Installation verified successfully"
        return 0
    fi

    log_error "Verification failed with ${errors} errors"
    return 1
}
|
||||||
|
|
||||||
|
# Print summary
|
||||||
|
# Print the closing banner: install location, backup location (only when the
# backup directory exists), follow-up steps and sample cron entries.
print_summary() {
    local rule="=============================================="

    printf '\n%s\n%s\n%s\n\n' "$rule" " Installation Complete!" "$rule"
    printf '%s\n\n' "Installation directory: ${INSTALL_DIR}"

    if [ -d "$BACKUP_DIR" ]; then
        printf '%s\n\n' "Backup location: ${BACKUP_DIR}"
    fi

    # Here-document text is not subject to globbing, so the cron asterisks are
    # printed literally; only ${INSTALL_DIR} is expanded.
    cat <<EOF
Next steps:
 1. Review and update ${INSTALL_DIR}/hana.conf with your settings
 2. Make scripts executable: chmod +x ${INSTALL_DIR}/*.sh
 3. Test individual scripts manually
 4. Set up cron jobs for automated monitoring

Example cron entries (run crontab -e):
 */5 * * * * ${INSTALL_DIR}/hana_disk.sh
 */5 * * * * ${INSTALL_DIR}/hana_processes.sh
 */10 * * * * ${INSTALL_DIR}/hana_backup.sh
 */5 * * * * ${INSTALL_DIR}/hana_queue.sh
 */5 * * * * ${INSTALL_DIR}/hana_log_segments.sh
 */5 * * * * ${INSTALL_DIR}/sld_watchdog.sh

EOF
}
|
||||||
|
|
||||||
|
# Main installation function
|
||||||
|
# Installer entry point: pre-flight checks, backup, download, permissions,
# verification and the final summary. Exits with status 1 when a download or
# the verification step fails.
main() {
    local banner="=============================================="
    local download_errors=0

    echo ""
    echo "$banner"
    echo " SAP HANA Monitoring Scripts Installer"
    echo "$banner"
    echo ""

    # Pre-flight checks (each one exits on failure)
    check_root
    check_curl

    # Keep a copy of whatever is currently installed, then make sure the
    # target directory exists
    backup_existing
    create_install_dir

    # Fetch every file. hana.conf is only downloaded fresh when there is no
    # preserved configuration to restore.
    for file in "${FILES[@]}"; do
        if [ "$file" == "hana.conf" ] && restore_config; then
            continue
        fi
        download_file "$file" || download_errors=$((download_errors + 1))
    done

    if ((download_errors > 0)); then
        log_error "Failed to download ${download_errors} files"
        exit 1
    fi

    set_permissions

    verify_installation || exit 1

    print_summary
}
|
||||||
|
|
||||||
|
# Run main function
|
||||||
|
main "$@"
|
||||||
@@ -1,10 +1,8 @@
|
|||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
#
|
# =============================================================================
|
||||||
# sld_watchdog.sh - Monitors SLD service health and restarts if needed
|
# sld_watchdog.sh - Monitors SLD service health and restarts if needed
|
||||||
# Optimized for better error handling and reliability
|
# =============================================================================
|
||||||
#
|
|
||||||
|
|
||||||
# Get script directory and name
|
|
||||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||||
SCRIPT_NAME="sld_watchdog"
|
SCRIPT_NAME="sld_watchdog"
|
||||||
|
|
||||||
@@ -15,14 +13,12 @@ source "$SCRIPT_DIR/hana_lib.sh"
|
|||||||
# SLD-specific configuration
|
# SLD-specific configuration
|
||||||
SLD_URL="${SLD_URL:-https://localhost:40000/sld/sld0100.svc}"
|
SLD_URL="${SLD_URL:-https://localhost:40000/sld/sld0100.svc}"
|
||||||
SLD_TIMEOUT="${SLD_TIMEOUT:-5}"
|
SLD_TIMEOUT="${SLD_TIMEOUT:-5}"
|
||||||
SLD_LOCK_FILE="/tmp/hana_sld_watchdog.lock"
|
|
||||||
|
|
||||||
# Acquire lock using library function
|
# Acquire lock
|
||||||
LOCK_FILE=$(acquire_lock "$SCRIPT_NAME")
|
if ! acquire_lock "$SCRIPT_NAME"; then
|
||||||
if [ $? -ne 0 ]; then
|
|
||||||
exit 0
|
exit 0
|
||||||
fi
|
fi
|
||||||
trap 'release_lock "$LOCK_FILE"' EXIT
|
trap 'release_lock "$SCRIPT_NAME"' EXIT
|
||||||
|
|
||||||
# Function to check SLD health
|
# Function to check SLD health
|
||||||
# Returns HTTP status code or "0" for connection errors
|
# Returns HTTP status code or "0" for connection errors
|
||||||
@@ -71,7 +67,6 @@ main() {
|
|||||||
# 200 OK or 401 Unauthorized indicate the Tomcat layer is actively processing
|
# 200 OK or 401 Unauthorized indicate the Tomcat layer is actively processing
|
||||||
if [[ $http_status == 200 || $http_status == 401 ]]; then
|
if [[ $http_status == 200 || $http_status == 401 ]]; then
|
||||||
log_message "$SCRIPT_NAME" "[OK] SLD service healthy (HTTP Status: $http_status)"
|
log_message "$SCRIPT_NAME" "[OK] SLD service healthy (HTTP Status: $http_status)"
|
||||||
send_notification_if_changed "$SCRIPT_NAME" "sld_status" "SLD Service" "SLD service is healthy (HTTP: $http_status)" "false" "OK"
|
|
||||||
return 0
|
return 0
|
||||||
fi
|
fi
|
||||||
|
|
||||||
@@ -86,14 +81,12 @@ main() {
|
|||||||
log_message "$SCRIPT_NAME" "[ALERT] SLD dead or hung (${status_detail}). Restarting service..."
|
log_message "$SCRIPT_NAME" "[ALERT] SLD dead or hung (${status_detail}). Restarting service..."
|
||||||
|
|
||||||
# Send notification
|
# Send notification
|
||||||
send_notification_if_changed "$SCRIPT_NAME" "sld_down" "SLD Service Critical" \
|
send_alert "$SCRIPT_NAME" "SLD Service Critical" "SLD service is down (${status_detail}). Restarting ${SLD_URL}"
|
||||||
"SLD service is down (${status_detail}). Restarting ${SLD_URL}" "true" "SLD_DOWN"
|
|
||||||
|
|
||||||
# Restart the service
|
# Restart the service
|
||||||
if ! restart_sld_service; then
|
if ! restart_sld_service; then
|
||||||
log_message "$SCRIPT_NAME" "[CRITICAL] Failed to restart SLD service"
|
log_message "$SCRIPT_NAME" "[CRITICAL] Failed to restart SLD service"
|
||||||
send_notification_if_changed "$SCRIPT_NAME" "sld_restart_failed" "SLD Service Critical" \
|
send_alert "$SCRIPT_NAME" "SLD Service Critical" "Failed to restart SLD service"
|
||||||
"Failed to restart SLD service" "true" "RESTART_FAILED"
|
|
||||||
return 1
|
return 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
@@ -106,10 +99,6 @@ main() {
|
|||||||
|
|
||||||
if [[ $recovery_status == 200 || $recovery_status == 401 ]]; then
|
if [[ $recovery_status == 200 || $recovery_status == 401 ]]; then
|
||||||
log_message "$SCRIPT_NAME" "[RECOVERY] SLD service recovered successfully (HTTP Status: $recovery_status)"
|
log_message "$SCRIPT_NAME" "[RECOVERY] SLD service recovered successfully (HTTP Status: $recovery_status)"
|
||||||
send_notification_if_changed "$SCRIPT_NAME" "sld_down" "SLD Service" \
|
|
||||||
"SLD service recovered (HTTP: $recovery_status)" "false" "OK"
|
|
||||||
send_notification_if_changed "$SCRIPT_NAME" "sld_restart_failed" "SLD Service" \
|
|
||||||
"Service recovered successfully" "false" "OK"
|
|
||||||
else
|
else
|
||||||
local recovery_detail
|
local recovery_detail
|
||||||
if [ "$recovery_status" == "0" ]; then
|
if [ "$recovery_status" == "0" ]; then
|
||||||
@@ -118,8 +107,7 @@ main() {
|
|||||||
recovery_detail="HTTP Status: $recovery_status"
|
recovery_detail="HTTP Status: $recovery_status"
|
||||||
fi
|
fi
|
||||||
log_message "$SCRIPT_NAME" "[CRITICAL] SLD service failed to recover after restart (${recovery_detail})"
|
log_message "$SCRIPT_NAME" "[CRITICAL] SLD service failed to recover after restart (${recovery_detail})"
|
||||||
send_notification_if_changed "$SCRIPT_NAME" "sld_down" "SLD Service Critical" \
|
send_alert "$SCRIPT_NAME" "SLD Service Critical" "SLD service FAILED to recover after restart (${recovery_detail})"
|
||||||
"SLD service FAILED to recover after restart (${recovery_detail})" "true" "RECOVERY_FAILED"
|
|
||||||
return 1
|
return 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user