Files
vps2-skripte/vps-healthcheck.sh
2025-08-26 18:20:10 +02:00

190 lines
7.2 KiB
Bash
Executable File
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env bash
# Healthcheck v2 (modular): core checks plus optional modules (e.g. Mailcow)
# --- Helpers / debug ---
# DEBUG=1 enables diagnostic output on stderr.
# NO_PUSH=1 makes push_to_kuma print what it would send instead of sending it.
DEBUG="${DEBUG:-0}"
NO_PUSH="${NO_PUSH:-0}"

# log MESSAGE...
# Writes "[dbg] MESSAGE" to stderr when DEBUG=1 and is silent otherwise.
# Always returns 0: the previous `[ ... ] && echo` form returned 1 whenever
# debugging was off, which breaks `log ... && ...` chains and aborts any
# caller that runs under `set -e`.
log() {
  if [ "$DEBUG" = "1" ]; then
    echo "[dbg] $*" >&2
  fi
  return 0
}
# Alert messages collected by the checks; a non-empty list makes the final
# push report "down".
alerts=()

# add_alert MESSAGE
# Appends MESSAGE as a single element to the global alerts array.
add_alert() {
  local message="$1"
  alerts+=( "$message" )
}
# --- Defaults, used unless already set via the environment ---
KUMA_BASE_URL="${KUMA_BASE_URL:-https://status.example/api/push}"
KUMA_PUSH_ID="${KUMA_PUSH_ID:-}"
LOAD_PER_CORE_MAX="${LOAD_PER_CORE_MAX:-1.50}"
MEM_USED_MAX="${MEM_USED_MAX:-90}"
DISK_USED_MAX="${DISK_USED_MAX:-85}"
INODE_USED_MAX="${INODE_USED_MAX:-90}"
# MOUNTS arrives as one whitespace-separated string and is turned into an
# array. `read -a` splits on whitespace without pathname expansion; the former
# unquoted `MOUNTS=(${MOUNTS:-...})` would have expanded an entry such as
# "/srv/*" into every matching path. With `-d ''` read consumes the whole
# value (including embedded newlines) and returns non-zero at end of input,
# hence the `|| true`.
IFS=$' \t\n' read -r -d '' -a MOUNTS <<< "${MOUNTS:-/ /var /opt /var/lib/docker}" || true
SWAP_USED_MAX="${SWAP_USED_MAX:-60}"
SWAP_SI_MIN="${SWAP_SI_MIN:-50}"
SWAP_SO_MIN="${SWAP_SO_MIN:-50}"
MEM_PRESSURE_FOR_SWAP_ALERT="${MEM_PRESSURE_FOR_SWAP_ALERT:-95}"
# Mailcow module settings; each keeps its environment value and only falls
# back to the default when unset or empty.
# ENABLE_MAILCOW=1 makes the script source the mailcow module further down.
# The other three values are not read by the core script shown here --
# presumably they are consumed by that module (verify against mailcow.sh).
: "${ENABLE_MAILCOW:=0}"
: "${COMPOSE_PROJECT_LABEL:=mailcowdockerized}"
: "${MAILCOW_POSTFIX:=postfix-mailcow}"
: "${QUEUE_MAX:=100}"
# --- Real network ping (ICMP/TCP) ---
PING_MODE="${PING_MODE:-icmp}" # icmp | tcp | off
# Authority part of KUMA_BASE_URL: everything between "://" and the next "/"
# (host, optionally with userinfo and port).
_kuma_authority="${KUMA_BASE_URL#*://}"
_kuma_authority="${_kuma_authority%%/*}"
# ICMP needs a bare host name or address: drop userinfo, the port and the
# brackets of an IPv6 literal. The former `awk -F/ '{print $3}'` kept all of
# them, so a base URL like https://host:3001/... yielded a target that ping
# cannot resolve.
_kuma_host="${_kuma_authority##*@}"
case "$_kuma_host" in
  \[*)
    _kuma_host="${_kuma_host#\[}"
    _kuma_host="${_kuma_host%%\]*}"
    ;;
  *)
    _kuma_host="${_kuma_host%%:*}"
    ;;
esac
PING_TARGET="${PING_TARGET:-$_kuma_host}"
PING_COUNT="${PING_COUNT:-3}"
PING_TIMEOUT="${PING_TIMEOUT:-1}" # seconds per reply
PING_DEADLINE="${PING_DEADLINE:-3}" # seconds overall
PING_FAMILY="${PING_FAMILY:-auto}" # 4 | 6 | auto
# The TCP probe keeps the full authority (including any port) so it reaches
# the same endpoint as the push itself.
PING_TCP_URL="${PING_TCP_URL:-https://${_kuma_authority}/}"
PING_TCP_TIMEOUT="${PING_TCP_TIMEOUT:-2}"
unset _kuma_authority _kuma_host
# --- Sanitizers for numbers taken from the environment ---
# They remove inline comments and units from a raw value.

# strip_comment VALUE
# Prints VALUE up to (not including) the first '#', followed by a newline.
# printf is used instead of echo so that a value that looks like an echo
# option ("-n", "-e") is printed instead of being swallowed.
strip_comment() { printf '%s\n' "${1%%#*}"; }

# num_int VALUE
# Prints only the digits of the comment-stripped VALUE (no trailing newline).
# The result is empty when VALUE contains no digit.
num_int() { strip_comment "$1" | tr -cd '0-9'; }

# num_float VALUE
# Like num_int, but also keeps '.' characters.
num_float() { strip_comment "$1" | tr -cd '0-9.'; }
# Normalize every numeric threshold. A value that sanitizes to nothing usable
# (e.g. "abc" or "# TODO") falls back to its default; otherwise the threshold
# would end up empty and the later `[ ... -gt ... ]` / awk comparisons would
# error out or compare as strings, silently disabling the check.
_hc_float_re='^([0-9]+\.?[0-9]*|\.[0-9]+)$'
LOAD_PER_CORE_MAX=$(num_float "${LOAD_PER_CORE_MAX:-1.50}")
[[ "$LOAD_PER_CORE_MAX" =~ $_hc_float_re ]] || LOAD_PER_CORE_MAX="1.50"
unset _hc_float_re
MEM_USED_MAX=$(num_int "${MEM_USED_MAX:-90}")
MEM_USED_MAX="${MEM_USED_MAX:-90}"
DISK_USED_MAX=$(num_int "${DISK_USED_MAX:-85}")
DISK_USED_MAX="${DISK_USED_MAX:-85}"
INODE_USED_MAX=$(num_int "${INODE_USED_MAX:-90}")
INODE_USED_MAX="${INODE_USED_MAX:-90}"
SWAP_USED_MAX=$(num_int "${SWAP_USED_MAX:-60}")
SWAP_USED_MAX="${SWAP_USED_MAX:-60}"
SWAP_SI_MIN=$(num_int "${SWAP_SI_MIN:-50}")
SWAP_SI_MIN="${SWAP_SI_MIN:-50}"
SWAP_SO_MIN=$(num_int "${SWAP_SO_MIN:-50}")
SWAP_SO_MIN="${SWAP_SO_MIN:-50}"
MEM_PRESSURE_FOR_SWAP_ALERT=$(num_int "${MEM_PRESSURE_FOR_SWAP_ALERT:-95}")
MEM_PRESSURE_FOR_SWAP_ALERT="${MEM_PRESSURE_FOR_SWAP_ALERT:-95}"
# --- Sanity ---
# Without a push ID there is no monitor URL to report to, so abort before any
# check runs. The message goes to stderr so that stdout carries only the
# check result.
if [ -z "${KUMA_PUSH_ID}" ]; then
  echo "ERROR: KUMA_PUSH_ID ist leer. Bitte in /etc/vps-healthcheck.env setzen." >&2
  exit 1
fi
# --- Measurement: real network latency in ms ---
# measure_ping
# Prints the measured latency as a whole number of milliseconds on stdout.
#   PING_MODE=off   no probe is made and nothing is printed
#   PING_MODE=icmp  average round-trip time of `ping` against PING_TARGET;
#                   if that yields no value, the TCP probe below is used
#   anything else   TCP probe via curl against PING_TCP_URL: TLS handshake
#                   time for https:// URLs, TCP connect time otherwise
# Prints "0" when a probe was attempted but produced no usable number.
# Reads PING_MODE, PING_TARGET, PING_FAMILY, PING_COUNT, PING_TIMEOUT,
# PING_DEADLINE, PING_TCP_URL and PING_TCP_TIMEOUT.
measure_ping() {
  local ms="" out sec time_field
  local -a family_flag=()

  # "off" must not touch the network at all; previously it fell through to
  # the curl probe like any non-icmp mode.
  if [ "${PING_MODE}" = "off" ]; then
    return 0
  fi

  if [ "${PING_MODE}" = "icmp" ] && command -v ping >/dev/null 2>&1 && [ -n "${PING_TARGET}" ]; then
    case "$PING_FAMILY" in
      4) family_flag=(-4) ;;
      6) family_flag=(-6) ;;
    esac
    # LC_ALL (not LANG, which LC_ALL/LC_NUMERIC override) pins the output to
    # the untranslated format with '.' as decimal separator.
    out=$(LC_ALL=C ping -n "${family_flag[@]}" -c "$PING_COUNT" -W "$PING_TIMEOUT" \
      -w "$PING_DEADLINE" "$PING_TARGET" 2>&1 || true)
    # Take the avg field of the summary line:
    #   iputils: rtt min/avg/max/mdev = a/b/c/d ms
    #   busybox: round-trip min/avg/max = a/b/c ms
    ms=$(printf '%s\n' "$out" \
      | sed -nE 's#.*= [0-9.]+/([0-9.]+)/[0-9.]+(/[0-9.]+)? .*$#\1#p' \
      | tail -n1)
  fi

  # TCP fallback: handshake or connect time, reported by curl in seconds.
  if [ -z "$ms" ] && [ -n "$PING_TCP_URL" ] && command -v curl >/dev/null 2>&1; then
    time_field='%{time_connect}'
    case "$PING_TCP_URL" in
      https://*) time_field='%{time_appconnect}' ;;
    esac
    # A failed request is not fatal; whatever curl printed is validated below.
    sec=$(curl -sS -o /dev/null --max-time "$PING_TCP_TIMEOUT" -w "$time_field" "$PING_TCP_URL" || true)
    if [[ "$sec" =~ ^[0-9.]+$ ]]; then
      ms=$(LC_ALL=C awk -v s="$sec" 'BEGIN{printf "%.0f", s*1000}')
    fi
  fi

  # Round to whole milliseconds; anything non-numeric becomes 0.
  if [[ "$ms" =~ ^[0-9.]+$ ]]; then
    LC_ALL=C awk -v a="$ms" 'BEGIN{printf "%.0f", a}'
  else
    echo "0"
  fi
}
# --- Push to Kuma (with optional ping=) ---
# push_to_kuma STATUS MSG [PING_MS]
# Sends STATUS and MSG as URL-encoded query parameters to
# ${KUMA_BASE_URL}/${KUMA_PUSH_ID}; PING_MS is added as ping= only when it is
# non-empty. With NO_PUSH=1 nothing is sent and the would-be push is printed
# instead. Always returns 0: a failed request is deliberately not fatal.
push_to_kuma() {
  local state="$1"
  local text="$2"
  local latency="$3"

  if [ "$NO_PUSH" = "1" ]; then
    echo "[dry-run] would push: status=$state ping=${latency:-} msg=$text"
    return 0
  fi

  local -a curl_opts
  curl_opts=(
    -fsS --retry 2 --max-time 8
    --get "${KUMA_BASE_URL}/${KUMA_PUSH_ID}"
    --data-urlencode "status=${state}"
    --data-urlencode "msg=${text}"
  )
  if [ -n "$latency" ]; then
    curl_opts+=( --data-urlencode "ping=${latency}" )
  fi

  # Response body is discarded; curl's exit status is ignored on purpose.
  curl "${curl_opts[@]}" >/dev/null || :
}
# --- CPU load per core ---
# Divides the first field of /proc/loadavg by the core count (nproc, or 1 if
# nproc fails) and raises an alert when the quotient exceeds
# LOAD_PER_CORE_MAX. awk does the math because the values are fractional.
cores=$(nproc 2>/dev/null || echo 1)
read -r load1 _ < /proc/loadavg
load_per_core=$(awk -v l="$load1" -v c="$cores" 'BEGIN {
  if (c > 0) { v = l / c } else { v = l }
  printf "%.2f", v
}')
if awk -v a="$load_per_core" -v m="$LOAD_PER_CORE_MAX" 'BEGIN { exit !(a > m) }'; then
  add_alert "Hohe CPU-Last: ${load1} (=${load_per_core}/Core, Limit ${LOAD_PER_CORE_MAX}/Core)"
fi
# --- RAM usage (based on MemAvailable) ---
# Used percentage = (1 - MemAvailable / MemTotal) * 100, rounded to an
# integer; 0 when MemTotal is missing or zero. Alerts above MEM_USED_MAX.
mem_used_pct=$(awk '
  /MemTotal:/     { total = $2 }
  /MemAvailable:/ { avail = $2 }
  END {
    if (total > 0) {
      printf "%.0f", (1 - avail / total) * 100
    } else {
      print 0
    }
  }
' /proc/meminfo)
if [ "${mem_used_pct:-0}" -gt "$MEM_USED_MAX" ]; then
  add_alert "RAM hoch: ${mem_used_pct}% (Limit ${MEM_USED_MAX}%)"
fi
# --- Swap: usage + activity (pages/s) ---
# Used percentage = (1 - SwapFree / SwapTotal) * 100; 0 when there is no swap.
swap_used_pct=$(awk '
  /SwapTotal:/ { t = $2 }
  /SwapFree:/  { f = $2 }
  END { if (t > 0) printf "%.0f", (1 - f / t) * 100; else print 0 }
' /proc/meminfo)

# Swap-in/out activity = difference of the pswpin/pswpout counters over one
# second. Both counters are taken in one pass per sample, matched by exact
# name. A missing counter counts as 0 instead of causing an arithmetic syntax
# error, and the values are printed as the original strings so large counters
# are never reformatted by awk.
_hc_swap_prog='
  $1 == "pswpin"  { i = $2 }
  $1 == "pswpout" { o = $2 }
  END { print (i == "" ? "0" : i), (o == "" ? "0" : o) }
'
read -r psin0 psout0 < <(awk "$_hc_swap_prog" /proc/vmstat 2>/dev/null)
sleep 1
read -r psin1 psout1 < <(awk "$_hc_swap_prog" /proc/vmstat 2>/dev/null)
unset _hc_swap_prog
si=$(( ${psin1:-0} - ${psin0:-0} ))
so=$(( ${psout1:-0} - ${psout0:-0} ))

# High swap usage alone is not alarming; alert only when it coincides with
# swap traffic or with memory pressure.
if [ "$swap_used_pct" -gt "$SWAP_USED_MAX" ] && {
     [ "$si" -gt "$SWAP_SI_MIN" ] ||
     [ "$so" -gt "$SWAP_SO_MIN" ] ||
     [ "${mem_used_pct:-0}" -gt "$MEM_PRESSURE_FOR_SWAP_ALERT" ]
   }; then
  add_alert "Swap hoch: ${swap_used_pct}% (si=${si}/s so=${so}/s)"
fi
# --- IO-wait (second 1 s sample) ---
# Alert threshold in percent; overridable via the environment (default 25).
# An inline comment or unit in the value is stripped; an unusable value falls
# back to the default.
IOWAIT_MAX="${IOWAIT_MAX:-25}"
IOWAIT_MAX="${IOWAIT_MAX%%#*}"
IOWAIT_MAX="${IOWAIT_MAX//[!0-9]/}"
IOWAIT_MAX="${IOWAIT_MAX:-25}"

# Two samples of the aggregate "cpu" line of /proc/stat, one second apart.
# Fields: user nice system idle iowait irq softirq steal. The total has to
# include irq, softirq and steal as well; leaving them out shrinks the
# denominator and overstates the iowait share, most visibly on a VPS with
# steal time. Fields a kernel does not report are read as empty and count as 0
# in the arithmetic below.
read -r u1 n1 s1 i1 w1 q1 sq1 st1 _ < <(awk '/^cpu /{print $2,$3,$4,$5,$6,$7,$8,$9}' /proc/stat)
sleep 1
read -r u2 n2 s2 i2 w2 q2 sq2 st2 _ < <(awk '/^cpu /{print $2,$3,$4,$5,$6,$7,$8,$9}' /proc/stat)
total=$(( (u2-u1)+(n2-n1)+(s2-s1)+(i2-i1)+(w2-w1)+(q2-q1)+(sq2-sq1)+(st2-st1) ))
iow=$(( w2 - w1 ))
iow_pct=$(awk -v i="$iow" -v t="$total" 'BEGIN{ if(t>0) printf "%.0f", (i*100)/t; else print 0 }')
if [ "${iow_pct:-0}" -gt "$IOWAIT_MAX" ]; then
  add_alert "IO-Wait hoch: ${iow_pct}% (Limit ${IOWAIT_MAX}%)"
fi
# --- Disks: space + inode usage ---
# check_mount_usage MOUNT
# If MOUNT is a mount point, compares its used-space and used-inode
# percentages (5th column of `df -P` / `df -Pi`) with DISK_USED_MAX and
# INODE_USED_MAX and adds an alert for each limit exceeded. Paths that are not
# mount points are skipped. A value that is not a plain integer is ignored
# instead of being fed to `[ -gt ]`: df prints "-" in the inode column for
# filesystems without inode accounting. Always returns 0.
check_mount_usage() {
  local mount="$1" used iused

  mountpoint -q "$mount" || return 0

  used=$(df -P "$mount" 2>/dev/null | awk 'NR==2{gsub("%","",$5);print $5}')
  if [[ "$used" =~ ^[0-9]+$ ]] && [ "$used" -gt "$DISK_USED_MAX" ]; then
    add_alert "Disk fast voll: $mount = ${used}% (Limit ${DISK_USED_MAX}%)"
  fi

  iused=$(df -Pi "$mount" 2>/dev/null | awk 'NR==2{gsub("%","",$5);print $5}')
  if [[ "$iused" =~ ^[0-9]+$ ]] && [ "$iused" -gt "$INODE_USED_MAX" ]; then
    add_alert "Inodes knapp: $mount = ${iused}% (Limit ${INODE_USED_MAX}%)"
  fi

  return 0
}

for m in "${MOUNTS[@]}"; do
  check_mount_usage "$m"
done
# --- Load optional modules ---
# Modules are sourced at top level, so they run in this script's own scope and
# see its variables and functions. MODULE_DIR can be overridden via the
# environment. A module that is enabled but not readable is reported on stderr
# instead of being skipped silently; otherwise its checks would be missing
# while the monitor still reports "up".
MODULE_DIR="${MODULE_DIR:-/usr/local/lib/vps-healthcheck/modules}"
# Mailcow (optional)
if [ "${ENABLE_MAILCOW:-0}" = "1" ]; then
  if [ -r "${MODULE_DIR}/mailcow.sh" ]; then
    log "loading module: mailcow"
    # shellcheck source=/dev/null
    source "${MODULE_DIR}/mailcow.sh"
  else
    echo "WARN: Modul 'mailcow' ist aktiviert, aber ${MODULE_DIR}/mailcow.sh ist nicht lesbar; Modul wird übersprungen." >&2
  fi
fi
# Raspi (optional)
if [ "${ENABLE_RASPI:-0}" = "1" ]; then
  if [ -r "${MODULE_DIR}/raspi.sh" ]; then
    log "loading module: raspi"
    # shellcheck source=/dev/null
    source "${MODULE_DIR}/raspi.sh"
  else
    echo "WARN: Modul 'raspi' ist aktiviert, aber ${MODULE_DIR}/raspi.sh ist nicht lesbar; Modul wird übersprungen." >&2
  fi
fi
# --- Push the result (with the measured network latency) ---
NET_PING_MS="$(measure_ping)"
log "ping(${PING_MODE}) -> ${NET_PING_MS} ms @ ${PING_TARGET:-$PING_TCP_URL}"
if [ "${#alerts[@]}" -gt 0 ]; then
  # One line: host name followed by every alert, each terminated by "; ".
  msg="$(hostname) $(printf '%s; ' "${alerts[@]}")"
  # Only the first 900 characters are pushed; the full text goes to stdout.
  push_to_kuma "down" "${msg:0:900}" "$NET_PING_MS"
  # printf instead of `echo -e`: alert text (mount paths, module output) must
  # be printed verbatim, without backslash sequences being interpreted.
  printf '%s\n' "$msg"
else
  push_to_kuma "up" "OK ($(hostname))" "$NET_PING_MS"
  echo "OK"
fi