#!/usr/bin/env bash
# Healthcheck v2 (modular) – core checks plus optional modules (e.g. Mailcow)

# --- Helpers / debug ---
# DEBUG=1 enables diagnostic output on stderr.
DEBUG="${DEBUG:-0}"
# NO_PUSH=1 is the dry-run switch read by push_to_kuma.
NO_PUSH="${NO_PUSH:-0}"

#######################################
# Print a debug message when DEBUG=1.
# Globals:   DEBUG (read)
# Arguments: $* - message text
# Outputs:   "[dbg] <message>" on stderr (only when DEBUG=1)
# Returns:   always 0, so the helper is safe in `&&` chains and under `set -e`
#######################################
log() {
  if [ "$DEBUG" = "1" ]; then
    printf '[dbg] %s\n' "$*" >&2
  fi
  return 0
}

# Alert messages collected during this run; the final section of the script
# reports "down" when this array is non-empty.
alerts=()

#######################################
# Append one alert message to the global alerts array.
# Globals:   alerts (written)
# Arguments: $1 - alert text
#######################################
add_alert() {
  alerts+=("$1")
}

# --- Defaults, used unless already set via the environment ---
KUMA_BASE_URL="${KUMA_BASE_URL:-https://status.example/api/push}"
KUMA_PUSH_ID="${KUMA_PUSH_ID:-}"

LOAD_PER_CORE_MAX="${LOAD_PER_CORE_MAX:-1.50}"
MEM_USED_MAX="${MEM_USED_MAX:-90}"
DISK_USED_MAX="${DISK_USED_MAX:-85}"
INODE_USED_MAX="${INODE_USED_MAX:-90}"
# MOUNTS arrives as one whitespace-separated string. Splitting it with `read`
# instead of an unquoted expansion keeps entries such as "/va*" literal rather
# than glob-expanding them. `read -d ''` returns non-zero at end of input even
# though the array was filled, hence the `|| true`.
IFS=$' \t\n' read -r -d '' -a MOUNTS <<<"${MOUNTS:-/ /var /opt /var/lib/docker}" || true

SWAP_USED_MAX="${SWAP_USED_MAX:-60}"
SWAP_SI_MIN="${SWAP_SI_MIN:-50}"
SWAP_SO_MIN="${SWAP_SO_MIN:-50}"
MEM_PRESSURE_FOR_SWAP_ALERT="${MEM_PRESSURE_FOR_SWAP_ALERT:-95}"

# Mailcow module
ENABLE_MAILCOW="${ENABLE_MAILCOW:-0}"
COMPOSE_PROJECT_LABEL="${COMPOSE_PROJECT_LABEL:-mailcowdockerized}"
MAILCOW_POSTFIX="${MAILCOW_POSTFIX:-postfix-mailcow}"
QUEUE_MAX="${QUEUE_MAX:-100}"

# --- Real network ping (ICMP/TCP) ---

# Print the authority part ("[user@]host[:port]") of the URL in $1.
# Prints nothing when $1 contains no "://" separator.
_hc_url_authority() {
  local rest
  case "$1" in
    *://*) rest="${1#*://}" ;;
    *) return 0 ;;
  esac
  printf '%s' "${rest%%[/?#]*}"
}

# Print only the host of the URL in $1: userinfo and port are removed and the
# brackets around an IPv6 literal are dropped, so the result can be handed to
# ping directly.
_hc_url_host() {
  local host
  host="$(_hc_url_authority "$1")"
  host="${host##*@}"
  case "$host" in
    \[*)
      host="${host#\[}"
      host="${host%%\]*}"
      ;;
    *) host="${host%%:*}" ;;
  esac
  printf '%s' "$host"
}

PING_MODE="${PING_MODE:-icmp}"       # icmp | tcp | off
# Default ICMP target: the host of the Kuma URL (without port).
PING_TARGET="${PING_TARGET:-$(_hc_url_host "${KUMA_BASE_URL}")}"
PING_COUNT="${PING_COUNT:-3}"
PING_TIMEOUT="${PING_TIMEOUT:-1}"    # seconds per reply
PING_DEADLINE="${PING_DEADLINE:-3}"  # seconds overall
PING_FAMILY="${PING_FAMILY:-auto}"   # 4 | 6 | auto
# Default TCP probe URL: root of the Kuma host. The scheme follows
# KUMA_BASE_URL when that is plain http; otherwise https is used.
case "${KUMA_BASE_URL}" in
  http://*) _hc_tcp_scheme="http" ;;
  *) _hc_tcp_scheme="https" ;;
esac
PING_TCP_URL="${PING_TCP_URL:-${_hc_tcp_scheme}://$(_hc_url_authority "${KUMA_BASE_URL}")/}"
PING_TCP_TIMEOUT="${PING_TCP_TIMEOUT:-2}"

# --- Sanitizers for numeric env values (strip inline comments / units) ---

# Print $1 with everything from the first '#' onwards removed.
strip_comment() { printf '%s\n' "${1%%#*}"; }

#######################################
# Reduce a setting to its decimal digits.
# Arguments: $1 - raw value (may carry an inline comment or a unit)
#            $2 - optional fallback, printed when no digit is left
# Outputs:   the digits of $1, or $2 (possibly empty) on stdout, no newline
#######################################
num_int() {
  local digits
  digits=$(strip_comment "$1" | tr -cd '0-9')
  printf '%s' "${digits:-${2:-}}"
}

#######################################
# Reduce a setting to a decimal number.
# A decimal comma is accepted and converted to a dot ("1,50" -> "1.50").
# Arguments: $1 - raw value (may carry an inline comment or a unit)
#            $2 - optional fallback, printed when the remainder is not a
#                 well-formed number (empty, several dots, ...)
# Outputs:   the number, or $2 (possibly empty) on stdout, no newline
#######################################
num_float() {
  local value
  local number_re='^([0-9]+(\.[0-9]*)?|\.[0-9]+)$'
  value=$(strip_comment "$1" | tr ',' '.' | tr -cd '0-9.')
  if [[ "$value" =~ $number_re ]]; then
    printf '%s' "$value"
  else
    printf '%s' "${2:-}"
  fi
}

# Re-normalise the thresholds. The second argument restores the default when a
# configured value turns out to be unusable, so the integer comparisons further
# down never see an empty operand.
LOAD_PER_CORE_MAX=$(num_float "${LOAD_PER_CORE_MAX:-1.50}" 1.50)
MEM_USED_MAX=$(num_int "${MEM_USED_MAX:-90}" 90)
DISK_USED_MAX=$(num_int "${DISK_USED_MAX:-85}" 85)
INODE_USED_MAX=$(num_int "${INODE_USED_MAX:-90}" 90)

SWAP_USED_MAX=$(num_int "${SWAP_USED_MAX:-60}" 60)
SWAP_SI_MIN=$(num_int "${SWAP_SI_MIN:-50}" 50)
SWAP_SO_MIN=$(num_int "${SWAP_SO_MIN:-50}" 50)
MEM_PRESSURE_FOR_SWAP_ALERT=$(num_int "${MEM_PRESSURE_FOR_SWAP_ALERT:-95}" 95)

# --- Sanity ---
# Abort when no push ID is configured. The message is a diagnostic, so it goes
# to stderr rather than stdout.
if [ -z "${KUMA_PUSH_ID}" ]; then
  echo "ERROR: KUMA_PUSH_ID ist leer. Bitte in /etc/vps-healthcheck.env setzen." >&2
  exit 1
fi

# --- Measurement: real network ping in ms ---
#######################################
# Measure the network latency to the monitoring host.
# With PING_MODE=icmp the average round-trip time reported by `ping` is used.
# If that yields nothing (or for any other mode except "off"), curl's connect
# time (TLS handshake time for https URLs) serves as a fallback.
# Globals (read): PING_MODE, PING_TARGET, PING_FAMILY, PING_COUNT,
#                 PING_TIMEOUT, PING_DEADLINE, PING_TCP_URL, PING_TCP_TIMEOUT
# Outputs: latency rounded to whole milliseconds on stdout;
#          "0" when no probe produced a value;
#          nothing at all when PING_MODE=off.
# Returns: 0
#######################################
measure_ping() {
  local ms="" out sec timing
  local -a family_flag=()
  # Matches the summary line of both ping flavours and captures the average:
  #   iputils: rtt min/avg/max/mdev = a/b/c/d ms
  #   busybox: round-trip min/avg/max = a/b/c ms
  local rtt_re='= ([0-9.]+)/([0-9.]+)/([0-9.]+)(/[0-9.]+)? '
  local number_re='^[0-9.]+$'

  # "off" means: do not probe the network at all. Printing nothing lets the
  # caller leave the optional ping value out of the push.
  if [ "${PING_MODE}" = "off" ]; then
    return 0
  fi

  if [ "${PING_MODE}" = "icmp" ] && [ -n "${PING_TARGET}" ] \
    && command -v ping >/dev/null 2>&1; then
    case "$PING_FAMILY" in
      4) family_flag=(-4) ;;
      6) family_flag=(-6) ;;
    esac
    # LC_ALL (not just LANG) pins the output format even when the caller's
    # environment exports LC_ALL itself. A failing ping is not fatal.
    out=$(LC_ALL=C ping -n "${family_flag[@]}" -c "$PING_COUNT" \
      -W "$PING_TIMEOUT" -w "$PING_DEADLINE" "$PING_TARGET" 2>&1 || true)
    if [[ "$out" =~ $rtt_re ]]; then
      ms="${BASH_REMATCH[2]}"
    fi
  fi

  # TCP fallback: TLS handshake or plain connect time (seconds -> ms).
  if [ -z "$ms" ] && command -v curl >/dev/null 2>&1; then
    timing='%{time_connect}'
    case "$PING_TCP_URL" in
      https://*) timing='%{time_appconnect}' ;;
    esac
    sec=$(curl -sS -o /dev/null --max-time "$PING_TCP_TIMEOUT" \
      -w "$timing" "$PING_TCP_URL" || true)
    if [[ "$sec" =~ $number_re ]]; then
      ms=$(awk -v s="$sec" 'BEGIN{printf "%.0f", s*1000}')
    fi
  fi

  if [[ "$ms" =~ $number_re ]]; then
    awk -v a="$ms" 'BEGIN{printf "%.0f", a}'
  else
    echo "0"
  fi
}

# --- Push to Kuma (with optional ping=) ---
#######################################
# Send one status report to the Kuma push URL via curl.
# Globals (read): KUMA_BASE_URL, KUMA_PUSH_ID, NO_PUSH
# Arguments: $1 - status value
#            $2 - message text
#            $3 - ping in ms (optional; omitted from the request when empty)
# Outputs:   with NO_PUSH=1 a "[dry-run]" line on stdout instead of a request;
#            a warning on stderr when curl fails
# Returns:   always 0 - the push is best-effort and must not abort the run
#######################################
push_to_kuma() {
  local status="$1" msg="$2" ping_ms="${3:-}"
  local rc=0
  local -a args

  if [ "$NO_PUSH" = "1" ]; then
    echo "[dry-run] would push: status=$status ping=${ping_ms:-} msg=$msg"
    return 0
  fi

  # ${KUMA_BASE_URL%/} drops one trailing slash so that the URL never
  # contains "//" in front of the push ID.
  args=(-fsS --retry 2 --max-time 8 --get "${KUMA_BASE_URL%/}/${KUMA_PUSH_ID}"
    --data-urlencode "status=${status}"
    --data-urlencode "msg=${msg}")
  if [ -n "$ping_ms" ]; then
    args+=(--data-urlencode "ping=${ping_ms}")
  fi

  curl "${args[@]}" >/dev/null || rc=$?
  if [ "$rc" -ne 0 ]; then
    echo "WARN: push to Kuma failed (curl exit ${rc})" >&2
  fi
  return 0
}

# --- CPU load per core ---
# 1-minute load average divided by the number of cores; nproc failing falls
# back to a single core.
cores=$(nproc 2>/dev/null || echo 1)
load1=$(awk '{print $1}' /proc/loadavg)
# LC_ALL=C keeps awk's number formatting on a '.' decimal point regardless of
# the caller's locale.
load_per_core=$(LC_ALL=C awk -v l="$load1" -v c="$cores" \
  'BEGIN{printf "%.2f", (c>0?l/c:l)}')
# "+0" forces a numeric comparison; without it awk compares as strings as soon
# as one operand does not look like a number.
if LC_ALL=C awk -v a="$load_per_core" -v m="${LOAD_PER_CORE_MAX:-1.50}" \
  'BEGIN{exit (a+0>m+0)?0:1}'; then
  add_alert "Hohe CPU-Last: ${load1} (=${load_per_core}/Core, Limit ${LOAD_PER_CORE_MAX}/Core)"
fi

# --- RAM usage (based on MemAvailable) ---
# used% = (1 - available/total) * 100. When /proc/meminfo has no MemAvailable
# line, MemFree + Buffers + Cached is used as an approximation instead of
# treating the missing value as 0 (which would read as 100 % used).
mem_used_pct=$(LC_ALL=C awk '
  /^MemTotal:/     { total = $2 }
  /^MemAvailable:/ { avail = $2; have_avail = 1 }
  /^MemFree:/      { free = $2 }
  /^Buffers:/      { buffers = $2 }
  /^Cached:/       { cached = $2 }
  END {
    if (!have_avail) avail = free + buffers + cached
    if (total > 0) printf "%.0f", (1 - avail / total) * 100
    else print 0
  }
' /proc/meminfo)
if [ "${mem_used_pct:-0}" -gt "${MEM_USED_MAX:-90}" ]; then
  add_alert "RAM hoch: ${mem_used_pct}% (Limit ${MEM_USED_MAX}%)"
fi

# --- Swap: usage + activity (pages/s) ---
swap_used_pct=$(LC_ALL=C awk '
  /^SwapTotal:/ { total = $2 }
  /^SwapFree:/  { free = $2 }
  END { if (total > 0) printf "%.0f", (1 - free / total) * 100; else print 0 }
' /proc/meminfo)

# Sample the swap-in/swap-out page counters twice, one second apart; the
# difference is the rate in pages per second.
psin0=$(awk '$1 == "pswpin" {print $2}' /proc/vmstat 2>/dev/null)
psout0=$(awk '$1 == "pswpout" {print $2}' /proc/vmstat 2>/dev/null)
sleep 1
psin1=$(awk '$1 == "pswpin" {print $2}' /proc/vmstat 2>/dev/null)
psout1=$(awk '$1 == "pswpout" {print $2}' /proc/vmstat 2>/dev/null)
# Default every counter to 0 so that a missing counter yields a rate of 0
# instead of an arithmetic syntax error.
si=$((${psin1:-0} - ${psin0:-0}))
so=$((${psout1:-0} - ${psout0:-0}))

# High swap usage alone is not alarming; alert only when it coincides with
# swap traffic or with memory pressure.
if [ "${swap_used_pct:-0}" -gt "$SWAP_USED_MAX" ] && {
  [ "$si" -gt "$SWAP_SI_MIN" ] ||
    [ "$so" -gt "$SWAP_SO_MIN" ] ||
    [ "${mem_used_pct:-0}" -gt "$MEM_PRESSURE_FOR_SWAP_ALERT" ]
}; then
  add_alert "Swap hoch: ${swap_used_pct}% (si=${si}/s so=${so}/s)"
fi

# --- IO-wait (second 1 s sample) ---
# Alert threshold in percent; overridable via the environment, default 25.
IOWAIT_MAX=$(num_int "${IOWAIT_MAX:-25}")
IOWAIT_MAX="${IOWAIT_MAX:-25}"

# Fields of the aggregate "cpu" line in /proc/stat, in order:
# user nice system idle iowait irq softirq steal (guest time is already part
# of user). All eight are summed so that the iowait share is taken from the
# whole elapsed CPU time.
read -r _ u1 n1 s1 i1 w1 q1 sq1 st1 _ < <(grep '^cpu ' /proc/stat)
sleep 1
read -r _ u2 n2 s2 i2 w2 q2 sq2 st2 _ < <(grep '^cpu ' /proc/stat)
# ${x:-0}: a field that is absent from the line simply counts as 0.
total=$(((u2 - u1) + (n2 - n1) + (s2 - s1) + (i2 - i1) + (w2 - w1) \
  + (${q2:-0} - ${q1:-0}) + (${sq2:-0} - ${sq1:-0}) + (${st2:-0} - ${st1:-0})))
iow=$((w2 - w1))
iow_pct=$(LC_ALL=C awk -v i="$iow" -v t="$total" \
  'BEGIN{ if(t>0) printf "%.0f", (i*100)/t; else print 0 }')
if [ "${iow_pct:-0}" -gt "$IOWAIT_MAX" ]; then
  add_alert "IO-Wait hoch: ${iow_pct}% (Limit ${IOWAIT_MAX}%)"
fi

# --- Disks: space + inode usage ---
# Only entries of MOUNTS that are real mount points are checked; anything else
# is skipped.
for m in "${MOUNTS[@]}"; do
  mountpoint -q -- "$m" || continue

  # Column 5 of `df -P` is the use percentage; the '%' sign is removed.
  used=$(df -P -- "$m" 2>/dev/null | awk 'NR==2{gsub("%","",$5);print $5}')
  if [[ "$used" =~ ^[0-9]+$ ]] && [ "$used" -gt "$DISK_USED_MAX" ]; then
    add_alert "Disk fast voll: $m = ${used}% (Limit ${DISK_USED_MAX}%)"
  fi

  # df can report "-" instead of a percentage in the inode column (file
  # systems without inode accounting), so compare only real numbers.
  iused=$(df -Pi -- "$m" 2>/dev/null | awk 'NR==2{gsub("%","",$5);print $5}')
  if [[ "$iused" =~ ^[0-9]+$ ]] && [ "$iused" -gt "$INODE_USED_MAX" ]; then
    add_alert "Inodes knapp: $m = ${iused}% (Limit ${INODE_USED_MAX}%)"
  fi
done

# --- Load optional modules ---
# Modules are sourced at top level (not from inside a function) so that any
# variables they declare stay global. MODULE_DIR may be overridden via the
# environment.
MODULE_DIR="${MODULE_DIR:-/usr/local/lib/vps-healthcheck/modules}"

# Mailcow (optional)
if [ "${ENABLE_MAILCOW:-0}" = "1" ]; then
  if [ -r "${MODULE_DIR}/mailcow.sh" ]; then
    log "loading module: mailcow"
    # shellcheck source=/dev/null
    source "${MODULE_DIR}/mailcow.sh"
  else
    # An enabled module that cannot be read would otherwise be skipped without
    # any trace while the run still reports OK.
    echo "WARN: module enabled but not readable: ${MODULE_DIR}/mailcow.sh" >&2
  fi
fi

# Raspi (optional)
if [ "${ENABLE_RASPI:-0}" = "1" ]; then
  if [ -r "${MODULE_DIR}/raspi.sh" ]; then
    log "loading module: raspi"
    # shellcheck source=/dev/null
    source "${MODULE_DIR}/raspi.sh"
  else
    echo "WARN: module enabled but not readable: ${MODULE_DIR}/raspi.sh" >&2
  fi
fi

# --- Push the result (with the real network ping) ---
NET_PING_MS="$(measure_ping)"
log "ping(${PING_MODE}) -> ${NET_PING_MS} ms @ ${PING_TARGET:-$PING_TCP_URL}"

if [ "${#alerts[@]}" -gt 0 ]; then
  # Join all alerts with "; " and drop the separator left after the last one.
  msg="⚠ $(hostname) – $(printf '%s; ' "${alerts[@]}")"
  msg="${msg%; }"
  # Only the first 900 characters of the message are sent.
  push_to_kuma "down" "${msg:0:900}" "$NET_PING_MS"
  # printf instead of `echo -e`: alert text is printed verbatim, backslashes
  # in it are not interpreted as escape sequences.
  printf '%s\n' "$msg"
else
  push_to_kuma "up" "OK ($(hostname))" "$NET_PING_MS"
  echo "OK"
fi