check_pve_backup_lxc.sh/check_pve_backup_lxc.sh

70 lines
2.0 KiB
Bash

#!/usr/bin/env bash
#
# Monitoring check for Proxmox VE container (LXC) backups.
#
# Looks in the vzdump log directory for LXC log files modified within the
# last DAYS days and inspects each one for error and success markers.
#
# Usage:  check_pve_backup_lxc.sh [DAYS]
#   DAYS  look-back window handed to `find -mtime` (default: 7)
#
# Exit codes:
#   0  OK        every relevant log has a success marker and no error marker
#   2  CRITICAL  no logs found, or at least one log has an error marker or
#                lacks a success marker
#   3  UNKNOWN   DAYS is not a number, or the log directory does not exist
set -u

readonly DAYS="${1:-7}"
readonly LOGDIR="/var/log/vzdump"
readonly STATE_OK=0
readonly STATE_CRITICAL=2
readonly STATE_UNKNOWN=3

# Extended regular expressions (grep -E) used to classify a log file.
# NOTE(review): confirm against real per-guest vzdump logs that success is
# really signalled by "TASK OK" / "Backup job finished successfully" and that
# "ERROR:" appears at the start of a line (not after a timestamp).
readonly START_RE='Starting Backup of (CT|VM) [0-9]+'
readonly ERROR_RE='TASK ERROR|Backup of (CT|VM) [0-9]+ failed|^ERROR:'
readonly SUCCESS_RE='(^|\s)TASK OK(\s|$)|Backup job finished successfully'

# Reject a malformed DAYS up front. Otherwise find fails, its error is hidden
# by the 2>/dev/null below, and the check reports a misleading
# "CRITICAL - no logs found". Fractions are let through rather than rejected
# so that any numeric value is still passed to find unchanged.
if [[ ! "$DAYS" =~ ^[0-9]+([.][0-9]+)?$ ]]; then
  printf '%s\n' "UNKNOWN - invalid DAYS argument '${DAYS}' (expected a number)"
  exit "$STATE_UNKNOWN"
fi

if [[ ! -d "$LOGDIR" ]]; then
  printf '%s\n' "UNKNOWN - ${LOGDIR} not found"
  exit "$STATE_UNKNOWN"
fi

# Candidate logs: regular files directly inside LOGDIR whose name contains
# "lxc" and ends in ".log", modified less than DAYS days ago.
# find's stderr is discarded on purpose so the plugin prints a single line.
mapfile -t files < <(find "$LOGDIR" -maxdepth 1 -type f -mtime "-$DAYS" \
  -name '*lxc*.log' 2>/dev/null)

if [[ "${#files[@]}" -eq 0 ]]; then
  printf '%s\n' "CRITICAL - no LXC vzdump logs found in ${LOGDIR} within last ${DAYS} days"
  exit "$STATE_CRITICAL"
fi

relevant=0   # number of logs recognised as backup logs
bad=0        # number of relevant logs that are not OK
bad_list=()  # one "CT <id>: <reason>" entry per bad log

for f in "${files[@]}"; do
  name="${f##*/}"
  ctid=""

  # A log is relevant when it is named lxc-<id>.log, or when its content
  # carries a "Starting Backup of ..." marker.
  if [[ "$name" =~ ^lxc-([0-9]+)\.log$ ]]; then
    ctid="${BASH_REMATCH[1]}"
  elif ! grep -qE -- "$START_RE" "$f" 2>/dev/null; then
    continue
  fi
  relevant=$((relevant + 1))

  # No ID in the file name: take it from the first start marker. grep -o
  # prints only the matched text ("Starting Backup of CT 123"), so the ID is
  # the last whitespace-separated word of that match.
  if [[ -z "$ctid" ]]; then
    marker="$(grep -oE -- "$START_RE" "$f" 2>/dev/null | head -n1)"
    ctid="${marker##* }"
  fi
  [[ -n "$ctid" ]] || ctid="unknown"

  # Error markers take precedence over success markers; the last matching
  # line, stripped of leading whitespace, is reported as the reason.
  if grep -qE -- "$ERROR_RE" "$f" 2>/dev/null; then
    bad=$((bad + 1))
    hint="$(grep -E -- "$ERROR_RE" "$f" 2>/dev/null | tail -n1 | sed 's/^[[:space:]]*//')"
    bad_list+=("CT ${ctid}: ${hint}")
    continue
  fi

  # Without an error marker the log must still prove success explicitly.
  if ! grep -qE -- "$SUCCESS_RE" "$f" 2>/dev/null; then
    bad=$((bad + 1))
    bad_list+=("CT ${ctid}: no success marker found")
  fi
done

if [[ "$relevant" -eq 0 ]]; then
  printf '%s\n' "CRITICAL - no LXC (CT) backup logs found in last ${DAYS} days"
  exit "$STATE_CRITICAL"
fi

if [[ "$bad" -gt 0 ]]; then
  # Show at most five reasons so the status line stays short.
  details="$(printf '%s; ' "${bad_list[@]:0:5}")"
  if [[ "$bad" -gt 5 ]]; then
    details+="...(+$((bad - 5)) more)"
  fi
  printf '%s\n' "CRITICAL - ${bad}/${relevant} LXC backup log(s) not OK in last ${DAYS} days - ${details}"
  exit "$STATE_CRITICAL"
fi

printf '%s\n' "OK - ${relevant} LXC backup log(s) OK in last ${DAYS} days"
exit "$STATE_OK"