70 lines
2.0 KiB
Bash
70 lines
2.0 KiB
Bash
#!/usr/bin/env bash
#
# Check recent vzdump LXC backup logs and print a one-line,
# monitoring-plugin style status.
#
# Usage: <script> [DAYS]
#   DAYS  look-back window in days, handed to `find -mtime -DAYS` (default: 7)
#
# Exit codes: 0 = OK, 2 = CRITICAL, 3 = UNKNOWN.

# Fail on unset variables. -e / pipefail are not enabled because grep's
# non-zero "no match" status is part of the normal control flow below.
set -u

DAYS="${1:-7}"
readonly LOGDIR="/var/log/vzdump"

readonly STATE_OK=0
readonly STATE_CRITICAL=2
readonly STATE_UNKNOWN=3

# Patterns are kept in variables so each one is written exactly once.
readonly days_re='^[0-9]+([.][0-9]+)?$'
readonly lxc_name_re='^lxc-([0-9]+)\.log$'
# NOTE(review): the marker accepts "VM" as well as "CT" - presumably because
# vzdump labels containers as VM in its logs; confirm against real log files.
readonly start_re='Starting Backup of (CT|VM) [0-9]+'
readonly error_re='TASK ERROR|Backup of (CT|VM) [0-9]+ failed|^ERROR:'
readonly success_re='(^|\s)TASK OK(\s|$)|Backup job finished successfully'

# A malformed DAYS would make find fail; since find's stderr is discarded below
# that would surface as a misleading "no logs found" CRITICAL. Report it as
# UNKNOWN instead.
if [[ ! "$DAYS" =~ $days_re ]]; then
  echo "UNKNOWN - invalid DAYS value '${DAYS}' (expected a non-negative number)"
  exit "$STATE_UNKNOWN"
fi

if [[ ! -d "$LOGDIR" ]]; then
  echo "UNKNOWN - ${LOGDIR} not found"
  exit "$STATE_UNKNOWN"
fi

# Candidate logs: regular files directly in LOGDIR, modified within the
# window, whose name contains "lxc" and ends in ".log" (this single pattern
# also covers the narrower 'lxc-*.log').
mapfile -t files < <(find "$LOGDIR" -maxdepth 1 -type f -mtime "-$DAYS" \
  -name '*lxc*.log' 2>/dev/null)

if [[ "${#files[@]}" -eq 0 ]]; then
  echo "CRITICAL - no LXC vzdump logs found in ${LOGDIR} within last ${DAYS} days"
  exit "$STATE_CRITICAL"
fi

relevant=0   # logs recognised as backup logs
bad=0        # recognised logs with an error or without a success marker
bad_list=()  # one human-readable entry per bad log

for f in "${files[@]}"; do
  name="${f##*/}"

  # A log is relevant if it is named lxc-<id>.log or contains the
  # "Starting Backup of ..." marker. The ID comes from whichever matched.
  if [[ "$name" =~ $lxc_name_re ]]; then
    ctid="${BASH_REMATCH[1]}"
  else
    marker="$(grep -oE "$start_re" "$f" 2>/dev/null | head -n1)"
    [[ -n "$marker" ]] || continue
    # grep -o prints only the match ("Starting Backup of VM 101"), so the ID
    # is its last word.
    ctid="${marker##* }"
  fi
  relevant=$((relevant + 1))

  # Error detection: the last matching line doubles as the hint. Every error
  # pattern contains non-blank text, so an empty hint means "no error line".
  hint="$(grep -E "$error_re" "$f" 2>/dev/null | tail -n1 | sed 's/^[[:space:]]*//')"
  if [[ -n "$hint" ]]; then
    bad=$((bad + 1))
    bad_list+=("CT ${ctid}: ${hint}")
    continue
  fi

  # Success detection: a log with neither an error nor a success marker is
  # treated as bad.
  if ! grep -qE "$success_re" "$f" 2>/dev/null; then
    bad=$((bad + 1))
    bad_list+=("CT ${ctid}: no success marker found")
  fi
done

if [[ "$relevant" -eq 0 ]]; then
  echo "CRITICAL - no LXC (CT) backup logs found in last ${DAYS} days"
  exit "$STATE_CRITICAL"
fi

if [[ "$bad" -gt 0 ]]; then
  # Show at most five entries to keep the status line short.
  details="$(printf '%s; ' "${bad_list[@]:0:5}")"
  [[ "$bad" -gt 5 ]] && details="${details}...(+$((bad - 5)) more)"
  echo "CRITICAL - ${bad}/${relevant} LXC backup log(s) not OK in last ${DAYS} days - ${details}"
  exit "$STATE_CRITICAL"
fi

echo "OK - ${relevant} LXC backup log(s) OK in last ${DAYS} days"
exit "$STATE_OK"