#!/bin/sh
# keyboot-autosnap — cross-distro recursive ZFS auto-snapshotter (ADR 0006).
#
# A lean, POSIX-sh remake of zfsonlinux/zfs-auto-snapshot with keyboot's naming
# convention so the snapshots are (a) unambiguous during recovery, (b) sort
# chronologically for retention, and (c) form a consistent point-in-time across
# the whole boot-environment subtree that keyboot's rollback UI can group.
#
# Naming:  <dataset>@<YYYY-MM-DD-HHMM>Z-<LABEL>      (UTC, explicit Z, 24h)
#   e.g.   rpool/ROOT/gentoo@2026-06-04-1600Z-DAILY
#   LABEL in {FREQUENT,HOURLY,DAILY,WEEKLY,MONTHLY} (free-form; you pick per cron).
#
# Why UTC, not local time: local names are non-monotonic across DST (the
# fall-back hour repeats), which silently breaks lexical-sort retention and
# "keep newest N", and is ambiguous exactly when you're recovering. UTC sorts
# lexically == chronologically. keyboot renders local at display from the BE's
# timezone (see ADR 0006). The `Z` answers "is 1600 UTC?" at a glance.
#
# Lockstep: snapshots use `zfs snapshot -r`, so the whole subtree shares
# one atomic name (same txg) — a point-in-time is always complete. Retention
# destroys `<target>@<name>` with `-r`, cascading to the children, so no child
# snapshot is ever orphaned. Per-CHILD opt-out is deliberately NOT supported
# (it would break point-in-time consistency, ADR 0006/0007); opt out a whole
# target with `com.sun:auto-snapshot=false`.
#
# Usage:
#   keyboot-autosnap --label LABEL --keep N [--prefix PREFIX] [--dry-run] [TARGET ...]
#     TARGET   a pool or dataset (snapshotted recursively). Default: every pool.
#     --keep N keep the newest N snapshots of this LABEL per target; purge older.
#              N=0 disables purging (snapshot only).
#     --label  the LABEL suffix (required).
#     --dry-run print the zfs commands instead of running them.
#     --prefix override the property namespace checked for opt-out
#              (default com.sun:auto-snapshot; per-label: <prefix>:<lclabel>).
#
# Exit status is non-zero if any snapshot or destroy failed.

# Expanding an unset variable is an error rather than a silent "".
set -o nounset

# Name printed in front of every diagnostic.
PROG=keyboot-autosnap

# Command-line state. Empty / 0 means "not given on the command line".
TARGETS=""                            # space-separated pools/datasets to snapshot
PROP_PREFIX="com.sun:auto-snapshot"   # opt-out property namespace (--prefix)
DRYRUN=0                              # 1 = print zfs commands instead of running
KEEP=                                 # --keep N, checked after option parsing
LABEL=                                # --label suffix, checked after option parsing

# die MESSAGE... — write "<PROG>: error: MESSAGE" to stderr and exit with
# status 2. All arguments are joined with spaces into one message.
die() {
    printf '%s: error: %s\n' "$PROG" "$*" >&2
    exit 2
}
# warn MESSAGE... — write "<PROG>: MESSAGE" to stderr; does not exit.
warn() {
    printf '%s: %s\n' "$PROG" "$*" >&2
}
# run CMD [ARG...] — execute CMD and return its exit status. When DRYRUN is 1
# the command line is printed to stderr with a "[dry-run] " prefix instead,
# nothing is executed, and the result is always success.
run() {
    # A bare `return` hands back the status of the command that just ran.
    [ "$DRYRUN" -eq 1 ] || { "$@"; return; }
    printf '[dry-run] %s\n' "$*" >&2
    return 0
}

# Parse the command line. Value options accept both "--opt VALUE" and
# "--opt=VALUE"; a missing VALUE makes the ${2:?...} expansion print the given
# message and abort. Every non-option word is appended to TARGETS
# (space-separated). Validation of LABEL/KEEP happens after this loop.
while [ $# -gt 0 ]; do
    case "$1" in
        --label)    LABEL="${2:?--label needs a value}"; shift 2 ;;
        --label=*)  LABEL="${1#*=}"; shift ;;
        --keep)     KEEP="${2:?--keep needs a value}"; shift 2 ;;
        --keep=*)   KEEP="${1#*=}"; shift ;;
        --prefix)   PROP_PREFIX="${2:?--prefix needs a value}"; shift 2 ;;
        --prefix=*) PROP_PREFIX="${1#*=}"; shift ;;
        --dry-run)  DRYRUN=1; shift ;;
        -h|--help)
            # Print the script's leading comment block as help: skip the
            # shebang, strip "#" plus one optional space from each line, and
            # stop at the first line that is not a comment. Unlike a fixed
            # line range this cannot cut the option list short when the
            # header grows. The script is fed on stdin so awk never parses
            # its path as an operand.
            awk 'NR == 1 { next } /^#/ { sub(/^# ?/, ""); print; next } { exit }' < "$0"
            exit 0 ;;
        --)
            # End of options: everything that follows is a target, even if
            # it starts with "-".
            shift
            while [ $# -gt 0 ]; do TARGETS="$TARGETS $1"; shift; done ;;
        -*)         die "unknown option: $1" ;;
        *)          TARGETS="$TARGETS $1"; shift ;;
    esac
done

# strip_leading_zeros DIGITS — print DIGITS without leading zeros, keeping a
# single "0" for an all-zero string. DIGITS is expected to consist of decimal
# digits only; anything else is printed unchanged.
strip_leading_zeros() {
    _n="$1"
    while :; do
        case "$_n" in
            0?*) _n="${_n#0}" ;;   # a zero followed by at least one more char
            *)   break ;;
        esac
    done
    printf '%s\n' "$_n"
}

# Post-parse validation: a label is mandatory, --keep must be all digits, and
# the zfs binary must be on PATH.
[ -n "$LABEL" ] || die "--label is required (e.g. DAILY)"
case "$KEEP" in ''|*[!0-9]*) die "--keep must be a non-negative integer";; esac
# Canonicalise KEEP. `test` compares "010" as decimal 10, but shell arithmetic
# ($(( total - KEEP )) in the retention step) reads a leading 0 as octal, so
# "010" would keep 8 snapshots and "08" would be an arithmetic error after the
# snapshot had already been taken.
KEEP="$(strip_leading_zeros "$KEEP")"
command -v zfs >/dev/null 2>&1 || die "zfs not found"

# The UTC timestamp is taken a single time, before any target is processed,
# so all snapshots created by this invocation carry the same name.
if ! STAMP="$(date -u '+%Y-%m-%d-%H%MZ')"; then
    die "date failed"
fi
SNAPNAME="$STAMP-$LABEL"

# No target on the command line: fall back to the names printed by
# `zpool list`. An empty list is not an error; there is simply nothing to do.
case "${TARGETS# }" in
    '')
        TARGETS="$(zpool list -H -o name 2>/dev/null)" || die "zpool list failed"
        if [ -z "$TARGETS" ]; then
            warn "no pools imported; nothing to do"
            exit 0
        fi
        ;;
esac

# excluded TARGET — succeed (status 0) when TARGET has opted out of
# snapshotting, i.e. when either the property $PROP_PREFIX or the per-label
# property $PROP_PREFIX:<lower-cased LABEL> reads exactly "false" on it.
# Any other value, or a failing `zfs get`, counts as "not excluded" (status 1).
excluded() {
    _ds="$1"
    _label_lc="$(printf '%s' "$LABEL" | tr '[:upper:]' '[:lower:]')"
    for _prop in "$PROP_PREFIX" "${PROP_PREFIX}:${_label_lc}"; do
        _val="$(zfs get -H -o value "$_prop" "$_ds" 2>/dev/null)"
        if [ "$_val" = "false" ]; then
            return 0
        fi
    done
    return 1
}

# select_label_snaps TARGET LABEL
#   Filter. Reads snapshot names on stdin (one per line) and prints, sorted
#   oldest first, only those named exactly TARGET@<YYYY-MM-DD-HHMM>Z-LABEL.
#   TARGET and LABEL are quoted inside the case pattern and therefore matched
#   literally: a "." (legal in dataset names) or any other special character
#   cannot widen the match the way it would inside a regular expression.
#   LC_ALL=C makes the sort byte-wise, which for these UTC names is
#   chronological; -u collapses duplicate lines.
select_label_snaps() {
    while IFS= read -r _s; do
        case "$_s" in
            "$1"@[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]-[0-9][0-9][0-9][0-9]Z-"$2")
                printf '%s\n' "$_s" ;;
        esac
    done | LC_ALL=C sort -u
}

# purge_snaps
#   Reads snapshot names on stdin and destroys each one recursively (through
#   run, so --dry-run only prints). A failure is reported and the remaining
#   names are still processed. Returns 1 if any destroy failed, else 0, so the
#   caller can record the failure even though a pipeline stage may run in a
#   subshell where assignments to rc would be lost.
purge_snaps() {
    _failed=0
    while IFS= read -r _snap; do
        [ -n "$_snap" ] || continue
        # stdin is redirected so the command cannot consume the name list.
        if run zfs destroy -r "$_snap" </dev/null; then
            [ "$DRYRUN" -eq 1 ] || printf '%s: purged %s\n' "$PROG" "$_snap" >&2
        else
            warn "destroy $_snap FAILED"
            _failed=1
        fi
    done
    return "$_failed"
}

# Process every target; rc becomes 1 on a missing target, a failed snapshot
# or a failed destroy, and processing continues with the next target.
rc=0
# $TARGETS is split on whitespace on purpose, but it must not undergo
# pathname expansion: a stray "*" should be reported as a bad dataset, not
# turned into file names from the current directory.
set -f
for target in $TARGETS; do
    if ! zfs list -H -o name "$target" >/dev/null 2>&1; then
        warn "no such dataset: $target"
        rc=1
        continue
    fi
    if excluded "$target"; then
        warn "skip $target ($PROP_PREFIX=false)"
        continue
    fi

    # 1. Recursive, atomic snapshot of the whole subtree under one name.
    if run zfs snapshot -r "${target}@${SNAPNAME}"; then
        [ "$DRYRUN" -eq 1 ] || printf '%s: snapshot %s@%s\n' "$PROG" "$target" "$SNAPNAME" >&2
    else
        warn "snapshot ${target}@${SNAPNAME} FAILED"; rc=1; continue
    fi

    # 2. Retention: keep the newest $KEEP of this LABEL, destroy older (-r so
    #    the whole point-in-time goes, children included). -d 1 lists this
    #    target's own snapshots only (the point-in-time anchors); -r on
    #    destroy cascades to the children.
    [ "$KEEP" -eq 0 ] && continue
    snaps="$(
        {
            zfs list -H -o name -t snapshot -d 1 "$target" 2>/dev/null
            # In a dry run the snapshot above was only printed, not created.
            # Count it anyway so the previewed purge matches a real run.
            if [ "$DRYRUN" -eq 1 ]; then
                printf '%s\n' "${target}@${SNAPNAME}"
            fi
        } | select_label_snaps "$target" "$LABEL"
    )"
    [ -n "$snaps" ] || continue
    total="$(printf '%s\n' "$snaps" | wc -l | tr -d ' ')"
    [ "$total" -gt "$KEEP" ] || continue
    drop=$(( total - KEEP ))
    # The pipeline's status is that of its last stage, i.e. purge_snaps.
    printf '%s\n' "$snaps" | head -n "$drop" | purge_snaps || rc=1
done

# Exit 0 unless the loop above recorded a failure in rc.
exit "$rc"
