#!/bin/sh
# keyboot-be-upgrade — clone the running BE and chroot into the clone to upgrade
# it in isolation, then boot into it (ADR 0005/0007, SPEC §14.2).
#
# The whole point of a boot environment: never upgrade in place. We recursively
# clone the running BE (root + every dataset under it — `zfs clone` is NOT
# recursive, so we clone each one), mount the clone set at a scratch altroot,
# bind the API filesystems, and chroot. You run the distro's package manager /
# kernel build inside; if it botches, the running BE is untouched and you just
# don't boot the clone. On success, set it boot-next (boot-once, auto-reverts)
# or bootfs and reboot.
#
# Cross-distro: POSIX sh + busybox-compatible mounts (no --rbind/--make-rslave).
#
# Usage:
#   keyboot-be-upgrade [options] [-- CMD ...]
#     --new-be NAME   clone dataset name under <pool>/ROOT (default <be>-<UTCstamp>)
#     --altroot DIR   scratch mountpoint (default /run/keyboot/upgrade.<stamp>)
#     --set-bootfs    on clean exit, `zpool set bootfs=<clone>` (permanent default)
#     --keep-mounted  don't tear down on exit (inspect the chroot yourself)
#     --dry-run       print the zfs/mount/chroot commands, do nothing
#     -- CMD ...      run CMD in the chroot instead of an interactive shell
#                     (e.g. -- apk upgrade -U ; for unattended/test use)
#
# On success it prints the clone name and the exact `keyboot-install be ...` /
# reboot next step. It does NOT auto-reboot.

# Treat any reference to an unset variable as a fatal error.
set -u

# Name used as the prefix of every diagnostic line.
PROG='keyboot-be-upgrade'

# Option state; each value is the default used when the matching flag is absent.
NEW_BE=''           # --new-be NAME    (empty: derived from the running BE + stamp)
ALTROOT=''          # --altroot DIR    (empty: a stamped default is chosen later)
SET_BOOTFS=0        # --set-bootfs     (1 = set the pool's bootfs on clean exit)
KEEP_MOUNTED=0      # --keep-mounted   (1 = skip teardown on exit)
DRYRUN=0            # --dry-run        (1 = print commands instead of running them)
CMD=''              # everything after "--", joined into one string

# die MSG... — write "<PROG>: error: MSG" to stderr and terminate with status 2.
die() {
    printf '%s: error: %s\n' "$PROG" "$*" >&2
    exit 2
}
# log MSG... — write "<PROG>: MSG" to stderr (stdout is left untouched).
log() {
    printf '%s: %s\n' "$PROG" "$*" >&2
}
# run CMD [ARG...] — execute CMD, or under --dry-run only echo it to stderr.
# Returns CMD's exit status, or 0 when the command was merely printed.
run() {
    if [ "$DRYRUN" -eq 1 ]; then
        printf '[dry-run] %s\n' "$*" >&2
        return 0
    else
        "$@"
    fi
}

# Parse the command line. Value options accept both "--opt VALUE" and
# "--opt=VALUE". Everything after "--" is joined into CMD and later handed to
# `sh -c` inside the chroot. Any other positional argument is an error.
while [ $# -gt 0 ]; do
    case "$1" in
        --new-be)
            # Explicit check so a missing value yields our own diagnostic
            # instead of the shell's raw "parameter not set" abort.
            [ -n "${2:-}" ] || die "--new-be requires a value"
            NEW_BE="$2"; shift 2 ;;
        --new-be=*)     NEW_BE="${1#*=}"; shift ;;
        --altroot)
            [ -n "${2:-}" ] || die "--altroot requires a value"
            ALTROOT="$2"; shift 2 ;;
        --altroot=*)    ALTROOT="${1#*=}"; shift ;;
        --set-bootfs)   SET_BOOTFS=1; shift ;;
        --keep-mounted) KEEP_MOUNTED=1; shift ;;
        --dry-run)      DRYRUN=1; shift ;;
        -h|--help)
            # Print the leading comment block of this script (everything after
            # the shebang up to the first non-comment line), with the "# "
            # prefix stripped. Scanning for the end of the block instead of
            # using a fixed line range keeps code lines out of the help text.
            awk 'NR == 1 { next } /^#/ { sub(/^# ?/, ""); print; next } { exit }' "$0"
            exit 0 ;;
        --)             shift; CMD="$*"; break ;;
        -*)             die "unknown option: $1" ;;
        *)              die "unexpected arg: $1 (did you mean '-- $*'?)" ;;
    esac
done

# Preconditions: we need root privileges and the zfs CLI on PATH.
if [ "$(id -u)" != 0 ]; then
    die "must run as root"
fi
if ! command -v zfs >/dev/null 2>&1; then
    die "zfs not found"
fi

# 1. Identify the running BE: the first /proc/mounts entry whose mountpoint is
#    "/" and whose filesystem type is zfs (no util-linux needed).
BE="$(awk '$2=="/" && $3=="zfs"{print $1; exit}' /proc/mounts)"
if [ -z "$BE" ]; then
    die "no zfs dataset mounted at / (is this a keyboot ZFS root?)"
fi
case "$BE" in
    */ROOT/*) ;;
    *) log "warning: $BE is not under <pool>/ROOT (proceeding anyway)" ;;
esac

STAMP="$(date -u +%Y-%m-%d-%H%MZ)"
POOL="${BE%%/*}"                            # first path component
ROOTPREFIX="${BE%/*}"                       # <pool>/ROOT
BASENAME="${BE##*/}"                        # e.g. alpine, or alpine-2026-06-05-1823Z

# Re-stamp the STEM, not the full leaf: drop a trailing -YYYY-MM-DD-HHMMZ so
# repeated upgrades give <stem>-<newstamp> instead of stacking stamps
# (ADR 0012; canonical keyboot_be_stem in keyboot-install-os/lib/benaming.sh).
# The suffix removal is a no-op when the leaf carries no such stamp.
BASENAME="${BASENAME%-[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]-[0-9][0-9][0-9][0-9]Z}"

# Default clone name is <stem>-<stamp>; a name containing "/" is taken as a
# full dataset path, anything else is placed next to the running BE.
: "${NEW_BE:=${BASENAME}-${STAMP}}"
case "$NEW_BE" in
    */*) DST="$NEW_BE" ;;
    *)   DST="${ROOTPREFIX}/${NEW_BE}" ;;
esac
if [ "$DST" = "$BE" ]; then
    die "clone name equals the running BE"
fi
if zfs list -H -o name "$DST" >/dev/null 2>&1; then
    die "destination $DST already exists"
fi

: "${ALTROOT:=/run/keyboot/upgrade.${STAMP}}"
SNAP="${BE}@${STAMP}-PREUPGRADE"

log "running BE : $BE"
log "clone      : $DST"
log "snapshot   : $SNAP"
log "altroot    : $ALTROOT"

# Cleanup runs on ANY exit so we never leave the chroot half-mounted.
MOUNTS=""                                   # newline list, unmounted in reverse

# do_cleanup — tear down everything recorded in MOUNTS and remove the altroot.
# Does nothing (beyond a note) under --keep-mounted, and nothing under --dry-run.
# Unmount failures are deliberately ignored: this is best-effort teardown.
do_cleanup() {
    if [ "$KEEP_MOUNTED" -eq 1 ]; then
        log "left mounted at $ALTROOT (--keep-mounted): clone $DST"
        return
    fi
    [ "$DRYRUN" -eq 1 ] && return
    # Unmount everything we mounted, deepest/last first. The sed program
    # reverses the line order (a portable `tac`); on a busy mount we fall back
    # to a lazy unmount.
    printf '%s\n' "$MOUNTS" | sed '1!G;h;$!d' | while IFS= read -r m; do
        [ -n "$m" ] || continue
        umount "$m" 2>/dev/null || umount -l "$m" 2>/dev/null || true
    done
    rmdir "$ALTROOT" 2>/dev/null || true
}

# on_signal STATUS — handler for fatal signals. A trap action that merely
# returns lets the script RESUME after the handler, which would leave it
# mounting/chrooting into an altroot that was just torn down. So: disarm the
# traps (cleanup must run exactly once), clean up, and exit with 128+signo.
on_signal() {
    trap - EXIT HUP INT TERM
    do_cleanup
    exit "$1"
}

trap do_cleanup EXIT
trap 'on_signal 129' HUP
trap 'on_signal 130' INT
trap 'on_signal 143' TERM

# mark PATH — record PATH as mounted by appending it, on its own line, to the
# MOUNTS list so cleanup can unmount it later.
mark() {
    MOUNTS="$MOUNTS
$1"
}

# 2. Recursive snapshot (atomic across the subtree) + recursive clone.
run zfs snapshot -r "$SNAP" || die "snapshot failed"

# Enumerate the subtree (parent-first order from zfs list -r) and clone each
# dataset's snapshot into the mirror path under $DST. canmount=noauto so the
# clones never auto-mount at their inherited mountpoint and fight the live root.
SETS="$(zfs list -H -o name -r "$BE")" || die "zfs list failed"
# Iterate line by line rather than with an unquoted `for ds in $SETS`: dataset
# names may contain spaces, and word-splitting would break such a name into
# bogus pieces. The list is fed on fd 3 so the commands in the loop body keep
# the script's real stdin.
while IFS= read -r ds <&3; do
    [ -n "$ds" ] || continue                # empty list yields one blank line
    suffix="${ds#"$BE"}"                    # "" for the root, "/home" for a child
    run zfs clone -o canmount=noauto "${ds}@${STAMP}-PREUPGRADE" "${DST}${suffix}" \
        || die "clone ${ds} failed"
done 3<<EOF
$SETS
EOF
# A BE root must be mountpoint=/ : `zfs clone` does NOT copy the source's local
# mountpoint, so the clone would inherit `none` from <pool>/ROOT and be-unlock's
# `zfs mount` fails with "no mountpoint set" at boot. Set it on the clone root;
# children re-inherit the right path (/home, …) since theirs stays inherited.
run zfs set mountpoint=/ "$DST" || die "set mountpoint=/ on clone failed"

# 3. Mount the clone set at the altroot. Root first (explicit path), then each
#    child at <altroot><child-mountpoint> (children inherit a real mountpoint
#    like /home). mount.zfs -o zfsutil mounts a dataset at an arbitrary path
#    without touching its mountpoint property.
run mkdir -p "$ALTROOT"
run mount -t zfs -o zfsutil "$DST" "$ALTROOT" || die "mount $DST failed"
mark "$ALTROOT"
# Walk $SETS one line at a time (not via an unquoted `for`), so a dataset name
# containing spaces stays intact. fd 3 carries the list, leaving stdin alone.
while IFS= read -r ds <&3; do
    [ -n "$ds" ] || continue                # empty list yields one blank line
    suffix="${ds#"$BE"}"
    [ -n "$suffix" ] || continue            # skip the root, already mounted
    cds="${DST}${suffix}"
    # Use the clone's effective mountpoint when it is an absolute path; for
    # anything else (none/legacy, or an empty answer because the clone does not
    # exist under --dry-run) fall back to the dataset's path relative to the BE.
    mp="$(zfs get -H -o value mountpoint "$cds" 2>/dev/null)"
    case "$mp" in /*) tgt="${ALTROOT}${mp}" ;; *) tgt="${ALTROOT}${suffix}" ;; esac
    run mkdir -p "$tgt"
    run mount -t zfs -o zfsutil "$cds" "$tgt" || die "mount $cds at $tgt failed"
    mark "$tgt"
done 3<<EOF
$SETS
EOF

# 4. API filesystems (busybox-safe: plain bind + typed mounts, no rbind).
#    Each target is recorded with mark right after its mount attempt so the
#    exit-time cleanup knows to unmount it.
for api_dir in dev dev/pts proc sys run; do
    run mkdir -p "${ALTROOT}/${api_dir}"
done

run mount -o bind /dev "${ALTROOT}/dev"
mark "${ALTROOT}/dev"

if [ -d /dev/pts ]; then
    run mount -o bind /dev/pts "${ALTROOT}/dev/pts"
    mark "${ALTROOT}/dev/pts"
fi

run mount -t proc proc "${ALTROOT}/proc"
mark "${ALTROOT}/proc"

run mount -t sysfs sys "${ALTROOT}/sys"
mark "${ALTROOT}/sys"

run mount -t tmpfs tmpfs "${ALTROOT}/run"
mark "${ALTROOT}/run"

# Copy the host's resolver configuration into the clone when it is readable.
if [ -r /etc/resolv.conf ]; then
    run cp /etc/resolv.conf "${ALTROOT}/etc/resolv.conf"
fi

# 5. Chroot: run CMD (automation/test) or an interactive login shell. The
#    chroot's exit status is kept in rc; a dry run of the interactive case only
#    prints the command and counts as success.
log "entering chroot $ALTROOT (clone $DST) — upgrade away; exit when done"
if [ -n "$CMD" ]; then
    run chroot "$ALTROOT" /bin/sh -c "$CMD"
    rc=$?
elif [ "$DRYRUN" -eq 1 ]; then
    printf '[dry-run] chroot %s /bin/sh -l\n' "$ALTROOT" >&2
    rc=0
else
    chroot "$ALTROOT" /bin/sh -l
    rc=$?
fi
log "chroot exited rc=$rc"

# 6. Adopt: optionally set bootfs; always print the next step. Teardown is the
#    trap. The clone persists (unmounted) as a boot candidate. The script's exit
#    status is the chroot's.
if [ "$rc" -eq 0 ] && [ "$SET_BOOTFS" -eq 1 ]; then
    if run zpool set "bootfs=${DST}" "$POOL"; then
        log "set bootfs=${DST} on ${POOL}"
    fi
fi

log "DONE. Upgraded clone: ${DST}"

if [ "$rc" -ne 0 ]; then
    log "chroot reported failure; the running BE ${BE} is untouched. Discard: zfs destroy -r ${DST}"
    exit "$rc"
fi

log "next: boot it ONCE (auto-reverts if broken):  keyboot-install be boot-next ${DST}"
log "  or permanently:                              keyboot-install be promote ${POOL} ${DST}"
log "  then reboot. To discard:                     zfs destroy -r ${DST}"
exit "$rc"
