fix: add JD01 GC retention controls

2026-07-05 04:18:22 +00:00
parent e956a0ec2a
commit ab3566435c
18 changed files with 2205 additions and 1240 deletions
@@ -4,7 +4,7 @@ UniDesk 是一个以主 server 为统一入口的分布式工作平台。本文
 ## P0: 文件体积与脚本分流
- P0: 任何源码/CLI 文件超过 3000 行必须先按职责拆分再继续，禁止继续追加绕过。
+- P0: 任何源码/CLI 文件超过 3000 行必须先按职责拆分到 2000 行以下再继续，禁止卡在 3000 行边界反复触发。
 - P0: 禁止把 shell/Node/Python 等脚本作为大段字符串内嵌；脚本必须放入原生后缀文件（如 `.sh`/`.mjs`/`.py`）并从文件加载。
 ## P0: 主 worktree 同步提交第一原则
@@ -719,7 +719,7 @@ controlPlane:
            - hyueapi.com
            - .hyueapi.com
          retention:
-            maxRunners: 20
+            maxRunners: 3
            cleanupOrder: oldest-inactive-last-active-first
            activeHeartbeatMaxAgeMs: 900000
            selectors:
@@ -734,6 +734,13 @@ controlPlane:
            ageBasedCleanup:
              enabled: false
              maxAgeHours: 48
            sessionPvcRetention:
              enabled: true
              prefixes:
                - agentrun-v01-session-
                - agentrun-v02-session-
                - agentrun-jd01-v02-session-
              maxDeletePerRun: 1000
          cancelLifecycle:
            deliveryMode: manager-epoch
            gracefulAbortMs: 15000
@@ -133,8 +133,31 @@ gc:
          hwlabNode: JD01
          hwlabLane: v03
          agentrunNode: JD01
-          agentrunLane: v02
+          agentrunLane: jd01-v02
          limit: 80
        containerdImageCache:
          enabled: true
          runtimeEndpoint: unix:///run/k3s/containerd/containerd.sock
          namespace: k8s.io
          ciNamespaces:
            - hwlab-ci
            - agentrun-ci
        hostContainerdCache:
          enabled: true
          root: /var/lib/containerd
          address: /run/containerd/containerd.sock
          namespaces:
            - default
          orphanCleanup:
            enabled: true
            overlaySnapshotsRoot: /var/lib/containerd/io.containerd.snapshotter.v1.overlayfs/snapshots
            contentBlobRoot: /var/lib/containerd/io.containerd.content.v1.content/blobs/sha256
        localPathStorage:
          enabled: true
          root: /var/lib/rancher/k3s/storage
          orphanDirPrefixes:
            - pvc-
          orphanMinAgeMinutes: 0
        policyTimer:
          enabled: true
          name: unidesk-jd01-low-risk-gc
@@ -14,6 +14,8 @@ Local worktrees, D601 runtime files, copied scripts, copied images, ad-hoc Kuber
 When stable release lanes such as `release/v1` are enabled, the desired-state ref must be explicit in the command, job log and deploy output. Until that support exists, commands that are documented to read `origin/master:deploy.json` must keep doing so and must not silently switch to another branch or a dirty manifest.
 Source and CLI files must not be kept near the 3000-line split boundary. Once a file exceeds 3000 lines, split it by responsibility until the original file is below 2000 lines before continuing feature or fix work. Do not make token-preserving micro-edits that leave the file just under or exactly at 3000 lines; that only guarantees the next small change will trigger the same split problem again.
 ## Prohibited Deployment Truth
 The following practices are not acceptable as the long-term or hidden source of a working environment:
@@ -93,6 +93,14 @@ JD01 远端 plan 必须适配短连接：`snapshot` 和轻量 `plan` 返回有
 JD01 PVC 归因必须按 YAML 配置的 namespace 集合读取 k8s API，不得复用 G14 专属 namespace 硬编码。报告至少包含 namespace、PVC、PV、host path、requested size、estimated actual bytes、active mount pods、owner/session/PipelineRun/runId、phase 和 reclaim policy。默认只做 plan 和归因；删除 PVC/PV、local-path host path、k3s storage、containerd snapshot/blob 或 workload 对象必须通过对应高层 retention 子命令和 GitOps/运行面 owner 判定，不能由 remote GC 扩大成 raw `kubectl delete` 或 host path 删除。
 JD01/AgentRun 这类 PVC retention 确认入口必须适配短连接：确认步骤只提交经过 plan 选中的 Kubernetes 删除请求并快速返回，不能等待 local-path PV 后端同步回收完成；收敛状态通过下一次 dry-run、`gc remote JD01 status` 或专用 status 子命令查询。若一次提交在 transport 窗口内仍不稳定，应降低 YAML/CLI 批量，而不是改成手工 raw kubectl 或 host path 删除。
 JD01 local-path storage 中没有 PV 引用的 orphan 目录只能通过 `gc remote JD01 plan|run --include-local-path-orphans` 进入候选。该入口必须从 YAML 读取 storage root、目录前缀 allowlist 和年龄策略，只允许删除 root 的直接子目录，且执行前重新确认无 PV 引用、无 symlink、无打开 fd/cwd；不得把它扩大成通用 `/var/lib/rancher/k3s/storage` 清空或 raw host path 删除。
 JD01 host containerd 只能通过 `gc remote JD01 plan|run --include-host-containerd-cache` 进入候选。该入口必须从 YAML 读取 containerd root、socket address 和 namespace allowlist；只有 host containerd 目标 namespace 中没有 task/container 时才允许执行 `ctr images prune --all`，不得直接删除 `/var/lib/containerd` 下的 content、snapshot 或 metadata 路径。
 当 host containerd 的 `ctr` 元数据中 images、containers、tasks、leases、snapshots 和 content 全为空，但 YAML allowlist 下仍残留 overlay snapshot 目录或 content blob 文件时，才能把它们分类为 orphan state。orphan state 清理仍必须通过 `--include-host-containerd-cache` 的 plan/run，执行前重新检查元数据为空、路径在 YAML root 下、名称匹配受控形态、无 symlink、无打开 fd/cwd；不得删除 metadata DB 或扩大到 containerd root。
 JD01 Web observe artifact 是一等 GC 对象。state root 必须来自 YAML；候选按 run 聚合并读取 `manifest.json`、`heartbeat.json`、`pid`、report sha 和 top files。年龄判定以 manifest/heartbeat 的 started/completed/updated 字段、pid 存活和打开 fd 检查为准，不以目录 mtime 为唯一依据，因为手动 GC 或目录遍历可能刷新 mtime。active run、pid alive、open fd、未生成必要 report 的 run 均为 protected。safe 候选只覆盖超过 YAML retention 且可重建的 raw samples、browser-process、network/trace、screenshot 等大 artifact；长期保留 report summary、report json/md、最终截图或诊断摘要由 YAML cap/retention 策略控制。
 JD01 Chrome 内存治理应优先管理 observer runner 生命周期，而不是孤立清理 Chrome 进程。Web probe sentinel 和 quick-verify 启动 observer 后，所有终态路径（成功、blocked、失败、timeout、异常）都必须执行 YAML 控制的 `web-probe observe stop`/force stop 流程，并验证对应 runner/Chrome process tree 退出；observe runner 自身也必须从 scenario/YAML 获得最大运行时长或 max samples 兜底，即使调用方退出也会停止采样并关闭 browser。browser freeze policy 只能作为异常保护，不替代正常任务生命周期结束后的 stop。
@@ -223,6 +223,11 @@ export interface AgentRunRunnerRetentionSpec {
    readonly enabled: boolean;
    readonly maxAgeHours: number | null;
  };
  readonly sessionPvcRetention: {
    readonly enabled: boolean;
    readonly prefixes: readonly string[];
    readonly maxDeletePerRun: number;
  };
 }
 /** Closed string-literal union of the stage labels used for an AgentRun cancel lifecycle. */
 export type AgentRunCancelLifecycleStage = "accepted" | "persisted" | "delivered" | "aborting" | "terminalized" | "fenced" | "late-write-rejected";
@@ -701,6 +706,14 @@ function parseCancelLifecycleStages(input: unknown, path: string): readonly Agen
 function parseRunnerRetention(input: Record<string, unknown>, path: string): AgentRunRunnerRetentionSpec {
  const selectors = recordField(input, "selectors", path);
  const ageBasedCleanup = recordField(input, "ageBasedCleanup", path);
  const sessionPvcRetentionRaw = input.sessionPvcRetention;
  const sessionPvcRetention = typeof sessionPvcRetentionRaw === "object" && sessionPvcRetentionRaw !== null && !Array.isArray(sessionPvcRetentionRaw)
    ? sessionPvcRetentionRaw as Record<string, unknown>
    : {};
  const sessionPvcPrefixes = sessionPvcRetention.prefixes === undefined ? [] : stringArrayField(sessionPvcRetention, "prefixes", `${path}.sessionPvcRetention`);
  for (const [index, prefix] of sessionPvcPrefixes.entries()) {
    if (!/^[a-z0-9]([-a-z0-9]*[a-z0-9-])?$/u.test(prefix)) throw new Error(`${path}.sessionPvcRetention.prefixes[${index}] must be a lowercase Kubernetes PVC name prefix`);
  }
  return {
    maxRunners: positiveIntegerField(input, "maxRunners", path),
    cleanupOrder: enumField(input, "cleanupOrder", path, ["oldest-inactive-last-active-first"]),
@@ -716,6 +729,11 @@ function parseRunnerRetention(input: Record<string, unknown>, path: string): Age
      enabled: booleanField(ageBasedCleanup, "enabled", `${path}.ageBasedCleanup`),
      maxAgeHours: optionalPositiveIntegerField(ageBasedCleanup, "maxAgeHours", `${path}.ageBasedCleanup`) ?? null,
    },
    sessionPvcRetention: {
      enabled: sessionPvcRetention.enabled === undefined ? false : booleanField(sessionPvcRetention, "enabled", `${path}.sessionPvcRetention`),
      prefixes: sessionPvcPrefixes,
      maxDeletePerRun: optionalPositiveIntegerField(sessionPvcRetention, "maxDeletePerRun", `${path}.sessionPvcRetention`) ?? 100,
    },
  };
 }
@@ -0,0 +1,102 @@
 import { execFileSync, spawnSync } from "node:child_process";
 /**
  * Runs `kubectl` with the given arguments and returns its stdout parsed as JSON.
  * Throws when kubectl exits non-zero (execFileSync) or when stdout is not valid JSON.
  */
 function runJson(args) {
  const stdout = execFileSync("kubectl", args, { encoding: "utf8", maxBuffer: 32 * 1024 * 1024 });
  return JSON.parse(stdout);
 }
 /**
  * Estimates the on-disk size of a local-path volume directory with `du -sb`.
  * Returns null when the path is missing, is not under /var/lib/rancher/k3s/storage/,
  * when `du` does not exit with status 0 (including the 8 s timeout), or when its
  * first output field is not a finite number.
  */
 function duBytes(path) {
  const underStorageRoot = Boolean(path) && path.startsWith("/var/lib/rancher/k3s/storage/");
  if (!underStorageRoot) return null;
  const { status, stdout } = spawnSync("du", ["-sb", path], { encoding: "utf8", timeout: 8000 });
  if (status !== 0) return null;
  const [firstField] = stdout.trim().split(/\s+/u);
  const bytes = Number(firstField);
  return Number.isFinite(bytes) ? bytes : null;
 }
 // ---- Inputs: everything is passed through environment variables by the shell wrapper. ----
 const namespace = process.env.NAMESPACE;
 const confirm = process.env.CONFIRM === "true";
 const enabled = process.env.ENABLED === "true";
 // Clamp to 1..1000 and drop any fractional part; a non-numeric LIMIT yields NaN and is rejected below.
 const limit = Math.max(1, Math.min(Math.floor(Number(process.env.LIMIT || "100")), 1000));
 // "W10=" is base64 for "[]", so a missing variable decodes to an empty prefix list.
 const prefixes = JSON.parse(Buffer.from(process.env.PREFIXES_JSON_B64 || "W10=", "base64").toString("utf8"));
 if (!enabled) {
  console.log(JSON.stringify({ ok: false, error: "session-pvc-retention-disabled", selectedPvcCount: 0, mutation: false }));
  process.exit(0);
 }
 if (!namespace || !Array.isArray(prefixes) || prefixes.length === 0) throw new Error("session PVC cleanup requires namespace and YAML prefixes");
 // Previously a malformed LIMIT silently selected nothing while still reporting ok: true.
 if (!Number.isFinite(limit)) throw new Error("session PVC cleanup requires a numeric LIMIT");

 // ---- Cluster facts: all PVs, plus PVCs and pods of the target namespace. ----
 const pvData = runJson(["get", "pv", "-o", "json"]);
 const pvcData = runJson(["-n", namespace, "get", "pvc", "-o", "json"]);
 const podData = runJson(["-n", namespace, "get", "pod", "-o", "json"]);
 const pvs = new Map((pvData.items || []).map((pv) => [pv.metadata?.name, pv]));

 // Map claim name -> names of pods that still reference it; Succeeded/Failed pods are ignored.
 const activeClaims = new Map();
 for (const pod of podData.items || []) {
  const phase = pod.status?.phase;
  if (phase === "Succeeded" || phase === "Failed") continue;
  for (const volume of pod.spec?.volumes || []) {
    const claim = volume.persistentVolumeClaim?.claimName;
    if (!claim) continue;
    const list = activeClaims.get(claim) || [];
    list.push(pod.metadata?.name);
    activeClaims.set(claim, list);
  }
 }

 // ---- Classification: only prefix-matched, unmounted, local-path PVCs with reclaim policy Delete are candidates. ----
 const candidates = [];
 const protectedRows = [];
 for (const pvc of pvcData.items || []) {
  const name = pvc.metadata?.name || "";
  const matchedPrefix = prefixes.find((prefix) => name.startsWith(prefix));
  if (!matchedPrefix) continue;
  const activeMountPods = activeClaims.get(name) || [];
  const pv = pvs.get(pvc.spec?.volumeName);
  const storageClass = pvc.spec?.storageClassName || pv?.spec?.storageClassName || null;
  const reclaimPolicy = pv?.spec?.persistentVolumeReclaimPolicy || null;
  const hostPath = pv?.spec?.hostPath?.path || pv?.spec?.local?.path || null;
  const row = {
    namespace,
    pvc: name,
    volume: pvc.spec?.volumeName || null,
    matchedPrefix,
    phase: pvc.status?.phase || null,
    pvPhase: pv?.status?.phase || null,
    storageClass,
    reclaimPolicy,
    activeMountCount: activeMountPods.length,
    activeMountPods: activeMountPods.slice(0, 5),
    estimatedBytes: duBytes(hostPath),
  };
  if (activeMountPods.length > 0 || storageClass !== "local-path" || reclaimPolicy !== "Delete") {
    protectedRows.push({ ...row, reason: activeMountPods.length > 0 ? "active-mount" : "not-local-path-delete" });
  } else {
    candidates.push(row);
  }
 }

 // Largest estimated size first, so a small limit reclaims the most space.
 candidates.sort((a, b) => (b.estimatedBytes || 0) - (a.estimatedBytes || 0));
 const selected = candidates.slice(0, limit);
 const result = {
  ok: true,
  planKind: "agentrun-session-pvc-retention",
  namespace,
  dryRun: !confirm,
  mutation: confirm,
  criteria: { prefixes, storageClass: "local-path", reclaimPolicy: "Delete", requireNoActiveMount: true, limit },
  candidatePvcCount: candidates.length,
  selectedPvcCount: selected.length,
  protectedPvcCount: protectedRows.length,
  estimatedReclaimBytes: selected.reduce((sum, item) => sum + (item.estimatedBytes || 0), 0),
  selectedPreview: selected.slice(0, 12),
  protectedPreview: protectedRows.slice(0, 12),
  deletedPvcCount: 0,
  valuesPrinted: false,
 };

 /**
  * Submits `kubectl delete pvc --wait=false` in batches of 50 names.
  * Stops at the first failing batch and returns the fields to merge into the report:
  * the count of PVCs in batches that were submitted successfully and, on failure,
  * `ok: false` plus a bounded error tail. Catching the failure keeps the JSON report
  * intact, so batches submitted before the failure are still accounted for.
  */
 function submitPvcDeletes(pvcNames) {
  let submitted = 0;
  for (let index = 0; index < pvcNames.length; index += 50) {
    const batch = pvcNames.slice(index, index + 50);
    try {
      execFileSync("kubectl", ["-n", namespace, "delete", "pvc", "--wait=false", ...batch], { encoding: "utf8", maxBuffer: 1024 * 1024 });
    } catch (error) {
      const detail = String(error?.stderr || error?.message || error).trim().slice(-2000);
      return { ok: false, error: "kubectl-delete-failed", deletedPvcCount: submitted, deleteMode: "submit-only-wait-false", failedBatchStart: index, deleteErrorTail: detail };
    }
    submitted += batch.length;
  }
  return { deletedPvcCount: submitted, deleteMode: "submit-only-wait-false" };
 }

 // ---- Mutation: only when explicitly confirmed and something was selected. ----
 const deleteOutcome = confirm && selected.length > 0 ? submitPvcDeletes(selected.map((item) => item.pvc)) : {};
 // Report first, then exit non-zero so the caller sees both the partial report and the failure.
 if (deleteOutcome.ok === false) process.exitCode = 1;
 console.log(JSON.stringify({ ...result, ...deleteOutcome }));
@@ -33,7 +33,7 @@ import {
 } from "../agentrun-manifests";
 import { sha256Fingerprint } from "../platform-infra-ops-library";
-import type { CleanupReleasedPvOptions, CleanupRunnersOptions, CleanupRunsOptions, ConfirmOptions, GitMirrorOptions, LaneConfirmOptions, RefreshOptions, SecretSyncOptions, StatusOptions } from "./options";
+import type { CleanupReleasedPvOptions, CleanupRunnersOptions, CleanupRunsOptions, CleanupSessionPvcsOptions, ConfirmOptions, GitMirrorOptions, LaneConfirmOptions, RefreshOptions, SecretSyncOptions, StatusOptions } from "./options";
 import { agentRunControlPlaneStatusCommand } from "./public-exposure";
 import { applyYamlScript, manifestObjectRef, yamlLaneGitMirrorStatusScript } from "./secrets";
 import { compactAgentRunLaneStatusTarget, compactLaneSecretsStatus } from "./trigger";
@@ -193,6 +193,18 @@ export function parseCleanupReleasedPvOptions(args: string[]): CleanupReleasedPv
  };
 }
 /**
  * Parses the arguments of `agentrun control-plane cleanup-session-pvcs`.
  *
  * Recognised flags are `--confirm` and `--dry-run`; recognised value options are
  * `--limit`, `--timeout-seconds`, `--node` and `--lane`. Rejecting anything else is
  * delegated to `validateOptions`.
  *
  * @param args - Raw CLI arguments following the action name.
  * @returns The confirm options extended with node, lane, limit and timeoutSeconds;
  *   node and lane are null when the option is absent.
  */
 export function parseCleanupSessionPvcsOptions(args: string[]): CleanupSessionPvcsOptions {
  const booleanFlags = new Set(["--confirm", "--dry-run"]);
  const valueFlags = new Set(["--limit", "--timeout-seconds", "--node", "--lane"]);
  validateOptions(args, booleanFlags, valueFlags);
  const confirmOptions = parseConfirmOptions(args);
  const node = optionValue(args, "--node") ?? null;
  const lane = optionValue(args, "--lane") ?? null;
  const limit = positiveIntegerOption(args, "--limit", 100, 1000);
  const timeoutSeconds = positiveIntegerOption(args, "--timeout-seconds", 180, 900);
  return { ...confirmOptions, node, lane, limit, timeoutSeconds };
 }
 export function validateOptions(args: string[], booleanOptions: Set<string>, valueOptions: Set<string>): void {
  for (let index = 0; index < args.length; index += 1) {
    const arg = args[index];
@@ -34,7 +34,7 @@ import {
 import { sha256Fingerprint } from "../platform-infra-ops-library";
 import type { AgentRunResourceVerb, AgentRunRestCompatGroup } from "./utils";
-import { controlPlaneApply, controlPlanePlan, parseCleanupReleasedPvOptions, parseCleanupRunnersOptions, parseCleanupRunsOptions, parseConfirmOptions, parseGitMirrorOptions, parseLaneConfirmOptions, parseRefreshOptions, parseSecretSyncOptions, status } from "./control-plane";
+import { controlPlaneApply, controlPlanePlan, parseCleanupReleasedPvOptions, parseCleanupRunnersOptions, parseCleanupRunsOptions, parseCleanupSessionPvcsOptions, parseConfirmOptions, parseGitMirrorOptions, parseLaneConfirmOptions, parseRefreshOptions, parseSecretSyncOptions, status } from "./control-plane";
 import { gitMirrorStatus } from "./git-mirror";
 import { agentRunExplain, isRecord, parseGitMirrorStatusOptions, parseStatusOptions, parseTriggerOptions } from "./options";
 import { renderAgentRunControlPlaneActionSummary, renderAgentRunControlPlanePlanSummary, renderAgentRunControlPlaneStatusSummary } from "./public-exposure";
@@ -43,7 +43,7 @@ import { agentRunGetKindHelp, runAgentRunResourceCommand } from "./resource-acti
 import { runAgentRunRestCompatCommand, runGitMirrorJob, startAsyncAgentRunJob } from "./rest-bridge";
 import { exposeAgentRun, restartYamlLane, secretSync, triggerCurrent } from "./trigger";
 import { unsupported } from "./utils";
-import { cleanupReleasedPvs, cleanupRunners, cleanupRuns, refresh } from "./yaml-lane";
+import { cleanupReleasedPvs, cleanupRunners, cleanupRuns, cleanupSessionPvcs, refresh } from "./yaml-lane";
 export function agentRunHelp(): unknown {
  return {
@@ -143,6 +143,9 @@ export async function runAgentRunCommand(config: UniDeskConfig | null, args: str
      return options.full || options.raw ? result : renderAgentRunControlPlaneActionSummary(result, "AGENTRUN RUNNER CLEANUP");
    }
    if (action === "cleanup-runs") return await cleanupRuns(config, parseCleanupRunsOptions(actionArgs));
    if (action === "cleanup-session-pvcs") {
      return await cleanupSessionPvcs(config, parseCleanupSessionPvcsOptions(actionArgs));
    }
    if (action === "cleanup-released-pvs") return await cleanupReleasedPvs(config, parseCleanupReleasedPvOptions(actionArgs));
  }
  if (group === "git-mirror") {
@@ -271,7 +274,7 @@ export function agentRunHelpText(args: string[]): string {
    return [
      "Usage: bun scripts/cli.ts agentrun control-plane <action> [options]",
      "",
-      "Actions: plan, apply, status, secret-sync, expose, trigger-current, refresh, cleanup-runners, cleanup-runs, cleanup-released-pvs",
+      "Actions: plan, apply, status, secret-sync, expose, trigger-current, refresh, cleanup-runners, cleanup-runs, cleanup-session-pvcs, cleanup-released-pvs",
      "Examples:",
      "  bun scripts/cli.ts agentrun control-plane plan --node D601 --lane v02",
      "  bun scripts/cli.ts agentrun control-plane apply --node D601 --lane v02 --dry-run",
@@ -283,6 +286,7 @@ export function agentRunHelpText(args: string[]): string {
      "  bun scripts/cli.ts agentrun control-plane expose --dry-run",
      "  bun scripts/cli.ts agentrun control-plane trigger-current --dry-run",
      "  bun scripts/cli.ts agentrun control-plane cleanup-runners --node D601 --lane v02 --dry-run",
      "  bun scripts/cli.ts agentrun control-plane cleanup-session-pvcs --node JD01 --lane jd01-v02 --dry-run",
      "  bun scripts/cli.ts agentrun control-plane cleanup-runs --min-age-minutes 30 --limit 200 --dry-run",
    ].join("\n");
  }
@@ -265,6 +265,13 @@ export interface CleanupReleasedPvOptions extends ConfirmOptions {
  timeoutSeconds: number;
 }
 /** Parsed options for `agentrun control-plane cleanup-session-pvcs`. */
 export interface CleanupSessionPvcsOptions extends ConfirmOptions {
  /** Value of `--node`, or null when the option was not given. */
  node: string | null;
  /** Value of `--lane`, or null when the option was not given. */
  lane: string | null;
  /** Value of `--limit`; the generated script further caps it with the YAML `maxDeletePerRun`. */
  limit: number;
  /**
   * Value of `--timeout-seconds`.
   * NOTE(review): the visible `cleanupSessionPvcs` implementation never reads this field — confirm whether it should be forwarded.
   */
  timeoutSeconds: number;
 }
 export interface DisclosureOptions {
  full: boolean;
  raw: boolean;
@@ -35,7 +35,7 @@ import {
 } from "../agentrun-manifests";
 import { sha256Fingerprint } from "../platform-infra-ops-library";
-import type { CleanupReleasedPvOptions, CleanupRunnersOptions, CleanupRunsOptions, RefreshOptions } from "./options";
+import type { CleanupReleasedPvOptions, CleanupRunnersOptions, CleanupRunsOptions, CleanupSessionPvcsOptions, RefreshOptions } from "./options";
 import { cleanupReleasedPvsFinalizeNodeScript, cleanupReleasedPvsPlanNodeScript, cleanupRunnersFinalizeNodeScript, cleanupRunsFinalizeNodeScript, cleanupRunsPlanNodeScript, refreshYamlLaneScript } from "./git-mirror";
 import { cleanupRunnersFactsNodeScript, cleanupRunnersPlanNodeScript, collectLaneSecretSources, createYamlLaneJobScript, yamlLaneGitopsPublishJobManifest, yamlLaneGitopsPublishPayloadFromProbe, yamlLaneJobProbeScript } from "./secrets";
 import { capture, captureJsonPayload, compactCapture, progressEvent, shQuote, sleep, stringOrNull } from "./utils";
@@ -204,6 +204,55 @@ export async function cleanupReleasedPvs(config: UniDeskConfig, options: Cleanup
  };
 }
 /**
  * Plans or submits AgentRun session PVC retention for the lane selected by `options`.
  *
  * Runs the generated shell wrapper through `capture` on the lane's `nodeKubeRoute`, parses the
  * script's JSON report and decorates it with the command name, target, mode, YAML retention
  * settings and follow-up commands.
  *
  * @param config - UniDesk configuration forwarded to `capture`.
  * @param options - Parsed CLI options; cleanup is confirmed only when `confirm` is set and `dryRun` is not.
  * @returns The remote report merged with wrapper metadata. `ok` is true only when the remote
  *   command exited 0 and the report did not set `ok: false`; `probe` carries the captured output
  *   when the exit code is non-zero.
  */
 export async function cleanupSessionPvcs(config: UniDeskConfig, options: CleanupSessionPvcsOptions): Promise<Record<string, unknown>> {
  const { configPath, spec } = resolveAgentRunLaneTarget(options);
  // NOTE(review): options.timeoutSeconds is parsed by the CLI but not used here — confirm whether capture should receive it.
  const result = await capture(config, spec.nodeKubeRoute, ["sh", "--", cleanupSessionPvcsScript(options, spec)]);
  const payload = captureJsonPayload(result);
  const ok = result.exitCode === 0 && payload.ok !== false;
  const base = {
    ...payload,
    ok,
    command: "agentrun control-plane cleanup-session-pvcs",
    configPath,
    target: { node: spec.nodeId, lane: spec.lane, namespace: spec.runtime.namespace },
    mode: options.dryRun || !options.confirm ? "dry-run" : "confirmed-cleanup",
    namespace: spec.runtime.namespace,
    retention: spec.deployment.runner.retention.sessionPvcRetention,
    probe: result.exitCode === 0 ? undefined : compactCapture(result, { full: true, stdoutTailChars: 3000, stderrTailChars: 3000 }),
  };
  if (options.dryRun || !options.confirm) {
    return { ...base, dryRun: true, mutation: false, next: { confirm: `bun scripts/cli.ts agentrun control-plane cleanup-session-pvcs --node ${spec.nodeId} --lane ${spec.lane} --limit ${options.limit} --confirm` } };
  }
  return {
    ...base,
    dryRun: false,
    // The remote script reports `mutation: false` when retention is disabled in YAML; do not
    // overwrite that with `true`. Any other outcome (including a missing report) stays
    // conservatively flagged as a mutation.
    mutation: payload.mutation !== false,
    followUp: {
      dryRun: `bun scripts/cli.ts agentrun control-plane cleanup-session-pvcs --node ${spec.nodeId} --lane ${spec.lane} --limit ${options.limit} --dry-run`,
      diskPressure: `bun scripts/cli.ts gc remote ${spec.nodeId} status --limit 20`,
    },
  };
 }
 /**
  * Builds the POSIX shell wrapper that materialises `cleanup-session-pvcs.mjs` in a temporary
  * directory on the target and runs it with `node`.
  *
  * Namespace, confirm flag, limit, enabled flag and the base64-encoded prefix list are passed to
  * the script as environment variables. The limit is the smaller of the CLI `--limit` and the
  * YAML `maxDeletePerRun`; confirm is "true" only when `confirm` is set and `dryRun` is not.
  *
  * @param options - Parsed CLI options.
  * @param spec - Lane spec providing the runtime namespace and the session PVC retention settings.
  * @returns The shell script text, newline-joined.
  */
 export function cleanupSessionPvcsScript(options: CleanupSessionPvcsOptions, spec: AgentRunLaneSpec): string {
  const { enabled, maxDeletePerRun, prefixes } = spec.deployment.runner.retention.sessionPvcRetention;
  const nodeSource = readFileSync(rootPath("scripts/src/agentrun/cleanup-session-pvcs.mjs"), "utf8");
  const confirmed = options.confirm && !options.dryRun;
  const effectiveLimit = Math.min(options.limit, maxDeletePerRun);
  const prefixesB64 = Buffer.from(JSON.stringify(prefixes), "utf8").toString("base64");

  // Shell variable assignments consumed by the final `env ... node` invocation.
  const assignments = [
    "set -eu",
    `namespace=${shQuote(spec.runtime.namespace)}`,
    `confirm=${confirmed ? "true" : "false"}`,
    `limit=${String(effectiveLimit)}`,
    `enabled=${enabled ? "true" : "false"}`,
    `prefixes_json_b64=${shQuote(prefixesB64)}`,
  ];
  // The quoted heredoc delimiter keeps the shell from expanding anything inside the script body.
  const materialise = [
    "tmp_dir=$(mktemp -d)",
    "trap 'rm -rf \"$tmp_dir\"' EXIT",
    "cat > \"$tmp_dir/cleanup-session-pvcs.mjs\" <<'NODE'",
    nodeSource,
    "NODE",
  ];
  const invoke = "env NAMESPACE=\"$namespace\" CONFIRM=\"$confirm\" LIMIT=\"$limit\" ENABLED=\"$enabled\" PREFIXES_JSON_B64=\"$prefixes_json_b64\" node \"$tmp_dir/cleanup-session-pvcs.mjs\"";
  return [...assignments, ...materialise, invoke].join("\n");
 }
 export function cleanupRunnersScript(options: CleanupRunnersOptions, spec: AgentRunLaneSpec): string {
  const retention = spec.deployment.runner.retention;
  const matchLabelsB64 = Buffer.from(JSON.stringify(retention.selectors.matchLabels), "utf8").toString("base64");
@@ -0,0 +1,386 @@
 def k3s_crictl_base():
    """Return the `crictl` argv prefix bound to the configured runtime endpoint.

    Falls back to the k3s containerd socket when `runtimeEndpoint` is unset or empty.
    """
    configured = CONTAINERD_CONFIG.get("runtimeEndpoint")
    if not configured:
        configured = "unix:///run/k3s/containerd/containerd.sock"
    return ["crictl", "--runtime-endpoint", str(configured)]
 def shell_single_quote(value):
    """Wrap `str(value)` in single quotes for a POSIX shell, escaping embedded single quotes."""
    pieces = str(value).split("'")
    # Each embedded quote closes the literal, emits a double-quoted quote, and reopens the literal.
    return "'%s'" % "'\"'\"'".join(pieces)
 def k3s_crictl_json(args, timeout=30):
    """Run `crictl <args> -o json` and return `(parsed, command_result)`.

    `parsed` is None when the command exits non-zero or its stdout is not valid JSON;
    empty stdout is parsed as an empty object.
    """
    result = command(k3s_crictl_base() + args + ["-o", "json"], timeout)
    parsed = None
    if result["exitCode"] == 0:
        try:
            parsed = json.loads(result["stdout"] or "{}")
        except Exception:
            parsed = None
    return parsed, result
 def ci_activity_snapshot_for_prune():
    """List CI objects that look unfinished in the configured CI namespaces.

    For every namespace in `ciNamespaces` (default hwlab-ci, agentrun-ci) this runs
    `kubectl get pipelinerun,taskrun,job` and keeps rows whose second column is none of
    True/False/Complete/Failed, at most 20 rows per namespace.

    NOTE(review): stderr is discarded and the pipeline's exit status is not inspected, so
    the returned `ok` is always True and a failing kubectl yields zero active rows.
    """
    probe_template = "KUBECONFIG=/etc/rancher/k3s/k3s.yaml kubectl get pipelinerun,taskrun,job -n %s --no-headers 2>/dev/null | awk '$2 != \"True\" && $2 != \"False\" && $2 != \"Complete\" && $2 != \"Failed\" {print}' | head -20"
    active = []
    commands = []
    for namespace in config_list(CONTAINERD_CONFIG, "ciNamespaces", ["hwlab-ci", "agentrun-ci"]):
        result = command(["sh", "-lc", probe_template % shell_single_quote(namespace)], 15)
        commands.append({"namespace": namespace, "command": bounded(result)})
        stripped_lines = (raw.strip() for raw in (result.get("stdout") or "").splitlines())
        active.extend({"namespace": namespace, "line": text} for text in stripped_lines if text)
    return {"ok": True, "activeCount": len(active), "activePreview": active[:20], "commands": commands}
 def compact_ci_activity(activity):
    """Return the CI activity snapshot reduced to ok, activeCount and activePreview."""
    summary = {key: activity.get(key) for key in ("ok", "activeCount")}
    summary["activePreview"] = activity.get("activePreview") or []
    return summary
 def compact_image_ref(ref):
    """Return `ref` as a string, shortened to 117 characters plus "..." when longer than 120."""
    text = str(ref or "")
    if len(text) > 120:
        return text[:117] + "..."
    return text
 def k3s_cri_image_rows():
    """Build one row per CRI image with its refs, size and an in-use flag.

    Returns `(rows, meta)`. `rows` is None when either `crictl images` or `crictl ps -a`
    could not be read as JSON; `meta` then names the failing command. An image counts as
    in use when it is pinned or when its id or one of its tags/digests matches a string
    image reference found on any listed container.
    """
    images, image_cmd = k3s_crictl_json(["images"], 45)
    containers, container_cmd = k3s_crictl_json(["ps", "-a"], 30)
    if images is None:
        return None, {"ok": False, "reason": "crictl-images-failed", "command": bounded(image_cmd)}
    if containers is None:
        return None, {"ok": False, "reason": "crictl-ps-failed", "command": bounded(container_cmd)}

    def string_fields(mapping, keys):
        # Only non-empty string values are usable as image references.
        found = []
        for key in keys:
            candidate = mapping.get(key)
            if isinstance(candidate, str) and candidate:
                found.append(candidate)
        return found

    used = set()
    for container in containers.get("containers") or []:
        used.update(string_fields(container, ["imageRef", "image", "imageId"]))
        nested = container.get("image") or {}
        if isinstance(nested, dict):
            used.update(string_fields(nested, ["image", "annotations", "userSpecifiedImage"]))

    rows = []
    for image in images.get("images") or []:
        refs = []
        for key in ["repoTags", "repoDigests"]:
            listed = image.get(key)
            if isinstance(listed, list):
                refs.extend(str(item) for item in listed if item)
        image_id = str(image.get("id") or "")
        pinned = bool(image.get("pinned"))
        size = safe_int(image.get("size_") or image.get("size") or 0)
        referenced = image_id in used or any(ref in used for ref in refs)
        rows.append({"id": image_id, "refs": refs, "sizeBytes": size, "inUse": pinned or referenced, "pinned": pinned})
    return rows, {"ok": True, "imageCommand": bounded(image_cmd), "containerCommand": bounded(container_cmd)}
 def k3s_image_cache_candidate():
    """Describe the k3s CRI image prune as a GC candidate.

    Returns a `risk: "blocked"` row when the feature is disabled in YAML, when CI
    workloads look active, or when the image list is unavailable. Returns None when the
    unused images add up to zero bytes. Otherwise returns a `risk: "medium"` candidate
    whose action is `crictl rmi --prune`.
    """

    def blocked(candidate_id, kind, description, extra_key, extra_value):
        # Shared shape of every blocked row; the last key carries the reason-specific detail.
        return {
            "id": candidate_id,
            "kind": kind,
            "risk": "blocked",
            "description": description,
            "estimatedReclaimBytes": 0,
            extra_key: extra_value,
        }

    if not config_bool(CONTAINERD_CONFIG, "enabled", False):
        return blocked(
            "k3s-cri-image-prune:disabled",
            "k3s-cri-image-prune-disabled",
            "K3s CRI image prune is disabled in YAML",
            "configSource",
            "config/unidesk-cli.yaml#gc.remote.targets.%s.containerdImageCache.enabled" % PROVIDER_ID,
        )

    activity = ci_activity_snapshot_for_prune()
    if int(activity.get("activeCount") or 0) > 0:
        return blocked(
            "k3s-cri-image-prune:ci-active",
            "k3s-cri-image-prune-blocked",
            "K3s CRI image prune is blocked while CI workloads are active",
            "ciActivity",
            compact_ci_activity(activity),
        )

    rows, meta = k3s_cri_image_rows()
    if rows is None:
        return blocked(
            "k3s-cri-image-prune:unavailable",
            "k3s-cri-image-prune-unavailable",
            "K3s CRI image list is unavailable",
            "diagnostic",
            meta,
        )

    unused = [row for row in rows if not row.get("inUse")]
    estimated = sum(safe_int(row.get("sizeBytes")) for row in unused)
    if estimated <= 0:
        return None

    # Preview is limited to three images with at most two shortened refs each.
    preview = []
    for row in unused[:3]:
        short_refs = [compact_image_ref(ref) for ref in (row.get("refs") or [])[:2]]
        preview.append({"id": row.get("id"), "refs": short_refs, "sizeBytes": row.get("sizeBytes")})

    return {
        "id": "k3s-cri-image-prune:unused",
        "kind": "k3s-cri-image-prune",
        "risk": "medium",
        "description": "Prune unused k3s CRI images through crictl rmi --prune; no containerd paths are deleted directly",
        "sizeBytes": estimated,
        "estimatedReclaimBytes": estimated,
        "imageCount": len(rows),
        "unusedImageCount": len(unused),
        "unusedPreview": preview,
        "ciActivity": compact_ci_activity(activity),
        "action": {"command": k3s_crictl_base() + ["rmi", "--prune"], "mode": "cri-unused-images-only"},
    }
 def execute_k3s_image_cache_prune():
    """Run `crictl rmi --prune` and report how much disk space it freed.

    Re-checks CI activity immediately before pruning and refuses to run while any CI
    workload looks active.

    Returns a dict with `reclaimedBytes`, `commandOutput`, `ciActivity` and
    `reclaimMeasured`. `reclaimMeasured` is False when either the before or after size
    probe produced no usable value; `reclaimedBytes` is then 0 rather than a guess.

    Raises RuntimeError when CI workloads are active or when crictl exits non-zero.
    """
    activity = ci_activity_snapshot_for_prune()
    if int(activity.get("activeCount") or 0) > 0:
        raise RuntimeError("refusing k3s image prune while CI workloads are active")
    containerd_root = "/var/lib/rancher/k3s/agent/containerd"
    before = du_size(containerd_root, 45)
    result = command(k3s_crictl_base() + ["rmi", "--prune"], 300)
    if result["exitCode"] != 0:
        raise RuntimeError((result["stderr"] or result["stdout"] or "crictl rmi --prune failed").strip())
    after = du_size(containerd_root, 45)
    # du_size presumably returns a falsy value on failure or timeout (callers use `or` fallbacks).
    # Coercing a failed "after" probe to 0 would report the whole pre-prune size as reclaimed,
    # so the difference is only trusted when both probes produced a value.
    measured = bool(before) and bool(after)
    reclaimed = max(0, before - after) if measured else 0
    return {
        "reclaimedBytes": reclaimed,
        "commandOutput": bounded(result),
        "ciActivity": compact_ci_activity(activity),
        "reclaimMeasured": measured,
    }
 def host_ctr_base(namespace=None):
    """Return the `ctr` argv prefix for the host containerd.

    Adds `--address` when an address is configured and `-n <namespace>` when a
    namespace is given.
    """
    argv = ["ctr"]
    address = config_str(HOST_CONTAINERD_CONFIG, "address", "")
    if address:
        argv += ["--address", address]
    if namespace:
        argv += ["-n", namespace]
    return argv
 def host_ctr(args, timeout=30, namespace=None):
    """Run `ctr` against the host containerd with `args` and return the command result."""
    argv = host_ctr_base(namespace) + args
    return command(argv, timeout)
 def host_containerd_namespaces():
    """Return `(namespaces, meta)` for the host containerd.

    A non-empty YAML `namespaces` list wins. Otherwise the namespaces are read from
    `ctr namespaces list -q`; when that command fails the list is empty and `meta`
    carries `error: "ctr-namespaces-failed"`.
    """
    from_yaml = config_list(HOST_CONTAINERD_CONFIG, "namespaces", [])
    if from_yaml:
        return from_yaml, {"source": "yaml", "command": None}
    listing = host_ctr(["namespaces", "list", "-q"], 20)
    if listing["exitCode"] != 0:
        return [], {"source": "ctr", "command": bounded(listing), "error": "ctr-namespaces-failed"}
    names = []
    for raw in (listing.get("stdout") or "").splitlines():
        name = raw.strip()
        if name:
            names.append(name)
    return names, {"source": "ctr", "command": bounded(listing)}
 def host_containerd_activity():
    """Collect every piece of host containerd metadata that is still present.

    For each namespace this lists tasks, containers, images, leases, snapshots and
    content through `ctr`. Every listed object, and every listing that fails, adds one
    entry to the active list, so `activeCount == 0` means all six listings succeeded
    and were empty.

    Returns a dict with `ok`, `root`, `namespaces`, `namespaceMeta`, `activeCount`,
    `activePreview` (first 20 entries) and the bounded `commands` output. When the
    feature is disabled or the configured root is not a directory, returns
    `ok: False` with a `reason` and `activeCount: 0`.
    """
    if not config_bool(HOST_CONTAINERD_CONFIG, "enabled", False):
        return {"ok": False, "reason": "host-containerd-cache-disabled", "activeCount": 0}
    root = config_str(HOST_CONTAINERD_CONFIG, "root", "")
    if not root or not os.path.isdir(root):
        return {"ok": False, "reason": "host-containerd-root-unavailable", "root": root, "activeCount": 0}
    namespaces, namespace_meta = host_containerd_namespaces()
    active = []
    commands = []
    for namespace in namespaces:
        task_result = host_ctr(["tasks", "list", "-q"], 20, namespace)
        container_result = host_ctr(["containers", "list", "-q"], 20, namespace)
        image_result = host_ctr(["images", "list", "-q"], 20, namespace)
        lease_result = host_ctr(["leases", "list", "-q"], 20, namespace)
        snapshot_result = host_ctr(["snapshots", "list"], 20, namespace)
        content_result = host_ctr(["content", "list"], 20, namespace)
        # Snapshot and content listings are tables; their header rows are removed here.
        snapshot_lines = table_data_lines(snapshot_result.get("stdout") or "", "KEY")
        content_lines = table_data_lines(content_result.get("stdout") or "", "DIGEST")
        commands.append({
            "namespace": namespace,
            "tasks": bounded(task_result),
            "containers": bounded(container_result),
            "images": bounded(image_result),
            "leases": bounded(lease_result),
            "snapshots": bounded(snapshot_result),
            "content": bounded(content_result),
        })
        # Images were listed but previously ignored here; they now count like the other
        # quiet listings, so a remaining image record or a failed image listing also
        # keeps the metadata from being reported as empty.
        for kind, result in [("task", task_result), ("container", container_result), ("lease", lease_result), ("image", image_result)]:
            if result["exitCode"] != 0:
                # A failed listing is treated as unknown activity, never as "nothing there".
                active.append({"namespace": namespace, "kind": kind, "state": "unknown", "reason": "ctr-list-failed"})
                continue
            for line in (result.get("stdout") or "").splitlines():
                if line.strip():
                    active.append({"namespace": namespace, "kind": kind, "name": line.strip()})
        if snapshot_result["exitCode"] != 0:
            active.append({"namespace": namespace, "kind": "snapshot", "state": "unknown", "reason": "ctr-list-failed"})
        for line in snapshot_lines:
            active.append({"namespace": namespace, "kind": "snapshot", "name": line.split()[0] if line.split() else line})
        if content_result["exitCode"] != 0:
            active.append({"namespace": namespace, "kind": "content", "state": "unknown", "reason": "ctr-list-failed"})
        for line in content_lines:
            active.append({"namespace": namespace, "kind": "content", "name": line.split()[0] if line.split() else line})
    return {
        "ok": True,
        "root": root,
        "namespaces": namespaces,
        "namespaceMeta": namespace_meta,
        "activeCount": len(active),
        "activePreview": active[:20],
        "commands": commands,
    }
 def compact_host_containerd_activity(activity):
    """Return the host containerd activity snapshot without the bulky command output."""
    summary = {}
    for key in ("ok", "reason", "root", "namespaces", "activeCount"):
        summary[key] = activity.get(key)
    summary["activePreview"] = activity.get("activePreview") or []
    return summary
 def table_data_lines(stdout, header_prefix):
    """Return the stripped, non-blank lines of stdout that do not start with header_prefix.

    Note that every line beginning with header_prefix is dropped, not only the
    first (header) line.
    """
    kept = []
    for raw in str(stdout or "").splitlines():
        text = raw.strip()
        if text and not text.startswith(header_prefix):
            kept.append(text)
    return kept
 def host_containerd_orphan_config():
    """Return the orphanCleanup mapping of HOST_CONTAINERD_CONFIG, or {} when it is absent or not a dict."""
    if not isinstance(HOST_CONTAINERD_CONFIG, dict):
        return {}
    section = HOST_CONTAINERD_CONFIG.get("orphanCleanup")
    if isinstance(section, dict):
        return section
    return {}
 def direct_child_paths(root, predicate):
    """List the direct children of root accepted by predicate, with a size estimate each.

    Returns [] when root is empty, is not a directory, or is itself a symlink.
    Children are visited in sorted name order. Symlinked children are skipped
    before their path is resolved: once a path has gone through realpath() an
    islink() test inside the predicate can no longer detect the link, and a
    link to a sibling would otherwise list that sibling a second time under
    the link's name.

    Each returned row is {"name", "path", "estimatedReclaimBytes"} where path is
    the resolved child path and the size comes from du_size(path, 10), falling
    back to path_size(path) when du_size returns a falsy value.
    """
    if not root or not os.path.isdir(root) or os.path.islink(root):
        return []
    # The resolved root does not change while iterating; compute it once.
    resolved_root = os.path.realpath(os.path.abspath(root))
    rows = []
    for name in sorted(os.listdir(root)):
        entry = os.path.join(root, name)
        if os.path.islink(entry):
            continue
        path = os.path.realpath(os.path.abspath(entry))
        # Defensive: only accept entries that still resolve to a direct child.
        if os.path.dirname(path) != resolved_root:
            continue
        if not predicate(name, path):
            continue
        rows.append({"name": name, "path": path, "estimatedReclaimBytes": du_size(path, 10) or path_size(path)})
    return rows
 def host_containerd_orphan_rows(activity):
    """Collect host containerd orphan snapshot dirs and content blobs that may be deleted.

    Returns (rows, meta). rows is empty and meta["ok"] is False (with a
    "reason") when: orphan cleanup is not enabled in the config; the activity
    report is not ok or has a positive activeCount; any of the three configured
    roots is empty or the overlay/content roots are not strictly below the
    containerd root; or path_has_open_fd() reports an open handle under an
    existing overlay/content root.

    On success rows holds the direct children of the overlay root whose names
    are all digits (directories) and of the content root whose names are 64
    lowercase hex characters (files), each tagged with "kind" and sorted by
    estimatedReclaimBytes descending; meta carries the resolved roots and
    per-kind candidate counts.
    """
    cfg = host_containerd_orphan_config()
    if not config_bool(cfg, "enabled", False):
        return [], {"ok": False, "reason": "host-containerd-orphan-cleanup-disabled"}
    if not activity.get("ok") or int(activity.get("activeCount") or 0) > 0:
        return [], {"ok": False, "reason": "host-containerd-metadata-not-empty", "activity": compact_host_containerd_activity(activity)}
    configured_overlay = config_str(cfg, "overlaySnapshotsRoot", "")
    configured_content = config_str(cfg, "contentBlobRoot", "")
    configured_root = config_str(HOST_CONTAINERD_CONFIG, "root", "")
    # os.path.abspath("") resolves to the current working directory, so an
    # unset value must be rejected before resolution; otherwise a missing root
    # would silently be validated against wherever the process happens to run.
    if not configured_root or not configured_overlay or not configured_content:
        return [], {"ok": False, "reason": "host-containerd-orphan-root-outside-containerd-root", "root": configured_root or ""}
    overlay_root = os.path.realpath(os.path.abspath(configured_overlay))
    content_root = os.path.realpath(os.path.abspath(configured_content))
    root = os.path.realpath(os.path.abspath(configured_root))
    root_prefix = root.rstrip("/") + "/"
    if not overlay_root.startswith(root_prefix) or not content_root.startswith(root_prefix):
        return [], {"ok": False, "reason": "host-containerd-orphan-root-outside-containerd-root", "root": root}
    open_roots = []
    for candidate_root in [overlay_root, content_root]:
        if os.path.exists(candidate_root) and path_has_open_fd(candidate_root):
            open_roots.append(candidate_root)
    if open_roots:
        return [], {"ok": False, "reason": "host-containerd-orphan-root-open-fd", "openRoots": open_roots}
    overlay_rows = direct_child_paths(overlay_root, lambda name, path: os.path.isdir(path) and not os.path.islink(path) and re.match(r"^[0-9]+$", name) is not None)
    content_rows = direct_child_paths(content_root, lambda name, path: os.path.isfile(path) and not os.path.islink(path) and re.match(r"^[0-9a-f]{64}$", name) is not None)
    safe_rows = []
    for kind, rows in [("overlay-snapshot-dir", overlay_rows), ("content-blob-file", content_rows)]:
        for row in rows:
            safe_rows.append({**row, "kind": kind})
    # Largest candidates first so a row limit reclaims the most space.
    safe_rows.sort(key=lambda item: safe_int(item.get("estimatedReclaimBytes")), reverse=True)
    return safe_rows, {
        "ok": True,
        "root": root,
        "overlaySnapshotsRoot": overlay_root,
        "contentBlobRoot": content_root,
        "overlayCandidateCount": len(overlay_rows),
        "contentCandidateCount": len(content_rows),
        "protectedCount": 0,
        "protectedPreview": [],
    }
 def host_containerd_orphan_candidate(activity):
    """Build the orphan-state delete candidate, or None when there is nothing to reclaim.

    None is returned when host_containerd_orphan_rows() does not report ok, or
    when the rows selected under OPTIONS["limit"] (default 50) sum to a
    non-positive estimated size.
    """
    rows, meta = host_containerd_orphan_rows(activity)
    if not meta.get("ok"):
        return None
    limit = int(OPTIONS.get("limit") or 50)
    selected = rows[:limit]
    estimated = 0
    for row in selected:
        estimated += safe_int(row.get("estimatedReclaimBytes"))
    if estimated <= 0:
        return None
    preview = []
    for row in selected[:8]:
        preview.append({
            "kind": row.get("kind"),
            "name": row.get("name"),
            "estimatedReclaimBytes": row.get("estimatedReclaimBytes"),
        })
    candidate = {
        "id": "host-containerd-orphan-state:delete",
        "kind": "host-containerd-orphan-state-delete",
        "risk": "medium",
        "description": "Delete YAML-allowlisted host containerd orphan snapshot/content files only when ctr metadata has no tasks, containers, leases, images, snapshots or content",
        "path": meta.get("root"),
        "sizeBytes": estimated,
        "estimatedReclaimBytes": estimated,
        "orphanCount": len(rows),
        "selectedOrphanCount": len(selected),
    }
    for counter in ("overlayCandidateCount", "contentCandidateCount", "protectedCount"):
        candidate[counter] = meta.get(counter)
    candidate["selectedPreview"] = preview
    candidate["protectedPreview"] = meta.get("protectedPreview")
    candidate["action"] = {"op": "remove-yaml-allowlisted-host-containerd-orphans", "limit": limit}
    return candidate
 def host_containerd_cache_candidate():
    """Produce the GC plan candidate for the host containerd cache.

    The outcome is decided in order: a "blocked" diagnostic when the activity
    probe is not ok; a "blocked" entry when activeCount is positive; the orphan
    delete candidate when one exists; otherwise an image-prune candidate sized
    by du_size() of the containerd root, or None when that size is not positive.
    """
    activity = host_containerd_activity()
    if not activity.get("ok"):
        unavailable = {
            "id": "host-containerd-cache:unavailable",
            "kind": "host-containerd-cache-unavailable",
            "risk": "blocked",
            "description": "Host containerd cache cleanup is disabled or unavailable by YAML",
            "estimatedReclaimBytes": 0,
        }
        unavailable["diagnostic"] = compact_host_containerd_activity(activity)
        return unavailable
    active_count = int(activity.get("activeCount") or 0)
    if active_count > 0:
        blocked = {
            "id": "host-containerd-cache:active",
            "kind": "host-containerd-cache-blocked",
            "risk": "blocked",
            "description": "Host containerd cache prune is blocked while host containerd tasks or containers exist",
            "estimatedReclaimBytes": 0,
        }
        blocked["activity"] = compact_host_containerd_activity(activity)
        return blocked
    orphan_candidate = host_containerd_orphan_candidate(activity)
    if orphan_candidate:
        return orphan_candidate
    containerd_root = activity.get("root") or ""
    total_bytes = du_size(containerd_root, 45) or 0
    if total_bytes > 0:
        return {
            "id": "host-containerd-cache:prune-unused",
            "kind": "host-containerd-cache-prune",
            "risk": "medium",
            "description": "Prune host containerd images in YAML-selected namespaces only when no host containerd tasks or containers exist",
            "path": containerd_root,
            "sizeBytes": total_bytes,
            "estimatedReclaimBytes": total_bytes,
            "activity": compact_host_containerd_activity(activity),
            "action": {"command": "ctr images prune --all per namespace", "mode": "host-containerd-unused-images-only"},
        }
    return None
 def execute_host_containerd_cache_prune():
    """Run `ctr images prune --all` in every namespace of the activity report.

    Raises RuntimeError when the activity probe is not ok, when activeCount is
    positive, or as soon as a prune command exits non-zero. Returns the bytes
    reclaimed (du_size of the root before minus after, never negative), the
    compact activity summary and up to eight per-namespace command results.
    """
    activity = host_containerd_activity()
    if not activity.get("ok"):
        raise RuntimeError("host containerd cache cleanup unavailable: %s" % activity.get("reason"))
    if int(activity.get("activeCount") or 0) > 0:
        raise RuntimeError("refusing host containerd prune while tasks or containers exist")
    containerd_root = activity.get("root") or ""
    size_before = du_size(containerd_root, 45) or 0
    command_results = []
    namespaces = activity.get("namespaces") or []
    for namespace in namespaces:
        outcome = host_ctr(["images", "prune", "--all"], 300, namespace)
        command_results.append({"namespace": namespace, "imagesPrune": bounded(outcome)})
        if outcome["exitCode"] == 0:
            continue
        detail = (outcome.get("stderr") or outcome.get("stdout") or "").strip()
        raise RuntimeError("host containerd image prune failed in namespace %s: %s" % (namespace, detail))
    size_after = du_size(containerd_root, 45) or 0
    return {
        "reclaimedBytes": max(0, size_before - size_after),
        "activity": compact_host_containerd_activity(activity),
        "commandResults": command_results[:8],
    }
 def execute_host_containerd_orphan_cleanup():
    """Delete the selected host containerd orphan snapshot dirs and content blobs.

    Raises RuntimeError when host_containerd_orphan_rows() does not report ok,
    when path_has_open_fd() reports an open handle under an existing
    overlay/content root, or when a row has an unsupported kind. os.unlink()
    errors for content blobs propagate unchanged.

    At most OPTIONS["limit"] rows (default 50) are processed. A row counts as
    deleted, and its size as reclaimed, only when its path no longer exists
    afterwards: shutil.rmtree(..., ignore_errors=True) hides failures, so a
    directory that survives is reported under failedPreview with only the
    bytes that were actually freed.

    Returns reclaimedBytes, deletedOrphanCount, deletedPreview (up to 12),
    failedOrphanCount, failedPreview (up to 12) and the containerd root.
    """
    activity = host_containerd_activity()
    rows, meta = host_containerd_orphan_rows(activity)
    if not meta.get("ok"):
        raise RuntimeError("host containerd orphan cleanup unavailable: %s" % meta.get("reason"))
    # Re-check for open handles immediately before deleting anything.
    for root_path in [meta.get("overlaySnapshotsRoot"), meta.get("contentBlobRoot")]:
        if root_path and os.path.exists(root_path) and path_has_open_fd(root_path):
            raise RuntimeError("refusing host containerd orphan cleanup with open fd/cwd under root: %s" % root_path)
    limit = int(OPTIONS.get("limit") or 50)
    selected = rows[:limit]
    reclaimed = 0
    deleted = []
    failed = []
    for row in selected:
        path = row.get("path")
        kind = row.get("kind")
        before = du_size(path, 10) or path_size(path)
        if kind == "overlay-snapshot-dir":
            shutil.rmtree(path, ignore_errors=True)
        elif kind == "content-blob-file":
            os.unlink(path)
        else:
            raise RuntimeError("unsupported host containerd orphan kind: %s" % kind)
        if os.path.lexists(path):
            # Removal was incomplete; credit only what actually went away.
            remaining = du_size(path, 10) or path_size(path)
            freed = max(0, int(before or 0) - int(remaining or 0))
            reclaimed += freed
            failed.append({"kind": kind, "name": row.get("name"), "reclaimedBytes": freed, "reason": "path-still-exists"})
            continue
        reclaimed += before
        deleted.append({"kind": kind, "name": row.get("name"), "reclaimedBytes": before})
    return {
        "reclaimedBytes": reclaimed,
        "deletedOrphanCount": len(deleted),
        "deletedPreview": deleted[:12],
        "failedOrphanCount": len(failed),
        "failedPreview": failed[:12],
        "root": meta.get("root"),
    }
@@ -0,0 +1,292 @@
 def registry_growth_snapshot():
    """Summarise registry disk usage, the retention plan counters and the GC cadence guidance.

    The retention plan is computed through plan_registry_retention() only when
    OPTIONS["hwlabRegistry"] is truthy; otherwise a "skipped" placeholder with
    a rerun hint is returned in its place.
    """
    size_bytes = du_size(REGISTRY_ROOT, 60) or 0
    summary = {
        "path": REGISTRY_ROOT,
        "sizeBytes": size_bytes,
        "sizeHuman": fmt_bytes(size_bytes),
    }
    if not OPTIONS.get("hwlabRegistry", False):
        summary["retentionPlan"] = {
            "skipped": True,
            "reason": "rerun snapshot with --include-hwlab-registry to compute tag/revision retention counters",
        }
    else:
        retention = dict(plan_registry_retention().get("summary") or {})
        # Add a human-readable twin for each byte counter that is present.
        human_twins = (("registrySizeBytes", "registrySizeHuman"), ("estimatedReclaimBytes", "estimatedReclaimHuman"))
        for byte_key, human_key in human_twins:
            if byte_key in retention:
                retention[human_key] = fmt_bytes(retention.get(byte_key) or 0)
        summary["retentionPlan"] = retention
    default_retention = {
        "keepPerRepo": int(OPTIONS.get("registryKeepPerRepo") or 20),
        "minAgeHours": float(OPTIONS.get("registryMinAgeHours") or 48),
        "protects": ["current workload refs", "digest closure", "protected tags", "recent tags", "newest N tags per repo"],
    }
    summary["cadence"] = {
        "dryRun": "daily or before/after every v0.2 CI/CD burst",
        "maintenanceRun": "weekly, or when root >=80%, or when registry growth exceeds the agreed daily threshold",
        "planCommand": "bun scripts/cli.ts gc remote %s plan --target-use-percent 70 --include-hwlab-registry --limit 50" % PROVIDER_ID,
        "snapshotCommand": "bun scripts/cli.ts gc remote %s snapshot --include-hwlab-registry --history-limit 12" % PROVIDER_ID,
        "runCommand": "bun scripts/cli.ts gc remote %s run --confirm --include-hwlab-registry --target-use-percent 70 --limit 50" % PROVIDER_ID,
        "defaultRetention": default_retention,
    }
    return summary
 def growth_watermark_policy(root_disk):
    """Map the root disk usePercent onto a watermark state and recommended action.

    A missing usePercent (or a non-dict root_disk) yields "unknown"; otherwise
    the first band whose upper bound exceeds the value applies, with
    "emergency" for anything at or above 85.
    """
    use_percent = root_disk.get("usePercent") if isinstance(root_disk, dict) else None
    bands = (
        (75, "healthy", "observe-trend"),
        (80, "watch", "run-dry-run-plan"),
        (85, "maintenance", "schedule-owner-aware-retention"),
    )
    if use_percent is None:
        state, action = "unknown", "collect-snapshot"
    else:
        state, action = "emergency", "restore-runtime-then-file-evidence"
        for upper_bound, band_state, band_action in bands:
            if use_percent < upper_bound:
                state, action = band_state, band_action
                break
    watermarks = [
        {"range": "<75%", "action": "trend only"},
        {"range": "75%-80%", "action": "run dry-run plan and identify source"},
        {"range": "80%-85%", "action": "small owner-aware retention run"},
        {"range": ">=85%", "action": "runtime recovery first, then root-cause growth source"},
    ]
    return {
        "state": state,
        "recommendedAction": action,
        "watermarks": watermarks,
        "growthThresholdPolicy": "If bytes/day remains high for consecutive snapshots, act before 80%; exact threshold should be set from the first week of saved snapshots.",
    }
 def snapshot_metric_map(snapshot):
    """Flatten a growth snapshot into {metric key: {"value", "unit", "label"}}.

    Covers root used bytes, per-source sizes, CI storage per owner group (read
    from ciStorage, falling back to pvcAttribution when that is empty), memory
    pressure counters and registry retention counters. Values go through
    safe_int(); entries whose raw value is None are skipped, except CI storage
    owner groups which are always recorded.
    """
    metrics = {}

    def record(name, raw, unit, label):
        metrics[name] = {"value": safe_int(raw), "unit": unit, "label": label}

    root = snapshot.get("rootDisk") or {}
    if isinstance(root, dict) and root.get("usedBytes") is not None:
        record("root.usedBytes", root.get("usedBytes"), "bytes", "root used bytes")

    for source in snapshot.get("sources") or []:
        if isinstance(source, dict) and source.get("sizeBytes") is not None:
            record("source.%s.sizeBytes" % source.get("id"), source.get("sizeBytes"), "bytes", source.get("label") or source.get("id"))

    storage = (snapshot.get("ciStorage") or {}).get("byOwnerGroup") or {}
    if not storage:
        storage = (snapshot.get("pvcAttribution") or {}).get("byOwnerGroup") or {}
    for owner, usage in storage.items():
        record("ciStorage.%s.estimatedBytes" % owner, (usage or {}).get("estimatedBytes"), "bytes", "CI storage %s" % owner)

    memory_summary = (snapshot.get("memoryPressure") or {}).get("summary") or {}
    byte_fields = (
        ("matchedRssBytes", "matched observer/chrome RSS"),
        ("observeStateBytes", "web observe state bytes"),
    )
    for field, label in byte_fields:
        if memory_summary.get(field) is not None:
            record("memoryPressure.%s" % field, memory_summary.get(field), "bytes", label)
    for field in ("matchedProcessCount", "activeObserverSignals", "staleObserverSignals"):
        if memory_summary.get(field) is not None:
            record("memoryPressure.%s" % field, memory_summary.get(field), "count", "memory pressure %s" % field)

    retention = (snapshot.get("registry") or {}).get("retentionPlan") or {}
    for field in ("totalTags", "totalRevisions", "deleteTags", "deleteRevisions", "estimatedReclaimBytes"):
        if field in retention and retention.get(field) is not None:
            record("registry.%s" % field, retention.get(field), "bytes" if field.endswith("Bytes") else "count", "registry %s" % field)
    return metrics
 def delta_metric_rows(before, after):
    """Compare two snapshots metric by metric.

    Returns {"durationSeconds", "metrics"} where metrics holds one row per
    metric key found in either snapshot, sorted by delta descending. A metric
    missing on one side is treated as 0 there. Byte metrics additionally get
    human-readable fields and, when the observed interval is positive, a
    per-day rate.
    """
    before_metrics = snapshot_metric_map(before)
    after_metrics = snapshot_metric_map(after)
    started = iso_to_epoch(before.get("observedAt"))
    finished = iso_to_epoch(after.get("observedAt"))
    seconds = None
    if started is not None and finished is not None:
        seconds = finished - started
    rows = []
    for key in sorted(set(before_metrics) | set(after_metrics)):
        if key in before_metrics:
            old = before_metrics[key]
        else:
            old = {"value": 0, "unit": (after_metrics.get(key) or {}).get("unit"), "label": key}
        if key in after_metrics:
            new = after_metrics[key]
        else:
            new = {"value": 0, "unit": old.get("unit"), "label": old.get("label")}
        delta = safe_int(new.get("value")) - safe_int(old.get("value"))
        row = {
            "key": key,
            "label": new.get("label") or old.get("label") or key,
            "unit": new.get("unit") or old.get("unit"),
            "before": old.get("value"),
            "after": new.get("value"),
            "delta": delta,
        }
        if row["unit"] == "bytes":
            sign = "-" if delta < 0 else ""
            row["beforeHuman"] = fmt_bytes(row["before"] or 0)
            row["afterHuman"] = fmt_bytes(row["after"] or 0)
            row["deltaHuman"] = sign + fmt_bytes(abs(delta))
            if seconds and seconds > 0:
                per_day = int(delta * 86400 / seconds)
                row["perDayBytes"] = per_day
                row["perDayHuman"] = ("-" if per_day < 0 else "") + fmt_bytes(abs(per_day)) + "/day"
        rows.append(row)
    rows.sort(key=lambda item: safe_int(item.get("delta")), reverse=True)
    return {"durationSeconds": seconds, "metrics": rows}
 def growth_trend_payload(points):
    """Summarise growth between the last two snapshots and across the whole window.

    Non-dict points are ignored. With fewer than two usable points an
    "insufficient-history" payload is returned. Otherwise both the latest
    (second-to-last vs last) and the window (first vs last) deltas list the
    top ten growing and shrinking byte metrics plus the registry count rows,
    together with a warning when the interval is shorter than one hour.
    """
    points = [point for point in points if isinstance(point, dict)]
    if len(points) < 2:
        return {
            "pointCount": len(points),
            "state": "insufficient-history",
            "message": "Run snapshot at least twice to compute deltas.",
        }
    latest_delta = delta_metric_rows(points[-2], points[-1])
    window_delta = delta_metric_rows(points[0], points[-1])

    def rate_warning(delta):
        seconds = delta.get("durationSeconds")
        if seconds is None or seconds >= 3600:
            return None
        return {
            "code": "short-window-rate-noisy",
            "message": "Per-day rates from windows shorter than 1 hour are directional only; use daily snapshots for governance decisions.",
            "durationSeconds": seconds,
        }

    def is_bytes(row):
        return row.get("unit") == "bytes"

    def summarize(delta):
        # Rows arrive sorted by delta descending, so shrinkers are read from the tail.
        growing = [row for row in delta.get("metrics", []) if is_bytes(row) and safe_int(row.get("delta")) > 0]
        shrinking = [row for row in reversed(delta.get("metrics", [])) if is_bytes(row) and safe_int(row.get("delta")) < 0]
        counters = [row for row in delta.get("metrics", []) if str(row.get("key", "")).startswith("registry.") and row.get("unit") == "count"]
        return {
            "durationSeconds": delta.get("durationSeconds"),
            "rateWarning": rate_warning(delta),
            "topGrowingBytes": growing[:10],
            "topShrinkingBytes": shrinking[:10],
            "registryCounters": counters,
        }

    return {
        "pointCount": len(points),
        "oldestAt": points[0].get("observedAt"),
        "latestAt": points[-1].get("observedAt"),
        "latestDelta": summarize(latest_delta),
        "windowDelta": summarize(window_delta),
    }
 def compact_metric_rows(rows, limit=3):
    """Keep the first `limit` metric rows, reduced to their key, label, unit and delta fields."""
    kept_fields = ("key", "label", "unit", "delta", "deltaHuman", "perDayHuman")
    return [{field: row.get(field) for field in kept_fields} for row in (rows or [])[:limit]]
 def compact_trend_payload(payload):
    """Shrink a trend payload to one row per list for compact output.

    An "insufficient-history" payload is returned unchanged. Otherwise each of
    latestDelta and windowDelta keeps its duration and rate warning and only
    the first growing, shrinking and registry counter row.
    """
    if payload.get("state") == "insufficient-history":
        return payload

    def shrink(section):
        compact = {
            "durationSeconds": section.get("durationSeconds"),
            "rateWarning": section.get("rateWarning"),
        }
        for list_key in ("topGrowingBytes", "topShrinkingBytes", "registryCounters"):
            compact[list_key] = compact_metric_rows(section.get(list_key) or [], 1)
        return compact

    return {
        "pointCount": payload.get("pointCount"),
        "oldestAt": payload.get("oldestAt"),
        "latestAt": payload.get("latestAt"),
        "latestDelta": shrink(payload.get("latestDelta") or {}),
        "windowDelta": shrink(payload.get("windowDelta") or {}),
        "fullDisclosure": "rerun trend --full for all metric rows",
    }
 def compact_growth_point(item):
    """Reduce a stored growth snapshot to the headline fields used for history listings.

    CI storage figures are read from "ciStorage" and, when that is missing or
    empty, from "pvcAttribution" - the same fallback snapshot_metric_map()
    applies, and the key under which collect_growth_snapshot() publishes them.
    Every field that is absent from the snapshot comes back as None.
    """
    registry = item.get("registry") or {}
    retention = registry.get("retentionPlan") or {}
    # Accept both key names so snapshots keyed by pvcAttribution are not reported as empty.
    ci_storage = item.get("ciStorage") or item.get("pvcAttribution") or {}
    containerd = item.get("containerd") or {}
    memory = item.get("memoryPressure") or {}
    memory_summary = memory.get("summary") or {}
    observe = (memory.get("webObserve") or {})
    return {
        "observedAt": item.get("observedAt"),
        "rootDisk": item.get("rootDisk"),
        "sourceCount": len(item.get("sources") or []),
        "registry": {
            "sizeBytes": registry.get("sizeBytes"),
            "sizeHuman": registry.get("sizeHuman"),
            "totalTags": retention.get("totalTags"),
            "totalRevisions": retention.get("totalRevisions"),
            "deleteTags": retention.get("deleteTags"),
            "deleteRevisions": retention.get("deleteRevisions"),
            "estimatedReclaimBytes": retention.get("estimatedReclaimBytes"),
            "estimatedReclaimHuman": retention.get("estimatedReclaimHuman"),
        },
        "ciStorage": {
            "pvcCount": ci_storage.get("pvcCount"),
            "estimatedBytes": ci_storage.get("estimatedBytes"),
            "estimatedHuman": ci_storage.get("estimatedHuman"),
            "byOwnerGroup": ci_storage.get("byOwnerGroup"),
        },
        "containerd": {
            "state": containerd.get("state"),
            "cleanupSupported": containerd.get("cleanupSupported"),
        },
        "memoryPressure": {
            "matchedProcessCount": memory_summary.get("matchedProcessCount"),
            "matchedRssBytes": memory_summary.get("matchedRssBytes"),
            "matchedRssHuman": memory_summary.get("matchedRssHuman"),
            "activeObserverSignals": memory_summary.get("activeObserverSignals"),
            "staleObserverSignals": memory_summary.get("staleObserverSignals"),
            "observeStateBytes": memory_summary.get("observeStateBytes"),
            "observeStateHuman": memory_summary.get("observeStateHuman"),
            "webObserveRootCount": observe.get("rootCount"),
        },
    }
 def collect_growth_snapshot(observed_at, preflight):
    """Assemble the `gc remote snapshot` payload.

    Gathers the root disk, disk sources, CI storage, memory pressure, registry
    and containerd snapshots. With OPTIONS["full"] set, CI storage and memory
    pressure are published unmodified and the full follow-up command list is
    included; otherwise both are passed through their compact summarisers and
    only the trend/status/full commands are listed.

    observed_at and preflight are embedded as "observedAt" and
    "clusterPreflight". "diagnosticStateMutation" reflects
    OPTIONS["saveSnapshot"], defaulting to True.
    """
    full = bool(OPTIONS.get("full"))
    root_disk = df_snapshot()
    sources = disk_source_snapshot()
    ci_storage = ci_storage_snapshot()
    memory_pressure = collect_memory_pressure()
    if full:
        public_pvc = ci_storage
        public_memory = memory_pressure
    else:
        public_pvc = compact_ci_storage_summary(ci_storage)
        public_memory = compact_memory_summary(memory_pressure)
    registry = registry_growth_snapshot()
    containerd = containerd_breakdown_snapshot()
    history_limit = int(OPTIONS.get("historyLimit") or 12)
    trend_command = "bun scripts/cli.ts gc remote %s trend --history-limit %s" % (PROVIDER_ID, history_limit)
    # Build only the command set that is actually returned for this mode.
    if full:
        handoff = ci_storage.get("handoff") or {}
        commands = {
            "snapshot": "bun scripts/cli.ts gc remote %s snapshot --include-hwlab-registry --history-limit %s" % (PROVIDER_ID, history_limit),
            "trend": trend_command,
            "registryPlan": "bun scripts/cli.ts gc remote %s plan --target-use-percent 70 --include-hwlab-registry --limit 50" % PROVIDER_ID,
            "hwlabCiRetention": (handoff.get("hwlab") or {}).get("dryRun"),
            "agentrunRetention": (handoff.get("agentrun") or {}).get("dryRun"),
            "remotePolicy": "bun scripts/cli.ts gc remote %s policy plan" % PROVIDER_ID,
        }
    else:
        commands = {
            "trend": trend_command,
            "status": "bun scripts/cli.ts gc remote %s status --limit %s" % (PROVIDER_ID, int(OPTIONS.get("limit") or 50)),
            "full": "bun scripts/cli.ts gc remote %s snapshot --full --no-save" % PROVIDER_ID,
        }
    return {
        "ok": True,
        "action": "gc remote snapshot",
        "providerId": PROVIDER_ID,
        "dryRun": True,
        "mutation": False,
        "diagnosticStateMutation": bool(OPTIONS.get("saveSnapshot", True)),
        "observedAt": observed_at,
        "rootDisk": root_disk,
        "clusterPreflight": preflight,
        "sources": sources,
        "registry": registry,
        "pvcAttribution": public_pvc,
        "memoryPressure": public_memory,
        "containerd": containerd,
        "policy": growth_watermark_policy(root_disk or {}),
        "commands": commands,
    }
@@ -0,0 +1,383 @@
 def pv_host_path(pv):
    """Return the node path backing a PersistentVolume, or None.

    spec.hostPath.path is preferred over spec.local.path; a value is accepted
    only when it is a non-empty string.
    """
    spec = (pv or {}).get("spec") or {}
    for source in ("hostPath", "local"):
        candidate = (spec.get(source) or {}).get("path")
        if isinstance(candidate, str) and candidate:
            return candidate
    return None
 def pvc_owner_group(namespace, owner):
    """Classify a PVC into an owner group from its namespace and owner name.

    agentrun-ci is always "agentrun". In hwlab-ci an owner whose name starts
    with "agentrun-" is "agentrun" and anything else is "hwlab". Other
    namespaces starting with "hwlab-" are "hwlab-runtime"; the rest is "other".
    """
    owner_text = str(owner or "")
    if namespace == "agentrun-ci":
        return "agentrun"
    if namespace == "hwlab-ci":
        return "agentrun" if owner_text.startswith("agentrun-") else "hwlab"
    return "hwlab-runtime" if namespace.startswith("hwlab-") else "other"
 def parse_k8s_quantity(value):
    """Convert a Kubernetes quantity string such as "10Gi" or "500k" to an integer.

    Accepts an unsigned integer or decimal number followed by an optional
    suffix: decimal k, M, G, T, P, E (powers of 1000) or binary Ki, Mi, Gi, Ti,
    Pi, Ei (powers of 1024). Uppercase "K" is not a Kubernetes suffix but is
    still accepted as 1000 for backward compatibility. Surrounding whitespace
    is ignored.

    Returns None for None and for anything that does not match (signed values,
    exponent notation and the milli suffix "m" are not supported). Fractional
    results are truncated toward zero.
    """
    # Function-scope import: exact decimal arithmetic avoids float rounding on
    # fractional values and on integers beyond 2**53.
    from decimal import Decimal

    if value is None:
        return None
    raw = str(value).strip()
    match = re.match(r"^([0-9]+(?:\.[0-9]+)?)(Ki|Mi|Gi|Ti|Pi|Ei|k|K|M|G|T|P|E)?$", raw)
    if not match:
        return None
    multiplier = {
        None: 1,
        "k": 1000,
        "K": 1000,
        "M": 1000**2,
        "G": 1000**3,
        "T": 1000**4,
        "P": 1000**5,
        "E": 1000**6,
        "Ki": 1024,
        "Mi": 1024**2,
        "Gi": 1024**3,
        "Ti": 1024**4,
        "Pi": 1024**5,
        "Ei": 1024**6,
    }.get(match.group(2), 1)
    return int(Decimal(match.group(1)) * multiplier)
 def metadata_owner(meta):
    """Derive (owner kind, owner name, owner reference summary) from object metadata.

    When ownerReferences exist the first one supplies kind and name and up to
    five references are summarised. Otherwise the first non-empty value among a
    fixed list of run/instance keys - looked up in labels, then annotations -
    is returned with kind "Label" and an empty summary. With neither, the
    result is (None, None, []).
    """
    refs = meta.get("ownerReferences") or []
    if refs:
        primary = refs[0] or {}
        summary = [{"kind": ref.get("kind"), "name": ref.get("name")} for ref in refs[:5]]
        return primary.get("kind"), primary.get("name"), summary
    labels = meta.get("labels") or {}
    annotations = meta.get("annotations") or {}
    owner_keys = (
        "tekton.dev/pipelineRun",
        "tekton.dev/taskRun",
        "agentrun.unidesk/run-id",
        "hwlab.unidesk/run-id",
        "app.kubernetes.io/instance",
    )
    for key in owner_keys:
        value = labels.get(key)
        if not value:
            value = annotations.get(key)
        if value:
            return "Label", value, []
    return None, None, []
 def ci_storage_snapshot():
    """Build an analysis-only attribution of PVC storage in the configured namespaces.

    Reads PVs, PVCs and pods through kubectl_json(), keeps the PVCs whose
    namespace is listed under PVC_CONFIG["namespaces"], and for each one records
    its owner, bound PV, requested size, the pods observed mounting it and a
    du_size() estimate of the PV's host path (when the PV exposes one).

    A PVC is flagged as a review candidate only when its namespace is listed in
    PVC_CONFIG["candidateNamespaces"] and at least one review reason applies:
    no mounting pod observed, PVC phase other than Bound, or PV phase Released.

    Returns a dict with totals, per-namespace and per-owner-group aggregates,
    the largest PVCs and review candidates (each capped at `limit`), and
    "handoff" dry-run command strings for the hwlab and agentrun retention
    commands. Nothing in this function deletes or modifies cluster objects.
    """
    namespaces = set(config_list(PVC_CONFIG, "namespaces", ["hwlab-ci", "agentrun-ci"]))
    candidate_namespaces = set(config_list(PVC_CONFIG, "candidateNamespaces", []))
    hwlab_node = config_str(PVC_CONFIG, "hwlabNode", PROVIDER_ID)
    hwlab_lane = config_str(PVC_CONFIG, "hwlabLane", "v03")
    agentrun_node = config_str(PVC_CONFIG, "agentrunNode", PROVIDER_ID)
    agentrun_lane = config_str(PVC_CONFIG, "agentrunLane", "v02")
    limit = config_int(PVC_CONFIG, "limit", int(OPTIONS.get("limit") or 50), minimum=1, maximum=5000)
    # A falsy result from any query is treated as an empty listing.
    # NOTE(review): if kubectl_json() returns a falsy value on failure, a failed
    # pod query makes every PVC look unmounted - confirm its failure semantics.
    pv_data = kubectl_json(["get", "pv"], 30) or {}
    pvc_data = kubectl_json(["get", "pvc", "-A"], 30) or {}
    pod_data = kubectl_json(["get", "pod", "-A"], 30) or {}
    # Index PVs by name so each PVC can be joined to its bound volume.
    pvs = {}
    for pv in pv_data.get("items") or []:
        meta = pv.get("metadata") or {}
        name = meta.get("name")
        if name:
            pvs[name] = pv
    # Map (namespace, claim name) -> names of pods that reference the claim.
    mounts = {}
    for pod in pod_data.get("items") or []:
        meta = pod.get("metadata") or {}
        ns = str(meta.get("namespace") or "")
        pod_name = str(meta.get("name") or "")
        phase = str(((pod.get("status") or {}).get("phase")) or "")
        # Pods in a terminal phase are not counted as mounting their claims.
        if phase in set(["Succeeded", "Failed"]):
            continue
        spec = pod.get("spec") or {}
        for vol in spec.get("volumes") or []:
            claim = (vol.get("persistentVolumeClaim") or {}).get("claimName")
            if claim:
                mounts.setdefault((ns, claim), []).append(pod_name)
    rows = []
    for pvc in pvc_data.get("items") or []:
        meta = pvc.get("metadata") or {}
        spec = pvc.get("spec") or {}
        status = pvc.get("status") or {}
        ns = str(meta.get("namespace") or "")
        name = str(meta.get("name") or "")
        if ns not in namespaces:
            continue
        volume = str(spec.get("volumeName") or "")
        pv = pvs.get(volume) or {}
        pv_spec = pv.get("spec") or {}
        pv_meta = pv.get("metadata") or {}
        owner_kind, owner_name, owner_refs = metadata_owner(meta)
        requested = parse_k8s_quantity((((spec.get("resources") or {}).get("requests") or {}).get("storage")))
        host_path = pv_host_path(pv)
        active = sorted(mounts.get((ns, name), []))
        # Size is only estimated when the PV exposes a host/local path.
        estimated = du_size(host_path, 8) if host_path else None
        candidate_reasons = []
        if not active:
            candidate_reasons.append("no-active-mount-observed")
        if status.get("phase") != "Bound":
            candidate_reasons.append("pvc-not-bound")
        if (pv.get("status") or {}).get("phase") == "Released":
            candidate_reasons.append("pv-released")
        # Reasons are always reported; the candidate flag also requires an opted-in namespace.
        review_candidate = ns in candidate_namespaces and len(candidate_reasons) > 0
        rows.append({
            "namespace": ns,
            "pvc": name,
            "volume": volume or None,
            "phase": status.get("phase"),
            "pvPhase": (pv.get("status") or {}).get("phase"),
            "ownerKind": owner_kind,
            "owner": owner_name,
            "ownerRefs": owner_refs,
            "ownerGroup": pvc_owner_group(ns, owner_name),
            "storageClass": spec.get("storageClassName") or pv_spec.get("storageClassName"),
            "reclaimPolicy": pv_spec.get("persistentVolumeReclaimPolicy"),
            "requestedBytes": requested,
            "requestedHuman": fmt_bytes(requested or 0),
            "hostPath": host_path,
            "pvCreatedAt": (pv_meta.get("creationTimestamp") if isinstance(pv_meta, dict) else None),
            "pvcCreatedAt": meta.get("creationTimestamp"),
            "activeMountPods": active,
            "estimatedBytes": estimated,
            "estimatedHuman": fmt_bytes(estimated or 0),
            "reviewCandidate": review_candidate,
            "reviewReasons": candidate_reasons,
            "dryRunOnly": True,
        })
    # Largest estimated usage first.
    rows.sort(key=lambda item: safe_int(item.get("estimatedBytes")), reverse=True)
    # Aggregate count, estimated bytes and mounting-pod count per namespace and per owner group.
    by_namespace = {}
    by_owner_group = {}
    for row in rows:
        for bucket, key in [(by_namespace, row.get("namespace") or "unknown"), (by_owner_group, row.get("ownerGroup") or "unknown")]:
            current = bucket.setdefault(key, {"count": 0, "estimatedBytes": 0, "activeMountCount": 0})
            current["count"] += 1
            current["estimatedBytes"] += safe_int(row.get("estimatedBytes"))
            current["activeMountCount"] += len(row.get("activeMountPods") or [])
            current["estimatedHuman"] = fmt_bytes(current["estimatedBytes"])
    review_candidates = [row for row in rows if row.get("reviewCandidate")]
    return {
        "scope": "YAML-configured PVC namespaces",
        "configSource": "config/unidesk-cli.yaml#gc.remote.targets.%s.pvcAttribution" % PROVIDER_ID,
        "namespaces": sorted(namespaces),
        "candidateNamespaces": sorted(candidate_namespaces),
        "pvcCount": len(rows),
        "reviewCandidateCount": len(review_candidates),
        "estimatedBytes": sum(safe_int(row.get("estimatedBytes")) for row in rows),
        "estimatedHuman": fmt_bytes(sum(safe_int(row.get("estimatedBytes")) for row in rows)),
        "requestedBytes": sum(safe_int(row.get("requestedBytes")) for row in rows),
        "requestedHuman": fmt_bytes(sum(safe_int(row.get("requestedBytes")) for row in rows)),
        "byNamespace": by_namespace,
        "byOwnerGroup": by_owner_group,
        "topPvcs": rows[:limit],
        "reviewCandidates": review_candidates[:limit],
        # Command strings only; they are returned for the operator, not executed here.
        "handoff": {
            "hwlab": {
                "dryRun": "bun scripts/cli.ts hwlab nodes control-plane cleanup-runs --node %s --lane %s --min-age-minutes 30 --limit 200 --dry-run" % (hwlab_node, hwlab_lane),
                "releasedPvs": "bun scripts/cli.ts hwlab nodes control-plane cleanup-released-pvs --node %s --lane %s --limit 200 --dry-run" % (hwlab_node, hwlab_lane),
            },
            "agentrun": {
                "dryRun": "bun scripts/cli.ts agentrun control-plane cleanup-runs --node %s --lane %s --min-age-minutes 30 --limit 200 --dry-run" % (agentrun_node, agentrun_lane),
                "releasedPvs": "bun scripts/cli.ts agentrun control-plane cleanup-released-pvs --node %s --lane %s --limit 200 --dry-run" % (agentrun_node, agentrun_lane),
            },
        },
        "policy": "analysis-only; remote GC never deletes PVC/PV/local-path data and only hands off to owner-aware retention commands",
    }
 def compact_pvc_row(row):
    """Project one PVC attribution row onto the fields used in compact output.

    Every field is read with ``row.get`` so a missing key becomes ``None``.
    ``activeMountCount`` is derived: it is the length of ``activeMountPods``,
    or 0 when that entry is missing or empty. Keys not listed here are dropped.
    """
    leading_fields = (
        "namespace",
        "pvc",
        "phase",
        "pvPhase",
        "ownerKind",
        "owner",
        "ownerGroup",
        "estimatedBytes",
        "estimatedHuman",
    )
    compact = {field: row.get(field) for field in leading_fields}
    compact["activeMountCount"] = len(row.get("activeMountPods") or [])
    for field in ("reviewCandidate", "reviewReasons"):
        compact[field] = row.get(field)
    return compact
 def compact_pvc_attribution(payload):
    """Shrink a PVC attribution payload for default (non ``--full``) output.

    When ``OPTIONS["full"]`` is truthy the payload is returned unchanged.
    Otherwise the summary fields are copied, ``topPvcs`` is limited to its
    first 8 entries and ``reviewCandidates`` to its first 2; entries in those
    slices that are not dicts are dropped, the rest go through
    ``compact_pvc_row``.
    """
    if bool(OPTIONS.get("full")):
        return payload

    def shrink(entries, count):
        # Slice first, then filter: non-dict entries still consume a slot.
        return [compact_pvc_row(entry) for entry in entries[:count] if isinstance(entry, dict)]

    top_rows = payload.get("topPvcs") or []
    review_rows = payload.get("reviewCandidates") or []
    compact = {}
    for field in (
        "configSource",
        "candidateNamespaces",
        "pvcCount",
        "reviewCandidateCount",
        "estimatedBytes",
        "estimatedHuman",
        "byNamespace",
        "byOwnerGroup",
    ):
        compact[field] = payload.get(field)
    compact["topPvcs"] = shrink(top_rows, 8)
    compact["reviewCandidates"] = shrink(review_rows, 2)
    compact["handoff"] = payload.get("handoff")
    compact["compacted"] = True
    compact["fullDisclosure"] = "rerun with --full for hostPath, creation timestamps and complete row details"
    return compact
 def compact_ci_storage_summary(payload):
    """Return only the headline totals of a storage payload.

    The listed summary fields are copied with ``payload.get`` (missing keys
    become ``None``); row-level data is omitted and the result is marked as
    compacted.
    """
    summary = {}
    for field in (
        "scope",
        "configSource",
        "pvcCount",
        "reviewCandidateCount",
        "estimatedBytes",
        "estimatedHuman",
        "requestedBytes",
        "requestedHuman",
    ):
        summary[field] = payload.get(field)
    summary.update({
        "compacted": True,
        "fullDisclosure": "use pvcAttribution or --full for row-level details",
    })
    return summary
 def local_path_storage_root():
    """Return the configured local-path storage root as a canonical path.

    Reads ``root`` from ``LOCAL_PATH_CONFIG``. A falsy value yields ``""``;
    anything else is made absolute and resolved with ``os.path.realpath``.
    """
    configured = config_str(LOCAL_PATH_CONFIG, "root", "")
    return os.path.realpath(os.path.abspath(configured)) if configured else ""
 def local_path_orphan_prefixes():
    """Return the ``orphanDirPrefixes`` entry of ``LOCAL_PATH_CONFIG`` (default: empty list)."""
    allowlist = config_list(LOCAL_PATH_CONFIG, "orphanDirPrefixes", [])
    return allowlist
 def is_direct_local_path_child(root, path):
    """Tell whether ``path`` resolves to an entry located directly inside ``root``.

    ``path`` is made absolute and resolved with ``os.path.realpath``; ``root``
    is compared as given. Both the parent-directory test and the prefix test
    must hold.
    """
    candidate = os.path.realpath(os.path.abspath(path))
    if os.path.dirname(candidate) != root:
        return False
    return candidate.startswith(root.rstrip("/") + "/")
 def local_path_referenced_paths(root):
    """Collect the canonical PV host paths that are ``root`` itself or lie under it.

    The PV list comes from ``kubectl_json(["get", "pv"], 30)`` and each item's
    path from ``pv_host_path``. Paths are made absolute and resolved with
    ``os.path.realpath`` before being compared with ``root``.

    Returns:
        set of resolved host paths located at or below ``root``.

    Raises:
        RuntimeError: when the PV list cannot be obtained. Callers use the
            returned set to decide which directories are safe to delete, so an
            unavailable list must not be mistaken for "nothing is referenced".
    """
    pv_data = kubectl_json(["get", "pv"], 30)
    if not isinstance(pv_data, dict):
        # Fail closed: without the PV list every directory would look orphaned.
        raise RuntimeError("cannot list PVs; refusing to treat local-path directories as unreferenced")
    root_prefix = root.rstrip("/") + "/"
    referenced = set()
    for pv in pv_data.get("items") or []:
        host_path = pv_host_path(pv)
        if not host_path:
            continue
        resolved = os.path.realpath(os.path.abspath(host_path))
        if resolved == root or resolved.startswith(root_prefix):
            referenced.add(resolved)
    return referenced
 def assert_local_path_orphan(path, referenced=None):
    """Validate that ``path`` may be removed as a local-path orphan.

    The checks run in a fixed order and the first failure raises:
    configured root present, direct child of the root, a real directory that
    is not a symlink, name matches a configured prefix, no overlap with any
    PV-referenced path, and no open fd/cwd reference.

    Args:
        path: candidate directory.
        referenced: optional pre-computed set of PV-referenced paths; when
            ``None`` it is obtained from ``local_path_referenced_paths``.

    Returns:
        The resolved (absolute, realpath'd) candidate path.

    Raises:
        RuntimeError: when any check fails.
    """
    root = local_path_storage_root()
    if not root:
        raise RuntimeError("localPathStorage.root is not configured")
    allowlist = local_path_orphan_prefixes()
    target = os.path.realpath(os.path.abspath(path))
    basename = os.path.basename(target)
    if not is_direct_local_path_child(root, target):
        raise RuntimeError("refusing to remove local-path orphan outside configured direct storage root: %s" % path)
    is_real_directory = not os.path.islink(path) and os.path.isdir(target)
    if not is_real_directory:
        raise RuntimeError("refusing to remove non-directory or symlink local-path orphan: %s" % path)
    name_allowed = bool(allowlist) and any(basename.startswith(prefix) for prefix in allowlist)
    if not name_allowed:
        raise RuntimeError("refusing to remove local-path orphan outside YAML prefix allowlist: %s" % path)
    if referenced is None:
        known_refs = local_path_referenced_paths(root)
    else:
        known_refs = referenced
    target_prefix = target.rstrip("/") + "/"
    for ref in known_refs:
        # Equal, a PV path inside the candidate, or the candidate inside a PV path.
        overlaps = target == ref or ref.startswith(target_prefix) or target.startswith(ref.rstrip("/") + "/")
        if overlaps:
            raise RuntimeError("refusing to remove local-path path still referenced by PV: %s" % path)
    if path_has_open_fd(target):
        raise RuntimeError("refusing to remove local-path orphan with open fd/cwd reference: %s" % path)
    return target
 def local_path_orphan_rows():
    """Scan the local-path storage root for deletable orphan directories.

    Returns a ``(rows, meta)`` pair. When the feature is disabled, the root is
    unusable, or the prefix allowlist is empty, ``rows`` is empty and ``meta``
    has ``ok: False`` plus a ``reason``. Otherwise each allowlisted real
    directory directly under the root is either protected (not a direct child,
    mtime newer than the minimum age, overlapping a PV-referenced path, or
    held open) or reported as an orphan row with its measured size. Rows are
    sorted by ``estimatedReclaimBytes``, largest first.
    """
    if not config_bool(LOCAL_PATH_CONFIG, "enabled", False):
        return [], {"ok": False, "reason": "local-path-orphan-cleanup-disabled"}
    root = local_path_storage_root()
    prefixes = local_path_orphan_prefixes()
    root_usable = bool(root) and os.path.isdir(root) and not os.path.islink(root)
    if not root_usable:
        return [], {"ok": False, "reason": "local-path-root-unavailable", "root": root}
    if not prefixes:
        return [], {"ok": False, "reason": "local-path-prefix-allowlist-empty", "root": root}
    referenced = local_path_referenced_paths(root)
    min_age_minutes = config_float(LOCAL_PATH_CONFIG, "orphanMinAgeMinutes", 0.0, minimum=0.0)
    cutoff = time.time() - min_age_minutes * 60.0

    def overlaps(candidate, ref):
        # Same path, PV path nested in the candidate, or candidate nested in a PV path.
        return candidate == ref or ref.startswith(candidate.rstrip("/") + "/") or candidate.startswith(ref.rstrip("/") + "/")

    orphans = []
    protected = []
    for name in sorted(os.listdir(root)):
        entry = os.path.join(root, name)
        resolved = os.path.realpath(os.path.abspath(entry))
        try:
            info = os.lstat(entry)
        except OSError:
            continue
        is_plain_dir = os.path.isdir(entry) and not os.path.islink(entry)
        if not is_plain_dir or not any(name.startswith(prefix) for prefix in prefixes):
            continue
        base = {"path": resolved, "name": name, "sizeBytes": 0, "estimatedReclaimBytes": 0}
        verdict = None
        if not is_direct_local_path_child(root, resolved):
            verdict = {"reason": "not-direct-child"}
        elif info.st_mtime >= cutoff:
            verdict = {"reason": "younger-than-min-age"}
        else:
            hits = [ref for ref in referenced if overlaps(resolved, ref)]
            if hits:
                verdict = {"reason": "pv-referenced", "referencedCount": len(hits)}
            elif path_has_open_fd(resolved):
                verdict = {"reason": "open-fd"}
        if verdict is not None:
            protected.append({**base, **verdict})
            continue
        size = du_size(resolved, 10) or path_size(resolved)
        orphans.append({**base, "sizeBytes": size, "estimatedReclaimBytes": size})
    orphans.sort(key=lambda item: safe_int(item.get("estimatedReclaimBytes")), reverse=True)
    meta = {
        "ok": True,
        "root": root,
        "prefixes": prefixes,
        "referencedPathCount": len(referenced),
        "protectedCount": len(protected),
        "protectedPreview": protected[:8],
        "minAgeMinutes": min_age_minutes,
    }
    return orphans, meta
 def local_path_orphan_candidate():
    """Describe the local-path orphan cleanup as a GC candidate entry.

    Returns a ``blocked`` diagnostic entry when ``local_path_orphan_rows``
    reports ``ok`` as falsy, ``None`` when the selected rows add up to no
    reclaimable bytes, and otherwise a ``medium`` risk delete candidate that
    covers the first ``OPTIONS["limit"]`` rows (50 when unset or falsy).
    """
    rows, meta = local_path_orphan_rows()
    if not meta.get("ok"):
        unavailable = {
            "id": "k3s-local-path-orphans:unavailable",
            "kind": "k3s-local-path-orphans-unavailable",
            "risk": "blocked",
            "description": "K3s local-path orphan cleanup is unavailable or disabled by YAML",
            "estimatedReclaimBytes": 0,
            "diagnostic": meta,
        }
        return unavailable
    limit = int(OPTIONS.get("limit") or 50)
    selected = rows[:limit]
    estimated = 0
    for row in selected:
        estimated += safe_int(row.get("estimatedReclaimBytes"))
    if estimated <= 0:
        return None
    preview = []
    for row in selected[:8]:
        preview.append({
            "name": row.get("name"),
            "path": row.get("path"),
            "estimatedReclaimBytes": row.get("estimatedReclaimBytes"),
        })
    storage_root = meta.get("root")
    return {
        "id": "k3s-local-path-orphans:delete",
        "kind": "k3s-local-path-orphans-delete",
        "risk": "medium",
        "description": "Delete YAML-allowlisted k3s local-path storage directories that no PV references and no process has open",
        "path": storage_root,
        "sizeBytes": estimated,
        "estimatedReclaimBytes": estimated,
        "orphanCount": len(rows),
        "selectedOrphanCount": len(selected),
        "protectedCount": meta.get("protectedCount"),
        "referencedPathCount": meta.get("referencedPathCount"),
        "selectedPreview": preview,
        "protectedPreview": meta.get("protectedPreview"),
        "action": {"op": "rm-recursive", "allowlist": "yaml-local-path-orphan", "root": storage_root, "limit": limit},
    }
 def execute_local_path_orphan_cleanup():
    """Remove the selected local-path orphan directories and report what was freed.

    The first ``OPTIONS["limit"]`` rows (50 when unset or falsy) from
    ``local_path_orphan_rows`` are re-validated with ``assert_local_path_orphan``
    against a freshly fetched PV reference set and then removed.

    Removal is best-effort (``ignore_errors=True``), so each directory is
    checked afterwards: only directories that are really gone count as deleted
    and contribute their full size. A directory that still exists is listed
    under ``failedPreview`` and contributes only the bytes that disappeared.

    Returns:
        dict with ``reclaimedBytes``, ``deletedOrphanCount``, ``deletedPreview``
        (first 12), ``failedOrphanCount``, ``failedPreview`` (first 12),
        ``root`` and ``protectedCount``.

    Raises:
        RuntimeError: when the orphan scan is unavailable, or when
            ``assert_local_path_orphan`` rejects a selected path.
    """
    rows, meta = local_path_orphan_rows()
    if not meta.get("ok"):
        raise RuntimeError("local-path orphan cleanup unavailable: %s" % meta.get("reason"))
    limit = int(OPTIONS.get("limit") or 50)
    selected = rows[:limit]
    referenced = local_path_referenced_paths(local_path_storage_root())
    reclaimed = 0
    deleted = []
    failed = []
    for row in selected:
        path = assert_local_path_orphan(row.get("path"), referenced)
        before = du_size(path, 10) or path_size(path)
        shutil.rmtree(path, ignore_errors=True)
        if os.path.lexists(path):
            # rmtree swallowed an error; account only for what actually went away.
            remaining = du_size(path, 10) or path_size(path)
            freed = max(0, before - remaining)
            reclaimed += freed
            failed.append({"name": row.get("name"), "path": path, "reclaimedBytes": freed, "remainingBytes": remaining})
            continue
        reclaimed += before
        deleted.append({"name": row.get("name"), "path": path, "reclaimedBytes": before})
    return {
        "reclaimedBytes": reclaimed,
        "deletedOrphanCount": len(deleted),
        "deletedPreview": deleted[:12],
        "failedOrphanCount": len(failed),
        "failedPreview": failed[:12],
        "root": meta.get("root"),
        "protectedCount": meta.get("protectedCount"),
    }
@@ -0,0 +1,677 @@
 def active_hwlab_ci_writes():
    """List PipelineRun/TaskRun rows in ``hwlab-ci`` whose second column is neither ``True`` nor ``False``.

    kubectl is invoked directly through ``kctl`` and the output is filtered
    here, so ``ok`` reflects kubectl's own exit status. (With a shell pipeline
    ending in ``head`` the pipeline status is that of the last command, which
    made a failed kubectl call look like an idle namespace.)

    Returns:
        dict with ``ok`` (kubectl exit code was 0), ``activeCount``,
        ``activePreview`` (at most 40 matching lines) and ``command``
        (``bounded`` view of the kubectl result).
    """
    result = kctl(["get", "pipelinerun,taskrun", "-n", "hwlab-ci", "--no-headers"], 15)
    active = []
    for line in (result.get("stdout") or "").splitlines():
        fields = line.split()
        if not fields:
            continue
        # A row without a second column is treated as active.
        state = fields[1] if len(fields) > 1 else ""
        if state != "True" and state != "False":
            active.append(line)
    lines = active[:40]
    return {"ok": result["exitCode"] == 0, "activeCount": len(lines), "activePreview": lines, "command": bounded(result)}
 def active_hwlab_ci_jobs():
    """List Job rows in ``hwlab-ci`` whose second column is neither ``Complete`` nor ``Failed``.

    kubectl is invoked directly through ``kctl`` and the output is filtered
    here, so ``ok`` reflects kubectl's own exit status. (With a shell pipeline
    ending in ``head`` the pipeline status is that of the last command, which
    made a failed kubectl call look like an idle namespace.)

    Returns:
        dict with ``ok`` (kubectl exit code was 0), ``activeCount``,
        ``activePreview`` (at most 40 matching lines) and ``command``
        (``bounded`` view of the kubectl result).
    """
    result = kctl(["get", "jobs", "-n", "hwlab-ci", "--no-headers"], 15)
    active = []
    for line in (result.get("stdout") or "").splitlines():
        fields = line.split()
        if not fields:
            continue
        # A row without a second column is treated as active.
        state = fields[1] if len(fields) > 1 else ""
        if state != "Complete" and state != "Failed":
            active.append(line)
    lines = active[:40]
    return {"ok": result["exitCode"] == 0, "activeCount": len(lines), "activePreview": lines, "command": bounded(result)}
 def wait_no_active_hwlab_ci(timeout=180):
    """Poll every 5 seconds until both hwlab-ci probes succeed and report zero active items.

    Args:
        timeout: seconds to keep polling.

    Returns:
        ``{"ok": True, "last": ...}`` as soon as both probes are ``ok`` with an
        active count of 0, otherwise ``{"ok": False, "last": ...}`` after the
        deadline. ``last`` holds the most recent probe results, or ``None``
        when no poll ran.
    """
    give_up_at = time.time() + timeout
    snapshot = None
    while time.time() < give_up_at:
        writes = active_hwlab_ci_writes()
        jobs = active_hwlab_ci_jobs()
        snapshot = {"writes": writes, "jobs": jobs}
        probes_ok = writes.get("ok") and jobs.get("ok")
        if probes_ok and all(int(probe.get("activeCount") or 0) == 0 for probe in (writes, jobs)):
            return {"ok": True, "last": snapshot}
        time.sleep(5)
    return {"ok": False, "last": snapshot}
 def kubectl_json(args, timeout=20):
    """Run kubectl with ``-o json`` against the k3s kubeconfig and parse the output.

    Args:
        args: list of kubectl arguments; ``-o json`` is appended.
        timeout: passed through to ``command``.

    Returns:
        The decoded JSON value (an empty dict when stdout is empty), or
        ``None`` when kubectl exits non-zero or the output cannot be decoded.
    """
    argv = ["env", "KUBECONFIG=/etc/rancher/k3s/k3s.yaml", "kubectl"] + args
    argv += ["-o", "json"]
    outcome = command(argv, timeout)
    if outcome["exitCode"] != 0:
        return None
    try:
        parsed = json.loads(outcome["stdout"] or "{}")
    except Exception:
        return None
    return parsed
 def kctl(args, timeout=30):
    """Run kubectl with the k3s kubeconfig and return the raw ``command`` result."""
    argv = ["env", "KUBECONFIG=/etc/rancher/k3s/k3s.yaml", "kubectl"] + args
    return command(argv, timeout)
 def workload_image_refs():
    """Collect local-registry image references used by cluster workloads.

    Runs one kubectl query that prints the container and initContainer images
    of deployments, statefulsets, daemonsets and pods in all namespaces, then
    keeps only images hosted at ``127.0.0.1:5000/``.

    Returns:
        ``(refs, digests, command)`` where ``refs`` is a set of
        ``(repo, tag)`` pairs (digest-pinned images appear as
        ``(repo, "@sha256:...")``), ``digests`` is the set of ``sha256:...``
        digests of digest-pinned images, and ``command`` is the ``bounded``
        view of the kubectl result.
    """
    script = "KUBECONFIG=/etc/rancher/k3s/k3s.yaml kubectl get deploy,sts,ds,pod -A -o jsonpath='{range .items[*]}{range .spec.containers[*]}{.image}{\"\\n\"}{end}{range .spec.initContainers[*]}{.image}{\"\\n\"}{end}{range .spec.template.spec.containers[*]}{.image}{\"\\n\"}{end}{range .spec.template.spec.initContainers[*]}{.image}{\"\\n\"}{end}{end}' 2>/dev/null | sort -u"
    result = command(["sh", "-lc", script], 30)
    registry_prefix = "127.0.0.1:5000/"
    tag_refs = set()
    pinned_digests = set()
    for raw in (result.get("stdout") or "").splitlines():
        image = raw.strip()
        if not image.startswith(registry_prefix):
            continue
        ref = image.split(registry_prefix, 1)[1]
        if "@sha256:" in ref:
            repo, digest = ref.split("@", 1)
            tag_refs.add((repo, "@" + digest))
            pinned_digests.add("sha256:" + digest.split(":", 1)[1])
            continue
        if ":" in ref:
            repo, tag = ref.rsplit(":", 1)
            tag_refs.add((repo, tag))
    return tag_refs, pinned_digests, bounded(result)
 def registry_request(method, path, headers=None, timeout=20):
    """Send one HTTP request to the registry at ``http://127.0.0.1:5000``.

    Args:
        method: HTTP method.
        path: request path, appended to the registry base URL as-is.
        headers: optional header dict (an empty dict is sent when falsy).
        timeout: timeout passed to ``urlopen``.

    Returns:
        dict with ``status``, ``headers`` (as a plain dict) and ``body``
        (decoded as UTF-8 with undecodable bytes replaced).
    """
    target = "http://127.0.0.1:5000" + path
    request = urllib.request.Request(target, method=method, headers=headers or {})
    with urllib.request.urlopen(request, timeout=timeout) as reply:
        raw = reply.read()
        return {
            "status": reply.status,
            "headers": dict(reply.headers),
            "body": raw.decode("utf-8", errors="replace"),
        }
 def registry_tag_rows():
    """List every tag found under ``REGISTRY_REPOSITORY_ROOT``.

    A tag is a directory ``<repo>/_manifests/tags/<tag>`` that contains a
    regular file ``current/link``; the stripped content of that file is
    reported as the tag's digest and its mtime as the tag's timestamp. Tags
    whose link file is missing or unreadable are skipped.

    Returns:
        list of dicts with ``repo``, ``tag``, ``digest``, ``mtime``,
        ``mtimeIso`` (UTC) and ``path`` (the tag directory). Empty when the
        root is not a directory.
    """
    root = REGISTRY_REPOSITORY_ROOT
    found = []
    if not os.path.isdir(root):
        return found
    tags_suffix = "/_manifests/tags"
    for current_dir, _subdirs, _files in os.walk(root):
        if os.path.basename(current_dir) != "tags":
            continue
        relative = os.path.relpath(current_dir, root)
        if not relative.endswith(tags_suffix):
            continue
        repo = relative[:-len(tags_suffix)]
        try:
            tag_names = sorted(os.listdir(current_dir))
        except OSError:
            continue
        for tag in tag_names:
            tag_dir = os.path.join(current_dir, tag)
            link = os.path.join(tag_dir, "current", "link")
            if not os.path.isfile(link):
                continue
            try:
                with open(link, "r", encoding="utf-8") as handle:
                    digest = handle.read().strip()
                modified = os.stat(link).st_mtime
            except OSError:
                continue
            found.append({
                "repo": repo,
                "tag": tag,
                "digest": digest,
                "mtime": modified,
                "mtimeIso": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime(modified)),
                "path": tag_dir,
            })
    return found
 def registry_revision_rows():
    """List every manifest revision found under ``REGISTRY_REPOSITORY_ROOT``.

    A revision is a directory ``<repo>/_manifests/revisions/sha256/<hex>`` that
    contains a regular file ``link``; the stripped content of that file is
    reported as the digest and its mtime as the timestamp. Revisions whose
    link file is missing or unreadable are skipped.

    Returns:
        list of dicts with ``repo``, ``digest``, ``mtime``, ``mtimeIso`` (UTC)
        and ``path`` (the revision directory). Empty when the root is not a
        directory.
    """
    root = REGISTRY_REPOSITORY_ROOT
    found = []
    if not os.path.isdir(root):
        return found
    revisions_suffix = "/_manifests/revisions/sha256"
    for current_dir, _subdirs, _files in os.walk(root):
        if os.path.basename(current_dir) != "sha256":
            continue
        relative = os.path.relpath(current_dir, root)
        if not relative.endswith(revisions_suffix):
            continue
        repo = relative[:-len(revisions_suffix)]
        try:
            revision_names = sorted(os.listdir(current_dir))
        except OSError:
            continue
        for digest_hex in revision_names:
            revision_dir = os.path.join(current_dir, digest_hex)
            link = os.path.join(revision_dir, "link")
            if not os.path.isfile(link):
                continue
            try:
                with open(link, "r", encoding="utf-8") as handle:
                    digest = handle.read().strip()
                modified = os.stat(link).st_mtime
            except OSError:
                continue
            found.append({
                "repo": repo,
                "digest": digest,
                "mtime": modified,
                "mtimeIso": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime(modified)),
                "path": revision_dir,
            })
    return found
 def registry_retention_repo(repo):
    """Tell whether ``repo`` starts with ``hwlab/hwlab-`` or ``hwlab/cache/hwlab-``."""
    managed_prefixes = ("hwlab/hwlab-", "hwlab/cache/hwlab-")
    return repo.startswith(managed_prefixes)
 def registry_digest_hex(digest):
    """Return the 64-character lowercase hex part of a ``sha256:<hex>`` digest.

    Returns ``None`` for anything else: non-strings, other algorithms, wrong
    length, uppercase or non-hex characters, and any trailing characters.
    ``re.fullmatch`` is used because ``$`` also matches just before a trailing
    newline, which would let ``"sha256:<hex>\\n"`` through.
    """
    if not isinstance(digest, str) or not digest.startswith("sha256:"):
        return None
    value = digest.split(":", 1)[1]
    if re.fullmatch(r"[0-9a-f]{64}", value) is None:
        return None
    return value
 def registry_blob_data_path(digest):
    """Return the blob ``data`` file path for ``digest`` under ``REGISTRY_ROOT``.

    The path is ``docker/registry/v2/blobs/sha256/<first two hex chars>/<hex>/data``.
    Returns ``None`` when ``registry_digest_hex`` rejects the digest.
    """
    hex_value = registry_digest_hex(digest)
    if hex_value is None:
        return None
    shard = hex_value[:2]
    return os.path.join(REGISTRY_ROOT, "docker/registry/v2/blobs/sha256", shard, hex_value, "data")
 # Parsed blob JSON keyed by digest; ``None`` is cached for blobs that are missing or not JSON.
 _manifest_cache = {}
 def registry_manifest_json(digest):
    """Load and cache the JSON document stored in the blob for ``digest``.

    At most 8 MiB of the blob is read. The result (including ``None`` for a
    missing blob, an invalid digest, or content that does not decode as UTF-8
    JSON) is stored in ``_manifest_cache`` so each digest is read only once.
    """
    try:
        return _manifest_cache[digest]
    except KeyError:
        pass
    parsed = None
    blob = registry_blob_data_path(digest)
    if blob is not None and os.path.isfile(blob):
        try:
            with open(blob, "rb") as handle:
                raw = handle.read(8 * 1024 * 1024)
            parsed = json.loads(raw.decode("utf-8"))
        except Exception:
            parsed = None
    _manifest_cache[digest] = parsed
    return parsed
 def registry_manifest_refs(digest):
    """Return the valid sha256 digests referenced by the manifest stored for ``digest``.

    The ``config`` descriptor and every entry of ``layers`` and ``manifests``
    are inspected; a reference is kept when its ``digest`` is a string that
    ``registry_digest_hex`` accepts.

    Any blob that happens to parse as a JSON object reaches this function, so
    the shape is not trusted: a ``config`` that is not a dict, a ``layers`` /
    ``manifests`` value that is not a list, and list entries that are not
    dicts are skipped instead of raising.

    Returns:
        set of digest strings; empty when the blob is not a JSON object.
    """
    manifest = registry_manifest_json(digest)
    if not isinstance(manifest, dict):
        return set()
    descriptors = [manifest.get("config")]
    for field in ("layers", "manifests"):
        entries = manifest.get(field)
        if isinstance(entries, list):
            descriptors.extend(entries)
    refs = set()
    for descriptor in descriptors:
        if not isinstance(descriptor, dict):
            continue
        child = descriptor.get("digest")
        if isinstance(child, str) and registry_digest_hex(child) is not None:
            refs.add(child)
    return refs
 def registry_digest_closure(seed):
    """Return every valid digest reachable from ``seed`` through manifest references.

    Digests rejected by ``registry_digest_hex`` are ignored and not expanded.
    Each accepted digest is expanded once via ``registry_manifest_refs``.
    """
    visited = set()
    pending = list(seed)
    while pending:
        current = pending.pop()
        if current in visited:
            continue
        if registry_digest_hex(current) is None:
            continue
        visited.add(current)
        pending.extend(child for child in registry_manifest_refs(current) if child not in visited)
    return visited
 def registry_blob_size(digest):
    """Return the allocated size of the blob for ``digest`` as ``st_blocks * 512``.

    Returns 0 when the digest is invalid, the blob file does not exist, or it
    cannot be stat'ed.
    """
    blob = registry_blob_data_path(digest)
    if blob is None or not os.path.isfile(blob):
        return 0
    try:
        allocated_blocks = int(os.lstat(blob).st_blocks)
    except OSError:
        return 0
    return allocated_blocks * 512
 def estimate_registry_reclaim(delete_manifest_digests, kept_manifest_digests):
    """Estimate the bytes freed by deleting the given manifests.

    Both digest collections are expanded with ``registry_digest_closure``; the
    estimate is the summed ``registry_blob_size`` of digests reachable only
    from the deleted side.
    """
    doomed = registry_digest_closure(delete_manifest_digests)
    survivors = registry_digest_closure(kept_manifest_digests)
    only_doomed = doomed - survivors
    total = 0
    for digest in only_doomed:
        total += registry_blob_size(digest)
    return total
 def plan_registry_retention():
    """Compute which registry tags, revisions and manifests retention would delete.

    Nothing is deleted here. Inputs are ``OPTIONS`` (``registryKeepPerRepo``,
    default 5; ``registryMinAgeHours``, default 48), the images referenced by
    workloads, and the tag/revision link files on disk.

    A tag is kept when it is among the newest ``registryKeepPerRepo`` of its
    repo, is listed in ``REGISTRY_PROTECTED_TAGS``, is referenced by a workload
    by tag or digest, lives in a ``hwlab/cache/`` repo, or is newer than the
    minimum age. A tag that is not kept is only deleted when its repo starts
    with ``hwlab/hwlab-`` and its name is 7-40 lowercase hex characters.

    A revision is deleted when its repo is retention-managed
    (``registry_retention_repo``) and its digest is outside the closure of the
    protected digests (digests of kept tags, workload digests, and every
    revision of an unmanaged repo).

    Returns:
        dict with ``tagRows``, ``revisionRows``, ``deleteRows``,
        ``deleteRevisionRows``, ``summary``, ``deleteManifestsByRepo`` (sorted
        digest lists per repo) and ``refsCommand``.
    """
    keep_per_repo = int(OPTIONS.get("registryKeepPerRepo") if OPTIONS.get("registryKeepPerRepo") is not None else 5)
    min_age_hours = float(OPTIONS.get("registryMinAgeHours") if OPTIONS.get("registryMinAgeHours") is not None else 48)
    cutoff = time.time() - min_age_hours * 3600
    refs, digests, refs_command = workload_image_refs()
    rows = registry_tag_rows()
    revision_rows = registry_revision_rows()
    by_repo = {}
    for row in rows:
        by_repo.setdefault(row["repo"], []).append(row)
    keep = set()
    for repo, items in by_repo.items():
        items.sort(key=lambda item: item["mtime"], reverse=True)
        for row in items[:keep_per_repo]:
            keep.add((row["repo"], row["tag"]))
        for row in items:
            key = (row["repo"], row["tag"])
            if (
                row["tag"] in REGISTRY_PROTECTED_TAGS
                or key in refs
                or row["digest"] in digests
                or row["repo"].startswith("hwlab/cache/")
                or row["mtime"] >= cutoff
            ):
                keep.add(key)
    delete_rows = []
    kept_count = 0
    delete_by_repo = {}
    keep_by_repo = {}
    kept_digests = set()
    for row in rows:
        key = (row["repo"], row["tag"])
        should_delete = (
            key not in keep
            and row["repo"].startswith("hwlab/hwlab-")
            and re.match(r"^[0-9a-f]{7,40}$", row["tag"]) is not None
        )
        if should_delete:
            delete_rows.append(row)
            delete_by_repo[row["repo"]] = delete_by_repo.get(row["repo"], 0) + 1
        else:
            kept_count += 1
            kept_digests.add(row["digest"])
            keep_by_repo[row["repo"]] = keep_by_repo.get(row["repo"], 0) + 1
    protected_digests = kept_digests | digests
    protected_digests.update(row["digest"] for row in revision_rows if not registry_retention_repo(row["repo"]))
    protected_digests = registry_digest_closure(protected_digests)
    # Split revisions into kept/deleted in a single pass; a membership test
    # against the list of deleted rows would compare dicts pairwise.
    delete_revision_rows = []
    revision_delete_by_repo = {}
    kept_revision_digests = set()
    for row in revision_rows:
        if not registry_retention_repo(row["repo"]) or row["digest"] in protected_digests:
            kept_revision_digests.add(row["digest"])
            continue
        delete_revision_rows.append(row)
        revision_delete_by_repo[row["repo"]] = revision_delete_by_repo.get(row["repo"], 0) + 1
    delete_revision_digests = set(row["digest"] for row in delete_revision_rows)
    deletable_manifests = {}
    for row in delete_rows:
        # A digest still carried by a kept tag must not be deleted through this tag.
        if row["digest"] in kept_digests:
            continue
        deletable_manifests.setdefault(row["repo"], set()).add(row["digest"])
    for row in delete_revision_rows:
        deletable_manifests.setdefault(row["repo"], set()).add(row["digest"])
    deletable_manifest_count = sum(len(items) for items in deletable_manifests.values())
    registry_size = du_size(REGISTRY_ROOT, 30) or 0
    estimate = estimate_registry_reclaim(delete_revision_digests, kept_revision_digests)
    return {
        "tagRows": rows,
        "revisionRows": revision_rows,
        "deleteRows": delete_rows,
        "deleteRevisionRows": delete_revision_rows,
        "summary": {
            "totalTags": len(rows),
            "totalRevisions": len(revision_rows),
            "repoCount": len(by_repo),
            "keepPerRepo": keep_per_repo,
            "minAgeHours": min_age_hours,
            "protectedWorkloadRefs": len(refs),
            "protectedDigestRefs": len(digests),
            "protectedDigestClosure": len(protected_digests),
            "keptTags": kept_count,
            "deleteTags": len(delete_rows),
            "deleteManifests": deletable_manifest_count,
            "deleteRevisions": len(delete_revision_rows),
            "deleteByRepo": delete_by_repo,
            "revisionDeleteByRepo": revision_delete_by_repo,
            "keepByRepo": keep_by_repo,
            "registrySizeBytes": registry_size,
            "estimatedReclaimBytes": estimate,
        },
        "deleteManifestsByRepo": {repo: sorted(list(repo_digests)) for repo, repo_digests in deletable_manifests.items()},
        "refsCommand": refs_command,
    }
 def registry_deployment_preflight():
    """Check that the ``hwlab-registry`` deployment has the shape maintenance relies on.

    The deployment passes when its pod spec uses host networking, the registry
    container (the one named ``registry``, else the first container) runs a
    ``registry:`` image and mounts the ``storage`` volume at
    ``/var/lib/registry``, and that ``storage`` volume is a hostPath pointing
    at ``REGISTRY_ROOT``.

    Returns:
        ``{"ok": False, "reason": "registry-deployment-missing"}`` when the
        deployment cannot be read, otherwise a dict with ``ok``, ``reason`` and
        the observed image, hostNetwork flag, check results and replica counts.
    """
    dep = kubectl_json(["-n", "hwlab-ci", "get", "deploy", "hwlab-registry"], 20)
    if not dep:
        return {"ok": False, "reason": "registry-deployment-missing"}
    dep_spec = dep.get("spec") or {}
    pod_spec = (dep_spec.get("template") or {}).get("spec") or {}
    containers = pod_spec.get("containers") or []
    volumes = pod_spec.get("volumes") or []
    registry_container = containers[0] if containers else {}
    for item in containers:
        if item.get("name") == "registry":
            registry_container = item
            break
    mounts = registry_container.get("volumeMounts") or []
    has_host_path = False
    for vol in volumes:
        if (vol.get("hostPath") or {}).get("path") == REGISTRY_ROOT and vol.get("name") == "storage":
            has_host_path = True
            break
    has_mount = False
    for mount in mounts:
        if mount.get("name") == "storage" and mount.get("mountPath") == "/var/lib/registry":
            has_mount = True
            break
    image = str(registry_container.get("image") or "")
    host_network = pod_spec.get("hostNetwork")
    ok = bool(has_host_path and has_mount and image.startswith("registry:") and host_network is True)
    reason = "ok" if ok else "unexpected-registry-deployment-shape"
    return {
        "ok": ok,
        "reason": reason,
        "image": image,
        "hostNetwork": host_network,
        "hasExpectedHostPath": has_host_path,
        "hasExpectedMount": has_mount,
        "replicas": dep_spec.get("replicas"),
        "readyReplicas": (dep.get("status") or {}).get("readyReplicas"),
    }
 def cronjob_suspend_states(names):
    """Return ``{name: suspended}`` for each named CronJob in ``hwlab-ci`` that can be read.

    ``suspended`` is ``True`` only when ``spec.suspend`` is exactly ``True``.
    CronJobs that cannot be fetched are left out of the result.
    """
    states = {}
    for name in names:
        cronjob = kubectl_json(["-n", "hwlab-ci", "get", "cronjob", name], 15)
        if not cronjob:
            continue
        suspend_flag = (cronjob.get("spec") or {}).get("suspend")
        states[name] = bool(suspend_flag is True)
    return states
 def patch_cronjob_suspend(name, suspend):
    """Merge-patch ``spec.suspend`` of a CronJob in ``hwlab-ci`` and return the kubectl result."""
    patch = json.dumps({"spec": {"suspend": bool(suspend)}})
    argv = ["-n", "hwlab-ci", "patch", "cronjob", name, "--type=merge", "-p", patch]
    return kctl(argv, 30)
 def wait_registry_pod_count(target, timeout=90):
    """Poll every 2 seconds until the number of non-terminal registry pods equals ``target``.

    Pods are listed by the label ``app.kubernetes.io/name=hwlab-registry`` in
    ``hwlab-ci``; rows whose third column is ``Completed``, ``Error``,
    ``Failed`` or ``Succeeded`` are not counted.

    A poll only counts when kubectl itself succeeded. A failed call produces
    no output, and treating that as "zero pods" would report a successful
    scale-down while the pod state is actually unknown.

    Args:
        target: expected number of active pods.
        timeout: seconds to keep polling.

    Returns:
        ``{"ok": True, "lines", "allLines", "last"}`` on a match, otherwise
        ``{"ok": False, "lines": [], "last"}`` after the deadline. ``last`` is
        the ``bounded`` view of the most recent kubectl result.
    """
    terminal_states = set(["Completed", "Error", "Failed", "Succeeded"])
    deadline = time.time() + timeout
    last = None
    while time.time() < deadline:
        result = kctl(["-n", "hwlab-ci", "get", "pods", "-l", "app.kubernetes.io/name=hwlab-registry", "--no-headers"], 20)
        last = bounded(result)
        if result.get("exitCode") == 0:
            lines = [line for line in (result.get("stdout") or "").splitlines() if line.strip()]
            active = []
            for line in lines:
                parts = line.split()
                status = parts[2] if len(parts) >= 3 else ""
                if status in terminal_states:
                    continue
                active.append(line)
            if len(active) == target:
                return {"ok": True, "lines": active, "allLines": lines, "last": last}
        time.sleep(2)
    return {"ok": False, "lines": [], "last": last}
 def wait_pod_terminal(name, timeout=900):
    """Poll every 3 seconds until pod ``name`` in ``hwlab-ci`` reaches ``Succeeded`` or ``Failed``.

    Args:
        name: pod name.
        timeout: seconds to keep polling.

    Returns:
        ``{"ok": True, "phase": "Succeeded"}``, ``{"ok": False, "phase": "Failed"}``,
        or ``{"ok": False, "phase": "Timeout", "last": ...}`` where ``last`` is
        the most recently observed phase (``None`` when the pod was never read).
    """
    give_up_at = time.time() + timeout
    observed = None
    while time.time() < give_up_at:
        pod = kubectl_json(["-n", "hwlab-ci", "get", "pod", name], 20)
        if pod:
            phase = ((pod.get("status") or {}).get("phase")) or ""
            observed = {"phase": phase}
            if phase in ("Succeeded", "Failed"):
                return {"ok": phase == "Succeeded", "phase": phase}
        time.sleep(3)
    return {"ok": False, "phase": "Timeout", "last": observed}
 def execute_registry_retention():
    if PROVIDER_ID.upper() != "G14":
        raise RuntimeError("HWLAB registry retention is only supported on G14")
    deployment = registry_deployment_preflight()
    if not deployment.get("ok"):
        raise RuntimeError("registry deployment preflight failed: %s" % deployment.get("reason"))
    plan = plan_registry_retention()
    delete_rows = plan.get("deleteRows") or []
    delete_revision_rows = plan.get("deleteRevisionRows") or []
    delete_manifests = plan.get("deleteManifestsByRepo") or {}
    if not delete_rows and not delete_revision_rows:
        return {"reclaimedBytes": 0, "commandOutput": {"message": "no registry tags or revisions matched conservative retention", "registryPlan": plan.get("summary")}}
    if not delete_manifests:
        return {"reclaimedBytes": 0, "commandOutput": {"message": "matched manifests are still referenced by retained manifests; registry GC would not reclaim blobs", "registryPlan": plan.get("summary")}}
    cronjobs = ["hwlab-g14-branch-poller", "hwlab-v02-branch-poller"]
    original_crons = cronjob_suspend_states(cronjobs)
    before = du_size(REGISTRY_ROOT, 60) or 0
    gc_name = "hwlab-registry-gc-%s" % int(time.time())
    steps = []
    try:
        for name in original_crons:
            result = patch_cronjob_suspend(name, True)
            steps.append({"step": "suspend-cronjob", "name": name, "result": bounded(result)})
            if result["exitCode"] != 0:
                raise RuntimeError("failed to suspend cronjob %s" % name)
        idle_after_suspend = wait_no_active_hwlab_ci(180)
        steps.append({"step": "idle-after-suspend", "result": idle_after_suspend})
        if not idle_after_suspend.get("ok"):
            raise RuntimeError("refusing registry maintenance because hwlab-ci did not become idle after suspend")
        deleted_manifests = []
        for repo, digests in delete_manifests.items():
            encoded_repo = "/".join(urllib.parse.quote(part, safe="") for part in repo.split("/"))
            for digest in digests:
                try:
                    result = registry_request("DELETE", "/v2/%s/manifests/%s" % (encoded_repo, urllib.parse.quote(digest, safe=":")), {"Accept": "application/vnd.docker.distribution.manifest.v2+json, application/vnd.oci.image.manifest.v1+json"})
                    deleted_manifests.append({"repo": repo, "digest": digest, "status": result.get("status")})
                except urllib.error.HTTPError as exc:
                    if exc.code == 404:
                        deleted_manifests.append({"repo": repo, "digest": digest, "status": 404})
                    else:
                        raise
        steps.append({"step": "registry-api-delete-manifests", "count": len(deleted_manifests), "preview": deleted_manifests[:20]})
        scale_down = kctl(["-n", "hwlab-ci", "scale", "deploy", "hwlab-registry", "--replicas=0"], 60)
        steps.append({"step": "scale-registry-down", "result": bounded(scale_down)})
        if scale_down["exitCode"] != 0:
            raise RuntimeError("failed to scale registry down")
        waited_down = wait_registry_pod_count(0, 120)
        steps.append({"step": "wait-registry-down", "result": waited_down})
        if not waited_down.get("ok"):
            raise RuntimeError("registry pod did not scale down")
        deleted = []
        for row in delete_rows:
            path = os.path.abspath(str(row.get("path") or ""))
            if not path.startswith(REGISTRY_REPOSITORY_ROOT + "/") or "/_manifests/tags/" not in path:
                raise RuntimeError("refusing unexpected registry tag path: %s" % path)
            if not re.match(r"^[0-9a-f]{7,40}$", str(row.get("tag") or "")):
                raise RuntimeError("refusing unexpected registry tag name: %s" % row.get("tag"))
            if os.path.isdir(path) and not os.path.islink(path):
                shutil.rmtree(path)
                deleted.append({"repo": row.get("repo"), "tag": row.get("tag"), "digest": row.get("digest")})
        steps.append({"step": "delete-tag-directories", "count": len(deleted)})
        deleted_revisions = []
        for row in delete_revision_rows:
            path = os.path.abspath(str(row.get("path") or ""))
            digest_hex = registry_digest_hex(str(row.get("digest") or ""))
            if digest_hex is None:
                raise RuntimeError("refusing unexpected registry revision digest: %s" % row.get("digest"))
            if not path.startswith(REGISTRY_REPOSITORY_ROOT + "/") or "/_manifests/revisions/sha256/" not in path:
                raise RuntimeError("refusing unexpected registry revision path: %s" % path)
            if os.path.basename(path) != digest_hex:
                raise RuntimeError("refusing registry revision path/digest mismatch: %s" % path)
            if os.path.isdir(path) and not os.path.islink(path):
                shutil.rmtree(path)
                deleted_revisions.append({"repo": row.get("repo"), "digest": row.get("digest")})
        steps.append({"step": "delete-revision-directories", "count": len(deleted_revisions)})
        overrides = {
            "apiVersion": "v1",
            "spec": {
                "restartPolicy": "Never",
                "containers": [{
                    "name": "registry-gc",
                    "image": "registry:2.8.3",
                    "command": ["registry", "garbage-collect", "/etc/docker/registry/config.yml"],
                    "volumeMounts": [{"name": "storage", "mountPath": "/var/lib/registry"}],
                }],
                "volumes": [{"name": "storage", "hostPath": {"path": REGISTRY_ROOT, "type": "DirectoryOrCreate"}}],
            },
        }
        run_gc = kctl(["-n", "hwlab-ci", "run", gc_name, "--restart=Never", "--image=registry:2.8.3", "--overrides=%s" % json.dumps(overrides)], 60)
        steps.append({"step": "start-registry-gc-pod", "result": bounded(run_gc), "pod": gc_name})
        if run_gc["exitCode"] != 0:
            raise RuntimeError("failed to start registry GC pod")
        waited_gc = wait_pod_terminal(gc_name, 900)
        steps.append({"step": "wait-registry-gc", "result": waited_gc})
        logs = kctl(["-n", "hwlab-ci", "logs", gc_name], 120)
        steps.append({"step": "registry-gc-logs", "result": bounded(logs)})
        if not waited_gc.get("ok"):
            raise RuntimeError("registry GC pod did not complete successfully")
    finally:
        cleanup_gc = kctl(["-n", "hwlab-ci", "delete", "pod", gc_name, "--ignore-not-found=true"], 60)
        steps.append({"step": "delete-registry-gc-pod", "result": bounded(cleanup_gc)})
        scale_up = kctl(["-n", "hwlab-ci", "scale", "deploy", "hwlab-registry", "--replicas=%s" % int(deployment.get("replicas") or 1)], 60)
        steps.append({"step": "scale-registry-up", "result": bounded(scale_up)})
        rollout = kctl(["-n", "hwlab-ci", "rollout", "status", "deploy/hwlab-registry", "--timeout=180s"], 200)
        steps.append({"step": "wait-registry-rollout", "result": bounded(rollout)})
        for name, was_suspended in original_crons.items():
            restore = patch_cronjob_suspend(name, was_suspended)
            steps.append({"step": "restore-cronjob", "name": name, "suspend": was_suspended, "result": bounded(restore)})
    after = du_size(REGISTRY_ROOT, 60) or 0
    return {
        "reclaimedBytes": max(0, before - after),
        "commandOutput": {
            "registryPlan": plan.get("summary"),
            "deletedTagCount": len(delete_rows),
            "deletedRevisionCount": len(delete_revision_rows),
            "deletedManifestCount": sum(len(items) for items in delete_manifests.values()),
            "diskBeforeBytes": before,
            "diskAfterBytes": after,
            "steps": steps[-12:],
        },
    }
 def execute_registry_garbage_collect_only():
    """Run `registry garbage-collect` against the HWLAB registry storage without deleting tags.

    Sequence: suspend the branch-poller cronjobs, wait for hwlab-ci to go idle, scale the
    `hwlab-registry` deployment to zero, run a one-shot `registry:2.8.3` pod that executes
    `registry garbage-collect` on REGISTRY_ROOT, then (always, via `finally`) delete the GC pod,
    scale the registry back up, wait for its rollout and restore the cronjob suspend flags.

    Returns:
        dict with `reclaimedBytes` (disk usage before minus after, floored at 0) and a
        `commandOutput` dict holding the before/after sizes and the last 12 recorded steps.

    Raises:
        RuntimeError: when PROVIDER_ID is not G14, the deployment preflight is not ok, or any
            of the suspend / idle / scale-down / GC-pod stages fails.
    """
    if PROVIDER_ID.upper() != "G14":
        raise RuntimeError("HWLAB registry garbage-collect is only supported on G14")
    deployment = registry_deployment_preflight()
    if not deployment.get("ok"):
        raise RuntimeError("registry deployment preflight failed: %s" % deployment.get("reason"))
    cronjobs = ["hwlab-g14-branch-poller", "hwlab-v02-branch-poller"]
    # Captured before anything is modified so the finally block can restore the prior flags.
    # presumably maps cronjob name -> current suspend flag; confirm against cronjob_suspend_states.
    original_crons = cronjob_suspend_states(cronjobs)
    before = du_size(REGISTRY_ROOT, 60) or 0
    # Timestamp suffix gives each GC pod a distinct name.
    gc_name = "hwlab-registry-gc-%s" % int(time.time())
    steps = []
    try:
        # Stage 1: suspend the pollers, then require hwlab-ci to become idle.
        for name in original_crons:
            result = patch_cronjob_suspend(name, True)
            steps.append({"step": "suspend-cronjob", "name": name, "result": bounded(result)})
            if result["exitCode"] != 0:
                raise RuntimeError("failed to suspend cronjob %s" % name)
        idle_after_suspend = wait_no_active_hwlab_ci(180)
        steps.append({"step": "idle-after-suspend", "result": idle_after_suspend})
        if not idle_after_suspend.get("ok"):
            raise RuntimeError("refusing registry maintenance because hwlab-ci did not become idle after suspend")
        # Stage 2: take the registry offline before the GC pod touches its storage.
        scale_down = kctl(["-n", "hwlab-ci", "scale", "deploy", "hwlab-registry", "--replicas=0"], 60)
        steps.append({"step": "scale-registry-down", "result": bounded(scale_down)})
        if scale_down["exitCode"] != 0:
            raise RuntimeError("failed to scale registry down")
        waited_down = wait_registry_pod_count(0, 120)
        steps.append({"step": "wait-registry-down", "result": waited_down})
        if not waited_down.get("ok"):
            raise RuntimeError("registry pod did not scale down")
        # Stage 3: one-shot pod that mounts REGISTRY_ROOT from the host at /var/lib/registry
        # and runs the registry binary's garbage-collect subcommand.
        overrides = {
            "apiVersion": "v1",
            "spec": {
                "restartPolicy": "Never",
                "containers": [{
                    "name": "registry-gc",
                    "image": "registry:2.8.3",
                    "command": ["registry", "garbage-collect", "/etc/docker/registry/config.yml"],
                    "volumeMounts": [{"name": "storage", "mountPath": "/var/lib/registry"}],
                }],
                "volumes": [{"name": "storage", "hostPath": {"path": REGISTRY_ROOT, "type": "DirectoryOrCreate"}}],
            },
        }
        run_gc = kctl(["-n", "hwlab-ci", "run", gc_name, "--restart=Never", "--image=registry:2.8.3", "--overrides=%s" % json.dumps(overrides)], 60)
        steps.append({"step": "start-registry-gc-pod", "result": bounded(run_gc), "pod": gc_name})
        if run_gc["exitCode"] != 0:
            raise RuntimeError("failed to start registry GC pod")
        waited_gc = wait_pod_terminal(gc_name, 900)
        steps.append({"step": "wait-registry-gc", "result": waited_gc})
        # Logs are fetched before the success check so they are recorded even when the pod failed.
        logs = kctl(["-n", "hwlab-ci", "logs", gc_name], 120)
        steps.append({"step": "registry-gc-logs", "result": bounded(logs)})
        if not waited_gc.get("ok"):
            raise RuntimeError("registry GC pod did not complete successfully")
    finally:
        # Restore path runs on success and on every failure above, including failures that
        # happened before the registry was scaled down.
        # NOTE(review): the results of the scale-up, rollout and cronjob restore are only
        # recorded in `steps`; none of them is checked, so a failed restore does not raise.
        cleanup_gc = kctl(["-n", "hwlab-ci", "delete", "pod", gc_name, "--ignore-not-found=true"], 60)
        steps.append({"step": "delete-registry-gc-pod", "result": bounded(cleanup_gc)})
        # Falls back to 1 replica when the preflight reported no (or zero) replicas.
        scale_up = kctl(["-n", "hwlab-ci", "scale", "deploy", "hwlab-registry", "--replicas=%s" % int(deployment.get("replicas") or 1)], 60)
        steps.append({"step": "scale-registry-up", "result": bounded(scale_up)})
        rollout = kctl(["-n", "hwlab-ci", "rollout", "status", "deploy/hwlab-registry", "--timeout=180s"], 200)
        steps.append({"step": "wait-registry-rollout", "result": bounded(rollout)})
        for name, was_suspended in original_crons.items():
            restore = patch_cronjob_suspend(name, was_suspended)
            steps.append({"step": "restore-cronjob", "name": name, "suspend": was_suspended, "result": bounded(restore)})
    after = du_size(REGISTRY_ROOT, 60) or 0
    return {
        # Clamped so a registry that grew during the run never reports negative reclaim.
        "reclaimedBytes": max(0, before - after),
        "commandOutput": {
            "message": "official registry garbage-collect only; no additional tag deletion",
            "diskBeforeBytes": before,
            "diskAfterBytes": after,
            # Only the most recent 12 steps are returned.
            "steps": steps[-12:],
        },
    }
 def start_registry_retention_job(mode):
    """Start a registry maintenance run as a detached background job and return immediately.

    The initial "running" state is written to the job's state file, then the process forks.
    The parent returns a "started" descriptor (job id, child pid, state/log paths and the CLI
    status command). The child detaches into its own session, redirects stdin to the null
    device and stdout/stderr to the job log, runs the maintenance, rewrites the state file
    with the final "succeeded" or "failed" payload and terminates via ``os._exit``.

    Args:
        mode: "retention" runs execute_registry_retention(); any other value runs
            execute_registry_garbage_collect_only().

    Returns:
        dict describing the started job (parent process only). The child never returns: every
        child code path ends in ``os._exit`` so it cannot fall back into the caller's stack
        and execute the parent's remaining code a second time.
    """
    job_id = "g14-registry-%s-%s" % (int(time.time()), os.getpid())
    paths = job_paths(job_id)
    started_at = now_iso()
    initial = {
        "ok": True,
        "action": "gc remote status",
        "providerId": PROVIDER_ID,
        "jobId": job_id,
        "status": "running",
        "kind": "hwlab-registry-retention-gc" if mode == "retention" else "hwlab-registry-garbage-collect",
        "mode": mode,
        "startedAt": started_at,
        "statePath": paths["state"],
        "logPath": paths["log"],
        "options": OPTIONS,
    }
    # Persist the "running" state before forking so a status query never misses the job.
    write_json_atomic(paths["state"], initial)
    pid = os.fork()
    if pid != 0:
        # Parent: report the detached job and hand control back to the caller.
        return {
            "status": "started",
            "reclaimedBytes": None,
            "commandOutput": {
                "jobId": job_id,
                "pid": pid,
                "statePath": paths["state"],
                "logPath": paths["log"],
                "statusCommand": "bun scripts/cli.ts gc remote %s status --job-id %s" % (PROVIDER_ID, job_id),
                "message": "registry retention GC is running as a detached remote job",
            },
        }
    # Child from here on. The outer try/finally guarantees os._exit on every path, including
    # BaseException and errors raised while handling a failure.
    exit_code = 1
    log_handle = None
    try:
        try:
            os.setsid()
        except Exception:
            pass
        try:
            devnull = os.open(os.devnull, os.O_RDONLY)
            os.dup2(devnull, 0)
            os.close(devnull)
        except Exception:
            pass
        try:
            log_handle = open(paths["log"], "a", encoding="utf-8", buffering=1)
            os.dup2(log_handle.fileno(), 1)
            os.dup2(log_handle.fileno(), 2)
        except Exception:
            # Best-effort: without a log file the child keeps the inherited stdout/stderr.
            log_handle = None
        try:
            print("[%s] starting HWLAB registry %s job %s" % (now_iso(), mode, job_id), flush=True)
            result = execute_registry_retention() if mode == "retention" else execute_registry_garbage_collect_only()
            payload = dict(initial)
            payload.update({
                "status": "succeeded",
                "finishedAt": now_iso(),
                "result": result,
                "diskAfter": df_snapshot(),
                "clusterAfter": cluster_preflight(),
            })
            write_json_atomic(paths["state"], payload)
            print("[%s] completed HWLAB registry %s job %s" % (now_iso(), mode, job_id), flush=True)
            exit_code = 0
        except Exception as exc:
            payload = dict(initial)
            payload.update({
                "ok": False,
                "status": "failed",
                "finishedAt": now_iso(),
                "error": str(exc),
            })
            # The post-mortem probes are best-effort: a probe failure must not prevent the
            # "failed" state from being written, otherwise the job would look "running" forever.
            for key, probe in (("diskAfter", df_snapshot), ("clusterAfter", cluster_preflight)):
                try:
                    payload[key] = probe()
                except Exception as probe_exc:
                    payload[key] = None
                    try:
                        print("[%s] %s probe failed for job %s: %s" % (now_iso(), key, job_id, probe_exc), flush=True)
                    except Exception:
                        pass
            try:
                write_json_atomic(paths["state"], payload)
            except Exception:
                pass
            try:
                print("[%s] failed HWLAB registry %s job %s: %s" % (now_iso(), mode, job_id, exc), flush=True)
            except Exception:
                pass
    finally:
        try:
            if log_handle:
                log_handle.close()
        except Exception:
            pass
        # os._exit skips interpreter cleanup and never raises, so the forked child cannot
        # unwind into code that belongs to the parent process.
        os._exit(exit_code)
@@ -0,0 +1,57 @@
 def configured_observe_roots():
    """Return the configured observe-state roots as normalized absolute paths.

    The ``observeStateRoots`` list is read from MEMORY_CONFIG, with the ``webObserveRoots``
    list passed as its default. Entries that are not strings, or that do not start with "/",
    are dropped.
    """
    legacy_roots = config_list(MEMORY_CONFIG, "webObserveRoots", [])
    configured = config_list(MEMORY_CONFIG, "observeStateRoots", legacy_roots)
    absolute_roots = []
    for entry in configured:
        if not isinstance(entry, str):
            continue
        if not entry.startswith("/"):
            continue
        absolute_roots.append(os.path.abspath(entry))
    return absolute_roots
 def is_direct_observe_run_path(path):
    """Tell whether ``path`` is an immediate child of one of the configured observe roots.

    The path is normalized with ``os.path.abspath`` (symlinks are not resolved). It matches
    a root when its parent directory equals that root and it starts with the root followed
    by a single "/".
    """
    candidate = os.path.abspath(path)
    parent_dir = os.path.dirname(candidate)
    return any(
        parent_dir == root and candidate.startswith(root.rstrip("/") + "/")
        for root in configured_observe_roots()
    )
 def path_has_open_fd(path):
    """Report whether any process visible under /proc references ``path`` or something below it.

    For every numeric /proc entry the ``cwd`` and ``root`` links and each link in ``fd/`` are
    resolved and compared with the real path of ``path``. Links or fd directories that cannot
    be read are skipped. If /proc itself cannot be listed the function answers True.
    """
    target_dir = os.path.realpath(path)
    inside_prefix = target_dir.rstrip("/") + "/"

    def link_points_inside(link_path):
        # An unreadable link counts as "no reference" for that single link.
        try:
            destination = os.path.realpath(os.readlink(link_path))
        except OSError:
            return False
        return destination == target_dir or destination.startswith(inside_prefix)

    try:
        proc_entries = os.listdir("/proc")
    except OSError:
        # Without a process table nothing can be ruled out, so report the path as in use.
        return True
    for entry in proc_entries:
        if not entry.isdigit():
            continue
        proc_dir = os.path.join("/proc", entry)
        if any(link_points_inside(os.path.join(proc_dir, kind)) for kind in ("cwd", "root")):
            return True
        fd_dir = os.path.join(proc_dir, "fd")
        try:
            fd_names = os.listdir(fd_dir)
        except OSError:
            continue
        if any(link_points_inside(os.path.join(fd_dir, fd_name)) for fd_name in fd_names):
            return True
    return False
 def assert_web_observe_candidate(path):
    """Validate that ``path`` may be removed as a web-observe run and return its run record.

    Checks, in order: the path is a direct child of a configured observe root; it is a
    directory and not a symlink; its run record has no ``pidAlive`` flag; the record carries a
    ``staleSignal``; and no process holds an fd/cwd/root reference into it.

    Raises:
        RuntimeError: on the first check that fails; the message names the original ``path``.
    """
    def refuse(template):
        raise RuntimeError(template % path)

    candidate = os.path.abspath(path)
    if not is_direct_observe_run_path(candidate):
        refuse("refusing to remove web-observe path outside configured direct run roots: %s")
    is_plain_directory = os.path.isdir(candidate) and not os.path.islink(candidate)
    if not is_plain_directory:
        refuse("refusing to remove non-directory or symlink web-observe path: %s")
    max_stale_hours = config_float(MEMORY_CONFIG, "staleRunMaxAgeHours", 6.0, minimum=0.0)
    run_record = observe_run_record(candidate, max_stale_hours)
    if run_record.get("pidAlive"):
        refuse("refusing to remove active web-observe run with live pid: %s")
    if not run_record.get("staleSignal"):
        refuse("refusing to remove web-observe run without stale signal: %s")
    # The /proc scan runs last, only after the record-based checks have passed.
    if path_has_open_fd(candidate):
        refuse("refusing to remove web-observe run with open fd/cwd reference: %s")
    return run_record
@@ -18,6 +18,10 @@ interface RemoteGcOptions {
  tmp: boolean;
  tmpMinAgeHours: number;
  toolCaches: boolean;
  webObserveArtifacts: boolean;
  k3sImageCache: boolean;
  hostContainerdCache: boolean;
  localPathOrphans: boolean;
  aptCache: boolean;
  coreDumps: boolean;
  coreDumpMinAgeHours: number;
@@ -45,6 +49,10 @@ const DEFAULT_REMOTE_OPTIONS: RemoteGcOptions = {
  tmp: true,
  tmpMinAgeHours: 24,
  toolCaches: false,
  webObserveArtifacts: false,
  k3sImageCache: false,
  hostContainerdCache: false,
  localPathOrphans: false,
  aptCache: true,
  coreDumps: true,
  coreDumpMinAgeHours: 1,
@@ -63,6 +71,16 @@ const GC_CONFIG_RELATIVE_PATH = "config/unidesk-cli.yaml";
 // Reference string made of the CLI config path plus the "#gc.remote.targets" fragment.
 const GC_REMOTE_CONFIG_REF = `${GC_CONFIG_RELATIVE_PATH}#gc.remote.targets`;
 // Python runner template; remoteGcPython substitutes the placeholders below into it.
 const GC_REMOTE_RUNNER_RELATIVE_PATH = "scripts/src/gc-remote-runner.py";
 // Marker in the runner template that is replaced with the base64-encoded config.
 const GC_REMOTE_RUNNER_CONFIG_PLACEHOLDER = "__UNIDESK_GC_REMOTE_CONFIG_BASE64__";
 // Each helper file below is paired with a "# ..." marker line in the runner template;
 // the marker is replaced with that file's contents (trailing whitespace trimmed).
 const GC_REMOTE_WEB_OBSERVE_RELATIVE_PATH = "scripts/src/gc-remote-web-observe.py";
 const GC_REMOTE_WEB_OBSERVE_PLACEHOLDER = "# __UNIDESK_GC_REMOTE_WEB_OBSERVE_HELPERS__";
 const GC_REMOTE_CONTAINERD_RELATIVE_PATH = "scripts/src/gc-remote-containerd.py";
 const GC_REMOTE_CONTAINERD_PLACEHOLDER = "# __UNIDESK_GC_REMOTE_CONTAINERD_HELPERS__";
 const GC_REMOTE_PVC_RELATIVE_PATH = "scripts/src/gc-remote-pvc.py";
 const GC_REMOTE_PVC_PLACEHOLDER = "# __UNIDESK_GC_REMOTE_PVC_HELPERS__";
 const GC_REMOTE_GROWTH_RELATIVE_PATH = "scripts/src/gc-remote-growth.py";
 const GC_REMOTE_GROWTH_PLACEHOLDER = "# __UNIDESK_GC_REMOTE_GROWTH_HELPERS__";
 const GC_REMOTE_REGISTRY_RELATIVE_PATH = "scripts/src/gc-remote-registry.py";
 const GC_REMOTE_REGISTRY_PLACEHOLDER = "# __UNIDESK_GC_REMOTE_REGISTRY_HELPERS__";
 export async function runRemoteGcCommand(config: UniDeskConfig, providerId: string | undefined, action: string | undefined, args: string[]): Promise<unknown> {
  if (providerId === undefined || providerId.length === 0) {
@@ -186,6 +204,22 @@ function parseRemoteGcOptions(args: string[]): RemoteGcOptions {
      options.toolCaches = true;
    } else if (arg === "--no-tool-caches") {
      options.toolCaches = false;
    } else if (arg === "--include-web-observe-artifacts") {
      options.webObserveArtifacts = true;
    } else if (arg === "--no-web-observe-artifacts") {
      options.webObserveArtifacts = false;
    } else if (arg === "--include-k3s-image-cache") {
      options.k3sImageCache = true;
    } else if (arg === "--no-k3s-image-cache") {
      options.k3sImageCache = false;
    } else if (arg === "--include-host-containerd-cache") {
      options.hostContainerdCache = true;
    } else if (arg === "--no-host-containerd-cache") {
      options.hostContainerdCache = false;
    } else if (arg === "--include-local-path-orphans") {
      options.localPathOrphans = true;
    } else if (arg === "--no-local-path-orphans") {
      options.localPathOrphans = false;
    } else if (arg === "--no-apt-cache") {
      options.aptCache = false;
    } else if (arg === "--no-core-dumps") {
@@ -295,5 +329,31 @@ function remoteGcPython(configBase64: string): string {
  if (!template.includes(GC_REMOTE_RUNNER_CONFIG_PLACEHOLDER)) {
    throw new Error(`${GC_REMOTE_RUNNER_RELATIVE_PATH} missing ${GC_REMOTE_RUNNER_CONFIG_PLACEHOLDER}`);
  }
-  return template.replace(GC_REMOTE_RUNNER_CONFIG_PLACEHOLDER, configBase64);
+  if (!template.includes(GC_REMOTE_WEB_OBSERVE_PLACEHOLDER)) {
    throw new Error(`${GC_REMOTE_RUNNER_RELATIVE_PATH} missing ${GC_REMOTE_WEB_OBSERVE_PLACEHOLDER}`);
  }
  if (!template.includes(GC_REMOTE_CONTAINERD_PLACEHOLDER)) {
    throw new Error(`${GC_REMOTE_RUNNER_RELATIVE_PATH} missing ${GC_REMOTE_CONTAINERD_PLACEHOLDER}`);
  }
  if (!template.includes(GC_REMOTE_PVC_PLACEHOLDER)) {
    throw new Error(`${GC_REMOTE_RUNNER_RELATIVE_PATH} missing ${GC_REMOTE_PVC_PLACEHOLDER}`);
  }
  if (!template.includes(GC_REMOTE_GROWTH_PLACEHOLDER)) {
    throw new Error(`${GC_REMOTE_RUNNER_RELATIVE_PATH} missing ${GC_REMOTE_GROWTH_PLACEHOLDER}`);
  }
  if (!template.includes(GC_REMOTE_REGISTRY_PLACEHOLDER)) {
    throw new Error(`${GC_REMOTE_RUNNER_RELATIVE_PATH} missing ${GC_REMOTE_REGISTRY_PLACEHOLDER}`);
  }
  const webObserveHelpers = readFileSync(rootPath(GC_REMOTE_WEB_OBSERVE_RELATIVE_PATH), "utf8");
  const containerdHelpers = readFileSync(rootPath(GC_REMOTE_CONTAINERD_RELATIVE_PATH), "utf8");
  const pvcHelpers = readFileSync(rootPath(GC_REMOTE_PVC_RELATIVE_PATH), "utf8");
  const growthHelpers = readFileSync(rootPath(GC_REMOTE_GROWTH_RELATIVE_PATH), "utf8");
  const registryHelpers = readFileSync(rootPath(GC_REMOTE_REGISTRY_RELATIVE_PATH), "utf8");
  return template
    .replace(GC_REMOTE_WEB_OBSERVE_PLACEHOLDER, webObserveHelpers.trimEnd())
    .replace(GC_REMOTE_CONTAINERD_PLACEHOLDER, containerdHelpers.trimEnd())
    .replace(GC_REMOTE_PVC_PLACEHOLDER, pvcHelpers.trimEnd())
    .replace(GC_REMOTE_GROWTH_PLACEHOLDER, growthHelpers.trimEnd())
    .replace(GC_REMOTE_REGISTRY_PLACEHOLDER, registryHelpers.trimEnd())
    .replace(GC_REMOTE_RUNNER_CONFIG_PLACEHOLDER, configBase64);
 }