/**
 * Builds the degraded result reported when a remote GC command fails before
 * producing a usable plan.
 *
 * The combined stderr/stdout text is classified in priority order:
 * 1. an SSH TCP pool failure, as reported by `classifySshTcpPoolFailure`;
 * 2. a "provider is not online" message;
 * 3. a timeout (exit code 124 or an `ssh-runtime-timeout` marker in the output);
 * 4. otherwise a generic remote command failure.
 *
 * @param providerId - Provider the GC command targeted; echoed in the result and in the follow-up commands.
 * @param action - GC sub-action, reported as `gc remote <action>`.
 * @param result - Captured SSH result whose `stderr`, `stdout` and `exitCode` are inspected.
 * @returns A plain object with `ok: false`, `degraded: true`, the classified reason,
 *          transport details, and suggested follow-up commands. No mutation is ever allowed.
 */
export function remoteGcDegradedFailure(
  providerId: string,
  action: string,
  result: SshCaptureResult,
): Record<string, unknown> {
  const text = `${result.stderr}\n${result.stdout}`;
  const failureKind = classifySshTcpPoolFailure(text);
  const timeout = result.exitCode === 124 || text.includes("ssh-runtime-timeout");
  // The provider-qualified message ("provider is not online: <id>") contains this
  // phrase, so a single substring check covers both forms.
  const providerOffline = text.includes("provider is not online");
  // `== null` keeps the summary aligned with the `??` fallback used for the reason:
  // both treat null and undefined as "no pool failure classified".
  const hasPoolFailure = failureKind != null;

  let degradedReason: unknown;
  let summary: string;
  if (hasPoolFailure) {
    degradedReason = failureKind;
    summary = `remote GC could not acquire a provider data channel: ${failureKind}`;
  } else if (providerOffline) {
    degradedReason = "provider-offline";
    summary = `provider ${providerId} is offline from the controlled CLI transport view`;
  } else if (timeout) {
    degradedReason = "ssh-runtime-timeout";
    summary = "remote GC did not complete before the SSH runtime timeout";
  } else {
    degradedReason = "remote-command-failed";
    summary = "remote GC command failed before producing a valid plan";
  }

  return {
    ok: false,
    degraded: true,
    providerId,
    action: `gc remote ${action}`,
    degradedReason,
    transport: {
      sshTcpPoolFailureKind: failureKind,
      providerOffline,
      sshRuntimeTimeout: timeout,
      exitCode: result.exitCode,
    },
    safeCandidateCount: null,
    runAllowed: false,
    mutation: false,
    summary,
    next: {
      sshPool: `bun scripts/cli.ts debug ssh-pool ${providerId}`,
      fullHealth: "bun scripts/cli.ts debug health",
      smoke: `trans ${providerId} argv true`,
      retryPlan: `bun scripts/cli.ts gc remote ${providerId} plan --no-snapshot-save`,
    },
  };
}
def config_int(cfg, key, default=0, minimum=None, maximum=None):
    """Read ``key`` from ``cfg`` as an integer, falling back to ``default`` and clamping.

    ``cfg`` is only consulted when it is a dict. Any value that ``int()`` cannot
    convert (including a missing key) is replaced by ``int(default)``. The lower
    bound is applied first and the upper bound second, so the upper bound wins if
    the two conflict.
    """
    raw = None
    if isinstance(cfg, dict):
        raw = cfg.get(key)

    try:
        number = int(raw)
    except Exception:
        number = int(default)

    if minimum is not None:
        floor = int(minimum)
        if number < floor:
            number = floor
    if maximum is not None:
        ceiling = int(maximum)
        if number > ceiling:
            number = ceiling
    return number
def parse_size_value(value, default=None):
    """Parse a byte size given as a positive number or a string such as ``"1.5k"`` or ``"2 GiB"``.

    Positive ints/floats are truncated to an int. Strings must be a decimal number
    optionally followed by a unit (b, k/kb/kib, m/mb/mib, g/gb/gib; case-insensitive);
    every unit is interpreted as a power of 1024. Anything else returns ``default``.
    """
    if isinstance(value, (int, float)) and value > 0:
        try:
            return int(value)
        except OverflowError:
            # float("inf") passes the "> 0" check but cannot be converted to int.
            return default
    if not isinstance(value, str):
        return default
    match = re.match(r"^\s*([0-9]+(?:\.[0-9]+)?)\s*(b|k|kb|kib|m|mb|mib|g|gb|gib)?\s*$", value, re.I)
    if not match:
        return default
    unit = (match.group(2) or "b").lower()
    # Every accepted unit is identified by its first letter; "b" (or no unit) means bytes.
    multiplier = {"k": 1024, "m": 1024**2, "g": 1024**3}.get(unit[0], 1)
    return int(float(match.group(1)) * multiplier)
def write_json_atomic(path, payload):
    """Write ``payload`` as indented UTF-8 JSON to ``path`` via a temp file and ``os.replace``.

    The temp file lives next to ``path`` and is named ``<path>.tmp.<pid>``. If
    serialisation or the rename fails, the temp file is removed and the original
    exception is re-raised, so a failed write leaves no stray file and does not
    touch an existing ``path``.
    """
    tmp = "%s.tmp.%s" % (path, os.getpid())
    try:
        with open(tmp, "w", encoding="utf-8") as handle:
            json.dump(payload, handle, ensure_ascii=False, indent=2)
            handle.write("\n")
        os.replace(tmp, path)
    except BaseException:
        # Best-effort cleanup; the temp file may not exist if open() itself failed.
        try:
            os.unlink(tmp)
        except OSError:
            pass
        raise
def emit_json(payload, persist_large=True):
    """Print ``payload`` as indented JSON, or a compact summary when it is too large.

    Payloads whose UTF-8 encoding fits within REMOTE_STDOUT_JSON_LIMIT are printed
    unchanged. Larger payloads are reduced with compact_payload_for_stdout. When
    ``persist_large`` is true the full payload (annotated with job id, state path,
    status command and persistence time) is first written to the job state file,
    generating a job id if the payload has none.
    """
    rendered = json.dumps(payload, ensure_ascii=False, indent=2)
    byte_count = len(rendered.encode("utf-8"))
    if byte_count <= REMOTE_STDOUT_JSON_LIMIT:
        print(rendered)
        return

    job_id = str(payload.get("jobId") or "")
    paths = None
    if not persist_large:
        # Nothing is written; only resolve paths so the summary can point at an existing job.
        if job_id:
            paths = job_paths(job_id)
    else:
        if not job_id:
            slug = re.sub(r"[^A-Za-z0-9._-]+", "-", PROVIDER_ID.lower()).strip("-") or "provider"
            job_id = "%s-gc-output-%s-%s" % (slug, int(time.time()), os.getpid())
        paths = job_paths(job_id)
        persisted = dict(payload)
        persisted["jobId"] = job_id
        persisted["statePath"] = paths["state"]
        persisted["statusCommand"] = status_command(job_id)
        persisted["outputPersistedAt"] = now_iso()
        payload = persisted
        write_json_atomic(paths["state"], payload)

    # The reported size is that of the payload as originally rendered, before annotation.
    summary = compact_payload_for_stdout(payload, byte_count, job_id or None, paths)
    print(json.dumps(summary, ensure_ascii=False, indent=2))
def remote_gc_live_status(observed_at, preflight):
    """Build the read-only live status report returned when no job id was requested.

    ``observed_at`` and ``preflight`` are echoed as ``observedAt`` and
    ``clusterPreflight``. The report also carries the disk snapshot, compacted
    memory-pressure and PVC attribution data, the growth watermark policy, and
    suggested follow-up commands.
    """
    memory_pressure = collect_memory_pressure()
    ci_storage = ci_storage_snapshot()
    compact_pvc = compact_pvc_attribution(ci_storage)
    # Take one disk snapshot so "disk" and "policy" describe the same reading.
    # NOTE(review): assumes growth_watermark_policy only reads its argument - confirm.
    disk = df_snapshot()
    return {
        "ok": True,
        "action": "gc remote status",
        "providerId": PROVIDER_ID,
        "dryRun": True,
        "mutation": False,
        "observedAt": observed_at,
        "disk": disk,
        "clusterPreflight": preflight,
        "memoryPressure": compact_memory_pressure(memory_pressure),
        "pvcAttribution": compact_pvc,
        "policy": growth_watermark_policy(disk or {}),
        "next": {
            "snapshot": "bun scripts/cli.ts gc remote %s snapshot --history-limit %s" % (PROVIDER_ID, int(OPTIONS.get("historyLimit") or 12)),
            "plan": "bun scripts/cli.ts gc remote %s plan --target-use-percent --limit %s" % (PROVIDER_ID, int(OPTIONS.get("limit") or 50)),
            "policy": "bun scripts/cli.ts gc remote %s policy plan" % PROVIDER_ID,
            "jobStatus": "bun scripts/cli.ts gc remote %s status --job-id " % PROVIDER_ID,
        },
    }
def safe_int(value, default=0):
    """Convert ``value`` with ``int()``, returning ``default`` for None or unconvertible input."""
    if value is None:
        return default
    try:
        converted = int(value)
    except Exception:
        return default
    return converted
def iso_or_epoch_to_epoch(value):
    """Normalise a timestamp to epoch seconds as a float, or None when it cannot be read.

    Numbers are returned as floats unchanged. Strings are stripped and tried against
    three layouts in order (``...Z``, ``...<fraction>Z``, and ``YYYY-MM-DD HH:MM:SS``),
    each interpreted as UTC via ``calendar.timegm``.
    """
    if value is None:
        return None
    if isinstance(value, (int, float)):
        return float(value)

    candidate = str(value).strip()
    if not candidate:
        return None

    def parse_with(layout):
        # Returns None when the layout does not fit.
        try:
            return float(calendar.timegm(time.strptime(candidate, layout)))
        except Exception:
            return None

    layouts = ("%Y-%m-%dT%H:%M:%SZ", "%Y-%m-%dT%H:%M:%S.%fZ", "%Y-%m-%d %H:%M:%S")
    # The generator is lazy, so later layouts are only tried after earlier ones fail.
    return next((parsed for parsed in map(parse_with, layouts) if parsed is not None), None)
def observe_run_record(path, stale_hours):
    """Summarise one observe-run directory as a review-only record.

    Reads ``heartbeat.json`` and ``manifest.json`` under ``path`` (missing or invalid
    files count as empty), resolves a PID from PID files first and JSON metadata
    second, and derives the run's age from the first parseable timestamp field,
    falling back to the directory mtime. ``staleSignal`` is set when the PID is not
    alive, the age is at least ``stale_hours``, and the status is terminal or None.

    Raises OSError when ``path`` cannot be stat'ed.
    """
    stat = os.stat(path)
    heartbeat = read_json_file(os.path.join(path, "heartbeat.json")) or {}
    manifest = read_json_file(os.path.join(path, "manifest.json")) or {}
    metadata_sources = [heartbeat, manifest]

    # PID files take precedence over PIDs recorded in the JSON metadata.
    pid = None
    for candidate in ["pid", "observer.pid", "browser.pid", "runner.pid"]:
        pid = read_pid_file(os.path.join(path, candidate))
        if pid is not None:
            break
    if pid is None:
        for source in metadata_sources:
            for key in ["pid", "processId", "runnerPid", "browserPid"]:
                if source.get(key) is not None:
                    try:
                        pid = int(source.get(key))
                        break
                    except Exception:
                        # Unparseable value: keep looking at the remaining keys.
                        pass
            if pid is not None:
                break

    timestamp = None
    for source in metadata_sources:
        for key in ["updatedAt", "completedAt", "finishedAt", "stoppedAt", "startedAt", "createdAt"]:
            timestamp = iso_or_epoch_to_epoch(source.get(key))
            if timestamp is not None:
                break
        if timestamp is not None:
            break
    # Record where the timestamp came from: metadata files may exist yet carry no
    # parseable timestamp, in which case the directory mtime is what was used.
    timestamp_from_metadata = timestamp is not None
    if timestamp is None:
        timestamp = stat.st_mtime

    age_hours = max(0.0, (time.time() - timestamp) / 3600.0)
    status = heartbeat.get("status") or manifest.get("status") or manifest.get("state")
    alive = pid_alive(pid)
    terminal = str(status or "").lower() in set(["done", "completed", "complete", "failed", "blocked", "timeout", "timed-out", "stopped", "exited"])
    stale_signal = (not alive) and age_hours >= float(stale_hours) and (terminal or status is None)
    return {
        "id": os.path.basename(path),
        "path": path,
        "pid": pid,
        "pidAlive": alive,
        "status": status,
        "ageHours": round(age_hours, 2),
        "timestampBasis": "manifest-or-heartbeat" if timestamp_from_metadata else "directory-mtime-fallback",
        "staleSignal": stale_signal,
        "classification": "review-only",
    }
def collect_memory_pressure():
    """Collect process, host-memory and observe-run data for the configured process patterns.

    Returns a skipped marker when MEMORY_CONFIG lists no ``processPatterns``.
    Otherwise ``ok`` mirrors whether the process scan succeeded, and ``summary``
    holds the headline counters taken from the process and observe results.
    """
    patterns = config_list(MEMORY_CONFIG, "processPatterns", [])
    if not patterns:
        skipped = {"ok": True, "skipped": True}
        skipped["reason"] = "no-yaml-process-patterns"
        skipped["configSource"] = "config/unidesk-cli.yaml#gc.remote.targets.%s.memoryPressure.processPatterns" % PROVIDER_ID
        return skipped

    processes = collect_process_pressure(patterns)
    observe = collect_web_observe_summary()
    host_memory = collect_memory_snapshot()

    # "chrome" is looked up by its literal pattern key; absent patterns yield None.
    chrome_bucket = (processes.get("byPattern") or {}).get("chrome", {})
    summary = {
        "matchedProcessCount": processes.get("processCount"),
        "matchedRssBytes": processes.get("rssBytes"),
        "matchedRssHuman": processes.get("rssHuman"),
        "chromeProcessCount": chrome_bucket.get("processCount"),
        "observerRunCount": observe.get("runCount"),
        "activeObserverSignals": observe.get("activeSignalCount"),
        "staleObserverSignals": observe.get("staleSignalCount"),
        "observeStateBytes": observe.get("totalBytes"),
        "observeStateHuman": observe.get("totalHuman"),
    }
    drill_down = {
        "processes": "bun scripts/cli.ts gc remote %s snapshot --full --no-save" % PROVIDER_ID,
        "status": "bun scripts/cli.ts gc remote %s status --job-id " % PROVIDER_ID,
    }
    return {
        "ok": processes.get("ok") is True,
        "configSource": "config/unidesk-cli.yaml#gc.remote.targets.%s.memoryPressure" % PROVIDER_ID,
        "hostMemory": host_memory,
        "processes": processes,
        "webObserve": observe,
        "summary": summary,
        "drillDown": drill_down,
    }
def pv_host_path(pv):
    """Return the node path backing a PersistentVolume object, or None.

    Looks at ``spec.hostPath.path`` first and ``spec.local.path`` second; only a
    non-empty string is accepted. ``pv`` may be None or lack a ``spec``.
    """
    spec = (pv or {}).get("spec") or {}
    for source_key in ("hostPath", "local"):
        candidate = (spec.get(source_key) or {}).get("path")
        if isinstance(candidate, str) and candidate:
            return candidate
    return None
def parse_k8s_quantity(value):
    """Convert a Kubernetes quantity such as ``"10Gi"`` or ``"500k"`` to an integer.

    Accepts a plain decimal number optionally followed by a decimal suffix
    (k, M, G, T, P, E: powers of 1000) or a binary suffix (Ki, Mi, Gi, Ti, Pi, Ei:
    powers of 1024). Uppercase ``K`` is also accepted as 1000 for backward
    compatibility. Returns None for None or for anything that does not match,
    including milli (``m``) and exponent forms.
    """
    if value is None:
        return None
    raw = str(value).strip()
    match = re.match(r"^([0-9]+(?:\.[0-9]+)?)(Ki|Mi|Gi|Ti|Pi|Ei|k|K|M|G|T|P|E)?$", raw)
    if not match:
        return None
    multipliers = {
        None: 1,
        "k": 1000,
        "K": 1000,  # not a standard Kubernetes suffix; kept because it was accepted before
        "M": 1000**2,
        "G": 1000**3,
        "T": 1000**4,
        "P": 1000**5,
        "E": 1000**6,
        "Ki": 1024,
        "Mi": 1024**2,
        "Gi": 1024**3,
        "Ti": 1024**4,
        "Pi": 1024**5,
        "Ei": 1024**6,
    }
    return int(float(match.group(1)) * multipliers[match.group(2)])
str(meta.get("namespace") or "") + pod_name = str(meta.get("name") or "") + phase = str(((pod.get("status") or {}).get("phase")) or "") + if phase in set(["Succeeded", "Failed"]): + continue + spec = pod.get("spec") or {} + for vol in spec.get("volumes") or []: + claim = (vol.get("persistentVolumeClaim") or {}).get("claimName") + if claim: + mounts.setdefault((ns, claim), []).append(pod_name) + rows = [] + for pvc in pvc_data.get("items") or []: + meta = pvc.get("metadata") or {} + spec = pvc.get("spec") or {} + status = pvc.get("status") or {} + ns = str(meta.get("namespace") or "") + name = str(meta.get("name") or "") + if ns not in namespaces: + continue + volume = str(spec.get("volumeName") or "") + pv = pvs.get(volume) or {} + pv_spec = pv.get("spec") or {} + pv_meta = pv.get("metadata") or {} + owner_kind, owner_name, owner_refs = metadata_owner(meta) + requested = parse_k8s_quantity((((spec.get("resources") or {}).get("requests") or {}).get("storage"))) + host_path = pv_host_path(pv) + active = sorted(mounts.get((ns, name), [])) + estimated = du_size(host_path, 8) if host_path else None + candidate_reasons = [] + if not active: + candidate_reasons.append("no-active-mount-observed") + if status.get("phase") != "Bound": + candidate_reasons.append("pvc-not-bound") + if (pv.get("status") or {}).get("phase") == "Released": + candidate_reasons.append("pv-released") + review_candidate = ns in candidate_namespaces and len(candidate_reasons) > 0 + rows.append({ + "namespace": ns, + "pvc": name, + "volume": volume or None, + "phase": status.get("phase"), + "pvPhase": (pv.get("status") or {}).get("phase"), + "ownerKind": owner_kind, + "owner": owner_name, + "ownerRefs": owner_refs, + "ownerGroup": pvc_owner_group(ns, owner_name), + "storageClass": spec.get("storageClassName") or pv_spec.get("storageClassName"), + "reclaimPolicy": pv_spec.get("persistentVolumeReclaimPolicy"), + "requestedBytes": requested, + "requestedHuman": fmt_bytes(requested or 0), + "hostPath": 
host_path, + "pvCreatedAt": (pv_meta.get("creationTimestamp") if isinstance(pv_meta, dict) else None), + "pvcCreatedAt": meta.get("creationTimestamp"), + "activeMountPods": active, + "estimatedBytes": estimated, + "estimatedHuman": fmt_bytes(estimated or 0), + "reviewCandidate": review_candidate, + "reviewReasons": candidate_reasons, + "dryRunOnly": True, + }) + rows.sort(key=lambda item: safe_int(item.get("estimatedBytes")), reverse=True) + by_namespace = {} + by_owner_group = {} + for row in rows: + for bucket, key in [(by_namespace, row.get("namespace") or "unknown"), (by_owner_group, row.get("ownerGroup") or "unknown")]: + current = bucket.setdefault(key, {"count": 0, "estimatedBytes": 0, "activeMountCount": 0}) + current["count"] += 1 + current["estimatedBytes"] += safe_int(row.get("estimatedBytes")) + current["activeMountCount"] += len(row.get("activeMountPods") or []) + current["estimatedHuman"] = fmt_bytes(current["estimatedBytes"]) + review_candidates = [row for row in rows if row.get("reviewCandidate")] + return { + "scope": "YAML-configured PVC namespaces", + "configSource": "config/unidesk-cli.yaml#gc.remote.targets.%s.pvcAttribution" % PROVIDER_ID, + "namespaces": sorted(namespaces), + "candidateNamespaces": sorted(candidate_namespaces), + "pvcCount": len(rows), + "reviewCandidateCount": len(review_candidates), + "estimatedBytes": sum(safe_int(row.get("estimatedBytes")) for row in rows), + "estimatedHuman": fmt_bytes(sum(safe_int(row.get("estimatedBytes")) for row in rows)), + "requestedBytes": sum(safe_int(row.get("requestedBytes")) for row in rows), + "requestedHuman": fmt_bytes(sum(safe_int(row.get("requestedBytes")) for row in rows)), + "byNamespace": by_namespace, + "byOwnerGroup": by_owner_group, + "topPvcs": rows[:limit], + "reviewCandidates": review_candidates[:limit], + "handoff": { + "hwlab": { + "dryRun": "bun scripts/cli.ts hwlab nodes control-plane cleanup-runs --node %s --lane %s --min-age-minutes 30 --limit 200 --dry-run" % (hwlab_node, 
def compact_pvc_row(row):
    """Project one PVC attribution row onto the fields shown in compact output.

    Only the keys listed here are carried over; the list of active mount pods
    is truncated to its first five entries while ``activeMountCount`` still
    reports the untruncated length. ``dryRunOnly`` is always ``True``.
    """
    copied_fields = (
        "namespace",
        "pvc",
        "volume",
        "phase",
        "pvPhase",
        "ownerKind",
        "owner",
        "ownerGroup",
        "storageClass",
        "reclaimPolicy",
        "requestedBytes",
        "requestedHuman",
        "estimatedBytes",
        "estimatedHuman",
    )
    slim = {field: row.get(field) for field in copied_fields}
    mounts = row.get("activeMountPods") or []
    slim["activeMountCount"] = len(mounts)
    slim["activeMountPods"] = mounts[:5]
    for field in ("reviewCandidate", "reviewReasons"):
        slim[field] = row.get(field)
    slim["dryRunOnly"] = True
    return slim
def compact_ci_storage_summary(payload):
    """Reduce a CI storage / PVC attribution payload to its headline counters.

    Row-level lists and per-group breakdowns are dropped; the result is marked
    ``compacted`` and carries a hint on how to obtain the row-level details.
    """
    headline_fields = (
        "scope",
        "configSource",
        "pvcCount",
        "reviewCandidateCount",
        "estimatedBytes",
        "estimatedHuman",
        "requestedBytes",
        "requestedHuman",
    )
    summary = {}
    for field in headline_fields:
        summary[field] = payload.get(field)
    summary["compacted"] = True
    summary["fullDisclosure"] = "use pvcAttribution or --full for row-level details"
    return summary
def compact_memory_summary(payload):
    """Reduce a memory-pressure payload to its summary plus web-observe counters.

    Only the aggregate counters of ``webObserve`` are kept (signal lists are
    dropped). ``drillDown`` is the CLI command for the status view, built from
    the module-level ``PROVIDER_ID`` and the ``limit`` option (50 when unset).
    """
    observe = payload.get("webObserve") or {}
    counter_names = (
        "rootCount",
        "totalBytes",
        "totalHuman",
        "runCount",
        "activeSignalCount",
        "staleSignalCount",
    )
    observe_counters = {name: observe.get(name) for name in counter_names}
    status_limit = int(OPTIONS.get("limit") or 50)
    drill_down = "bun scripts/cli.ts gc remote %s status --limit %s" % (PROVIDER_ID, status_limit)
    compacted = {}
    for field in ("ok", "configSource", "summary"):
        compacted[field] = payload.get(field)
    compacted["webObserve"] = observe_counters
    compacted["compacted"] = True
    compacted["drillDown"] = drill_down
    return compacted
def growth_watermark_policy(root_disk):
    """Classify root-disk usage into a watermark state and recommended action.

    ``root_disk`` is expected to be a dict carrying ``usePercent``; anything
    else (or a missing value) yields the ``unknown`` state. The static
    watermark table and growth-threshold note are always included.
    """
    percent = None
    if isinstance(root_disk, dict):
        percent = root_disk.get("usePercent")

    # Upper-exclusive ceilings, checked in ascending order; anything at or
    # above the last ceiling falls through to the emergency band.
    bands = (
        (75, "healthy", "observe-trend"),
        (80, "watch", "run-dry-run-plan"),
        (85, "maintenance", "schedule-owner-aware-retention"),
    )
    if percent is None:
        state, action = "unknown", "collect-snapshot"
    else:
        state, action = "emergency", "restore-runtime-then-file-evidence"
        for ceiling, band_state, band_action in bands:
            if percent < ceiling:
                state, action = band_state, band_action
                break

    watermarks = [
        {"range": "<75%", "action": "trend only"},
        {"range": "75%-80%", "action": "run dry-run plan and identify source"},
        {"range": "80%-85%", "action": "small owner-aware retention run"},
        {"range": ">=85%", "action": "runtime recovery first, then root-cause growth source"},
    ]
    return {
        "state": state,
        "recommendedAction": action,
        "watermarks": watermarks,
        "growthThresholdPolicy": "If bytes/day remains high for consecutive snapshots, act before 80%; exact threshold should be set from the first week of saved snapshots.",
    }
def delta_metric_rows(before, after):
    """Compute per-metric deltas between two growth snapshots.

    Metrics come from ``snapshot_metric_map``; a metric missing on one side is
    treated as value 0 on that side. Byte metrics additionally get
    human-readable fields and, when the two ``observedAt`` timestamps yield a
    positive duration, a per-day rate. Rows are ordered by delta, largest
    growth first.
    """
    earlier = snapshot_metric_map(before)
    later = snapshot_metric_map(after)
    started = iso_to_epoch(before.get("observedAt"))
    ended = iso_to_epoch(after.get("observedAt"))
    elapsed = None
    if started is not None and ended is not None:
        elapsed = ended - started

    def signed(amount):
        # fmt_bytes is only ever handed the magnitude; the sign is prefixed here.
        prefix = "-" if amount < 0 else ""
        return prefix + fmt_bytes(abs(amount))

    entries = []
    for name in sorted(set(earlier.keys()) | set(later.keys())):
        if name in earlier:
            prior = earlier[name]
        else:
            prior = {"value": 0, "unit": (later.get(name) or {}).get("unit"), "label": name}
        if name in later:
            current = later[name]
        else:
            current = {"value": 0, "unit": prior.get("unit"), "label": prior.get("label")}
        change = safe_int(current.get("value")) - safe_int(prior.get("value"))
        entry = {
            "key": name,
            "label": current.get("label") or prior.get("label") or name,
            "unit": current.get("unit") or prior.get("unit"),
            "before": prior.get("value"),
            "after": current.get("value"),
            "delta": change,
        }
        if entry["unit"] == "bytes":
            entry["beforeHuman"] = fmt_bytes(entry["before"] or 0)
            entry["afterHuman"] = fmt_bytes(entry["after"] or 0)
            entry["deltaHuman"] = signed(change)
            if elapsed and elapsed > 0:
                daily = int(change * 86400 / elapsed)
                entry["perDayBytes"] = daily
                entry["perDayHuman"] = signed(daily) + "/day"
        entries.append(entry)
    entries.sort(key=lambda entry: safe_int(entry.get("delta")), reverse=True)
    return {"durationSeconds": elapsed, "metrics": entries}
def compact_metric_rows(rows, limit=3):
    """Return the first ``limit`` metric rows, keeping only display fields.

    ``rows`` may be ``None`` or empty, in which case an empty list is returned.
    """
    display_fields = ("key", "label", "unit", "delta", "deltaHuman", "perDayHuman")
    leading = (rows or [])[:limit]
    return [{field: entry.get(field) for field in display_fields} for entry in leading]
def compact_growth_point(item):
    """Reduce one saved growth snapshot to the fields used in compact history.

    Missing sections are treated as empty dicts, so every key in the result is
    always present (with ``None`` values where the snapshot had no data).
    """

    def subset(source, names):
        return {name: source.get(name) for name in names}

    registry = item.get("registry") or {}
    memory = item.get("memoryPressure") or {}

    # Registry size comes from the registry section itself, the counters from
    # its nested retention plan.
    registry_view = subset(registry, ("sizeBytes", "sizeHuman"))
    registry_view.update(subset(
        registry.get("retentionPlan") or {},
        (
            "totalTags",
            "totalRevisions",
            "deleteTags",
            "deleteRevisions",
            "estimatedReclaimBytes",
            "estimatedReclaimHuman",
        ),
    ))

    memory_view = subset(
        memory.get("summary") or {},
        (
            "matchedProcessCount",
            "matchedRssBytes",
            "matchedRssHuman",
            "activeObserverSignals",
            "staleObserverSignals",
            "observeStateBytes",
            "observeStateHuman",
        ),
    )
    memory_view["webObserveRootCount"] = (memory.get("webObserve") or {}).get("rootCount")

    return {
        "observedAt": item.get("observedAt"),
        "rootDisk": item.get("rootDisk"),
        "sourceCount": len(item.get("sources") or []),
        "registry": registry_view,
        "ciStorage": subset(
            item.get("ciStorage") or {},
            ("pvcCount", "estimatedBytes", "estimatedHuman", "byOwnerGroup"),
        ),
        "containerd": subset(item.get("containerd") or {}, ("state", "cleanupSupported")),
        "memoryPressure": memory_view,
    }
def fmt_bytes(value):
    """Format a byte count with binary units (B, KiB, MiB, GiB, TiB).

    ``None`` and other falsy values are treated as 0 and negative values are
    clamped to 0. Values below 10 in a non-byte unit get one decimal place;
    everything else is printed without decimals. TiB is the largest unit, so
    larger values are expressed as a (possibly large) TiB count.
    """
    units = ["B", "KiB", "MiB", "GiB", "TiB"]
    size = float(max(0, int(value or 0)))
    idx = 0
    while size >= 1024 and idx < len(units) - 1:
        size /= 1024.0
        idx += 1
    # A value just under the next unit (e.g. 1023.9 KiB) would be rounded by
    # "%0.0f" to "1024 KiB"; promote it so the output reads "1.0 MiB" instead.
    if idx < len(units) - 1 and round(size) >= 1024:
        size /= 1024.0
        idx += 1
    return ("%0.0f %s" if size >= 10 or idx == 0 else "%0.1f %s") % (size, units[idx])
def parse_docker_human_size(raw):
    """Parse a Docker-style decimal size such as ``"1.5GB"`` into bytes.

    Anything from the first ``(`` onward (e.g. a trailing ``"(50%)"``) is
    ignored. Units are decimal (kB = 1000 bytes) and matched
    case-insensitively. Returns ``None`` when the text is not a single
    well-formed number followed by B/KB/MB/GB/TB.
    """
    raw = str(raw).split("(")[0].strip()
    # The number must be a single well-formed decimal; a looser character class
    # would let inputs like "1.2.3GB" or ".B" through and make float() raise
    # instead of returning None like every other unparseable input.
    m = re.fullmatch(r"(\d+\.?\d*|\.\d+)\s*([KMGT]?B)", raw, re.I)
    if not m:
        return None
    mult = {"B": 1, "KB": 1000, "MB": 1000**2, "GB": 1000**3, "TB": 1000**4}.get(m.group(2).upper(), 1)
    return int(float(m.group(1)) * mult)
def active_hwlab_ci_writes():
    """List PipelineRuns/TaskRuns in ``hwlab-ci`` that have not finished yet.

    A run counts as active when the second whitespace-separated column of the
    ``kubectl get --no-headers`` output is neither ``True`` nor ``False``. At
    most 40 active lines are reported.

    kubectl is invoked directly (argv form, no shell pipeline) so that
    ``ok`` reflects kubectl's own exit status. With a ``kubectl | awk | head``
    pipeline the shell reports the status of the last stage, which made a
    failing kubectl look like "ok, nothing active".

    Returns a dict with ``ok``, ``activeCount``, ``activePreview`` and the
    bounded raw command result under ``command``.
    """
    result = command(
        [
            "env",
            "KUBECONFIG=/etc/rancher/k3s/k3s.yaml",
            "kubectl",
            "get",
            "pipelinerun,taskrun",
            "-n",
            "hwlab-ci",
            "--no-headers",
        ],
        15,
    )
    succeeded = result["exitCode"] == 0
    active = []
    if succeeded:
        for line in (result.get("stdout") or "").splitlines():
            columns = line.split()
            if not columns:
                continue
            status = columns[1] if len(columns) > 1 else ""
            if status not in ("True", "False"):
                active.append(line)
    # Keep the preview bounded, matching the previous 40-line cap.
    active = active[:40]
    return {"ok": succeeded, "activeCount": len(active), "activePreview": active, "command": bounded(result)}
def workload_image_refs():
    """Collect local-registry image references used by cluster workloads.

    Runs one ``kubectl get deploy,sts,ds,pod -A`` jsonpath query and keeps only
    images served from ``127.0.0.1:5000/``. Returns a 3-tuple:

    * a set of ``(repo, tag)`` pairs; digest-pinned images are recorded as
      ``(repo, "@sha256:...")``,
    * a set of ``sha256:...`` digests for digest-pinned images,
    * the bounded command result.

    Images without a tag or digest are ignored.
    """
    registry_prefix = "127.0.0.1:5000/"
    result = command(["sh", "-lc", "KUBECONFIG=/etc/rancher/k3s/k3s.yaml kubectl get deploy,sts,ds,pod -A -o jsonpath='{range .items[*]}{range .spec.containers[*]}{.image}{\"\\n\"}{end}{range .spec.initContainers[*]}{.image}{\"\\n\"}{end}{range .spec.template.spec.containers[*]}{.image}{\"\\n\"}{end}{range .spec.template.spec.initContainers[*]}{.image}{\"\\n\"}{end}{end}' 2>/dev/null | sort -u"], 30)
    tag_refs = set()
    pinned_digests = set()
    for raw_line in (result.get("stdout") or "").splitlines():
        image = raw_line.strip()
        if not image.startswith(registry_prefix):
            continue
        reference = image[len(registry_prefix):]
        if "@sha256:" in reference:
            repo, _, pinned = reference.partition("@")
            tag_refs.add((repo, "@" + pinned))
            pinned_digests.add("sha256:" + pinned.partition(":")[2])
            continue
        if ":" in reference:
            repo, tag = reference.rsplit(":", 1)
            tag_refs.add((repo, tag))
    return tag_refs, pinned_digests, bounded(result)
def registry_digest_hex(digest):
    """Return the 64-character hex part of a ``sha256:<hex>`` digest.

    Returns ``None`` for non-strings, other algorithms, or a hex part that is
    not exactly 64 lowercase hexadecimal characters.
    """
    if not isinstance(digest, str) or not digest.startswith("sha256:"):
        return None
    value = digest.split(":", 1)[1]
    # fullmatch rather than match(...$): "$" also matches just before a
    # trailing newline, which would let "<hex>\n" through as a valid digest.
    if re.fullmatch(r"[0-9a-f]{64}", value) is None:
        return None
    return value
def registry_manifest_refs(digest):
    """Return the blob digests directly referenced by a stored manifest.

    Looks at the manifest's ``config`` entry and at every entry of its
    ``layers`` and ``manifests`` lists, keeping each ``digest`` that
    ``registry_digest_hex`` accepts. Returns an empty set when the blob is not
    a JSON object.

    Blobs reached through the digest closure are arbitrary JSON, so fields with
    an unexpected shape (non-dict ``config``, non-list ``layers``/``manifests``,
    non-dict list entries) are skipped rather than raising.
    """
    manifest = registry_manifest_json(digest)
    if not isinstance(manifest, dict):
        return set()
    candidates = [manifest.get("config")]
    for field in ("layers", "manifests"):
        entries = manifest.get(field)
        if isinstance(entries, list):
            candidates.extend(entries)
    refs = set()
    for entry in candidates:
        if not isinstance(entry, dict):
            continue
        child = entry.get("digest")
        if isinstance(child, str) and registry_digest_hex(child) is not None:
            refs.add(child)
    return refs
deleted = registry_digest_closure(delete_manifest_digests) + kept = registry_digest_closure(kept_manifest_digests) + reclaim = deleted - kept + return sum(registry_blob_size(digest) for digest in reclaim) + +def plan_registry_retention(): + keep_per_repo = int(OPTIONS.get("registryKeepPerRepo") if OPTIONS.get("registryKeepPerRepo") is not None else 5) + min_age_hours = float(OPTIONS.get("registryMinAgeHours") if OPTIONS.get("registryMinAgeHours") is not None else 48) + cutoff = time.time() - min_age_hours * 3600 + refs, digests, refs_command = workload_image_refs() + rows = registry_tag_rows() + revision_rows = registry_revision_rows() + by_repo = {} + for row in rows: + by_repo.setdefault(row["repo"], []).append(row) + keep = set() + keep_reasons = {} + for repo, items in by_repo.items(): + items.sort(key=lambda item: item["mtime"], reverse=True) + for row in items[:keep_per_repo]: + key = (row["repo"], row["tag"]) + keep.add(key) + keep_reasons[key] = "latest-per-repo" + for row in items: + key = (row["repo"], row["tag"]) + if row["tag"] in REGISTRY_PROTECTED_TAGS: + keep.add(key) + keep_reasons[key] = "protected-tag" + if key in refs: + keep.add(key) + keep_reasons[key] = "workload-tag-ref" + if row["digest"] in digests: + keep.add(key) + keep_reasons[key] = "workload-digest-ref" + if row["repo"].startswith("hwlab/cache/"): + keep.add(key) + keep_reasons[key] = "cache-repo" + if row["mtime"] >= cutoff: + keep.add(key) + keep_reasons[key] = "recent-tag" + delete_rows = [] + kept_count = 0 + delete_by_repo = {} + keep_by_repo = {} + kept_digests = set() + for row in rows: + key = (row["repo"], row["tag"]) + should_delete = ( + key not in keep + and row["repo"].startswith("hwlab/hwlab-") + and re.match(r"^[0-9a-f]{7,40}$", row["tag"]) is not None + ) + if should_delete: + delete_rows.append(row) + delete_by_repo[row["repo"]] = delete_by_repo.get(row["repo"], 0) + 1 + else: + kept_count += 1 + kept_digests.add(row["digest"]) + keep_by_repo[row["repo"]] = 
keep_by_repo.get(row["repo"], 0) + 1 + protected_digests = kept_digests | digests + protected_digests.update(row["digest"] for row in revision_rows if not registry_retention_repo(row["repo"])) + protected_digests = registry_digest_closure(protected_digests) + delete_revision_rows = [] + revision_delete_by_repo = {} + for row in revision_rows: + if not registry_retention_repo(row["repo"]): + continue + if row["digest"] in protected_digests: + continue + delete_revision_rows.append(row) + revision_delete_by_repo[row["repo"]] = revision_delete_by_repo.get(row["repo"], 0) + 1 + kept_revision_digests = set(row["digest"] for row in revision_rows if row not in delete_revision_rows) + delete_revision_digests = set(row["digest"] for row in delete_revision_rows) + deletable_manifests = {} + for row in delete_rows: + if row["digest"] in kept_digests: + continue + deletable_manifests.setdefault(row["repo"], set()).add(row["digest"]) + for row in delete_revision_rows: + deletable_manifests.setdefault(row["repo"], set()).add(row["digest"]) + deletable_manifest_count = sum(len(items) for items in deletable_manifests.values()) + registry_size = du_size(REGISTRY_ROOT, 30) or 0 + estimate = estimate_registry_reclaim(delete_revision_digests, kept_revision_digests) + return { + "tagRows": rows, + "revisionRows": revision_rows, + "deleteRows": delete_rows, + "deleteRevisionRows": delete_revision_rows, + "summary": { + "totalTags": len(rows), + "totalRevisions": len(revision_rows), + "repoCount": len(by_repo), + "keepPerRepo": keep_per_repo, + "minAgeHours": min_age_hours, + "protectedWorkloadRefs": len(refs), + "protectedDigestRefs": len(digests), + "protectedDigestClosure": len(protected_digests), + "keptTags": kept_count, + "deleteTags": len(delete_rows), + "deleteManifests": deletable_manifest_count, + "deleteRevisions": len(delete_revision_rows), + "deleteByRepo": delete_by_repo, + "revisionDeleteByRepo": revision_delete_by_repo, + "keepByRepo": keep_by_repo, + "registrySizeBytes": 
def registry_deployment_preflight():
    """Check that the hwlab-registry Deployment has the shape the GC flow expects.

    Reads the Deployment from the ``hwlab-ci`` namespace and verifies that:
    a volume named ``storage`` is a hostPath at ``REGISTRY_ROOT``, the registry
    container mounts ``storage`` at ``/var/lib/registry``, the image starts with
    ``registry:`` and the pod template uses ``hostNetwork: true``.

    Returns:
        dict: ``{"ok": False, "reason": "registry-deployment-missing"}`` when the
        Deployment could not be read, otherwise a dict with ``ok``, ``reason``
        and the observed image/network/volume/replica fields.
    """
    dep = kubectl_json(["-n", "hwlab-ci", "get", "deploy", "hwlab-registry"], 20)
    if not dep:
        return {"ok": False, "reason": "registry-deployment-missing"}
    dep_spec = dep.get("spec") or {}
    spec = (dep_spec.get("template") or {}).get("spec") or {}
    containers = spec.get("containers") or []
    volumes = spec.get("volumes") or []
    # Prefer the container literally named "registry"; otherwise fall back to
    # the first container (or an empty dict when there are none).
    fallback = containers[0] if containers else {}
    registry_container = next((item for item in containers if item.get("name") == "registry"), fallback)
    mounts = registry_container.get("volumeMounts") or []
    has_host_path = any(
        (vol.get("hostPath") or {}).get("path") == REGISTRY_ROOT and vol.get("name") == "storage"
        for vol in volumes
    )
    has_mount = any(
        mount.get("name") == "storage" and mount.get("mountPath") == "/var/lib/registry"
        for mount in mounts
    )
    image = str(registry_container.get("image") or "")
    ok = bool(has_host_path and has_mount and image.startswith("registry:") and spec.get("hostNetwork") is True)
    return {
        "ok": ok,
        "reason": "ok" if ok else "unexpected-registry-deployment-shape",
        "image": image,
        "hostNetwork": spec.get("hostNetwork"),
        "hasExpectedHostPath": has_host_path,
        "hasExpectedMount": has_mount,
        "replicas": dep_spec.get("replicas"),
        "readyReplicas": (dep.get("status") or {}).get("readyReplicas"),
    }


def cronjob_suspend_states(names):
    """Return ``{name: suspended}`` for each named hwlab-ci CronJob that can be read.

    CronJobs for which ``kubectl_json`` returns nothing are omitted from the
    result. ``suspended`` is True only when ``spec.suspend`` is literally True.
    """
    states = {}
    for name in names:
        data = kubectl_json(["-n", "hwlab-ci", "get", "cronjob", name], 15)
        if data:
            states[name] = (data.get("spec") or {}).get("suspend") is True
    return states


def patch_cronjob_suspend(name, suspend):
    """Merge-patch ``spec.suspend`` of a hwlab-ci CronJob and return the ``kctl`` result."""
    payload = json.dumps({"spec": {"suspend": bool(suspend)}})
    return kctl(["-n", "hwlab-ci", "patch", "cronjob", name, "--type=merge", "-p", payload], 30)


def wait_registry_pod_count(target, timeout=90):
    """Poll until the number of non-finished registry pods equals ``target``.

    Pods whose STATUS column is Completed, Error, Failed or Succeeded are not
    counted as active.

    A poll whose ``kubectl get pods`` call did not exit with code 0 is ignored:
    an empty stdout from a failed call must not be read as "zero pods", because
    callers use ``target == 0`` as the gate before deleting registry data.

    Args:
        target: Required number of active pods.
        timeout: Maximum number of seconds to keep polling.

    Returns:
        dict: ``{"ok": True, "lines", "allLines", "last"}`` once the count
        matches, or ``{"ok": False, "lines": [], "last"}`` after the timeout.
        ``last`` is the bounded result of the most recent kubectl call.
    """
    finished_states = frozenset(["Completed", "Error", "Failed", "Succeeded"])
    deadline = time.time() + timeout
    last = None
    while time.time() < deadline:
        result = kctl(["-n", "hwlab-ci", "get", "pods", "-l", "app.kubernetes.io/name=hwlab-registry", "--no-headers"], 20)
        last = bounded(result)
        if result.get("exitCode") != 0:
            # The listing itself failed; we know nothing about the pods yet.
            time.sleep(2)
            continue
        lines = [line for line in (result.get("stdout") or "").splitlines() if line.strip()]
        active = []
        for line in lines:
            parts = line.split()
            # With --no-headers the columns are NAME READY STATUS ...
            status = parts[2] if len(parts) >= 3 else ""
            if status in finished_states:
                continue
            active.append(line)
        if len(active) == target:
            return {"ok": True, "lines": active, "allLines": lines, "last": last}
        time.sleep(2)
    return {"ok": False, "lines": [], "last": last}


def wait_pod_terminal(name, timeout=900):
    """Poll a hwlab-ci pod until its phase is Succeeded or Failed.

    Args:
        name: Pod name.
        timeout: Maximum number of seconds to keep polling.

    Returns:
        dict: ``{"ok": True, "phase": "Succeeded"}``, ``{"ok": False, "phase":
        "Failed"}``, or ``{"ok": False, "phase": "Timeout", "last": ...}`` where
        ``last`` is the last observed phase (None if the pod was never read).
    """
    deadline = time.time() + timeout
    last = None
    while time.time() < deadline:
        data = kubectl_json(["-n", "hwlab-ci", "get", "pod", name], 20)
        if data:
            phase = (data.get("status") or {}).get("phase") or ""
            last = {"phase": phase}
            if phase == "Succeeded":
                return {"ok": True, "phase": phase}
            if phase == "Failed":
                return {"ok": False, "phase": phase}
        time.sleep(3)
    return {"ok": False, "phase": "Timeout", "last": last}
{"message": "matched manifests are still referenced by retained manifests; registry GC would not reclaim blobs", "registryPlan": plan.get("summary")}} + cronjobs = ["hwlab-g14-branch-poller", "hwlab-v02-branch-poller"] + original_crons = cronjob_suspend_states(cronjobs) + before = du_size(REGISTRY_ROOT, 60) or 0 + gc_name = "hwlab-registry-gc-%s" % int(time.time()) + steps = [] + try: + for name in original_crons: + result = patch_cronjob_suspend(name, True) + steps.append({"step": "suspend-cronjob", "name": name, "result": bounded(result)}) + if result["exitCode"] != 0: + raise RuntimeError("failed to suspend cronjob %s" % name) + idle_after_suspend = wait_no_active_hwlab_ci(180) + steps.append({"step": "idle-after-suspend", "result": idle_after_suspend}) + if not idle_after_suspend.get("ok"): + raise RuntimeError("refusing registry maintenance because hwlab-ci did not become idle after suspend") + + deleted_manifests = [] + for repo, digests in delete_manifests.items(): + encoded_repo = "/".join(urllib.parse.quote(part, safe="") for part in repo.split("/")) + for digest in digests: + try: + result = registry_request("DELETE", "/v2/%s/manifests/%s" % (encoded_repo, urllib.parse.quote(digest, safe=":")), {"Accept": "application/vnd.docker.distribution.manifest.v2+json, application/vnd.oci.image.manifest.v1+json"}) + deleted_manifests.append({"repo": repo, "digest": digest, "status": result.get("status")}) + except urllib.error.HTTPError as exc: + if exc.code == 404: + deleted_manifests.append({"repo": repo, "digest": digest, "status": 404}) + else: + raise + steps.append({"step": "registry-api-delete-manifests", "count": len(deleted_manifests), "preview": deleted_manifests[:20]}) + + scale_down = kctl(["-n", "hwlab-ci", "scale", "deploy", "hwlab-registry", "--replicas=0"], 60) + steps.append({"step": "scale-registry-down", "result": bounded(scale_down)}) + if scale_down["exitCode"] != 0: + raise RuntimeError("failed to scale registry down") + waited_down = 
wait_registry_pod_count(0, 120) + steps.append({"step": "wait-registry-down", "result": waited_down}) + if not waited_down.get("ok"): + raise RuntimeError("registry pod did not scale down") + + deleted = [] + for row in delete_rows: + path = os.path.abspath(str(row.get("path") or "")) + if not path.startswith(REGISTRY_REPOSITORY_ROOT + "/") or "/_manifests/tags/" not in path: + raise RuntimeError("refusing unexpected registry tag path: %s" % path) + if not re.match(r"^[0-9a-f]{7,40}$", str(row.get("tag") or "")): + raise RuntimeError("refusing unexpected registry tag name: %s" % row.get("tag")) + if os.path.isdir(path) and not os.path.islink(path): + shutil.rmtree(path) + deleted.append({"repo": row.get("repo"), "tag": row.get("tag"), "digest": row.get("digest")}) + steps.append({"step": "delete-tag-directories", "count": len(deleted)}) + + deleted_revisions = [] + for row in delete_revision_rows: + path = os.path.abspath(str(row.get("path") or "")) + digest_hex = registry_digest_hex(str(row.get("digest") or "")) + if digest_hex is None: + raise RuntimeError("refusing unexpected registry revision digest: %s" % row.get("digest")) + if not path.startswith(REGISTRY_REPOSITORY_ROOT + "/") or "/_manifests/revisions/sha256/" not in path: + raise RuntimeError("refusing unexpected registry revision path: %s" % path) + if os.path.basename(path) != digest_hex: + raise RuntimeError("refusing registry revision path/digest mismatch: %s" % path) + if os.path.isdir(path) and not os.path.islink(path): + shutil.rmtree(path) + deleted_revisions.append({"repo": row.get("repo"), "digest": row.get("digest")}) + steps.append({"step": "delete-revision-directories", "count": len(deleted_revisions)}) + + overrides = { + "apiVersion": "v1", + "spec": { + "restartPolicy": "Never", + "containers": [{ + "name": "registry-gc", + "image": "registry:2.8.3", + "command": ["registry", "garbage-collect", "/etc/docker/registry/config.yml"], + "volumeMounts": [{"name": "storage", "mountPath": 
"/var/lib/registry"}], + }], + "volumes": [{"name": "storage", "hostPath": {"path": REGISTRY_ROOT, "type": "DirectoryOrCreate"}}], + }, + } + run_gc = kctl(["-n", "hwlab-ci", "run", gc_name, "--restart=Never", "--image=registry:2.8.3", "--overrides=%s" % json.dumps(overrides)], 60) + steps.append({"step": "start-registry-gc-pod", "result": bounded(run_gc), "pod": gc_name}) + if run_gc["exitCode"] != 0: + raise RuntimeError("failed to start registry GC pod") + waited_gc = wait_pod_terminal(gc_name, 900) + steps.append({"step": "wait-registry-gc", "result": waited_gc}) + logs = kctl(["-n", "hwlab-ci", "logs", gc_name], 120) + steps.append({"step": "registry-gc-logs", "result": bounded(logs)}) + if not waited_gc.get("ok"): + raise RuntimeError("registry GC pod did not complete successfully") + finally: + cleanup_gc = kctl(["-n", "hwlab-ci", "delete", "pod", gc_name, "--ignore-not-found=true"], 60) + steps.append({"step": "delete-registry-gc-pod", "result": bounded(cleanup_gc)}) + scale_up = kctl(["-n", "hwlab-ci", "scale", "deploy", "hwlab-registry", "--replicas=%s" % int(deployment.get("replicas") or 1)], 60) + steps.append({"step": "scale-registry-up", "result": bounded(scale_up)}) + rollout = kctl(["-n", "hwlab-ci", "rollout", "status", "deploy/hwlab-registry", "--timeout=180s"], 200) + steps.append({"step": "wait-registry-rollout", "result": bounded(rollout)}) + for name, was_suspended in original_crons.items(): + restore = patch_cronjob_suspend(name, was_suspended) + steps.append({"step": "restore-cronjob", "name": name, "suspend": was_suspended, "result": bounded(restore)}) + after = du_size(REGISTRY_ROOT, 60) or 0 + return { + "reclaimedBytes": max(0, before - after), + "commandOutput": { + "registryPlan": plan.get("summary"), + "deletedTagCount": len(delete_rows), + "deletedRevisionCount": len(delete_revision_rows), + "deletedManifestCount": sum(len(items) for items in delete_manifests.values()), + "diskBeforeBytes": before, + "diskAfterBytes": after, + 
"steps": steps[-12:], + }, + } + +def execute_registry_garbage_collect_only(): + if PROVIDER_ID.upper() != "G14": + raise RuntimeError("HWLAB registry garbage-collect is only supported on G14") + deployment = registry_deployment_preflight() + if not deployment.get("ok"): + raise RuntimeError("registry deployment preflight failed: %s" % deployment.get("reason")) + cronjobs = ["hwlab-g14-branch-poller", "hwlab-v02-branch-poller"] + original_crons = cronjob_suspend_states(cronjobs) + before = du_size(REGISTRY_ROOT, 60) or 0 + gc_name = "hwlab-registry-gc-%s" % int(time.time()) + steps = [] + try: + for name in original_crons: + result = patch_cronjob_suspend(name, True) + steps.append({"step": "suspend-cronjob", "name": name, "result": bounded(result)}) + if result["exitCode"] != 0: + raise RuntimeError("failed to suspend cronjob %s" % name) + idle_after_suspend = wait_no_active_hwlab_ci(180) + steps.append({"step": "idle-after-suspend", "result": idle_after_suspend}) + if not idle_after_suspend.get("ok"): + raise RuntimeError("refusing registry maintenance because hwlab-ci did not become idle after suspend") + + scale_down = kctl(["-n", "hwlab-ci", "scale", "deploy", "hwlab-registry", "--replicas=0"], 60) + steps.append({"step": "scale-registry-down", "result": bounded(scale_down)}) + if scale_down["exitCode"] != 0: + raise RuntimeError("failed to scale registry down") + waited_down = wait_registry_pod_count(0, 120) + steps.append({"step": "wait-registry-down", "result": waited_down}) + if not waited_down.get("ok"): + raise RuntimeError("registry pod did not scale down") + + overrides = { + "apiVersion": "v1", + "spec": { + "restartPolicy": "Never", + "containers": [{ + "name": "registry-gc", + "image": "registry:2.8.3", + "command": ["registry", "garbage-collect", "/etc/docker/registry/config.yml"], + "volumeMounts": [{"name": "storage", "mountPath": "/var/lib/registry"}], + }], + "volumes": [{"name": "storage", "hostPath": {"path": REGISTRY_ROOT, "type": 
def start_registry_retention_job(mode):
    """Fork a detached child that runs registry retention or GC-only maintenance.

    The parent writes an initial "running" state file, forks, and returns
    immediately with the job id, child pid and the status command. The child
    detaches (new session, stdin from /dev/null, stdout/stderr to the job log),
    runs the maintenance, rewrites the state file with the outcome and always
    terminates via ``os._exit`` so it can never fall back into the caller's code.

    Args:
        mode: ``"retention"`` runs ``execute_registry_retention``; any other
            value runs ``execute_registry_garbage_collect_only``.

    Returns:
        dict: In the parent only, ``{"status": "started", "reclaimedBytes":
        None, "commandOutput": {...}}``. The child never returns.

    Raises:
        OSError: If ``os.fork`` fails; the state file is marked failed first.
    """
    job_id = "g14-registry-%s-%s" % (int(time.time()), os.getpid())
    paths = job_paths(job_id)
    started_at = now_iso()
    initial = {
        "ok": True,
        "action": "gc remote status",
        "providerId": PROVIDER_ID,
        "jobId": job_id,
        "status": "running",
        "kind": "hwlab-registry-retention-gc" if mode == "retention" else "hwlab-registry-garbage-collect",
        "mode": mode,
        "startedAt": started_at,
        "statePath": paths["state"],
        "logPath": paths["log"],
        "options": OPTIONS,
    }
    write_json_atomic(paths["state"], initial)

    # Flush buffered parent output before forking; otherwise the child inherits
    # the pending bytes and replays them into the job log on its first flush.
    for stream in (sys.stdout, sys.stderr):
        try:
            stream.flush()
        except Exception:
            pass

    try:
        pid = os.fork()
    except OSError as exc:
        # No child exists, so do not leave the state file claiming "running".
        failed = dict(initial)
        failed.update({
            "ok": False,
            "status": "failed",
            "finishedAt": now_iso(),
            "error": "fork failed: %s" % exc,
        })
        try:
            write_json_atomic(paths["state"], failed)
        except Exception:
            pass
        raise

    if pid != 0:
        return {
            "status": "started",
            "reclaimedBytes": None,
            "commandOutput": {
                "jobId": job_id,
                "pid": pid,
                "statePath": paths["state"],
                "logPath": paths["log"],
                "statusCommand": "bun scripts/cli.ts gc remote %s status --job-id %s" % (PROVIDER_ID, job_id),
                "message": "registry retention GC is running as a detached remote job",
            },
        }

    # ---- child process from here on; every path must end in os._exit ----
    exit_code = 1
    log_handle = None
    try:
        try:
            os.setsid()
        except Exception:
            pass
        try:
            devnull = os.open(os.devnull, os.O_RDONLY)
            os.dup2(devnull, 0)
            os.close(devnull)
        except Exception:
            pass
        try:
            log_handle = open(paths["log"], "a", encoding="utf-8", buffering=1)
            os.dup2(log_handle.fileno(), 1)
            os.dup2(log_handle.fileno(), 2)
        except Exception:
            log_handle = None
            # Without a log file, still let go of the parent's stdout/stderr so
            # the detached job does not keep writing into the caller's streams.
            try:
                sink = os.open(os.devnull, os.O_WRONLY)
                os.dup2(sink, 1)
                os.dup2(sink, 2)
                os.close(sink)
            except Exception:
                pass
        try:
            print("[%s] starting HWLAB registry %s job %s" % (now_iso(), mode, job_id), flush=True)
            result = execute_registry_retention() if mode == "retention" else execute_registry_garbage_collect_only()
            payload = dict(initial)
            payload.update({
                "status": "succeeded",
                "finishedAt": now_iso(),
                "result": result,
                "diskAfter": df_snapshot(),
                "clusterAfter": cluster_preflight(),
            })
            write_json_atomic(paths["state"], payload)
            print("[%s] completed HWLAB registry %s job %s" % (now_iso(), mode, job_id), flush=True)
            exit_code = 0
        except Exception as exc:
            payload = dict(initial)
            payload.update({
                "ok": False,
                "status": "failed",
                "finishedAt": now_iso(),
                "error": str(exc),
            })
            # Post-failure snapshots are best-effort: a broken probe must not
            # prevent the failure itself from being recorded.
            for key, probe in (("diskAfter", df_snapshot), ("clusterAfter", cluster_preflight)):
                try:
                    payload[key] = probe()
                except Exception:
                    payload[key] = None
            try:
                write_json_atomic(paths["state"], payload)
            except Exception:
                pass
            try:
                print("[%s] failed HWLAB registry %s job %s: %s" % (now_iso(), mode, job_id, exc), flush=True)
            except Exception:
                pass
    finally:
        try:
            if log_handle:
                log_handle.close()
        except Exception:
            pass
        # os._exit skips interpreter cleanup and guarantees the child never
        # unwinds into the parent's call stack, whatever was raised above.
        os._exit(exit_code)


def collect_protected():
    """List the paths and object classes that remote gc reports as protected.

    Returns:
        list[dict]: One ``{"kind", "risk": "blocked", "ref", "reason"}`` row per
        protected entry. Rows whose ``ref`` is an absolute path that exists also
        carry ``sizeBytes`` from ``du_size``.
    """
    protected_paths = [
        ("hwlab-k3s-runtime", "/var/lib/rancher/k3s", "Native k3s runtime, containerd state, local-path storage and control-plane data are protected."),
        ("hwlab-k3s-storage", "/var/lib/rancher/k3s/storage", "Local-path PVC data is protected; cleanup must go through HWLAB/Tekton retention commands."),
        ("hwlab-kubelet", "/var/lib/kubelet", "Kubelet pod/runtime state is protected."),
        ("host-containerd", "/var/lib/containerd", "Host containerd state is protected; generic gc does not prune containerd images."),
        ("hwlab-host-data", "/var/lib/hwlab", "HWLAB host data/cache is protected from generic remote gc until a HWLAB-specific retention rule classifies it."),
        ("hwlab-source", "/root/hwlab", "G14 HWLAB fixed source workspace is protected."),
        ("hwlab-v02-source", "/root/hwlab-v02", "HWLAB v0.2 fixed source workspace is protected."),
        ("agentrun-source", "/root/agentrun", "AgentRun fixed source workspace is protected."),
        ("docker-images-and-volumes", "docker-images-volumes", "Remote gc does not remove Docker images, containers, volumes or Compose projects."),
        ("k8s-api-objects", "deployments-statefulsets-secrets-pvcs", "Remote gc does not mutate Kubernetes workloads, Secrets, PVCs, PVs, Argo CD or Tekton objects."),
    ]
    result = []
    for kind, ref, reason in protected_paths:
        item = {"kind": kind, "risk": "blocked", "ref": ref, "reason": reason}
        # Only real filesystem paths get a size; the symbolic refs do not.
        if ref.startswith("/") and os.path.exists(ref):
            item["sizeBytes"] = du_size(ref)
        result.append(item)
    return result
current is not None and current > target: + candidates.append({ + "id": "journalctl:vacuum", + "kind": "journal-vacuum", + "risk": "medium", + "description": "Vacuum systemd journal to %s" % fmt_bytes(target), + "sizeBytes": current, + "estimatedReclaimBytes": max(0, current - target), + "action": {"command": ["journalctl", "--vacuum-size=%s" % target]}, + }) + + if OPTIONS.get("dockerLogs", True): + max_bytes = int(OPTIONS.get("dockerLogMaxBytes") or 52428800) + for container in docker_containers(): + path = container.get("logPath") or "" + if not path or not os.path.exists(path): + continue + try: + size = os.path.getsize(path) + except OSError: + continue + if size <= max_bytes: + continue + candidates.append({ + "id": "docker-json-log:%s" % container["id"], + "kind": "docker-json-log-truncate", + "risk": "medium", + "description": "Truncate Docker json-file log larger than %s" % fmt_bytes(max_bytes), + "path": path, + "container": {"id": container["id"][:12], "name": container["name"], "image": container["image"]}, + "sizeBytes": size, + "estimatedReclaimBytes": size, + "action": {"op": "truncate", "targetBytes": 0}, + }) + + if OPTIONS.get("buildCache", True): + system_df = command(["docker", "system", "df"], 8) + if system_df["exitCode"] == 0: + cache = parse_docker_build_cache(system_df["stdout"]) + if cache is not None and cache["reclaimableBytes"] > 0: + until = str(OPTIONS.get("buildCacheUntil") or "24h") + candidates.append({ + "id": "docker-builder:prune", + "kind": "docker-build-cache-prune", + "risk": "low", + "description": "Prune Docker BuildKit cache unused for %s" % until, + "sizeBytes": cache["sizeBytes"], + "estimatedReclaimBytes": cache["reclaimableBytes"], + "action": {"command": ["docker", "builder", "prune", "--all", "--force", "--filter", "until=%s" % until], "estimate": "docker-system-df-reclaimable-upper-bound"}, + }) + + if OPTIONS.get("aptCache", True): + apt_path = "/var/cache/apt/archives" + size = du_size(apt_path) or 0 + if size > 
10 * 1024 * 1024: + candidates.append({ + "id": "apt-cache:clean", + "kind": "apt-cache-clean", + "risk": "low", + "description": "Clean downloaded apt package archives", + "path": apt_path, + "sizeBytes": size, + "estimatedReclaimBytes": size, + "action": {"command": ["apt-get", "clean"]}, + }) + + if OPTIONS.get("toolCaches", False): + for item in TOOL_CACHE_ALLOWLIST: + path = item["path"] + size = du_size(path, 8) or 0 + if size <= 0: + continue + candidates.append({ + "id": "tool-cache:%s" % item["id"], + "kind": "tool-cache-delete", + "risk": "medium", + "description": item["description"], + "path": path, + "sizeBytes": size, + "estimatedReclaimBytes": size, + "action": {"op": "rm-recursive", "allowlist": "remote-tool-cache"}, + }) + + if OPTIONS.get("coreDumps", True): + cutoff = time.time() - float(OPTIONS.get("coreDumpMinAgeHours") or 1) * 3600 + for root in sorted(CORE_DUMP_DIR_ALLOWLIST): + if not os.path.isdir(root): + continue + for name in os.listdir(root): + if not re.match(r"^core\.\d+$", name): + continue + path = os.path.join(root, name) + try: + stat = os.lstat(path) + except OSError: + continue + if not os.path.isfile(path) or os.path.islink(path): + continue + if stat.st_mtime >= cutoff: + continue + disk_size = allocated_file_size(path) + candidates.append({ + "id": "core-dump:%s" % path, + "kind": "core-dump-delete", + "risk": "low", + "description": "Delete untracked process core dump older than %s hours" % OPTIONS.get("coreDumpMinAgeHours"), + "path": path, + "sizeBytes": int(disk_size), + "estimatedReclaimBytes": int(disk_size), + "apparentSizeBytes": int(stat.st_size), + "action": {"op": "unlink", "allowlist": "root-unidesk-core-dot-pid"}, + }) + + if OPTIONS.get("tmp", True) and os.path.isdir("/tmp"): + cutoff = time.time() - float(OPTIONS.get("tmpMinAgeHours") or 24) * 3600 + for name in os.listdir("/tmp"): + path = os.path.join("/tmp", name) + if path in TMP_EXACT_PROTECT: + continue + if not any(name.startswith(prefix) for prefix in 
TMP_PREFIX_ALLOWLIST): + continue + try: + stat = os.lstat(path) + except OSError: + continue + if stat.st_mtime >= cutoff: + continue + size = du_size(path, 8) or path_size(path) + if size <= 0: + continue + candidates.append({ + "id": "tmp:%s" % path, + "kind": "tmp-path-delete", + "risk": "low", + "description": "Delete allowlisted /tmp path older than %s hours" % OPTIONS.get("tmpMinAgeHours"), + "path": path, + "sizeBytes": size, + "estimatedReclaimBytes": size, + "action": {"op": "rm-recursive", "allowlist": "tmp-prefix"}, + }) + if OPTIONS.get("hwlabRegistry", False): + registry = plan_registry_retention() + summary = registry.get("summary") or {} + delete_rows = registry.get("deleteRows") or [] + delete_revision_rows = registry.get("deleteRevisionRows") or [] + estimate = int(summary.get("estimatedReclaimBytes") or 0) + if delete_rows or delete_revision_rows: + candidates.append({ + "id": "hwlab-registry:retention-gc", + "kind": "hwlab-registry-retention-gc", + "risk": "medium", + "description": "Conservative HWLAB registry retention: keep current workload refs, retained tags and protected repos, delete stale manifest revisions, then run official registry garbage-collect", + "path": REGISTRY_ROOT, + "sizeBytes": int(summary.get("registrySizeBytes") or 0), + "estimatedReclaimBytes": estimate, + "action": { + "op": "registry-retention-gc", + "requiresMaintenanceWindow": True, + "keepPerRepo": summary.get("keepPerRepo"), + "minAgeHours": summary.get("minAgeHours"), + "deleteTags": len(delete_rows), + "deleteManifests": summary.get("deleteManifests"), + "deleteRevisions": summary.get("deleteRevisions"), + "deleteByRepo": summary.get("deleteByRepo"), + "revisionDeleteByRepo": summary.get("revisionDeleteByRepo"), + "protectedWorkloadRefs": summary.get("protectedWorkloadRefs"), + "protectedDigestRefs": summary.get("protectedDigestRefs"), + "protectedDigestClosure": summary.get("protectedDigestClosure"), + }, + }) + elif bool(OPTIONS.get("registryGcOnly")) and 
int(summary.get("totalTags") or 0) > 0 and int(summary.get("deleteTags") or 0) == 0: + candidates.append({ + "id": "hwlab-registry:garbage-collect-only", + "kind": "hwlab-registry-garbage-collect", + "risk": "medium", + "description": "Run official HWLAB registry garbage-collect without deleting additional tags; useful after a previously interrupted retention run", + "path": REGISTRY_ROOT, + "sizeBytes": int(summary.get("registrySizeBytes") or 0), + "estimatedReclaimBytes": 0, + "action": { + "op": "registry-garbage-collect-only", + "requiresMaintenanceWindow": True, + "deleteTags": 0, + "registryPlan": summary, + }, + }) + return sorted(candidates, key=lambda item: item.get("estimatedReclaimBytes") or 0, reverse=True) + +def target_assessment(disk, estimated_reclaim): + raw = OPTIONS.get("targetUsePercent") + if raw is None: + return None + if not disk: + return { + "targetUsePercent": raw, + "ok": False, + "state": "unavailable", + "reason": "disk-snapshot-unavailable", + } + try: + target = int(raw) + size = int(disk.get("sizeBytes") or 0) + used = int(disk.get("usedBytes") or 0) + available = int(disk.get("availableBytes") or 0) + reclaim = max(0, int(estimated_reclaim or 0)) + except Exception: + return { + "targetUsePercent": raw, + "ok": False, + "state": "unavailable", + "reason": "invalid-disk-snapshot", + } + df_basis = used + available + if df_basis <= 0: + df_basis = size + legacy_target_used_bytes = (size * target) // 100 + legacy_required = max(0, used - legacy_target_used_bytes) + target_used_bytes = (df_basis * target) // 100 + required = max(0, used - target_used_bytes) + projected_used = max(0, used - reclaim) + projected_use_percent = disk_use_percent(df_basis, projected_used) + legacy_projected_use_percent = disk_use_percent(size, projected_used) + enough = reclaim >= required + if required == 0: + state = "already-below-target" + elif enough: + state = "candidate-estimate-meets-target" + elif reclaim <= 0: + state = 
def summarize(candidates, returned, disk=None):
    """Aggregate reclaim estimates for all candidates and for the returned page.

    Args:
        candidates: Every candidate dict found by the plan.
        returned: The subset of candidates actually returned to the caller.
        disk: Disk snapshot forwarded to ``target_assessment``; may be None.

    Returns:
        dict: Counts, total and per-kind estimated reclaim (raw bytes plus the
        ``fmt_bytes`` rendering) and the ``target`` assessment computed from the
        total over all candidates.
    """
    by_kind = {}
    total = 0
    for item in candidates:
        size = int(item.get("estimatedReclaimBytes") or 0)
        total += size
        kind = item.get("kind") or "unknown"
        bucket = by_kind.setdefault(kind, {"count": 0, "estimatedReclaimBytes": 0, "estimatedReclaim": "0 B"})
        bucket["count"] += 1
        bucket["estimatedReclaimBytes"] += size
        bucket["estimatedReclaim"] = fmt_bytes(bucket["estimatedReclaimBytes"])
    returned_total = sum(int(item.get("estimatedReclaimBytes") or 0) for item in returned)
    return {
        "candidateCount": len(candidates),
        "returnedCandidateCount": len(returned),
        "estimatedReclaimBytes": total,
        "estimatedReclaim": fmt_bytes(total),
        "returnedEstimatedReclaimBytes": returned_total,
        "returnedEstimatedReclaim": fmt_bytes(returned_total),
        "byKind": by_kind,
        "target": target_assessment(disk, total),
    }


def assert_tmp_candidate(path):
    """Raise unless ``path`` is an allowlisted, unprotected direct child of /tmp.

    The path is normalised with ``os.path.abspath`` first, so ``..`` segments
    cannot be used to step outside /tmp. Only direct children of /tmp are
    accepted: the allowlist is matched against the final path component, so a
    nested path such as ``/tmp/other/hwlab-agent-x`` would otherwise pass on
    the strength of its basename alone.

    Raises:
        RuntimeError: If the path is outside /tmp, nested below a /tmp entry,
            listed in ``TMP_EXACT_PROTECT``, or its name matches no prefix in
            ``TMP_PREFIX_ALLOWLIST``.
    """
    resolved = os.path.abspath(path)
    if not resolved.startswith("/tmp/"):
        raise RuntimeError("refusing to remove non-/tmp path: %s" % path)
    if os.path.dirname(resolved) != "/tmp":
        raise RuntimeError("refusing to remove nested tmp path: %s" % path)
    if resolved in TMP_EXACT_PROTECT:
        raise RuntimeError("refusing to remove protected tmp path: %s" % path)
    name = os.path.basename(resolved)
    if not any(name.startswith(prefix) for prefix in TMP_PREFIX_ALLOWLIST):
        raise RuntimeError("refusing to remove tmp path outside allowlist: %s" % path)
def assert_tool_cache_candidate(path):
    """Raise unless ``path`` is exactly one of the allowlisted tool cache paths.

    Raises:
        RuntimeError: If the normalised path is not a ``path`` entry of
            ``TOOL_CACHE_ALLOWLIST`` or is a symlink.
    """
    resolved = os.path.abspath(path)
    allowed = set(item["path"] for item in TOOL_CACHE_ALLOWLIST)
    if resolved not in allowed:
        raise RuntimeError("refusing to remove tool cache outside allowlist: %s" % path)
    if os.path.islink(resolved):
        raise RuntimeError("refusing to remove symlink tool cache: %s" % path)


def execute(candidate):
    """Apply one gc candidate, dispatching on ``candidate["kind"]``.

    Args:
        candidate: Candidate dict; ``kind`` selects the action and ``path`` is
            read by the path-based kinds.

    Returns:
        dict: Always contains ``reclaimedBytes`` (None when the action cannot
        measure it). Command-based kinds add ``commandOutput``; the two registry
        kinds return whatever ``start_registry_retention_job`` returns.

    Raises:
        RuntimeError: If a guard rejects the path, an external command exits
            non-zero, or ``kind`` is not supported.
    """
    kind = candidate.get("kind")
    if kind == "journal-vacuum":
        result = command(["journalctl", "--vacuum-size=%s" % int(OPTIONS.get("journalTargetBytes") or 536870912)], 30)
        if result["exitCode"] != 0:
            raise RuntimeError((result["stderr"] or "journalctl vacuum failed").strip())
        return {"reclaimedBytes": None, "commandOutput": bounded(result)}
    if kind == "docker-json-log-truncate":
        raw_path = candidate.get("path") or ""
        # Normalise before the prefix check so "…/containers/../../etc/x"
        # cannot pass as a path under the containers directory.
        path = os.path.normpath(raw_path) if raw_path else ""
        if not path.startswith("/var/lib/docker/containers/"):
            raise RuntimeError("refusing to truncate Docker log outside /var/lib/docker/containers")
        # Truncating through a symlink would modify whatever it points at.
        if os.path.islink(path):
            raise RuntimeError("refusing to truncate symlinked Docker log: %s" % raw_path)
        before = os.path.getsize(path) if os.path.exists(path) else 0
        with open(path, "r+b") as handle:
            handle.truncate(0)
        return {"reclaimedBytes": before}
    if kind == "docker-build-cache-prune":
        until = str(OPTIONS.get("buildCacheUntil") or "24h")
        result = command(["docker", "builder", "prune", "--all", "--force", "--filter", "until=%s" % until], 45)
        if result["exitCode"] != 0:
            raise RuntimeError((result["stderr"] or "docker builder prune failed").strip())
        return {"reclaimedBytes": None, "commandOutput": bounded(result)}
    if kind == "apt-cache-clean":
        before = du_size("/var/cache/apt/archives") or 0
        result = command(["apt-get", "clean"], 30)
        if result["exitCode"] != 0:
            raise RuntimeError((result["stderr"] or "apt-get clean failed").strip())
        after = du_size("/var/cache/apt/archives") or 0
        return {"reclaimedBytes": max(0, before - after), "commandOutput": bounded(result)}
    if kind == "tool-cache-delete":
        path = candidate.get("path") or ""
        assert_tool_cache_candidate(path)
        before = du_size(path, 8) or path_size(path)
        if os.path.isdir(path):
            shutil.rmtree(path, ignore_errors=True)
        elif os.path.exists(path):
            os.unlink(path)
        return {"reclaimedBytes": before}
    if kind == "tmp-path-delete":
        path = candidate.get("path") or ""
        assert_tmp_candidate(path)
        before = du_size(path, 8) or path_size(path)
        if os.path.isdir(path) and not os.path.islink(path):
            shutil.rmtree(path, ignore_errors=True)
        else:
            # Files and symlinks are unlinked; the target may already be gone.
            try:
                os.unlink(path)
            except FileNotFoundError:
                pass
        return {"reclaimedBytes": before}
    if kind == "core-dump-delete":
        path = candidate.get("path") or ""
        assert_core_dump_candidate(path)
        before = allocated_file_size(path)
        os.unlink(path)
        return {"reclaimedBytes": before}
    if kind == "hwlab-registry-retention-gc":
        return start_registry_retention_job("retention")
    if kind == "hwlab-registry-garbage-collect":
        return start_registry_retention_job("garbage-collect")
    raise RuntimeError("unsupported remote gc candidate kind: %s" % kind)


def visible_items(items):
    """Return all items when ``OPTIONS["full"]`` is truthy, else the first ``limit`` (default 50)."""
    if bool(OPTIONS.get("full")):
        return items
    return items[:int(OPTIONS.get("limit") or 50)]
results if item.get("status") == "failed"] + started = [item for item in results if item.get("status") == "started"] + succeeded = [item for item in results if item.get("status") == "succeeded"] + return (failed + started + succeeded)[:int(OPTIONS.get("resultLimit") or 50)] + +def plan_payload(observed_at, preflight, protected, candidates, visible): + disk = df_snapshot() + ci_storage = ci_storage_snapshot() + memory_pressure = collect_memory_pressure() + compact_pvc = compact_pvc_attribution(ci_storage) + policy = { + "requiresRunConfirm": True, + "runCommand": "bun scripts/cli.ts gc remote %s run --confirm" % PROVIDER_ID, + "neverTouches": [ + "/var/lib/rancher/k3s", + "/var/lib/rancher/k3s/storage", + "/var/lib/kubelet", + "/var/lib/containerd", + "/var/lib/hwlab unless --include-hwlab-registry is explicitly supplied", + "Kubernetes Deployments/StatefulSets/Secrets/PVCs/PVs", + "HWLAB fixed source workspaces", + "Docker images, containers and volumes", + ], + "notes": [ + "Remote gc only executes the returned candidate page unless --full or a larger --limit is supplied.", + "G14 run requires the expected native k3s node preflight before mutation.", + "HWLAB DEV runtime and local-path PVC data are protected and require HWLAB-specific retention commands.", + "Core dump cleanup only removes untracked /root/unidesk/core. 
regular files with no active fuser reference.", + "HWLAB registry retention is opt-in: it keeps workload tag/digest refs, all tags newer than the retention age and the newest N tags per repo before official registry garbage-collect.", + "When summary.target.safeStop is true, do not broaden deletion scope; choose registry retention, k3s/containerd image cache maintenance, PVC/runtime retention or capacity expansion explicitly.", + ], + } + if not bool(OPTIONS.get("full")): + policy = { + "requiresRunConfirm": True, + "runCommand": "bun scripts/cli.ts gc remote %s run --confirm" % PROVIDER_ID, + "neverTouches": ["k3s runtime", "PVC/PV/local-path data", "Secrets/auth/config", "Docker volumes/images"], + "notes": [ + "Default plan is compact; rerun with --full for complete policy notes and protected rows.", + "When summary.target.safeStop is true, stop at protected boundaries and choose an owner-aware retention or capacity decision.", + ], + } + payload = { + "ok": True, + "action": "gc remote plan", + "providerId": PROVIDER_ID, + "dryRun": True, + "mutation": False, + "observedAt": observed_at, + "options": OPTIONS, + "diskBefore": disk, + "clusterPreflight": preflight, + "summary": summarize(candidates, visible, disk), + "candidates": visible, + "protected": protected if bool(OPTIONS.get("full")) else protected[:3], + "policy": policy, + } + if bool(OPTIONS.get("full")): + payload.update({ + "memoryPressure": memory_pressure, + "pvcAttribution": ci_storage, + "ciStorage": ci_storage, + }) + else: + payload["pressureSummary"] = { + "memory": (compact_memory_pressure(memory_pressure).get("summary") if isinstance(compact_memory_pressure(memory_pressure), dict) else None), + "pvc": { + "pvcCount": compact_pvc.get("pvcCount"), + "reviewCandidateCount": compact_pvc.get("reviewCandidateCount"), + "estimatedBytes": compact_pvc.get("estimatedBytes"), + "estimatedHuman": compact_pvc.get("estimatedHuman"), + "byNamespace": compact_pvc.get("byNamespace"), + "handoff": 
compact_pvc.get("handoff"), + }, + "drillDown": "bun scripts/cli.ts gc remote %s status --limit %s" % (PROVIDER_ID, int(OPTIONS.get("limit") or 50)), + } + return payload + +def safe_unit_name(value): + raw = str(value or "").strip().lower() + raw = re.sub(r"[^a-z0-9_.@-]+", "-", raw).strip("-") + if not raw: + raw = "unidesk-%s-low-risk-gc" % re.sub(r"[^a-z0-9]+", "-", PROVIDER_ID.lower()).strip("-") + return raw[:80] + +def render_remote_policy(): + unit_name = safe_unit_name(config_str(POLICY_TIMER_CONFIG, "name", "unidesk-%s-low-risk-gc" % PROVIDER_ID.lower())) + on_calendar = config_str(POLICY_TIMER_CONFIG, "onCalendar", "daily") + randomized_delay_sec = config_str(POLICY_TIMER_CONFIG, "randomizedDelaySec", "15min") + journal_target = parse_size_value(POLICY_TIMER_CONFIG.get("journalTargetBytes"), int(OPTIONS.get("journalTargetBytes") or 536870912)) + tmp_min_age_hours = config_float(POLICY_TIMER_CONFIG, "tmpMinAgeHours", float(OPTIONS.get("tmpMinAgeHours") or 24), minimum=0.0) + include_apt_cache = config_bool(POLICY_TIMER_CONFIG, "includeAptCache", bool(OPTIONS.get("aptCache", True))) + include_tool_caches = config_bool(POLICY_TIMER_CONFIG, "includeToolCaches", False) + script_path = "/usr/local/sbin/%s.sh" % unit_name + service_path = "/etc/systemd/system/%s.service" % unit_name + timer_path = "/etc/systemd/system/%s.timer" % unit_name + tool_paths = [item["path"] for item in TOOL_CACHE_ALLOWLIST] if include_tool_caches else [] + script = "\n".join([ + "#!/bin/sh", + "set -eu", + "umask 077", + "journalctl --vacuum-size=%s >/dev/null 2>&1 || true" % int(journal_target), + "apt-get clean >/dev/null 2>&1 || true" if include_apt_cache else ": apt cache disabled by YAML", + "python3 - <<'PY'", + "import json, os, shutil, time", + "prefixes = json.loads(%r)" % json.dumps(TMP_PREFIX_ALLOWLIST), + "protected = set(json.loads(%r))" % json.dumps(sorted(TMP_EXACT_PROTECT)), + "tool_paths = json.loads(%r)" % json.dumps(tool_paths), + "cutoff = time.time() - float(%r) 
* 3600.0" % tmp_min_age_hours, + "for name in os.listdir('/tmp'):", + " path = os.path.join('/tmp', name)", + " if path in protected or not any(name.startswith(prefix) for prefix in prefixes):", + " continue", + " try:", + " stat = os.lstat(path)", + " except OSError:", + " continue", + " if stat.st_mtime >= cutoff:", + " continue", + " if os.path.isdir(path) and not os.path.islink(path):", + " shutil.rmtree(path, ignore_errors=True)", + " elif os.path.exists(path):", + " try:", + " os.unlink(path)", + " except FileNotFoundError:", + " pass", + "for path in tool_paths:", + " resolved = os.path.abspath(path)", + " if resolved != path or os.path.islink(resolved) or resolved in ['/', '/root', '/root/.npm', '/root/.bun']:", + " continue", + " if os.path.isdir(resolved):", + " shutil.rmtree(resolved, ignore_errors=True)", + " elif os.path.exists(resolved):", + " try:", + " os.unlink(resolved)", + " except FileNotFoundError:", + " pass", + "PY", + "", + ]) + service = "\n".join([ + "[Unit]", + "Description=UniDesk remote low-risk GC for %s" % PROVIDER_ID, + "Documentation=config/unidesk-cli.yaml#gc.remote.targets.%s.policyTimer" % PROVIDER_ID, + "", + "[Service]", + "Type=oneshot", + "ExecStart=%s" % script_path, + "Nice=10", + "IOSchedulingClass=best-effort", + "IOSchedulingPriority=7", + "", + ]) + timer = "\n".join([ + "[Unit]", + "Description=UniDesk remote low-risk GC timer for %s" % PROVIDER_ID, + "", + "[Timer]", + "OnCalendar=%s" % on_calendar, + "RandomizedDelaySec=%s" % randomized_delay_sec, + "Persistent=true", + "", + "[Install]", + "WantedBy=timers.target", + "", + ]) + return { + "unitName": unit_name, + "scriptPath": script_path, + "servicePath": service_path, + "timerPath": timer_path, + "onCalendar": on_calendar, + "randomizedDelaySec": randomized_delay_sec, + "journalTargetBytes": int(journal_target), + "journalTarget": fmt_bytes(journal_target), + "tmpMinAgeHours": tmp_min_age_hours, + "includeAptCache": include_apt_cache, + "includeToolCaches": 
include_tool_caches, + "script": script, + "service": service, + "timer": timer, + } + +def remote_policy_plan_payload(observed_at): + rendered = render_remote_policy() + return { + "ok": True, + "action": "gc remote policy plan", + "providerId": PROVIDER_ID, + "dryRun": True, + "mutation": False, + "observedAt": observed_at, + "configSource": "config/unidesk-cli.yaml#gc.remote.targets.%s.policyTimer" % PROVIDER_ID, + "enabled": config_bool(POLICY_TIMER_CONFIG, "enabled", False), + "timer": {key: rendered.get(key) for key in ["unitName", "scriptPath", "servicePath", "timerPath", "onCalendar", "randomizedDelaySec", "journalTargetBytes", "journalTarget", "tmpMinAgeHours", "includeAptCache", "includeToolCaches"]}, + "scriptPreview": "\n".join(rendered["script"].splitlines()[:20]), + "servicePreview": rendered["service"], + "timerPreview": rendered["timer"], + "installCommand": "bun scripts/cli.ts gc remote %s policy install --confirm" % PROVIDER_ID, + "policy": { + "risk": "low", + "neverTouches": [ + "k3s runtime directories", + "PVC/PV/local-path data", + "Docker images, containers, volumes or Docker build cache", + "Secret/auth/config state", + "active Web observe runners or Chrome processes", + ], + "toolCaches": "disabled unless config/unidesk-cli.yaml enables includeToolCaches for this remote target", + }, + } + +def remote_policy_install_payload(observed_at): + rendered = render_remote_policy() + try: + with open(rendered["scriptPath"], "w", encoding="utf-8") as handle: + handle.write(rendered["script"]) + os.chmod(rendered["scriptPath"], 0o755) + with open(rendered["servicePath"], "w", encoding="utf-8") as handle: + handle.write(rendered["service"]) + with open(rendered["timerPath"], "w", encoding="utf-8") as handle: + handle.write(rendered["timer"]) + daemon = command(["systemctl", "daemon-reload"], 30) + enable = command(["systemctl", "enable", "--now", "%s.timer" % rendered["unitName"]], 30) + status = command(["systemctl", "show", "%s.timer" % 
rendered["unitName"], "--property=LoadState,ActiveState,SubState,NextElapseUSecRealtime,LastTriggerUSec"], 10) + except Exception as exc: + return { + "ok": False, + "action": "gc remote policy install", + "providerId": PROVIDER_ID, + "dryRun": False, + "mutation": True, + "observedAt": observed_at, + "error": "policy-install-failed", + "message": str(exc), + } + ok = daemon.get("exitCode") == 0 and enable.get("exitCode") == 0 + return { + "ok": ok, + "action": "gc remote policy install", + "providerId": PROVIDER_ID, + "dryRun": False, + "mutation": True, + "observedAt": observed_at, + "configSource": "config/unidesk-cli.yaml#gc.remote.targets.%s.policyTimer" % PROVIDER_ID, + "timer": {key: rendered.get(key) for key in ["unitName", "scriptPath", "servicePath", "timerPath", "onCalendar", "randomizedDelaySec", "journalTargetBytes", "journalTarget", "tmpMinAgeHours", "includeAptCache", "includeToolCaches"]}, + "systemd": { + "daemonReload": bounded(daemon), + "enableNow": bounded(enable), + "status": bounded(status), + }, + } + +def main(): + observed_at = now_iso() + preflight = cluster_preflight() + if ACTION == "policy-plan": + emit_json(remote_policy_plan_payload(observed_at), persist_large=False) + return 0 + if ACTION == "policy-install": + emit_json(remote_policy_install_payload(observed_at), persist_large=False) + return 0 + if ACTION == "trend": + history_limit = int(OPTIONS.get("historyLimit") or 12) + history = read_growth_snapshots(history_limit) + emit_json({ + "ok": True, + "action": "gc remote trend", + "providerId": PROVIDER_ID, + "dryRun": True, + "mutation": False, + "observedAt": observed_at, + "statePath": growth_snapshot_path(), + "historyLimit": history_limit, + "trend": growth_trend_payload(history), + "points": history if bool(OPTIONS.get("full")) else [compact_growth_point(item) for item in history[-min(len(history), 3):]], + "returnedPointCount": min(len(history), 3) if not bool(OPTIONS.get("full")) else len(history), + "totalPointCount": 
len(history), + "next": { + "snapshot": "bun scripts/cli.ts gc remote %s snapshot --include-hwlab-registry --history-limit %s" % (PROVIDER_ID, history_limit), + }, + }, persist_large=True) + return 0 + if ACTION == "snapshot": + history_limit = int(OPTIONS.get("historyLimit") or 12) + snapshot = collect_growth_snapshot(observed_at, preflight) + state_path = growth_snapshot_path() + if bool(OPTIONS.get("saveSnapshot", True)): + state_path = append_growth_snapshot(snapshot) + history = read_growth_snapshots(history_limit) + if not bool(OPTIONS.get("saveSnapshot", True)): + history = history + [snapshot] + trend_payload = growth_trend_payload(history[-history_limit:]) + recent_history = history[-min(len(history), 3):] + if not bool(OPTIONS.get("full")): + trend_payload = compact_trend_payload(trend_payload) + recent_history = history[-min(len(history), 1):] + snapshot.update({ + "statePath": state_path, + "historyLimit": history_limit, + "saved": bool(OPTIONS.get("saveSnapshot", True)), + "trend": trend_payload, + "history": { + "totalPointCount": len(read_growth_snapshots(1000000)) if bool(OPTIONS.get("saveSnapshot", True)) else len(history), + "returnedPointCount": len(recent_history) if bool(OPTIONS.get("full")) else 0, + "recentPoints": recent_history if bool(OPTIONS.get("full")) else [], + "drillDown": "bun scripts/cli.ts gc remote %s trend --history-limit %s" % (PROVIDER_ID, history_limit), + }, + }) + emit_json(snapshot, persist_large=True) + return 0 + protected = collect_protected() + candidates = collect_candidates(observed_at) + visible = visible_items(candidates) + if ACTION == "plan": + emit_json(plan_payload(observed_at, preflight, protected, candidates, visible), persist_large=True) + return 0 + if ACTION == "status": + emit_json(remote_gc_job_status(), persist_large=False) + return 0 + if ACTION != "run": + emit_json({"ok": False, "error": "unsupported-remote-gc-action", "action": ACTION}, persist_large=False) + return 0 + if PROVIDER_ID.upper() == 
"G14" and not preflight.get("ok"): + emit_json({ + "ok": False, + "error": "gc-remote-g14-preflight-failed", + "action": "gc remote run", + "providerId": PROVIDER_ID, + "dryRun": True, + "mutation": False, + "clusterPreflight": preflight, + "plan": plan_payload(observed_at, preflight, protected, candidates, visible), + }, persist_large=True) + return 0 + disk_before = df_snapshot() + results = [] + for candidate in visible: + try: + execution = execute(candidate) + item = dict(candidate) + item.update({"status": execution.get("status") or "succeeded", "reclaimedBytes": execution.get("reclaimedBytes")}) + if "commandOutput" in execution: + item["commandOutput"] = execution["commandOutput"] + results.append(item) + except Exception as exc: + item = dict(candidate) + item.update({"status": "failed", "reclaimedBytes": None, "error": str(exc)}) + results.append(item) + disk_after = df_snapshot() + failed = [item for item in results if item.get("status") == "failed"] + returned = returned_results(results) + run_summary = summarize(visible, returned, disk_before) + run_summary.update({ + "plannedCandidateCount": len(visible), + "attemptedCount": len(results), + "startedCount": len([item for item in results if item.get("status") == "started"]), + "succeededCount": len([item for item in results if item.get("status") == "succeeded"]), + "failedCount": len(failed), + "actualDiskReclaimBytes": (disk_after["availableBytes"] - disk_before["availableBytes"]) if disk_before and disk_after else None, + "actualDiskReclaim": fmt_bytes(disk_after["availableBytes"] - disk_before["availableBytes"]) if disk_before and disk_after else None, + "targetAfter": target_assessment(disk_after, 0), + "resultCount": len(results), + "returnedResultCount": len(returned), + "omittedResultCount": max(0, len(results) - len(returned)), + }) + payload = { + "ok": len(failed) == 0, + "action": "gc remote run", + "providerId": PROVIDER_ID, + "dryRun": False, + "mutation": True, + "observedAt": now_iso(), + 
"options": OPTIONS, + "diskBefore": disk_before, + "diskAfter": disk_after, + "clusterPreflight": preflight, + "clusterAfter": cluster_preflight(), + "summary": run_summary, + "results": returned, + "protected": protected, + } + emit_json(payload, persist_large=True) + return 0 + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/scripts/src/gc-remote.ts b/scripts/src/gc-remote.ts index 672689a4..e0b55957 100644 --- a/scripts/src/gc-remote.ts +++ b/scripts/src/gc-remote.ts @@ -2,7 +2,8 @@ import { Buffer } from "node:buffer"; import { existsSync, readFileSync } from "node:fs"; import { type UniDeskConfig, rootPath } from "./config"; -import { classifySshTcpPoolFailure, runSshCommandCapture, type SshCaptureResult } from "./ssh"; +import { remoteGcDegradedFailure } from "./gc-remote-degraded"; +import { runSshCommandCapture } from "./ssh"; type RemoteGcAction = "plan" | "snapshot" | "trend" | "run" | "status" | "policy-plan" | "policy-install"; @@ -60,6 +61,8 @@ const DEFAULT_REMOTE_OPTIONS: RemoteGcOptions = { const GC_CONFIG_RELATIVE_PATH = "config/unidesk-cli.yaml"; const GC_REMOTE_CONFIG_REF = `${GC_CONFIG_RELATIVE_PATH}#gc.remote.targets`; +const GC_REMOTE_RUNNER_RELATIVE_PATH = "scripts/src/gc-remote-runner.py"; +const GC_REMOTE_RUNNER_CONFIG_PLACEHOLDER = "__UNIDESK_GC_REMOTE_CONFIG_BASE64__"; export async function runRemoteGcCommand(config: UniDeskConfig, providerId: string | undefined, action: string | undefined, args: string[]): Promise { if (providerId === undefined || providerId.length === 0) { @@ -287,2985 +290,10 @@ async function runRemoteGc(config: UniDeskConfig, providerId: string, action: Re } } -function remoteGcDegradedFailure(providerId: string, action: RemoteGcAction, result: SshCaptureResult): Record { - const text = `${result.stderr}\n${result.stdout}`; - const failureKind = classifySshTcpPoolFailure(text); - const timeout = result.exitCode === 124 || text.includes("ssh-runtime-timeout"); - const providerOffline = 
text.includes(`provider is not online: ${providerId}`) || text.includes("provider is not online"); - const degradedReason = failureKind ?? (providerOffline ? "provider-offline" : timeout ? "ssh-runtime-timeout" : "remote-command-failed"); - return { - ok: false, - degraded: true, - providerId, - action: `gc remote ${action}`, - degradedReason, - transport: { - sshTcpPoolFailureKind: failureKind, - providerOffline, - sshRuntimeTimeout: timeout, - exitCode: result.exitCode, - }, - safeCandidateCount: null, - runAllowed: false, - mutation: false, - summary: failureKind !== null - ? `remote GC could not acquire a provider data channel: ${failureKind}` - : providerOffline - ? `provider ${providerId} is offline from the controlled CLI transport view` - : timeout - ? "remote GC did not complete before the SSH runtime timeout" - : "remote GC command failed before producing a valid plan", - next: { - sshPool: `bun scripts/cli.ts debug ssh-pool ${providerId}`, - fullHealth: "bun scripts/cli.ts debug health", - smoke: `trans ${providerId} argv true`, - retryPlan: `bun scripts/cli.ts gc remote ${providerId} plan --no-snapshot-save`, - }, - }; -} - function remoteGcPython(configBase64: string): string { - return String.raw` -import base64 -import calendar -import json -import os -import re -import shutil -import subprocess -import sys -import time -import urllib.error -import urllib.parse -import urllib.request - -CONFIG = json.loads(base64.b64decode("${configBase64}").decode("utf-8")) -PROVIDER_ID = str(CONFIG.get("providerId") or "") -ACTION = str(CONFIG.get("action") or "plan") -OPTIONS = CONFIG.get("options") or {} -REMOTE_TARGET = CONFIG.get("remoteTarget") if isinstance(CONFIG.get("remoteTarget"), dict) else {} -MEMORY_CONFIG = REMOTE_TARGET.get("memoryPressure") if isinstance(REMOTE_TARGET.get("memoryPressure"), dict) else {} -PVC_CONFIG = REMOTE_TARGET.get("pvcAttribution") if isinstance(REMOTE_TARGET.get("pvcAttribution"), dict) else {} -POLICY_TIMER_CONFIG = 
REMOTE_TARGET.get("policyTimer") if isinstance(REMOTE_TARGET.get("policyTimer"), dict) else {} - -TMP_PREFIX_ALLOWLIST = [ - "hwlab-agent-", - "hwlab-cd-", - "hwlab-cli-cicd-", - "hwlab-codeagent-trace", - "hwlab-desired-state-", - "hwlab-g14-", - "hwlab-main-", - "hwlab-merge-", - "hwlab-pr", - "hwlab-refresh-", - "hwlab-remote-", - "hwlab-ts-check", - "hwlab-bun-runtime-check-", - "hwlab-v02-", - "playwright-artifacts-", - "playwright_chromiumdev_profile-", - "unidesk-apply-patch-v2-perf-", - "unidesk-clean-", - "unidesk-code-queue", - "unidesk-hwlab-cd-", - "unidesk-pr", - "unidesk-tran-runner", - "bunx-", - "codex-app-schema", - "codex-app-ts", - "marked-", - "node-compile-cache", -] - -TMP_EXACT_PROTECT = set([ - "/tmp/codex-apply-patch", - "/tmp/codex-ipc", - "/tmp/tmux-0", - "/tmp/snap-private-tmp", -]) - -CORE_DUMP_DIR_ALLOWLIST = set([ - "/root/unidesk", -]) - -TOOL_CACHE_ALLOWLIST = [ - { - "id": "npm-cacache", - "path": "/root/.npm/_cacache", - "description": "Delete npm content-addressable package cache; npm can rebuild it.", - }, - { - "id": "npm-npx", - "path": "/root/.npm/_npx", - "description": "Delete npx package execution cache; npx can rebuild it.", - }, - { - "id": "bun-install-cache", - "path": "/root/.bun/install/cache", - "description": "Delete Bun install package cache; bun can rebuild it.", - }, -] - -REGISTRY_REPOSITORY_ROOT = "/var/lib/hwlab/registry/docker/registry/v2/repositories" -REGISTRY_ROOT = "/var/lib/hwlab/registry" -REGISTRY_PROTECTED_TAGS = set([ - "latest", - "16-alpine", - "20-bookworm-slim", - "node22-alpine-v1", - "node22-alpine-bun-v1", - "sidecar", - "1b99888d3dae", -]) - -EXPECTED_G14_NODE = "ubuntu-rog-zephyrus-g14-ga401iv-ga401iv" -REMOTE_GC_JOB_DIR = "/tmp/unidesk-gc-remote/jobs" -REMOTE_GROWTH_SNAPSHOT_DIR = "/tmp/unidesk-gc-remote/growth-snapshots" -REMOTE_STDOUT_JSON_LIMIT = 256 * 1024 - -def config_list(cfg, key, default=None): - value = cfg.get(key) if isinstance(cfg, dict) else None - if isinstance(value, list): 
- return [str(item) for item in value if isinstance(item, (str, int, float)) and str(item)] - return list(default or []) - -def config_bool(cfg, key, default=False): - value = cfg.get(key) if isinstance(cfg, dict) else None - if isinstance(value, bool): - return value - return bool(default) - -def config_int(cfg, key, default=0, minimum=None, maximum=None): - value = cfg.get(key) if isinstance(cfg, dict) else None - try: - parsed = int(value) - except Exception: - parsed = int(default) - if minimum is not None: - parsed = max(int(minimum), parsed) - if maximum is not None: - parsed = min(int(maximum), parsed) - return parsed - -def config_float(cfg, key, default=0.0, minimum=None, maximum=None): - value = cfg.get(key) if isinstance(cfg, dict) else None - try: - parsed = float(value) - except Exception: - parsed = float(default) - if minimum is not None: - parsed = max(float(minimum), parsed) - if maximum is not None: - parsed = min(float(maximum), parsed) - return parsed - -def config_str(cfg, key, default=""): - value = cfg.get(key) if isinstance(cfg, dict) else None - if isinstance(value, str) and value: - return value - return str(default) - -def parse_size_value(value, default=None): - if isinstance(value, (int, float)) and value > 0: - return int(value) - if not isinstance(value, str): - return default - match = re.match(r"^\s*([0-9]+(?:\.[0-9]+)?)\s*(b|k|kb|kib|m|mb|mib|g|gb|gib)?\s*$", value, re.I) - if not match: - return default - unit = (match.group(2) or "b").lower() - mult = 1024**3 if unit in set(["g", "gb", "gib"]) else 1024**2 if unit in set(["m", "mb", "mib"]) else 1024 if unit in set(["k", "kb", "kib"]) else 1 - return int(float(match.group(1)) * mult) - -def now_iso(): - return time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()) - -def command(cmd, timeout=10): - try: - p = subprocess.run(cmd, text=True, capture_output=True, timeout=timeout) - return {"exitCode": p.returncode, "stdout": p.stdout, "stderr": p.stderr, "timedOut": False} - except 
subprocess.TimeoutExpired as exc: - return { - "exitCode": None, - "stdout": exc.stdout or "", - "stderr": exc.stderr or ("timed out after %ss" % timeout), - "timedOut": True, - } - except Exception as exc: - return {"exitCode": None, "stdout": "", "stderr": str(exc), "timedOut": False} - -def bounded(result): - return { - "exitCode": result.get("exitCode"), - "timedOut": bool(result.get("timedOut")), - "stdoutTail": str(result.get("stdout") or "")[-2000:], - "stderrTail": str(result.get("stderr") or "")[-2000:], - } - -def job_id_or_none(): - raw = str(OPTIONS.get("jobId") or "") - if raw and re.match(r"^[A-Za-z0-9._-]{1,128}$", raw): - return raw - return None - -def job_paths(job_id): - os.makedirs(REMOTE_GC_JOB_DIR, exist_ok=True) - return { - "state": os.path.join(REMOTE_GC_JOB_DIR, "%s.json" % job_id), - "log": os.path.join(REMOTE_GC_JOB_DIR, "%s.log" % job_id), - } - -def status_command(job_id): - return "bun scripts/cli.ts gc remote %s status --job-id %s" % (PROVIDER_ID, job_id) - -def write_json_atomic(path, payload): - tmp = "%s.tmp.%s" % (path, os.getpid()) - with open(tmp, "w", encoding="utf-8") as handle: - json.dump(payload, handle, ensure_ascii=False, indent=2) - handle.write("\n") - os.replace(tmp, path) - -def read_file_tail(path, limit=12000): - try: - size = os.path.getsize(path) - with open(path, "rb") as handle: - if size > limit: - handle.seek(size - limit) - data = handle.read() - return data.decode("utf-8", errors="replace") - except OSError: - return "" - -def stdout_page(items): - if not isinstance(items, list): - return items - raw_limit = OPTIONS.get("resultLimit") or OPTIONS.get("limit") or 50 - try: - limit = int(raw_limit) - except Exception: - limit = 50 - limit = max(1, min(limit, 100)) - return items[:limit] - -def compact_payload_for_stdout(payload, full_size_bytes, job_id=None, paths=None): - compact = { - "ok": payload.get("ok", True), - "action": payload.get("action") or "gc remote", - "providerId": payload.get("providerId") or 
PROVIDER_ID, - "output": { - "truncated": True, - "reason": "stdout-size-guard", - "fullResultBytes": full_size_bytes, - }, - } - for key in [ - "dryRun", "mutation", "observedAt", "status", "kind", "mode", - "startedAt", "finishedAt", "error", "message", "options", - "diskBefore", "diskAfter", "clusterPreflight", "clusterAfter", - "summary", "policy", - ]: - if key in payload: - compact[key] = payload[key] - if job_id: - state_path = paths["state"] if paths else payload.get("statePath") - compact["jobId"] = job_id - compact["statePath"] = state_path - compact["statusCommand"] = status_command(job_id) - compact["fullResult"] = { - "jobId": job_id, - "statePath": state_path, - "statusCommand": status_command(job_id), - } - compact["output"]["fullResultJobId"] = job_id - if "results" in payload: - results = payload.get("results") or [] - compact["results"] = stdout_page(results) - compact["returnedResultCount"] = len(compact["results"]) - compact["omittedResultCount"] = max(0, len(results) - len(compact["results"])) if isinstance(results, list) else 0 - if "candidates" in payload: - candidates = payload.get("candidates") or [] - compact["candidates"] = stdout_page(candidates) - compact["returnedCandidateCount"] = len(compact["candidates"]) - compact["omittedCandidateCount"] = max(0, len(candidates) - len(compact["candidates"])) if isinstance(candidates, list) else 0 - if "protected" in payload: - compact["protected"] = payload["protected"] - if "logTail" in payload: - compact["logTail"] = str(payload.get("logTail") or "")[-12000:] - return compact - -def emit_json(payload, persist_large=True): - raw = json.dumps(payload, ensure_ascii=False, indent=2) - full_size = len(raw.encode("utf-8")) - if full_size <= REMOTE_STDOUT_JSON_LIMIT: - print(raw) - return - job_id = str(payload.get("jobId") or "") - paths = None - if persist_large: - if not job_id: - provider_slug = re.sub(r"[^A-Za-z0-9._-]+", "-", PROVIDER_ID.lower()).strip("-") or "provider" - job_id = 
"%s-gc-output-%s-%s" % (provider_slug, int(time.time()), os.getpid()) - paths = job_paths(job_id) - payload = dict(payload) - payload.update({ - "jobId": job_id, - "statePath": paths["state"], - "statusCommand": status_command(job_id), - "outputPersistedAt": now_iso(), - }) - write_json_atomic(paths["state"], payload) - elif job_id: - paths = job_paths(job_id) - compact = compact_payload_for_stdout(payload, full_size, job_id or None, paths) - print(json.dumps(compact, ensure_ascii=False, indent=2)) - -def remote_gc_job_status(): - job_id = job_id_or_none() - if not job_id: - return remote_gc_live_status(now_iso(), cluster_preflight()) - paths = job_paths(job_id) - if not os.path.isfile(paths["state"]): - return { - "ok": False, - "error": "gc-remote-job-not-found", - "jobId": job_id, - "statePath": paths["state"], - "logTail": read_file_tail(paths["log"]), - } - try: - with open(paths["state"], "r", encoding="utf-8") as handle: - payload = json.load(handle) - except Exception as exc: - return { - "ok": False, - "error": "gc-remote-job-state-invalid", - "jobId": job_id, - "message": str(exc), - "statePath": paths["state"], - "logTail": read_file_tail(paths["log"]), - } - payload["logTail"] = read_file_tail(paths["log"]) - return payload - -def remote_gc_live_status(observed_at, preflight): - memory_pressure = collect_memory_pressure() - ci_storage = ci_storage_snapshot() - compact_pvc = compact_pvc_attribution(ci_storage) - return { - "ok": True, - "action": "gc remote status", - "providerId": PROVIDER_ID, - "dryRun": True, - "mutation": False, - "observedAt": observed_at, - "disk": df_snapshot(), - "clusterPreflight": preflight, - "memoryPressure": compact_memory_pressure(memory_pressure), - "pvcAttribution": compact_pvc, - "policy": growth_watermark_policy(df_snapshot() or {}), - "next": { - "snapshot": "bun scripts/cli.ts gc remote %s snapshot --history-limit %s" % (PROVIDER_ID, int(OPTIONS.get("historyLimit") or 12)), - "plan": "bun scripts/cli.ts gc remote %s 
plan --target-use-percent --limit %s" % (PROVIDER_ID, int(OPTIONS.get("limit") or 50)), - "policy": "bun scripts/cli.ts gc remote %s policy plan" % PROVIDER_ID, - "jobStatus": "bun scripts/cli.ts gc remote %s status --job-id " % PROVIDER_ID, - }, - } - -def path_size(path): - try: - if os.path.islink(path) or os.path.isfile(path): - return os.lstat(path).st_size - if not os.path.isdir(path): - return 0 - total = 0 - for root, dirs, files in os.walk(path): - for name in files: - child = os.path.join(root, name) - try: - total += os.lstat(child).st_size - except OSError: - pass - for name in dirs: - child = os.path.join(root, name) - try: - if os.path.islink(child): - total += os.lstat(child).st_size - except OSError: - pass - return total - except OSError: - return 0 - -def du_size(path, timeout=20): - if not os.path.exists(path): - return None - result = command(["du", "-sxB1", path], timeout) - if result["exitCode"] != 0: - return path_size(path) - text = result["stdout"].strip() - if not text: - return 0 - try: - return int(text.split()[0]) - except Exception: - return path_size(path) - -def safe_int(value, default=0): - try: - if value is None: - return default - return int(value) - except Exception: - return default - -def iso_to_epoch(value): - try: - return calendar.timegm(time.strptime(str(value), "%Y-%m-%dT%H:%M:%SZ")) - except Exception: - return None - -def growth_snapshot_path(): - os.makedirs(REMOTE_GROWTH_SNAPSHOT_DIR, exist_ok=True) - provider_slug = re.sub(r"[^A-Za-z0-9._-]+", "-", PROVIDER_ID.lower()).strip("-") or "provider" - return os.path.join(REMOTE_GROWTH_SNAPSHOT_DIR, "%s.jsonl" % provider_slug) - -def read_growth_snapshots(limit=None): - path = growth_snapshot_path() - if not os.path.isfile(path): - return [] - try: - with open(path, "r", encoding="utf-8") as handle: - lines = handle.readlines() - except OSError: - return [] - rows = [] - for line in lines[-max(1, int(limit or 200)):]: - line = line.strip() - if not line: - continue - try: - 
item = json.loads(line) - except Exception: - continue - if isinstance(item, dict): - rows.append(item) - return rows - -def append_growth_snapshot(snapshot): - path = growth_snapshot_path() - os.makedirs(os.path.dirname(path), exist_ok=True) - with open(path, "a", encoding="utf-8") as handle: - handle.write(json.dumps(snapshot, ensure_ascii=False, sort_keys=True)) - handle.write("\n") - return path - -def source_size_item(source_id, label, path, cleanup_owner, timeout=20): - size = du_size(path, timeout) if os.path.exists(path) else None - return { - "id": source_id, - "label": label, - "path": path, - "exists": size is not None, - "sizeBytes": size, - "sizeHuman": fmt_bytes(size or 0), - "cleanupOwner": cleanup_owner, - } - -def pid_alive(pid): - try: - pid_int = int(pid) - except Exception: - return False - if pid_int <= 0: - return False - return os.path.exists("/proc/%s" % pid_int) - -def read_json_file(path): - try: - with open(path, "r", encoding="utf-8") as handle: - value = json.load(handle) - return value if isinstance(value, dict) else None - except Exception: - return None - -def read_pid_file(path): - try: - with open(path, "r", encoding="utf-8") as handle: - raw = handle.read().strip() - return int(raw) if re.match(r"^\d+$", raw) else None - except Exception: - return None - -def iso_or_epoch_to_epoch(value): - if value is None: - return None - if isinstance(value, (int, float)): - return float(value) - text_value = str(value).strip() - if not text_value: - return None - for fmt in ["%Y-%m-%dT%H:%M:%SZ", "%Y-%m-%dT%H:%M:%S.%fZ", "%Y-%m-%d %H:%M:%S"]: - try: - return float(calendar.timegm(time.strptime(text_value, fmt))) - except Exception: - pass - return None - -def redact_command_preview(value): - text_value = str(value or "") - text_value = re.sub(r"(?i)(api[_-]?key|token|authorization|password|secret)=\S+", r"\1=", text_value) - text_value = re.sub(r"(?i)(--(?:api-key|token|password|secret))\s+\S+", r"\1 ", text_value) - return text_value[:180] - 
def collect_process_pressure(patterns):
    """Summarise resident memory of processes matching any of ``patterns``.

    Runs ``ps`` once and matches each pattern case-insensitively as a
    substring of "<comm> <args>". Returns per-pattern counters plus the
    matching rows sorted by RSS, largest first, truncated to the CLI
    ``limit`` option (default 50).

    On ``ps`` failure the payload has ``ok: False`` and carries the same keys
    as the success payload with empty/zero values, so consumers see one shape.
    The legacy ``rows`` key is kept on the failure payload.
    """
    result = command(["ps", "-eo", "pid=,ppid=,rss=,comm=,args="], 10)
    if result["exitCode"] != 0:
        return {
            "ok": False,
            "error": "ps-failed",
            "command": bounded(result),
            "patterns": patterns,
            "processCount": 0,
            "rssBytes": 0,
            "rssHuman": fmt_bytes(0),
            "byPattern": {},
            "rows": [],
            "top": [],
        }
    lowered = [(pattern, pattern.lower()) for pattern in patterns]
    rows = []
    by_pattern = {}
    for line in result["stdout"].splitlines():
        parts = line.strip().split(None, 4)
        if len(parts) < 4:
            continue
        pid, ppid, rss_kib, comm = parts[:4]
        # ps may print no args column; fall back to the command name.
        args = parts[4] if len(parts) >= 5 else comm
        haystack = ("%s %s" % (comm, args)).lower()
        matches = [original for original, needle in lowered if needle and needle in haystack]
        if not matches:
            continue
        # The rss column is multiplied by 1024 to obtain bytes.
        rss_bytes = safe_int(rss_kib) * 1024
        rows.append({
            "pid": safe_int(pid),
            "ppid": safe_int(ppid),
            "comm": comm,
            "rssBytes": rss_bytes,
            "rssHuman": fmt_bytes(rss_bytes),
            "matchedPatterns": matches,
            "commandPreview": redact_command_preview(args),
        })
        for pattern in matches:
            bucket = by_pattern.setdefault(pattern, {"processCount": 0, "rssBytes": 0, "rssHuman": "0 B"})
            bucket["processCount"] += 1
            bucket["rssBytes"] += rss_bytes
            bucket["rssHuman"] = fmt_bytes(bucket["rssBytes"])
    rows.sort(key=lambda item: safe_int(item.get("rssBytes")), reverse=True)
    total = sum(safe_int(item.get("rssBytes")) for item in rows)
    return {
        "ok": True,
        "patterns": patterns,
        "processCount": len(rows),
        "rssBytes": total,
        "rssHuman": fmt_bytes(total),
        "byPattern": by_pattern,
        "top": rows[:int(OPTIONS.get("limit") or 50)],
    }
def observe_run_record(path, stale_hours):
    """Classify one observe-run directory for review (nothing is deleted).

    The pid is taken from the first readable pid file, else from the first
    integer-convertible pid field in ``heartbeat.json`` / ``manifest.json``.
    The timestamp is the first parsable time field in those files, falling
    back to the directory mtime. ``timestampBasis`` reports which of the two
    was actually used.

    ``staleSignal`` is set when the pid is not alive, the age is at least
    ``stale_hours`` and the status is terminal or absent.

    Raises OSError when ``path`` cannot be stat'ed.
    """
    stat = os.stat(path)
    heartbeat = read_json_file(os.path.join(path, "heartbeat.json")) or {}
    manifest = read_json_file(os.path.join(path, "manifest.json")) or {}
    metadata_sources = [heartbeat, manifest]

    pid = None
    for candidate in ["pid", "observer.pid", "browser.pid", "runner.pid"]:
        pid = read_pid_file(os.path.join(path, candidate))
        if pid is not None:
            break
    if pid is None:
        for source in metadata_sources:
            for key in ["pid", "processId", "runnerPid", "browserPid"]:
                if source.get(key) is not None:
                    try:
                        pid = int(source.get(key))
                        break
                    except Exception:
                        pass
            if pid is not None:
                break

    timestamp = None
    for source in metadata_sources:
        for key in ["updatedAt", "completedAt", "finishedAt", "stoppedAt", "startedAt", "createdAt"]:
            timestamp = iso_or_epoch_to_epoch(source.get(key))
            if timestamp is not None:
                break
        if timestamp is not None:
            break
    # Report the basis that was really used: the metadata files may exist
    # without carrying any parsable time field.
    timestamp_basis = "manifest-or-heartbeat"
    if timestamp is None:
        timestamp = stat.st_mtime
        timestamp_basis = "directory-mtime-fallback"

    age_hours = max(0.0, (time.time() - timestamp) / 3600.0)
    status = heartbeat.get("status") or manifest.get("status") or manifest.get("state")
    alive = pid_alive(pid)
    terminal = str(status or "").lower() in set(["done", "completed", "complete", "failed", "blocked", "timeout", "timed-out", "stopped", "exited"])
    stale_signal = (not alive) and age_hours >= float(stale_hours) and (terminal or status is None)
    return {
        "id": os.path.basename(path),
        "path": path,
        "pid": pid,
        "pidAlive": alive,
        "status": status,
        "ageHours": round(age_hours, 2),
        "timestampBasis": timestamp_basis,
        "staleSignal": stale_signal,
        "classification": "review-only",
    }
def collect_memory_pressure():
    """Combine process RSS, host memory and web-observe state into one report.

    Returns a ``skipped`` payload when no process patterns are configured.
    Otherwise ``ok`` mirrors the success of the process scan.
    """
    config_source = "config/unidesk-cli.yaml#gc.remote.targets.%s.memoryPressure" % PROVIDER_ID
    patterns = config_list(MEMORY_CONFIG, "processPatterns", [])
    if not patterns:
        return {
            "ok": True,
            "skipped": True,
            "reason": "no-yaml-process-patterns",
            "configSource": config_source + ".processPatterns",
        }
    processes = collect_process_pressure(patterns)
    observe = collect_web_observe_summary()
    host_memory = collect_memory_snapshot()
    by_pattern = processes.get("byPattern") or {}
    summary = {
        "matchedProcessCount": processes.get("processCount"),
        "matchedRssBytes": processes.get("rssBytes"),
        "matchedRssHuman": processes.get("rssHuman"),
        "chromeProcessCount": by_pattern.get("chrome", {}).get("processCount"),
        "observerRunCount": observe.get("runCount"),
        "activeObserverSignals": observe.get("activeSignalCount"),
        "staleObserverSignals": observe.get("staleSignalCount"),
        "observeStateBytes": observe.get("totalBytes"),
        "observeStateHuman": observe.get("totalHuman"),
    }
    drill_down = {
        "processes": "bun scripts/cli.ts gc remote %s snapshot --full --no-save" % PROVIDER_ID,
        "status": "bun scripts/cli.ts gc remote %s status --job-id " % PROVIDER_ID,
    }
    return {
        "ok": processes.get("ok") is True,
        "configSource": config_source,
        "hostMemory": host_memory,
        "processes": processes,
        "webObserve": observe,
        "summary": summary,
        "drillDown": drill_down,
    }
"/var/lib/rancher/k3s/storage", "owner-aware-pvc-retention", 45), - source_size_item("k3s-containerd", "k3s containerd", "/var/lib/rancher/k3s/agent/containerd", "observation-only", 45), - source_size_item("host-containerd", "host containerd", "/var/lib/containerd", "observation-only", 30), - source_size_item("kubelet", "kubelet state", "/var/lib/kubelet", "protected-runtime", 20), - source_size_item("var-log", "host logs", "/var/log", "gc-remote-logs-journald", 20), - source_size_item("tmp", "allowlisted tmp and other tmp", "/tmp", "gc-remote-tmp-allowlist", 20), - source_size_item("apt-cache", "apt archives", "/var/cache/apt/archives", "gc-remote-apt-cache", 10), - source_size_item("hwlab-v02-source", "HWLAB v0.2 source workspace", "/root/hwlab-v02", "protected-source", 20), - source_size_item("agentrun-source", "AgentRun source workspace", "/root/agentrun-v01", "protected-source", 20), - ] - return [item for item in sources if item.get("exists")] - -def containerd_breakdown_snapshot(): - rows = [ - source_size_item("k3s-containerd-content", "k3s containerd content store", "/var/lib/rancher/k3s/agent/containerd/io.containerd.content.v1.content", "observation-only", 30), - source_size_item("k3s-containerd-overlayfs", "k3s containerd overlay snapshots", "/var/lib/rancher/k3s/agent/containerd/io.containerd.snapshotter.v1.overlayfs", "observation-only", 30), - source_size_item("host-containerd-content", "host containerd content store", "/var/lib/containerd/io.containerd.content.v1.content", "observation-only", 20), - source_size_item("host-containerd-overlayfs", "host containerd overlay snapshots", "/var/lib/containerd/io.containerd.snapshotter.v1.overlayfs", "observation-only", 20), - ] - rows = [item for item in rows if item.get("exists")] - return { - "state": "observation-only", - "cleanupSupported": False, - "reason": "containerd cleanup still requires a reference-safe image/content classifier; this snapshot only classifies growth sources", - "breakdown": rows, 
def pv_host_path(pv):
    """Return the PV's ``hostPath.path`` or, failing that, ``local.path``; else None."""
    spec = (pv or {}).get("spec") or {}
    for volume_kind in ("hostPath", "local"):
        candidate = (spec.get(volume_kind) or {}).get("path")
        if isinstance(candidate, str) and candidate:
            return candidate
    return None


def pvc_owner_group(namespace, owner):
    """Map a PVC's namespace and owner name to a coarse owner group label."""
    namespace = str(namespace or "")
    owner = str(owner or "")
    if namespace == "agentrun-ci":
        return "agentrun"
    if namespace == "hwlab-ci":
        # AgentRun workloads that run inside the hwlab-ci namespace are
        # attributed to agentrun by their owner-name prefix.
        return "agentrun" if owner.startswith("agentrun-") else "hwlab"
    if namespace.startswith("hwlab-"):
        return "hwlab-runtime"
    return "other"


def parse_k8s_quantity(value):
    """Parse a Kubernetes resource quantity into an integer (truncated).

    Supports plain numbers, decimal SI suffixes (k, M, G, T, P, E), binary
    suffixes (Ki, Mi, Gi, Ti, Pi, Ei) and decimal exponent notation (1e3,
    1E3). Uppercase ``K`` is still accepted for backward compatibility even
    though Kubernetes itself writes kilo as ``k``. Milli (``m``) quantities,
    signed values and anything else unparsable return None.
    """
    if value is None:
        return None
    raw = str(value).strip()
    match = re.match(
        r"^([0-9]+(?:\.[0-9]+)?)(?:([eE][+-]?[0-9]+)|(Ki|Mi|Gi|Ti|Pi|Ei|k|K|M|G|T|P|E))?$",
        raw,
    )
    if not match:
        return None
    number, exponent, suffix = match.groups()
    multiplier = {
        None: 1,
        "k": 1000,
        "K": 1000,
        "M": 1000**2,
        "G": 1000**3,
        "T": 1000**4,
        "P": 1000**5,
        "E": 1000**6,
        "Ki": 1024,
        "Mi": 1024**2,
        "Gi": 1024**3,
        "Ti": 1024**4,
        "Pi": 1024**5,
        "Ei": 1024**6,
    }[suffix]
    try:
        if exponent:
            return int(float(number + exponent))
        return int(float(number) * multiplier)
    except OverflowError:
        # An exponent large enough to overflow float cannot be represented.
        return None


def metadata_owner(meta):
    """Derive ``(owner_kind, owner_name, owner_refs)`` from object metadata.

    The first ownerReference wins and up to five references are summarised.
    Without ownerReferences, the first present well-known label or annotation
    is reported with kind "Label". Returns ``(None, None, [])`` when nothing
    identifies an owner. Null reference entries are tolerated.
    """
    meta = meta or {}
    refs = meta.get("ownerReferences") or []
    if refs:
        first = refs[0] or {}
        summary = [
            {"kind": (item or {}).get("kind"), "name": (item or {}).get("name")}
            for item in refs[:5]
        ]
        return first.get("kind"), first.get("name"), summary
    labels = meta.get("labels") or {}
    annotations = meta.get("annotations") or {}
    for key in [
        "tekton.dev/pipelineRun",
        "tekton.dev/taskRun",
        "agentrun.unidesk/run-id",
        "hwlab.unidesk/run-id",
        "app.kubernetes.io/instance",
    ]:
        value = labels.get(key) or annotations.get(key)
        if value:
            return "Label", value, []
    return None, None, []
config_str(PVC_CONFIG, "agentrunNode", PROVIDER_ID) - agentrun_lane = config_str(PVC_CONFIG, "agentrunLane", "v02") - limit = config_int(PVC_CONFIG, "limit", int(OPTIONS.get("limit") or 50), minimum=1, maximum=5000) - pv_data = kubectl_json(["get", "pv"], 30) or {} - pvc_data = kubectl_json(["get", "pvc", "-A"], 30) or {} - pod_data = kubectl_json(["get", "pod", "-A"], 30) or {} - pvs = {} - for pv in pv_data.get("items") or []: - meta = pv.get("metadata") or {} - name = meta.get("name") - if name: - pvs[name] = pv - mounts = {} - for pod in pod_data.get("items") or []: - meta = pod.get("metadata") or {} - ns = str(meta.get("namespace") or "") - pod_name = str(meta.get("name") or "") - phase = str(((pod.get("status") or {}).get("phase")) or "") - if phase in set(["Succeeded", "Failed"]): - continue - spec = pod.get("spec") or {} - for vol in spec.get("volumes") or []: - claim = (vol.get("persistentVolumeClaim") or {}).get("claimName") - if claim: - mounts.setdefault((ns, claim), []).append(pod_name) - rows = [] - for pvc in pvc_data.get("items") or []: - meta = pvc.get("metadata") or {} - spec = pvc.get("spec") or {} - status = pvc.get("status") or {} - ns = str(meta.get("namespace") or "") - name = str(meta.get("name") or "") - if ns not in namespaces: - continue - volume = str(spec.get("volumeName") or "") - pv = pvs.get(volume) or {} - pv_spec = pv.get("spec") or {} - pv_meta = pv.get("metadata") or {} - owner_kind, owner_name, owner_refs = metadata_owner(meta) - requested = parse_k8s_quantity((((spec.get("resources") or {}).get("requests") or {}).get("storage"))) - host_path = pv_host_path(pv) - active = sorted(mounts.get((ns, name), [])) - estimated = du_size(host_path, 8) if host_path else None - candidate_reasons = [] - if not active: - candidate_reasons.append("no-active-mount-observed") - if status.get("phase") != "Bound": - candidate_reasons.append("pvc-not-bound") - if (pv.get("status") or {}).get("phase") == "Released": - 
candidate_reasons.append("pv-released") - review_candidate = ns in candidate_namespaces and len(candidate_reasons) > 0 - rows.append({ - "namespace": ns, - "pvc": name, - "volume": volume or None, - "phase": status.get("phase"), - "pvPhase": (pv.get("status") or {}).get("phase"), - "ownerKind": owner_kind, - "owner": owner_name, - "ownerRefs": owner_refs, - "ownerGroup": pvc_owner_group(ns, owner_name), - "storageClass": spec.get("storageClassName") or pv_spec.get("storageClassName"), - "reclaimPolicy": pv_spec.get("persistentVolumeReclaimPolicy"), - "requestedBytes": requested, - "requestedHuman": fmt_bytes(requested or 0), - "hostPath": host_path, - "pvCreatedAt": (pv_meta.get("creationTimestamp") if isinstance(pv_meta, dict) else None), - "pvcCreatedAt": meta.get("creationTimestamp"), - "activeMountPods": active, - "estimatedBytes": estimated, - "estimatedHuman": fmt_bytes(estimated or 0), - "reviewCandidate": review_candidate, - "reviewReasons": candidate_reasons, - "dryRunOnly": True, - }) - rows.sort(key=lambda item: safe_int(item.get("estimatedBytes")), reverse=True) - by_namespace = {} - by_owner_group = {} - for row in rows: - for bucket, key in [(by_namespace, row.get("namespace") or "unknown"), (by_owner_group, row.get("ownerGroup") or "unknown")]: - current = bucket.setdefault(key, {"count": 0, "estimatedBytes": 0, "activeMountCount": 0}) - current["count"] += 1 - current["estimatedBytes"] += safe_int(row.get("estimatedBytes")) - current["activeMountCount"] += len(row.get("activeMountPods") or []) - current["estimatedHuman"] = fmt_bytes(current["estimatedBytes"]) - review_candidates = [row for row in rows if row.get("reviewCandidate")] - return { - "scope": "YAML-configured PVC namespaces", - "configSource": "config/unidesk-cli.yaml#gc.remote.targets.%s.pvcAttribution" % PROVIDER_ID, - "namespaces": sorted(namespaces), - "candidateNamespaces": sorted(candidate_namespaces), - "pvcCount": len(rows), - "reviewCandidateCount": len(review_candidates), - 
"estimatedBytes": sum(safe_int(row.get("estimatedBytes")) for row in rows), - "estimatedHuman": fmt_bytes(sum(safe_int(row.get("estimatedBytes")) for row in rows)), - "requestedBytes": sum(safe_int(row.get("requestedBytes")) for row in rows), - "requestedHuman": fmt_bytes(sum(safe_int(row.get("requestedBytes")) for row in rows)), - "byNamespace": by_namespace, - "byOwnerGroup": by_owner_group, - "topPvcs": rows[:limit], - "reviewCandidates": review_candidates[:limit], - "handoff": { - "hwlab": { - "dryRun": "bun scripts/cli.ts hwlab nodes control-plane cleanup-runs --node %s --lane %s --min-age-minutes 30 --limit 200 --dry-run" % (hwlab_node, hwlab_lane), - "releasedPvs": "bun scripts/cli.ts hwlab nodes control-plane cleanup-released-pvs --node %s --lane %s --limit 200 --dry-run" % (hwlab_node, hwlab_lane), - }, - "agentrun": { - "dryRun": "bun scripts/cli.ts agentrun control-plane cleanup-runs --node %s --lane %s --min-age-minutes 30 --limit 200 --dry-run" % (agentrun_node, agentrun_lane), - "releasedPvs": "bun scripts/cli.ts agentrun control-plane cleanup-released-pvs --node %s --lane %s --limit 200 --dry-run" % (agentrun_node, agentrun_lane), - }, - }, - "policy": "analysis-only; remote GC never deletes PVC/PV/local-path data and only hands off to owner-aware retention commands", - } - -def compact_pvc_row(row): - return { - "namespace": row.get("namespace"), - "pvc": row.get("pvc"), - "volume": row.get("volume"), - "phase": row.get("phase"), - "pvPhase": row.get("pvPhase"), - "ownerKind": row.get("ownerKind"), - "owner": row.get("owner"), - "ownerGroup": row.get("ownerGroup"), - "storageClass": row.get("storageClass"), - "reclaimPolicy": row.get("reclaimPolicy"), - "requestedBytes": row.get("requestedBytes"), - "requestedHuman": row.get("requestedHuman"), - "estimatedBytes": row.get("estimatedBytes"), - "estimatedHuman": row.get("estimatedHuman"), - "activeMountCount": len(row.get("activeMountPods") or []), - "activeMountPods": (row.get("activeMountPods") or 
def compact_pvc_attribution(payload):
    """Trim a PVC attribution payload to headline fields plus one row per list.

    With the ``full`` option set the payload is returned unchanged.
    """
    if bool(OPTIONS.get("full")):
        return payload
    limit = 1
    compacted = {}
    for field in [
        "scope",
        "configSource",
        "namespaces",
        "candidateNamespaces",
        "pvcCount",
        "reviewCandidateCount",
        "estimatedBytes",
        "estimatedHuman",
        "requestedBytes",
        "requestedHuman",
        "byNamespace",
        "byOwnerGroup",
    ]:
        compacted[field] = payload.get(field)
    for field in ["topPvcs", "reviewCandidates"]:
        head = (payload.get(field) or [])[:limit]
        compacted[field] = [compact_pvc_row(entry) for entry in head if isinstance(entry, dict)]
    compacted["handoff"] = payload.get("handoff")
    compacted["policy"] = payload.get("policy")
    compacted["compacted"] = True
    compacted["fullDisclosure"] = "rerun with --full for hostPath, creation timestamps and complete row details"
    return compacted


def compact_ci_storage_summary(payload):
    """Reduce a CI storage payload to its totals, without any row detail."""
    summary = {
        field: payload.get(field)
        for field in (
            "scope",
            "configSource",
            "pvcCount",
            "reviewCandidateCount",
            "estimatedBytes",
            "estimatedHuman",
            "requestedBytes",
            "requestedHuman",
        )
    }
    summary["compacted"] = True
    summary["fullDisclosure"] = "use pvcAttribution or --full for row-level details"
    return summary
def compact_memory_summary(payload):
    """Reduce a memory-pressure payload to its summary and web-observe counters."""
    observe = payload.get("webObserve") or {}
    observe_fields = (
        "rootCount",
        "totalBytes",
        "totalHuman",
        "runCount",
        "activeSignalCount",
        "staleSignalCount",
    )
    row_limit = int(OPTIONS.get("limit") or 50)
    return {
        "ok": payload.get("ok"),
        "configSource": payload.get("configSource"),
        "summary": payload.get("summary"),
        "webObserve": {field: observe.get(field) for field in observe_fields},
        "compacted": True,
        "drillDown": "bun scripts/cli.ts gc remote %s status --limit %s" % (PROVIDER_ID, row_limit),
    }
def growth_watermark_policy(root_disk):
    """Translate root-disk usage into a watermark state and recommended action.

    ``root_disk`` is expected to be a dict with ``usePercent``. Anything else,
    or a missing value, yields the "unknown" state.
    """
    percent = root_disk.get("usePercent") if isinstance(root_disk, dict) else None
    if percent is None:
        state, action = "unknown", "collect-snapshot"
    else:
        # Ascending ceilings; the first one the usage is below decides.
        state, action = "emergency", "restore-runtime-then-file-evidence"
        ladder = (
            (75, "healthy", "observe-trend"),
            (80, "watch", "run-dry-run-plan"),
            (85, "maintenance", "schedule-owner-aware-retention"),
        )
        for ceiling, ladder_state, ladder_action in ladder:
            if percent < ceiling:
                state, action = ladder_state, ladder_action
                break
    watermarks = [
        {"range": "<75%", "action": "trend only"},
        {"range": "75%-80%", "action": "run dry-run plan and identify source"},
        {"range": "80%-85%", "action": "small owner-aware retention run"},
        {"range": ">=85%", "action": "runtime recovery first, then root-cause growth source"},
    ]
    return {
        "state": state,
        "recommendedAction": action,
        "watermarks": watermarks,
        "growthThresholdPolicy": "If bytes/day remains high for consecutive snapshots, act before 80%; exact threshold should be set from the first week of saved snapshots.",
    }
retention and retention.get(key) is not None: - unit = "bytes" if key.endswith("Bytes") else "count" - metrics["registry.%s" % key] = {"value": safe_int(retention.get(key)), "unit": unit, "label": "registry %s" % key} - return metrics - -def delta_metric_rows(before, after): - before_metrics = snapshot_metric_map(before) - after_metrics = snapshot_metric_map(after) - before_ts = iso_to_epoch(before.get("observedAt")) - after_ts = iso_to_epoch(after.get("observedAt")) - seconds = (after_ts - before_ts) if before_ts is not None and after_ts is not None else None - rows = [] - for key in sorted(set(before_metrics.keys()) | set(after_metrics.keys())): - old = before_metrics.get(key, {"value": 0, "unit": (after_metrics.get(key) or {}).get("unit"), "label": key}) - new = after_metrics.get(key, {"value": 0, "unit": old.get("unit"), "label": old.get("label")}) - delta = safe_int(new.get("value")) - safe_int(old.get("value")) - row = { - "key": key, - "label": new.get("label") or old.get("label") or key, - "unit": new.get("unit") or old.get("unit"), - "before": old.get("value"), - "after": new.get("value"), - "delta": delta, - } - if row["unit"] == "bytes": - row["beforeHuman"] = fmt_bytes(row["before"] or 0) - row["afterHuman"] = fmt_bytes(row["after"] or 0) - row["deltaHuman"] = ("-" if delta < 0 else "") + fmt_bytes(abs(delta)) - if seconds and seconds > 0: - per_day = int(delta * 86400 / seconds) - row["perDayBytes"] = per_day - row["perDayHuman"] = ("-" if per_day < 0 else "") + fmt_bytes(abs(per_day)) + "/day" - rows.append(row) - rows.sort(key=lambda item: safe_int(item.get("delta")), reverse=True) - return {"durationSeconds": seconds, "metrics": rows} - -def growth_trend_payload(points): - points = [point for point in points if isinstance(point, dict)] - if len(points) < 2: - return { - "pointCount": len(points), - "state": "insufficient-history", - "message": "Run snapshot at least twice to compute deltas.", - } - latest_delta = delta_metric_rows(points[-2], 
def compact_metric_rows(rows, limit=3):
    """Keep the first ``limit`` metric rows, reduced to their display fields."""
    display_fields = ("key", "label", "unit", "delta", "deltaHuman", "perDayHuman")
    return [
        {field: row.get(field) for field in display_fields}
        for row in (rows or [])[:limit]
    ]
def collect_growth_snapshot(observed_at, preflight):
    """Collect one read-only growth snapshot of the host.

    Gathers root disk usage, per-source sizes, PVC attribution, memory
    pressure, registry size and the containerd breakdown. With the ``full``
    option the raw PVC and memory payloads and the full command map are
    published; otherwise compact summaries and a short command map are used.

    ``diagnosticStateMutation`` reflects the ``saveSnapshot`` option (default
    True). This function itself does not write the snapshot.
    """
    full = bool(OPTIONS.get("full"))
    history_limit = int(OPTIONS.get("historyLimit") or 12)
    trend_command = "bun scripts/cli.ts gc remote %s trend --history-limit %s" % (PROVIDER_ID, history_limit)

    root_disk = df_snapshot()
    sources = disk_source_snapshot()
    ci_storage = ci_storage_snapshot()
    memory_pressure = collect_memory_pressure()
    # NOTE(review): the previous code also computed
    # compact_pvc_attribution(ci_storage) and never used the result; the dead
    # call is removed. Confirm whether that compact form was meant to be
    # published under its own key.
    registry = registry_growth_snapshot()
    containerd = containerd_breakdown_snapshot()

    if full:
        public_pvc = ci_storage
        public_memory = memory_pressure
        handoff = ci_storage.get("handoff") or {}
        commands = {
            "snapshot": "bun scripts/cli.ts gc remote %s snapshot --include-hwlab-registry --history-limit %s" % (PROVIDER_ID, history_limit),
            "trend": trend_command,
            "registryPlan": "bun scripts/cli.ts gc remote %s plan --target-use-percent 70 --include-hwlab-registry --limit 50" % PROVIDER_ID,
            "hwlabCiRetention": (handoff.get("hwlab") or {}).get("dryRun"),
            "agentrunRetention": (handoff.get("agentrun") or {}).get("dryRun"),
            "remotePolicy": "bun scripts/cli.ts gc remote %s policy plan" % PROVIDER_ID,
        }
    else:
        public_pvc = compact_ci_storage_summary(ci_storage)
        public_memory = compact_memory_summary(memory_pressure)
        commands = {
            "trend": trend_command,
            "status": "bun scripts/cli.ts gc remote %s status --limit %s" % (PROVIDER_ID, int(OPTIONS.get("limit") or 50)),
            "full": "bun scripts/cli.ts gc remote %s snapshot --full --no-save" % PROVIDER_ID,
        }
    return {
        "ok": True,
        "action": "gc remote snapshot",
        "providerId": PROVIDER_ID,
        "dryRun": True,
        "mutation": False,
        "diagnosticStateMutation": bool(OPTIONS.get("saveSnapshot", True)),
        "observedAt": observed_at,
        "rootDisk": root_disk,
        "clusterPreflight": preflight,
        "sources": sources,
        "registry": registry,
        "pvcAttribution": public_pvc,
        "memoryPressure": public_memory,
        "containerd": containerd,
        # df_snapshot() may return None; the policy helper gets a dict either way.
        "policy": growth_watermark_policy(root_disk or {}),
        "commands": commands,
    }
"usePercent": int(parts[4].replace("%", "")), - "mount": parts[5], - } - -def fmt_bytes(value): - units = ["B", "KiB", "MiB", "GiB", "TiB"] - size = float(max(0, int(value or 0))) - idx = 0 - while size >= 1024 and idx < len(units) - 1: - size /= 1024.0 - idx += 1 - return ("%0.0f %s" if size >= 10 or idx == 0 else "%0.1f %s") % (size, units[idx]) - -def disk_use_percent(size_bytes, used_bytes): - try: - size = int(size_bytes or 0) - used = int(used_bytes or 0) - except Exception: - return None - if size <= 0: - return None - return int((max(0, used) * 100 + size - 1) // size) - -def parse_journal_usage(text): - m = re.search(r"take up\s+([0-9.]+)\s*([KMGT]?)(?:i?B|B)?", text, re.I) - if not m: - return None - mult = {"": 1, "K": 1024, "M": 1024**2, "G": 1024**3, "T": 1024**4}.get(m.group(2).upper(), 1) - return int(float(m.group(1)) * mult) - -def parse_docker_human_size(raw): - raw = str(raw).split("(")[0].strip() - m = re.match(r"^([0-9.]+)\s*([KMGT]?B)$", raw, re.I) - if not m: - return None - mult = {"B": 1, "KB": 1000, "MB": 1000**2, "GB": 1000**3, "TB": 1000**4}.get(m.group(2).upper(), 1) - return int(float(m.group(1)) * mult) - -def parse_docker_build_cache(text): - for line in text.splitlines(): - if not line.startswith("Build Cache"): - continue - match = re.match(r"^Build Cache\s+\S+\s+\S+\s+(\S+)\s+(\S+)", line.strip()) - if not match: - continue - size = parse_docker_human_size(match.group(1)) - reclaim = parse_docker_human_size(match.group(2)) - if size is None or reclaim is None: - return None - return {"sizeBytes": size, "reclaimableBytes": reclaim} - return None - -def docker_containers(): - ps = command(["docker", "ps", "-qa", "--no-trunc"], 5) - if ps["exitCode"] != 0 or not ps["stdout"].strip(): - return [] - ids = ps["stdout"].split() - inspect = command(["docker", "inspect"] + ids, 10) - if inspect["exitCode"] != 0 or not inspect["stdout"].strip(): - return [] - try: - data = json.loads(inspect["stdout"]) - except Exception: - return [] - rows 
= [] - for item in data: - cfg = item.get("Config") or {} - rows.append({ - "id": str(item.get("Id") or ""), - "name": str(item.get("Name") or "").lstrip("/"), - "image": str(cfg.get("Image") or item.get("Image") or ""), - "logPath": str(item.get("LogPath") or ""), - }) - return [row for row in rows if row["id"]] - -def cluster_preflight(): - node_cmd = command(["sh", "-lc", "KUBECONFIG=/etc/rancher/k3s/k3s.yaml kubectl get nodes -o jsonpath='{range .items[*]}{.metadata.name}{\"\\n\"}{end}' 2>/dev/null"], 10) - pods_cmd = command(["sh", "-lc", "KUBECONFIG=/etc/rancher/k3s/k3s.yaml kubectl get pods -n hwlab-dev --no-headers 2>/dev/null | wc -l"], 10) - nodes = [line.strip() for line in node_cmd["stdout"].splitlines() if line.strip()] - expected = EXPECTED_G14_NODE if PROVIDER_ID.upper() == "G14" else None - ok = True - reason = "ok" - if expected is not None and expected not in nodes: - ok = False - reason = "expected-g14-node-missing" - return { - "ok": ok, - "reason": reason, - "providerId": PROVIDER_ID, - "hostname": command(["hostname"], 5)["stdout"].strip(), - "expectedNode": expected, - "nodes": nodes, - "nodeCommand": bounded(node_cmd), - "hwlabDevPodCount": int(pods_cmd["stdout"].strip() or "0") if pods_cmd["exitCode"] == 0 else None, - "hwlabDevPodCommand": bounded(pods_cmd), - } - -def active_hwlab_ci_writes(): - result = command(["sh", "-lc", "KUBECONFIG=/etc/rancher/k3s/k3s.yaml kubectl get pipelinerun,taskrun -n hwlab-ci --no-headers 2>/dev/null | awk '$2 != \"True\" && $2 != \"False\" {print}' | head -40"], 15) - lines = [line for line in (result.get("stdout") or "").splitlines() if line.strip()] - return {"ok": result["exitCode"] == 0, "activeCount": len(lines), "activePreview": lines, "command": bounded(result)} - -def active_hwlab_ci_jobs(): - result = command(["sh", "-lc", "KUBECONFIG=/etc/rancher/k3s/k3s.yaml kubectl get jobs -n hwlab-ci --no-headers 2>/dev/null | awk '$2 != \"Complete\" && $2 != \"Failed\" {print}' | head -40"], 15) - lines = 
[line for line in (result.get("stdout") or "").splitlines() if line.strip()] - return {"ok": result["exitCode"] == 0, "activeCount": len(lines), "activePreview": lines, "command": bounded(result)} - -def wait_no_active_hwlab_ci(timeout=180): - deadline = time.time() + timeout - last = None - while time.time() < deadline: - writes = active_hwlab_ci_writes() - jobs = active_hwlab_ci_jobs() - last = {"writes": writes, "jobs": jobs} - if writes.get("ok") and jobs.get("ok") and int(writes.get("activeCount") or 0) == 0 and int(jobs.get("activeCount") or 0) == 0: - return {"ok": True, "last": last} - time.sleep(5) - return {"ok": False, "last": last} - -def kubectl_json(args, timeout=20): - result = command(["env", "KUBECONFIG=/etc/rancher/k3s/k3s.yaml", "kubectl"] + args + ["-o", "json"], timeout) - if result["exitCode"] != 0: - return None - try: - return json.loads(result["stdout"] or "{}") - except Exception: - return None - -def kctl(args, timeout=30): - return command(["env", "KUBECONFIG=/etc/rancher/k3s/k3s.yaml", "kubectl"] + args, timeout) - -def workload_image_refs(): - result = command(["sh", "-lc", "KUBECONFIG=/etc/rancher/k3s/k3s.yaml kubectl get deploy,sts,ds,pod -A -o jsonpath='{range .items[*]}{range .spec.containers[*]}{.image}{\"\\n\"}{end}{range .spec.initContainers[*]}{.image}{\"\\n\"}{end}{range .spec.template.spec.containers[*]}{.image}{\"\\n\"}{end}{range .spec.template.spec.initContainers[*]}{.image}{\"\\n\"}{end}{end}' 2>/dev/null | sort -u"], 30) - refs = set() - digests = set() - for image in (result.get("stdout") or "").splitlines(): - image = image.strip() - if not image.startswith("127.0.0.1:5000/"): - continue - ref = image.split("127.0.0.1:5000/", 1)[1] - if "@sha256:" in ref: - repo, digest = ref.split("@", 1) - refs.add((repo, "@" + digest)) - digests.add("sha256:" + digest.split(":", 1)[1]) - elif ":" in ref: - repo, tag = ref.rsplit(":", 1) - refs.add((repo, tag)) - return refs, digests, bounded(result) - -def registry_request(method, 
path, headers=None, timeout=20): - url = "http://127.0.0.1:5000" + path - req = urllib.request.Request(url, method=method, headers=headers or {}) - with urllib.request.urlopen(req, timeout=timeout) as response: - body = response.read() - return {"status": response.status, "headers": dict(response.headers), "body": body.decode("utf-8", errors="replace")} - -def registry_tag_rows(): - rows = [] - root = REGISTRY_REPOSITORY_ROOT - if not os.path.isdir(root): - return rows - for repo_root, dirs, files in os.walk(root): - if os.path.basename(repo_root) != "tags": - continue - rel = os.path.relpath(repo_root, root) - suffix = "/_manifests/tags" - if not rel.endswith(suffix): - continue - repo = rel[:-len(suffix)] - try: - tags = os.listdir(repo_root) - except OSError: - continue - for tag in sorted(tags): - link = os.path.join(repo_root, tag, "current", "link") - if not os.path.isfile(link): - continue - try: - with open(link, "r", encoding="utf-8") as handle: - digest = handle.read().strip() - stat = os.stat(link) - except OSError: - continue - rows.append({ - "repo": repo, - "tag": tag, - "digest": digest, - "mtime": stat.st_mtime, - "mtimeIso": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime(stat.st_mtime)), - "path": os.path.join(repo_root, tag), - }) - return rows - -def registry_revision_rows(): - rows = [] - root = REGISTRY_REPOSITORY_ROOT - if not os.path.isdir(root): - return rows - for repo_root, dirs, files in os.walk(root): - if os.path.basename(repo_root) != "sha256": - continue - rel = os.path.relpath(repo_root, root) - suffix = "/_manifests/revisions/sha256" - if not rel.endswith(suffix): - continue - repo = rel[:-len(suffix)] - try: - revisions = os.listdir(repo_root) - except OSError: - continue - for digest_hex in sorted(revisions): - path = os.path.join(repo_root, digest_hex) - link = os.path.join(path, "link") - if not os.path.isfile(link): - continue - try: - with open(link, "r", encoding="utf-8") as handle: - digest = handle.read().strip() - stat = 
os.stat(link) - except OSError: - continue - rows.append({ - "repo": repo, - "digest": digest, - "mtime": stat.st_mtime, - "mtimeIso": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime(stat.st_mtime)), - "path": path, - }) - return rows - -def registry_retention_repo(repo): - return repo.startswith("hwlab/hwlab-") or repo.startswith("hwlab/cache/hwlab-") - -def registry_digest_hex(digest): - if not isinstance(digest, str) or not digest.startswith("sha256:"): - return None - value = digest.split(":", 1)[1] - if re.match(r"^[0-9a-f]{64}$", value) is None: - return None - return value - -def registry_blob_data_path(digest): - value = registry_digest_hex(digest) - if value is None: - return None - return os.path.join(REGISTRY_ROOT, "docker/registry/v2/blobs/sha256", value[:2], value, "data") - -_manifest_cache = {} -def registry_manifest_json(digest): - if digest in _manifest_cache: - return _manifest_cache[digest] - path = registry_blob_data_path(digest) - if path is None or not os.path.isfile(path): - _manifest_cache[digest] = None - return None - try: - with open(path, "rb") as handle: - data = handle.read(8 * 1024 * 1024) - value = json.loads(data.decode("utf-8")) - except Exception: - value = None - _manifest_cache[digest] = value - return value - -def registry_manifest_refs(digest): - manifest = registry_manifest_json(digest) - if not isinstance(manifest, dict): - return set() - refs = set() - config = manifest.get("config") or {} - config_digest = config.get("digest") - if isinstance(config_digest, str) and registry_digest_hex(config_digest) is not None: - refs.add(config_digest) - for item in manifest.get("layers") or []: - item_digest = (item or {}).get("digest") - if isinstance(item_digest, str) and registry_digest_hex(item_digest) is not None: - refs.add(item_digest) - for item in manifest.get("manifests") or []: - item_digest = (item or {}).get("digest") - if isinstance(item_digest, str) and registry_digest_hex(item_digest) is not None: - 
refs.add(item_digest) - return refs - -def registry_digest_closure(seed): - seen = set() - stack = list(seed) - while stack: - digest = stack.pop() - if digest in seen or registry_digest_hex(digest) is None: - continue - seen.add(digest) - for child in registry_manifest_refs(digest): - if child not in seen: - stack.append(child) - return seen - -def registry_blob_size(digest): - path = registry_blob_data_path(digest) - if path is None or not os.path.isfile(path): - return 0 - try: - return int(os.lstat(path).st_blocks) * 512 - except OSError: - return 0 - -def estimate_registry_reclaim(delete_manifest_digests, kept_manifest_digests): - deleted = registry_digest_closure(delete_manifest_digests) - kept = registry_digest_closure(kept_manifest_digests) - reclaim = deleted - kept - return sum(registry_blob_size(digest) for digest in reclaim) - -def plan_registry_retention(): - keep_per_repo = int(OPTIONS.get("registryKeepPerRepo") if OPTIONS.get("registryKeepPerRepo") is not None else 5) - min_age_hours = float(OPTIONS.get("registryMinAgeHours") if OPTIONS.get("registryMinAgeHours") is not None else 48) - cutoff = time.time() - min_age_hours * 3600 - refs, digests, refs_command = workload_image_refs() - rows = registry_tag_rows() - revision_rows = registry_revision_rows() - by_repo = {} - for row in rows: - by_repo.setdefault(row["repo"], []).append(row) - keep = set() - keep_reasons = {} - for repo, items in by_repo.items(): - items.sort(key=lambda item: item["mtime"], reverse=True) - for row in items[:keep_per_repo]: - key = (row["repo"], row["tag"]) - keep.add(key) - keep_reasons[key] = "latest-per-repo" - for row in items: - key = (row["repo"], row["tag"]) - if row["tag"] in REGISTRY_PROTECTED_TAGS: - keep.add(key) - keep_reasons[key] = "protected-tag" - if key in refs: - keep.add(key) - keep_reasons[key] = "workload-tag-ref" - if row["digest"] in digests: - keep.add(key) - keep_reasons[key] = "workload-digest-ref" - if row["repo"].startswith("hwlab/cache/"): - 
keep.add(key) - keep_reasons[key] = "cache-repo" - if row["mtime"] >= cutoff: - keep.add(key) - keep_reasons[key] = "recent-tag" - delete_rows = [] - kept_count = 0 - delete_by_repo = {} - keep_by_repo = {} - kept_digests = set() - for row in rows: - key = (row["repo"], row["tag"]) - should_delete = ( - key not in keep - and row["repo"].startswith("hwlab/hwlab-") - and re.match(r"^[0-9a-f]{7,40}$", row["tag"]) is not None - ) - if should_delete: - delete_rows.append(row) - delete_by_repo[row["repo"]] = delete_by_repo.get(row["repo"], 0) + 1 - else: - kept_count += 1 - kept_digests.add(row["digest"]) - keep_by_repo[row["repo"]] = keep_by_repo.get(row["repo"], 0) + 1 - protected_digests = kept_digests | digests - protected_digests.update(row["digest"] for row in revision_rows if not registry_retention_repo(row["repo"])) - protected_digests = registry_digest_closure(protected_digests) - delete_revision_rows = [] - revision_delete_by_repo = {} - for row in revision_rows: - if not registry_retention_repo(row["repo"]): - continue - if row["digest"] in protected_digests: - continue - delete_revision_rows.append(row) - revision_delete_by_repo[row["repo"]] = revision_delete_by_repo.get(row["repo"], 0) + 1 - kept_revision_digests = set(row["digest"] for row in revision_rows if row not in delete_revision_rows) - delete_revision_digests = set(row["digest"] for row in delete_revision_rows) - deletable_manifests = {} - for row in delete_rows: - if row["digest"] in kept_digests: - continue - deletable_manifests.setdefault(row["repo"], set()).add(row["digest"]) - for row in delete_revision_rows: - deletable_manifests.setdefault(row["repo"], set()).add(row["digest"]) - deletable_manifest_count = sum(len(items) for items in deletable_manifests.values()) - registry_size = du_size(REGISTRY_ROOT, 30) or 0 - estimate = estimate_registry_reclaim(delete_revision_digests, kept_revision_digests) - return { - "tagRows": rows, - "revisionRows": revision_rows, - "deleteRows": delete_rows, - 
"deleteRevisionRows": delete_revision_rows, - "summary": { - "totalTags": len(rows), - "totalRevisions": len(revision_rows), - "repoCount": len(by_repo), - "keepPerRepo": keep_per_repo, - "minAgeHours": min_age_hours, - "protectedWorkloadRefs": len(refs), - "protectedDigestRefs": len(digests), - "protectedDigestClosure": len(protected_digests), - "keptTags": kept_count, - "deleteTags": len(delete_rows), - "deleteManifests": deletable_manifest_count, - "deleteRevisions": len(delete_revision_rows), - "deleteByRepo": delete_by_repo, - "revisionDeleteByRepo": revision_delete_by_repo, - "keepByRepo": keep_by_repo, - "registrySizeBytes": registry_size, - "estimatedReclaimBytes": estimate, - }, - "deleteManifestsByRepo": {repo: sorted(list(digests)) for repo, digests in deletable_manifests.items()}, - "refsCommand": refs_command, - } - -def registry_deployment_preflight(): - dep = kubectl_json(["-n", "hwlab-ci", "get", "deploy", "hwlab-registry"], 20) - if not dep: - return {"ok": False, "reason": "registry-deployment-missing"} - spec = ((dep.get("spec") or {}).get("template") or {}).get("spec") or {} - containers = spec.get("containers") or [] - volumes = spec.get("volumes") or [] - registry_container = next((item for item in containers if item.get("name") == "registry"), containers[0] if containers else {}) - mounts = registry_container.get("volumeMounts") or [] - has_host_path = any(((vol.get("hostPath") or {}).get("path") == REGISTRY_ROOT and vol.get("name") == "storage") for vol in volumes) - has_mount = any((mount.get("name") == "storage" and mount.get("mountPath") == "/var/lib/registry") for mount in mounts) - image = str(registry_container.get("image") or "") - ok = bool(has_host_path and has_mount and image.startswith("registry:") and spec.get("hostNetwork") is True) - return { - "ok": ok, - "reason": "ok" if ok else "unexpected-registry-deployment-shape", - "image": image, - "hostNetwork": spec.get("hostNetwork"), - "hasExpectedHostPath": has_host_path, - 
"hasExpectedMount": has_mount, - "replicas": (dep.get("spec") or {}).get("replicas"), - "readyReplicas": (dep.get("status") or {}).get("readyReplicas"), - } - -def cronjob_suspend_states(names): - states = {} - for name in names: - data = kubectl_json(["-n", "hwlab-ci", "get", "cronjob", name], 15) - if data: - states[name] = bool(((data.get("spec") or {}).get("suspend")) is True) - return states - -def patch_cronjob_suspend(name, suspend): - payload = json.dumps({"spec": {"suspend": bool(suspend)}}) - return kctl(["-n", "hwlab-ci", "patch", "cronjob", name, "--type=merge", "-p", payload], 30) - -def wait_registry_pod_count(target, timeout=90): - deadline = time.time() + timeout - last = None - while time.time() < deadline: - result = kctl(["-n", "hwlab-ci", "get", "pods", "-l", "app.kubernetes.io/name=hwlab-registry", "--no-headers"], 20) - last = bounded(result) - lines = [line for line in (result.get("stdout") or "").splitlines() if line.strip()] - active = [] - for line in lines: - parts = line.split() - status = parts[2] if len(parts) >= 3 else "" - if status in set(["Completed", "Error", "Failed", "Succeeded"]): - continue - active.append(line) - if len(active) == target: - return {"ok": True, "lines": active, "allLines": lines, "last": last} - time.sleep(2) - return {"ok": False, "lines": [], "last": last} - -def wait_pod_terminal(name, timeout=900): - deadline = time.time() + timeout - last = None - while time.time() < deadline: - data = kubectl_json(["-n", "hwlab-ci", "get", "pod", name], 20) - if data: - phase = ((data.get("status") or {}).get("phase")) or "" - last = {"phase": phase} - if phase == "Succeeded": - return {"ok": True, "phase": phase} - if phase == "Failed": - return {"ok": False, "phase": phase} - time.sleep(3) - return {"ok": False, "phase": "Timeout", "last": last} - -def execute_registry_retention(): - if PROVIDER_ID.upper() != "G14": - raise RuntimeError("HWLAB registry retention is only supported on G14") - deployment = 
registry_deployment_preflight() - if not deployment.get("ok"): - raise RuntimeError("registry deployment preflight failed: %s" % deployment.get("reason")) - plan = plan_registry_retention() - delete_rows = plan.get("deleteRows") or [] - delete_revision_rows = plan.get("deleteRevisionRows") or [] - delete_manifests = plan.get("deleteManifestsByRepo") or {} - if not delete_rows and not delete_revision_rows: - return {"reclaimedBytes": 0, "commandOutput": {"message": "no registry tags or revisions matched conservative retention", "registryPlan": plan.get("summary")}} - if not delete_manifests: - return {"reclaimedBytes": 0, "commandOutput": {"message": "matched manifests are still referenced by retained manifests; registry GC would not reclaim blobs", "registryPlan": plan.get("summary")}} - cronjobs = ["hwlab-g14-branch-poller", "hwlab-v02-branch-poller"] - original_crons = cronjob_suspend_states(cronjobs) - before = du_size(REGISTRY_ROOT, 60) or 0 - gc_name = "hwlab-registry-gc-%s" % int(time.time()) - steps = [] - try: - for name in original_crons: - result = patch_cronjob_suspend(name, True) - steps.append({"step": "suspend-cronjob", "name": name, "result": bounded(result)}) - if result["exitCode"] != 0: - raise RuntimeError("failed to suspend cronjob %s" % name) - idle_after_suspend = wait_no_active_hwlab_ci(180) - steps.append({"step": "idle-after-suspend", "result": idle_after_suspend}) - if not idle_after_suspend.get("ok"): - raise RuntimeError("refusing registry maintenance because hwlab-ci did not become idle after suspend") - - deleted_manifests = [] - for repo, digests in delete_manifests.items(): - encoded_repo = "/".join(urllib.parse.quote(part, safe="") for part in repo.split("/")) - for digest in digests: - try: - result = registry_request("DELETE", "/v2/%s/manifests/%s" % (encoded_repo, urllib.parse.quote(digest, safe=":")), {"Accept": "application/vnd.docker.distribution.manifest.v2+json, application/vnd.oci.image.manifest.v1+json"}) - 
deleted_manifests.append({"repo": repo, "digest": digest, "status": result.get("status")}) - except urllib.error.HTTPError as exc: - if exc.code == 404: - deleted_manifests.append({"repo": repo, "digest": digest, "status": 404}) - else: - raise - steps.append({"step": "registry-api-delete-manifests", "count": len(deleted_manifests), "preview": deleted_manifests[:20]}) - - scale_down = kctl(["-n", "hwlab-ci", "scale", "deploy", "hwlab-registry", "--replicas=0"], 60) - steps.append({"step": "scale-registry-down", "result": bounded(scale_down)}) - if scale_down["exitCode"] != 0: - raise RuntimeError("failed to scale registry down") - waited_down = wait_registry_pod_count(0, 120) - steps.append({"step": "wait-registry-down", "result": waited_down}) - if not waited_down.get("ok"): - raise RuntimeError("registry pod did not scale down") - - deleted = [] - for row in delete_rows: - path = os.path.abspath(str(row.get("path") or "")) - if not path.startswith(REGISTRY_REPOSITORY_ROOT + "/") or "/_manifests/tags/" not in path: - raise RuntimeError("refusing unexpected registry tag path: %s" % path) - if not re.match(r"^[0-9a-f]{7,40}$", str(row.get("tag") or "")): - raise RuntimeError("refusing unexpected registry tag name: %s" % row.get("tag")) - if os.path.isdir(path) and not os.path.islink(path): - shutil.rmtree(path) - deleted.append({"repo": row.get("repo"), "tag": row.get("tag"), "digest": row.get("digest")}) - steps.append({"step": "delete-tag-directories", "count": len(deleted)}) - - deleted_revisions = [] - for row in delete_revision_rows: - path = os.path.abspath(str(row.get("path") or "")) - digest_hex = registry_digest_hex(str(row.get("digest") or "")) - if digest_hex is None: - raise RuntimeError("refusing unexpected registry revision digest: %s" % row.get("digest")) - if not path.startswith(REGISTRY_REPOSITORY_ROOT + "/") or "/_manifests/revisions/sha256/" not in path: - raise RuntimeError("refusing unexpected registry revision path: %s" % path) - if 
os.path.basename(path) != digest_hex: - raise RuntimeError("refusing registry revision path/digest mismatch: %s" % path) - if os.path.isdir(path) and not os.path.islink(path): - shutil.rmtree(path) - deleted_revisions.append({"repo": row.get("repo"), "digest": row.get("digest")}) - steps.append({"step": "delete-revision-directories", "count": len(deleted_revisions)}) - - overrides = { - "apiVersion": "v1", - "spec": { - "restartPolicy": "Never", - "containers": [{ - "name": "registry-gc", - "image": "registry:2.8.3", - "command": ["registry", "garbage-collect", "/etc/docker/registry/config.yml"], - "volumeMounts": [{"name": "storage", "mountPath": "/var/lib/registry"}], - }], - "volumes": [{"name": "storage", "hostPath": {"path": REGISTRY_ROOT, "type": "DirectoryOrCreate"}}], - }, - } - run_gc = kctl(["-n", "hwlab-ci", "run", gc_name, "--restart=Never", "--image=registry:2.8.3", "--overrides=%s" % json.dumps(overrides)], 60) - steps.append({"step": "start-registry-gc-pod", "result": bounded(run_gc), "pod": gc_name}) - if run_gc["exitCode"] != 0: - raise RuntimeError("failed to start registry GC pod") - waited_gc = wait_pod_terminal(gc_name, 900) - steps.append({"step": "wait-registry-gc", "result": waited_gc}) - logs = kctl(["-n", "hwlab-ci", "logs", gc_name], 120) - steps.append({"step": "registry-gc-logs", "result": bounded(logs)}) - if not waited_gc.get("ok"): - raise RuntimeError("registry GC pod did not complete successfully") - finally: - cleanup_gc = kctl(["-n", "hwlab-ci", "delete", "pod", gc_name, "--ignore-not-found=true"], 60) - steps.append({"step": "delete-registry-gc-pod", "result": bounded(cleanup_gc)}) - scale_up = kctl(["-n", "hwlab-ci", "scale", "deploy", "hwlab-registry", "--replicas=%s" % int(deployment.get("replicas") or 1)], 60) - steps.append({"step": "scale-registry-up", "result": bounded(scale_up)}) - rollout = kctl(["-n", "hwlab-ci", "rollout", "status", "deploy/hwlab-registry", "--timeout=180s"], 200) - steps.append({"step": 
"wait-registry-rollout", "result": bounded(rollout)}) - for name, was_suspended in original_crons.items(): - restore = patch_cronjob_suspend(name, was_suspended) - steps.append({"step": "restore-cronjob", "name": name, "suspend": was_suspended, "result": bounded(restore)}) - after = du_size(REGISTRY_ROOT, 60) or 0 - return { - "reclaimedBytes": max(0, before - after), - "commandOutput": { - "registryPlan": plan.get("summary"), - "deletedTagCount": len(delete_rows), - "deletedRevisionCount": len(delete_revision_rows), - "deletedManifestCount": sum(len(items) for items in delete_manifests.values()), - "diskBeforeBytes": before, - "diskAfterBytes": after, - "steps": steps[-12:], - }, - } - -def execute_registry_garbage_collect_only(): - if PROVIDER_ID.upper() != "G14": - raise RuntimeError("HWLAB registry garbage-collect is only supported on G14") - deployment = registry_deployment_preflight() - if not deployment.get("ok"): - raise RuntimeError("registry deployment preflight failed: %s" % deployment.get("reason")) - cronjobs = ["hwlab-g14-branch-poller", "hwlab-v02-branch-poller"] - original_crons = cronjob_suspend_states(cronjobs) - before = du_size(REGISTRY_ROOT, 60) or 0 - gc_name = "hwlab-registry-gc-%s" % int(time.time()) - steps = [] - try: - for name in original_crons: - result = patch_cronjob_suspend(name, True) - steps.append({"step": "suspend-cronjob", "name": name, "result": bounded(result)}) - if result["exitCode"] != 0: - raise RuntimeError("failed to suspend cronjob %s" % name) - idle_after_suspend = wait_no_active_hwlab_ci(180) - steps.append({"step": "idle-after-suspend", "result": idle_after_suspend}) - if not idle_after_suspend.get("ok"): - raise RuntimeError("refusing registry maintenance because hwlab-ci did not become idle after suspend") - - scale_down = kctl(["-n", "hwlab-ci", "scale", "deploy", "hwlab-registry", "--replicas=0"], 60) - steps.append({"step": "scale-registry-down", "result": bounded(scale_down)}) - if scale_down["exitCode"] != 0: 
- raise RuntimeError("failed to scale registry down") - waited_down = wait_registry_pod_count(0, 120) - steps.append({"step": "wait-registry-down", "result": waited_down}) - if not waited_down.get("ok"): - raise RuntimeError("registry pod did not scale down") - - overrides = { - "apiVersion": "v1", - "spec": { - "restartPolicy": "Never", - "containers": [{ - "name": "registry-gc", - "image": "registry:2.8.3", - "command": ["registry", "garbage-collect", "/etc/docker/registry/config.yml"], - "volumeMounts": [{"name": "storage", "mountPath": "/var/lib/registry"}], - }], - "volumes": [{"name": "storage", "hostPath": {"path": REGISTRY_ROOT, "type": "DirectoryOrCreate"}}], - }, - } - run_gc = kctl(["-n", "hwlab-ci", "run", gc_name, "--restart=Never", "--image=registry:2.8.3", "--overrides=%s" % json.dumps(overrides)], 60) - steps.append({"step": "start-registry-gc-pod", "result": bounded(run_gc), "pod": gc_name}) - if run_gc["exitCode"] != 0: - raise RuntimeError("failed to start registry GC pod") - waited_gc = wait_pod_terminal(gc_name, 900) - steps.append({"step": "wait-registry-gc", "result": waited_gc}) - logs = kctl(["-n", "hwlab-ci", "logs", gc_name], 120) - steps.append({"step": "registry-gc-logs", "result": bounded(logs)}) - if not waited_gc.get("ok"): - raise RuntimeError("registry GC pod did not complete successfully") - finally: - cleanup_gc = kctl(["-n", "hwlab-ci", "delete", "pod", gc_name, "--ignore-not-found=true"], 60) - steps.append({"step": "delete-registry-gc-pod", "result": bounded(cleanup_gc)}) - scale_up = kctl(["-n", "hwlab-ci", "scale", "deploy", "hwlab-registry", "--replicas=%s" % int(deployment.get("replicas") or 1)], 60) - steps.append({"step": "scale-registry-up", "result": bounded(scale_up)}) - rollout = kctl(["-n", "hwlab-ci", "rollout", "status", "deploy/hwlab-registry", "--timeout=180s"], 200) - steps.append({"step": "wait-registry-rollout", "result": bounded(rollout)}) - for name, was_suspended in original_crons.items(): - restore = 
patch_cronjob_suspend(name, was_suspended) - steps.append({"step": "restore-cronjob", "name": name, "suspend": was_suspended, "result": bounded(restore)}) - after = du_size(REGISTRY_ROOT, 60) or 0 - return { - "reclaimedBytes": max(0, before - after), - "commandOutput": { - "message": "official registry garbage-collect only; no additional tag deletion", - "diskBeforeBytes": before, - "diskAfterBytes": after, - "steps": steps[-12:], - }, - } - -def start_registry_retention_job(mode): - job_id = "g14-registry-%s-%s" % (int(time.time()), os.getpid()) - paths = job_paths(job_id) - started_at = now_iso() - initial = { - "ok": True, - "action": "gc remote status", - "providerId": PROVIDER_ID, - "jobId": job_id, - "status": "running", - "kind": "hwlab-registry-retention-gc" if mode == "retention" else "hwlab-registry-garbage-collect", - "mode": mode, - "startedAt": started_at, - "statePath": paths["state"], - "logPath": paths["log"], - "options": OPTIONS, - } - write_json_atomic(paths["state"], initial) - pid = os.fork() - if pid != 0: - return { - "status": "started", - "reclaimedBytes": None, - "commandOutput": { - "jobId": job_id, - "pid": pid, - "statePath": paths["state"], - "logPath": paths["log"], - "statusCommand": "bun scripts/cli.ts gc remote %s status --job-id %s" % (PROVIDER_ID, job_id), - "message": "registry retention GC is running as a detached remote job", - }, - } - - try: - os.setsid() - except Exception: - pass - try: - devnull = os.open(os.devnull, os.O_RDONLY) - os.dup2(devnull, 0) - os.close(devnull) - except Exception: - pass - try: - log_handle = open(paths["log"], "a", encoding="utf-8", buffering=1) - os.dup2(log_handle.fileno(), 1) - os.dup2(log_handle.fileno(), 2) - except Exception: - log_handle = None - try: - print("[%s] starting HWLAB registry %s job %s" % (now_iso(), mode, job_id), flush=True) - result = execute_registry_retention() if mode == "retention" else execute_registry_garbage_collect_only() - payload = dict(initial) - 
payload.update({ - "status": "succeeded", - "finishedAt": now_iso(), - "result": result, - "diskAfter": df_snapshot(), - "clusterAfter": cluster_preflight(), - }) - write_json_atomic(paths["state"], payload) - print("[%s] completed HWLAB registry %s job %s" % (now_iso(), mode, job_id), flush=True) - os._exit(0) - except Exception as exc: - payload = dict(initial) - payload.update({ - "ok": False, - "status": "failed", - "finishedAt": now_iso(), - "error": str(exc), - "diskAfter": df_snapshot(), - "clusterAfter": cluster_preflight(), - }) - try: - write_json_atomic(paths["state"], payload) - except Exception: - pass - print("[%s] failed HWLAB registry %s job %s: %s" % (now_iso(), mode, job_id, exc), flush=True) - os._exit(1) - finally: - try: - if log_handle: - log_handle.close() - except Exception: - pass - -def collect_protected(): - protected_paths = [ - ("hwlab-k3s-runtime", "/var/lib/rancher/k3s", "Native k3s runtime, containerd state, local-path storage and control-plane data are protected."), - ("hwlab-k3s-storage", "/var/lib/rancher/k3s/storage", "Local-path PVC data is protected; cleanup must go through HWLAB/Tekton retention commands."), - ("hwlab-kubelet", "/var/lib/kubelet", "Kubelet pod/runtime state is protected."), - ("host-containerd", "/var/lib/containerd", "Host containerd state is protected; generic gc does not prune containerd images."), - ("hwlab-host-data", "/var/lib/hwlab", "HWLAB host data/cache is protected from generic remote gc until a HWLAB-specific retention rule classifies it."), - ("hwlab-source", "/root/hwlab", "G14 HWLAB fixed source workspace is protected."), - ("hwlab-v02-source", "/root/hwlab-v02", "HWLAB v0.2 fixed source workspace is protected."), - ("agentrun-source", "/root/agentrun", "AgentRun fixed source workspace is protected."), - ("docker-images-and-volumes", "docker-images-volumes", "Remote gc does not remove Docker images, containers, volumes or Compose projects."), - ("k8s-api-objects", 
"deployments-statefulsets-secrets-pvcs", "Remote gc does not mutate Kubernetes workloads, Secrets, PVCs, PVs, Argo CD or Tekton objects."), - ] - result = [] - for kind, ref, reason in protected_paths: - item = {"kind": kind, "risk": "blocked", "ref": ref, "reason": reason} - if ref.startswith("/") and os.path.exists(ref): - item["sizeBytes"] = du_size(ref) - result.append(item) - return result - -def collect_candidates(observed_at): - candidates = [] - if OPTIONS.get("journal", True): - usage = command(["journalctl", "--disk-usage"], 5) - current = parse_journal_usage((usage["stdout"] or "") + (usage["stderr"] or "")) - target = int(OPTIONS.get("journalTargetBytes") or 536870912) - if current is not None and current > target: - candidates.append({ - "id": "journalctl:vacuum", - "kind": "journal-vacuum", - "risk": "medium", - "description": "Vacuum systemd journal to %s" % fmt_bytes(target), - "sizeBytes": current, - "estimatedReclaimBytes": max(0, current - target), - "action": {"command": ["journalctl", "--vacuum-size=%s" % target]}, - }) - - if OPTIONS.get("dockerLogs", True): - max_bytes = int(OPTIONS.get("dockerLogMaxBytes") or 52428800) - for container in docker_containers(): - path = container.get("logPath") or "" - if not path or not os.path.exists(path): - continue - try: - size = os.path.getsize(path) - except OSError: - continue - if size <= max_bytes: - continue - candidates.append({ - "id": "docker-json-log:%s" % container["id"], - "kind": "docker-json-log-truncate", - "risk": "medium", - "description": "Truncate Docker json-file log larger than %s" % fmt_bytes(max_bytes), - "path": path, - "container": {"id": container["id"][:12], "name": container["name"], "image": container["image"]}, - "sizeBytes": size, - "estimatedReclaimBytes": size, - "action": {"op": "truncate", "targetBytes": 0}, - }) - - if OPTIONS.get("buildCache", True): - system_df = command(["docker", "system", "df"], 8) - if system_df["exitCode"] == 0: - cache = 
parse_docker_build_cache(system_df["stdout"]) - if cache is not None and cache["reclaimableBytes"] > 0: - until = str(OPTIONS.get("buildCacheUntil") or "24h") - candidates.append({ - "id": "docker-builder:prune", - "kind": "docker-build-cache-prune", - "risk": "low", - "description": "Prune Docker BuildKit cache unused for %s" % until, - "sizeBytes": cache["sizeBytes"], - "estimatedReclaimBytes": cache["reclaimableBytes"], - "action": {"command": ["docker", "builder", "prune", "--all", "--force", "--filter", "until=%s" % until], "estimate": "docker-system-df-reclaimable-upper-bound"}, - }) - - if OPTIONS.get("aptCache", True): - apt_path = "/var/cache/apt/archives" - size = du_size(apt_path) or 0 - if size > 10 * 1024 * 1024: - candidates.append({ - "id": "apt-cache:clean", - "kind": "apt-cache-clean", - "risk": "low", - "description": "Clean downloaded apt package archives", - "path": apt_path, - "sizeBytes": size, - "estimatedReclaimBytes": size, - "action": {"command": ["apt-get", "clean"]}, - }) - - if OPTIONS.get("toolCaches", False): - for item in TOOL_CACHE_ALLOWLIST: - path = item["path"] - size = du_size(path, 8) or 0 - if size <= 0: - continue - candidates.append({ - "id": "tool-cache:%s" % item["id"], - "kind": "tool-cache-delete", - "risk": "medium", - "description": item["description"], - "path": path, - "sizeBytes": size, - "estimatedReclaimBytes": size, - "action": {"op": "rm-recursive", "allowlist": "remote-tool-cache"}, - }) - - if OPTIONS.get("coreDumps", True): - cutoff = time.time() - float(OPTIONS.get("coreDumpMinAgeHours") or 1) * 3600 - for root in sorted(CORE_DUMP_DIR_ALLOWLIST): - if not os.path.isdir(root): - continue - for name in os.listdir(root): - if not re.match(r"^core\.\d+$", name): - continue - path = os.path.join(root, name) - try: - stat = os.lstat(path) - except OSError: - continue - if not os.path.isfile(path) or os.path.islink(path): - continue - if stat.st_mtime >= cutoff: - continue - disk_size = allocated_file_size(path) - 
candidates.append({ - "id": "core-dump:%s" % path, - "kind": "core-dump-delete", - "risk": "low", - "description": "Delete untracked process core dump older than %s hours" % OPTIONS.get("coreDumpMinAgeHours"), - "path": path, - "sizeBytes": int(disk_size), - "estimatedReclaimBytes": int(disk_size), - "apparentSizeBytes": int(stat.st_size), - "action": {"op": "unlink", "allowlist": "root-unidesk-core-dot-pid"}, - }) - - if OPTIONS.get("tmp", True) and os.path.isdir("/tmp"): - cutoff = time.time() - float(OPTIONS.get("tmpMinAgeHours") or 24) * 3600 - for name in os.listdir("/tmp"): - path = os.path.join("/tmp", name) - if path in TMP_EXACT_PROTECT: - continue - if not any(name.startswith(prefix) for prefix in TMP_PREFIX_ALLOWLIST): - continue - try: - stat = os.lstat(path) - except OSError: - continue - if stat.st_mtime >= cutoff: - continue - size = du_size(path, 8) or path_size(path) - if size <= 0: - continue - candidates.append({ - "id": "tmp:%s" % path, - "kind": "tmp-path-delete", - "risk": "low", - "description": "Delete allowlisted /tmp path older than %s hours" % OPTIONS.get("tmpMinAgeHours"), - "path": path, - "sizeBytes": size, - "estimatedReclaimBytes": size, - "action": {"op": "rm-recursive", "allowlist": "tmp-prefix"}, - }) - if OPTIONS.get("hwlabRegistry", False): - registry = plan_registry_retention() - summary = registry.get("summary") or {} - delete_rows = registry.get("deleteRows") or [] - delete_revision_rows = registry.get("deleteRevisionRows") or [] - estimate = int(summary.get("estimatedReclaimBytes") or 0) - if delete_rows or delete_revision_rows: - candidates.append({ - "id": "hwlab-registry:retention-gc", - "kind": "hwlab-registry-retention-gc", - "risk": "medium", - "description": "Conservative HWLAB registry retention: keep current workload refs, retained tags and protected repos, delete stale manifest revisions, then run official registry garbage-collect", - "path": REGISTRY_ROOT, - "sizeBytes": int(summary.get("registrySizeBytes") or 0), 
- "estimatedReclaimBytes": estimate, - "action": { - "op": "registry-retention-gc", - "requiresMaintenanceWindow": True, - "keepPerRepo": summary.get("keepPerRepo"), - "minAgeHours": summary.get("minAgeHours"), - "deleteTags": len(delete_rows), - "deleteManifests": summary.get("deleteManifests"), - "deleteRevisions": summary.get("deleteRevisions"), - "deleteByRepo": summary.get("deleteByRepo"), - "revisionDeleteByRepo": summary.get("revisionDeleteByRepo"), - "protectedWorkloadRefs": summary.get("protectedWorkloadRefs"), - "protectedDigestRefs": summary.get("protectedDigestRefs"), - "protectedDigestClosure": summary.get("protectedDigestClosure"), - }, - }) - elif bool(OPTIONS.get("registryGcOnly")) and int(summary.get("totalTags") or 0) > 0 and int(summary.get("deleteTags") or 0) == 0: - candidates.append({ - "id": "hwlab-registry:garbage-collect-only", - "kind": "hwlab-registry-garbage-collect", - "risk": "medium", - "description": "Run official HWLAB registry garbage-collect without deleting additional tags; useful after a previously interrupted retention run", - "path": REGISTRY_ROOT, - "sizeBytes": int(summary.get("registrySizeBytes") or 0), - "estimatedReclaimBytes": 0, - "action": { - "op": "registry-garbage-collect-only", - "requiresMaintenanceWindow": True, - "deleteTags": 0, - "registryPlan": summary, - }, - }) - return sorted(candidates, key=lambda item: item.get("estimatedReclaimBytes") or 0, reverse=True) - -def target_assessment(disk, estimated_reclaim): - raw = OPTIONS.get("targetUsePercent") - if raw is None: - return None - if not disk: - return { - "targetUsePercent": raw, - "ok": False, - "state": "unavailable", - "reason": "disk-snapshot-unavailable", - } - try: - target = int(raw) - size = int(disk.get("sizeBytes") or 0) - used = int(disk.get("usedBytes") or 0) - available = int(disk.get("availableBytes") or 0) - reclaim = max(0, int(estimated_reclaim or 0)) - except Exception: - return { - "targetUsePercent": raw, - "ok": False, - "state": 
"unavailable", - "reason": "invalid-disk-snapshot", - } - df_basis = used + available - if df_basis <= 0: - df_basis = size - legacy_target_used_bytes = (size * target) // 100 - legacy_required = max(0, used - legacy_target_used_bytes) - target_used_bytes = (df_basis * target) // 100 - required = max(0, used - target_used_bytes) - projected_used = max(0, used - reclaim) - projected_use_percent = disk_use_percent(df_basis, projected_used) - legacy_projected_use_percent = disk_use_percent(size, projected_used) - enough = reclaim >= required - if required == 0: - state = "already-below-target" - elif enough: - state = "candidate-estimate-meets-target" - elif reclaim <= 0: - state = "safe-stop-no-meaningful-candidates" - else: - state = "shortfall" - shortfall = max(0, required - reclaim) - return { - "targetUsePercent": target, - "ok": required == 0 or enough, - "state": state, - "currentUsePercent": disk.get("usePercent"), - "currentUsePercentExact": disk.get("usePercentExact"), - "basis": "df-used-over-used-plus-available", - "dfBasisBytes": df_basis, - "dfBasis": fmt_bytes(df_basis), - "reservedBytes": max(0, size - df_basis), - "reserved": fmt_bytes(max(0, size - df_basis)), - "currentUsedBytes": used, - "currentUsed": fmt_bytes(used), - "targetUsedBytes": target_used_bytes, - "targetUsed": fmt_bytes(target_used_bytes), - "requiredReclaimBytes": required, - "requiredReclaim": fmt_bytes(required), - "estimatedReclaimBytes": reclaim, - "estimatedReclaim": fmt_bytes(reclaim), - "shortfallBytes": shortfall, - "shortfall": fmt_bytes(shortfall), - "projectedUsedBytes": projected_used, - "projectedUsed": fmt_bytes(projected_used), - "projectedUsePercent": projected_use_percent, - "safeStop": required > 0 and not enough, - "decision": "stop-and-escalate-retention-or-capacity" if required > 0 and not enough else "target-covered-by-safe-candidates", - "legacySizeBasis": { - "basis": "df-size-column-includes-reserved-blocks", - "sizeBytes": size, - "size": fmt_bytes(size), - 
"targetUsedBytes": legacy_target_used_bytes, - "targetUsed": fmt_bytes(legacy_target_used_bytes), - "requiredReclaimBytes": legacy_required, - "requiredReclaim": fmt_bytes(legacy_required), - "projectedUsePercent": legacy_projected_use_percent, - "note": "informational only; ok/safeStop use the same basis as df Use%", - }, - } - -def summarize(candidates, returned, disk=None): - by_kind = {} - total = 0 - for item in candidates: - size = int(item.get("estimatedReclaimBytes") or 0) - total += size - kind = item.get("kind") or "unknown" - current = by_kind.setdefault(kind, {"count": 0, "estimatedReclaimBytes": 0, "estimatedReclaim": "0 B"}) - current["count"] += 1 - current["estimatedReclaimBytes"] += size - current["estimatedReclaim"] = fmt_bytes(current["estimatedReclaimBytes"]) - returned_total = sum(int(item.get("estimatedReclaimBytes") or 0) for item in returned) - return { - "candidateCount": len(candidates), - "returnedCandidateCount": len(returned), - "estimatedReclaimBytes": total, - "estimatedReclaim": fmt_bytes(total), - "returnedEstimatedReclaimBytes": returned_total, - "returnedEstimatedReclaim": fmt_bytes(returned_total), - "byKind": by_kind, - "target": target_assessment(disk, total), - } - -def assert_tmp_candidate(path): - resolved = os.path.abspath(path) - if not resolved.startswith("/tmp/"): - raise RuntimeError("refusing to remove non-/tmp path: %s" % path) - if resolved in TMP_EXACT_PROTECT: - raise RuntimeError("refusing to remove protected tmp path: %s" % path) - name = os.path.basename(resolved) - if not any(name.startswith(prefix) for prefix in TMP_PREFIX_ALLOWLIST): - raise RuntimeError("refusing to remove tmp path outside allowlist: %s" % path) - -def assert_core_dump_candidate(path): - resolved = os.path.abspath(path) - parent = os.path.dirname(resolved) - name = os.path.basename(resolved) - if parent not in CORE_DUMP_DIR_ALLOWLIST: - raise RuntimeError("refusing to remove core dump outside allowlisted directory: %s" % path) - if not 
re.match(r"^core\.\d+$", name): - raise RuntimeError("refusing to remove non core. file: %s" % path) - if not os.path.isfile(resolved) or os.path.islink(resolved): - raise RuntimeError("refusing to remove non-regular core dump: %s" % path) - git = command(["git", "-C", parent, "ls-files", "--error-unmatch", name], 10) - if git["exitCode"] == 0: - raise RuntimeError("refusing to remove git-tracked file: %s" % path) - if git["exitCode"] != 1: - raise RuntimeError("refusing to remove core dump because git tracking check failed: %s" % path) - fuser = command(["fuser", resolved], 5) - if fuser["exitCode"] is None: - raise RuntimeError("refusing to remove core dump because fuser check was unavailable: %s" % path) - if fuser["exitCode"] == 0: - raise RuntimeError("refusing to remove core dump with active process reference: %s" % path) - -def assert_tool_cache_candidate(path): - resolved = os.path.abspath(path) - allowed = set(item["path"] for item in TOOL_CACHE_ALLOWLIST) - if resolved not in allowed: - raise RuntimeError("refusing to remove tool cache outside allowlist: %s" % path) - if os.path.islink(resolved): - raise RuntimeError("refusing to remove symlink tool cache: %s" % path) - -def execute(candidate): - kind = candidate.get("kind") - if kind == "journal-vacuum": - result = command(["journalctl", "--vacuum-size=%s" % int(OPTIONS.get("journalTargetBytes") or 536870912)], 30) - if result["exitCode"] != 0: - raise RuntimeError((result["stderr"] or "journalctl vacuum failed").strip()) - return {"reclaimedBytes": None, "commandOutput": bounded(result)} - if kind == "docker-json-log-truncate": - path = candidate.get("path") or "" - if not path.startswith("/var/lib/docker/containers/"): - raise RuntimeError("refusing to truncate Docker log outside /var/lib/docker/containers") - before = os.path.getsize(path) if os.path.exists(path) else 0 - with open(path, "r+b") as handle: - handle.truncate(0) - return {"reclaimedBytes": before} - if kind == "docker-build-cache-prune": 
- until = str(OPTIONS.get("buildCacheUntil") or "24h") - result = command(["docker", "builder", "prune", "--all", "--force", "--filter", "until=%s" % until], 45) - if result["exitCode"] != 0: - raise RuntimeError((result["stderr"] or "docker builder prune failed").strip()) - return {"reclaimedBytes": None, "commandOutput": bounded(result)} - if kind == "apt-cache-clean": - before = du_size("/var/cache/apt/archives") or 0 - result = command(["apt-get", "clean"], 30) - if result["exitCode"] != 0: - raise RuntimeError((result["stderr"] or "apt-get clean failed").strip()) - after = du_size("/var/cache/apt/archives") or 0 - return {"reclaimedBytes": max(0, before - after), "commandOutput": bounded(result)} - if kind == "tool-cache-delete": - path = candidate.get("path") or "" - assert_tool_cache_candidate(path) - before = du_size(path, 8) or path_size(path) - if os.path.isdir(path): - shutil.rmtree(path, ignore_errors=True) - elif os.path.exists(path): - os.unlink(path) - return {"reclaimedBytes": before} - if kind == "tmp-path-delete": - path = candidate.get("path") or "" - assert_tmp_candidate(path) - before = du_size(path, 8) or path_size(path) - if os.path.isdir(path) and not os.path.islink(path): - shutil.rmtree(path, ignore_errors=True) - else: - try: - os.unlink(path) - except FileNotFoundError: - pass - return {"reclaimedBytes": before} - if kind == "core-dump-delete": - path = candidate.get("path") or "" - assert_core_dump_candidate(path) - before = allocated_file_size(path) - os.unlink(path) - return {"reclaimedBytes": before} - if kind == "hwlab-registry-retention-gc": - return start_registry_retention_job("retention") - if kind == "hwlab-registry-garbage-collect": - return start_registry_retention_job("garbage-collect") - raise RuntimeError("unsupported remote gc candidate kind: %s" % kind) - -def visible_items(items): - if bool(OPTIONS.get("full")): - return items - return items[:int(OPTIONS.get("limit") or 50)] - -def returned_results(results): - if 
bool(OPTIONS.get("full")): - return results - failed = [item for item in results if item.get("status") == "failed"] - started = [item for item in results if item.get("status") == "started"] - succeeded = [item for item in results if item.get("status") == "succeeded"] - return (failed + started + succeeded)[:int(OPTIONS.get("resultLimit") or 50)] - -def plan_payload(observed_at, preflight, protected, candidates, visible): - disk = df_snapshot() - ci_storage = ci_storage_snapshot() - memory_pressure = collect_memory_pressure() - compact_pvc = compact_pvc_attribution(ci_storage) - policy = { - "requiresRunConfirm": True, - "runCommand": "bun scripts/cli.ts gc remote %s run --confirm" % PROVIDER_ID, - "neverTouches": [ - "/var/lib/rancher/k3s", - "/var/lib/rancher/k3s/storage", - "/var/lib/kubelet", - "/var/lib/containerd", - "/var/lib/hwlab unless --include-hwlab-registry is explicitly supplied", - "Kubernetes Deployments/StatefulSets/Secrets/PVCs/PVs", - "HWLAB fixed source workspaces", - "Docker images, containers and volumes", - ], - "notes": [ - "Remote gc only executes the returned candidate page unless --full or a larger --limit is supplied.", - "G14 run requires the expected native k3s node preflight before mutation.", - "HWLAB DEV runtime and local-path PVC data are protected and require HWLAB-specific retention commands.", - "Core dump cleanup only removes untracked /root/unidesk/core. 
regular files with no active fuser reference.", - "HWLAB registry retention is opt-in: it keeps workload tag/digest refs, all tags newer than the retention age and the newest N tags per repo before official registry garbage-collect.", - "When summary.target.safeStop is true, do not broaden deletion scope; choose registry retention, k3s/containerd image cache maintenance, PVC/runtime retention or capacity expansion explicitly.", - ], - } - if not bool(OPTIONS.get("full")): - policy = { - "requiresRunConfirm": True, - "runCommand": "bun scripts/cli.ts gc remote %s run --confirm" % PROVIDER_ID, - "neverTouches": ["k3s runtime", "PVC/PV/local-path data", "Secrets/auth/config", "Docker volumes/images"], - "notes": [ - "Default plan is compact; rerun with --full for complete policy notes and protected rows.", - "When summary.target.safeStop is true, stop at protected boundaries and choose an owner-aware retention or capacity decision.", - ], - } - payload = { - "ok": True, - "action": "gc remote plan", - "providerId": PROVIDER_ID, - "dryRun": True, - "mutation": False, - "observedAt": observed_at, - "options": OPTIONS, - "diskBefore": disk, - "clusterPreflight": preflight, - "summary": summarize(candidates, visible, disk), - "candidates": visible, - "protected": protected if bool(OPTIONS.get("full")) else protected[:3], - "policy": policy, - } - if bool(OPTIONS.get("full")): - payload.update({ - "memoryPressure": memory_pressure, - "pvcAttribution": ci_storage, - "ciStorage": ci_storage, - }) - else: - payload["pressureSummary"] = { - "memory": (compact_memory_pressure(memory_pressure).get("summary") if isinstance(compact_memory_pressure(memory_pressure), dict) else None), - "pvc": { - "pvcCount": compact_pvc.get("pvcCount"), - "reviewCandidateCount": compact_pvc.get("reviewCandidateCount"), - "estimatedBytes": compact_pvc.get("estimatedBytes"), - "estimatedHuman": compact_pvc.get("estimatedHuman"), - "byNamespace": compact_pvc.get("byNamespace"), - "handoff": 
compact_pvc.get("handoff"), - }, - "drillDown": "bun scripts/cli.ts gc remote %s status --limit %s" % (PROVIDER_ID, int(OPTIONS.get("limit") or 50)), - } - return payload - -def safe_unit_name(value): - raw = str(value or "").strip().lower() - raw = re.sub(r"[^a-z0-9_.@-]+", "-", raw).strip("-") - if not raw: - raw = "unidesk-%s-low-risk-gc" % re.sub(r"[^a-z0-9]+", "-", PROVIDER_ID.lower()).strip("-") - return raw[:80] - -def render_remote_policy(): - unit_name = safe_unit_name(config_str(POLICY_TIMER_CONFIG, "name", "unidesk-%s-low-risk-gc" % PROVIDER_ID.lower())) - on_calendar = config_str(POLICY_TIMER_CONFIG, "onCalendar", "daily") - randomized_delay_sec = config_str(POLICY_TIMER_CONFIG, "randomizedDelaySec", "15min") - journal_target = parse_size_value(POLICY_TIMER_CONFIG.get("journalTargetBytes"), int(OPTIONS.get("journalTargetBytes") or 536870912)) - tmp_min_age_hours = config_float(POLICY_TIMER_CONFIG, "tmpMinAgeHours", float(OPTIONS.get("tmpMinAgeHours") or 24), minimum=0.0) - include_apt_cache = config_bool(POLICY_TIMER_CONFIG, "includeAptCache", bool(OPTIONS.get("aptCache", True))) - include_tool_caches = config_bool(POLICY_TIMER_CONFIG, "includeToolCaches", False) - script_path = "/usr/local/sbin/%s.sh" % unit_name - service_path = "/etc/systemd/system/%s.service" % unit_name - timer_path = "/etc/systemd/system/%s.timer" % unit_name - tool_paths = [item["path"] for item in TOOL_CACHE_ALLOWLIST] if include_tool_caches else [] - script = "\n".join([ - "#!/bin/sh", - "set -eu", - "umask 077", - "journalctl --vacuum-size=%s >/dev/null 2>&1 || true" % int(journal_target), - "apt-get clean >/dev/null 2>&1 || true" if include_apt_cache else ": apt cache disabled by YAML", - "python3 - <<'PY'", - "import json, os, shutil, time", - "prefixes = json.loads(%r)" % json.dumps(TMP_PREFIX_ALLOWLIST), - "protected = set(json.loads(%r))" % json.dumps(sorted(TMP_EXACT_PROTECT)), - "tool_paths = json.loads(%r)" % json.dumps(tool_paths), - "cutoff = time.time() - float(%r) 
* 3600.0" % tmp_min_age_hours, - "for name in os.listdir('/tmp'):", - " path = os.path.join('/tmp', name)", - " if path in protected or not any(name.startswith(prefix) for prefix in prefixes):", - " continue", - " try:", - " stat = os.lstat(path)", - " except OSError:", - " continue", - " if stat.st_mtime >= cutoff:", - " continue", - " if os.path.isdir(path) and not os.path.islink(path):", - " shutil.rmtree(path, ignore_errors=True)", - " elif os.path.exists(path):", - " try:", - " os.unlink(path)", - " except FileNotFoundError:", - " pass", - "for path in tool_paths:", - " resolved = os.path.abspath(path)", - " if resolved != path or os.path.islink(resolved) or resolved in ['/', '/root', '/root/.npm', '/root/.bun']:", - " continue", - " if os.path.isdir(resolved):", - " shutil.rmtree(resolved, ignore_errors=True)", - " elif os.path.exists(resolved):", - " try:", - " os.unlink(resolved)", - " except FileNotFoundError:", - " pass", - "PY", - "", - ]) - service = "\n".join([ - "[Unit]", - "Description=UniDesk remote low-risk GC for %s" % PROVIDER_ID, - "Documentation=config/unidesk-cli.yaml#gc.remote.targets.%s.policyTimer" % PROVIDER_ID, - "", - "[Service]", - "Type=oneshot", - "ExecStart=%s" % script_path, - "Nice=10", - "IOSchedulingClass=best-effort", - "IOSchedulingPriority=7", - "", - ]) - timer = "\n".join([ - "[Unit]", - "Description=UniDesk remote low-risk GC timer for %s" % PROVIDER_ID, - "", - "[Timer]", - "OnCalendar=%s" % on_calendar, - "RandomizedDelaySec=%s" % randomized_delay_sec, - "Persistent=true", - "", - "[Install]", - "WantedBy=timers.target", - "", - ]) - return { - "unitName": unit_name, - "scriptPath": script_path, - "servicePath": service_path, - "timerPath": timer_path, - "onCalendar": on_calendar, - "randomizedDelaySec": randomized_delay_sec, - "journalTargetBytes": int(journal_target), - "journalTarget": fmt_bytes(journal_target), - "tmpMinAgeHours": tmp_min_age_hours, - "includeAptCache": include_apt_cache, - "includeToolCaches": 
include_tool_caches, - "script": script, - "service": service, - "timer": timer, - } - -def remote_policy_plan_payload(observed_at): - rendered = render_remote_policy() - return { - "ok": True, - "action": "gc remote policy plan", - "providerId": PROVIDER_ID, - "dryRun": True, - "mutation": False, - "observedAt": observed_at, - "configSource": "config/unidesk-cli.yaml#gc.remote.targets.%s.policyTimer" % PROVIDER_ID, - "enabled": config_bool(POLICY_TIMER_CONFIG, "enabled", False), - "timer": {key: rendered.get(key) for key in ["unitName", "scriptPath", "servicePath", "timerPath", "onCalendar", "randomizedDelaySec", "journalTargetBytes", "journalTarget", "tmpMinAgeHours", "includeAptCache", "includeToolCaches"]}, - "scriptPreview": "\n".join(rendered["script"].splitlines()[:20]), - "servicePreview": rendered["service"], - "timerPreview": rendered["timer"], - "installCommand": "bun scripts/cli.ts gc remote %s policy install --confirm" % PROVIDER_ID, - "policy": { - "risk": "low", - "neverTouches": [ - "k3s runtime directories", - "PVC/PV/local-path data", - "Docker images, containers, volumes or Docker build cache", - "Secret/auth/config state", - "active Web observe runners or Chrome processes", - ], - "toolCaches": "disabled unless config/unidesk-cli.yaml enables includeToolCaches for this remote target", - }, - } - -def remote_policy_install_payload(observed_at): - rendered = render_remote_policy() - try: - with open(rendered["scriptPath"], "w", encoding="utf-8") as handle: - handle.write(rendered["script"]) - os.chmod(rendered["scriptPath"], 0o755) - with open(rendered["servicePath"], "w", encoding="utf-8") as handle: - handle.write(rendered["service"]) - with open(rendered["timerPath"], "w", encoding="utf-8") as handle: - handle.write(rendered["timer"]) - daemon = command(["systemctl", "daemon-reload"], 30) - enable = command(["systemctl", "enable", "--now", "%s.timer" % rendered["unitName"]], 30) - status = command(["systemctl", "show", "%s.timer" % 
rendered["unitName"], "--property=LoadState,ActiveState,SubState,NextElapseUSecRealtime,LastTriggerUSec"], 10) - except Exception as exc: - return { - "ok": False, - "action": "gc remote policy install", - "providerId": PROVIDER_ID, - "dryRun": False, - "mutation": True, - "observedAt": observed_at, - "error": "policy-install-failed", - "message": str(exc), - } - ok = daemon.get("exitCode") == 0 and enable.get("exitCode") == 0 - return { - "ok": ok, - "action": "gc remote policy install", - "providerId": PROVIDER_ID, - "dryRun": False, - "mutation": True, - "observedAt": observed_at, - "configSource": "config/unidesk-cli.yaml#gc.remote.targets.%s.policyTimer" % PROVIDER_ID, - "timer": {key: rendered.get(key) for key in ["unitName", "scriptPath", "servicePath", "timerPath", "onCalendar", "randomizedDelaySec", "journalTargetBytes", "journalTarget", "tmpMinAgeHours", "includeAptCache", "includeToolCaches"]}, - "systemd": { - "daemonReload": bounded(daemon), - "enableNow": bounded(enable), - "status": bounded(status), - }, - } - -def main(): - observed_at = now_iso() - preflight = cluster_preflight() - if ACTION == "policy-plan": - emit_json(remote_policy_plan_payload(observed_at), persist_large=False) - return 0 - if ACTION == "policy-install": - emit_json(remote_policy_install_payload(observed_at), persist_large=False) - return 0 - if ACTION == "trend": - history_limit = int(OPTIONS.get("historyLimit") or 12) - history = read_growth_snapshots(history_limit) - emit_json({ - "ok": True, - "action": "gc remote trend", - "providerId": PROVIDER_ID, - "dryRun": True, - "mutation": False, - "observedAt": observed_at, - "statePath": growth_snapshot_path(), - "historyLimit": history_limit, - "trend": growth_trend_payload(history), - "points": history if bool(OPTIONS.get("full")) else [compact_growth_point(item) for item in history[-min(len(history), 3):]], - "returnedPointCount": min(len(history), 3) if not bool(OPTIONS.get("full")) else len(history), - "totalPointCount": 
len(history), - "next": { - "snapshot": "bun scripts/cli.ts gc remote %s snapshot --include-hwlab-registry --history-limit %s" % (PROVIDER_ID, history_limit), - }, - }, persist_large=True) - return 0 - if ACTION == "snapshot": - history_limit = int(OPTIONS.get("historyLimit") or 12) - snapshot = collect_growth_snapshot(observed_at, preflight) - state_path = growth_snapshot_path() - if bool(OPTIONS.get("saveSnapshot", True)): - state_path = append_growth_snapshot(snapshot) - history = read_growth_snapshots(history_limit) - if not bool(OPTIONS.get("saveSnapshot", True)): - history = history + [snapshot] - trend_payload = growth_trend_payload(history[-history_limit:]) - recent_history = history[-min(len(history), 3):] - if not bool(OPTIONS.get("full")): - trend_payload = compact_trend_payload(trend_payload) - recent_history = history[-min(len(history), 1):] - snapshot.update({ - "statePath": state_path, - "historyLimit": history_limit, - "saved": bool(OPTIONS.get("saveSnapshot", True)), - "trend": trend_payload, - "history": { - "totalPointCount": len(read_growth_snapshots(1000000)) if bool(OPTIONS.get("saveSnapshot", True)) else len(history), - "returnedPointCount": len(recent_history) if bool(OPTIONS.get("full")) else 0, - "recentPoints": recent_history if bool(OPTIONS.get("full")) else [], - "drillDown": "bun scripts/cli.ts gc remote %s trend --history-limit %s" % (PROVIDER_ID, history_limit), - }, - }) - emit_json(snapshot, persist_large=True) - return 0 - protected = collect_protected() - candidates = collect_candidates(observed_at) - visible = visible_items(candidates) - if ACTION == "plan": - emit_json(plan_payload(observed_at, preflight, protected, candidates, visible), persist_large=True) - return 0 - if ACTION == "status": - emit_json(remote_gc_job_status(), persist_large=False) - return 0 - if ACTION != "run": - emit_json({"ok": False, "error": "unsupported-remote-gc-action", "action": ACTION}, persist_large=False) - return 0 - if PROVIDER_ID.upper() == 
"G14" and not preflight.get("ok"): - emit_json({ - "ok": False, - "error": "gc-remote-g14-preflight-failed", - "action": "gc remote run", - "providerId": PROVIDER_ID, - "dryRun": True, - "mutation": False, - "clusterPreflight": preflight, - "plan": plan_payload(observed_at, preflight, protected, candidates, visible), - }, persist_large=True) - return 0 - disk_before = df_snapshot() - results = [] - for candidate in visible: - try: - execution = execute(candidate) - item = dict(candidate) - item.update({"status": execution.get("status") or "succeeded", "reclaimedBytes": execution.get("reclaimedBytes")}) - if "commandOutput" in execution: - item["commandOutput"] = execution["commandOutput"] - results.append(item) - except Exception as exc: - item = dict(candidate) - item.update({"status": "failed", "reclaimedBytes": None, "error": str(exc)}) - results.append(item) - disk_after = df_snapshot() - failed = [item for item in results if item.get("status") == "failed"] - returned = returned_results(results) - run_summary = summarize(visible, returned, disk_before) - run_summary.update({ - "plannedCandidateCount": len(visible), - "attemptedCount": len(results), - "startedCount": len([item for item in results if item.get("status") == "started"]), - "succeededCount": len([item for item in results if item.get("status") == "succeeded"]), - "failedCount": len(failed), - "actualDiskReclaimBytes": (disk_after["availableBytes"] - disk_before["availableBytes"]) if disk_before and disk_after else None, - "actualDiskReclaim": fmt_bytes(disk_after["availableBytes"] - disk_before["availableBytes"]) if disk_before and disk_after else None, - "targetAfter": target_assessment(disk_after, 0), - "resultCount": len(results), - "returnedResultCount": len(returned), - "omittedResultCount": max(0, len(results) - len(returned)), - }) - payload = { - "ok": len(failed) == 0, - "action": "gc remote run", - "providerId": PROVIDER_ID, - "dryRun": False, - "mutation": True, - "observedAt": now_iso(), - 
"options": OPTIONS, - "diskBefore": disk_before, - "diskAfter": disk_after, - "clusterPreflight": preflight, - "clusterAfter": cluster_preflight(), - "summary": run_summary, - "results": returned, - "protected": protected, - } - emit_json(payload, persist_large=True) - return 0 - -if __name__ == "__main__": - raise SystemExit(main()) -`; + const template = readFileSync(rootPath(GC_REMOTE_RUNNER_RELATIVE_PATH), "utf8"); + if (!template.includes(GC_REMOTE_RUNNER_CONFIG_PLACEHOLDER)) { + throw new Error(`${GC_REMOTE_RUNNER_RELATIVE_PATH} missing ${GC_REMOTE_RUNNER_CONFIG_PLACEHOLDER}`); + } + return template.replace(GC_REMOTE_RUNNER_CONFIG_PLACEHOLDER, configBase64); }