diff --git a/config/unidesk-cli.yaml b/config/unidesk-cli.yaml index ca02ca90..7db62883 100644 --- a/config/unidesk-cli.yaml +++ b/config/unidesk-cli.yaml @@ -106,6 +106,44 @@ gc: limit: 50 resultLimit: 50 full: false + remote: + targets: + JD01: + memoryPressure: + processPatterns: + - chrome + - chromium + - web-probe observe + - playwright + observeStateRoots: + - /root/workspace/hwlab-v03/.state/web-observe + - /root/workspace/hwlab-v03/.state/playwright-cli + staleRunMaxAgeHours: 6 + pvcAttribution: + namespaces: + - agentrun-ci + - agentrun-v02 + - hwlab-ci + - hwlab-dev + - hwlab-prod + - hwlab-v03 + candidateNamespaces: + - agentrun-ci + - hwlab-ci + hwlabNode: JD01 + hwlabLane: v03 + agentrunNode: JD01 + agentrunLane: v02 + limit: 80 + policyTimer: + enabled: true + name: unidesk-jd01-low-risk-gc + onCalendar: daily + randomizedDelaySec: 20min + journalTargetBytes: 256MiB + tmpMinAgeHours: 24 + includeAptCache: true + includeToolCaches: false policyTimer: journald: systemMaxUse: 512MiB diff --git a/scripts/src/gc-remote.ts b/scripts/src/gc-remote.ts index d847c5e1..60ab8f90 100644 --- a/scripts/src/gc-remote.ts +++ b/scripts/src/gc-remote.ts @@ -1,8 +1,11 @@ import { Buffer } from "node:buffer"; +import { existsSync, readFileSync } from "node:fs"; -import { type UniDeskConfig } from "./config"; +import { type UniDeskConfig, rootPath } from "./config"; import { runSshCommandCapture } from "./ssh"; +type RemoteGcAction = "plan" | "snapshot" | "trend" | "run" | "status" | "policy-plan" | "policy-install"; + interface RemoteGcOptions { confirm: boolean; journal: boolean; @@ -55,15 +58,45 @@ const DEFAULT_REMOTE_OPTIONS: RemoteGcOptions = { saveSnapshot: true, }; +const GC_CONFIG_RELATIVE_PATH = "config/unidesk-cli.yaml"; +const GC_REMOTE_CONFIG_REF = `${GC_CONFIG_RELATIVE_PATH}#gc.remote.targets`; + export async function runRemoteGcCommand(config: UniDeskConfig, providerId: string | undefined, action: string | undefined, args: string[]): Promise { if (providerId === undefined || providerId.length === 0) { return { ok: false, error: "gc-remote-provider-required", - usage: "bun scripts/cli.ts gc remote plan|snapshot|trend|run|status [--confirm]", + usage: "bun scripts/cli.ts gc remote plan|snapshot|trend|run|status|policy [--confirm]", }; } const subaction = action ?? "plan"; + if (subaction === "policy") { + const [policyAction = "plan", ...policyArgs] = args; + const options = parseRemoteGcOptions(policyArgs); + if (policyAction === "plan" || policyAction === "render" || policyAction === "dry-run") { + return await runRemoteGc(config, providerId, "policy-plan", options); + } + if (policyAction === "install") { + if (!options.confirm) { + return { + ok: false, + error: "gc-remote-policy-install-requires-confirm", + dryRun: true, + mutation: false, + requiredFlag: "--confirm", + planCommand: `bun scripts/cli.ts gc remote ${providerId} policy plan`, + installCommand: `bun scripts/cli.ts gc remote ${providerId} policy install --confirm`, + }; + } + return await runRemoteGc(config, providerId, "policy-install", options); + } + return { + ok: false, + error: "unsupported-gc-remote-policy-action", + action: policyAction, + supportedActions: ["plan", "render", "dry-run", "install"], + }; + } const options = parseRemoteGcOptions(args); if (subaction === "plan" || subaction === "dry-run") return await runRemoteGc(config, providerId, "plan", options); if (subaction === "snapshot" || subaction === "growth") return await runRemoteGc(config, providerId, "snapshot", options); @@ -87,7 +120,7 @@ export async function runRemoteGcCommand(config: UniDeskConfig, providerId: stri ok: false, error: "unsupported-gc-remote-action", action: subaction, - supportedActions: ["plan", "snapshot", "trend", "run", "status"], + supportedActions: ["plan", "snapshot", "trend", "run", "status", "policy"], }; } @@ -196,8 +229,29 @@ function parseSize(raw: string): number | null { return Number.isFinite(bytes) ? bytes : null; } -async function runRemoteGc(config: UniDeskConfig, providerId: string, action: "plan" | "snapshot" | "trend" | "run" | "status", options: RemoteGcOptions): Promise { - const scriptConfig = Buffer.from(JSON.stringify({ providerId, action, options }), "utf8").toString("base64"); +function yamlRecordOrEmpty(value: unknown, label: string): Record { + if (value === undefined || value === null) return {}; + if (typeof value !== "object" || Array.isArray(value)) throw new Error(`${label} must be a YAML object`); + return value as Record; +} + +function loadRemoteGcTargetConfig(providerId: string): Record { + const configPath = rootPath(GC_CONFIG_RELATIVE_PATH); + if (!existsSync(configPath)) return {}; + const parsed = yamlRecordOrEmpty(Bun.YAML.parse(readFileSync(configPath, "utf8")) as unknown, GC_CONFIG_RELATIVE_PATH); + const gc = yamlRecordOrEmpty(parsed.gc, `${GC_CONFIG_RELATIVE_PATH}#gc`); + const remote = yamlRecordOrEmpty(gc.remote, `${GC_CONFIG_RELATIVE_PATH}#gc.remote`); + const targets = yamlRecordOrEmpty(remote.targets, GC_REMOTE_CONFIG_REF); + const candidates = [providerId, providerId.toUpperCase(), providerId.toLowerCase()]; + for (const key of candidates) { + if (Object.prototype.hasOwnProperty.call(targets, key)) return yamlRecordOrEmpty(targets[key], `${GC_REMOTE_CONFIG_REF}.${key}`); + } + return {}; +} + +async function runRemoteGc(config: UniDeskConfig, providerId: string, action: RemoteGcAction, options: RemoteGcOptions): Promise { + const remoteTarget = loadRemoteGcTargetConfig(providerId); + const scriptConfig = Buffer.from(JSON.stringify({ providerId, action, options, remoteTarget }), "utf8").toString("base64"); const result = await runSshCommandCapture(config, providerId, ["py"], remoteGcPython(scriptConfig)); if (result.exitCode !== 0) { return { @@ -244,6 +298,10 @@ CONFIG = json.loads(base64.b64decode("${configBase64}").decode("utf-8")) PROVIDER_ID = str(CONFIG.get("providerId") or "") ACTION = str(CONFIG.get("action") or "plan") OPTIONS = CONFIG.get("options") or {} +REMOTE_TARGET = CONFIG.get("remoteTarget") if isinstance(CONFIG.get("remoteTarget"), dict) else {} +MEMORY_CONFIG = REMOTE_TARGET.get("memoryPressure") if isinstance(REMOTE_TARGET.get("memoryPressure"), dict) else {} +PVC_CONFIG = REMOTE_TARGET.get("pvcAttribution") if isinstance(REMOTE_TARGET.get("pvcAttribution"), dict) else {} +POLICY_TIMER_CONFIG = REMOTE_TARGET.get("policyTimer") if isinstance(REMOTE_TARGET.get("policyTimer"), dict) else {} TMP_PREFIX_ALLOWLIST = [ "hwlab-agent-", @@ -321,6 +379,60 @@ REMOTE_GC_JOB_DIR = "/tmp/unidesk-gc-remote/jobs" REMOTE_GROWTH_SNAPSHOT_DIR = "/tmp/unidesk-gc-remote/growth-snapshots" REMOTE_STDOUT_JSON_LIMIT = 256 * 1024 +def config_list(cfg, key, default=None): + value = cfg.get(key) if isinstance(cfg, dict) else None + if isinstance(value, list): + return [str(item) for item in value if isinstance(item, (str, int, float)) and str(item)] + return list(default or []) + +def config_bool(cfg, key, default=False): + value = cfg.get(key) if isinstance(cfg, dict) else None + if isinstance(value, bool): + return value + return bool(default) + +def config_int(cfg, key, default=0, minimum=None, maximum=None): + value = cfg.get(key) if isinstance(cfg, dict) else None + try: + parsed = int(value) + except Exception: + parsed = int(default) + if minimum is not None: + parsed = max(int(minimum), parsed) + if maximum is not None: + parsed = min(int(maximum), parsed) + return parsed + +def config_float(cfg, key, default=0.0, minimum=None, maximum=None): + value = cfg.get(key) if isinstance(cfg, dict) else None + try: + parsed = float(value) + except Exception: + parsed = float(default) + if minimum is not None: + parsed = max(float(minimum), parsed) + if maximum is not None: + parsed = min(float(maximum), parsed) + return parsed + +def config_str(cfg, key, default=""): + value = cfg.get(key) if isinstance(cfg, dict) else None + if isinstance(value, str) and value: + return value + return str(default) + +def parse_size_value(value, default=None): + if isinstance(value, (int, float)) and value > 0: + return int(value) + if not isinstance(value, str): + return default + match = re.match(r"^\s*([0-9]+(?:\.[0-9]+)?)\s*(b|k|kb|kib|m|mb|mib|g|gb|gib)?\s*$", value, re.I) + if not match: + return default + unit = (match.group(2) or "b").lower() + mult = 1024**3 if unit in set(["g", "gb", "gib"]) else 1024**2 if unit in set(["m", "mb", "mib"]) else 1024 if unit in set(["k", "kb", "kib"]) else 1 + return int(float(match.group(1)) * mult) + def now_iso(): return time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()) @@ -466,11 +578,7 @@ def emit_json(payload, persist_large=True): def remote_gc_job_status(): job_id = job_id_or_none() if not job_id: - return { - "ok": False, - "error": "gc-remote-status-requires-job-id", - "requiredFlag": "--job-id", - } + return remote_gc_live_status(now_iso(), cluster_preflight()) paths = job_paths(job_id) if not os.path.isfile(paths["state"]): return { @@ -495,6 +603,30 @@ def remote_gc_job_status(): payload["logTail"] = read_file_tail(paths["log"]) return payload +def remote_gc_live_status(observed_at, preflight): + memory_pressure = collect_memory_pressure() + ci_storage = ci_storage_snapshot() + compact_pvc = compact_pvc_attribution(ci_storage) + return { + "ok": True, + "action": "gc remote status", + "providerId": PROVIDER_ID, + "dryRun": True, + "mutation": False, + "observedAt": observed_at, + "disk": df_snapshot(), + "clusterPreflight": preflight, + "memoryPressure": compact_memory_pressure(memory_pressure), + "pvcAttribution": compact_pvc, + "policy": growth_watermark_policy(df_snapshot() or {}), + "next": { + "snapshot": "bun scripts/cli.ts gc remote %s snapshot --history-limit %s" % (PROVIDER_ID, int(OPTIONS.get("historyLimit") or 12)), + "plan": "bun scripts/cli.ts gc remote %s plan --target-use-percent --limit %s" % (PROVIDER_ID, int(OPTIONS.get("limit") or 50)), + "policy": "bun scripts/cli.ts gc remote %s policy plan" % PROVIDER_ID, + "jobStatus": "bun scripts/cli.ts gc remote %s status --job-id " % PROVIDER_ID, + }, + } + def path_size(path): try: if os.path.islink(path) or os.path.isfile(path): @@ -595,6 +727,273 @@ def source_size_item(source_id, label, path, cleanup_owner, timeout=20): "cleanupOwner": cleanup_owner, } +def pid_alive(pid): + try: + pid_int = int(pid) + except Exception: + return False + if pid_int <= 0: + return False + return os.path.exists("/proc/%s" % pid_int) + +def read_json_file(path): + try: + with open(path, "r", encoding="utf-8") as handle: + value = json.load(handle) + return value if isinstance(value, dict) else None + except Exception: + return None + +def read_pid_file(path): + try: + with open(path, "r", encoding="utf-8") as handle: + raw = handle.read().strip() + return int(raw) if re.match(r"^\d+$", raw) else None + except Exception: + return None + +def iso_or_epoch_to_epoch(value): + if value is None: + return None + if isinstance(value, (int, float)): + return float(value) + text_value = str(value).strip() + if not text_value: + return None + for fmt in ["%Y-%m-%dT%H:%M:%SZ", "%Y-%m-%dT%H:%M:%S.%fZ", "%Y-%m-%d %H:%M:%S"]: + try: + return float(calendar.timegm(time.strptime(text_value, fmt))) + except Exception: + pass + return None + +def redact_command_preview(value): + text_value = str(value or "") + text_value = re.sub(r"(?i)(api[_-]?key|token|authorization|password|secret)=\S+", r"\1=", text_value) + text_value = re.sub(r"(?i)(--(?:api-key|token|password|secret))\s+\S+", r"\1 ", text_value) + return text_value[:180] + +def collect_process_pressure(patterns): + result = command(["ps", "-eo", "pid=,ppid=,rss=,comm=,args="], 10) + if result["exitCode"] != 0: + return { + "ok": False, + "error": "ps-failed", + "command": bounded(result), + "processCount": 0, + "rssBytes": 0, + "rows": [], + } + lowered = [(pattern, pattern.lower()) for pattern in patterns] + rows = [] + by_pattern = {} + for line in result["stdout"].splitlines(): + parts = line.strip().split(None, 4) + if len(parts) < 4: + continue + pid, ppid, rss_kib, comm = parts[:4] + args = parts[4] if len(parts) >= 5 else comm + haystack = ("%s %s" % (comm, args)).lower() + matches = [original for original, lowered_pattern in lowered if lowered_pattern and lowered_pattern in haystack] + if not matches: + continue + rss_bytes = safe_int(rss_kib) * 1024 + row = { + "pid": safe_int(pid), + "ppid": safe_int(ppid), + "comm": comm, + "rssBytes": rss_bytes, + "rssHuman": fmt_bytes(rss_bytes), + "matchedPatterns": matches, + "commandPreview": redact_command_preview(args), + } + rows.append(row) + for pattern in matches: + bucket = by_pattern.setdefault(pattern, {"processCount": 0, "rssBytes": 0, "rssHuman": "0 B"}) + bucket["processCount"] += 1 + bucket["rssBytes"] += rss_bytes + bucket["rssHuman"] = fmt_bytes(bucket["rssBytes"]) + rows.sort(key=lambda item: safe_int(item.get("rssBytes")), reverse=True) + total = sum(safe_int(item.get("rssBytes")) for item in rows) + return { + "ok": True, + "patterns": patterns, + "processCount": len(rows), + "rssBytes": total, + "rssHuman": fmt_bytes(total), + "byPattern": by_pattern, + "top": rows[:int(OPTIONS.get("limit") or 50)], + } + +def collect_memory_snapshot(): + result = command(["free", "-b"], 5) + if result["exitCode"] != 0: + return {"ok": False, "error": "free-failed", "command": bounded(result)} + memory = {} + for line in result["stdout"].splitlines(): + parts = line.split() + if parts and parts[0].rstrip(":") == "Mem" and len(parts) >= 7: + memory = { + "totalBytes": safe_int(parts[1]), + "usedBytes": safe_int(parts[2]), + "freeBytes": safe_int(parts[3]), + "availableBytes": safe_int(parts[6]), + "totalHuman": fmt_bytes(parts[1]), + "usedHuman": fmt_bytes(parts[2]), + "availableHuman": fmt_bytes(parts[6]), + } + break + return {"ok": bool(memory), "memory": memory, "command": bounded(result)} + +def observe_run_record(path, stale_hours): + stat = os.stat(path) + heartbeat = read_json_file(os.path.join(path, "heartbeat.json")) or {} + manifest = read_json_file(os.path.join(path, "manifest.json")) or {} + pid = None + for candidate in ["pid", "observer.pid", "browser.pid", "runner.pid"]: + pid = read_pid_file(os.path.join(path, candidate)) + if pid is not None: + break + if pid is None: + for source in [heartbeat, manifest]: + for key in ["pid", "processId", "runnerPid", "browserPid"]: + if source.get(key) is not None: + try: + pid = int(source.get(key)) + break + except Exception: + pass + if pid is not None: + break + timestamp = None + for source in [heartbeat, manifest]: + for key in ["updatedAt", "completedAt", "finishedAt", "stoppedAt", "startedAt", "createdAt"]: + timestamp = iso_or_epoch_to_epoch(source.get(key)) + if timestamp is not None: + break + if timestamp is not None: + break + if timestamp is None: + timestamp = stat.st_mtime + age_hours = max(0.0, (time.time() - timestamp) / 3600.0) + status = heartbeat.get("status") or manifest.get("status") or manifest.get("state") + alive = pid_alive(pid) + terminal = str(status or "").lower() in set(["done", "completed", "complete", "failed", "blocked", "timeout", "timed-out", "stopped", "exited"]) + stale_signal = (not alive) and age_hours >= float(stale_hours) and (terminal or status is None) + return { + "id": os.path.basename(path), + "path": path, + "pid": pid, + "pidAlive": alive, + "status": status, + "ageHours": round(age_hours, 2), + "timestampBasis": "manifest-or-heartbeat" if heartbeat or manifest else "directory-mtime-fallback", + "staleSignal": stale_signal, + "classification": "review-only", + } + +def collect_web_observe_summary(): + roots = config_list(MEMORY_CONFIG, "observeStateRoots", config_list(MEMORY_CONFIG, "webObserveRoots", [])) + stale_hours = config_float(MEMORY_CONFIG, "staleRunMaxAgeHours", 6.0, minimum=0.0) + if not roots: + return { + "ok": True, + "skipped": True, + "reason": "no-yaml-observe-roots", + "configSource": "config/unidesk-cli.yaml#gc.remote.targets.%s.memoryPressure.observeStateRoots" % PROVIDER_ID, + } + root_rows = [] + stale_rows = [] + active_rows = [] + run_count = 0 + total_bytes = 0 + for root in roots: + exists = os.path.isdir(root) + root_size = du_size(root, 15) if exists else None + if root_size is not None: + total_bytes += safe_int(root_size) + row = { + "root": root, + "exists": exists, + "sizeBytes": root_size, + "sizeHuman": fmt_bytes(root_size or 0), + "runCount": 0, + "staleSignalCount": 0, + "activeSignalCount": 0, + } + if exists: + try: + children = [os.path.join(root, name) for name in os.listdir(root)] + except OSError: + children = [] + for child in children: + if not os.path.isdir(child): + continue + try: + record = observe_run_record(child, stale_hours) + except OSError: + continue + row["runCount"] += 1 + run_count += 1 + if record.get("pidAlive"): + row["activeSignalCount"] += 1 + active_rows.append(record) + if record.get("staleSignal"): + row["staleSignalCount"] += 1 + stale_rows.append(record) + root_rows.append(row) + stale_rows.sort(key=lambda item: float(item.get("ageHours") or 0), reverse=True) + active_rows.sort(key=lambda item: safe_int(item.get("pid"))) + return { + "ok": True, + "configSource": "config/unidesk-cli.yaml#gc.remote.targets.%s.memoryPressure" % PROVIDER_ID, + "staleRunMaxAgeHours": stale_hours, + "rootCount": len(root_rows), + "totalBytes": total_bytes, + "totalHuman": fmt_bytes(total_bytes), + "runCount": run_count, + "activeSignalCount": len(active_rows), + "staleSignalCount": len(stale_rows), + "roots": root_rows, + "activeSignals": active_rows[:int(OPTIONS.get("limit") or 50)], + "staleSignals": stale_rows[:int(OPTIONS.get("limit") or 50)], + "policy": "analysis-only; active or stale observe runs must be stopped/retained through controlled observer lifecycle commands, not raw process kill or directory deletion", + } + +def collect_memory_pressure(): + patterns = config_list(MEMORY_CONFIG, "processPatterns", []) + if not patterns: + return { + "ok": True, + "skipped": True, + "reason": "no-yaml-process-patterns", + "configSource": "config/unidesk-cli.yaml#gc.remote.targets.%s.memoryPressure.processPatterns" % PROVIDER_ID, + } + processes = collect_process_pressure(patterns) + observe = collect_web_observe_summary() + return { + "ok": processes.get("ok") is True, + "configSource": "config/unidesk-cli.yaml#gc.remote.targets.%s.memoryPressure" % PROVIDER_ID, + "hostMemory": collect_memory_snapshot(), + "processes": processes, + "webObserve": observe, + "summary": { + "matchedProcessCount": processes.get("processCount"), + "matchedRssBytes": processes.get("rssBytes"), + "matchedRssHuman": processes.get("rssHuman"), + "chromeProcessCount": (processes.get("byPattern") or {}).get("chrome", {}).get("processCount"), + "observerRunCount": observe.get("runCount"), + "activeObserverSignals": observe.get("activeSignalCount"), + "staleObserverSignals": observe.get("staleSignalCount"), + "observeStateBytes": observe.get("totalBytes"), + "observeStateHuman": observe.get("totalHuman"), + }, + "drillDown": { + "processes": "bun scripts/cli.ts gc remote %s snapshot --full --no-save" % PROVIDER_ID, + "status": "bun scripts/cli.ts gc remote %s status --job-id " % PROVIDER_ID, + }, + } + def disk_source_snapshot(): sources = [ source_size_item("hwlab-host-data", "HWLAB host data", "/var/lib/hwlab", "hwlab-registry-retention", 60), @@ -648,7 +1047,53 @@ def pvc_owner_group(namespace, owner): return "hwlab-runtime" return "other" +def parse_k8s_quantity(value): + if value is None: + return None + raw = str(value).strip() + match = re.match(r"^([0-9]+(?:\.[0-9]+)?)(Ki|Mi|Gi|Ti|K|M|G|T)?$", raw) + if not match: + return None + multiplier = { + None: 1, + "K": 1000, + "M": 1000**2, + "G": 1000**3, + "T": 1000**4, + "Ki": 1024, + "Mi": 1024**2, + "Gi": 1024**3, + "Ti": 1024**4, + }.get(match.group(2), 1) + return int(float(match.group(1)) * multiplier) + +def metadata_owner(meta): + refs = meta.get("ownerReferences") or [] + if refs: + first = refs[0] or {} + return first.get("kind"), first.get("name"), [{"kind": item.get("kind"), "name": item.get("name")} for item in refs[:5]] + labels = meta.get("labels") or {} + annotations = meta.get("annotations") or {} + for key in [ + "tekton.dev/pipelineRun", + "tekton.dev/taskRun", + "agentrun.unidesk/run-id", + "hwlab.unidesk/run-id", + "app.kubernetes.io/instance", + ]: + value = labels.get(key) or annotations.get(key) + if value: + return "Label", value, [] + return None, None, [] + def ci_storage_snapshot(): + namespaces = set(config_list(PVC_CONFIG, "namespaces", ["hwlab-ci", "agentrun-ci"])) + candidate_namespaces = set(config_list(PVC_CONFIG, "candidateNamespaces", [])) + hwlab_node = config_str(PVC_CONFIG, "hwlabNode", PROVIDER_ID) + hwlab_lane = config_str(PVC_CONFIG, "hwlabLane", "v03") + agentrun_node = config_str(PVC_CONFIG, "agentrunNode", PROVIDER_ID) + agentrun_lane = config_str(PVC_CONFIG, "agentrunLane", "v02") + limit = config_int(PVC_CONFIG, "limit", int(OPTIONS.get("limit") or 50), minimum=1, maximum=5000) pv_data = kubectl_json(["get", "pv"], 30) or {} pvc_data = kubectl_json(["get", "pvc", "-A"], 30) or {} pod_data = kubectl_json(["get", "pod", "-A"], 30) or {} @@ -678,34 +1123,48 @@ def ci_storage_snapshot(): status = pvc.get("status") or {} ns = str(meta.get("namespace") or "") name = str(meta.get("name") or "") - if ns not in set(["hwlab-ci", "agentrun-ci"]): + if ns not in namespaces: continue volume = str(spec.get("volumeName") or "") pv = pvs.get(volume) or {} pv_spec = pv.get("spec") or {} - owner_refs = meta.get("ownerReferences") or [] - owner_kind = None - owner_name = None - if owner_refs: - owner_kind = owner_refs[0].get("kind") - owner_name = owner_refs[0].get("name") + pv_meta = pv.get("metadata") or {} + owner_kind, owner_name, owner_refs = metadata_owner(meta) + requested = parse_k8s_quantity((((spec.get("resources") or {}).get("requests") or {}).get("storage"))) host_path = pv_host_path(pv) active = sorted(mounts.get((ns, name), [])) estimated = du_size(host_path, 8) if host_path else None + candidate_reasons = [] + if not active: + candidate_reasons.append("no-active-mount-observed") + if status.get("phase") != "Bound": + candidate_reasons.append("pvc-not-bound") + if (pv.get("status") or {}).get("phase") == "Released": + candidate_reasons.append("pv-released") + review_candidate = ns in candidate_namespaces and len(candidate_reasons) > 0 rows.append({ "namespace": ns, "pvc": name, "volume": volume or None, "phase": status.get("phase"), + "pvPhase": (pv.get("status") or {}).get("phase"), "ownerKind": owner_kind, "owner": owner_name, + "ownerRefs": owner_refs, "ownerGroup": pvc_owner_group(ns, owner_name), "storageClass": spec.get("storageClassName") or pv_spec.get("storageClassName"), "reclaimPolicy": pv_spec.get("persistentVolumeReclaimPolicy"), + "requestedBytes": requested, + "requestedHuman": fmt_bytes(requested or 0), "hostPath": host_path, + "pvCreatedAt": (pv_meta.get("creationTimestamp") if isinstance(pv_meta, dict) else None), + "pvcCreatedAt": meta.get("creationTimestamp"), "activeMountPods": active, "estimatedBytes": estimated, "estimatedHuman": fmt_bytes(estimated or 0), + "reviewCandidate": review_candidate, + "reviewReasons": candidate_reasons, + "dryRunOnly": True, }) rows.sort(key=lambda item: safe_int(item.get("estimatedBytes")), reverse=True) by_namespace = {} @@ -717,24 +1176,138 @@ def ci_storage_snapshot(): current["estimatedBytes"] += safe_int(row.get("estimatedBytes")) current["activeMountCount"] += len(row.get("activeMountPods") or []) current["estimatedHuman"] = fmt_bytes(current["estimatedBytes"]) + review_candidates = [row for row in rows if row.get("reviewCandidate")] return { - "scope": "hwlab-ci and agentrun-ci PVCs only", + "scope": "YAML-configured PVC namespaces", + "configSource": "config/unidesk-cli.yaml#gc.remote.targets.%s.pvcAttribution" % PROVIDER_ID, + "namespaces": sorted(namespaces), + "candidateNamespaces": sorted(candidate_namespaces), "pvcCount": len(rows), + "reviewCandidateCount": len(review_candidates), "estimatedBytes": sum(safe_int(row.get("estimatedBytes")) for row in rows), "estimatedHuman": fmt_bytes(sum(safe_int(row.get("estimatedBytes")) for row in rows)), + "requestedBytes": sum(safe_int(row.get("requestedBytes")) for row in rows), + "requestedHuman": fmt_bytes(sum(safe_int(row.get("requestedBytes")) for row in rows)), "byNamespace": by_namespace, "byOwnerGroup": by_owner_group, - "topPvcs": rows[:int(OPTIONS.get("limit") or 50)], + "topPvcs": rows[:limit], + "reviewCandidates": review_candidates[:limit], "handoff": { "hwlab": { - "dryRun": "bun scripts/cli.ts hwlab g14 control-plane cleanup-runs --lane v02 --min-age-minutes 30 --limit 200 --dry-run", - "releasedPvs": "bun scripts/cli.ts hwlab g14 control-plane cleanup-released-pvs --lane all --limit 200 --dry-run", + "dryRun": "bun scripts/cli.ts hwlab nodes control-plane cleanup-runs --node %s --lane %s --min-age-minutes 30 --limit 200 --dry-run" % (hwlab_node, hwlab_lane), + "releasedPvs": "bun scripts/cli.ts hwlab nodes control-plane cleanup-released-pvs --node %s --lane %s --limit 200 --dry-run" % (hwlab_node, hwlab_lane), }, "agentrun": { - "dryRun": "bun scripts/cli.ts agentrun control-plane cleanup-runs --min-age-minutes 30 --limit 200 --dry-run", - "releasedPvs": "bun scripts/cli.ts agentrun control-plane cleanup-released-pvs --limit 200 --dry-run", + "dryRun": "bun scripts/cli.ts agentrun control-plane cleanup-runs --node %s --lane %s --min-age-minutes 30 --limit 200 --dry-run" % (agentrun_node, agentrun_lane), + "releasedPvs": "bun scripts/cli.ts agentrun control-plane cleanup-released-pvs --node %s --lane %s --limit 200 --dry-run" % (agentrun_node, agentrun_lane), }, }, + "policy": "analysis-only; remote GC never deletes PVC/PV/local-path data and only hands off to owner-aware retention commands", + } + +def compact_pvc_row(row): + return { + "namespace": row.get("namespace"), + "pvc": row.get("pvc"), + "volume": row.get("volume"), + "phase": row.get("phase"), + "pvPhase": row.get("pvPhase"), + "ownerKind": row.get("ownerKind"), + "owner": row.get("owner"), + "ownerGroup": row.get("ownerGroup"), + "storageClass": row.get("storageClass"), + "reclaimPolicy": row.get("reclaimPolicy"), + "requestedBytes": row.get("requestedBytes"), + "requestedHuman": row.get("requestedHuman"), + "estimatedBytes": row.get("estimatedBytes"), + "estimatedHuman": row.get("estimatedHuman"), + "activeMountCount": len(row.get("activeMountPods") or []), + "activeMountPods": (row.get("activeMountPods") or [])[:5], + "reviewCandidate": row.get("reviewCandidate"), + "reviewReasons": row.get("reviewReasons"), + "dryRunOnly": True, + } + +def compact_pvc_attribution(payload): + if bool(OPTIONS.get("full")): + return payload + limit = 1 + top = payload.get("topPvcs") or [] + review = payload.get("reviewCandidates") or [] + return { + "scope": payload.get("scope"), + "configSource": payload.get("configSource"), + "namespaces": payload.get("namespaces"), + "candidateNamespaces": payload.get("candidateNamespaces"), + "pvcCount": payload.get("pvcCount"), + "reviewCandidateCount": payload.get("reviewCandidateCount"), + "estimatedBytes": payload.get("estimatedBytes"), + "estimatedHuman": payload.get("estimatedHuman"), + "requestedBytes": payload.get("requestedBytes"), + "requestedHuman": payload.get("requestedHuman"), + "byNamespace": payload.get("byNamespace"), + "byOwnerGroup": payload.get("byOwnerGroup"), + "topPvcs": [compact_pvc_row(row) for row in top[:limit] if isinstance(row, dict)], + "reviewCandidates": [compact_pvc_row(row) for row in review[:limit] if isinstance(row, dict)], + "handoff": payload.get("handoff"), + "policy": payload.get("policy"), + "compacted": True, + "fullDisclosure": "rerun with --full for hostPath, creation timestamps and complete row details", + } + +def compact_ci_storage_summary(payload): + return { + "scope": payload.get("scope"), + "configSource": payload.get("configSource"), + "pvcCount": payload.get("pvcCount"), + "reviewCandidateCount": payload.get("reviewCandidateCount"), + "estimatedBytes": payload.get("estimatedBytes"), + "estimatedHuman": payload.get("estimatedHuman"), + "requestedBytes": payload.get("requestedBytes"), + "requestedHuman": payload.get("requestedHuman"), + "compacted": True, + "fullDisclosure": "use pvcAttribution or --full for row-level details", + } + +def compact_memory_pressure(payload): + if bool(OPTIONS.get("full")): + return payload + processes = payload.get("processes") or {} + observe = payload.get("webObserve") or {} + process_limit = max(1, min(int(OPTIONS.get("limit") or 50), 8)) + signal_limit = max(1, min(int(OPTIONS.get("limit") or 50), 5)) + compact_processes = dict(processes) + compact_processes["top"] = (processes.get("top") or [])[:process_limit] + compact_observe = dict(observe) + compact_observe["activeSignals"] = (observe.get("activeSignals") or [])[:signal_limit] + compact_observe["staleSignals"] = (observe.get("staleSignals") or [])[:signal_limit] + return { + "ok": payload.get("ok"), + "configSource": payload.get("configSource"), + "hostMemory": payload.get("hostMemory"), + "processes": compact_processes, + "webObserve": compact_observe, + "summary": payload.get("summary"), + "drillDown": payload.get("drillDown"), + "compacted": True, + } + +def compact_memory_summary(payload): + observe = payload.get("webObserve") or {} + return { + "ok": payload.get("ok"), + "configSource": payload.get("configSource"), + "summary": payload.get("summary"), + "webObserve": { + "rootCount": observe.get("rootCount"), + "totalBytes": observe.get("totalBytes"), + "totalHuman": observe.get("totalHuman"), + "runCount": observe.get("runCount"), + "activeSignalCount": observe.get("activeSignalCount"), + "staleSignalCount": observe.get("staleSignalCount"), + }, + "compacted": True, + "drillDown": "bun scripts/cli.ts gc remote %s status --limit %s" % (PROVIDER_ID, int(OPTIONS.get("limit") or 50)), } def registry_growth_snapshot(): @@ -809,8 +1382,19 @@ def snapshot_metric_map(snapshot): key = "source.%s.sizeBytes" % item.get("id") metrics[key] = {"value": safe_int(item.get("sizeBytes")), "unit": "bytes", "label": item.get("label") or item.get("id")} storage = ((snapshot.get("ciStorage") or {}).get("byOwnerGroup") or {}) + if not storage: + storage = ((snapshot.get("pvcAttribution") or {}).get("byOwnerGroup") or {}) for owner, value in storage.items(): metrics["ciStorage.%s.estimatedBytes" % owner] = {"value": safe_int((value or {}).get("estimatedBytes")), "unit": "bytes", "label": "CI storage %s" % owner} + memory = snapshot.get("memoryPressure") or {} + memory_summary = memory.get("summary") or {} + if memory_summary.get("matchedRssBytes") is not None: + metrics["memoryPressure.matchedRssBytes"] = {"value": safe_int(memory_summary.get("matchedRssBytes")), "unit": "bytes", "label": "matched observer/chrome RSS"} + if memory_summary.get("observeStateBytes") is not None: + metrics["memoryPressure.observeStateBytes"] = {"value": safe_int(memory_summary.get("observeStateBytes")), "unit": "bytes", "label": "web observe state bytes"} + for key in ["matchedProcessCount", "activeObserverSignals", "staleObserverSignals"]: + if memory_summary.get(key) is not None: + metrics["memoryPressure.%s" % key] = {"value": safe_int(memory_summary.get(key)), "unit": "count", "label": "memory pressure %s" % key} registry = snapshot.get("registry") or {} retention = registry.get("retentionPlan") or {} for key in ["totalTags", "totalRevisions", "deleteTags", "deleteRevisions", "estimatedReclaimBytes"]: @@ -889,11 +1473,53 @@ def growth_trend_payload(points): }, } +def compact_metric_rows(rows, limit=3): + compact = [] + for row in (rows or [])[:limit]: + compact.append({ + "key": row.get("key"), + "label": row.get("label"), + "unit": row.get("unit"), + "delta": row.get("delta"), + "deltaHuman": row.get("deltaHuman"), + "perDayHuman": row.get("perDayHuman"), + }) + return compact + +def compact_trend_payload(payload): + if payload.get("state") == "insufficient-history": + return payload + latest = payload.get("latestDelta") or {} + window = payload.get("windowDelta") or {} + return { + "pointCount": payload.get("pointCount"), + "oldestAt": payload.get("oldestAt"), + "latestAt": payload.get("latestAt"), + "latestDelta": { + "durationSeconds": latest.get("durationSeconds"), + "rateWarning": latest.get("rateWarning"), + "topGrowingBytes": compact_metric_rows(latest.get("topGrowingBytes") or [], 1), + "topShrinkingBytes": compact_metric_rows(latest.get("topShrinkingBytes") or [], 1), + "registryCounters": compact_metric_rows(latest.get("registryCounters") or [], 1), + }, + "windowDelta": { + "durationSeconds": window.get("durationSeconds"), + "rateWarning": window.get("rateWarning"), + "topGrowingBytes": compact_metric_rows(window.get("topGrowingBytes") or [], 1), + "topShrinkingBytes": compact_metric_rows(window.get("topShrinkingBytes") or [], 1), + "registryCounters": compact_metric_rows(window.get("registryCounters") or [], 1), + }, + "fullDisclosure": "rerun trend --full for all metric rows", + } + def compact_growth_point(item): registry = item.get("registry") or {} retention = registry.get("retentionPlan") or {} ci_storage = item.get("ciStorage") or {} containerd = item.get("containerd") or {} + memory = item.get("memoryPressure") or {} + memory_summary = memory.get("summary") or {} + observe = (memory.get("webObserve") or {}) return { "observedAt": item.get("observedAt"), "rootDisk": item.get("rootDisk"), @@ -918,14 +1544,46 @@ def compact_growth_point(item): "state": containerd.get("state"), "cleanupSupported": containerd.get("cleanupSupported"), }, + "memoryPressure": { + "matchedProcessCount": memory_summary.get("matchedProcessCount"), + "matchedRssBytes": memory_summary.get("matchedRssBytes"), + "matchedRssHuman": memory_summary.get("matchedRssHuman"), + "activeObserverSignals": memory_summary.get("activeObserverSignals"), + "staleObserverSignals": memory_summary.get("staleObserverSignals"), + "observeStateBytes": memory_summary.get("observeStateBytes"), + "observeStateHuman": memory_summary.get("observeStateHuman"), + "webObserveRootCount": observe.get("rootCount"), + }, } def collect_growth_snapshot(observed_at, preflight): root_disk = df_snapshot() sources = disk_source_snapshot() ci_storage = ci_storage_snapshot() + memory_pressure = collect_memory_pressure() + compact_pvc = compact_pvc_attribution(ci_storage) + if bool(OPTIONS.get("full")): + public_pvc = ci_storage + public_memory = memory_pressure + else: + public_pvc = compact_ci_storage_summary(ci_storage) + public_memory = compact_memory_summary(memory_pressure) registry = registry_growth_snapshot() containerd = containerd_breakdown_snapshot() + commands = { + "snapshot": "bun scripts/cli.ts gc remote %s snapshot --include-hwlab-registry --history-limit %s" % (PROVIDER_ID, int(OPTIONS.get("historyLimit") or 12)), + "trend": "bun scripts/cli.ts gc remote %s trend --history-limit %s" % (PROVIDER_ID, int(OPTIONS.get("historyLimit") or 12)), + "registryPlan": "bun scripts/cli.ts gc remote %s plan --target-use-percent 70 --include-hwlab-registry --limit 50" % PROVIDER_ID, + "hwlabCiRetention": ((ci_storage.get("handoff") or {}).get("hwlab") or {}).get("dryRun"), + "agentrunRetention": ((ci_storage.get("handoff") or {}).get("agentrun") or {}).get("dryRun"), + "remotePolicy": "bun scripts/cli.ts gc remote %s policy plan" % PROVIDER_ID, + } + if not bool(OPTIONS.get("full")): + commands = { + "trend": "bun scripts/cli.ts gc remote %s trend --history-limit %s" % (PROVIDER_ID, int(OPTIONS.get("historyLimit") or 12)), + "status": "bun scripts/cli.ts gc remote %s status --limit %s" % (PROVIDER_ID, int(OPTIONS.get("limit") or 50)), + "full": "bun scripts/cli.ts gc remote %s snapshot --full --no-save" % PROVIDER_ID, + } return { "ok": True, "action": "gc remote snapshot", @@ -938,16 +1596,11 @@ def collect_growth_snapshot(observed_at, preflight): "clusterPreflight": preflight, "sources": sources, "registry": registry, - "ciStorage": ci_storage, + "pvcAttribution": public_pvc, + "memoryPressure": public_memory, "containerd": containerd, "policy": growth_watermark_policy(root_disk or {}), - "commands": { - "snapshot": "bun scripts/cli.ts gc remote %s snapshot --include-hwlab-registry --history-limit %s" % (PROVIDER_ID, int(OPTIONS.get("historyLimit") or 12)), - "trend": "bun scripts/cli.ts gc remote %s trend --history-limit %s" % (PROVIDER_ID, int(OPTIONS.get("historyLimit") or 12)), - "registryPlan": "bun scripts/cli.ts gc remote %s plan --target-use-percent 70 --include-hwlab-registry --limit 50" % PROVIDER_ID, - "hwlabCiRetention": "bun scripts/cli.ts hwlab g14 control-plane cleanup-runs --lane v02 --min-age-minutes 30 --limit 200 --dry-run", - "agentrunRetention": "bun scripts/cli.ts agentrun control-plane cleanup-runs --min-age-minutes 30 --limit 200 --dry-run", - }, + "commands": commands, } def allocated_file_size(path): @@ -975,6 +1628,9 @@ def df_snapshot(): "sizeBytes": int(parts[1]), "usedBytes": int(parts[2]), "availableBytes": int(parts[3]), + "dfBasisBytes": int(parts[2]) + int(parts[3]), + "reservedBytes": max(0, int(parts[1]) - int(parts[2]) - int(parts[3])), + "usePercentExact": round((int(parts[2]) * 100.0 / (int(parts[2]) + int(parts[3]))) if (int(parts[2]) + int(parts[3])) > 0 else 0.0, 2), "usePercent": int(parts[4].replace("%", "")), "mount": parts[5], } @@ -1980,6 +2636,7 @@ def target_assessment(disk, estimated_reclaim): target = int(raw) size = int(disk.get("sizeBytes") or 0) used = int(disk.get("usedBytes") or 0) + available = int(disk.get("availableBytes") or 0) reclaim = max(0, int(estimated_reclaim or 0)) except Exception: return { @@ -1988,10 +2645,16 @@ def target_assessment(disk, estimated_reclaim): "state": "unavailable", "reason": "invalid-disk-snapshot", } - target_used_bytes = (size * target) // 100 + df_basis = used + available + if df_basis <= 0: + df_basis = size + legacy_target_used_bytes = (size * target) // 100 + legacy_required = max(0, used - legacy_target_used_bytes) + target_used_bytes = (df_basis * target) // 100 required = max(0, used - target_used_bytes) projected_used = max(0, used - reclaim) - projected_use_percent = disk_use_percent(size, projected_used) + projected_use_percent = disk_use_percent(df_basis, projected_used) + legacy_projected_use_percent = disk_use_percent(size, projected_used) enough = reclaim >= required if required == 0: state = "already-below-target" @@ -2007,6 +2670,12 @@ def target_assessment(disk, estimated_reclaim): "ok": required == 0 or enough, "state": state, "currentUsePercent": disk.get("usePercent"), + "currentUsePercentExact": disk.get("usePercentExact"), + "basis": "df-used-over-used-plus-available", + "dfBasisBytes": df_basis, + "dfBasis": fmt_bytes(df_basis), + "reservedBytes": max(0, size - df_basis), + "reserved": fmt_bytes(max(0, size - df_basis)), "currentUsedBytes": used, "currentUsed": fmt_bytes(used), "targetUsedBytes": target_used_bytes, @@ -2022,6 +2691,17 @@ def target_assessment(disk, estimated_reclaim): "projectedUsePercent": projected_use_percent, "safeStop": required > 0 and not enough, "decision": "stop-and-escalate-retention-or-capacity" if required > 0 and not enough else "target-covered-by-safe-candidates", + "legacySizeBasis": { + "basis": "df-size-column-includes-reserved-blocks", + "sizeBytes": size, + "size": fmt_bytes(size), + "targetUsedBytes": legacy_target_used_bytes, + "targetUsed": fmt_bytes(legacy_target_used_bytes), + "requiredReclaimBytes": legacy_required, + "requiredReclaim": fmt_bytes(legacy_required), + "projectedUsePercent": legacy_projected_use_percent, + "note": "informational only; ok/safeStop use the same basis as df Use%", + }, } def summarize(candidates, returned, disk=None): @@ -2162,7 +2842,42 @@ def returned_results(results): def plan_payload(observed_at, preflight, protected, candidates, visible): disk = df_snapshot() - return { + ci_storage = ci_storage_snapshot() + memory_pressure = collect_memory_pressure() + compact_pvc = compact_pvc_attribution(ci_storage) + policy = { + "requiresRunConfirm": True, + "runCommand": "bun scripts/cli.ts gc remote %s run --confirm" % PROVIDER_ID, + "neverTouches": [ + "/var/lib/rancher/k3s", + "/var/lib/rancher/k3s/storage", + "/var/lib/kubelet", + "/var/lib/containerd", + "/var/lib/hwlab unless --include-hwlab-registry is explicitly supplied", + "Kubernetes Deployments/StatefulSets/Secrets/PVCs/PVs", + "HWLAB fixed source workspaces", + "Docker images, containers and volumes", + ], + "notes": [ + "Remote gc only executes the returned candidate page unless --full or a larger --limit is supplied.", + "G14 run requires the expected native k3s node preflight before mutation.", + "HWLAB DEV runtime and local-path PVC data are protected and require HWLAB-specific retention commands.", + "Core dump cleanup only removes untracked /root/unidesk/core. regular files with no active fuser reference.", + "HWLAB registry retention is opt-in: it keeps workload tag/digest refs, all tags newer than the retention age and the newest N tags per repo before official registry garbage-collect.", + "When summary.target.safeStop is true, do not broaden deletion scope; choose registry retention, k3s/containerd image cache maintenance, PVC/runtime retention or capacity expansion explicitly.", + ], + } + if not bool(OPTIONS.get("full")): + policy = { + "requiresRunConfirm": True, + "runCommand": "bun scripts/cli.ts gc remote %s run --confirm" % PROVIDER_ID, + "neverTouches": ["k3s runtime", "PVC/PV/local-path data", "Secrets/auth/config", "Docker volumes/images"], + "notes": [ + "Default plan is compact; rerun with --full for complete policy notes and protected rows.", + "When summary.target.safeStop is true, stop at protected boundaries and choose an owner-aware retention or capacity decision.", + ], + } + payload = { "ok": True, "action": "gc remote plan", "providerId": PROVIDER_ID, @@ -2174,34 +2889,214 @@ def plan_payload(observed_at, preflight, protected, candidates, visible): "clusterPreflight": preflight, "summary": summarize(candidates, visible, disk), "candidates": visible, - "protected": protected, + "protected": protected if bool(OPTIONS.get("full")) else protected[:3], + "policy": policy, + } + if bool(OPTIONS.get("full")): + payload.update({ + "memoryPressure": memory_pressure, + "pvcAttribution": ci_storage, + "ciStorage": ci_storage, + }) + else: + payload["pressureSummary"] = { + "memory": (compact_memory_pressure(memory_pressure).get("summary") if isinstance(compact_memory_pressure(memory_pressure), dict) else None), + "pvc": { + "pvcCount": compact_pvc.get("pvcCount"), + "reviewCandidateCount": compact_pvc.get("reviewCandidateCount"), + "estimatedBytes": compact_pvc.get("estimatedBytes"), + "estimatedHuman": compact_pvc.get("estimatedHuman"), + "byNamespace": compact_pvc.get("byNamespace"), + "handoff": compact_pvc.get("handoff"), + }, + "drillDown": "bun scripts/cli.ts gc remote %s status --limit %s" % (PROVIDER_ID, int(OPTIONS.get("limit") or 50)), + } + return payload + +def safe_unit_name(value): + raw = str(value or "").strip().lower() + raw = re.sub(r"[^a-z0-9_.@-]+", "-", raw).strip("-") + if not raw: + raw = "unidesk-%s-low-risk-gc" % re.sub(r"[^a-z0-9]+", "-", PROVIDER_ID.lower()).strip("-") + return raw[:80] + +def render_remote_policy(): + unit_name = safe_unit_name(config_str(POLICY_TIMER_CONFIG, "name", "unidesk-%s-low-risk-gc" % PROVIDER_ID.lower())) + on_calendar = config_str(POLICY_TIMER_CONFIG, "onCalendar", "daily") + randomized_delay_sec = config_str(POLICY_TIMER_CONFIG, "randomizedDelaySec", "15min") + journal_target = parse_size_value(POLICY_TIMER_CONFIG.get("journalTargetBytes"), int(OPTIONS.get("journalTargetBytes") or 536870912)) + tmp_min_age_hours = config_float(POLICY_TIMER_CONFIG, "tmpMinAgeHours", float(OPTIONS.get("tmpMinAgeHours") or 24), minimum=0.0) + include_apt_cache = config_bool(POLICY_TIMER_CONFIG, "includeAptCache", bool(OPTIONS.get("aptCache", True))) + include_tool_caches = config_bool(POLICY_TIMER_CONFIG, "includeToolCaches", False) + script_path = "/usr/local/sbin/%s.sh" % unit_name + service_path = "/etc/systemd/system/%s.service" % unit_name + timer_path = "/etc/systemd/system/%s.timer" % unit_name + tool_paths = [item["path"] for item in TOOL_CACHE_ALLOWLIST] if include_tool_caches else [] + script = "\n".join([ + "#!/bin/sh", + "set -eu", + "umask 077", + "journalctl --vacuum-size=%s >/dev/null 2>&1 || true" % int(journal_target), + "apt-get clean >/dev/null 2>&1 || true" if include_apt_cache else ": apt cache disabled by YAML", + "python3 - <<'PY'", + "import json, os, shutil, time", + "prefixes = json.loads(%r)" % json.dumps(TMP_PREFIX_ALLOWLIST), + "protected = set(json.loads(%r))" % json.dumps(sorted(TMP_EXACT_PROTECT)), + "tool_paths = json.loads(%r)" % json.dumps(tool_paths), + "cutoff = time.time() - float(%r) * 3600.0" % tmp_min_age_hours, + "for name in os.listdir('/tmp'):", + " path = os.path.join('/tmp', name)", + " if path in protected or not any(name.startswith(prefix) for prefix in prefixes):", + " continue", + " try:", + " stat = os.lstat(path)", + " except OSError:", + " continue", + " if stat.st_mtime >= cutoff:", + " continue", + " if os.path.isdir(path) and not os.path.islink(path):", + " shutil.rmtree(path, ignore_errors=True)", + " elif os.path.exists(path):", + " try:", + " os.unlink(path)", + " except FileNotFoundError:", + " pass", + "for path in tool_paths:", + " resolved = os.path.abspath(path)", + " if resolved != path or os.path.islink(resolved) or resolved in ['/', '/root', '/root/.npm', '/root/.bun']:", + " continue", + " if os.path.isdir(resolved):", + " shutil.rmtree(resolved, ignore_errors=True)", + " elif os.path.exists(resolved):", + " try:", + " os.unlink(resolved)", + " except FileNotFoundError:", + " pass", + "PY", + "", + ]) + service = "\n".join([ + "[Unit]", + "Description=UniDesk remote low-risk GC for %s" % PROVIDER_ID, + "Documentation=config/unidesk-cli.yaml#gc.remote.targets.%s.policyTimer" % PROVIDER_ID, + "", + "[Service]", + "Type=oneshot", + "ExecStart=%s" % script_path, + "Nice=10", + "IOSchedulingClass=best-effort", + "IOSchedulingPriority=7", + "", + ]) + timer = "\n".join([ + "[Unit]", + "Description=UniDesk remote low-risk GC timer for %s" % PROVIDER_ID, + "", + "[Timer]", + "OnCalendar=%s" % on_calendar, + "RandomizedDelaySec=%s" % randomized_delay_sec, + "Persistent=true", + "", + "[Install]", + "WantedBy=timers.target", + "", + ]) + return { + "unitName": unit_name, + "scriptPath": script_path, + "servicePath": service_path, + "timerPath": timer_path, + "onCalendar": on_calendar, + "randomizedDelaySec": randomized_delay_sec, + "journalTargetBytes": int(journal_target), + "journalTarget": fmt_bytes(journal_target), + "tmpMinAgeHours": tmp_min_age_hours, + "includeAptCache": include_apt_cache, + "includeToolCaches": include_tool_caches, + "script": script, + "service": service, + "timer": timer, + } + +def remote_policy_plan_payload(observed_at): + rendered = render_remote_policy() + return { + "ok": True, + "action": "gc remote policy plan", + "providerId": PROVIDER_ID, + "dryRun": True, + "mutation": False, + "observedAt": observed_at, + "configSource": "config/unidesk-cli.yaml#gc.remote.targets.%s.policyTimer" % PROVIDER_ID, + "enabled": config_bool(POLICY_TIMER_CONFIG, "enabled", False), + "timer": {key: rendered.get(key) for key in ["unitName", "scriptPath", "servicePath", "timerPath", "onCalendar", "randomizedDelaySec", "journalTargetBytes", "journalTarget", "tmpMinAgeHours", "includeAptCache", "includeToolCaches"]}, + "scriptPreview": "\n".join(rendered["script"].splitlines()[:20]), + "servicePreview": rendered["service"], + "timerPreview": rendered["timer"], + "installCommand": "bun scripts/cli.ts gc remote %s policy install --confirm" % PROVIDER_ID, "policy": { - "requiresRunConfirm": True, - "runCommand": "bun scripts/cli.ts gc remote %s run --confirm" % PROVIDER_ID, + "risk": "low", "neverTouches": [ - "/var/lib/rancher/k3s", - "/var/lib/rancher/k3s/storage", - "/var/lib/kubelet", - "/var/lib/containerd", - "/var/lib/hwlab unless --include-hwlab-registry is explicitly supplied", - "Kubernetes Deployments/StatefulSets/Secrets/PVCs/PVs", - "HWLAB fixed source workspaces", - "Docker images, containers and volumes", - ], - "notes": [ - "Remote gc only executes the returned candidate page unless --full or a larger --limit is supplied.", - "G14 run requires the expected native k3s node preflight before mutation.", - "HWLAB DEV runtime and local-path PVC data are protected and require HWLAB-specific retention commands.", - "Core dump cleanup only removes untracked /root/unidesk/core. regular files with no active fuser reference.", - "HWLAB registry retention is opt-in: it keeps workload tag/digest refs, all tags newer than the retention age and the newest N tags per repo before official registry garbage-collect.", - "When summary.target.safeStop is true, do not broaden deletion scope; choose registry retention, k3s/containerd image cache maintenance, PVC/runtime retention or capacity expansion explicitly.", + "k3s runtime directories", + "PVC/PV/local-path data", + "Docker images, containers, volumes or Docker build cache", + "Secret/auth/config state", + "active Web observe runners or Chrome processes", ], + "toolCaches": "disabled unless config/unidesk-cli.yaml enables includeToolCaches for this remote target", + }, + } + +def remote_policy_install_payload(observed_at): + rendered = render_remote_policy() + try: + with open(rendered["scriptPath"], "w", encoding="utf-8") as handle: + handle.write(rendered["script"]) + os.chmod(rendered["scriptPath"], 0o755) + with open(rendered["servicePath"], "w", encoding="utf-8") as handle: + handle.write(rendered["service"]) + with open(rendered["timerPath"], "w", encoding="utf-8") as handle: + handle.write(rendered["timer"]) + daemon = command(["systemctl", "daemon-reload"], 30) + enable = command(["systemctl", "enable", "--now", "%s.timer" % rendered["unitName"]], 30) + status = command(["systemctl", "show", "%s.timer" % rendered["unitName"], "--property=LoadState,ActiveState,SubState,NextElapseUSecRealtime,LastTriggerUSec"], 10) + except Exception as exc: + return { + "ok": False, + "action": "gc remote policy install", + "providerId": PROVIDER_ID, + "dryRun": False, + "mutation": True, + "observedAt": observed_at, + "error": "policy-install-failed", + "message": str(exc), + } + ok = daemon.get("exitCode") == 0 and enable.get("exitCode") == 0 + return { + "ok": ok, + "action": "gc remote policy install", + "providerId": PROVIDER_ID, + "dryRun": False, + "mutation": True, + "observedAt": observed_at, + "configSource": "config/unidesk-cli.yaml#gc.remote.targets.%s.policyTimer" % PROVIDER_ID, + "timer": {key: rendered.get(key) for key in ["unitName", "scriptPath", "servicePath", "timerPath", "onCalendar", "randomizedDelaySec", "journalTargetBytes", "journalTarget", "tmpMinAgeHours", "includeAptCache", "includeToolCaches"]}, + "systemd": { + "daemonReload": bounded(daemon), + "enableNow": bounded(enable), + "status": bounded(status), }, } def main(): observed_at = now_iso() preflight = cluster_preflight() + if ACTION == "policy-plan": + emit_json(remote_policy_plan_payload(observed_at), persist_large=False) + return 0 + if ACTION == "policy-install": + emit_json(remote_policy_install_payload(observed_at), persist_large=False) + return 0 if ACTION == "trend": history_limit = int(OPTIONS.get("historyLimit") or 12) history = read_growth_snapshots(history_limit) @@ -2232,15 +3127,21 @@ def main(): history = read_growth_snapshots(history_limit) if not bool(OPTIONS.get("saveSnapshot", True)): history = history + [snapshot] + trend_payload = growth_trend_payload(history[-history_limit:]) + recent_history = history[-min(len(history), 3):] + if not bool(OPTIONS.get("full")): + trend_payload = compact_trend_payload(trend_payload) + recent_history = history[-min(len(history), 1):] snapshot.update({ "statePath": state_path, "historyLimit": history_limit, "saved": bool(OPTIONS.get("saveSnapshot", True)), - "trend": growth_trend_payload(history[-history_limit:]), + "trend": trend_payload, "history": { "totalPointCount": len(read_growth_snapshots(1000000)) if bool(OPTIONS.get("saveSnapshot", True)) else len(history), - "returnedPointCount": len(history[-min(len(history), 3):]), - "recentPoints": history[-min(len(history), 3):] if bool(OPTIONS.get("full")) else [compact_growth_point(item) for item in history[-min(len(history), 3):]], + "returnedPointCount": len(recent_history) if bool(OPTIONS.get("full")) else 0, + "recentPoints": recent_history if bool(OPTIONS.get("full")) else [], + "drillDown": "bun scripts/cli.ts gc remote %s trend --history-limit %s" % (PROVIDER_ID, history_limit), }, }) emit_json(snapshot, persist_large=True) diff --git a/scripts/src/hwlab-node-web-sentinel-cicd.ts b/scripts/src/hwlab-node-web-sentinel-cicd.ts index 773bb842..7ffb91b6 100644 --- a/scripts/src/hwlab-node-web-sentinel-cicd.ts +++ b/scripts/src/hwlab-node-web-sentinel-cicd.ts @@ -1729,6 +1729,9 @@ function probeArgoApplication(state: SentinelCicdState, timeoutSeconds: number, const revision = nonEmptyString(revisionRaw); const revisionMatches = expectedRevision === null || revision === expectedRevision; const healthy = result.exitCode === 0 && syncStatus === "Synced" && healthStatus === "Healthy"; + const diagnostics = result.exitCode === 0 && !healthy + ? probeArgoApplicationDiagnostics(state, timeoutSeconds, namespace, applicationName) + : null; return { ok: healthy, present: result.exitCode === 0, @@ -1741,10 +1744,80 @@ function probeArgoApplication(state: SentinelCicdState, timeoutSeconds: number, warning: healthy && !revisionMatches ? "Argo app is Synced/Healthy but status.sync.revision differs from current GitOps branch HEAD; in multi-sentinel GitOps this can happen when another sentinel path advances the branch. Runtime image/manifest checks remain authoritative for rollout readiness." : null, + diagnostics, result: compactCommand(result), }; } +function probeArgoApplicationDiagnostics(state: SentinelCicdState, timeoutSeconds: number, namespace: string, applicationName: string): Record { + const route = stringAt(state.controlPlaneNode, "kubeRoute"); + const appResult = runCommand(["trans", route, "kubectl", "-n", namespace, "get", "application", applicationName, "-o", "json"], repoRoot, { timeoutMs: Math.min(timeoutSeconds, 60) * 1000 }); + const parsed = parseJsonObject(appResult.stdout); + const status = record(parsed?.status); + const resourcesRaw = Array.isArray(status.resources) ? status.resources : []; + const resources = resourcesRaw + .filter((item): item is Record => typeof item === "object" && item !== null && !Array.isArray(item)) + .map((item) => { + const health = record(item.health); + return { + kind: item.kind ?? null, + namespace: item.namespace ?? null, + name: item.name ?? null, + status: item.status ?? null, + healthStatus: health.status ?? null, + healthMessage: short(health.message), + }; + }); + const problemResources = resources + .filter((item) => { + const sync = nonEmptyString(item.status); + const health = nonEmptyString(item.healthStatus); + return (sync !== null && sync !== "Synced") || (health !== null && health !== "Healthy"); + }) + .slice(0, 12); + const conditions = (Array.isArray(status.conditions) ? status.conditions : []) + .filter((item): item is Record => typeof item === "object" && item !== null && !Array.isArray(item)) + .slice(-8) + .map((item) => ({ + type: item.type ?? null, + message: short(item.message), + lastTransitionTime: item.lastTransitionTime ?? null, + })); + const operationState = record(status.operationState); + const eventResult = runCommand(["trans", route, "kubectl", "-n", namespace, "get", "events", "--field-selector", `involvedObject.name=${applicationName}`, "--sort-by=.lastTimestamp", "-o", "json"], repoRoot, { timeoutMs: Math.min(timeoutSeconds, 60) * 1000 }); + const eventsJson = parseJsonObject(eventResult.stdout); + const events = (Array.isArray(eventsJson?.items) ? eventsJson.items : []) + .filter((item): item is Record => typeof item === "object" && item !== null && !Array.isArray(item)) + .slice(-8) + .map((item) => ({ + type: item.type ?? null, + reason: item.reason ?? null, + message: short(item.message), + count: item.count ?? null, + lastTimestamp: item.lastTimestamp ?? item.eventTime ?? null, + })); + return { + ok: appResult.exitCode === 0, + resourceCount: resources.length, + problemResourceCount: problemResources.length, + problemResources, + conditions, + operationState: { + phase: operationState.phase ?? null, + message: short(operationState.message), + startedAt: operationState.startedAt ?? null, + finishedAt: operationState.finishedAt ?? null, + }, + events, + result: compactCommand(appResult), + eventsResult: compactCommand(eventResult), + drillDown: { + application: `trans ${route} kubectl -n ${namespace} get application ${applicationName} -o json`, + events: `trans ${route} kubectl -n ${namespace} get events --field-selector involvedObject.name=${applicationName} --sort-by=.lastTimestamp`, + }, + }; +} + function probeGitopsRuntimeManifest(state: SentinelCicdState, timeoutSeconds: number): Record { const namespace = stringAt(state.cicd, "builder.namespace"); const repository = stringAt(state.controlPlaneTarget, "source.repository"); @@ -3857,7 +3930,15 @@ function observedDetail(name: string, item: Record): string { if (name === "registry") return `${record(item.probe).present === true ? "present" : "missing"} ${short(record(item.probe).digest)}`; if (name === "git-mirror" && item.skipped === true) return `${item.reason ?? "skipped"}`; if (name === "gitops") return `${short(item.revision)} image=${short(item.image)}`; - if (name === "argo") return `${item.syncStatus ?? "-"} ${item.healthStatus ?? "-"} ${short(item.revision)}/${short(item.expectedRevision)}`; + if (name === "argo") { + const diagnostics = record(item.diagnostics); + const problems = Array.isArray(diagnostics.problemResources) ? diagnostics.problemResources : []; + const first = problems.find((entry) => typeof entry === "object" && entry !== null && !Array.isArray(entry)) as Record | undefined; + const problemText = Number(diagnostics.problemResourceCount ?? 0) > 0 + ? ` degraded=${diagnostics.problemResourceCount}:${first?.kind ?? "-"} ${first?.namespace ?? "-"}/${first?.name ?? "-"} ${first?.healthStatus ?? first?.status ?? "-"}` + : ""; + return `${item.syncStatus ?? "-"} ${item.healthStatus ?? "-"} ${short(item.revision)}/${short(item.expectedRevision)}${problemText}`; + } if (name === "runtime") { const probe = record(item.probe); const deployment = record(probe.deployment);