From ab3566435ced8780215b48eb7a1160f324c87cf8 Mon Sep 17 00:00:00 2001 From: Codex Date: Sun, 5 Jul 2026 04:18:22 +0000 Subject: [PATCH] fix: add JD01 GC retention controls --- AGENTS.md | 2 +- config/agentrun.yaml | 9 +- config/unidesk-cli.yaml | 25 +- docs/reference/devops-hygiene.md | 2 + docs/reference/gc.md | 8 + scripts/src/agentrun-lanes.ts | 18 + scripts/src/agentrun/cleanup-session-pvcs.mjs | 102 ++ scripts/src/agentrun/control-plane.ts | 14 +- scripts/src/agentrun/entry.ts | 10 +- scripts/src/agentrun/options.ts | 7 + scripts/src/agentrun/yaml-lane.ts | 51 +- scripts/src/gc-remote-containerd.py | 386 +++++ scripts/src/gc-remote-growth.py | 292 ++++ scripts/src/gc-remote-pvc.py | 383 +++++ scripts/src/gc-remote-registry.py | 677 +++++++++ scripts/src/gc-remote-runner.py | 1340 ++--------------- scripts/src/gc-remote-web-observe.py | 57 + scripts/src/gc-remote.ts | 62 +- 18 files changed, 2205 insertions(+), 1240 deletions(-) create mode 100644 scripts/src/agentrun/cleanup-session-pvcs.mjs create mode 100644 scripts/src/gc-remote-containerd.py create mode 100644 scripts/src/gc-remote-growth.py create mode 100644 scripts/src/gc-remote-pvc.py create mode 100644 scripts/src/gc-remote-registry.py create mode 100644 scripts/src/gc-remote-web-observe.py diff --git a/AGENTS.md b/AGENTS.md index 8ee4b8d5..a9e344cd 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -4,7 +4,7 @@ UniDesk 是一个以主 server 为统一入口的分布式工作平台。本文 ## P0: 文件体积与脚本分流 -- P0: 任何源码/CLI 文件超过 3000 行必须先按职责拆分再继续,禁止继续追加绕过。 +- P0: 任何源码/CLI 文件超过 3000 行必须先按职责差分拆到 2000 行以下再继续,禁止卡在 3000 行边界反复触发。 - P0: 禁止把 shell/Node/Python 等脚本作为大段字符串内嵌;脚本必须放入原生后缀文件(如 `.sh`/`.mjs`/`.py`)并从文件加载。 ## P0: 主 worktree 同步提交第一原则 diff --git a/config/agentrun.yaml b/config/agentrun.yaml index c2f73d23..1b577f5d 100644 --- a/config/agentrun.yaml +++ b/config/agentrun.yaml @@ -719,7 +719,7 @@ controlPlane: - hyueapi.com - .hyueapi.com retention: - maxRunners: 20 + maxRunners: 3 cleanupOrder: oldest-inactive-last-active-first activeHeartbeatMaxAgeMs: 900000 selectors: @@ -734,6 +734,13 @@ controlPlane: ageBasedCleanup: enabled: false maxAgeHours: 48 + sessionPvcRetention: + enabled: true + prefixes: + - agentrun-v01-session- + - agentrun-v02-session- + - agentrun-jd01-v02-session- + maxDeletePerRun: 1000 cancelLifecycle: deliveryMode: manager-epoch gracefulAbortMs: 15000 diff --git a/config/unidesk-cli.yaml b/config/unidesk-cli.yaml index 7db62883..d5319700 100644 --- a/config/unidesk-cli.yaml +++ b/config/unidesk-cli.yaml @@ -133,8 +133,31 @@ gc: hwlabNode: JD01 hwlabLane: v03 agentrunNode: JD01 - agentrunLane: v02 + agentrunLane: jd01-v02 limit: 80 + containerdImageCache: + enabled: true + runtimeEndpoint: unix:///run/k3s/containerd/containerd.sock + namespace: k8s.io + ciNamespaces: + - hwlab-ci + - agentrun-ci + hostContainerdCache: + enabled: true + root: /var/lib/containerd + address: /run/containerd/containerd.sock + namespaces: + - default + orphanCleanup: + enabled: true + overlaySnapshotsRoot: /var/lib/containerd/io.containerd.snapshotter.v1.overlayfs/snapshots + contentBlobRoot: /var/lib/containerd/io.containerd.content.v1.content/blobs/sha256 + localPathStorage: + enabled: true + root: /var/lib/rancher/k3s/storage + orphanDirPrefixes: + - pvc- + orphanMinAgeMinutes: 0 policyTimer: enabled: true name: unidesk-jd01-low-risk-gc diff --git a/docs/reference/devops-hygiene.md b/docs/reference/devops-hygiene.md index 635ae740..5b27f74a 100644 --- a/docs/reference/devops-hygiene.md +++ b/docs/reference/devops-hygiene.md @@ -14,6 +14,8 @@ Local worktrees, D601 runtime files, copied scripts, copied images, ad-hoc Kuber When stable release lanes such as `release/v1` are enabled, the desired-state ref must be explicit in the command, job log and deploy output. Until that support exists, commands that are documented to read `origin/master:deploy.json` must keep doing so and must not silently switch to another branch or a dirty manifest. +Source and CLI files must not be kept near the 3000-line split boundary. Once a file exceeds 3000 lines, split it by responsibility until the original file is below 2000 lines before continuing feature or fix work. Do not make token-preserving micro-edits that leave the file just under or exactly at 3000 lines; that only guarantees the next small change will trigger the same split problem again. + ## Prohibited Deployment Truth The following practices are not acceptable as the long-term or hidden source of a working environment: diff --git a/docs/reference/gc.md b/docs/reference/gc.md index b5e0c03d..a892a5b7 100644 --- a/docs/reference/gc.md +++ b/docs/reference/gc.md @@ -93,6 +93,14 @@ JD01 远端 plan 必须适配短连接:`snapshot` 和轻量 `plan` 返回有 JD01 PVC 归因必须按 YAML 配置的 namespace 集合读取 k8s API,不得复用 G14 专属 namespace 硬编码。报告至少包含 namespace、PVC、PV、host path、requested size、estimated actual bytes、active mount pods、owner/session/PipelineRun/runId、phase 和 reclaim policy。默认只做 plan 和归因;删除 PVC/PV、local-path host path、k3s storage、containerd snapshot/blob 或 workload 对象必须通过对应高层 retention 子命令和 GitOps/运行面 owner 判定,不能由 remote GC 扩大成 raw `kubectl delete` 或 host path 删除。 +JD01/AgentRun 这类 PVC retention 确认入口必须适配短连接:确认步骤只提交经过 plan 选中的 Kubernetes 删除请求并快速返回,不能等待 local-path PV 后端同步回收完成;收敛状态通过下一次 dry-run、`gc remote JD01 status` 或专用 status 子命令查询。若一次提交在 transport 窗口内仍不稳定,应降低 YAML/CLI 批量,而不是改成手工 raw kubectl 或 host path 删除。 + +JD01 local-path storage 中没有 PV 引用的 orphan 目录只能通过 `gc remote JD01 plan|run --include-local-path-orphans` 进入候选。该入口必须从 YAML 读取 storage root、目录前缀 allowlist 和年龄策略,只允许删除 root 的直接子目录,且执行前重新确认无 PV 引用、无 symlink、无打开 fd/cwd;不得把它扩大成通用 `/var/lib/rancher/k3s/storage` 清空或 raw host path 删除。 + +JD01 host containerd 只能通过 `gc remote JD01 plan|run --include-host-containerd-cache` 进入候选。该入口必须从 YAML 读取 containerd root、socket address 和 namespace allowlist;只有 host containerd 目标 namespace 中没有 task/container 时才允许执行 `ctr images prune --all`,不得直接删除 `/var/lib/containerd` 下的 content、snapshot 或 metadata 路径。 + +当 host containerd 的 `ctr` 元数据中 images、containers、tasks、leases、snapshots 和 content 全为空,但 YAML allowlist 下仍残留 overlay snapshot 目录或 content blob 文件时,才能把它们分类为 orphan state。orphan state 清理仍必须通过 `--include-host-containerd-cache` 的 plan/run,执行前重新检查元数据为空、路径在 YAML root 下、名称匹配受控形态、无 symlink、无打开 fd/cwd;不得删除 metadata DB 或扩大到 containerd root。 + JD01 Web observe artifact 是一等 GC 对象。state root 必须来自 YAML;候选按 run 聚合并读取 `manifest.json`、`heartbeat.json`、`pid`、report sha 和 top files。年龄判定以 manifest/heartbeat 的 started/completed/updated 字段、pid 存活和打开 fd 检查为准,不以目录 mtime 为唯一依据,因为手动 GC 或目录遍历可能刷新 mtime。active run、pid alive、open fd、未生成必要 report 的 run 均为 protected。safe 候选只覆盖超过 YAML retention 且可重建的 raw samples、browser-process、network/trace、screenshot 等大 artifact;长期保留 report summary、report json/md、最终截图或诊断摘要由 YAML cap/retention 策略控制。 JD01 Chrome 内存治理应优先管理 observer runner 生命周期,而不是孤立清理 Chrome 进程。Web probe sentinel 和 quick-verify 启动 observer 后,所有终态路径(成功、blocked、失败、timeout、异常)都必须执行 YAML 控制的 `web-probe observe stop`/force stop 流程,并验证对应 runner/Chrome process tree 退出;observe runner 自身也必须从 scenario/YAML 获得最大运行时长或 max samples 兜底,即使调用方退出也会停止采样并关闭 browser。browser freeze policy 只能作为异常保护,不替代正常任务生命周期结束后的 stop。 diff --git a/scripts/src/agentrun-lanes.ts b/scripts/src/agentrun-lanes.ts index 8001b8f8..6ac68b7b 100644 --- a/scripts/src/agentrun-lanes.ts +++ b/scripts/src/agentrun-lanes.ts @@ -223,6 +223,11 @@ export interface AgentRunRunnerRetentionSpec { readonly enabled: boolean; readonly maxAgeHours: number | null; }; + readonly sessionPvcRetention: { + readonly enabled: boolean; + readonly prefixes: readonly string[]; + readonly maxDeletePerRun: number; + }; } export type AgentRunCancelLifecycleStage = "accepted" | "persisted" | "delivered" | "aborting" | "terminalized" | "fenced" | "late-write-rejected"; @@ -701,6 +706,14 @@ function parseCancelLifecycleStages(input: unknown, path: string): readonly Agen function parseRunnerRetention(input: Record, path: string): AgentRunRunnerRetentionSpec { const selectors = recordField(input, "selectors", path); const ageBasedCleanup = recordField(input, "ageBasedCleanup", path); + const sessionPvcRetentionRaw = input.sessionPvcRetention; + const sessionPvcRetention = typeof sessionPvcRetentionRaw === "object" && sessionPvcRetentionRaw !== null && !Array.isArray(sessionPvcRetentionRaw) + ? sessionPvcRetentionRaw as Record + : {}; + const sessionPvcPrefixes = sessionPvcRetention.prefixes === undefined ? [] : stringArrayField(sessionPvcRetention, "prefixes", `${path}.sessionPvcRetention`); + for (const [index, prefix] of sessionPvcPrefixes.entries()) { + if (!/^[a-z0-9]([-a-z0-9]*[a-z0-9-])?$/u.test(prefix)) throw new Error(`${path}.sessionPvcRetention.prefixes[${index}] must be a lowercase Kubernetes PVC name prefix`); + } return { maxRunners: positiveIntegerField(input, "maxRunners", path), cleanupOrder: enumField(input, "cleanupOrder", path, ["oldest-inactive-last-active-first"]), @@ -716,6 +729,11 @@ function parseRunnerRetention(input: Record, path: string): Age enabled: booleanField(ageBasedCleanup, "enabled", `${path}.ageBasedCleanup`), maxAgeHours: optionalPositiveIntegerField(ageBasedCleanup, "maxAgeHours", `${path}.ageBasedCleanup`) ?? null, }, + sessionPvcRetention: { + enabled: sessionPvcRetention.enabled === undefined ? false : booleanField(sessionPvcRetention, "enabled", `${path}.sessionPvcRetention`), + prefixes: sessionPvcPrefixes, + maxDeletePerRun: optionalPositiveIntegerField(sessionPvcRetention, "maxDeletePerRun", `${path}.sessionPvcRetention`) ?? 100, + }, }; } diff --git a/scripts/src/agentrun/cleanup-session-pvcs.mjs b/scripts/src/agentrun/cleanup-session-pvcs.mjs new file mode 100644 index 00000000..13a9c456 --- /dev/null +++ b/scripts/src/agentrun/cleanup-session-pvcs.mjs @@ -0,0 +1,102 @@ +import { execFileSync, spawnSync } from "node:child_process"; + +function runJson(args) { + return JSON.parse(execFileSync("kubectl", args, { encoding: "utf8", maxBuffer: 32 * 1024 * 1024 })); +} + +function duBytes(path) { + if (!path || !path.startsWith("/var/lib/rancher/k3s/storage/")) return null; + const result = spawnSync("du", ["-sb", path], { encoding: "utf8", timeout: 8000 }); + if (result.status !== 0) return null; + const value = Number(result.stdout.trim().split(/\s+/u)[0]); + return Number.isFinite(value) ? value : null; +} + +const namespace = process.env.NAMESPACE; +const confirm = process.env.CONFIRM === "true"; +const enabled = process.env.ENABLED === "true"; +const limit = Math.max(1, Math.min(Number(process.env.LIMIT || "100"), 1000)); +const prefixes = JSON.parse(Buffer.from(process.env.PREFIXES_JSON_B64 || "W10=", "base64").toString("utf8")); + +if (!enabled) { + console.log(JSON.stringify({ ok: false, error: "session-pvc-retention-disabled", selectedPvcCount: 0, mutation: false })); + process.exit(0); +} +if (!namespace || !Array.isArray(prefixes) || prefixes.length === 0) throw new Error("session PVC cleanup requires namespace and YAML prefixes"); + +const pvData = runJson(["get", "pv", "-o", "json"]); +const pvcData = runJson(["-n", namespace, "get", "pvc", "-o", "json"]); +const podData = runJson(["-n", namespace, "get", "pod", "-o", "json"]); +const pvs = new Map((pvData.items || []).map((pv) => [pv.metadata?.name, pv])); +const activeClaims = new Map(); +for (const pod of podData.items || []) { + const phase = pod.status?.phase; + if (phase === "Succeeded" || phase === "Failed") continue; + for (const volume of pod.spec?.volumes || []) { + const claim = volume.persistentVolumeClaim?.claimName; + if (!claim) continue; + const list = activeClaims.get(claim) || []; + list.push(pod.metadata?.name); + activeClaims.set(claim, list); + } +} + +const candidates = []; +const protectedRows = []; +for (const pvc of pvcData.items || []) { + const name = pvc.metadata?.name || ""; + const matchedPrefix = prefixes.find((prefix) => name.startsWith(prefix)); + if (!matchedPrefix) continue; + const activeMountPods = activeClaims.get(name) || []; + const pv = pvs.get(pvc.spec?.volumeName); + const storageClass = pvc.spec?.storageClassName || pv?.spec?.storageClassName || null; + const reclaimPolicy = pv?.spec?.persistentVolumeReclaimPolicy || null; + const hostPath = pv?.spec?.hostPath?.path || pv?.spec?.local?.path || null; + const row = { + namespace, + pvc: name, + volume: pvc.spec?.volumeName || null, + matchedPrefix, + phase: pvc.status?.phase || null, + pvPhase: pv?.status?.phase || null, + storageClass, + reclaimPolicy, + activeMountCount: activeMountPods.length, + activeMountPods: activeMountPods.slice(0, 5), + estimatedBytes: duBytes(hostPath), + }; + if (activeMountPods.length > 0 || storageClass !== "local-path" || reclaimPolicy !== "Delete") { + protectedRows.push({ ...row, reason: activeMountPods.length > 0 ? "active-mount" : "not-local-path-delete" }); + } else { + candidates.push(row); + } +} + +candidates.sort((a, b) => (b.estimatedBytes || 0) - (a.estimatedBytes || 0)); +const selected = candidates.slice(0, limit); +const result = { + ok: true, + planKind: "agentrun-session-pvc-retention", + namespace, + dryRun: !confirm, + mutation: confirm, + criteria: { prefixes, storageClass: "local-path", reclaimPolicy: "Delete", requireNoActiveMount: true, limit }, + candidatePvcCount: candidates.length, + selectedPvcCount: selected.length, + protectedPvcCount: protectedRows.length, + estimatedReclaimBytes: selected.reduce((sum, item) => sum + (item.estimatedBytes || 0), 0), + selectedPreview: selected.slice(0, 12), + protectedPreview: protectedRows.slice(0, 12), + deletedPvcCount: 0, + valuesPrinted: false, +}; + +if (confirm && selected.length > 0) { + for (let index = 0; index < selected.length; index += 50) { + execFileSync("kubectl", ["-n", namespace, "delete", "pvc", "--wait=false", ...selected.slice(index, index + 50).map((item) => item.pvc)], { encoding: "utf8", maxBuffer: 1024 * 1024 }); + } + result.deletedPvcCount = selected.length; + result.deleteMode = "submit-only-wait-false"; +} + +console.log(JSON.stringify(result)); diff --git a/scripts/src/agentrun/control-plane.ts b/scripts/src/agentrun/control-plane.ts index 7c557ede..fa6ac8f0 100644 --- a/scripts/src/agentrun/control-plane.ts +++ b/scripts/src/agentrun/control-plane.ts @@ -33,7 +33,7 @@ import { } from "../agentrun-manifests"; import { sha256Fingerprint } from "../platform-infra-ops-library"; -import type { CleanupReleasedPvOptions, CleanupRunnersOptions, CleanupRunsOptions, ConfirmOptions, GitMirrorOptions, LaneConfirmOptions, RefreshOptions, SecretSyncOptions, StatusOptions } from "./options"; +import type { CleanupReleasedPvOptions, CleanupRunnersOptions, CleanupRunsOptions, CleanupSessionPvcsOptions, ConfirmOptions, GitMirrorOptions, LaneConfirmOptions, RefreshOptions, SecretSyncOptions, StatusOptions } from "./options"; import { agentRunControlPlaneStatusCommand } from "./public-exposure"; import { applyYamlScript, manifestObjectRef, yamlLaneGitMirrorStatusScript } from "./secrets"; import { compactAgentRunLaneStatusTarget, compactLaneSecretsStatus } from "./trigger"; @@ -193,6 +193,18 @@ export function parseCleanupReleasedPvOptions(args: string[]): CleanupReleasedPv }; } +export function parseCleanupSessionPvcsOptions(args: string[]): CleanupSessionPvcsOptions { + validateOptions(args, new Set(["--confirm", "--dry-run"]), new Set(["--limit", "--timeout-seconds", "--node", "--lane"])); + const base = parseConfirmOptions(args); + return { + ...base, + node: optionValue(args, "--node") ?? null, + lane: optionValue(args, "--lane") ?? null, + limit: positiveIntegerOption(args, "--limit", 100, 1000), + timeoutSeconds: positiveIntegerOption(args, "--timeout-seconds", 180, 900), + }; +} + export function validateOptions(args: string[], booleanOptions: Set, valueOptions: Set): void { for (let index = 0; index < args.length; index += 1) { const arg = args[index]; diff --git a/scripts/src/agentrun/entry.ts b/scripts/src/agentrun/entry.ts index 0ad087d7..4dac3190 100644 --- a/scripts/src/agentrun/entry.ts +++ b/scripts/src/agentrun/entry.ts @@ -34,7 +34,7 @@ import { import { sha256Fingerprint } from "../platform-infra-ops-library"; import type { AgentRunResourceVerb, AgentRunRestCompatGroup } from "./utils"; -import { controlPlaneApply, controlPlanePlan, parseCleanupReleasedPvOptions, parseCleanupRunnersOptions, parseCleanupRunsOptions, parseConfirmOptions, parseGitMirrorOptions, parseLaneConfirmOptions, parseRefreshOptions, parseSecretSyncOptions, status } from "./control-plane"; +import { controlPlaneApply, controlPlanePlan, parseCleanupReleasedPvOptions, parseCleanupRunnersOptions, parseCleanupRunsOptions, parseCleanupSessionPvcsOptions, parseConfirmOptions, parseGitMirrorOptions, parseLaneConfirmOptions, parseRefreshOptions, parseSecretSyncOptions, status } from "./control-plane"; import { gitMirrorStatus } from "./git-mirror"; import { agentRunExplain, isRecord, parseGitMirrorStatusOptions, parseStatusOptions, parseTriggerOptions } from "./options"; import { renderAgentRunControlPlaneActionSummary, renderAgentRunControlPlanePlanSummary, renderAgentRunControlPlaneStatusSummary } from "./public-exposure"; @@ -43,7 +43,7 @@ import { agentRunGetKindHelp, runAgentRunResourceCommand } from "./resource-acti import { runAgentRunRestCompatCommand, runGitMirrorJob, startAsyncAgentRunJob } from "./rest-bridge"; import { exposeAgentRun, restartYamlLane, secretSync, triggerCurrent } from "./trigger"; import { unsupported } from "./utils"; -import { cleanupReleasedPvs, cleanupRunners, cleanupRuns, refresh } from "./yaml-lane"; +import { cleanupReleasedPvs, cleanupRunners, cleanupRuns, cleanupSessionPvcs, refresh } from "./yaml-lane"; export function agentRunHelp(): unknown { return { @@ -143,6 +143,9 @@ export async function runAgentRunCommand(config: UniDeskConfig | null, args: str return options.full || options.raw ? result : renderAgentRunControlPlaneActionSummary(result, "AGENTRUN RUNNER CLEANUP"); } if (action === "cleanup-runs") return await cleanupRuns(config, parseCleanupRunsOptions(actionArgs)); + if (action === "cleanup-session-pvcs") { + return await cleanupSessionPvcs(config, parseCleanupSessionPvcsOptions(actionArgs)); + } if (action === "cleanup-released-pvs") return await cleanupReleasedPvs(config, parseCleanupReleasedPvOptions(actionArgs)); } if (group === "git-mirror") { @@ -271,7 +274,7 @@ export function agentRunHelpText(args: string[]): string { return [ "Usage: bun scripts/cli.ts agentrun control-plane [options]", "", - "Actions: plan, apply, status, secret-sync, expose, trigger-current, refresh, cleanup-runners, cleanup-runs, cleanup-released-pvs", + "Actions: plan, apply, status, secret-sync, expose, trigger-current, refresh, cleanup-runners, cleanup-runs, cleanup-session-pvcs, cleanup-released-pvs", "Examples:", " bun scripts/cli.ts agentrun control-plane plan --node D601 --lane v02", " bun scripts/cli.ts agentrun control-plane apply --node D601 --lane v02 --dry-run", @@ -283,6 +286,7 @@ export function agentRunHelpText(args: string[]): string { " bun scripts/cli.ts agentrun control-plane expose --dry-run", " bun scripts/cli.ts agentrun control-plane trigger-current --dry-run", " bun scripts/cli.ts agentrun control-plane cleanup-runners --node D601 --lane v02 --dry-run", + " bun scripts/cli.ts agentrun control-plane cleanup-session-pvcs --node JD01 --lane jd01-v02 --dry-run", " bun scripts/cli.ts agentrun control-plane cleanup-runs --min-age-minutes 30 --limit 200 --dry-run", ].join("\n"); } diff --git a/scripts/src/agentrun/options.ts b/scripts/src/agentrun/options.ts index 49183de2..195155de 100644 --- a/scripts/src/agentrun/options.ts +++ b/scripts/src/agentrun/options.ts @@ -265,6 +265,13 @@ export interface CleanupReleasedPvOptions extends ConfirmOptions { timeoutSeconds: number; } +export interface CleanupSessionPvcsOptions extends ConfirmOptions { + node: string | null; + lane: string | null; + limit: number; + timeoutSeconds: number; +} + export interface DisclosureOptions { full: boolean; raw: boolean; diff --git a/scripts/src/agentrun/yaml-lane.ts b/scripts/src/agentrun/yaml-lane.ts index 7b3725c1..c4931184 100644 --- a/scripts/src/agentrun/yaml-lane.ts +++ b/scripts/src/agentrun/yaml-lane.ts @@ -35,7 +35,7 @@ import { } from "../agentrun-manifests"; import { sha256Fingerprint } from "../platform-infra-ops-library"; -import type { CleanupReleasedPvOptions, CleanupRunnersOptions, CleanupRunsOptions, RefreshOptions } from "./options"; +import type { CleanupReleasedPvOptions, CleanupRunnersOptions, CleanupRunsOptions, CleanupSessionPvcsOptions, RefreshOptions } from "./options"; import { cleanupReleasedPvsFinalizeNodeScript, cleanupReleasedPvsPlanNodeScript, cleanupRunnersFinalizeNodeScript, cleanupRunsFinalizeNodeScript, cleanupRunsPlanNodeScript, refreshYamlLaneScript } from "./git-mirror"; import { cleanupRunnersFactsNodeScript, cleanupRunnersPlanNodeScript, collectLaneSecretSources, createYamlLaneJobScript, yamlLaneGitopsPublishJobManifest, yamlLaneGitopsPublishPayloadFromProbe, yamlLaneJobProbeScript } from "./secrets"; import { capture, captureJsonPayload, compactCapture, progressEvent, shQuote, sleep, stringOrNull } from "./utils"; @@ -204,6 +204,55 @@ export async function cleanupReleasedPvs(config: UniDeskConfig, options: Cleanup }; } +export async function cleanupSessionPvcs(config: UniDeskConfig, options: CleanupSessionPvcsOptions): Promise> { + const { configPath, spec } = resolveAgentRunLaneTarget(options); + const result = await capture(config, spec.nodeKubeRoute, ["sh", "--", cleanupSessionPvcsScript(options, spec)]); + const payload = captureJsonPayload(result); + const ok = result.exitCode === 0 && payload.ok !== false; + const base = { + ...payload, + ok, + command: "agentrun control-plane cleanup-session-pvcs", + configPath, + target: { node: spec.nodeId, lane: spec.lane, namespace: spec.runtime.namespace }, + mode: options.dryRun || !options.confirm ? "dry-run" : "confirmed-cleanup", + namespace: spec.runtime.namespace, + retention: spec.deployment.runner.retention.sessionPvcRetention, + probe: result.exitCode === 0 ? undefined : compactCapture(result, { full: true, stdoutTailChars: 3000, stderrTailChars: 3000 }), + }; + if (options.dryRun || !options.confirm) { + return { ...base, dryRun: true, mutation: false, next: { confirm: `bun scripts/cli.ts agentrun control-plane cleanup-session-pvcs --node ${spec.nodeId} --lane ${spec.lane} --limit ${options.limit} --confirm` } }; + } + return { + ...base, + dryRun: false, + mutation: true, + followUp: { + dryRun: `bun scripts/cli.ts agentrun control-plane cleanup-session-pvcs --node ${spec.nodeId} --lane ${spec.lane} --limit ${options.limit} --dry-run`, + diskPressure: `bun scripts/cli.ts gc remote ${spec.nodeId} status --limit 20`, + }, + }; +} + +export function cleanupSessionPvcsScript(options: CleanupSessionPvcsOptions, spec: AgentRunLaneSpec): string { + const retention = spec.deployment.runner.retention.sessionPvcRetention; + const script = readFileSync(rootPath("scripts/src/agentrun/cleanup-session-pvcs.mjs"), "utf8"); + return [ + "set -eu", + `namespace=${shQuote(spec.runtime.namespace)}`, + `confirm=${options.confirm && !options.dryRun ? "true" : "false"}`, + `limit=${String(Math.min(options.limit, retention.maxDeletePerRun))}`, + `enabled=${retention.enabled ? "true" : "false"}`, + `prefixes_json_b64=${shQuote(Buffer.from(JSON.stringify(retention.prefixes), "utf8").toString("base64"))}`, + "tmp_dir=$(mktemp -d)", + "trap 'rm -rf \"$tmp_dir\"' EXIT", + "cat > \"$tmp_dir/cleanup-session-pvcs.mjs\" <<'NODE'", + script, + "NODE", + "env NAMESPACE=\"$namespace\" CONFIRM=\"$confirm\" LIMIT=\"$limit\" ENABLED=\"$enabled\" PREFIXES_JSON_B64=\"$prefixes_json_b64\" node \"$tmp_dir/cleanup-session-pvcs.mjs\"", + ].join("\n"); +} + export function cleanupRunnersScript(options: CleanupRunnersOptions, spec: AgentRunLaneSpec): string { const retention = spec.deployment.runner.retention; const matchLabelsB64 = Buffer.from(JSON.stringify(retention.selectors.matchLabels), "utf8").toString("base64"); diff --git a/scripts/src/gc-remote-containerd.py b/scripts/src/gc-remote-containerd.py new file mode 100644 index 00000000..add56792 --- /dev/null +++ b/scripts/src/gc-remote-containerd.py @@ -0,0 +1,386 @@ +def k3s_crictl_base(): + endpoint = str(CONTAINERD_CONFIG.get("runtimeEndpoint") or "unix:///run/k3s/containerd/containerd.sock") + return ["crictl", "--runtime-endpoint", endpoint] + +def shell_single_quote(value): + return "'" + str(value).replace("'", "'\"'\"'") + "'" + +def k3s_crictl_json(args, timeout=30): + result = command(k3s_crictl_base() + args + ["-o", "json"], timeout) + if result["exitCode"] != 0: + return None, result + try: + return json.loads(result["stdout"] or "{}"), result + except Exception: + return None, result + +def ci_activity_snapshot_for_prune(): + namespaces = config_list(CONTAINERD_CONFIG, "ciNamespaces", ["hwlab-ci", "agentrun-ci"]) + active = [] + commands = [] + for namespace in namespaces: + result = command(["sh", "-lc", "KUBECONFIG=/etc/rancher/k3s/k3s.yaml kubectl get pipelinerun,taskrun,job -n %s --no-headers 2>/dev/null | awk '$2 != \"True\" && $2 != \"False\" && $2 != \"Complete\" && $2 != \"Failed\" {print}' | head -20" % shell_single_quote(namespace)], 15) + commands.append({"namespace": namespace, "command": bounded(result)}) + for line in (result.get("stdout") or "").splitlines(): + if line.strip(): + active.append({"namespace": namespace, "line": line.strip()}) + return {"ok": True, "activeCount": len(active), "activePreview": active[:20], "commands": commands} + +def compact_ci_activity(activity): + return { + "ok": activity.get("ok"), + "activeCount": activity.get("activeCount"), + "activePreview": activity.get("activePreview") or [], + } + +def compact_image_ref(ref): + ref = str(ref or "") + return ref if len(ref) <= 120 else ref[:117] + "..." + +def k3s_cri_image_rows(): + images, image_cmd = k3s_crictl_json(["images"], 45) + containers, container_cmd = k3s_crictl_json(["ps", "-a"], 30) + if images is None: + return None, {"ok": False, "reason": "crictl-images-failed", "command": bounded(image_cmd)} + if containers is None: + return None, {"ok": False, "reason": "crictl-ps-failed", "command": bounded(container_cmd)} + used = set() + for container in containers.get("containers") or []: + for key in ["imageRef", "image", "imageId"]: + value = container.get(key) + if isinstance(value, str) and value: + used.add(value) + image = container.get("image") or {} + if isinstance(image, dict): + for key in ["image", "annotations", "userSpecifiedImage"]: + value = image.get(key) + if isinstance(value, str) and value: + used.add(value) + rows = [] + for image in images.get("images") or []: + refs = [] + for key in ["repoTags", "repoDigests"]: + value = image.get(key) + if isinstance(value, list): + refs.extend([str(item) for item in value if item]) + image_id = str(image.get("id") or "") + pinned = bool(image.get("pinned")) + size = safe_int(image.get("size_") or image.get("size") or 0) + in_use = pinned or image_id in used or any(ref in used for ref in refs) + rows.append({"id": image_id, "refs": refs, "sizeBytes": size, "inUse": in_use, "pinned": pinned}) + return rows, {"ok": True, "imageCommand": bounded(image_cmd), "containerCommand": bounded(container_cmd)} + +def k3s_image_cache_candidate(): + if not config_bool(CONTAINERD_CONFIG, "enabled", False): + return { + "id": "k3s-cri-image-prune:disabled", + "kind": "k3s-cri-image-prune-disabled", + "risk": "blocked", + "description": "K3s CRI image prune is disabled in YAML", + "estimatedReclaimBytes": 0, + "configSource": "config/unidesk-cli.yaml#gc.remote.targets.%s.containerdImageCache.enabled" % PROVIDER_ID, + } + activity = ci_activity_snapshot_for_prune() + if int(activity.get("activeCount") or 0) > 0: + return { + "id": "k3s-cri-image-prune:ci-active", + "kind": "k3s-cri-image-prune-blocked", + "risk": "blocked", + "description": "K3s CRI image prune is blocked while CI workloads are active", + "estimatedReclaimBytes": 0, + "ciActivity": compact_ci_activity(activity), + } + rows, meta = k3s_cri_image_rows() + if rows is None: + return { + "id": "k3s-cri-image-prune:unavailable", + "kind": "k3s-cri-image-prune-unavailable", + "risk": "blocked", + "description": "K3s CRI image list is unavailable", + "estimatedReclaimBytes": 0, + "diagnostic": meta, + } + unused = [row for row in rows if not row.get("inUse")] + estimated = sum(safe_int(row.get("sizeBytes")) for row in unused) + if estimated <= 0: + return None + return { + "id": "k3s-cri-image-prune:unused", + "kind": "k3s-cri-image-prune", + "risk": "medium", + "description": "Prune unused k3s CRI images through crictl rmi --prune; no containerd paths are deleted directly", + "sizeBytes": estimated, + "estimatedReclaimBytes": estimated, + "imageCount": len(rows), + "unusedImageCount": len(unused), + "unusedPreview": [{"id": row.get("id"), "refs": [compact_image_ref(ref) for ref in (row.get("refs") or [])[:2]], "sizeBytes": row.get("sizeBytes")} for row in unused[:3]], + "ciActivity": compact_ci_activity(activity), + "action": {"command": k3s_crictl_base() + ["rmi", "--prune"], "mode": "cri-unused-images-only"}, + } + +def execute_k3s_image_cache_prune(): + activity = ci_activity_snapshot_for_prune() + if int(activity.get("activeCount") or 0) > 0: + raise RuntimeError("refusing k3s image prune while CI workloads are active") + before = du_size("/var/lib/rancher/k3s/agent/containerd", 45) or 0 + result = command(k3s_crictl_base() + ["rmi", "--prune"], 300) + if result["exitCode"] != 0: + raise RuntimeError((result["stderr"] or result["stdout"] or "crictl rmi --prune failed").strip()) + after = du_size("/var/lib/rancher/k3s/agent/containerd", 45) or 0 + return {"reclaimedBytes": max(0, before - after), "commandOutput": bounded(result), "ciActivity": compact_ci_activity(activity)} + +def host_ctr_base(namespace=None): + address = config_str(HOST_CONTAINERD_CONFIG, "address", "") + args = ["ctr"] + if address: + args.extend(["--address", address]) + if namespace: + args.extend(["-n", namespace]) + return args + +def host_ctr(args, timeout=30, namespace=None): + return command(host_ctr_base(namespace) + args, timeout) + +def host_containerd_namespaces(): + configured = config_list(HOST_CONTAINERD_CONFIG, "namespaces", []) + if configured: + return configured, {"source": "yaml", "command": None} + result = host_ctr(["namespaces", "list", "-q"], 20) + if result["exitCode"] != 0: + return [], {"source": "ctr", "command": bounded(result), "error": "ctr-namespaces-failed"} + return [line.strip() for line in (result.get("stdout") or "").splitlines() if line.strip()], {"source": "ctr", "command": bounded(result)} + +def host_containerd_activity(): + if not config_bool(HOST_CONTAINERD_CONFIG, "enabled", False): + return {"ok": False, "reason": "host-containerd-cache-disabled", "activeCount": 0} + root = config_str(HOST_CONTAINERD_CONFIG, "root", "") + if not root or not os.path.isdir(root): + return {"ok": False, "reason": "host-containerd-root-unavailable", "root": root, "activeCount": 0} + namespaces, namespace_meta = host_containerd_namespaces() + active = [] + commands = [] + for namespace in namespaces: + task_result = host_ctr(["tasks", "list", "-q"], 20, namespace) + container_result = host_ctr(["containers", "list", "-q"], 20, namespace) + image_result = host_ctr(["images", "list", "-q"], 20, namespace) + lease_result = host_ctr(["leases", "list", "-q"], 20, namespace) + snapshot_result = host_ctr(["snapshots", "list"], 20, namespace) + content_result = host_ctr(["content", "list"], 20, namespace) + snapshot_lines = table_data_lines(snapshot_result.get("stdout") or "", "KEY") + content_lines = table_data_lines(content_result.get("stdout") or "", "DIGEST") + commands.append({ + "namespace": namespace, + "tasks": bounded(task_result), + "containers": bounded(container_result), + "images": bounded(image_result), + "leases": bounded(lease_result), + "snapshots": bounded(snapshot_result), + "content": bounded(content_result), + }) + for kind, result in [("task", task_result), ("container", container_result), ("lease", lease_result)]: + if result["exitCode"] != 0: + active.append({"namespace": namespace, "kind": kind, "state": "unknown", "reason": "ctr-list-failed"}) + continue + for line in (result.get("stdout") or "").splitlines(): + if line.strip(): + active.append({"namespace": namespace, "kind": kind, "name": line.strip()}) + if snapshot_result["exitCode"] != 0: + active.append({"namespace": namespace, "kind": "snapshot", "state": "unknown", "reason": "ctr-list-failed"}) + for line in snapshot_lines: + active.append({"namespace": namespace, "kind": "snapshot", "name": line.split()[0] if line.split() else line}) + if content_result["exitCode"] != 0: + active.append({"namespace": namespace, "kind": "content", "state": "unknown", "reason": "ctr-list-failed"}) + for line in content_lines: + active.append({"namespace": namespace, "kind": "content", "name": line.split()[0] if line.split() else line}) + return { + "ok": True, + "root": root, + "namespaces": namespaces, + "namespaceMeta": namespace_meta, + "activeCount": len(active), + "activePreview": active[:20], + "commands": commands, + } + +def compact_host_containerd_activity(activity): + return { + "ok": activity.get("ok"), + "reason": activity.get("reason"), + "root": activity.get("root"), + "namespaces": activity.get("namespaces"), + "activeCount": activity.get("activeCount"), + "activePreview": activity.get("activePreview") or [], + } + +def table_data_lines(stdout, header_prefix): + lines = [line.strip() for line in str(stdout or "").splitlines() if line.strip()] + return [line for line in lines if not line.startswith(header_prefix)] + +def host_containerd_orphan_config(): + value = HOST_CONTAINERD_CONFIG.get("orphanCleanup") if isinstance(HOST_CONTAINERD_CONFIG, dict) else None + return value if isinstance(value, dict) else {} + +def direct_child_paths(root, predicate): + if not root or not os.path.isdir(root) or os.path.islink(root): + return [] + rows = [] + for name in sorted(os.listdir(root)): + path = os.path.realpath(os.path.abspath(os.path.join(root, name))) + if os.path.dirname(path) != os.path.realpath(os.path.abspath(root)): + continue + if not predicate(name, path): + continue + rows.append({"name": name, "path": path, "estimatedReclaimBytes": du_size(path, 10) or path_size(path)}) + return rows + +def host_containerd_orphan_rows(activity): + cfg = host_containerd_orphan_config() + if not config_bool(cfg, "enabled", False): + return [], {"ok": False, "reason": "host-containerd-orphan-cleanup-disabled"} + if not activity.get("ok") or int(activity.get("activeCount") or 0) > 0: + return [], {"ok": False, "reason": "host-containerd-metadata-not-empty", "activity": compact_host_containerd_activity(activity)} + overlay_root = os.path.realpath(os.path.abspath(config_str(cfg, "overlaySnapshotsRoot", ""))) + content_root = os.path.realpath(os.path.abspath(config_str(cfg, "contentBlobRoot", ""))) + root = os.path.realpath(os.path.abspath(config_str(HOST_CONTAINERD_CONFIG, "root", ""))) + if not root or not overlay_root.startswith(root.rstrip("/") + "/") or not content_root.startswith(root.rstrip("/") + "/"): + return [], {"ok": False, "reason": "host-containerd-orphan-root-outside-containerd-root", "root": root} + open_roots = [] + for candidate_root in [overlay_root, content_root]: + if os.path.exists(candidate_root) and path_has_open_fd(candidate_root): + open_roots.append(candidate_root) + if open_roots: + return [], {"ok": False, "reason": "host-containerd-orphan-root-open-fd", "openRoots": open_roots} + overlay_rows = direct_child_paths(overlay_root, lambda name, path: os.path.isdir(path) and not os.path.islink(path) and re.match(r"^[0-9]+$", name) is not None) + content_rows = direct_child_paths(content_root, lambda name, path: os.path.isfile(path) and not os.path.islink(path) and re.match(r"^[0-9a-f]{64}$", name) is not None) + safe_rows = [] + for kind, rows in [("overlay-snapshot-dir", overlay_rows), ("content-blob-file", content_rows)]: + for row in rows: + safe_rows.append({**row, "kind": kind}) + safe_rows.sort(key=lambda item: safe_int(item.get("estimatedReclaimBytes")), reverse=True) + return safe_rows, { + "ok": True, + "root": root, + "overlaySnapshotsRoot": overlay_root, + "contentBlobRoot": content_root, + "overlayCandidateCount": len(overlay_rows), + "contentCandidateCount": len(content_rows), + "protectedCount": 0, + "protectedPreview": [], + } + +def host_containerd_orphan_candidate(activity): + rows, meta = host_containerd_orphan_rows(activity) + if not meta.get("ok"): + return None + limit = int(OPTIONS.get("limit") or 50) + selected = rows[:limit] + estimated = sum(safe_int(row.get("estimatedReclaimBytes")) for row in selected) + if estimated <= 0: + return None + return { + "id": "host-containerd-orphan-state:delete", + "kind": "host-containerd-orphan-state-delete", + "risk": "medium", + "description": "Delete YAML-allowlisted host containerd orphan snapshot/content files only when ctr metadata has no tasks, containers, leases, images, snapshots or content", + "path": meta.get("root"), + "sizeBytes": estimated, + "estimatedReclaimBytes": estimated, + "orphanCount": len(rows), + "selectedOrphanCount": len(selected), + "overlayCandidateCount": meta.get("overlayCandidateCount"), + "contentCandidateCount": meta.get("contentCandidateCount"), + "protectedCount": meta.get("protectedCount"), + "selectedPreview": [{"kind": row.get("kind"), "name": row.get("name"), "estimatedReclaimBytes": row.get("estimatedReclaimBytes")} for row in selected[:8]], + "protectedPreview": meta.get("protectedPreview"), + "action": {"op": "remove-yaml-allowlisted-host-containerd-orphans", "limit": limit}, + } + +def host_containerd_cache_candidate(): + activity = host_containerd_activity() + if not activity.get("ok"): + return { + "id": "host-containerd-cache:unavailable", + "kind": "host-containerd-cache-unavailable", + "risk": "blocked", + "description": "Host containerd cache cleanup is disabled or unavailable by YAML", + "estimatedReclaimBytes": 0, + "diagnostic": compact_host_containerd_activity(activity), + } + if int(activity.get("activeCount") or 0) > 0: + return { + "id": "host-containerd-cache:active", + "kind": "host-containerd-cache-blocked", + "risk": "blocked", + "description": "Host containerd cache prune is blocked while host containerd tasks or containers exist", + "estimatedReclaimBytes": 0, + "activity": compact_host_containerd_activity(activity), + } + orphan = host_containerd_orphan_candidate(activity) + if orphan: + return orphan + root = activity.get("root") or "" + size = du_size(root, 45) or 0 + if size <= 0: + return None + return { + "id": "host-containerd-cache:prune-unused", + "kind": "host-containerd-cache-prune", + "risk": "medium", + "description": "Prune host containerd images in YAML-selected namespaces only when no host containerd tasks or containers exist", + "path": root, + "sizeBytes": size, + "estimatedReclaimBytes": size, + "activity": compact_host_containerd_activity(activity), + "action": {"command": "ctr images prune --all per namespace", "mode": "host-containerd-unused-images-only"}, + } + +def execute_host_containerd_cache_prune(): + activity = host_containerd_activity() + if not activity.get("ok"): + raise RuntimeError("host containerd cache cleanup unavailable: %s" % activity.get("reason")) + if int(activity.get("activeCount") or 0) > 0: + raise RuntimeError("refusing host containerd prune while tasks or containers exist") + root = activity.get("root") or "" + before = du_size(root, 45) or 0 + results = [] + for namespace in activity.get("namespaces") or []: + result = host_ctr(["images", "prune", "--all"], 300, namespace) + results.append({"namespace": namespace, "imagesPrune": bounded(result)}) + if result["exitCode"] != 0: + raise RuntimeError("host containerd image prune failed in namespace %s: %s" % (namespace, (result.get("stderr") or result.get("stdout") or "").strip())) + after = du_size(root, 45) or 0 + return { + "reclaimedBytes": max(0, before - after), + "activity": compact_host_containerd_activity(activity), + "commandResults": results[:8], + } + +def execute_host_containerd_orphan_cleanup(): + activity = host_containerd_activity() + rows, meta = host_containerd_orphan_rows(activity) + if not meta.get("ok"): + raise RuntimeError("host containerd orphan cleanup unavailable: %s" % meta.get("reason")) + for root_path in [meta.get("overlaySnapshotsRoot"), meta.get("contentBlobRoot")]: + if root_path and os.path.exists(root_path) and path_has_open_fd(root_path): + raise RuntimeError("refusing host containerd orphan cleanup with open fd/cwd under root: %s" % root_path) + limit = int(OPTIONS.get("limit") or 50) + selected = rows[:limit] + reclaimed = 0 + deleted = [] + for row in selected: + path = row.get("path") + before = du_size(path, 10) or path_size(path) + if row.get("kind") == "overlay-snapshot-dir": + shutil.rmtree(path, ignore_errors=True) + elif row.get("kind") == "content-blob-file": + os.unlink(path) + else: + raise RuntimeError("unsupported host containerd orphan kind: %s" % row.get("kind")) + reclaimed += before + deleted.append({"kind": row.get("kind"), "name": row.get("name"), "reclaimedBytes": before}) + return { + "reclaimedBytes": reclaimed, + "deletedOrphanCount": len(deleted), + "deletedPreview": deleted[:12], + "root": meta.get("root"), + } diff --git a/scripts/src/gc-remote-growth.py b/scripts/src/gc-remote-growth.py new file mode 100644 index 00000000..0b0ebb68 --- /dev/null +++ b/scripts/src/gc-remote-growth.py @@ -0,0 +1,292 @@ +def registry_growth_snapshot(): + summary = { + "path": REGISTRY_ROOT, + "sizeBytes": du_size(REGISTRY_ROOT, 60) or 0, + } + summary["sizeHuman"] = fmt_bytes(summary["sizeBytes"]) + if OPTIONS.get("hwlabRegistry", False): + plan = plan_registry_retention() + retention = dict(plan.get("summary") or {}) + for key in ["registrySizeBytes", "estimatedReclaimBytes"]: + if key in retention: + retention[key.replace("Bytes", "Human")] = fmt_bytes(retention.get(key) or 0) + summary["retentionPlan"] = retention + else: + summary["retentionPlan"] = { + "skipped": True, + "reason": "rerun snapshot with --include-hwlab-registry to compute tag/revision retention counters", + } + summary["cadence"] = { + "dryRun": "daily or before/after every v0.2 CI/CD burst", + "maintenanceRun": "weekly, or when root >=80%, or when registry growth exceeds the agreed daily threshold", + "planCommand": "bun scripts/cli.ts gc remote %s plan --target-use-percent 70 --include-hwlab-registry --limit 50" % PROVIDER_ID, + "snapshotCommand": "bun scripts/cli.ts gc remote %s snapshot --include-hwlab-registry --history-limit 12" % PROVIDER_ID, + "runCommand": "bun scripts/cli.ts gc remote %s run --confirm --include-hwlab-registry --target-use-percent 70 --limit 50" % PROVIDER_ID, + "defaultRetention": { + "keepPerRepo": int(OPTIONS.get("registryKeepPerRepo") or 20), + "minAgeHours": float(OPTIONS.get("registryMinAgeHours") or 48), + "protects": ["current workload refs", "digest closure", "protected tags", "recent tags", "newest N tags per repo"], + }, + } + return summary + +def growth_watermark_policy(root_disk): + use_percent = root_disk.get("usePercent") if isinstance(root_disk, dict) else None + if use_percent is None: + state = "unknown" + action = "collect-snapshot" + elif use_percent < 75: + state = "healthy" + action = "observe-trend" + elif use_percent < 80: + state = "watch" + action = "run-dry-run-plan" + elif use_percent < 85: + state = "maintenance" + action = "schedule-owner-aware-retention" + else: + state = "emergency" + action = "restore-runtime-then-file-evidence" + return { + "state": state, + "recommendedAction": action, + "watermarks": [ + {"range": "<75%", "action": "trend only"}, + {"range": "75%-80%", "action": "run dry-run plan and identify source"}, + {"range": "80%-85%", "action": "small owner-aware retention run"}, + {"range": ">=85%", "action": "runtime recovery first, then root-cause growth source"}, + ], + "growthThresholdPolicy": "If bytes/day remains high for consecutive snapshots, act before 80%; exact threshold should be set from the first week of saved snapshots.", + } + +def snapshot_metric_map(snapshot): + metrics = {} + root = snapshot.get("rootDisk") or {} + if isinstance(root, dict) and root.get("usedBytes") is not None: + metrics["root.usedBytes"] = {"value": safe_int(root.get("usedBytes")), "unit": "bytes", "label": "root used bytes"} + for item in snapshot.get("sources") or []: + if not isinstance(item, dict) or item.get("sizeBytes") is None: + continue + key = "source.%s.sizeBytes" % item.get("id") + metrics[key] = {"value": safe_int(item.get("sizeBytes")), "unit": "bytes", "label": item.get("label") or item.get("id")} + storage = ((snapshot.get("ciStorage") or {}).get("byOwnerGroup") or {}) + if not storage: + storage = ((snapshot.get("pvcAttribution") or {}).get("byOwnerGroup") or {}) + for owner, value in storage.items(): + metrics["ciStorage.%s.estimatedBytes" % owner] = {"value": safe_int((value or {}).get("estimatedBytes")), "unit": "bytes", "label": "CI storage %s" % owner} + memory = snapshot.get("memoryPressure") or {} + memory_summary = memory.get("summary") or {} + if memory_summary.get("matchedRssBytes") is not None: + metrics["memoryPressure.matchedRssBytes"] = {"value": safe_int(memory_summary.get("matchedRssBytes")), "unit": "bytes", "label": "matched observer/chrome RSS"} + if memory_summary.get("observeStateBytes") is not None: + metrics["memoryPressure.observeStateBytes"] = {"value": safe_int(memory_summary.get("observeStateBytes")), "unit": "bytes", "label": "web observe state bytes"} + for key in ["matchedProcessCount", "activeObserverSignals", "staleObserverSignals"]: + if memory_summary.get(key) is not None: + metrics["memoryPressure.%s" % key] = {"value": safe_int(memory_summary.get(key)), "unit": "count", "label": "memory pressure %s" % key} + registry = snapshot.get("registry") or {} + retention = registry.get("retentionPlan") or {} + for key in ["totalTags", "totalRevisions", "deleteTags", "deleteRevisions", "estimatedReclaimBytes"]: + if key in retention and retention.get(key) is not None: + unit = "bytes" if key.endswith("Bytes") else "count" + metrics["registry.%s" % key] = {"value": safe_int(retention.get(key)), "unit": unit, "label": "registry %s" % key} + return metrics + +def delta_metric_rows(before, after): + before_metrics = snapshot_metric_map(before) + after_metrics = snapshot_metric_map(after) + before_ts = iso_to_epoch(before.get("observedAt")) + after_ts = iso_to_epoch(after.get("observedAt")) + seconds = (after_ts - before_ts) if before_ts is not None and after_ts is not None else None + rows = [] + for key in sorted(set(before_metrics.keys()) | set(after_metrics.keys())): + old = before_metrics.get(key, {"value": 0, "unit": (after_metrics.get(key) or {}).get("unit"), "label": key}) + new = after_metrics.get(key, {"value": 0, "unit": old.get("unit"), "label": old.get("label")}) + delta = safe_int(new.get("value")) - safe_int(old.get("value")) + row = { + "key": key, + "label": new.get("label") or old.get("label") or key, + "unit": new.get("unit") or old.get("unit"), + "before": old.get("value"), + "after": new.get("value"), + "delta": delta, + } + if row["unit"] == "bytes": + row["beforeHuman"] = fmt_bytes(row["before"] or 0) + row["afterHuman"] = fmt_bytes(row["after"] or 0) + row["deltaHuman"] = ("-" if delta < 0 else "") + fmt_bytes(abs(delta)) + if seconds and seconds > 0: + per_day = int(delta * 86400 / seconds) + row["perDayBytes"] = per_day + row["perDayHuman"] = ("-" if per_day < 0 else "") + fmt_bytes(abs(per_day)) + "/day" + rows.append(row) + rows.sort(key=lambda item: safe_int(item.get("delta")), reverse=True) + return {"durationSeconds": seconds, "metrics": rows} + +def growth_trend_payload(points): + points = [point for point in points if isinstance(point, dict)] + if len(points) < 2: + return { + "pointCount": len(points), + "state": "insufficient-history", + "message": "Run snapshot at least twice to compute deltas.", + } + latest_delta = delta_metric_rows(points[-2], points[-1]) + window_delta = delta_metric_rows(points[0], points[-1]) + def rate_warning(delta): + seconds = delta.get("durationSeconds") + if seconds is not None and seconds < 3600: + return { + "code": "short-window-rate-noisy", + "message": "Per-day rates from windows shorter than 1 hour are directional only; use daily snapshots for governance decisions.", + "durationSeconds": seconds, + } + return None + return { + "pointCount": len(points), + "oldestAt": points[0].get("observedAt"), + "latestAt": points[-1].get("observedAt"), + "latestDelta": { + "durationSeconds": latest_delta.get("durationSeconds"), + "rateWarning": rate_warning(latest_delta), + "topGrowingBytes": [row for row in latest_delta.get("metrics", []) if row.get("unit") == "bytes" and safe_int(row.get("delta")) > 0][:10], + "topShrinkingBytes": [row for row in reversed(latest_delta.get("metrics", [])) if row.get("unit") == "bytes" and safe_int(row.get("delta")) < 0][:10], + "registryCounters": [row for row in latest_delta.get("metrics", []) if str(row.get("key", "")).startswith("registry.") and row.get("unit") == "count"], + }, + "windowDelta": { + "durationSeconds": window_delta.get("durationSeconds"), + "rateWarning": rate_warning(window_delta), + "topGrowingBytes": [row for row in window_delta.get("metrics", []) if row.get("unit") == "bytes" and safe_int(row.get("delta")) > 0][:10], + "topShrinkingBytes": [row for row in reversed(window_delta.get("metrics", [])) if row.get("unit") == "bytes" and safe_int(row.get("delta")) < 0][:10], + "registryCounters": [row for row in window_delta.get("metrics", []) if str(row.get("key", "")).startswith("registry.") and row.get("unit") == "count"], + }, + } + +def compact_metric_rows(rows, limit=3): + compact = [] + for row in (rows or [])[:limit]: + compact.append({ + "key": row.get("key"), + "label": row.get("label"), + "unit": row.get("unit"), + "delta": row.get("delta"), + "deltaHuman": row.get("deltaHuman"), + "perDayHuman": row.get("perDayHuman"), + }) + return compact + +def compact_trend_payload(payload): + if payload.get("state") == "insufficient-history": + return payload + latest = payload.get("latestDelta") or {} + window = payload.get("windowDelta") or {} + return { + "pointCount": payload.get("pointCount"), + "oldestAt": payload.get("oldestAt"), + "latestAt": payload.get("latestAt"), + "latestDelta": { + "durationSeconds": latest.get("durationSeconds"), + "rateWarning": latest.get("rateWarning"), + "topGrowingBytes": compact_metric_rows(latest.get("topGrowingBytes") or [], 1), + "topShrinkingBytes": compact_metric_rows(latest.get("topShrinkingBytes") or [], 1), + "registryCounters": compact_metric_rows(latest.get("registryCounters") or [], 1), + }, + "windowDelta": { + "durationSeconds": window.get("durationSeconds"), + "rateWarning": window.get("rateWarning"), + "topGrowingBytes": compact_metric_rows(window.get("topGrowingBytes") or [], 1), + "topShrinkingBytes": compact_metric_rows(window.get("topShrinkingBytes") or [], 1), + "registryCounters": compact_metric_rows(window.get("registryCounters") or [], 1), + }, + "fullDisclosure": "rerun trend --full for all metric rows", + } + +def compact_growth_point(item): + registry = item.get("registry") or {} + retention = registry.get("retentionPlan") or {} + ci_storage = item.get("ciStorage") or {} + containerd = item.get("containerd") or {} + memory = item.get("memoryPressure") or {} + memory_summary = memory.get("summary") or {} + observe = (memory.get("webObserve") or {}) + return { + "observedAt": item.get("observedAt"), + "rootDisk": item.get("rootDisk"), + "sourceCount": len(item.get("sources") or []), + "registry": { + "sizeBytes": registry.get("sizeBytes"), + "sizeHuman": registry.get("sizeHuman"), + "totalTags": retention.get("totalTags"), + "totalRevisions": retention.get("totalRevisions"), + "deleteTags": retention.get("deleteTags"), + "deleteRevisions": retention.get("deleteRevisions"), + "estimatedReclaimBytes": retention.get("estimatedReclaimBytes"), + "estimatedReclaimHuman": retention.get("estimatedReclaimHuman"), + }, + "ciStorage": { + "pvcCount": ci_storage.get("pvcCount"), + "estimatedBytes": ci_storage.get("estimatedBytes"), + "estimatedHuman": ci_storage.get("estimatedHuman"), + "byOwnerGroup": ci_storage.get("byOwnerGroup"), + }, + "containerd": { + "state": containerd.get("state"), + "cleanupSupported": containerd.get("cleanupSupported"), + }, + "memoryPressure": { + "matchedProcessCount": memory_summary.get("matchedProcessCount"), + "matchedRssBytes": memory_summary.get("matchedRssBytes"), + "matchedRssHuman": memory_summary.get("matchedRssHuman"), + "activeObserverSignals": memory_summary.get("activeObserverSignals"), + "staleObserverSignals": memory_summary.get("staleObserverSignals"), + "observeStateBytes": memory_summary.get("observeStateBytes"), + "observeStateHuman": memory_summary.get("observeStateHuman"), + "webObserveRootCount": observe.get("rootCount"), + }, + } + +def collect_growth_snapshot(observed_at, preflight): + root_disk = df_snapshot() + sources = disk_source_snapshot() + ci_storage = ci_storage_snapshot() + memory_pressure = collect_memory_pressure() + compact_pvc = compact_pvc_attribution(ci_storage) + if bool(OPTIONS.get("full")): + public_pvc = ci_storage + public_memory = memory_pressure + else: + public_pvc = compact_ci_storage_summary(ci_storage) + public_memory = compact_memory_summary(memory_pressure) + registry = registry_growth_snapshot() + containerd = containerd_breakdown_snapshot() + commands = { + "snapshot": "bun scripts/cli.ts gc remote %s snapshot --include-hwlab-registry --history-limit %s" % (PROVIDER_ID, int(OPTIONS.get("historyLimit") or 12)), + "trend": "bun scripts/cli.ts gc remote %s trend --history-limit %s" % (PROVIDER_ID, int(OPTIONS.get("historyLimit") or 12)), + "registryPlan": "bun scripts/cli.ts gc remote %s plan --target-use-percent 70 --include-hwlab-registry --limit 50" % PROVIDER_ID, + "hwlabCiRetention": ((ci_storage.get("handoff") or {}).get("hwlab") or {}).get("dryRun"), + "agentrunRetention": ((ci_storage.get("handoff") or {}).get("agentrun") or {}).get("dryRun"), + "remotePolicy": "bun scripts/cli.ts gc remote %s policy plan" % PROVIDER_ID, + } + if not bool(OPTIONS.get("full")): + commands = { + "trend": "bun scripts/cli.ts gc remote %s trend --history-limit %s" % (PROVIDER_ID, int(OPTIONS.get("historyLimit") or 12)), + "status": "bun scripts/cli.ts gc remote %s status --limit %s" % (PROVIDER_ID, int(OPTIONS.get("limit") or 50)), + "full": "bun scripts/cli.ts gc remote %s snapshot --full --no-save" % PROVIDER_ID, + } + return { + "ok": True, + "action": "gc remote snapshot", + "providerId": PROVIDER_ID, + "dryRun": True, + "mutation": False, + "diagnosticStateMutation": bool(OPTIONS.get("saveSnapshot", True)), + "observedAt": observed_at, + "rootDisk": root_disk, + "clusterPreflight": preflight, + "sources": sources, + "registry": registry, + "pvcAttribution": public_pvc, + "memoryPressure": public_memory, + "containerd": containerd, + "policy": growth_watermark_policy(root_disk or {}), + "commands": commands, + } diff --git a/scripts/src/gc-remote-pvc.py b/scripts/src/gc-remote-pvc.py new file mode 100644 index 00000000..95f5726a --- /dev/null +++ b/scripts/src/gc-remote-pvc.py @@ -0,0 +1,383 @@ +def pv_host_path(pv): + spec = (pv or {}).get("spec") or {} + host_path = (spec.get("hostPath") or {}).get("path") + if isinstance(host_path, str) and host_path: + return host_path + local_path = (spec.get("local") or {}).get("path") + if isinstance(local_path, str) and local_path: + return local_path + return None + +def pvc_owner_group(namespace, owner): + owner = str(owner or "") + if namespace == "agentrun-ci": + return "agentrun" + if namespace == "hwlab-ci": + if owner.startswith("agentrun-"): + return "agentrun" + return "hwlab" + if namespace.startswith("hwlab-"): + return "hwlab-runtime" + return "other" + +def parse_k8s_quantity(value): + if value is None: + return None + raw = str(value).strip() + match = re.match(r"^([0-9]+(?:\.[0-9]+)?)(Ki|Mi|Gi|Ti|K|M|G|T)?$", raw) + if not match: + return None + multiplier = { + None: 1, + "K": 1000, + "M": 1000**2, + "G": 1000**3, + "T": 1000**4, + "Ki": 1024, + "Mi": 1024**2, + "Gi": 1024**3, + "Ti": 1024**4, + }.get(match.group(2), 1) + return int(float(match.group(1)) * multiplier) + +def metadata_owner(meta): + refs = meta.get("ownerReferences") or [] + if refs: + first = refs[0] or {} + return first.get("kind"), first.get("name"), [{"kind": item.get("kind"), "name": item.get("name")} for item in refs[:5]] + labels = meta.get("labels") or {} + annotations = meta.get("annotations") or {} + for key in [ + "tekton.dev/pipelineRun", + "tekton.dev/taskRun", + "agentrun.unidesk/run-id", + "hwlab.unidesk/run-id", + "app.kubernetes.io/instance", + ]: + value = labels.get(key) or annotations.get(key) + if value: + return "Label", value, [] + return None, None, [] + +def ci_storage_snapshot(): + namespaces = set(config_list(PVC_CONFIG, "namespaces", ["hwlab-ci", "agentrun-ci"])) + candidate_namespaces = set(config_list(PVC_CONFIG, "candidateNamespaces", [])) + hwlab_node = config_str(PVC_CONFIG, "hwlabNode", PROVIDER_ID) + hwlab_lane = config_str(PVC_CONFIG, "hwlabLane", "v03") + agentrun_node = config_str(PVC_CONFIG, "agentrunNode", PROVIDER_ID) + agentrun_lane = config_str(PVC_CONFIG, "agentrunLane", "v02") + limit = config_int(PVC_CONFIG, "limit", int(OPTIONS.get("limit") or 50), minimum=1, maximum=5000) + pv_data = kubectl_json(["get", "pv"], 30) or {} + pvc_data = kubectl_json(["get", "pvc", "-A"], 30) or {} + pod_data = kubectl_json(["get", "pod", "-A"], 30) or {} + pvs = {} + for pv in pv_data.get("items") or []: + meta = pv.get("metadata") or {} + name = meta.get("name") + if name: + pvs[name] = pv + mounts = {} + for pod in pod_data.get("items") or []: + meta = pod.get("metadata") or {} + ns = str(meta.get("namespace") or "") + pod_name = str(meta.get("name") or "") + phase = str(((pod.get("status") or {}).get("phase")) or "") + if phase in set(["Succeeded", "Failed"]): + continue + spec = pod.get("spec") or {} + for vol in spec.get("volumes") or []: + claim = (vol.get("persistentVolumeClaim") or {}).get("claimName") + if claim: + mounts.setdefault((ns, claim), []).append(pod_name) + rows = [] + for pvc in pvc_data.get("items") or []: + meta = pvc.get("metadata") or {} + spec = pvc.get("spec") or {} + status = pvc.get("status") or {} + ns = str(meta.get("namespace") or "") + name = str(meta.get("name") or "") + if ns not in namespaces: + continue + volume = str(spec.get("volumeName") or "") + pv = pvs.get(volume) or {} + pv_spec = pv.get("spec") or {} + pv_meta = pv.get("metadata") or {} + owner_kind, owner_name, owner_refs = metadata_owner(meta) + requested = parse_k8s_quantity((((spec.get("resources") or {}).get("requests") or {}).get("storage"))) + host_path = pv_host_path(pv) + active = sorted(mounts.get((ns, name), [])) + estimated = du_size(host_path, 8) if host_path else None + candidate_reasons = [] + if not active: + candidate_reasons.append("no-active-mount-observed") + if status.get("phase") != "Bound": + candidate_reasons.append("pvc-not-bound") + if (pv.get("status") or {}).get("phase") == "Released": + candidate_reasons.append("pv-released") + review_candidate = ns in candidate_namespaces and len(candidate_reasons) > 0 + rows.append({ + "namespace": ns, + "pvc": name, + "volume": volume or None, + "phase": status.get("phase"), + "pvPhase": (pv.get("status") or {}).get("phase"), + "ownerKind": owner_kind, + "owner": owner_name, + "ownerRefs": owner_refs, + "ownerGroup": pvc_owner_group(ns, owner_name), + "storageClass": spec.get("storageClassName") or pv_spec.get("storageClassName"), + "reclaimPolicy": pv_spec.get("persistentVolumeReclaimPolicy"), + "requestedBytes": requested, + "requestedHuman": fmt_bytes(requested or 0), + "hostPath": host_path, + "pvCreatedAt": (pv_meta.get("creationTimestamp") if isinstance(pv_meta, dict) else None), + "pvcCreatedAt": meta.get("creationTimestamp"), + "activeMountPods": active, + "estimatedBytes": estimated, + "estimatedHuman": fmt_bytes(estimated or 0), + "reviewCandidate": review_candidate, + "reviewReasons": candidate_reasons, + "dryRunOnly": True, + }) + rows.sort(key=lambda item: safe_int(item.get("estimatedBytes")), reverse=True) + by_namespace = {} + by_owner_group = {} + for row in rows: + for bucket, key in [(by_namespace, row.get("namespace") or "unknown"), (by_owner_group, row.get("ownerGroup") or "unknown")]: + current = bucket.setdefault(key, {"count": 0, "estimatedBytes": 0, "activeMountCount": 0}) + current["count"] += 1 + current["estimatedBytes"] += safe_int(row.get("estimatedBytes")) + current["activeMountCount"] += len(row.get("activeMountPods") or []) + current["estimatedHuman"] = fmt_bytes(current["estimatedBytes"]) + review_candidates = [row for row in rows if row.get("reviewCandidate")] + return { + "scope": "YAML-configured PVC namespaces", + "configSource": "config/unidesk-cli.yaml#gc.remote.targets.%s.pvcAttribution" % PROVIDER_ID, + "namespaces": sorted(namespaces), + "candidateNamespaces": sorted(candidate_namespaces), + "pvcCount": len(rows), + "reviewCandidateCount": len(review_candidates), + "estimatedBytes": sum(safe_int(row.get("estimatedBytes")) for row in rows), + "estimatedHuman": fmt_bytes(sum(safe_int(row.get("estimatedBytes")) for row in rows)), + "requestedBytes": sum(safe_int(row.get("requestedBytes")) for row in rows), + "requestedHuman": fmt_bytes(sum(safe_int(row.get("requestedBytes")) for row in rows)), + "byNamespace": by_namespace, + "byOwnerGroup": by_owner_group, + "topPvcs": rows[:limit], + "reviewCandidates": review_candidates[:limit], + "handoff": { + "hwlab": { + "dryRun": "bun scripts/cli.ts hwlab nodes control-plane cleanup-runs --node %s --lane %s --min-age-minutes 30 --limit 200 --dry-run" % (hwlab_node, hwlab_lane), + "releasedPvs": "bun scripts/cli.ts hwlab nodes control-plane cleanup-released-pvs --node %s --lane %s --limit 200 --dry-run" % (hwlab_node, hwlab_lane), + }, + "agentrun": { + "dryRun": "bun scripts/cli.ts agentrun control-plane cleanup-runs --node %s --lane %s --min-age-minutes 30 --limit 200 --dry-run" % (agentrun_node, agentrun_lane), + "releasedPvs": "bun scripts/cli.ts agentrun control-plane cleanup-released-pvs --node %s --lane %s --limit 200 --dry-run" % (agentrun_node, agentrun_lane), + }, + }, + "policy": "analysis-only; remote GC never deletes PVC/PV/local-path data and only hands off to owner-aware retention commands", + } + +def compact_pvc_row(row): + return { + "namespace": row.get("namespace"), + "pvc": row.get("pvc"), + "phase": row.get("phase"), + "pvPhase": row.get("pvPhase"), + "ownerKind": row.get("ownerKind"), + "owner": row.get("owner"), + "ownerGroup": row.get("ownerGroup"), + "estimatedBytes": row.get("estimatedBytes"), + "estimatedHuman": row.get("estimatedHuman"), + "activeMountCount": len(row.get("activeMountPods") or []), + "reviewCandidate": row.get("reviewCandidate"), + "reviewReasons": row.get("reviewReasons"), + } + +def compact_pvc_attribution(payload): + if bool(OPTIONS.get("full")): + return payload + top = payload.get("topPvcs") or [] + review = payload.get("reviewCandidates") or [] + compact_top = [compact_pvc_row(row) for row in top[:8] if isinstance(row, dict)] + return { + "configSource": payload.get("configSource"), + "candidateNamespaces": payload.get("candidateNamespaces"), + "pvcCount": payload.get("pvcCount"), + "reviewCandidateCount": payload.get("reviewCandidateCount"), + "estimatedBytes": payload.get("estimatedBytes"), + "estimatedHuman": payload.get("estimatedHuman"), + "byNamespace": payload.get("byNamespace"), + "byOwnerGroup": payload.get("byOwnerGroup"), + "topPvcs": compact_top, + "reviewCandidates": [compact_pvc_row(row) for row in review[:2] if isinstance(row, dict)], + "handoff": payload.get("handoff"), + "compacted": True, + "fullDisclosure": "rerun with --full for hostPath, creation timestamps and complete row details", + } + +def compact_ci_storage_summary(payload): + return { + "scope": payload.get("scope"), + "configSource": payload.get("configSource"), + "pvcCount": payload.get("pvcCount"), + "reviewCandidateCount": payload.get("reviewCandidateCount"), + "estimatedBytes": payload.get("estimatedBytes"), + "estimatedHuman": payload.get("estimatedHuman"), + "requestedBytes": payload.get("requestedBytes"), + "requestedHuman": payload.get("requestedHuman"), + "compacted": True, + "fullDisclosure": "use pvcAttribution or --full for row-level details", + } + +def local_path_storage_root(): + root = config_str(LOCAL_PATH_CONFIG, "root", "") + if not root: + return "" + return os.path.realpath(os.path.abspath(root)) + +def local_path_orphan_prefixes(): + return config_list(LOCAL_PATH_CONFIG, "orphanDirPrefixes", []) + +def is_direct_local_path_child(root, path): + resolved = os.path.realpath(os.path.abspath(path)) + return os.path.dirname(resolved) == root and resolved.startswith(root.rstrip("/") + "/") + +def local_path_referenced_paths(root): + pv_data = kubectl_json(["get", "pv"], 30) or {} + referenced = set() + for pv in pv_data.get("items") or []: + host_path = pv_host_path(pv) + if not host_path: + continue + resolved = os.path.realpath(os.path.abspath(host_path)) + if resolved == root or resolved.startswith(root.rstrip("/") + "/"): + referenced.add(resolved) + return referenced + +def assert_local_path_orphan(path, referenced=None): + root = local_path_storage_root() + if not root: + raise RuntimeError("localPathStorage.root is not configured") + prefixes = local_path_orphan_prefixes() + resolved = os.path.realpath(os.path.abspath(path)) + name = os.path.basename(resolved) + if not is_direct_local_path_child(root, resolved): + raise RuntimeError("refusing to remove local-path orphan outside configured direct storage root: %s" % path) + if os.path.islink(path) or not os.path.isdir(resolved): + raise RuntimeError("refusing to remove non-directory or symlink local-path orphan: %s" % path) + if not prefixes or not any(name.startswith(prefix) for prefix in prefixes): + raise RuntimeError("refusing to remove local-path orphan outside YAML prefix allowlist: %s" % path) + refs = referenced if referenced is not None else local_path_referenced_paths(root) + for ref in refs: + if resolved == ref or ref.startswith(resolved.rstrip("/") + "/") or resolved.startswith(ref.rstrip("/") + "/"): + raise RuntimeError("refusing to remove local-path path still referenced by PV: %s" % path) + if path_has_open_fd(resolved): + raise RuntimeError("refusing to remove local-path orphan with open fd/cwd reference: %s" % path) + return resolved + +def local_path_orphan_rows(): + if not config_bool(LOCAL_PATH_CONFIG, "enabled", False): + return [], {"ok": False, "reason": "local-path-orphan-cleanup-disabled"} + root = local_path_storage_root() + prefixes = local_path_orphan_prefixes() + if not root or not os.path.isdir(root) or os.path.islink(root): + return [], {"ok": False, "reason": "local-path-root-unavailable", "root": root} + if not prefixes: + return [], {"ok": False, "reason": "local-path-prefix-allowlist-empty", "root": root} + referenced = local_path_referenced_paths(root) + min_age_minutes = config_float(LOCAL_PATH_CONFIG, "orphanMinAgeMinutes", 0.0, minimum=0.0) + cutoff = time.time() - min_age_minutes * 60.0 + rows = [] + protected = [] + for name in sorted(os.listdir(root)): + path = os.path.join(root, name) + resolved = os.path.realpath(os.path.abspath(path)) + try: + stat = os.lstat(path) + except OSError: + continue + if not os.path.isdir(path) or os.path.islink(path) or not any(name.startswith(prefix) for prefix in prefixes): + continue + row = {"path": resolved, "name": name, "sizeBytes": 0, "estimatedReclaimBytes": 0} + if not is_direct_local_path_child(root, resolved): + protected.append({**row, "reason": "not-direct-child"}) + continue + if stat.st_mtime >= cutoff: + protected.append({**row, "reason": "younger-than-min-age"}) + continue + referenced_by = [ref for ref in referenced if resolved == ref or ref.startswith(resolved.rstrip("/") + "/") or resolved.startswith(ref.rstrip("/") + "/")] + if referenced_by: + protected.append({**row, "reason": "pv-referenced", "referencedCount": len(referenced_by)}) + continue + if path_has_open_fd(resolved): + protected.append({**row, "reason": "open-fd"}) + continue + size = du_size(resolved, 10) or path_size(resolved) + rows.append({**row, "sizeBytes": size, "estimatedReclaimBytes": size}) + rows.sort(key=lambda item: safe_int(item.get("estimatedReclaimBytes")), reverse=True) + return rows, { + "ok": True, + "root": root, + "prefixes": prefixes, + "referencedPathCount": len(referenced), + "protectedCount": len(protected), + "protectedPreview": protected[:8], + "minAgeMinutes": min_age_minutes, + } + +def local_path_orphan_candidate(): + rows, meta = local_path_orphan_rows() + if not meta.get("ok"): + return { + "id": "k3s-local-path-orphans:unavailable", + "kind": "k3s-local-path-orphans-unavailable", + "risk": "blocked", + "description": "K3s local-path orphan cleanup is unavailable or disabled by YAML", + "estimatedReclaimBytes": 0, + "diagnostic": meta, + } + limit = int(OPTIONS.get("limit") or 50) + selected = rows[:limit] + estimated = sum(safe_int(row.get("estimatedReclaimBytes")) for row in selected) + if estimated <= 0: + return None + return { + "id": "k3s-local-path-orphans:delete", + "kind": "k3s-local-path-orphans-delete", + "risk": "medium", + "description": "Delete YAML-allowlisted k3s local-path storage directories that no PV references and no process has open", + "path": meta.get("root"), + "sizeBytes": estimated, + "estimatedReclaimBytes": estimated, + "orphanCount": len(rows), + "selectedOrphanCount": len(selected), + "protectedCount": meta.get("protectedCount"), + "referencedPathCount": meta.get("referencedPathCount"), + "selectedPreview": [{"name": row.get("name"), "path": row.get("path"), "estimatedReclaimBytes": row.get("estimatedReclaimBytes")} for row in selected[:8]], + "protectedPreview": meta.get("protectedPreview"), + "action": {"op": "rm-recursive", "allowlist": "yaml-local-path-orphan", "root": meta.get("root"), "limit": limit}, + } + +def execute_local_path_orphan_cleanup(): + rows, meta = local_path_orphan_rows() + if not meta.get("ok"): + raise RuntimeError("local-path orphan cleanup unavailable: %s" % meta.get("reason")) + limit = int(OPTIONS.get("limit") or 50) + selected = rows[:limit] + referenced = local_path_referenced_paths(local_path_storage_root()) + reclaimed = 0 + deleted = [] + for row in selected: + path = assert_local_path_orphan(row.get("path"), referenced) + before = du_size(path, 10) or path_size(path) + shutil.rmtree(path, ignore_errors=True) + reclaimed += before + deleted.append({"name": row.get("name"), "path": path, "reclaimedBytes": before}) + return { + "reclaimedBytes": reclaimed, + "deletedOrphanCount": len(deleted), + "deletedPreview": deleted[:12], + "root": meta.get("root"), + "protectedCount": meta.get("protectedCount"), + } diff --git a/scripts/src/gc-remote-registry.py b/scripts/src/gc-remote-registry.py new file mode 100644 index 00000000..918b2a49 --- /dev/null +++ b/scripts/src/gc-remote-registry.py @@ -0,0 +1,677 @@ +def active_hwlab_ci_writes(): + result = command(["sh", "-lc", "KUBECONFIG=/etc/rancher/k3s/k3s.yaml kubectl get pipelinerun,taskrun -n hwlab-ci --no-headers 2>/dev/null | awk '$2 != \"True\" && $2 != \"False\" {print}' | head -40"], 15) + lines = [line for line in (result.get("stdout") or "").splitlines() if line.strip()] + return {"ok": result["exitCode"] == 0, "activeCount": len(lines), "activePreview": lines, "command": bounded(result)} + +def active_hwlab_ci_jobs(): + result = command(["sh", "-lc", "KUBECONFIG=/etc/rancher/k3s/k3s.yaml kubectl get jobs -n hwlab-ci --no-headers 2>/dev/null | awk '$2 != \"Complete\" && $2 != \"Failed\" {print}' | head -40"], 15) + lines = [line for line in (result.get("stdout") or "").splitlines() if line.strip()] + return {"ok": result["exitCode"] == 0, "activeCount": len(lines), "activePreview": lines, "command": bounded(result)} + +def wait_no_active_hwlab_ci(timeout=180): + deadline = time.time() + timeout + last = None + while time.time() < deadline: + writes = active_hwlab_ci_writes() + jobs = active_hwlab_ci_jobs() + last = {"writes": writes, "jobs": jobs} + if writes.get("ok") and jobs.get("ok") and int(writes.get("activeCount") or 0) == 0 and int(jobs.get("activeCount") or 0) == 0: + return {"ok": True, "last": last} + time.sleep(5) + return {"ok": False, "last": last} + +def kubectl_json(args, timeout=20): + result = command(["env", "KUBECONFIG=/etc/rancher/k3s/k3s.yaml", "kubectl"] + args + ["-o", "json"], timeout) + if result["exitCode"] != 0: + return None + try: + return json.loads(result["stdout"] or "{}") + except Exception: + return None + +def kctl(args, timeout=30): + return command(["env", "KUBECONFIG=/etc/rancher/k3s/k3s.yaml", "kubectl"] + args, timeout) + +def workload_image_refs(): + result = command(["sh", "-lc", "KUBECONFIG=/etc/rancher/k3s/k3s.yaml kubectl get deploy,sts,ds,pod -A -o jsonpath='{range .items[*]}{range .spec.containers[*]}{.image}{\"\\n\"}{end}{range .spec.initContainers[*]}{.image}{\"\\n\"}{end}{range .spec.template.spec.containers[*]}{.image}{\"\\n\"}{end}{range .spec.template.spec.initContainers[*]}{.image}{\"\\n\"}{end}{end}' 2>/dev/null | sort -u"], 30) + refs = set() + digests = set() + for image in (result.get("stdout") or "").splitlines(): + image = image.strip() + if not image.startswith("127.0.0.1:5000/"): + continue + ref = image.split("127.0.0.1:5000/", 1)[1] + if "@sha256:" in ref: + repo, digest = ref.split("@", 1) + refs.add((repo, "@" + digest)) + digests.add("sha256:" + digest.split(":", 1)[1]) + elif ":" in ref: + repo, tag = ref.rsplit(":", 1) + refs.add((repo, tag)) + return refs, digests, bounded(result) + +def registry_request(method, path, headers=None, timeout=20): + url = "http://127.0.0.1:5000" + path + req = urllib.request.Request(url, method=method, headers=headers or {}) + with urllib.request.urlopen(req, timeout=timeout) as response: + body = response.read() + return {"status": response.status, "headers": dict(response.headers), "body": body.decode("utf-8", errors="replace")} + +def registry_tag_rows(): + rows = [] + root = REGISTRY_REPOSITORY_ROOT + if not os.path.isdir(root): + return rows + for repo_root, dirs, files in os.walk(root): + if os.path.basename(repo_root) != "tags": + continue + rel = os.path.relpath(repo_root, root) + suffix = "/_manifests/tags" + if not rel.endswith(suffix): + continue + repo = rel[:-len(suffix)] + try: + tags = os.listdir(repo_root) + except OSError: + continue + for tag in sorted(tags): + link = os.path.join(repo_root, tag, "current", "link") + if not os.path.isfile(link): + continue + try: + with open(link, "r", encoding="utf-8") as handle: + digest = handle.read().strip() + stat = os.stat(link) + except OSError: + continue + rows.append({ + "repo": repo, + "tag": tag, + "digest": digest, + "mtime": stat.st_mtime, + "mtimeIso": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime(stat.st_mtime)), + "path": os.path.join(repo_root, tag), + }) + return rows + +def registry_revision_rows(): + rows = [] + root = REGISTRY_REPOSITORY_ROOT + if not os.path.isdir(root): + return rows + for repo_root, dirs, files in os.walk(root): + if os.path.basename(repo_root) != "sha256": + continue + rel = os.path.relpath(repo_root, root) + suffix = "/_manifests/revisions/sha256" + if not rel.endswith(suffix): + continue + repo = rel[:-len(suffix)] + try: + revisions = os.listdir(repo_root) + except OSError: + continue + for digest_hex in sorted(revisions): + path = os.path.join(repo_root, digest_hex) + link = os.path.join(path, "link") + if not os.path.isfile(link): + continue + try: + with open(link, "r", encoding="utf-8") as handle: + digest = handle.read().strip() + stat = os.stat(link) + except OSError: + continue + rows.append({ + "repo": repo, + "digest": digest, + "mtime": stat.st_mtime, + "mtimeIso": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime(stat.st_mtime)), + "path": path, + }) + return rows + +def registry_retention_repo(repo): + return repo.startswith("hwlab/hwlab-") or repo.startswith("hwlab/cache/hwlab-") + +def registry_digest_hex(digest): + if not isinstance(digest, str) or not digest.startswith("sha256:"): + return None + value = digest.split(":", 1)[1] + if re.match(r"^[0-9a-f]{64}$", value) is None: + return None + return value + +def registry_blob_data_path(digest): + value = registry_digest_hex(digest) + if value is None: + return None + return os.path.join(REGISTRY_ROOT, "docker/registry/v2/blobs/sha256", value[:2], value, "data") + +_manifest_cache = {} +def registry_manifest_json(digest): + if digest in _manifest_cache: + return _manifest_cache[digest] + path = registry_blob_data_path(digest) + if path is None or not os.path.isfile(path): + _manifest_cache[digest] = None + return None + try: + with open(path, "rb") as handle: + data = handle.read(8 * 1024 * 1024) + value = json.loads(data.decode("utf-8")) + except Exception: + value = None + _manifest_cache[digest] = value + return value + +def registry_manifest_refs(digest): + manifest = registry_manifest_json(digest) + if not isinstance(manifest, dict): + return set() + refs = set() + config = manifest.get("config") or {} + config_digest = config.get("digest") + if isinstance(config_digest, str) and registry_digest_hex(config_digest) is not None: + refs.add(config_digest) + for item in manifest.get("layers") or []: + item_digest = (item or {}).get("digest") + if isinstance(item_digest, str) and registry_digest_hex(item_digest) is not None: + refs.add(item_digest) + for item in manifest.get("manifests") or []: + item_digest = (item or {}).get("digest") + if isinstance(item_digest, str) and registry_digest_hex(item_digest) is not None: + refs.add(item_digest) + return refs + +def registry_digest_closure(seed): + seen = set() + stack = list(seed) + while stack: + digest = stack.pop() + if digest in seen or registry_digest_hex(digest) is None: + continue + seen.add(digest) + for child in registry_manifest_refs(digest): + if child not in seen: + stack.append(child) + return seen + +def registry_blob_size(digest): + path = registry_blob_data_path(digest) + if path is None or not os.path.isfile(path): + return 0 + try: + return int(os.lstat(path).st_blocks) * 512 + except OSError: + return 0 + +def estimate_registry_reclaim(delete_manifest_digests, kept_manifest_digests): + deleted = registry_digest_closure(delete_manifest_digests) + kept = registry_digest_closure(kept_manifest_digests) + reclaim = deleted - kept + return sum(registry_blob_size(digest) for digest in reclaim) + +def plan_registry_retention(): + keep_per_repo = int(OPTIONS.get("registryKeepPerRepo") if OPTIONS.get("registryKeepPerRepo") is not None else 5) + min_age_hours = float(OPTIONS.get("registryMinAgeHours") if OPTIONS.get("registryMinAgeHours") is not None else 48) + cutoff = time.time() - min_age_hours * 3600 + refs, digests, refs_command = workload_image_refs() + rows = registry_tag_rows() + revision_rows = registry_revision_rows() + by_repo = {} + for row in rows: + by_repo.setdefault(row["repo"], []).append(row) + keep = set() + keep_reasons = {} + for repo, items in by_repo.items(): + items.sort(key=lambda item: item["mtime"], reverse=True) + for row in items[:keep_per_repo]: + key = (row["repo"], row["tag"]) + keep.add(key) + keep_reasons[key] = "latest-per-repo" + for row in items: + key = (row["repo"], row["tag"]) + if row["tag"] in REGISTRY_PROTECTED_TAGS: + keep.add(key) + keep_reasons[key] = "protected-tag" + if key in refs: + keep.add(key) + keep_reasons[key] = "workload-tag-ref" + if row["digest"] in digests: + keep.add(key) + keep_reasons[key] = "workload-digest-ref" + if row["repo"].startswith("hwlab/cache/"): + keep.add(key) + keep_reasons[key] = "cache-repo" + if row["mtime"] >= cutoff: + keep.add(key) + keep_reasons[key] = "recent-tag" + delete_rows = [] + kept_count = 0 + delete_by_repo = {} + keep_by_repo = {} + kept_digests = set() + for row in rows: + key = (row["repo"], row["tag"]) + should_delete = ( + key not in keep + and row["repo"].startswith("hwlab/hwlab-") + and re.match(r"^[0-9a-f]{7,40}$", row["tag"]) is not None + ) + if should_delete: + delete_rows.append(row) + delete_by_repo[row["repo"]] = delete_by_repo.get(row["repo"], 0) + 1 + else: + kept_count += 1 + kept_digests.add(row["digest"]) + keep_by_repo[row["repo"]] = keep_by_repo.get(row["repo"], 0) + 1 + protected_digests = kept_digests | digests + protected_digests.update(row["digest"] for row in revision_rows if not registry_retention_repo(row["repo"])) + protected_digests = registry_digest_closure(protected_digests) + delete_revision_rows = [] + revision_delete_by_repo = {} + for row in revision_rows: + if not registry_retention_repo(row["repo"]): + continue + if row["digest"] in protected_digests: + continue + delete_revision_rows.append(row) + revision_delete_by_repo[row["repo"]] = revision_delete_by_repo.get(row["repo"], 0) + 1 + kept_revision_digests = set(row["digest"] for row in revision_rows if row not in delete_revision_rows) + delete_revision_digests = set(row["digest"] for row in delete_revision_rows) + deletable_manifests = {} + for row in delete_rows: + if row["digest"] in kept_digests: + continue + deletable_manifests.setdefault(row["repo"], set()).add(row["digest"]) + for row in delete_revision_rows: + deletable_manifests.setdefault(row["repo"], set()).add(row["digest"]) + deletable_manifest_count = sum(len(items) for items in deletable_manifests.values()) + registry_size = du_size(REGISTRY_ROOT, 30) or 0 + estimate = estimate_registry_reclaim(delete_revision_digests, kept_revision_digests) + return { + "tagRows": rows, + "revisionRows": revision_rows, + "deleteRows": delete_rows, + "deleteRevisionRows": delete_revision_rows, + "summary": { + "totalTags": len(rows), + "totalRevisions": len(revision_rows), + "repoCount": len(by_repo), + "keepPerRepo": keep_per_repo, + "minAgeHours": min_age_hours, + "protectedWorkloadRefs": len(refs), + "protectedDigestRefs": len(digests), + "protectedDigestClosure": len(protected_digests), + "keptTags": kept_count, + "deleteTags": len(delete_rows), + "deleteManifests": deletable_manifest_count, + "deleteRevisions": len(delete_revision_rows), + "deleteByRepo": delete_by_repo, + "revisionDeleteByRepo": revision_delete_by_repo, + "keepByRepo": keep_by_repo, + "registrySizeBytes": registry_size, + "estimatedReclaimBytes": estimate, + }, + "deleteManifestsByRepo": {repo: sorted(list(digests)) for repo, digests in deletable_manifests.items()}, + "refsCommand": refs_command, + } + +def registry_deployment_preflight(): + dep = kubectl_json(["-n", "hwlab-ci", "get", "deploy", "hwlab-registry"], 20) + if not dep: + return {"ok": False, "reason": "registry-deployment-missing"} + spec = ((dep.get("spec") or {}).get("template") or {}).get("spec") or {} + containers = spec.get("containers") or [] + volumes = spec.get("volumes") or [] + registry_container = next((item for item in containers if item.get("name") == "registry"), containers[0] if containers else {}) + mounts = registry_container.get("volumeMounts") or [] + has_host_path = any(((vol.get("hostPath") or {}).get("path") == REGISTRY_ROOT and vol.get("name") == "storage") for vol in volumes) + has_mount = any((mount.get("name") == "storage" and mount.get("mountPath") == "/var/lib/registry") for mount in mounts) + image = str(registry_container.get("image") or "") + ok = bool(has_host_path and has_mount and image.startswith("registry:") and spec.get("hostNetwork") is True) + return { + "ok": ok, + "reason": "ok" if ok else "unexpected-registry-deployment-shape", + "image": image, + "hostNetwork": spec.get("hostNetwork"), + "hasExpectedHostPath": has_host_path, + "hasExpectedMount": has_mount, + "replicas": (dep.get("spec") or {}).get("replicas"), + "readyReplicas": (dep.get("status") or {}).get("readyReplicas"), + } + +def cronjob_suspend_states(names): + states = {} + for name in names: + data = kubectl_json(["-n", "hwlab-ci", "get", "cronjob", name], 15) + if data: + states[name] = bool(((data.get("spec") or {}).get("suspend")) is True) + return states + +def patch_cronjob_suspend(name, suspend): + payload = json.dumps({"spec": {"suspend": bool(suspend)}}) + return kctl(["-n", "hwlab-ci", "patch", "cronjob", name, "--type=merge", "-p", payload], 30) + +def wait_registry_pod_count(target, timeout=90): + deadline = time.time() + timeout + last = None + while time.time() < deadline: + result = kctl(["-n", "hwlab-ci", "get", "pods", "-l", "app.kubernetes.io/name=hwlab-registry", "--no-headers"], 20) + last = bounded(result) + lines = [line for line in (result.get("stdout") or "").splitlines() if line.strip()] + active = [] + for line in lines: + parts = line.split() + status = parts[2] if len(parts) >= 3 else "" + if status in set(["Completed", "Error", "Failed", "Succeeded"]): + continue + active.append(line) + if len(active) == target: + return {"ok": True, "lines": active, "allLines": lines, "last": last} + time.sleep(2) + return {"ok": False, "lines": [], "last": last} + +def wait_pod_terminal(name, timeout=900): + deadline = time.time() + timeout + last = None + while time.time() < deadline: + data = kubectl_json(["-n", "hwlab-ci", "get", "pod", name], 20) + if data: + phase = ((data.get("status") or {}).get("phase")) or "" + last = {"phase": phase} + if phase == "Succeeded": + return {"ok": True, "phase": phase} + if phase == "Failed": + return {"ok": False, "phase": phase} + time.sleep(3) + return {"ok": False, "phase": "Timeout", "last": last} + +def execute_registry_retention(): + if PROVIDER_ID.upper() != "G14": + raise RuntimeError("HWLAB registry retention is only supported on G14") + deployment = registry_deployment_preflight() + if not deployment.get("ok"): + raise RuntimeError("registry deployment preflight failed: %s" % deployment.get("reason")) + plan = plan_registry_retention() + delete_rows = plan.get("deleteRows") or [] + delete_revision_rows = plan.get("deleteRevisionRows") or [] + delete_manifests = plan.get("deleteManifestsByRepo") or {} + if not delete_rows and not delete_revision_rows: + return {"reclaimedBytes": 0, "commandOutput": {"message": "no registry tags or revisions matched conservative retention", "registryPlan": plan.get("summary")}} + if not delete_manifests: + return {"reclaimedBytes": 0, "commandOutput": {"message": "matched manifests are still referenced by retained manifests; registry GC would not reclaim blobs", "registryPlan": plan.get("summary")}} + cronjobs = ["hwlab-g14-branch-poller", "hwlab-v02-branch-poller"] + original_crons = cronjob_suspend_states(cronjobs) + before = du_size(REGISTRY_ROOT, 60) or 0 + gc_name = "hwlab-registry-gc-%s" % int(time.time()) + steps = [] + try: + for name in original_crons: + result = patch_cronjob_suspend(name, True) + steps.append({"step": "suspend-cronjob", "name": name, "result": bounded(result)}) + if result["exitCode"] != 0: + raise RuntimeError("failed to suspend cronjob %s" % name) + idle_after_suspend = wait_no_active_hwlab_ci(180) + steps.append({"step": "idle-after-suspend", "result": idle_after_suspend}) + if not idle_after_suspend.get("ok"): + raise RuntimeError("refusing registry maintenance because hwlab-ci did not become idle after suspend") + + deleted_manifests = [] + for repo, digests in delete_manifests.items(): + encoded_repo = "/".join(urllib.parse.quote(part, safe="") for part in repo.split("/")) + for digest in digests: + try: + result = registry_request("DELETE", "/v2/%s/manifests/%s" % (encoded_repo, urllib.parse.quote(digest, safe=":")), {"Accept": "application/vnd.docker.distribution.manifest.v2+json, application/vnd.oci.image.manifest.v1+json"}) + deleted_manifests.append({"repo": repo, "digest": digest, "status": result.get("status")}) + except urllib.error.HTTPError as exc: + if exc.code == 404: + deleted_manifests.append({"repo": repo, "digest": digest, "status": 404}) + else: + raise + steps.append({"step": "registry-api-delete-manifests", "count": len(deleted_manifests), "preview": deleted_manifests[:20]}) + + scale_down = kctl(["-n", "hwlab-ci", "scale", "deploy", "hwlab-registry", "--replicas=0"], 60) + steps.append({"step": "scale-registry-down", "result": bounded(scale_down)}) + if scale_down["exitCode"] != 0: + raise RuntimeError("failed to scale registry down") + waited_down = wait_registry_pod_count(0, 120) + steps.append({"step": "wait-registry-down", "result": waited_down}) + if not waited_down.get("ok"): + raise RuntimeError("registry pod did not scale down") + + deleted = [] + for row in delete_rows: + path = os.path.abspath(str(row.get("path") or "")) + if not path.startswith(REGISTRY_REPOSITORY_ROOT + "/") or "/_manifests/tags/" not in path: + raise RuntimeError("refusing unexpected registry tag path: %s" % path) + if not re.match(r"^[0-9a-f]{7,40}$", str(row.get("tag") or "")): + raise RuntimeError("refusing unexpected registry tag name: %s" % row.get("tag")) + if os.path.isdir(path) and not os.path.islink(path): + shutil.rmtree(path) + deleted.append({"repo": row.get("repo"), "tag": row.get("tag"), "digest": row.get("digest")}) + steps.append({"step": "delete-tag-directories", "count": len(deleted)}) + + deleted_revisions = [] + for row in delete_revision_rows: + path = os.path.abspath(str(row.get("path") or "")) + digest_hex = registry_digest_hex(str(row.get("digest") or "")) + if digest_hex is None: + raise RuntimeError("refusing unexpected registry revision digest: %s" % row.get("digest")) + if not path.startswith(REGISTRY_REPOSITORY_ROOT + "/") or "/_manifests/revisions/sha256/" not in path: + raise RuntimeError("refusing unexpected registry revision path: %s" % path) + if os.path.basename(path) != digest_hex: + raise RuntimeError("refusing registry revision path/digest mismatch: %s" % path) + if os.path.isdir(path) and not os.path.islink(path): + shutil.rmtree(path) + deleted_revisions.append({"repo": row.get("repo"), "digest": row.get("digest")}) + steps.append({"step": "delete-revision-directories", "count": len(deleted_revisions)}) + + overrides = { + "apiVersion": "v1", + "spec": { + "restartPolicy": "Never", + "containers": [{ + "name": "registry-gc", + "image": "registry:2.8.3", + "command": ["registry", "garbage-collect", "/etc/docker/registry/config.yml"], + "volumeMounts": [{"name": "storage", "mountPath": "/var/lib/registry"}], + }], + "volumes": [{"name": "storage", "hostPath": {"path": REGISTRY_ROOT, "type": "DirectoryOrCreate"}}], + }, + } + run_gc = kctl(["-n", "hwlab-ci", "run", gc_name, "--restart=Never", "--image=registry:2.8.3", "--overrides=%s" % json.dumps(overrides)], 60) + steps.append({"step": "start-registry-gc-pod", "result": bounded(run_gc), "pod": gc_name}) + if run_gc["exitCode"] != 0: + raise RuntimeError("failed to start registry GC pod") + waited_gc = wait_pod_terminal(gc_name, 900) + steps.append({"step": "wait-registry-gc", "result": waited_gc}) + logs = kctl(["-n", "hwlab-ci", "logs", gc_name], 120) + steps.append({"step": "registry-gc-logs", "result": bounded(logs)}) + if not waited_gc.get("ok"): + raise RuntimeError("registry GC pod did not complete successfully") + finally: + cleanup_gc = kctl(["-n", "hwlab-ci", "delete", "pod", gc_name, "--ignore-not-found=true"], 60) + steps.append({"step": "delete-registry-gc-pod", "result": bounded(cleanup_gc)}) + scale_up = kctl(["-n", "hwlab-ci", "scale", "deploy", "hwlab-registry", "--replicas=%s" % int(deployment.get("replicas") or 1)], 60) + steps.append({"step": "scale-registry-up", "result": bounded(scale_up)}) + rollout = kctl(["-n", "hwlab-ci", "rollout", "status", "deploy/hwlab-registry", "--timeout=180s"], 200) + steps.append({"step": "wait-registry-rollout", "result": bounded(rollout)}) + for name, was_suspended in original_crons.items(): + restore = patch_cronjob_suspend(name, was_suspended) + steps.append({"step": "restore-cronjob", "name": name, "suspend": was_suspended, "result": bounded(restore)}) + after = du_size(REGISTRY_ROOT, 60) or 0 + return { + "reclaimedBytes": max(0, before - after), + "commandOutput": { + "registryPlan": plan.get("summary"), + "deletedTagCount": len(delete_rows), + "deletedRevisionCount": len(delete_revision_rows), + "deletedManifestCount": sum(len(items) for items in delete_manifests.values()), + "diskBeforeBytes": before, + "diskAfterBytes": after, + "steps": steps[-12:], + }, + } + +def execute_registry_garbage_collect_only(): + if PROVIDER_ID.upper() != "G14": + raise RuntimeError("HWLAB registry garbage-collect is only supported on G14") + deployment = registry_deployment_preflight() + if not deployment.get("ok"): + raise RuntimeError("registry deployment preflight failed: %s" % deployment.get("reason")) + cronjobs = ["hwlab-g14-branch-poller", "hwlab-v02-branch-poller"] + original_crons = cronjob_suspend_states(cronjobs) + before = du_size(REGISTRY_ROOT, 60) or 0 + gc_name = "hwlab-registry-gc-%s" % int(time.time()) + steps = [] + try: + for name in original_crons: + result = patch_cronjob_suspend(name, True) + steps.append({"step": "suspend-cronjob", "name": name, "result": bounded(result)}) + if result["exitCode"] != 0: + raise RuntimeError("failed to suspend cronjob %s" % name) + idle_after_suspend = wait_no_active_hwlab_ci(180) + steps.append({"step": "idle-after-suspend", "result": idle_after_suspend}) + if not idle_after_suspend.get("ok"): + raise RuntimeError("refusing registry maintenance because hwlab-ci did not become idle after suspend") + + scale_down = kctl(["-n", "hwlab-ci", "scale", "deploy", "hwlab-registry", "--replicas=0"], 60) + steps.append({"step": "scale-registry-down", "result": bounded(scale_down)}) + if scale_down["exitCode"] != 0: + raise RuntimeError("failed to scale registry down") + waited_down = wait_registry_pod_count(0, 120) + steps.append({"step": "wait-registry-down", "result": waited_down}) + if not waited_down.get("ok"): + raise RuntimeError("registry pod did not scale down") + + overrides = { + "apiVersion": "v1", + "spec": { + "restartPolicy": "Never", + "containers": [{ + "name": "registry-gc", + "image": "registry:2.8.3", + "command": ["registry", "garbage-collect", "/etc/docker/registry/config.yml"], + "volumeMounts": [{"name": "storage", "mountPath": "/var/lib/registry"}], + }], + "volumes": [{"name": "storage", "hostPath": {"path": REGISTRY_ROOT, "type": "DirectoryOrCreate"}}], + }, + } + run_gc = kctl(["-n", "hwlab-ci", "run", gc_name, "--restart=Never", "--image=registry:2.8.3", "--overrides=%s" % json.dumps(overrides)], 60) + steps.append({"step": "start-registry-gc-pod", "result": bounded(run_gc), "pod": gc_name}) + if run_gc["exitCode"] != 0: + raise RuntimeError("failed to start registry GC pod") + waited_gc = wait_pod_terminal(gc_name, 900) + steps.append({"step": "wait-registry-gc", "result": waited_gc}) + logs = kctl(["-n", "hwlab-ci", "logs", gc_name], 120) + steps.append({"step": "registry-gc-logs", "result": bounded(logs)}) + if not waited_gc.get("ok"): + raise RuntimeError("registry GC pod did not complete successfully") + finally: + cleanup_gc = kctl(["-n", "hwlab-ci", "delete", "pod", gc_name, "--ignore-not-found=true"], 60) + steps.append({"step": "delete-registry-gc-pod", "result": bounded(cleanup_gc)}) + scale_up = kctl(["-n", "hwlab-ci", "scale", "deploy", "hwlab-registry", "--replicas=%s" % int(deployment.get("replicas") or 1)], 60) + steps.append({"step": "scale-registry-up", "result": bounded(scale_up)}) + rollout = kctl(["-n", "hwlab-ci", "rollout", "status", "deploy/hwlab-registry", "--timeout=180s"], 200) + steps.append({"step": "wait-registry-rollout", "result": bounded(rollout)}) + for name, was_suspended in original_crons.items(): + restore = patch_cronjob_suspend(name, was_suspended) + steps.append({"step": "restore-cronjob", "name": name, "suspend": was_suspended, "result": bounded(restore)}) + after = du_size(REGISTRY_ROOT, 60) or 0 + return { + "reclaimedBytes": max(0, before - after), + "commandOutput": { + "message": "official registry garbage-collect only; no additional tag deletion", + "diskBeforeBytes": before, + "diskAfterBytes": after, + "steps": steps[-12:], + }, + } + +def start_registry_retention_job(mode): + job_id = "g14-registry-%s-%s" % (int(time.time()), os.getpid()) + paths = job_paths(job_id) + started_at = now_iso() + initial = { + "ok": True, + "action": "gc remote status", + "providerId": PROVIDER_ID, + "jobId": job_id, + "status": "running", + "kind": "hwlab-registry-retention-gc" if mode == "retention" else "hwlab-registry-garbage-collect", + "mode": mode, + "startedAt": started_at, + "statePath": paths["state"], + "logPath": paths["log"], + "options": OPTIONS, + } + write_json_atomic(paths["state"], initial) + pid = os.fork() + if pid != 0: + return { + "status": "started", + "reclaimedBytes": None, + "commandOutput": { + "jobId": job_id, + "pid": pid, + "statePath": paths["state"], + "logPath": paths["log"], + "statusCommand": "bun scripts/cli.ts gc remote %s status --job-id %s" % (PROVIDER_ID, job_id), + "message": "registry retention GC is running as a detached remote job", + }, + } + + try: + os.setsid() + except Exception: + pass + try: + devnull = os.open(os.devnull, os.O_RDONLY) + os.dup2(devnull, 0) + os.close(devnull) + except Exception: + pass + try: + log_handle = open(paths["log"], "a", encoding="utf-8", buffering=1) + os.dup2(log_handle.fileno(), 1) + os.dup2(log_handle.fileno(), 2) + except Exception: + log_handle = None + try: + print("[%s] starting HWLAB registry %s job %s" % (now_iso(), mode, job_id), flush=True) + result = execute_registry_retention() if mode == "retention" else execute_registry_garbage_collect_only() + payload = dict(initial) + payload.update({ + "status": "succeeded", + "finishedAt": now_iso(), + "result": result, + "diskAfter": df_snapshot(), + "clusterAfter": cluster_preflight(), + }) + write_json_atomic(paths["state"], payload) + print("[%s] completed HWLAB registry %s job %s" % (now_iso(), mode, job_id), flush=True) + os._exit(0) + except Exception as exc: + payload = dict(initial) + payload.update({ + "ok": False, + "status": "failed", + "finishedAt": now_iso(), + "error": str(exc), + "diskAfter": df_snapshot(), + "clusterAfter": cluster_preflight(), + }) + try: + write_json_atomic(paths["state"], payload) + except Exception: + pass + print("[%s] failed HWLAB registry %s job %s: %s" % (now_iso(), mode, job_id, exc), flush=True) + os._exit(1) + finally: + try: + if log_handle: + log_handle.close() + except Exception: + pass diff --git a/scripts/src/gc-remote-runner.py b/scripts/src/gc-remote-runner.py index fffff0ed..f84e8dfc 100644 --- a/scripts/src/gc-remote-runner.py +++ b/scripts/src/gc-remote-runner.py @@ -18,6 +18,9 @@ OPTIONS = CONFIG.get("options") or {} REMOTE_TARGET = CONFIG.get("remoteTarget") if isinstance(CONFIG.get("remoteTarget"), dict) else {} MEMORY_CONFIG = REMOTE_TARGET.get("memoryPressure") if isinstance(REMOTE_TARGET.get("memoryPressure"), dict) else {} PVC_CONFIG = REMOTE_TARGET.get("pvcAttribution") if isinstance(REMOTE_TARGET.get("pvcAttribution"), dict) else {} +CONTAINERD_CONFIG = REMOTE_TARGET.get("containerdImageCache") if isinstance(REMOTE_TARGET.get("containerdImageCache"), dict) else {} +HOST_CONTAINERD_CONFIG = REMOTE_TARGET.get("hostContainerdCache") if isinstance(REMOTE_TARGET.get("hostContainerdCache"), dict) else {} +LOCAL_PATH_CONFIG = REMOTE_TARGET.get("localPathStorage") if isinstance(REMOTE_TARGET.get("localPathStorage"), dict) else {} POLICY_TIMER_CONFIG = REMOTE_TARGET.get("policyTimer") if isinstance(REMOTE_TARGET.get("policyTimer"), dict) else {} TMP_PREFIX_ALLOWLIST = [ @@ -324,6 +327,7 @@ def remote_gc_live_status(observed_at, preflight): memory_pressure = collect_memory_pressure() ci_storage = ci_storage_snapshot() compact_pvc = compact_pvc_attribution(ci_storage) + policy = growth_watermark_policy(df_snapshot() or {}) return { "ok": True, "action": "gc remote status", @@ -332,10 +336,10 @@ def remote_gc_live_status(observed_at, preflight): "mutation": False, "observedAt": observed_at, "disk": df_snapshot(), - "clusterPreflight": preflight, + "clusterPreflight": preflight if bool(OPTIONS.get("full")) else {key: preflight.get(key) for key in ["ok", "reason", "providerId", "hostname", "expectedNode", "nodes"]}, "memoryPressure": compact_memory_pressure(memory_pressure), "pvcAttribution": compact_pvc, - "policy": growth_watermark_policy(df_snapshot() or {}), + "policy": policy if bool(OPTIONS.get("full")) else {key: policy.get(key) for key in ["state", "recommendedAction"]}, "next": { "snapshot": "bun scripts/cli.ts gc remote %s snapshot --history-limit %s" % (PROVIDER_ID, int(OPTIONS.get("historyLimit") or 12)), "plan": "bun scripts/cli.ts gc remote %s plan --target-use-percent --limit %s" % (PROVIDER_ID, int(OPTIONS.get("limit") or 50)), @@ -677,6 +681,8 @@ def collect_web_observe_summary(): "policy": "analysis-only; active or stale observe runs must be stopped/retained through controlled observer lifecycle commands, not raw process kill or directory deletion", } +# __UNIDESK_GC_REMOTE_WEB_OBSERVE_HELPERS__ + def collect_memory_pressure(): patterns = config_list(MEMORY_CONFIG, "processPatterns", []) if not patterns: @@ -737,273 +743,30 @@ def containerd_breakdown_snapshot(): rows = [item for item in rows if item.get("exists")] return { "state": "observation-only", - "cleanupSupported": False, - "reason": "containerd cleanup still requires a reference-safe image/content classifier; this snapshot only classifies growth sources", + "cleanupSupported": bool(CONTAINERD_CONFIG), + "reason": "k3s image cache cleanup requires --include-k3s-image-cache and uses CRI prune only; host containerd remains observation-only", "breakdown": rows, } -def pv_host_path(pv): - spec = (pv or {}).get("spec") or {} - host_path = (spec.get("hostPath") or {}).get("path") - if isinstance(host_path, str) and host_path: - return host_path - local_path = (spec.get("local") or {}).get("path") - if isinstance(local_path, str) and local_path: - return local_path - return None - -def pvc_owner_group(namespace, owner): - owner = str(owner or "") - if namespace == "agentrun-ci": - return "agentrun" - if namespace == "hwlab-ci": - if owner.startswith("agentrun-"): - return "agentrun" - return "hwlab" - if namespace.startswith("hwlab-"): - return "hwlab-runtime" - return "other" - -def parse_k8s_quantity(value): - if value is None: - return None - raw = str(value).strip() - match = re.match(r"^([0-9]+(?:\.[0-9]+)?)(Ki|Mi|Gi|Ti|K|M|G|T)?$", raw) - if not match: - return None - multiplier = { - None: 1, - "K": 1000, - "M": 1000**2, - "G": 1000**3, - "T": 1000**4, - "Ki": 1024, - "Mi": 1024**2, - "Gi": 1024**3, - "Ti": 1024**4, - }.get(match.group(2), 1) - return int(float(match.group(1)) * multiplier) - -def metadata_owner(meta): - refs = meta.get("ownerReferences") or [] - if refs: - first = refs[0] or {} - return first.get("kind"), first.get("name"), [{"kind": item.get("kind"), "name": item.get("name")} for item in refs[:5]] - labels = meta.get("labels") or {} - annotations = meta.get("annotations") or {} - for key in [ - "tekton.dev/pipelineRun", - "tekton.dev/taskRun", - "agentrun.unidesk/run-id", - "hwlab.unidesk/run-id", - "app.kubernetes.io/instance", - ]: - value = labels.get(key) or annotations.get(key) - if value: - return "Label", value, [] - return None, None, [] - -def ci_storage_snapshot(): - namespaces = set(config_list(PVC_CONFIG, "namespaces", ["hwlab-ci", "agentrun-ci"])) - candidate_namespaces = set(config_list(PVC_CONFIG, "candidateNamespaces", [])) - hwlab_node = config_str(PVC_CONFIG, "hwlabNode", PROVIDER_ID) - hwlab_lane = config_str(PVC_CONFIG, "hwlabLane", "v03") - agentrun_node = config_str(PVC_CONFIG, "agentrunNode", PROVIDER_ID) - agentrun_lane = config_str(PVC_CONFIG, "agentrunLane", "v02") - limit = config_int(PVC_CONFIG, "limit", int(OPTIONS.get("limit") or 50), minimum=1, maximum=5000) - pv_data = kubectl_json(["get", "pv"], 30) or {} - pvc_data = kubectl_json(["get", "pvc", "-A"], 30) or {} - pod_data = kubectl_json(["get", "pod", "-A"], 30) or {} - pvs = {} - for pv in pv_data.get("items") or []: - meta = pv.get("metadata") or {} - name = meta.get("name") - if name: - pvs[name] = pv - mounts = {} - for pod in pod_data.get("items") or []: - meta = pod.get("metadata") or {} - ns = str(meta.get("namespace") or "") - pod_name = str(meta.get("name") or "") - phase = str(((pod.get("status") or {}).get("phase")) or "") - if phase in set(["Succeeded", "Failed"]): - continue - spec = pod.get("spec") or {} - for vol in spec.get("volumes") or []: - claim = (vol.get("persistentVolumeClaim") or {}).get("claimName") - if claim: - mounts.setdefault((ns, claim), []).append(pod_name) - rows = [] - for pvc in pvc_data.get("items") or []: - meta = pvc.get("metadata") or {} - spec = pvc.get("spec") or {} - status = pvc.get("status") or {} - ns = str(meta.get("namespace") or "") - name = str(meta.get("name") or "") - if ns not in namespaces: - continue - volume = str(spec.get("volumeName") or "") - pv = pvs.get(volume) or {} - pv_spec = pv.get("spec") or {} - pv_meta = pv.get("metadata") or {} - owner_kind, owner_name, owner_refs = metadata_owner(meta) - requested = parse_k8s_quantity((((spec.get("resources") or {}).get("requests") or {}).get("storage"))) - host_path = pv_host_path(pv) - active = sorted(mounts.get((ns, name), [])) - estimated = du_size(host_path, 8) if host_path else None - candidate_reasons = [] - if not active: - candidate_reasons.append("no-active-mount-observed") - if status.get("phase") != "Bound": - candidate_reasons.append("pvc-not-bound") - if (pv.get("status") or {}).get("phase") == "Released": - candidate_reasons.append("pv-released") - review_candidate = ns in candidate_namespaces and len(candidate_reasons) > 0 - rows.append({ - "namespace": ns, - "pvc": name, - "volume": volume or None, - "phase": status.get("phase"), - "pvPhase": (pv.get("status") or {}).get("phase"), - "ownerKind": owner_kind, - "owner": owner_name, - "ownerRefs": owner_refs, - "ownerGroup": pvc_owner_group(ns, owner_name), - "storageClass": spec.get("storageClassName") or pv_spec.get("storageClassName"), - "reclaimPolicy": pv_spec.get("persistentVolumeReclaimPolicy"), - "requestedBytes": requested, - "requestedHuman": fmt_bytes(requested or 0), - "hostPath": host_path, - "pvCreatedAt": (pv_meta.get("creationTimestamp") if isinstance(pv_meta, dict) else None), - "pvcCreatedAt": meta.get("creationTimestamp"), - "activeMountPods": active, - "estimatedBytes": estimated, - "estimatedHuman": fmt_bytes(estimated or 0), - "reviewCandidate": review_candidate, - "reviewReasons": candidate_reasons, - "dryRunOnly": True, - }) - rows.sort(key=lambda item: safe_int(item.get("estimatedBytes")), reverse=True) - by_namespace = {} - by_owner_group = {} - for row in rows: - for bucket, key in [(by_namespace, row.get("namespace") or "unknown"), (by_owner_group, row.get("ownerGroup") or "unknown")]: - current = bucket.setdefault(key, {"count": 0, "estimatedBytes": 0, "activeMountCount": 0}) - current["count"] += 1 - current["estimatedBytes"] += safe_int(row.get("estimatedBytes")) - current["activeMountCount"] += len(row.get("activeMountPods") or []) - current["estimatedHuman"] = fmt_bytes(current["estimatedBytes"]) - review_candidates = [row for row in rows if row.get("reviewCandidate")] - return { - "scope": "YAML-configured PVC namespaces", - "configSource": "config/unidesk-cli.yaml#gc.remote.targets.%s.pvcAttribution" % PROVIDER_ID, - "namespaces": sorted(namespaces), - "candidateNamespaces": sorted(candidate_namespaces), - "pvcCount": len(rows), - "reviewCandidateCount": len(review_candidates), - "estimatedBytes": sum(safe_int(row.get("estimatedBytes")) for row in rows), - "estimatedHuman": fmt_bytes(sum(safe_int(row.get("estimatedBytes")) for row in rows)), - "requestedBytes": sum(safe_int(row.get("requestedBytes")) for row in rows), - "requestedHuman": fmt_bytes(sum(safe_int(row.get("requestedBytes")) for row in rows)), - "byNamespace": by_namespace, - "byOwnerGroup": by_owner_group, - "topPvcs": rows[:limit], - "reviewCandidates": review_candidates[:limit], - "handoff": { - "hwlab": { - "dryRun": "bun scripts/cli.ts hwlab nodes control-plane cleanup-runs --node %s --lane %s --min-age-minutes 30 --limit 200 --dry-run" % (hwlab_node, hwlab_lane), - "releasedPvs": "bun scripts/cli.ts hwlab nodes control-plane cleanup-released-pvs --node %s --lane %s --limit 200 --dry-run" % (hwlab_node, hwlab_lane), - }, - "agentrun": { - "dryRun": "bun scripts/cli.ts agentrun control-plane cleanup-runs --node %s --lane %s --min-age-minutes 30 --limit 200 --dry-run" % (agentrun_node, agentrun_lane), - "releasedPvs": "bun scripts/cli.ts agentrun control-plane cleanup-released-pvs --node %s --lane %s --limit 200 --dry-run" % (agentrun_node, agentrun_lane), - }, - }, - "policy": "analysis-only; remote GC never deletes PVC/PV/local-path data and only hands off to owner-aware retention commands", - } - -def compact_pvc_row(row): - return { - "namespace": row.get("namespace"), - "pvc": row.get("pvc"), - "volume": row.get("volume"), - "phase": row.get("phase"), - "pvPhase": row.get("pvPhase"), - "ownerKind": row.get("ownerKind"), - "owner": row.get("owner"), - "ownerGroup": row.get("ownerGroup"), - "storageClass": row.get("storageClass"), - "reclaimPolicy": row.get("reclaimPolicy"), - "requestedBytes": row.get("requestedBytes"), - "requestedHuman": row.get("requestedHuman"), - "estimatedBytes": row.get("estimatedBytes"), - "estimatedHuman": row.get("estimatedHuman"), - "activeMountCount": len(row.get("activeMountPods") or []), - "activeMountPods": (row.get("activeMountPods") or [])[:5], - "reviewCandidate": row.get("reviewCandidate"), - "reviewReasons": row.get("reviewReasons"), - "dryRunOnly": True, - } - -def compact_pvc_attribution(payload): - if bool(OPTIONS.get("full")): - return payload - limit = 1 - top = payload.get("topPvcs") or [] - review = payload.get("reviewCandidates") or [] - return { - "scope": payload.get("scope"), - "configSource": payload.get("configSource"), - "namespaces": payload.get("namespaces"), - "candidateNamespaces": payload.get("candidateNamespaces"), - "pvcCount": payload.get("pvcCount"), - "reviewCandidateCount": payload.get("reviewCandidateCount"), - "estimatedBytes": payload.get("estimatedBytes"), - "estimatedHuman": payload.get("estimatedHuman"), - "requestedBytes": payload.get("requestedBytes"), - "requestedHuman": payload.get("requestedHuman"), - "byNamespace": payload.get("byNamespace"), - "byOwnerGroup": payload.get("byOwnerGroup"), - "topPvcs": [compact_pvc_row(row) for row in top[:limit] if isinstance(row, dict)], - "reviewCandidates": [compact_pvc_row(row) for row in review[:limit] if isinstance(row, dict)], - "handoff": payload.get("handoff"), - "policy": payload.get("policy"), - "compacted": True, - "fullDisclosure": "rerun with --full for hostPath, creation timestamps and complete row details", - } - -def compact_ci_storage_summary(payload): - return { - "scope": payload.get("scope"), - "configSource": payload.get("configSource"), - "pvcCount": payload.get("pvcCount"), - "reviewCandidateCount": payload.get("reviewCandidateCount"), - "estimatedBytes": payload.get("estimatedBytes"), - "estimatedHuman": payload.get("estimatedHuman"), - "requestedBytes": payload.get("requestedBytes"), - "requestedHuman": payload.get("requestedHuman"), - "compacted": True, - "fullDisclosure": "use pvcAttribution or --full for row-level details", - } +# __UNIDESK_GC_REMOTE_CONTAINERD_HELPERS__ +# __UNIDESK_GC_REMOTE_PVC_HELPERS__ def compact_memory_pressure(payload): if bool(OPTIONS.get("full")): return payload - processes = payload.get("processes") or {} observe = payload.get("webObserve") or {} - process_limit = max(1, min(int(OPTIONS.get("limit") or 50), 8)) - signal_limit = max(1, min(int(OPTIONS.get("limit") or 50), 5)) - compact_processes = dict(processes) - compact_processes["top"] = (processes.get("top") or [])[:process_limit] - compact_observe = dict(observe) - compact_observe["activeSignals"] = (observe.get("activeSignals") or [])[:signal_limit] - compact_observe["staleSignals"] = (observe.get("staleSignals") or [])[:signal_limit] return { "ok": payload.get("ok"), "configSource": payload.get("configSource"), - "hostMemory": payload.get("hostMemory"), - "processes": compact_processes, - "webObserve": compact_observe, + "hostMemory": (payload.get("hostMemory") or {}).get("memory"), + "webObserve": { + "rootCount": observe.get("rootCount"), + "totalBytes": observe.get("totalBytes"), + "totalHuman": observe.get("totalHuman"), + "runCount": observe.get("runCount"), + "activeSignalCount": observe.get("activeSignalCount"), + "staleSignalCount": observe.get("staleSignalCount"), + }, "summary": payload.get("summary"), "drillDown": payload.get("drillDown"), "compacted": True, @@ -1027,299 +790,7 @@ def compact_memory_summary(payload): "drillDown": "bun scripts/cli.ts gc remote %s status --limit %s" % (PROVIDER_ID, int(OPTIONS.get("limit") or 50)), } -def registry_growth_snapshot(): - summary = { - "path": REGISTRY_ROOT, - "sizeBytes": du_size(REGISTRY_ROOT, 60) or 0, - } - summary["sizeHuman"] = fmt_bytes(summary["sizeBytes"]) - if OPTIONS.get("hwlabRegistry", False): - plan = plan_registry_retention() - retention = dict(plan.get("summary") or {}) - for key in ["registrySizeBytes", "estimatedReclaimBytes"]: - if key in retention: - retention[key.replace("Bytes", "Human")] = fmt_bytes(retention.get(key) or 0) - summary["retentionPlan"] = retention - else: - summary["retentionPlan"] = { - "skipped": True, - "reason": "rerun snapshot with --include-hwlab-registry to compute tag/revision retention counters", - } - summary["cadence"] = { - "dryRun": "daily or before/after every v0.2 CI/CD burst", - "maintenanceRun": "weekly, or when root >=80%, or when registry growth exceeds the agreed daily threshold", - "planCommand": "bun scripts/cli.ts gc remote %s plan --target-use-percent 70 --include-hwlab-registry --limit 50" % PROVIDER_ID, - "snapshotCommand": "bun scripts/cli.ts gc remote %s snapshot --include-hwlab-registry --history-limit 12" % PROVIDER_ID, - "runCommand": "bun scripts/cli.ts gc remote %s run --confirm --include-hwlab-registry --target-use-percent 70 --limit 50" % PROVIDER_ID, - "defaultRetention": { - "keepPerRepo": int(OPTIONS.get("registryKeepPerRepo") or 20), - "minAgeHours": float(OPTIONS.get("registryMinAgeHours") or 48), - "protects": ["current workload refs", "digest closure", "protected tags", "recent tags", "newest N tags per repo"], - }, - } - return summary - -def growth_watermark_policy(root_disk): - use_percent = root_disk.get("usePercent") if isinstance(root_disk, dict) else None - if use_percent is None: - state = "unknown" - action = "collect-snapshot" - elif use_percent < 75: - state = "healthy" - action = "observe-trend" - elif use_percent < 80: - state = "watch" - action = "run-dry-run-plan" - elif use_percent < 85: - state = "maintenance" - action = "schedule-owner-aware-retention" - else: - state = "emergency" - action = "restore-runtime-then-file-evidence" - return { - "state": state, - "recommendedAction": action, - "watermarks": [ - {"range": "<75%", "action": "trend only"}, - {"range": "75%-80%", "action": "run dry-run plan and identify source"}, - {"range": "80%-85%", "action": "small owner-aware retention run"}, - {"range": ">=85%", "action": "runtime recovery first, then root-cause growth source"}, - ], - "growthThresholdPolicy": "If bytes/day remains high for consecutive snapshots, act before 80%; exact threshold should be set from the first week of saved snapshots.", - } - -def snapshot_metric_map(snapshot): - metrics = {} - root = snapshot.get("rootDisk") or {} - if isinstance(root, dict) and root.get("usedBytes") is not None: - metrics["root.usedBytes"] = {"value": safe_int(root.get("usedBytes")), "unit": "bytes", "label": "root used bytes"} - for item in snapshot.get("sources") or []: - if not isinstance(item, dict) or item.get("sizeBytes") is None: - continue - key = "source.%s.sizeBytes" % item.get("id") - metrics[key] = {"value": safe_int(item.get("sizeBytes")), "unit": "bytes", "label": item.get("label") or item.get("id")} - storage = ((snapshot.get("ciStorage") or {}).get("byOwnerGroup") or {}) - if not storage: - storage = ((snapshot.get("pvcAttribution") or {}).get("byOwnerGroup") or {}) - for owner, value in storage.items(): - metrics["ciStorage.%s.estimatedBytes" % owner] = {"value": safe_int((value or {}).get("estimatedBytes")), "unit": "bytes", "label": "CI storage %s" % owner} - memory = snapshot.get("memoryPressure") or {} - memory_summary = memory.get("summary") or {} - if memory_summary.get("matchedRssBytes") is not None: - metrics["memoryPressure.matchedRssBytes"] = {"value": safe_int(memory_summary.get("matchedRssBytes")), "unit": "bytes", "label": "matched observer/chrome RSS"} - if memory_summary.get("observeStateBytes") is not None: - metrics["memoryPressure.observeStateBytes"] = {"value": safe_int(memory_summary.get("observeStateBytes")), "unit": "bytes", "label": "web observe state bytes"} - for key in ["matchedProcessCount", "activeObserverSignals", "staleObserverSignals"]: - if memory_summary.get(key) is not None: - metrics["memoryPressure.%s" % key] = {"value": safe_int(memory_summary.get(key)), "unit": "count", "label": "memory pressure %s" % key} - registry = snapshot.get("registry") or {} - retention = registry.get("retentionPlan") or {} - for key in ["totalTags", "totalRevisions", "deleteTags", "deleteRevisions", "estimatedReclaimBytes"]: - if key in retention and retention.get(key) is not None: - unit = "bytes" if key.endswith("Bytes") else "count" - metrics["registry.%s" % key] = {"value": safe_int(retention.get(key)), "unit": unit, "label": "registry %s" % key} - return metrics - -def delta_metric_rows(before, after): - before_metrics = snapshot_metric_map(before) - after_metrics = snapshot_metric_map(after) - before_ts = iso_to_epoch(before.get("observedAt")) - after_ts = iso_to_epoch(after.get("observedAt")) - seconds = (after_ts - before_ts) if before_ts is not None and after_ts is not None else None - rows = [] - for key in sorted(set(before_metrics.keys()) | set(after_metrics.keys())): - old = before_metrics.get(key, {"value": 0, "unit": (after_metrics.get(key) or {}).get("unit"), "label": key}) - new = after_metrics.get(key, {"value": 0, "unit": old.get("unit"), "label": old.get("label")}) - delta = safe_int(new.get("value")) - safe_int(old.get("value")) - row = { - "key": key, - "label": new.get("label") or old.get("label") or key, - "unit": new.get("unit") or old.get("unit"), - "before": old.get("value"), - "after": new.get("value"), - "delta": delta, - } - if row["unit"] == "bytes": - row["beforeHuman"] = fmt_bytes(row["before"] or 0) - row["afterHuman"] = fmt_bytes(row["after"] or 0) - row["deltaHuman"] = ("-" if delta < 0 else "") + fmt_bytes(abs(delta)) - if seconds and seconds > 0: - per_day = int(delta * 86400 / seconds) - row["perDayBytes"] = per_day - row["perDayHuman"] = ("-" if per_day < 0 else "") + fmt_bytes(abs(per_day)) + "/day" - rows.append(row) - rows.sort(key=lambda item: safe_int(item.get("delta")), reverse=True) - return {"durationSeconds": seconds, "metrics": rows} - -def growth_trend_payload(points): - points = [point for point in points if isinstance(point, dict)] - if len(points) < 2: - return { - "pointCount": len(points), - "state": "insufficient-history", - "message": "Run snapshot at least twice to compute deltas.", - } - latest_delta = delta_metric_rows(points[-2], points[-1]) - window_delta = delta_metric_rows(points[0], points[-1]) - def rate_warning(delta): - seconds = delta.get("durationSeconds") - if seconds is not None and seconds < 3600: - return { - "code": "short-window-rate-noisy", - "message": "Per-day rates from windows shorter than 1 hour are directional only; use daily snapshots for governance decisions.", - "durationSeconds": seconds, - } - return None - return { - "pointCount": len(points), - "oldestAt": points[0].get("observedAt"), - "latestAt": points[-1].get("observedAt"), - "latestDelta": { - "durationSeconds": latest_delta.get("durationSeconds"), - "rateWarning": rate_warning(latest_delta), - "topGrowingBytes": [row for row in latest_delta.get("metrics", []) if row.get("unit") == "bytes" and safe_int(row.get("delta")) > 0][:10], - "topShrinkingBytes": [row for row in reversed(latest_delta.get("metrics", [])) if row.get("unit") == "bytes" and safe_int(row.get("delta")) < 0][:10], - "registryCounters": [row for row in latest_delta.get("metrics", []) if str(row.get("key", "")).startswith("registry.") and row.get("unit") == "count"], - }, - "windowDelta": { - "durationSeconds": window_delta.get("durationSeconds"), - "rateWarning": rate_warning(window_delta), - "topGrowingBytes": [row for row in window_delta.get("metrics", []) if row.get("unit") == "bytes" and safe_int(row.get("delta")) > 0][:10], - "topShrinkingBytes": [row for row in reversed(window_delta.get("metrics", [])) if row.get("unit") == "bytes" and safe_int(row.get("delta")) < 0][:10], - "registryCounters": [row for row in window_delta.get("metrics", []) if str(row.get("key", "")).startswith("registry.") and row.get("unit") == "count"], - }, - } - -def compact_metric_rows(rows, limit=3): - compact = [] - for row in (rows or [])[:limit]: - compact.append({ - "key": row.get("key"), - "label": row.get("label"), - "unit": row.get("unit"), - "delta": row.get("delta"), - "deltaHuman": row.get("deltaHuman"), - "perDayHuman": row.get("perDayHuman"), - }) - return compact - -def compact_trend_payload(payload): - if payload.get("state") == "insufficient-history": - return payload - latest = payload.get("latestDelta") or {} - window = payload.get("windowDelta") or {} - return { - "pointCount": payload.get("pointCount"), - "oldestAt": payload.get("oldestAt"), - "latestAt": payload.get("latestAt"), - "latestDelta": { - "durationSeconds": latest.get("durationSeconds"), - "rateWarning": latest.get("rateWarning"), - "topGrowingBytes": compact_metric_rows(latest.get("topGrowingBytes") or [], 1), - "topShrinkingBytes": compact_metric_rows(latest.get("topShrinkingBytes") or [], 1), - "registryCounters": compact_metric_rows(latest.get("registryCounters") or [], 1), - }, - "windowDelta": { - "durationSeconds": window.get("durationSeconds"), - "rateWarning": window.get("rateWarning"), - "topGrowingBytes": compact_metric_rows(window.get("topGrowingBytes") or [], 1), - "topShrinkingBytes": compact_metric_rows(window.get("topShrinkingBytes") or [], 1), - "registryCounters": compact_metric_rows(window.get("registryCounters") or [], 1), - }, - "fullDisclosure": "rerun trend --full for all metric rows", - } - -def compact_growth_point(item): - registry = item.get("registry") or {} - retention = registry.get("retentionPlan") or {} - ci_storage = item.get("ciStorage") or {} - containerd = item.get("containerd") or {} - memory = item.get("memoryPressure") or {} - memory_summary = memory.get("summary") or {} - observe = (memory.get("webObserve") or {}) - return { - "observedAt": item.get("observedAt"), - "rootDisk": item.get("rootDisk"), - "sourceCount": len(item.get("sources") or []), - "registry": { - "sizeBytes": registry.get("sizeBytes"), - "sizeHuman": registry.get("sizeHuman"), - "totalTags": retention.get("totalTags"), - "totalRevisions": retention.get("totalRevisions"), - "deleteTags": retention.get("deleteTags"), - "deleteRevisions": retention.get("deleteRevisions"), - "estimatedReclaimBytes": retention.get("estimatedReclaimBytes"), - "estimatedReclaimHuman": retention.get("estimatedReclaimHuman"), - }, - "ciStorage": { - "pvcCount": ci_storage.get("pvcCount"), - "estimatedBytes": ci_storage.get("estimatedBytes"), - "estimatedHuman": ci_storage.get("estimatedHuman"), - "byOwnerGroup": ci_storage.get("byOwnerGroup"), - }, - "containerd": { - "state": containerd.get("state"), - "cleanupSupported": containerd.get("cleanupSupported"), - }, - "memoryPressure": { - "matchedProcessCount": memory_summary.get("matchedProcessCount"), - "matchedRssBytes": memory_summary.get("matchedRssBytes"), - "matchedRssHuman": memory_summary.get("matchedRssHuman"), - "activeObserverSignals": memory_summary.get("activeObserverSignals"), - "staleObserverSignals": memory_summary.get("staleObserverSignals"), - "observeStateBytes": memory_summary.get("observeStateBytes"), - "observeStateHuman": memory_summary.get("observeStateHuman"), - "webObserveRootCount": observe.get("rootCount"), - }, - } - -def collect_growth_snapshot(observed_at, preflight): - root_disk = df_snapshot() - sources = disk_source_snapshot() - ci_storage = ci_storage_snapshot() - memory_pressure = collect_memory_pressure() - compact_pvc = compact_pvc_attribution(ci_storage) - if bool(OPTIONS.get("full")): - public_pvc = ci_storage - public_memory = memory_pressure - else: - public_pvc = compact_ci_storage_summary(ci_storage) - public_memory = compact_memory_summary(memory_pressure) - registry = registry_growth_snapshot() - containerd = containerd_breakdown_snapshot() - commands = { - "snapshot": "bun scripts/cli.ts gc remote %s snapshot --include-hwlab-registry --history-limit %s" % (PROVIDER_ID, int(OPTIONS.get("historyLimit") or 12)), - "trend": "bun scripts/cli.ts gc remote %s trend --history-limit %s" % (PROVIDER_ID, int(OPTIONS.get("historyLimit") or 12)), - "registryPlan": "bun scripts/cli.ts gc remote %s plan --target-use-percent 70 --include-hwlab-registry --limit 50" % PROVIDER_ID, - "hwlabCiRetention": ((ci_storage.get("handoff") or {}).get("hwlab") or {}).get("dryRun"), - "agentrunRetention": ((ci_storage.get("handoff") or {}).get("agentrun") or {}).get("dryRun"), - "remotePolicy": "bun scripts/cli.ts gc remote %s policy plan" % PROVIDER_ID, - } - if not bool(OPTIONS.get("full")): - commands = { - "trend": "bun scripts/cli.ts gc remote %s trend --history-limit %s" % (PROVIDER_ID, int(OPTIONS.get("historyLimit") or 12)), - "status": "bun scripts/cli.ts gc remote %s status --limit %s" % (PROVIDER_ID, int(OPTIONS.get("limit") or 50)), - "full": "bun scripts/cli.ts gc remote %s snapshot --full --no-save" % PROVIDER_ID, - } - return { - "ok": True, - "action": "gc remote snapshot", - "providerId": PROVIDER_ID, - "dryRun": True, - "mutation": False, - "diagnosticStateMutation": bool(OPTIONS.get("saveSnapshot", True)), - "observedAt": observed_at, - "rootDisk": root_disk, - "clusterPreflight": preflight, - "sources": sources, - "registry": registry, - "pvcAttribution": public_pvc, - "memoryPressure": public_memory, - "containerd": containerd, - "policy": growth_watermark_policy(root_disk or {}), - "commands": commands, - } - +# __UNIDESK_GC_REMOTE_GROWTH_HELPERS__ def allocated_file_size(path): try: stat = os.stat(path) @@ -1445,684 +916,7 @@ def cluster_preflight(): "hwlabDevPodCommand": bounded(pods_cmd), } -def active_hwlab_ci_writes(): - result = command(["sh", "-lc", "KUBECONFIG=/etc/rancher/k3s/k3s.yaml kubectl get pipelinerun,taskrun -n hwlab-ci --no-headers 2>/dev/null | awk '$2 != \"True\" && $2 != \"False\" {print}' | head -40"], 15) - lines = [line for line in (result.get("stdout") or "").splitlines() if line.strip()] - return {"ok": result["exitCode"] == 0, "activeCount": len(lines), "activePreview": lines, "command": bounded(result)} - -def active_hwlab_ci_jobs(): - result = command(["sh", "-lc", "KUBECONFIG=/etc/rancher/k3s/k3s.yaml kubectl get jobs -n hwlab-ci --no-headers 2>/dev/null | awk '$2 != \"Complete\" && $2 != \"Failed\" {print}' | head -40"], 15) - lines = [line for line in (result.get("stdout") or "").splitlines() if line.strip()] - return {"ok": result["exitCode"] == 0, "activeCount": len(lines), "activePreview": lines, "command": bounded(result)} - -def wait_no_active_hwlab_ci(timeout=180): - deadline = time.time() + timeout - last = None - while time.time() < deadline: - writes = active_hwlab_ci_writes() - jobs = active_hwlab_ci_jobs() - last = {"writes": writes, "jobs": jobs} - if writes.get("ok") and jobs.get("ok") and int(writes.get("activeCount") or 0) == 0 and int(jobs.get("activeCount") or 0) == 0: - return {"ok": True, "last": last} - time.sleep(5) - return {"ok": False, "last": last} - -def kubectl_json(args, timeout=20): - result = command(["env", "KUBECONFIG=/etc/rancher/k3s/k3s.yaml", "kubectl"] + args + ["-o", "json"], timeout) - if result["exitCode"] != 0: - return None - try: - return json.loads(result["stdout"] or "{}") - except Exception: - return None - -def kctl(args, timeout=30): - return command(["env", "KUBECONFIG=/etc/rancher/k3s/k3s.yaml", "kubectl"] + args, timeout) - -def workload_image_refs(): - result = command(["sh", "-lc", "KUBECONFIG=/etc/rancher/k3s/k3s.yaml kubectl get deploy,sts,ds,pod -A -o jsonpath='{range .items[*]}{range .spec.containers[*]}{.image}{\"\\n\"}{end}{range .spec.initContainers[*]}{.image}{\"\\n\"}{end}{range .spec.template.spec.containers[*]}{.image}{\"\\n\"}{end}{range .spec.template.spec.initContainers[*]}{.image}{\"\\n\"}{end}{end}' 2>/dev/null | sort -u"], 30) - refs = set() - digests = set() - for image in (result.get("stdout") or "").splitlines(): - image = image.strip() - if not image.startswith("127.0.0.1:5000/"): - continue - ref = image.split("127.0.0.1:5000/", 1)[1] - if "@sha256:" in ref: - repo, digest = ref.split("@", 1) - refs.add((repo, "@" + digest)) - digests.add("sha256:" + digest.split(":", 1)[1]) - elif ":" in ref: - repo, tag = ref.rsplit(":", 1) - refs.add((repo, tag)) - return refs, digests, bounded(result) - -def registry_request(method, path, headers=None, timeout=20): - url = "http://127.0.0.1:5000" + path - req = urllib.request.Request(url, method=method, headers=headers or {}) - with urllib.request.urlopen(req, timeout=timeout) as response: - body = response.read() - return {"status": response.status, "headers": dict(response.headers), "body": body.decode("utf-8", errors="replace")} - -def registry_tag_rows(): - rows = [] - root = REGISTRY_REPOSITORY_ROOT - if not os.path.isdir(root): - return rows - for repo_root, dirs, files in os.walk(root): - if os.path.basename(repo_root) != "tags": - continue - rel = os.path.relpath(repo_root, root) - suffix = "/_manifests/tags" - if not rel.endswith(suffix): - continue - repo = rel[:-len(suffix)] - try: - tags = os.listdir(repo_root) - except OSError: - continue - for tag in sorted(tags): - link = os.path.join(repo_root, tag, "current", "link") - if not os.path.isfile(link): - continue - try: - with open(link, "r", encoding="utf-8") as handle: - digest = handle.read().strip() - stat = os.stat(link) - except OSError: - continue - rows.append({ - "repo": repo, - "tag": tag, - "digest": digest, - "mtime": stat.st_mtime, - "mtimeIso": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime(stat.st_mtime)), - "path": os.path.join(repo_root, tag), - }) - return rows - -def registry_revision_rows(): - rows = [] - root = REGISTRY_REPOSITORY_ROOT - if not os.path.isdir(root): - return rows - for repo_root, dirs, files in os.walk(root): - if os.path.basename(repo_root) != "sha256": - continue - rel = os.path.relpath(repo_root, root) - suffix = "/_manifests/revisions/sha256" - if not rel.endswith(suffix): - continue - repo = rel[:-len(suffix)] - try: - revisions = os.listdir(repo_root) - except OSError: - continue - for digest_hex in sorted(revisions): - path = os.path.join(repo_root, digest_hex) - link = os.path.join(path, "link") - if not os.path.isfile(link): - continue - try: - with open(link, "r", encoding="utf-8") as handle: - digest = handle.read().strip() - stat = os.stat(link) - except OSError: - continue - rows.append({ - "repo": repo, - "digest": digest, - "mtime": stat.st_mtime, - "mtimeIso": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime(stat.st_mtime)), - "path": path, - }) - return rows - -def registry_retention_repo(repo): - return repo.startswith("hwlab/hwlab-") or repo.startswith("hwlab/cache/hwlab-") - -def registry_digest_hex(digest): - if not isinstance(digest, str) or not digest.startswith("sha256:"): - return None - value = digest.split(":", 1)[1] - if re.match(r"^[0-9a-f]{64}$", value) is None: - return None - return value - -def registry_blob_data_path(digest): - value = registry_digest_hex(digest) - if value is None: - return None - return os.path.join(REGISTRY_ROOT, "docker/registry/v2/blobs/sha256", value[:2], value, "data") - -_manifest_cache = {} -def registry_manifest_json(digest): - if digest in _manifest_cache: - return _manifest_cache[digest] - path = registry_blob_data_path(digest) - if path is None or not os.path.isfile(path): - _manifest_cache[digest] = None - return None - try: - with open(path, "rb") as handle: - data = handle.read(8 * 1024 * 1024) - value = json.loads(data.decode("utf-8")) - except Exception: - value = None - _manifest_cache[digest] = value - return value - -def registry_manifest_refs(digest): - manifest = registry_manifest_json(digest) - if not isinstance(manifest, dict): - return set() - refs = set() - config = manifest.get("config") or {} - config_digest = config.get("digest") - if isinstance(config_digest, str) and registry_digest_hex(config_digest) is not None: - refs.add(config_digest) - for item in manifest.get("layers") or []: - item_digest = (item or {}).get("digest") - if isinstance(item_digest, str) and registry_digest_hex(item_digest) is not None: - refs.add(item_digest) - for item in manifest.get("manifests") or []: - item_digest = (item or {}).get("digest") - if isinstance(item_digest, str) and registry_digest_hex(item_digest) is not None: - refs.add(item_digest) - return refs - -def registry_digest_closure(seed): - seen = set() - stack = list(seed) - while stack: - digest = stack.pop() - if digest in seen or registry_digest_hex(digest) is None: - continue - seen.add(digest) - for child in registry_manifest_refs(digest): - if child not in seen: - stack.append(child) - return seen - -def registry_blob_size(digest): - path = registry_blob_data_path(digest) - if path is None or not os.path.isfile(path): - return 0 - try: - return int(os.lstat(path).st_blocks) * 512 - except OSError: - return 0 - -def estimate_registry_reclaim(delete_manifest_digests, kept_manifest_digests): - deleted = registry_digest_closure(delete_manifest_digests) - kept = registry_digest_closure(kept_manifest_digests) - reclaim = deleted - kept - return sum(registry_blob_size(digest) for digest in reclaim) - -def plan_registry_retention(): - keep_per_repo = int(OPTIONS.get("registryKeepPerRepo") if OPTIONS.get("registryKeepPerRepo") is not None else 5) - min_age_hours = float(OPTIONS.get("registryMinAgeHours") if OPTIONS.get("registryMinAgeHours") is not None else 48) - cutoff = time.time() - min_age_hours * 3600 - refs, digests, refs_command = workload_image_refs() - rows = registry_tag_rows() - revision_rows = registry_revision_rows() - by_repo = {} - for row in rows: - by_repo.setdefault(row["repo"], []).append(row) - keep = set() - keep_reasons = {} - for repo, items in by_repo.items(): - items.sort(key=lambda item: item["mtime"], reverse=True) - for row in items[:keep_per_repo]: - key = (row["repo"], row["tag"]) - keep.add(key) - keep_reasons[key] = "latest-per-repo" - for row in items: - key = (row["repo"], row["tag"]) - if row["tag"] in REGISTRY_PROTECTED_TAGS: - keep.add(key) - keep_reasons[key] = "protected-tag" - if key in refs: - keep.add(key) - keep_reasons[key] = "workload-tag-ref" - if row["digest"] in digests: - keep.add(key) - keep_reasons[key] = "workload-digest-ref" - if row["repo"].startswith("hwlab/cache/"): - keep.add(key) - keep_reasons[key] = "cache-repo" - if row["mtime"] >= cutoff: - keep.add(key) - keep_reasons[key] = "recent-tag" - delete_rows = [] - kept_count = 0 - delete_by_repo = {} - keep_by_repo = {} - kept_digests = set() - for row in rows: - key = (row["repo"], row["tag"]) - should_delete = ( - key not in keep - and row["repo"].startswith("hwlab/hwlab-") - and re.match(r"^[0-9a-f]{7,40}$", row["tag"]) is not None - ) - if should_delete: - delete_rows.append(row) - delete_by_repo[row["repo"]] = delete_by_repo.get(row["repo"], 0) + 1 - else: - kept_count += 1 - kept_digests.add(row["digest"]) - keep_by_repo[row["repo"]] = keep_by_repo.get(row["repo"], 0) + 1 - protected_digests = kept_digests | digests - protected_digests.update(row["digest"] for row in revision_rows if not registry_retention_repo(row["repo"])) - protected_digests = registry_digest_closure(protected_digests) - delete_revision_rows = [] - revision_delete_by_repo = {} - for row in revision_rows: - if not registry_retention_repo(row["repo"]): - continue - if row["digest"] in protected_digests: - continue - delete_revision_rows.append(row) - revision_delete_by_repo[row["repo"]] = revision_delete_by_repo.get(row["repo"], 0) + 1 - kept_revision_digests = set(row["digest"] for row in revision_rows if row not in delete_revision_rows) - delete_revision_digests = set(row["digest"] for row in delete_revision_rows) - deletable_manifests = {} - for row in delete_rows: - if row["digest"] in kept_digests: - continue - deletable_manifests.setdefault(row["repo"], set()).add(row["digest"]) - for row in delete_revision_rows: - deletable_manifests.setdefault(row["repo"], set()).add(row["digest"]) - deletable_manifest_count = sum(len(items) for items in deletable_manifests.values()) - registry_size = du_size(REGISTRY_ROOT, 30) or 0 - estimate = estimate_registry_reclaim(delete_revision_digests, kept_revision_digests) - return { - "tagRows": rows, - "revisionRows": revision_rows, - "deleteRows": delete_rows, - "deleteRevisionRows": delete_revision_rows, - "summary": { - "totalTags": len(rows), - "totalRevisions": len(revision_rows), - "repoCount": len(by_repo), - "keepPerRepo": keep_per_repo, - "minAgeHours": min_age_hours, - "protectedWorkloadRefs": len(refs), - "protectedDigestRefs": len(digests), - "protectedDigestClosure": len(protected_digests), - "keptTags": kept_count, - "deleteTags": len(delete_rows), - "deleteManifests": deletable_manifest_count, - "deleteRevisions": len(delete_revision_rows), - "deleteByRepo": delete_by_repo, - "revisionDeleteByRepo": revision_delete_by_repo, - "keepByRepo": keep_by_repo, - "registrySizeBytes": registry_size, - "estimatedReclaimBytes": estimate, - }, - "deleteManifestsByRepo": {repo: sorted(list(digests)) for repo, digests in deletable_manifests.items()}, - "refsCommand": refs_command, - } - -def registry_deployment_preflight(): - dep = kubectl_json(["-n", "hwlab-ci", "get", "deploy", "hwlab-registry"], 20) - if not dep: - return {"ok": False, "reason": "registry-deployment-missing"} - spec = ((dep.get("spec") or {}).get("template") or {}).get("spec") or {} - containers = spec.get("containers") or [] - volumes = spec.get("volumes") or [] - registry_container = next((item for item in containers if item.get("name") == "registry"), containers[0] if containers else {}) - mounts = registry_container.get("volumeMounts") or [] - has_host_path = any(((vol.get("hostPath") or {}).get("path") == REGISTRY_ROOT and vol.get("name") == "storage") for vol in volumes) - has_mount = any((mount.get("name") == "storage" and mount.get("mountPath") == "/var/lib/registry") for mount in mounts) - image = str(registry_container.get("image") or "") - ok = bool(has_host_path and has_mount and image.startswith("registry:") and spec.get("hostNetwork") is True) - return { - "ok": ok, - "reason": "ok" if ok else "unexpected-registry-deployment-shape", - "image": image, - "hostNetwork": spec.get("hostNetwork"), - "hasExpectedHostPath": has_host_path, - "hasExpectedMount": has_mount, - "replicas": (dep.get("spec") or {}).get("replicas"), - "readyReplicas": (dep.get("status") or {}).get("readyReplicas"), - } - -def cronjob_suspend_states(names): - states = {} - for name in names: - data = kubectl_json(["-n", "hwlab-ci", "get", "cronjob", name], 15) - if data: - states[name] = bool(((data.get("spec") or {}).get("suspend")) is True) - return states - -def patch_cronjob_suspend(name, suspend): - payload = json.dumps({"spec": {"suspend": bool(suspend)}}) - return kctl(["-n", "hwlab-ci", "patch", "cronjob", name, "--type=merge", "-p", payload], 30) - -def wait_registry_pod_count(target, timeout=90): - deadline = time.time() + timeout - last = None - while time.time() < deadline: - result = kctl(["-n", "hwlab-ci", "get", "pods", "-l", "app.kubernetes.io/name=hwlab-registry", "--no-headers"], 20) - last = bounded(result) - lines = [line for line in (result.get("stdout") or "").splitlines() if line.strip()] - active = [] - for line in lines: - parts = line.split() - status = parts[2] if len(parts) >= 3 else "" - if status in set(["Completed", "Error", "Failed", "Succeeded"]): - continue - active.append(line) - if len(active) == target: - return {"ok": True, "lines": active, "allLines": lines, "last": last} - time.sleep(2) - return {"ok": False, "lines": [], "last": last} - -def wait_pod_terminal(name, timeout=900): - deadline = time.time() + timeout - last = None - while time.time() < deadline: - data = kubectl_json(["-n", "hwlab-ci", "get", "pod", name], 20) - if data: - phase = ((data.get("status") or {}).get("phase")) or "" - last = {"phase": phase} - if phase == "Succeeded": - return {"ok": True, "phase": phase} - if phase == "Failed": - return {"ok": False, "phase": phase} - time.sleep(3) - return {"ok": False, "phase": "Timeout", "last": last} - -def execute_registry_retention(): - if PROVIDER_ID.upper() != "G14": - raise RuntimeError("HWLAB registry retention is only supported on G14") - deployment = registry_deployment_preflight() - if not deployment.get("ok"): - raise RuntimeError("registry deployment preflight failed: %s" % deployment.get("reason")) - plan = plan_registry_retention() - delete_rows = plan.get("deleteRows") or [] - delete_revision_rows = plan.get("deleteRevisionRows") or [] - delete_manifests = plan.get("deleteManifestsByRepo") or {} - if not delete_rows and not delete_revision_rows: - return {"reclaimedBytes": 0, "commandOutput": {"message": "no registry tags or revisions matched conservative retention", "registryPlan": plan.get("summary")}} - if not delete_manifests: - return {"reclaimedBytes": 0, "commandOutput": {"message": "matched manifests are still referenced by retained manifests; registry GC would not reclaim blobs", "registryPlan": plan.get("summary")}} - cronjobs = ["hwlab-g14-branch-poller", "hwlab-v02-branch-poller"] - original_crons = cronjob_suspend_states(cronjobs) - before = du_size(REGISTRY_ROOT, 60) or 0 - gc_name = "hwlab-registry-gc-%s" % int(time.time()) - steps = [] - try: - for name in original_crons: - result = patch_cronjob_suspend(name, True) - steps.append({"step": "suspend-cronjob", "name": name, "result": bounded(result)}) - if result["exitCode"] != 0: - raise RuntimeError("failed to suspend cronjob %s" % name) - idle_after_suspend = wait_no_active_hwlab_ci(180) - steps.append({"step": "idle-after-suspend", "result": idle_after_suspend}) - if not idle_after_suspend.get("ok"): - raise RuntimeError("refusing registry maintenance because hwlab-ci did not become idle after suspend") - - deleted_manifests = [] - for repo, digests in delete_manifests.items(): - encoded_repo = "/".join(urllib.parse.quote(part, safe="") for part in repo.split("/")) - for digest in digests: - try: - result = registry_request("DELETE", "/v2/%s/manifests/%s" % (encoded_repo, urllib.parse.quote(digest, safe=":")), {"Accept": "application/vnd.docker.distribution.manifest.v2+json, application/vnd.oci.image.manifest.v1+json"}) - deleted_manifests.append({"repo": repo, "digest": digest, "status": result.get("status")}) - except urllib.error.HTTPError as exc: - if exc.code == 404: - deleted_manifests.append({"repo": repo, "digest": digest, "status": 404}) - else: - raise - steps.append({"step": "registry-api-delete-manifests", "count": len(deleted_manifests), "preview": deleted_manifests[:20]}) - - scale_down = kctl(["-n", "hwlab-ci", "scale", "deploy", "hwlab-registry", "--replicas=0"], 60) - steps.append({"step": "scale-registry-down", "result": bounded(scale_down)}) - if scale_down["exitCode"] != 0: - raise RuntimeError("failed to scale registry down") - waited_down = wait_registry_pod_count(0, 120) - steps.append({"step": "wait-registry-down", "result": waited_down}) - if not waited_down.get("ok"): - raise RuntimeError("registry pod did not scale down") - - deleted = [] - for row in delete_rows: - path = os.path.abspath(str(row.get("path") or "")) - if not path.startswith(REGISTRY_REPOSITORY_ROOT + "/") or "/_manifests/tags/" not in path: - raise RuntimeError("refusing unexpected registry tag path: %s" % path) - if not re.match(r"^[0-9a-f]{7,40}$", str(row.get("tag") or "")): - raise RuntimeError("refusing unexpected registry tag name: %s" % row.get("tag")) - if os.path.isdir(path) and not os.path.islink(path): - shutil.rmtree(path) - deleted.append({"repo": row.get("repo"), "tag": row.get("tag"), "digest": row.get("digest")}) - steps.append({"step": "delete-tag-directories", "count": len(deleted)}) - - deleted_revisions = [] - for row in delete_revision_rows: - path = os.path.abspath(str(row.get("path") or "")) - digest_hex = registry_digest_hex(str(row.get("digest") or "")) - if digest_hex is None: - raise RuntimeError("refusing unexpected registry revision digest: %s" % row.get("digest")) - if not path.startswith(REGISTRY_REPOSITORY_ROOT + "/") or "/_manifests/revisions/sha256/" not in path: - raise RuntimeError("refusing unexpected registry revision path: %s" % path) - if os.path.basename(path) != digest_hex: - raise RuntimeError("refusing registry revision path/digest mismatch: %s" % path) - if os.path.isdir(path) and not os.path.islink(path): - shutil.rmtree(path) - deleted_revisions.append({"repo": row.get("repo"), "digest": row.get("digest")}) - steps.append({"step": "delete-revision-directories", "count": len(deleted_revisions)}) - - overrides = { - "apiVersion": "v1", - "spec": { - "restartPolicy": "Never", - "containers": [{ - "name": "registry-gc", - "image": "registry:2.8.3", - "command": ["registry", "garbage-collect", "/etc/docker/registry/config.yml"], - "volumeMounts": [{"name": "storage", "mountPath": "/var/lib/registry"}], - }], - "volumes": [{"name": "storage", "hostPath": {"path": REGISTRY_ROOT, "type": "DirectoryOrCreate"}}], - }, - } - run_gc = kctl(["-n", "hwlab-ci", "run", gc_name, "--restart=Never", "--image=registry:2.8.3", "--overrides=%s" % json.dumps(overrides)], 60) - steps.append({"step": "start-registry-gc-pod", "result": bounded(run_gc), "pod": gc_name}) - if run_gc["exitCode"] != 0: - raise RuntimeError("failed to start registry GC pod") - waited_gc = wait_pod_terminal(gc_name, 900) - steps.append({"step": "wait-registry-gc", "result": waited_gc}) - logs = kctl(["-n", "hwlab-ci", "logs", gc_name], 120) - steps.append({"step": "registry-gc-logs", "result": bounded(logs)}) - if not waited_gc.get("ok"): - raise RuntimeError("registry GC pod did not complete successfully") - finally: - cleanup_gc = kctl(["-n", "hwlab-ci", "delete", "pod", gc_name, "--ignore-not-found=true"], 60) - steps.append({"step": "delete-registry-gc-pod", "result": bounded(cleanup_gc)}) - scale_up = kctl(["-n", "hwlab-ci", "scale", "deploy", "hwlab-registry", "--replicas=%s" % int(deployment.get("replicas") or 1)], 60) - steps.append({"step": "scale-registry-up", "result": bounded(scale_up)}) - rollout = kctl(["-n", "hwlab-ci", "rollout", "status", "deploy/hwlab-registry", "--timeout=180s"], 200) - steps.append({"step": "wait-registry-rollout", "result": bounded(rollout)}) - for name, was_suspended in original_crons.items(): - restore = patch_cronjob_suspend(name, was_suspended) - steps.append({"step": "restore-cronjob", "name": name, "suspend": was_suspended, "result": bounded(restore)}) - after = du_size(REGISTRY_ROOT, 60) or 0 - return { - "reclaimedBytes": max(0, before - after), - "commandOutput": { - "registryPlan": plan.get("summary"), - "deletedTagCount": len(delete_rows), - "deletedRevisionCount": len(delete_revision_rows), - "deletedManifestCount": sum(len(items) for items in delete_manifests.values()), - "diskBeforeBytes": before, - "diskAfterBytes": after, - "steps": steps[-12:], - }, - } - -def execute_registry_garbage_collect_only(): - if PROVIDER_ID.upper() != "G14": - raise RuntimeError("HWLAB registry garbage-collect is only supported on G14") - deployment = registry_deployment_preflight() - if not deployment.get("ok"): - raise RuntimeError("registry deployment preflight failed: %s" % deployment.get("reason")) - cronjobs = ["hwlab-g14-branch-poller", "hwlab-v02-branch-poller"] - original_crons = cronjob_suspend_states(cronjobs) - before = du_size(REGISTRY_ROOT, 60) or 0 - gc_name = "hwlab-registry-gc-%s" % int(time.time()) - steps = [] - try: - for name in original_crons: - result = patch_cronjob_suspend(name, True) - steps.append({"step": "suspend-cronjob", "name": name, "result": bounded(result)}) - if result["exitCode"] != 0: - raise RuntimeError("failed to suspend cronjob %s" % name) - idle_after_suspend = wait_no_active_hwlab_ci(180) - steps.append({"step": "idle-after-suspend", "result": idle_after_suspend}) - if not idle_after_suspend.get("ok"): - raise RuntimeError("refusing registry maintenance because hwlab-ci did not become idle after suspend") - - scale_down = kctl(["-n", "hwlab-ci", "scale", "deploy", "hwlab-registry", "--replicas=0"], 60) - steps.append({"step": "scale-registry-down", "result": bounded(scale_down)}) - if scale_down["exitCode"] != 0: - raise RuntimeError("failed to scale registry down") - waited_down = wait_registry_pod_count(0, 120) - steps.append({"step": "wait-registry-down", "result": waited_down}) - if not waited_down.get("ok"): - raise RuntimeError("registry pod did not scale down") - - overrides = { - "apiVersion": "v1", - "spec": { - "restartPolicy": "Never", - "containers": [{ - "name": "registry-gc", - "image": "registry:2.8.3", - "command": ["registry", "garbage-collect", "/etc/docker/registry/config.yml"], - "volumeMounts": [{"name": "storage", "mountPath": "/var/lib/registry"}], - }], - "volumes": [{"name": "storage", "hostPath": {"path": REGISTRY_ROOT, "type": "DirectoryOrCreate"}}], - }, - } - run_gc = kctl(["-n", "hwlab-ci", "run", gc_name, "--restart=Never", "--image=registry:2.8.3", "--overrides=%s" % json.dumps(overrides)], 60) - steps.append({"step": "start-registry-gc-pod", "result": bounded(run_gc), "pod": gc_name}) - if run_gc["exitCode"] != 0: - raise RuntimeError("failed to start registry GC pod") - waited_gc = wait_pod_terminal(gc_name, 900) - steps.append({"step": "wait-registry-gc", "result": waited_gc}) - logs = kctl(["-n", "hwlab-ci", "logs", gc_name], 120) - steps.append({"step": "registry-gc-logs", "result": bounded(logs)}) - if not waited_gc.get("ok"): - raise RuntimeError("registry GC pod did not complete successfully") - finally: - cleanup_gc = kctl(["-n", "hwlab-ci", "delete", "pod", gc_name, "--ignore-not-found=true"], 60) - steps.append({"step": "delete-registry-gc-pod", "result": bounded(cleanup_gc)}) - scale_up = kctl(["-n", "hwlab-ci", "scale", "deploy", "hwlab-registry", "--replicas=%s" % int(deployment.get("replicas") or 1)], 60) - steps.append({"step": "scale-registry-up", "result": bounded(scale_up)}) - rollout = kctl(["-n", "hwlab-ci", "rollout", "status", "deploy/hwlab-registry", "--timeout=180s"], 200) - steps.append({"step": "wait-registry-rollout", "result": bounded(rollout)}) - for name, was_suspended in original_crons.items(): - restore = patch_cronjob_suspend(name, was_suspended) - steps.append({"step": "restore-cronjob", "name": name, "suspend": was_suspended, "result": bounded(restore)}) - after = du_size(REGISTRY_ROOT, 60) or 0 - return { - "reclaimedBytes": max(0, before - after), - "commandOutput": { - "message": "official registry garbage-collect only; no additional tag deletion", - "diskBeforeBytes": before, - "diskAfterBytes": after, - "steps": steps[-12:], - }, - } - -def start_registry_retention_job(mode): - job_id = "g14-registry-%s-%s" % (int(time.time()), os.getpid()) - paths = job_paths(job_id) - started_at = now_iso() - initial = { - "ok": True, - "action": "gc remote status", - "providerId": PROVIDER_ID, - "jobId": job_id, - "status": "running", - "kind": "hwlab-registry-retention-gc" if mode == "retention" else "hwlab-registry-garbage-collect", - "mode": mode, - "startedAt": started_at, - "statePath": paths["state"], - "logPath": paths["log"], - "options": OPTIONS, - } - write_json_atomic(paths["state"], initial) - pid = os.fork() - if pid != 0: - return { - "status": "started", - "reclaimedBytes": None, - "commandOutput": { - "jobId": job_id, - "pid": pid, - "statePath": paths["state"], - "logPath": paths["log"], - "statusCommand": "bun scripts/cli.ts gc remote %s status --job-id %s" % (PROVIDER_ID, job_id), - "message": "registry retention GC is running as a detached remote job", - }, - } - - try: - os.setsid() - except Exception: - pass - try: - devnull = os.open(os.devnull, os.O_RDONLY) - os.dup2(devnull, 0) - os.close(devnull) - except Exception: - pass - try: - log_handle = open(paths["log"], "a", encoding="utf-8", buffering=1) - os.dup2(log_handle.fileno(), 1) - os.dup2(log_handle.fileno(), 2) - except Exception: - log_handle = None - try: - print("[%s] starting HWLAB registry %s job %s" % (now_iso(), mode, job_id), flush=True) - result = execute_registry_retention() if mode == "retention" else execute_registry_garbage_collect_only() - payload = dict(initial) - payload.update({ - "status": "succeeded", - "finishedAt": now_iso(), - "result": result, - "diskAfter": df_snapshot(), - "clusterAfter": cluster_preflight(), - }) - write_json_atomic(paths["state"], payload) - print("[%s] completed HWLAB registry %s job %s" % (now_iso(), mode, job_id), flush=True) - os._exit(0) - except Exception as exc: - payload = dict(initial) - payload.update({ - "ok": False, - "status": "failed", - "finishedAt": now_iso(), - "error": str(exc), - "diskAfter": df_snapshot(), - "clusterAfter": cluster_preflight(), - }) - try: - write_json_atomic(paths["state"], payload) - except Exception: - pass - print("[%s] failed HWLAB registry %s job %s: %s" % (now_iso(), mode, job_id, exc), flush=True) - os._exit(1) - finally: - try: - if log_handle: - log_handle.close() - except Exception: - pass - +# __UNIDESK_GC_REMOTE_REGISTRY_HELPERS__ def collect_protected(): protected_paths = [ ("hwlab-k3s-runtime", "/var/lib/rancher/k3s", "Native k3s runtime, containerd state, local-path storage and control-plane data are protected."), @@ -2233,6 +1027,54 @@ def collect_candidates(observed_at): "action": {"op": "rm-recursive", "allowlist": "remote-tool-cache"}, }) + if OPTIONS.get("webObserveArtifacts", False): + observe = collect_web_observe_summary() + for record in observe.get("staleSignals") or []: + path = record.get("path") or "" + if not path or not is_direct_observe_run_path(path): + continue + try: + if path_has_open_fd(path): + continue + size = du_size(path, 15) or path_size(path) + except Exception: + continue + if size <= 0: + continue + candidates.append({ + "id": "web-observe-run:%s" % record.get("id"), + "kind": "web-observe-run-delete", + "risk": "medium", + "description": "Delete stale web-observe artifact run under YAML-configured observeStateRoots", + "path": path, + "sizeBytes": size, + "estimatedReclaimBytes": size, + "stale": { + "id": record.get("id"), + "ageHours": record.get("ageHours"), + "timestampBasis": record.get("timestampBasis"), + "pid": record.get("pid"), + "pidAlive": record.get("pidAlive"), + "status": record.get("status"), + }, + "action": {"op": "rm-recursive", "allowlist": "web-observe-stale-run"}, + }) + + if OPTIONS.get("k3sImageCache", False): + candidate = k3s_image_cache_candidate() + if candidate: + candidates.append(candidate) + + if OPTIONS.get("hostContainerdCache", False): + candidate = host_containerd_cache_candidate() + if candidate: + candidates.append(candidate) + + if OPTIONS.get("localPathOrphans", False): + candidate = local_path_orphan_candidate() + if candidate: + candidates.append(candidate) + if OPTIONS.get("coreDumps", True): cutoff = time.time() - float(OPTIONS.get("coreDumpMinAgeHours") or 1) * 3600 for root in sorted(CORE_DUMP_DIR_ALLOWLIST): @@ -2520,6 +1362,20 @@ def execute(candidate): elif os.path.exists(path): os.unlink(path) return {"reclaimedBytes": before} + if kind == "web-observe-run-delete": + path = candidate.get("path") or "" + record = assert_web_observe_candidate(path) + before = du_size(path, 15) or path_size(path) + shutil.rmtree(path, ignore_errors=True) + return {"reclaimedBytes": before, "stale": record} + if kind == "k3s-cri-image-prune": + return execute_k3s_image_cache_prune() + if kind == "host-containerd-cache-prune": + return execute_host_containerd_cache_prune() + if kind == "host-containerd-orphan-state-delete": + return execute_host_containerd_orphan_cleanup() + if kind == "k3s-local-path-orphans-delete": + return execute_local_path_orphan_cleanup() if kind == "tmp-path-delete": path = candidate.get("path") or "" assert_tmp_candidate(path) @@ -2550,12 +1406,34 @@ def visible_items(items): return items[:int(OPTIONS.get("limit") or 50)] def returned_results(results): + def compact_result(item): + if bool(OPTIONS.get("full")): + return item + return { + key: item.get(key) + for key in [ + "id", + "kind", + "risk", + "path", + "status", + "estimatedReclaimBytes", + "reclaimedBytes", + "orphanCount", + "selectedOrphanCount", + "overlayCandidateCount", + "contentCandidateCount", + "deletedOrphanCount", + "error", + ] + if item.get(key) is not None + } if bool(OPTIONS.get("full")): return results failed = [item for item in results if item.get("status") == "failed"] started = [item for item in results if item.get("status") == "started"] succeeded = [item for item in results if item.get("status") == "succeeded"] - return (failed + started + succeeded)[:int(OPTIONS.get("resultLimit") or 50)] + return [compact_result(item) for item in (failed + started + succeeded)[:int(OPTIONS.get("resultLimit") or 50)]] def plan_payload(observed_at, preflight, protected, candidates, visible): disk = df_snapshot() @@ -2932,7 +1810,7 @@ def main(): "clusterAfter": cluster_preflight(), "summary": run_summary, "results": returned, - "protected": protected, + "protected": protected if bool(OPTIONS.get("full")) else protected[:3], } emit_json(payload, persist_large=True) return 0 diff --git a/scripts/src/gc-remote-web-observe.py b/scripts/src/gc-remote-web-observe.py new file mode 100644 index 00000000..a2cd1404 --- /dev/null +++ b/scripts/src/gc-remote-web-observe.py @@ -0,0 +1,57 @@ +def configured_observe_roots(): + roots = config_list(MEMORY_CONFIG, "observeStateRoots", config_list(MEMORY_CONFIG, "webObserveRoots", [])) + return [os.path.abspath(item) for item in roots if isinstance(item, str) and item.startswith("/")] + +def is_direct_observe_run_path(path): + resolved = os.path.abspath(path) + for root in configured_observe_roots(): + if os.path.dirname(resolved) == root and resolved.startswith(root.rstrip("/") + "/"): + return True + return False + +def path_has_open_fd(path): + resolved = os.path.realpath(path) + prefix = resolved.rstrip("/") + "/" + proc_root = "/proc" + try: + pids = [name for name in os.listdir(proc_root) if name.isdigit()] + except OSError: + return True + for pid in pids: + base = os.path.join(proc_root, pid) + for name in ["cwd", "root"]: + try: + target = os.path.realpath(os.readlink(os.path.join(base, name))) + except OSError: + continue + if target == resolved or target.startswith(prefix): + return True + fd_dir = os.path.join(base, "fd") + try: + fds = os.listdir(fd_dir) + except OSError: + continue + for fd in fds: + try: + target = os.path.realpath(os.readlink(os.path.join(fd_dir, fd))) + except OSError: + continue + if target == resolved or target.startswith(prefix): + return True + return False + +def assert_web_observe_candidate(path): + resolved = os.path.abspath(path) + if not is_direct_observe_run_path(resolved): + raise RuntimeError("refusing to remove web-observe path outside configured direct run roots: %s" % path) + if os.path.islink(resolved) or not os.path.isdir(resolved): + raise RuntimeError("refusing to remove non-directory or symlink web-observe path: %s" % path) + stale_hours = config_float(MEMORY_CONFIG, "staleRunMaxAgeHours", 6.0, minimum=0.0) + record = observe_run_record(resolved, stale_hours) + if record.get("pidAlive"): + raise RuntimeError("refusing to remove active web-observe run with live pid: %s" % path) + if not record.get("staleSignal"): + raise RuntimeError("refusing to remove web-observe run without stale signal: %s" % path) + if path_has_open_fd(resolved): + raise RuntimeError("refusing to remove web-observe run with open fd/cwd reference: %s" % path) + return record diff --git a/scripts/src/gc-remote.ts b/scripts/src/gc-remote.ts index e0b55957..a39ca35a 100644 --- a/scripts/src/gc-remote.ts +++ b/scripts/src/gc-remote.ts @@ -18,6 +18,10 @@ interface RemoteGcOptions { tmp: boolean; tmpMinAgeHours: number; toolCaches: boolean; + webObserveArtifacts: boolean; + k3sImageCache: boolean; + hostContainerdCache: boolean; + localPathOrphans: boolean; aptCache: boolean; coreDumps: boolean; coreDumpMinAgeHours: number; @@ -45,6 +49,10 @@ const DEFAULT_REMOTE_OPTIONS: RemoteGcOptions = { tmp: true, tmpMinAgeHours: 24, toolCaches: false, + webObserveArtifacts: false, + k3sImageCache: false, + hostContainerdCache: false, + localPathOrphans: false, aptCache: true, coreDumps: true, coreDumpMinAgeHours: 1, @@ -63,6 +71,16 @@ const GC_CONFIG_RELATIVE_PATH = "config/unidesk-cli.yaml"; const GC_REMOTE_CONFIG_REF = `${GC_CONFIG_RELATIVE_PATH}#gc.remote.targets`; const GC_REMOTE_RUNNER_RELATIVE_PATH = "scripts/src/gc-remote-runner.py"; const GC_REMOTE_RUNNER_CONFIG_PLACEHOLDER = "__UNIDESK_GC_REMOTE_CONFIG_BASE64__"; +const GC_REMOTE_WEB_OBSERVE_RELATIVE_PATH = "scripts/src/gc-remote-web-observe.py"; +const GC_REMOTE_WEB_OBSERVE_PLACEHOLDER = "# __UNIDESK_GC_REMOTE_WEB_OBSERVE_HELPERS__"; +const GC_REMOTE_CONTAINERD_RELATIVE_PATH = "scripts/src/gc-remote-containerd.py"; +const GC_REMOTE_CONTAINERD_PLACEHOLDER = "# __UNIDESK_GC_REMOTE_CONTAINERD_HELPERS__"; +const GC_REMOTE_PVC_RELATIVE_PATH = "scripts/src/gc-remote-pvc.py"; +const GC_REMOTE_PVC_PLACEHOLDER = "# __UNIDESK_GC_REMOTE_PVC_HELPERS__"; +const GC_REMOTE_GROWTH_RELATIVE_PATH = "scripts/src/gc-remote-growth.py"; +const GC_REMOTE_GROWTH_PLACEHOLDER = "# __UNIDESK_GC_REMOTE_GROWTH_HELPERS__"; +const GC_REMOTE_REGISTRY_RELATIVE_PATH = "scripts/src/gc-remote-registry.py"; +const GC_REMOTE_REGISTRY_PLACEHOLDER = "# __UNIDESK_GC_REMOTE_REGISTRY_HELPERS__"; export async function runRemoteGcCommand(config: UniDeskConfig, providerId: string | undefined, action: string | undefined, args: string[]): Promise { if (providerId === undefined || providerId.length === 0) { @@ -186,6 +204,22 @@ function parseRemoteGcOptions(args: string[]): RemoteGcOptions { options.toolCaches = true; } else if (arg === "--no-tool-caches") { options.toolCaches = false; + } else if (arg === "--include-web-observe-artifacts") { + options.webObserveArtifacts = true; + } else if (arg === "--no-web-observe-artifacts") { + options.webObserveArtifacts = false; + } else if (arg === "--include-k3s-image-cache") { + options.k3sImageCache = true; + } else if (arg === "--no-k3s-image-cache") { + options.k3sImageCache = false; + } else if (arg === "--include-host-containerd-cache") { + options.hostContainerdCache = true; + } else if (arg === "--no-host-containerd-cache") { + options.hostContainerdCache = false; + } else if (arg === "--include-local-path-orphans") { + options.localPathOrphans = true; + } else if (arg === "--no-local-path-orphans") { + options.localPathOrphans = false; } else if (arg === "--no-apt-cache") { options.aptCache = false; } else if (arg === "--no-core-dumps") { @@ -295,5 +329,31 @@ function remoteGcPython(configBase64: string): string { if (!template.includes(GC_REMOTE_RUNNER_CONFIG_PLACEHOLDER)) { throw new Error(`${GC_REMOTE_RUNNER_RELATIVE_PATH} missing ${GC_REMOTE_RUNNER_CONFIG_PLACEHOLDER}`); } - return template.replace(GC_REMOTE_RUNNER_CONFIG_PLACEHOLDER, configBase64); + if (!template.includes(GC_REMOTE_WEB_OBSERVE_PLACEHOLDER)) { + throw new Error(`${GC_REMOTE_RUNNER_RELATIVE_PATH} missing ${GC_REMOTE_WEB_OBSERVE_PLACEHOLDER}`); + } + if (!template.includes(GC_REMOTE_CONTAINERD_PLACEHOLDER)) { + throw new Error(`${GC_REMOTE_RUNNER_RELATIVE_PATH} missing ${GC_REMOTE_CONTAINERD_PLACEHOLDER}`); + } + if (!template.includes(GC_REMOTE_PVC_PLACEHOLDER)) { + throw new Error(`${GC_REMOTE_RUNNER_RELATIVE_PATH} missing ${GC_REMOTE_PVC_PLACEHOLDER}`); + } + if (!template.includes(GC_REMOTE_GROWTH_PLACEHOLDER)) { + throw new Error(`${GC_REMOTE_RUNNER_RELATIVE_PATH} missing ${GC_REMOTE_GROWTH_PLACEHOLDER}`); + } + if (!template.includes(GC_REMOTE_REGISTRY_PLACEHOLDER)) { + throw new Error(`${GC_REMOTE_RUNNER_RELATIVE_PATH} missing ${GC_REMOTE_REGISTRY_PLACEHOLDER}`); + } + const webObserveHelpers = readFileSync(rootPath(GC_REMOTE_WEB_OBSERVE_RELATIVE_PATH), "utf8"); + const containerdHelpers = readFileSync(rootPath(GC_REMOTE_CONTAINERD_RELATIVE_PATH), "utf8"); + const pvcHelpers = readFileSync(rootPath(GC_REMOTE_PVC_RELATIVE_PATH), "utf8"); + const growthHelpers = readFileSync(rootPath(GC_REMOTE_GROWTH_RELATIVE_PATH), "utf8"); + const registryHelpers = readFileSync(rootPath(GC_REMOTE_REGISTRY_RELATIVE_PATH), "utf8"); + return template + .replace(GC_REMOTE_WEB_OBSERVE_PLACEHOLDER, webObserveHelpers.trimEnd()) + .replace(GC_REMOTE_CONTAINERD_PLACEHOLDER, containerdHelpers.trimEnd()) + .replace(GC_REMOTE_PVC_PLACEHOLDER, pvcHelpers.trimEnd()) + .replace(GC_REMOTE_GROWTH_PLACEHOLDER, growthHelpers.trimEnd()) + .replace(GC_REMOTE_REGISTRY_PLACEHOLDER, registryHelpers.trimEnd()) + .replace(GC_REMOTE_RUNNER_CONFIG_PLACEHOLDER, configBase64); }