diff --git a/config/unidesk-cli.yaml b/config/unidesk-cli.yaml index 206c0869..ca02ca90 100644 --- a/config/unidesk-cli.yaml +++ b/config/unidesk-cli.yaml @@ -75,6 +75,15 @@ gc: codex-queue-stats-verify: .state/codex-queue-stats-verify codex-queue-perf: .state/codex-queue-perf tmp: .state/tmp + legacyDockerImages: + enabled: true + minAgeHours: 12 + keepPerRepository: 2 + repositories: + - 127.0.0.1:5000/hwlab/web-probe-sentinel-${nodeLower} + legacyDockerRegistryVolumes: + enabled: true + requireK8sRegistryReady: true codexSessions: enabled: false keepHours: 72 diff --git a/scripts/src/hwlab-node-help.ts b/scripts/src/hwlab-node-help.ts index 6e0e623a..afd389cc 100644 --- a/scripts/src/hwlab-node-help.ts +++ b/scripts/src/hwlab-node-help.ts @@ -12,6 +12,10 @@ export function hwlabNodeHelp(): Record { examples: [ "bun scripts/cli.ts hwlab nodes control-plane infra plan --node D601 --lane v03", "bun scripts/cli.ts hwlab nodes control-plane status --node D601 --lane v03", + "bun scripts/cli.ts hwlab nodes control-plane cleanup-runs --node JD01 --lane v03 --min-age-minutes 30 --limit 200 --dry-run", + "bun scripts/cli.ts hwlab nodes control-plane cleanup-released-pvs --node JD01 --lane v03 --limit 200 --dry-run", + "bun scripts/cli.ts hwlab nodes control-plane cleanup-legacy-docker-images --node JD01 --lane v03 --dry-run", + "bun scripts/cli.ts hwlab nodes control-plane cleanup-legacy-docker-registry-volume --node JD01 --lane v03 --dry-run", "bun scripts/cli.ts hwlab nodes git-mirror status --node G14 --lane v03", "bun scripts/cli.ts hwlab nodes hwpod-preinstall plan --node D601 --lane v03 --dry-run", "bun scripts/cli.ts hwlab nodes fake-model-provider plan --node D518 --lane v03 --provider fake-echo", @@ -21,7 +25,7 @@ export function hwlabNodeHelp(): Record { "bun scripts/cli.ts web-probe --help", ], actions: { - "control-plane": "YAML-first node-local CI/CD, git-mirror, public exposure, runtime-image, Argo and PipelineRun operations.", + "control-plane": 
"YAML-first node-local CI/CD, git-mirror, public exposure, runtime-image, Argo, PipelineRun and CI workspace retention operations.", "git-mirror": "Inspect or operate the selected node/lane source mirror.", "hwpod-preinstall": "Render YAML-first HWPOD preinstall configRefs, runtime mount targets, PM MDTODO source, and gateway profile status.", "fake-model-provider": "Materialize and operate YAML-declared fake Responses model providers for HWLAB/AgentRun sentinel checks.", @@ -32,6 +36,9 @@ export function hwlabNodeHelp(): Record { notes: [ "`web-probe` is no longer under `hwlab nodes`; use `bun scripts/cli.ts web-probe ...`.", "`--node` and `--lane` remain data arguments resolved from YAML for node/lane operations.", + "control-plane cleanup-runs deletes terminal PipelineRuns; cleanup-released-pvs deletes only orphaned hwlab-ci PipelineRun PVCs with no active pod mounts and Released local-path/Delete PVs.", + "cleanup-legacy-docker-images is a transitional legacy-cache GC for Docker images matching YAML allowlisted repositories; it protects container-referenced images, does not run prune, and does not touch Docker volumes.", + "cleanup-legacy-docker-registry-volume removes only an exited legacy Docker registry container and its unique /var/lib/registry volume after the YAML-declared k8s node-local-registry is ready.", "`trigger-current --confirm --wait` is the one-command CICD path for current node/lane runtime publish.", ], }; @@ -63,6 +70,8 @@ export function hwlabNodeWebProbeHelp(): Record { "bun scripts/cli.ts web-probe observe collect webobs-xxxx --view timeline --command-id cmd-xxxx", "bun scripts/cli.ts web-probe observe collect webobs-xxxx --view project-mdtodo-summary", "bun scripts/cli.ts web-probe observe analyze webobs-xxxx", + "bun scripts/cli.ts web-probe observe gc --node JD01 --lane v03 --dry-run", + "bun scripts/cli.ts web-probe observe gc --node JD01 --lane v03 --keep-hours 24 --confirm", "bun scripts/cli.ts web-probe sentinel plan --node D601 
--lane v03 --dry-run", "bun scripts/cli.ts web-probe sentinel plan --node D601 --lane v03 --sentinel workbench-auth-session-switch-2users", "bun scripts/cli.ts web-probe sentinel publish-current --node JD01 --lane v03 --sentinel jd01-web-probe-sentinel --confirm --wait", @@ -77,7 +86,7 @@ export function hwlabNodeWebProbeHelp(): Record { "opencode-smoke": "Run the repo-owned OpenCode iframe/direct-host composer smoke and require DOM assistant text plus EventSource update/finish/idle evidence.", script: "Run caller-provided Playwright JS after CLI-managed /auth/login; scripts must not handle secrets themselves.", screenshot: "Capture a no-auth or public page through the selected node/lane remote browser and download PNG artifacts to the caller /tmp by default.", - observe: "Start, inspect, control, stop, collect, and analyze a long-running observer that writes JSONL artifacts.", + observe: "Start, inspect, control, stop, collect, analyze, and garbage-collect raw artifacts for long-running observers.", sentinel: "Render and operate the YAML-first web-probe sentinel wrapper, one-click publish, image, GitOps, dashboard verification, maintenance and report views.", }, notes: [ @@ -85,6 +94,7 @@ export function hwlabNodeWebProbeHelp(): Record { "`web-probe script` is an ad-hoc exploration escape hatch; repeated/high-frequency workflows must become `web-probe observe command` types or repo-owned web-probe commands.", "`web-probe opencode-smoke` is the repo-owned OpenCode smoke; prefer it over repeating one-off OpenCode Playwright snippets.", "observe is passive by default; user actions must be explicit observe command entries in control.jsonl.", + "observe gc keeps manifest, heartbeat, control/error logs and analysis reports, and only removes dead-run raw samples/browser/network/screenshot artifacts after YAML-configured retention.", "After observe start, prefer observe status|command|stop|collect|analyze instead of repeating --node/--lane/--state-dir.", "collect views 
render bounded summaries from existing artifacts and do not create a second source of truth.", "analyze is offline-only: it reads artifact JSONL and writes analysis/report.md plus analysis/report.json.", diff --git a/scripts/src/hwlab-node-web-observe-runner-source.ts b/scripts/src/hwlab-node-web-observe-runner-source.ts index cf8e3555..1d1fd76a 100644 --- a/scripts/src/hwlab-node-web-observe-runner-source.ts +++ b/scripts/src/hwlab-node-web-observe-runner-source.ts @@ -24,6 +24,7 @@ const screenshotIntervalMs = positiveInteger(process.env.UNIDESK_WEB_OBSERVE_SCR const screenshotCaptureTimeoutMs = boundedInteger(process.env.UNIDESK_WEB_OBSERVE_SCREENSHOT_CAPTURE_TIMEOUT_MS, 15000, 1000, 120000); const maxSamples = positiveInteger(process.env.UNIDESK_WEB_OBSERVE_MAX_SAMPLES, 0); const observerRefreshIntervalMs = positiveInteger(process.env.UNIDESK_WEB_OBSERVE_OBSERVER_REFRESH_INTERVAL_MS, 180000); +const maxRunMs = positiveInteger(process.env.UNIDESK_WEB_OBSERVE_MAX_RUN_MS, 0); const viewport = parseViewport(process.env.UNIDESK_WEB_OBSERVE_VIEWPORT || "1440x900"); const browserProxyMode = parseBrowserProxyMode(process.env.UNIDESK_WEB_OBSERVE_BROWSER_PROXY_MODE || "auto"); const authLoginMaxAttempts = boundedInteger(process.env.UNIDESK_WEB_OBSERVE_AUTH_LOGIN_MAX_ATTEMPTS, 6, 1, 20); @@ -128,6 +129,10 @@ try { await appendJsonl(files.control, controlRecord({ id: "max-samples", type: "stop", source: "sampler" }, "completed", { reason: "max-samples", maxSamples })); break; } + if (maxRunMs > 0 && Date.now() - startedAtMs >= maxRunMs) { + await appendJsonl(files.control, controlRecord({ id: "max-run-ms", type: "stop", source: "sampler" }, "completed", { reason: "max-run-ms", maxRunMs, elapsedMs: Date.now() - startedAtMs })); + break; + } await sleep(sampleIntervalMs); } if (browserFreezeBlocker) { diff --git a/scripts/src/hwlab-node-web-sentinel-p5-observe.ts b/scripts/src/hwlab-node-web-sentinel-p5-observe.ts index 4886bbd4..3afd6da2 100644 --- 
a/scripts/src/hwlab-node-web-sentinel-p5-observe.ts +++ b/scripts/src/hwlab-node-web-sentinel-p5-observe.ts @@ -114,6 +114,7 @@ export function runSentinelQuickVerify(state: SentinelCicdState, reason: string, "--target-path", stringAt(scenario, "observeTargetPath"), "--sample-interval-ms", String(sampleIntervalMs), "--screenshot-interval-ms", String(numberAt(scenario, "screenshotIntervalMs")), + "--max-run-seconds", String(hardBudgetSeconds), "--command-timeout-seconds", "55", ]; const viewport = stringAtNullable(scenario, "viewport"); @@ -298,7 +299,12 @@ export function runSentinelQuickVerify(state: SentinelCicdState, reason: string, const controlFindings = quickVerifyControlFindings(null, promptIndex, turnSummary, traceFrame); const artifactSummaryRecord = record(artifactSummary); const artifactFindings = Array.isArray(artifactSummaryRecord.findings) ? artifactSummaryRecord.findings.map(record) : []; - const findings = mergeFindingRecords(artifactFindings, controlFindings); + const cleanupStep = stopQuickVerifyObserver(state, observerId, "observe-stop-after-terminal"); + const cleanupFindings = quickVerifyCleanupFindings(cleanupStep); + const findings = mergeFindingRecords( + mergeFindingRecords(artifactFindings, controlFindings), + cleanupFindings, + ); const blockingFindings = findings.filter(isQuickVerifyBlockingFinding); const analysisWarnings = analysis.ok ? [] : ["quick verify analyze command returned non-zero but a readable analysis artifact was produced; targetValidation is using artifact severity plus control blockers."]; const ok = record(artifactSummary).ok === true && controlFindings.length === 0 && blockingFindings.length === 0; @@ -320,7 +326,7 @@ export function runSentinelQuickVerify(state: SentinelCicdState, reason: string, failure: controlFindings.length > 0 ? "quick-verify-no-business-turn" : blockingFindings.length > 0 ? 
"quick-verify-blocking-findings" : null, promptSource: prompts.summary, accountEnv: accountEnv.summary, - steps, + steps: [...steps, cleanupStep], analysis: artifactSummary, views: { summary: { renderedText: renderQuickVerifySummary({ runId, scenarioId, observerId, artifactSummary, steps, publicOrigin: stringAt(state.publicExposure, "publicBaseUrl") }) }, @@ -331,7 +337,7 @@ export function runSentinelQuickVerify(state: SentinelCicdState, reason: string, findings, screenshot: record(artifactSummary).screenshot, publicOrigin: stringAt(state.publicExposure, "publicBaseUrl"), - warnings: mergeWarnings(analysisWarnings, nonBlockingCanaryWarnings, elapsedWarnings()), + warnings: mergeWarnings(analysisWarnings, nonBlockingCanaryWarnings, cleanupFindings.length > 0 ? ["quick verify observer stop failed; runner lifecycle cleanup is a blocking finding."] : [], elapsedWarnings()), valuesRedacted: true, }); } @@ -466,14 +472,7 @@ function finalizeQuickVerifyFailure(state: SentinelCicdState, input: { ], 60); cleanupSteps.push({ phase: "observe-cancel-after-failure", ok: cancel.ok, result: cancel.result }); } - const stop = runChildCli([ - "web-probe", "observe", "stop", input.observerId, - "--node", state.spec.nodeId, - "--lane", state.spec.lane, - "--force", - "--command-timeout-seconds", "55", - ], 30); - cleanupSteps.push({ phase: "observe-stop-after-failure", ok: stop.ok, result: stop.result }); + cleanupSteps.push(stopQuickVerifyObserver(state, input.observerId, "observe-stop-after-failure")); const analysis = runChildCli([ "web-probe", "observe", "analyze", input.observerId, "--node", state.spec.nodeId, @@ -534,6 +533,30 @@ function finalizeQuickVerifyFailure(state: SentinelCicdState, input: { }; } +function stopQuickVerifyObserver(state: SentinelCicdState, observerId: string, phase: string): Record { + const stop = runChildCli([ + "web-probe", "observe", "stop", observerId, + "--node", state.spec.nodeId, + "--lane", state.spec.lane, + "--force", + "--wait-ms", "55000", 
+ "--command-timeout-seconds", "55", + ], 60); + return { phase, ok: stop.ok, result: stop.result }; +} + +function quickVerifyCleanupFindings(cleanupStep: Record): Record[] { + if (cleanupStep.ok === true) return []; + return [{ + id: "quick-verify-observer-stop-failed", + severity: "red", + count: 1, + summary: "quick verify completed target analysis but failed to stop its observer runner; this can leak Chrome process trees on cadence runs.", + cleanupPhase: cleanupStep.phase, + valuesRedacted: true, + }]; +} + function callSentinelService(state: SentinelCicdState, method: "GET" | "POST", pathWithQuery: string, body: Record | null, timeoutSeconds: number): Record { const namespace = stringAt(state.runtime, "namespace"); const serviceName = stringAt(state.runtime, "serviceName"); diff --git a/scripts/src/hwlab-node-web-sentinel-service.ts b/scripts/src/hwlab-node-web-sentinel-service.ts index db2afa24..f64d45b1 100644 --- a/scripts/src/hwlab-node-web-sentinel-service.ts +++ b/scripts/src/hwlab-node-web-sentinel-service.ts @@ -526,6 +526,7 @@ function buildObserveCommandPlan(config: WebProbeSentinelServiceConfig, scenario "--target-path", targetPath, "--sample-interval-ms", String(numberAt(scenario, "sampleIntervalMs")), "--screenshot-interval-ms", String(numberAt(scenario, "screenshotIntervalMs")), + "--max-run-seconds", String(numberAt(scenario, "maxRunSeconds")), "--command-timeout-seconds", "55", ]; const viewport = stringOrNull(scenario.viewport); @@ -562,7 +563,17 @@ function buildObserveCommandPlan(config: WebProbeSentinelServiceConfig, scenario argv: ["bun", "scripts/cli.ts", "web-probe", "observe", "analyze", ""], stdinSource: "none", }; - return [start, ...commands, analyze]; + const stop: CommandPlanStep = { + phase: "observe-stop", + argv: ["bun", "scripts/cli.ts", "web-probe", "observe", "stop", "", "--node", config.node, "--lane", config.lane, "--force", "--wait-ms", "55000", "--command-timeout-seconds", "55"], + stdinSource: "none", + }; + const gc: 
CommandPlanStep = { + phase: "observe-gc", + argv: ["bun", "scripts/cli.ts", "web-probe", "observe", "gc", "--node", config.node, "--lane", config.lane, "--confirm", "--command-timeout-seconds", "55"], + stdinSource: "none", + }; + return [start, ...commands, analyze, stop, gc]; } function inlinePromptText(item: Record): string | null { diff --git a/scripts/src/hwlab-node/cleanup.ts b/scripts/src/hwlab-node/cleanup.ts index bec35ebd..58c40a1d 100644 --- a/scripts/src/hwlab-node/cleanup.ts +++ b/scripts/src/hwlab-node/cleanup.ts @@ -33,7 +33,7 @@ import { deleteNodeRuntimeCleanupRuns, nodeRuntimeApply, nodeRuntimeCiObjectCoun import { HWLAB_CI_NAMESPACE } from "./entry"; import { nodeRuntimeTriggerCurrentOutput } from "./git-mirror"; import { parseNodeScopedDelegatedOptions } from "./plan"; -import { compactRuntimeCommand, compactRuntimeCommandStats, nodeRuntimeUnsupportedAction, transPath } from "./runtime-common"; +import { compactRuntimeCommand, compactRuntimeCommandStats, nodeRuntimeUnsupportedAction, runNodeHostScript, transPath } from "./runtime-common"; import { optionValue, positiveIntegerOption, shellQuote, statusText } from "./utils"; export function parseJsonObject(text: string): Record { @@ -161,6 +161,9 @@ export function nodeRuntimeControlPlaneRun(scoped: ReturnType): NodeRuntimeReleasedPvCleanupOptions { + return { + limit: positiveIntegerOption(scoped.originalArgs, "--limit", 200, 500), + dryRun: scoped.dryRun || !scoped.confirm, + }; +} + +function parseNodeRuntimeLegacyDockerImageCleanupOptions(scoped: ReturnType, policy: NodeRuntimeLegacyDockerImageCleanupPolicy): NodeRuntimeLegacyDockerImageCleanupOptions { + return { + minAgeHours: positiveIntegerOption(scoped.originalArgs, "--min-age-hours", policy.minAgeHours, 8760), + keepPerRepository: positiveIntegerOption(scoped.originalArgs, "--keep-per-repository", policy.keepPerRepository, 100), + limit: positiveIntegerOption(scoped.originalArgs, "--limit", 50, 500), + dryRun: scoped.dryRun || 
!scoped.confirm, + }; +} + +export function nodeRuntimeCleanupReleasedPvs(scoped: ReturnType): Record { + const options = parseNodeRuntimeReleasedPvCleanupOptions(scoped); + const beforeCounts = nodeRuntimeCiObjectCounts(scoped.spec); + const orphanedPvcs = listNodeRuntimeOrphanedCiPvcs(scoped.spec, options.limit); + const releasedPvsBefore = listNodeRuntimeReleasedCiPvs(scoped.spec, options.limit); + const command = `hwlab nodes control-plane cleanup-released-pvs --node ${scoped.node} --lane ${scoped.lane}`; + if (options.dryRun) { + return { + ok: true, + command, + mode: "dry-run", + node: scoped.node, + lane: scoped.lane, + namespace: HWLAB_CI_NAMESPACE, + limit: options.limit, + orphanedPvcs, + orphanedPvcCount: orphanedPvcs.length, + releasedPvs: releasedPvsBefore, + releasedPvCount: releasedPvsBefore.length, + mutation: false, + policy: { + orphanedPvc: "ownerRef.kind=PipelineRun, owner PipelineRun absent, pvc name matches pvc-*, and no non-terminal pod mounts the claim", + releasedPv: "phase=Released, storageClass=local-path, reclaimPolicy=Delete, claimRef.namespace=hwlab-ci", + }, + next: { confirm: `${command} --limit ${options.limit} --confirm` }, + }; + } + const pvcDeletion = deleteNodeRuntimeCiPvcs(scoped.spec, orphanedPvcs.map((item) => item.name), scoped.timeoutSeconds); + const releasedPvsAfterPvc = listNodeRuntimeReleasedCiPvs(scoped.spec, options.limit); + const pvNames = releasedPvsAfterPvc.map((item) => item.name); + const pvDeletion = deleteNodeRuntimePersistentVolumes(scoped.spec, pvNames, scoped.timeoutSeconds); + const afterCounts = nodeRuntimeCiObjectCounts(scoped.spec); + const ok = isCommandSuccess(pvcDeletion) && isCommandSuccess(pvDeletion); + return { + ok, + command, + mode: "confirmed-cleanup", + node: scoped.node, + lane: scoped.lane, + namespace: HWLAB_CI_NAMESPACE, + limit: options.limit, + deletedPvcs: orphanedPvcs.map((item) => item.name), + deletedPvcCount: orphanedPvcs.length, + deletedPersistentVolumes: pvNames, + 
deletedPersistentVolumeCount: pvNames.length, + orphanedPvcsBefore: orphanedPvcs.slice(0, 50), + releasedPvsBefore, + releasedPvsAfterPvcDelete: releasedPvsAfterPvc.slice(0, 50), + ciObjectCountsBefore: beforeCounts, + ciObjectCountsAfter: afterCounts, + pvcDeletion: compactRuntimeCommand(pvcDeletion), + pvDeletion: compactRuntimeCommand(pvDeletion), + mutation: ok, + degradedReason: ok ? undefined : "node-runtime-ci-pvc-pv-cleanup-failed", + next: { status: `bun scripts/cli.ts hwlab nodes control-plane status --node ${scoped.node} --lane ${scoped.lane}` }, + }; +} + +export function nodeRuntimeCleanupLegacyDockerImages(scoped: ReturnType): Record { + const policy = nodeRuntimeLegacyDockerImageCleanupPolicy(scoped.node); + const options = parseNodeRuntimeLegacyDockerImageCleanupOptions(scoped, policy); + const command = `hwlab nodes control-plane cleanup-legacy-docker-images --node ${scoped.node} --lane ${scoped.lane}`; + if (!policy.enabled) { + return { + ok: false, + command, + node: scoped.node, + lane: scoped.lane, + mode: "blocked", + mutation: false, + config: policy.source, + degradedReason: "legacy-docker-image-cleanup-disabled", + message: `${policy.source} is disabled`, + valuesRedacted: true, + }; + } + if (policy.repositories.length === 0) { + return { + ok: false, + command, + node: scoped.node, + lane: scoped.lane, + mode: "blocked", + mutation: false, + config: policy.source, + degradedReason: "legacy-docker-image-cleanup-repositories-empty", + message: `${policy.source}.repositories must include at least one repository`, + valuesRedacted: true, + }; + } + const remoteOptions = { + mode: options.dryRun ? 
"plan" : "run", + minAgeHours: options.minAgeHours, + keepPerRepository: options.keepPerRepository, + limit: options.limit, + repositories: policy.repositories, + node: scoped.node, + lane: scoped.lane, + valuesRedacted: true, + }; + const script = [ + "set -eu", + `node - ${shellQuote(JSON.stringify(remoteOptions))} <<'UNIDESK_LEGACY_DOCKER_IMAGE_GC'`, + nodeRuntimeLegacyDockerImageCleanupNodeScript(), + "UNIDESK_LEGACY_DOCKER_IMAGE_GC", + ].join("\n"); + const result = runNodeHostScript(scoped.spec, script, scoped.timeoutSeconds); + const payload = parseJsonObject(result.stdout); + const ok = isCommandSuccess(result) && payload.ok !== false; + return { + ok, + command, + node: scoped.node, + lane: scoped.lane, + mode: options.dryRun ? "dry-run" : "confirmed-cleanup", + mutation: !options.dryRun && ok, + config: policy.source, + policy: { + repositories: policy.repositories, + minAgeHours: options.minAgeHours, + keepPerRepository: options.keepPerRepository, + limit: options.limit, + dockerPruneUsed: false, + dockerVolumesTouched: false, + protectedContainerImages: true, + legacyOnly: true, + deploymentDependency: false, + valuesRedacted: true, + }, + cleanup: options.dryRun || ok ? compactNodeRuntimeLegacyDockerImageCleanupPayload(payload) : payload, + result: ok ? compactRuntimeCommandStats(result) : compactRuntimeCommand(result), + degradedReason: ok ? undefined : "legacy-docker-image-cleanup-failed", + next: options.dryRun + ? 
{ confirm: `${command} --min-age-hours ${options.minAgeHours} --keep-per-repository ${options.keepPerRepository} --limit ${options.limit} --confirm` } + : { status: `bun scripts/cli.ts hwlab nodes control-plane status --node ${scoped.node} --lane ${scoped.lane}` }, + valuesRedacted: true, + }; +} + +export function nodeRuntimeCleanupLegacyDockerRegistryVolume(scoped: ReturnType): Record { + const policy = nodeRuntimeLegacyDockerRegistryVolumeCleanupPolicy(); + const target = nodeRuntimeLegacyDockerRegistryVolumeTarget(scoped.node); + const dryRun = scoped.dryRun || !scoped.confirm; + const command = `hwlab nodes control-plane cleanup-legacy-docker-registry-volume --node ${scoped.node} --lane ${scoped.lane}`; + if (!policy.enabled) { + return { + ok: false, + command, + node: scoped.node, + lane: scoped.lane, + mode: "blocked", + mutation: false, + config: policy.source, + degradedReason: "legacy-docker-registry-volume-cleanup-disabled", + message: `${policy.source} is disabled`, + valuesRedacted: true, + }; + } + if (target.k8sRegistry.mode !== "k8s-workload") { + return { + ok: false, + command, + node: scoped.node, + lane: scoped.lane, + mode: "blocked", + mutation: false, + config: target.source, + degradedReason: "node-local-registry-is-not-k8s-workload", + message: "legacy Docker registry volume cleanup is allowed only after node-local registry is declared as a k8s workload", + target, + valuesRedacted: true, + }; + } + const remoteOptions = { + mode: dryRun ? 
"plan" : "run", + target, + requireK8sRegistryReady: policy.requireK8sRegistryReady, + node: scoped.node, + lane: scoped.lane, + valuesRedacted: true, + }; + const script = [ + "set -eu", + `node - ${shellQuote(JSON.stringify(remoteOptions))} <<'UNIDESK_LEGACY_DOCKER_REGISTRY_VOLUME_GC'`, + nodeRuntimeLegacyDockerRegistryVolumeCleanupNodeScript(), + "UNIDESK_LEGACY_DOCKER_REGISTRY_VOLUME_GC", + ].join("\n"); + const result = runNodeHostScript(scoped.spec, script, scoped.timeoutSeconds); + const payload = parseJsonObject(result.stdout); + const ok = isCommandSuccess(result) && payload.ok !== false; + return { + ok, + command, + node: scoped.node, + lane: scoped.lane, + mode: dryRun ? "dry-run" : "confirmed-cleanup", + mutation: !dryRun && ok && payload.mutation === true, + config: { + policy: policy.source, + target: target.source, + }, + target, + cleanup: compactNodeRuntimeLegacyDockerRegistryVolumeCleanupPayload(payload), + result: ok ? compactRuntimeCommandStats(result) : compactRuntimeCommand(result), + degradedReason: ok ? undefined : "legacy-docker-registry-volume-cleanup-failed", + next: dryRun + ? { confirm: `${command} --confirm --wait` } + : { status: `bun scripts/cli.ts hwlab nodes control-plane status --node ${scoped.node} --lane ${scoped.lane}` }, + valuesRedacted: true, + }; +} + +function nodeRuntimeLegacyDockerImageCleanupPolicy(node: string): NodeRuntimeLegacyDockerImageCleanupPolicy { + const source = "config/unidesk-cli.yaml#gc.legacyDockerImages"; + const parsed = objectRecord(Bun.YAML.parse(readFileSync(rootPath("config/unidesk-cli.yaml"), "utf8"))); + const raw = objectRecord(objectRecord(parsed.gc).legacyDockerImages); + const nodeLower = node.toLowerCase(); + const repositories = Array.isArray(raw.repositories) + ? 
[...new Set(raw.repositories + .filter((item): item is string => typeof item === "string") + .map((item) => item + .replace(/\$\{nodeLower\}/gu, nodeLower) + .replace(/\$\{node\}/gu, node) + .replace(/\$\{NODE\}/gu, node.toUpperCase())) + .map((item) => item.trim()) + .filter(Boolean))] + : []; + return { + enabled: raw.enabled === true, + minAgeHours: nonNegativeInteger(raw.minAgeHours, `${source}.minAgeHours`), + keepPerRepository: nonNegativeInteger(raw.keepPerRepository, `${source}.keepPerRepository`), + repositories, + source, + }; +} + +function nodeRuntimeLegacyDockerRegistryVolumeCleanupPolicy(): NodeRuntimeLegacyDockerRegistryVolumeCleanupPolicy { + const source = "config/unidesk-cli.yaml#gc.legacyDockerRegistryVolumes"; + const parsed = objectRecord(Bun.YAML.parse(readFileSync(rootPath("config/unidesk-cli.yaml"), "utf8"))); + const raw = objectRecord(objectRecord(parsed.gc).legacyDockerRegistryVolumes); + return { + enabled: raw.enabled === true, + requireK8sRegistryReady: raw.requireK8sRegistryReady !== false, + source, + }; +} + +function nodeRuntimeLegacyDockerRegistryVolumeTarget(node: string): NodeRuntimeLegacyDockerRegistryVolumeTarget { + const source = `${HWLAB_NODE_CONTROL_PLANE_CONFIG_PATH}#nodes.${node}`; + const parsed = objectRecord(Bun.YAML.parse(readFileSync(rootPath(HWLAB_NODE_CONTROL_PLANE_CONFIG_PATH), "utf8"))); + const nodeConfig = objectRecord(objectRecord(parsed.nodes)[node]); + const localRegistry = objectRecord(objectRecord(objectRecord(nodeConfig.k3s).install).localRegistry); + const registry = objectRecord(nodeConfig.registry); + return { + dockerContainerName: requiredConfigString(localRegistry.containerName, `${source}.k3s.install.localRegistry.containerName`), + dockerImage: requiredConfigString(localRegistry.canonicalImage ?? 
localRegistry.image, `${source}.k3s.install.localRegistry.canonicalImage`), + mountDestination: "/var/lib/registry", + k8sRegistry: { + mode: requiredConfigString(registry.mode, `${source}.registry.mode`), + endpoint: requiredConfigString(registry.endpoint, `${source}.registry.endpoint`), + namespace: requiredConfigString(registry.namespace, `${source}.registry.namespace`), + deploymentName: requiredConfigString(registry.deploymentName, `${source}.registry.deploymentName`), + pvcName: requiredConfigString(registry.pvcName, `${source}.registry.pvcName`), + }, + source, + }; +} + +function objectRecord(value: unknown): Record { + return typeof value === "object" && value !== null && !Array.isArray(value) ? value as Record : {}; +} + +function nonNegativeInteger(value: unknown, label: string): number { + if (!Number.isInteger(value) || value < 0) throw new Error(`${label} must be a non-negative integer`); + return value; +} + +function requiredConfigString(value: unknown, label: string): string { + if (typeof value !== "string" || value.trim().length === 0) throw new Error(`${label} must be a non-empty string`); + return value.trim(); +} + +function compactNodeRuntimeLegacyDockerImageCleanupPayload(payload: Record): Record { + const candidates = Array.isArray(payload.candidates) ? payload.candidates.map(objectRecord) : []; + const selected = Array.isArray(payload.selected) ? payload.selected.map(objectRecord) : []; + const protectedImages = Array.isArray(payload.protected) ? payload.protected.map(objectRecord) : []; + const results = Array.isArray(payload.results) ? payload.results.map(objectRecord) : []; + return { + ok: payload.ok ?? null, + mode: payload.mode ?? null, + mutation: payload.mutation ?? null, + diskBefore: payload.diskBefore ?? null, + diskAfter: payload.diskAfter ?? null, + actualDiskReclaimBytes: payload.actualDiskReclaimBytes ?? null, + repositoryCount: payload.repositoryCount ?? null, + imageCount: payload.imageCount ?? 
null, + containerCount: payload.containerCount ?? null, + candidateCount: payload.candidateCount ?? null, + selectedCount: payload.selectedCount ?? null, + deferredCount: payload.deferredCount ?? null, + protectedCount: payload.protectedCount ?? null, + estimatedReclaimBytes: payload.estimatedReclaimBytes ?? null, + estimatedReclaimHuman: payload.estimatedReclaimHuman ?? null, + candidates: candidates.slice(0, 5).map(compactLegacyDockerImageRow), + selected: selected.slice(0, 10).map(compactLegacyDockerImageRow), + protected: protectedImages.slice(0, 5).map((item) => ({ + ...compactLegacyDockerImageRow(item), + reasons: item.reasons ?? null, + })), + results: results.slice(0, 10).map((item) => ({ + shortId: item.shortId ?? null, + tags: compactLegacyDockerTags(item.refs), + status: item.status ?? null, + exitCode: item.exitCode ?? null, + stderrTail: item.status === "failed" ? item.stderrTail ?? null : undefined, + })), + policy: payload.policy ?? null, + disclosure: { + candidateRowsShown: Math.min(5, candidates.length), + selectedRowsShown: Math.min(10, selected.length), + protectedRowsShown: Math.min(5, protectedImages.length), + resultRowsShown: Math.min(10, results.length), + }, + valuesRedacted: true, + }; +} + +function compactLegacyDockerImageRow(item: Record): Record { + return { + shortId: item.shortId ?? null, + tags: compactLegacyDockerTags(item.repoTags), + createdAt: item.createdAt ?? null, + ageHours: item.ageHours ?? null, + sizeHuman: item.sizeHuman ?? null, + }; +} + +function compactLegacyDockerTags(value: unknown): string[] { + if (!Array.isArray(value)) return []; + return value + .filter((item): item is string => typeof item === "string") + .slice(0, 3) + .map((item) => { + const index = item.lastIndexOf(":"); + return index > 0 ? 
item.slice(index + 1) : item; + }); +} + +function compactNodeRuntimeLegacyDockerRegistryVolumeCleanupPayload(payload: Record): Record { + const candidate = objectRecord(payload.candidate); + return { + ok: payload.ok ?? null, + mode: payload.mode ?? null, + mutation: payload.mutation ?? null, + diskBefore: payload.diskBefore ?? null, + diskAfter: payload.diskAfter ?? null, + actualDiskReclaimBytes: payload.actualDiskReclaimBytes ?? null, + candidate: Object.keys(candidate).length === 0 ? null : { + containerName: candidate.containerName ?? null, + containerStatus: candidate.containerStatus ?? null, + image: candidate.image ?? null, + volumeName: candidate.volumeName ?? null, + mountDestination: candidate.mountDestination ?? null, + sizeBytes: candidate.sizeBytes ?? null, + sizeHuman: candidate.sizeHuman ?? null, + }, + k8sRegistry: payload.k8sRegistry ?? null, + blockers: Array.isArray(payload.blockers) ? payload.blockers.slice(0, 20) : [], + actions: payload.actions ?? null, + policy: payload.policy ?? null, + valuesRedacted: true, + }; +} + +function nodeRuntimeLegacyDockerImageCleanupNodeScript(): string { + return String.raw` +const childProcess = require("child_process"); + +const options = JSON.parse(process.argv[2] || "{}"); +const mode = options.mode === "run" ? "run" : "plan"; +const repositories = Array.isArray(options.repositories) ? options.repositories.filter((item) => typeof item === "string" && item.length > 0) : []; +const repositorySet = new Set(repositories); +const minAgeHours = Number(options.minAgeHours); +const keepPerRepository = Number(options.keepPerRepository); +const limit = Number(options.limit); +const nowMs = Date.now(); + +function run(args) { + const result = childProcess.spawnSync(args[0], args.slice(1), { encoding: "utf8", maxBuffer: 64 * 1024 * 1024 }); + return { + command: args, + exitCode: typeof result.status === "number" ? 
result.status : null, + signal: result.signal || null, + stdout: result.stdout || "", + stderr: result.stderr || "", + ok: result.status === 0, + }; +} + +function parseJsonArray(text) { + try { + const parsed = JSON.parse(text); + return Array.isArray(parsed) ? parsed : []; + } catch { + return []; + } +} + +function diskSnapshot() { + const result = run(["df", "-B1", "/"]); + const line = result.stdout.trim().split(/\r?\n/)[1] || ""; + const cols = line.trim().split(/\s+/); + const usePercent = Number(String(cols[4] || "0").replace(/%$/, "")); + return { + filesystem: cols[0] || null, + sizeBytes: Number(cols[1] || 0), + usedBytes: Number(cols[2] || 0), + availableBytes: Number(cols[3] || 0), + usePercent: Number.isFinite(usePercent) ? usePercent : null, + mount: cols[5] || "/", + }; +} + +function repoFromTag(tag) { + const value = String(tag || ""); + const atless = value.split("@")[0]; + const colon = atless.lastIndexOf(":"); + const slash = atless.lastIndexOf("/"); + return colon > slash ? 
atless.slice(0, colon) : atless; +} + +function ageHours(createdAt) { + const ms = Date.parse(String(createdAt || "")); + if (!Number.isFinite(ms)) return null; + return Math.max(0, Math.round(((nowMs - ms) / 3600000) * 10) / 10); +} + +function human(bytes) { + const value = Number(bytes || 0); + if (!Number.isFinite(value) || value <= 0) return "0B"; + const units = ["B", "KiB", "MiB", "GiB", "TiB"]; + let scaled = value; + let index = 0; + while (scaled >= 1024 && index < units.length - 1) { + scaled /= 1024; + index += 1; + } + return String(Math.round(scaled * 10) / 10) + units[index]; +} + +function shortId(id) { + return String(id || "").replace(/^sha256:/, "").slice(0, 12); +} + +function dockerImagePresent(id) { + return run(["docker", "image", "inspect", id]).ok; +} + +const errors = []; +const dockerProbe = run(["docker", "version", "--format", "{{json .Server.Version}}"]); +if (!dockerProbe.ok) { + console.log(JSON.stringify({ ok: false, mode, mutation: false, error: "docker-daemon-unavailable", stderrTail: dockerProbe.stderr.slice(-2000), valuesRedacted: true })); + process.exit(0); +} + +const diskBefore = diskSnapshot(); +const imageList = run(["docker", "image", "ls", "-q", "--no-trunc"]); +if (!imageList.ok) errors.push({ command: imageList.command, exitCode: imageList.exitCode, stderrTail: imageList.stderr.slice(-1000) }); +const imageIds = [...new Set(imageList.stdout.split(/\r?\n/).map((line) => line.trim()).filter(Boolean))]; +const imageInspect = imageIds.length === 0 ? { ok: true, stdout: "[]", stderr: "", exitCode: 0, command: [] } : run(["docker", "image", "inspect", ...imageIds]); +if (!imageInspect.ok) errors.push({ command: ["docker", "image", "inspect", ""], exitCode: imageInspect.exitCode, stderrTail: imageInspect.stderr.slice(-1000) }); +const images = parseJsonArray(imageInspect.stdout).map((image) => { + const repoTags = Array.isArray(image.RepoTags) ? 
image.RepoTags.filter((tag) => typeof tag === "string" && tag !== ":") : []; + const matchedTags = repoTags.filter((tag) => repositorySet.has(repoFromTag(tag))); + const matchedRepositories = [...new Set(matchedTags.map(repoFromTag))]; + const createdAt = typeof image.Created === "string" ? image.Created : null; + return { + id: String(image.Id || ""), + shortId: shortId(image.Id), + repoTags, + matchedTags, + matchedRepositories, + createdAt, + createdMs: createdAt === null ? 0 : Date.parse(createdAt) || 0, + ageHours: ageHours(createdAt), + sizeBytes: Number(image.Size || 0), + sizeHuman: human(Number(image.Size || 0)), + }; +}).filter((image) => image.id.length > 0); + +const containerList = run(["docker", "ps", "-a", "--no-trunc", "--format", "{{.ID}}"]); +if (!containerList.ok) errors.push({ command: containerList.command, exitCode: containerList.exitCode, stderrTail: containerList.stderr.slice(-1000) }); +const containerIds = [...new Set(containerList.stdout.split(/\r?\n/).map((line) => line.trim()).filter(Boolean))]; +const containerInspect = containerIds.length === 0 ? 
{ ok: true, stdout: "[]", stderr: "", exitCode: 0, command: [] } : run(["docker", "container", "inspect", ...containerIds]); +if (!containerInspect.ok) errors.push({ command: ["docker", "container", "inspect", ""], exitCode: containerInspect.exitCode, stderrTail: containerInspect.stderr.slice(-1000) }); +const containers = parseJsonArray(containerInspect.stdout).map((container) => ({ + id: String(container.Id || ""), + name: String(container.Name || "").replace(/^\//, ""), + imageId: String(container.Image || ""), + imageRef: String((container.Config && container.Config.Image) || ""), + state: String((container.State && container.State.Status) || ""), +})); +const protectedByContainer = new Set(containers.map((container) => container.imageId).filter(Boolean)); + +const latestProtected = new Set(); +for (const repository of repositories) { + images + .filter((image) => image.matchedRepositories.includes(repository)) + .sort((a, b) => (b.createdMs - a.createdMs) || a.id.localeCompare(b.id)) + .slice(0, keepPerRepository) + .forEach((image) => latestProtected.add(image.id)); +} + +const candidates = []; +const protectedImages = []; +for (const image of images.filter((item) => item.matchedRepositories.length > 0)) { + const reasons = []; + if (protectedByContainer.has(image.id)) reasons.push("container-referenced"); + if (latestProtected.has(image.id)) reasons.push("latest-retention"); + if (image.ageHours === null) reasons.push("created-at-unknown"); + if (image.ageHours !== null && image.ageHours < minAgeHours) reasons.push("under-min-age"); + if (reasons.length > 0) { + protectedImages.push({ ...image, reasons }); + } else { + candidates.push({ ...image, reasons: ["repository-allowlisted", "unreferenced-by-container", "older-than-min-age", "outside-latest-retention"] }); + } +} + +candidates.sort((a, b) => (a.createdMs - b.createdMs) || a.id.localeCompare(b.id)); +const selected = candidates.slice(0, limit); +const results = []; +if (mode === "run") { + for (const 
/**
 * Return the source of the Node.js (CommonJS) program that plans or performs the
 * legacy Docker registry container + volume cleanup on a node.
 *
 * The program reads a JSON options object from `process.argv[2]`
 * (`mode`, `target`, `requireK8sRegistryReady`), inspects the target container,
 * collects blockers, and prints exactly one JSON report on stdout. In `run` mode
 * it removes the container and then its volume only when no blocker was found.
 *
 * Safety behavior of the embedded program:
 * - A failed container inventory is reported as the blocker
 *   `docker-container-inventory-unavailable` instead of being treated as
 *   "no other container uses the volume".
 * - `mutation` is true as soon as the container was removed, even if the
 *   subsequent volume removal failed.
 * - The container is removed by the Id that was inspected, not by name.
 * - A `du` fallback that prints nothing yields `null`, not `0`.
 *
 * The template is `String.raw`, so it must not contain backticks or `${`.
 *
 * @returns The script text to feed to `node`.
 */
function nodeRuntimeLegacyDockerRegistryVolumeCleanupNodeScript(): string {
  return String.raw`
const childProcess = require("child_process");

const options = JSON.parse(process.argv[2] || "{}");
const mode = options.mode === "run" ? "run" : "plan";
const target = options.target || {};
const requireK8sRegistryReady = options.requireK8sRegistryReady !== false;

// Run a command without a shell and normalize the outcome.
function run(args, extraEnv) {
  const result = childProcess.spawnSync(args[0], args.slice(1), {
    encoding: "utf8",
    maxBuffer: 64 * 1024 * 1024,
    env: { ...process.env, ...(extraEnv || {}) },
  });
  return {
    command: args,
    exitCode: typeof result.status === "number" ? result.status : null,
    signal: result.signal || null,
    stdout: result.stdout || "",
    stderr: result.stderr || "",
    ok: result.status === 0,
  };
}

function parseJson(text) {
  try { return JSON.parse(text); } catch { return null; }
}

// Usage of the root filesystem in bytes, taken from the second line of df output.
function diskSnapshot() {
  const result = run(["df", "-B1", "/"]);
  const line = result.stdout.trim().split(/\r?\n/)[1] || "";
  const cols = line.trim().split(/\s+/);
  const usePercent = Number(String(cols[4] || "0").replace(/%$/, ""));
  return {
    filesystem: cols[0] || null,
    sizeBytes: Number(cols[1] || 0),
    usedBytes: Number(cols[2] || 0),
    availableBytes: Number(cols[3] || 0),
    usePercent: Number.isFinite(usePercent) ? usePercent : null,
    mount: cols[5] || "/",
  };
}

// Size of a directory tree in bytes, or null when du produced no usable number.
function duBytes(path) {
  const bytes = run(["du", "-sb", path]);
  if (bytes.ok) {
    const value = Number((bytes.stdout.trim().split(/\s+/)[0] || "0"));
    if (Number.isFinite(value)) return value;
  }
  const kib = run(["du", "-sk", path]);
  const token = kib.stdout.trim().split(/\s+/)[0] || "";
  // No output means the size is unknown; do not report it as zero bytes.
  if (token.length === 0) return null;
  const value = Number(token);
  return Number.isFinite(value) ? value * 1024 : null;
}

function human(bytes) {
  const value = Number(bytes || 0);
  if (!Number.isFinite(value) || value <= 0) return "0B";
  const units = ["B", "KiB", "MiB", "GiB", "TiB"];
  let scaled = value;
  let index = 0;
  while (scaled >= 1024 && index < units.length - 1) {
    scaled /= 1024;
    index += 1;
  }
  return String(Math.round(scaled * 10) / 10) + units[index];
}

function inspectContainer(name) {
  const result = run(["docker", "container", "inspect", name]);
  if (!result.ok) return { ok: false, result, container: null };
  const parsed = parseJson(result.stdout);
  return { ok: Array.isArray(parsed) && parsed.length > 0, result, container: Array.isArray(parsed) ? parsed[0] : null };
}

// Inspect every container (running or not). Returns null when the inventory
// could not be taken, so callers can tell "no containers" from "unknown".
function listContainers() {
  const list = run(["docker", "ps", "-a", "--no-trunc", "--format", "{{.ID}}"]);
  if (!list.ok) return null;
  const ids = [...new Set(list.stdout.split(/\r?\n/).map((line) => line.trim()).filter(Boolean))];
  if (ids.length === 0) return [];
  const inspect = run(["docker", "container", "inspect", ...ids]);
  const parsed = parseJson(inspect.stdout);
  if (!inspect.ok || !Array.isArray(parsed)) return null;
  return parsed;
}

// Readiness of the replacement k8s registry: deployment rolled out, PVC bound, /v2/ reachable.
function k8sRegistryStatus(registry) {
  const env = { KUBECONFIG: "/etc/rancher/k3s/k3s.yaml" };
  const namespace = String(registry.namespace || "");
  const deploymentName = String(registry.deploymentName || "");
  const pvcName = String(registry.pvcName || "");
  const deployResult = run(["kubectl", "-n", namespace, "get", "deployment", deploymentName, "-o", "json"], env);
  const pvcResult = run(["kubectl", "-n", namespace, "get", "pvc", pvcName, "-o", "json"], env);
  const endpoint = String(registry.endpoint || "");
  const curlResult = endpoint.length > 0 ? run(["curl", "-fsS", "--max-time", "5", "http://" + endpoint + "/v2/"]) : { ok: false, exitCode: null, stderr: "endpoint missing" };
  const deployment = parseJson(deployResult.stdout) || {};
  const pvc = parseJson(pvcResult.stdout) || {};
  const desired = Math.max(1, Number((deployment.spec && deployment.spec.replicas) || 1));
  const readyReplicas = Number((deployment.status && deployment.status.readyReplicas) || 0);
  const updatedReplicas = Number((deployment.status && deployment.status.updatedReplicas) || 0);
  const pvcPhase = String((pvc.status && pvc.status.phase) || "");
  return {
    ok: deployResult.ok && pvcResult.ok && curlResult.ok && readyReplicas >= desired && updatedReplicas >= desired && pvcPhase === "Bound",
    namespace,
    deploymentName,
    pvcName,
    desiredReplicas: desired,
    readyReplicas,
    updatedReplicas,
    pvcPhase,
    endpoint,
    endpointOk: curlResult.ok,
    deployExitCode: deployResult.exitCode,
    pvcExitCode: pvcResult.exitCode,
    curlExitCode: curlResult.exitCode,
    valuesRedacted: true,
  };
}

const diskBefore = diskSnapshot();
const inspect = inspectContainer(String(target.dockerContainerName || "registry"));
if (!inspect.ok || inspect.container === null) {
  // Nothing to clean up: report success without touching anything.
  console.log(JSON.stringify({
    ok: true,
    mode,
    mutation: false,
    diskBefore,
    diskAfter: null,
    actualDiskReclaimBytes: null,
    candidate: null,
    k8sRegistry: k8sRegistryStatus(target.k8sRegistry || {}),
    blockers: [],
    actions: { containerRemoved: false, volumeRemoved: false },
    policy: { noCandidateIsOk: true, dockerPruneUsed: false, dockerVolumeRmScoped: false, valuesRedacted: true },
    valuesRedacted: true,
  }));
  process.exit(0);
}

const container = inspect.container;
const state = container.State || {};
const config = container.Config || {};
const mounts = Array.isArray(container.Mounts) ? container.Mounts : [];
const registryMount = mounts.find((mount) => mount && mount.Type === "volume" && mount.Destination === target.mountDestination);
const allContainers = listContainers();
const containerInventoryOk = Array.isArray(allContainers);
const otherUsers = registryMount === undefined || !containerInventoryOk ? [] : allContainers
  .filter((item) => item && item.Id !== container.Id)
  .filter((item) => Array.isArray(item.Mounts) && item.Mounts.some((mount) => mount && mount.Name === registryMount.Name))
  .map((item) => ({ id: String(item.Id || "").slice(0, 12), name: String(item.Name || "").replace(/^\//, ""), state: String((item.State && item.State.Status) || "") }));
const k8sRegistry = k8sRegistryStatus(target.k8sRegistry || {});
const volumeSource = registryMount && typeof registryMount.Source === "string" ? registryMount.Source : null;
const volumeName = registryMount && typeof registryMount.Name === "string" ? registryMount.Name : null;
const sizeBytes = volumeSource === null ? null : duBytes(volumeSource);
const candidate = registryMount === undefined ? null : {
  containerName: String(container.Name || "").replace(/^\//, ""),
  containerStatus: String(state.Status || ""),
  running: state.Running === true,
  image: String(config.Image || ""),
  volumeName,
  volumeSource,
  mountDestination: String(registryMount.Destination || ""),
  sizeBytes,
  sizeHuman: human(sizeBytes),
  otherUsers,
  valuesRedacted: true,
};
const blockers = [];
if (registryMount === undefined) blockers.push("registry-volume-mount-missing");
if (state.Running === true || String(state.Status || "") === "running") blockers.push("docker-registry-container-running");
if (String(config.Image || "") !== String(target.dockerImage || "")) blockers.push("docker-registry-image-mismatch");
if (otherUsers.length > 0) blockers.push("docker-volume-has-other-container-users");
// Fail closed: without a container inventory the "no other users" check proves nothing.
if (!containerInventoryOk) blockers.push("docker-container-inventory-unavailable");
if (volumeName === null || volumeName.length === 0) blockers.push("docker-volume-name-missing");
if (volumeSource === null || !volumeSource.startsWith("/var/lib/docker/volumes/") || !volumeSource.endsWith("/_data")) blockers.push("docker-volume-source-not-under-docker-volumes");
if (requireK8sRegistryReady && k8sRegistry.ok !== true) blockers.push("replacement-k8s-registry-not-ready");

let actions = { containerRemoved: false, volumeRemoved: false, containerRmExitCode: null, volumeRmExitCode: null, stderrTail: "" };
let diskAfter = null;
let mutation = false;
if (mode === "run" && blockers.length === 0 && candidate !== null) {
  // Remove exactly the container that was inspected above, not whatever currently holds the name.
  const containerRef = String(container.Id || target.dockerContainerName || "registry");
  const removeContainer = run(["docker", "rm", containerRef]);
  const removeVolume = removeContainer.ok ? run(["docker", "volume", "rm", String(volumeName)]) : { ok: false, exitCode: null, stderr: "container removal failed" };
  actions = {
    containerRemoved: removeContainer.ok,
    volumeRemoved: removeVolume.ok,
    containerRmExitCode: removeContainer.exitCode,
    volumeRmExitCode: removeVolume.exitCode,
    stderrTail: (removeContainer.stderr + "\n" + removeVolume.stderr).trim().slice(-1000),
  };
  // Removing the container already changed the node, even if the volume removal failed afterwards.
  mutation = removeContainer.ok;
  if (!removeContainer.ok) blockers.push("docker-container-rm-failed");
  if (removeContainer.ok && !removeVolume.ok) blockers.push("docker-volume-rm-failed");
  diskAfter = diskSnapshot();
}

if (diskAfter === null) diskAfter = mode === "run" ? diskSnapshot() : null;
const actualDiskReclaimBytes = diskAfter === null ? null : Math.max(0, Number(diskBefore.usedBytes || 0) - Number(diskAfter.usedBytes || 0));
console.log(JSON.stringify({
  ok: blockers.length === 0,
  mode,
  mutation,
  diskBefore,
  diskAfter,
  actualDiskReclaimBytes,
  candidate,
  k8sRegistry,
  blockers,
  actions,
  policy: {
    requireK8sRegistryReady,
    dockerPruneUsed: false,
    dockerVolumeRmScoped: mode === "run" && actions.volumeRemoved === true,
    dockerComposeTouched: false,
    databaseTouched: false,
    matchedExitedRegistryContainerOnly: true,
    valuesRedacted: true,
  },
  valuesRedacted: true,
}));
`;
}
/**
 * List every PVC in the HWLAB CI namespace together with its PipelineRun owner, if any.
 *
 * The kubectl go-template prints one tab-separated row per PVC with exactly six
 * columns: name, bound volume name, phase, owner kind, owner name, requested storage.
 * Owner kind and owner name are rendered by two separate loops, each followed by an
 * unconditional tab, so a PVC without a PipelineRun owner yields two empty columns
 * instead of shifting the storage request into `ownerName`.
 *
 * @param spec Runtime lane whose node k3s cluster is queried.
 * @returns One row per PVC; rows with an empty name are dropped.
 * @throws Error when the kubectl command does not succeed.
 */
function listNodeRuntimeCiPvcs(spec: HwlabRuntimeLaneSpec): NodeRuntimeCiPvcRow[] {
  const tab = '{{"\\t"}}';
  // Prints the given ownerReference field for PipelineRun owners only.
  // NOTE(review): with more than one PipelineRun owner the values concatenate, so
  // `ownerKind` no longer equals "PipelineRun" — callers filtering on that exact
  // value would then skip the PVC; confirm that is the desired outcome.
  const pipelineRunOwner = (field: string): string =>
    `{{range .metadata.ownerReferences}}{{if eq .kind "PipelineRun"}}{{.${field}}}{{end}}{{end}}`;
  const columns = [
    "{{.metadata.name}}",
    "{{.spec.volumeName}}",
    "{{.status.phase}}",
    pipelineRunOwner("kind"),
    pipelineRunOwner("name"),
    "{{.spec.resources.requests.storage}}",
  ];
  const result = runNodeK3sArgs(spec, [
    "kubectl",
    "-n",
    HWLAB_CI_NAMESPACE,
    "get",
    "pvc",
    "-o",
    `go-template={{range .items}}${columns.join(tab)}{{"\\n"}}{{end}}`,
  ], 60);
  if (!isCommandSuccess(result)) throw new Error(`failed to list ${HWLAB_CI_NAMESPACE} PVCs: ${result.stderr.trim().slice(0, 1000)}`);
  return result.stdout.split(/\r?\n/u).map((line) => {
    // Split before trimming so empty columns keep their position.
    const [name = "", volumeName = "", phase = "", ownerKind = "", ownerName = "", storage = ""] = line.split("\t").map((cell) => cell.trim());
    return { name, volumeName, phase, ownerKind, ownerName, storage };
  }).filter((item) => item.name.length > 0);
}
.filter((item) => /^pvc-[a-z0-9]+$/u.test(item.name)) + .filter((item) => item.ownerKind === "PipelineRun" && item.ownerName.length > 0) + .filter((item) => !pipelineRuns.has(item.ownerName)) + .filter((item) => !activeClaims.has(item.name)) + .sort((a, b) => a.name.localeCompare(b.name)) + .slice(0, limit); +} + +export function listNodeRuntimeReleasedCiPvs(spec: HwlabRuntimeLaneSpec, limit: number): NodeRuntimeReleasedPvRow[] { + const result = runNodeK3sArgs(spec, [ + "kubectl", + "get", + "pv", + "-o", + 'jsonpath={range .items[*]}{.metadata.name}{"\\t"}{.metadata.creationTimestamp}{"\\t"}{.status.phase}{"\\t"}{.spec.storageClassName}{"\\t"}{.spec.persistentVolumeReclaimPolicy}{"\\t"}{.spec.claimRef.namespace}{"\\t"}{.spec.claimRef.name}{"\\t"}{.spec.capacity.storage}{"\\t"}{.spec.local.path}{"\\t"}{.spec.hostPath.path}{"\\n"}{end}', + ], 60); + if (!isCommandSuccess(result)) throw new Error(`failed to list released ${HWLAB_CI_NAMESPACE} PVs: ${result.stderr.trim().slice(0, 1000)}`); + return result.stdout.split(/\r?\n/u).map((line) => { + const [name = "", createdAt = "", phase = "", storageClass = "", reclaimPolicy = "", claimNamespace = "", claimName = "", capacity = "", localPath = "", hostPath = ""] = line.trim().split("\t"); + return { name, createdAt, phase, storageClass, reclaimPolicy, claimNamespace, claimName, capacity, hostPath: safeNodeRuntimeStoragePath(localPath) ?? 
/**
 * Accept a PV backing path only when it names something strictly inside the k3s
 * local-path storage directory.
 *
 * Rejected inputs: anything outside `/var/lib/rancher/k3s/storage/`, any path
 * containing a NUL byte or a `..` segment, and the storage root itself (the bare
 * prefix, or the prefix followed only by empty or `.` segments).
 *
 * @param value Candidate path as reported for a PersistentVolume.
 * @returns The unchanged path when it is acceptable, otherwise `null`.
 */
function safeNodeRuntimeStoragePath(value: string): string | null {
  const storageRoot = "/var/lib/rancher/k3s/storage/";
  if (!value.startsWith(storageRoot)) return null;
  if (value.includes("\0")) return null;
  const segments = value.slice(storageRoot.length).split("/");
  if (segments.some((segment) => segment === "..")) return null;
  // The root itself is not a volume directory: require at least one real child segment.
  if (!segments.some((segment) => segment.length > 0 && segment !== ".")) return null;
  return value;
}
"analyze"; +export type NodeWebProbeObserveAction = "start" | "status" | "command" | "stop" | "collect" | "analyze" | "gc"; export type NodeWebProbeObserveCommandType = | "login" @@ -176,10 +176,15 @@ export interface NodeWebProbeObserveOptions { screenshotIntervalMs: number; observerRefreshIntervalMs: number; maxSamples: number; + maxRunSeconds: number; commandTimeoutSeconds: number; waitMs: number; tailLines: number; maxFiles: number; + gcKeepHours: number; + gcLimit: number; + confirm: boolean; + dryRun: boolean; collectView: NodeWebProbeObserveCollectView; collectFile: string | null; collectFinding: string | null; @@ -612,7 +617,7 @@ export async function runNodeDelegatedDomain(config: Config, domain: DelegatedNo if (scoped.originalArgs.includes("--full")) return withNodeRuntimeControlPlaneStatusFullRendered(result, scoped); return withNodeRuntimeControlPlaneStatusRendered(result, scoped); } - if (scoped.action === "apply" || scoped.action === "trigger-current" || scoped.action === "refresh" || scoped.action === "sync" || scoped.action === "runtime-migration" || scoped.action === "cleanup-runs") { + if (scoped.action === "apply" || scoped.action === "trigger-current" || scoped.action === "refresh" || scoped.action === "sync" || scoped.action === "runtime-migration" || scoped.action === "cleanup-runs" || scoped.action === "cleanup-released-pvs" || scoped.action === "cleanup-legacy-docker-images" || scoped.action === "cleanup-legacy-docker-registry-volume") { if (scoped.confirm && !scoped.dryRun && !scoped.wait) return startNodeDelegatedJob(scoped); return nodeRuntimeControlPlaneRun(scoped); } diff --git a/scripts/src/hwlab-node/runtime-common.ts b/scripts/src/hwlab-node/runtime-common.ts index 0fc2ec67..42db1de9 100644 --- a/scripts/src/hwlab-node/runtime-common.ts +++ b/scripts/src/hwlab-node/runtime-common.ts @@ -209,7 +209,7 @@ export function nodeRuntimeUnsupportedAction(scoped: ReturnType [...]"); + throw new Error("web-probe observe usage: observe start 
--node NODE --lane vNN [...]; observe status|command|stop|collect|analyze [...]; observe gc --node NODE --lane vNN [--confirm]"); } assertKnownOptions(args.slice(1), new Set([ "--node", @@ -233,6 +234,9 @@ export function parseNodeWebProbeObserveOptions( "--screenshot-interval-ms", "--observer-refresh-interval-ms", "--max-samples", + "--max-run-seconds", + "--keep-hours", + "--limit", "--command-timeout-seconds", "--wait-ms", "--tail-lines", @@ -284,7 +288,7 @@ export function parseNodeWebProbeObserveOptions( "--workspace-root", "--workspace-root-ref", "--root", - ]), new Set(["--force", "--full", "--raw", "--text-stdin", "--require-composer-ready", "--wait-project-management-ready", "--blocking", "--non-blocking"])); + ]), new Set(["--force", "--full", "--raw", "--text-stdin", "--require-composer-ready", "--wait-project-management-ready", "--blocking", "--non-blocking", "--dry-run", "--confirm"])); const commandTypeRaw = optionValue(args, "--type") ?? null; const commandType = commandTypeRaw === null ? null : parseNodeWebProbeObserveCommandType(commandTypeRaw); const stateDir = optionValue(args, "--state-dir") ?? indexed?.stateDir ?? 
null; @@ -320,9 +324,12 @@ export function parseNodeWebProbeObserveOptions( if (analyzeTailSamples !== null && (!Number.isInteger(analyzeTailSamples) || analyzeTailSamples < 0)) { throw new Error("unsafe web-probe observe --tail-samples: expected a non-negative integer; use 0 for all samples"); } - if (observeActionRaw !== "start" && stateDir === null && jobId === null) { + if (observeActionRaw !== "start" && observeActionRaw !== "gc" && stateDir === null && jobId === null) { throw new Error("web-probe observe status|command|stop|collect|analyze requires --state-dir or --job-id"); } + const confirm = args.includes("--confirm"); + const dryRun = args.includes("--dry-run") || !confirm; + if (confirm && args.includes("--dry-run")) throw new Error("web-probe observe gc accepts only one of --confirm or --dry-run"); const commandTextOption = optionValue(args, "--text") ?? null; const commandTextFromStdin = args.includes("--text-stdin"); if (commandTextFromStdin && observeActionRaw !== "command") { @@ -403,10 +410,15 @@ export function parseNodeWebProbeObserveOptions( screenshotIntervalMs: positiveIntegerOption(args, "--screenshot-interval-ms", 300000, 86_400_000), observerRefreshIntervalMs: positiveIntegerOption(args, "--observer-refresh-interval-ms", 180000, 86_400_000), maxSamples: positiveIntegerOption(args, "--max-samples", 0, 10_000_000), + maxRunSeconds: positiveIntegerOption(args, "--max-run-seconds", 0, 86_400), commandTimeoutSeconds: positiveIntegerOption(args, "--command-timeout-seconds", 55, 3600), waitMs: positiveIntegerOption(args, "--wait-ms", 0, 600000), tailLines: positiveIntegerOption(args, "--tail-lines", 5, 200), maxFiles: positiveIntegerOption(args, "--max-files", 80, 5000), + gcKeepHours: positiveIntegerOption(args, "--keep-hours", webObserveGcDefaultKeepHours(), 8760), + gcLimit: positiveIntegerOption(args, "--limit", 200, 5000), + confirm, + dryRun, collectView, collectFile, collectFinding, @@ -555,6 +567,18 @@ export function 
/**
 * Plan or run garbage collection of web-observe state for one node/lane.
 *
 * A shell script that pipes the GC program into `node -` via a quoted heredoc is
 * executed through `runTransWorkspaceStdinScript`; the program's JSON stdout is
 * parsed and wrapped in a result record. The GC runs in "run" mode only when
 * `options.confirm` is set, otherwise in "plan" mode.
 *
 * @param options Parsed observe options (node, lane, retention, limit, confirm, output flags).
 * @param spec Runtime lane spec; its `workspace` is where the script runs.
 * @returns Result record with status `cleaned`, `planned` or `blocked`.
 */
export function runNodeWebProbeObserveGc(options: NodeWebProbeObserveOptions, spec: HwlabRuntimeLaneSpec): Record {
  const { node, lane, confirm, gcKeepHours } = options;
  const stateRoot = `.state/web-observe/${safeWebObserveSegment(node)}/${safeWebObserveSegment(lane)}`;
  const heredocTag = "UNIDESK_WEB_OBSERVE_GC";

  // Positional arguments of the GC program, each shell-quoted individually.
  const quotedArgs = [stateRoot, String(gcKeepHours), String(options.gcLimit), confirm ? "run" : "plan", node, lane]
    .map((arg) => shellQuote(arg))
    .join(" ");
  const script = [
    "set -eu",
    `node - ${quotedArgs} <<'${heredocTag}'`,
    nodeWebObserveGcNodeScript(),
    heredocTag,
  ].join("\n");

  const result = runTransWorkspaceStdinScript(node, spec.workspace, script, options.commandTimeoutSeconds);
  const payload = parseJsonObject(result.stdout);
  // A clean exit is not enough: the program may still report ok: false itself.
  const ok = result.exitCode === 0 && payload.ok !== false;

  let status = "blocked";
  if (ok) status = confirm ? "cleaned" : "planned";

  const retention = {
    source: "config/unidesk-cli.yaml#gc.stateStaleScratch.keepHours",
    keepHours: gcKeepHours,
    userOverride: gcKeepHours !== webObserveGcDefaultKeepHours(),
  };
  const wantsFullPayload = options.full || options.raw;
  const gc = wantsFullPayload ? payload : compactWebObserveGcPayload(payload);
  // On success only sizes are reported; on failure the compacted command result is included.
  const resultSummary = ok
    ? {
        exitCode: result.exitCode,
        timedOut: result.timedOut,
        stdoutBytes: result.stdout.length,
        stderrBytes: result.stderr.length,
      }
    : compactCommandResultWithStdoutTail(result);

  return {
    ok,
    status,
    command: `web-probe observe gc --node ${node} --lane ${lane}`,
    node,
    lane,
    workspace: spec.workspace,
    stateRoot,
    mode: confirm ? "confirmed-run" : "dry-run",
    retention,
    gc,
    result: resultSummary,
    valuesRedacted: true,
  };
}
null, + selectedCount: payload.selectedCount ?? null, + deferredCount: payload.deferredCount ?? null, + protectedCount: payload.protectedCount ?? null, + estimatedReclaimBytes: payload.estimatedReclaimBytes ?? null, + estimatedReclaimHuman: payload.estimatedReclaimHuman ?? null, + reclaimedBytes: payload.reclaimedBytes ?? null, + reclaimedHuman: payload.reclaimedHuman ?? null, + candidates: candidates.slice(0, 20).map((item) => ({ + id: item.id ?? null, + runDir: item.runDir ?? null, + status: item.status ?? null, + ageHours: item.ageHours ?? null, + rawHuman: item.rawHuman ?? null, + artifactCount: item.artifactCount ?? null, + })), + deleted: deleted.slice(0, 20).map((item) => ({ + id: item.id ?? null, + runDir: item.runDir ?? null, + rawHuman: item.rawHuman ?? null, + artifactCount: item.artifactCount ?? null, + })), + failures: failures.slice(0, 20), + protected: protectedRuns.slice(0, 20).map((item) => ({ + id: item.id ?? null, + runDir: item.runDir ?? null, + status: item.status ?? null, + ageHours: item.ageHours ?? null, + rawHuman: item.rawHuman ?? null, + reasons: item.reasons ?? null, + })), + disclosure: { + candidateRowsShown: Math.min(20, candidates.length), + deletedRowsShown: Math.min(20, deleted.length), + protectedRowsShown: Math.min(20, protectedRuns.length), + full: "rerun with --full or --raw for all candidates/artifacts", + }, + valuesRedacted: true, + }; +}

/**
 * Return the source text of the Node.js (CommonJS) program that plans or
 * performs the web-observe raw-artifact GC.
 *
 * The text is built with String.raw so regular-expression and "\n" escapes
 * reach the interpreter unchanged. The program reads its inputs from
 * process.argv: [2] state root, [3] keepHours, [4] limit, [5] "run" to delete
 * (anything else plans only), [6] node id, [7] lane. It prints exactly one
 * JSON report on stdout.
 *
 * A run directory (<root>/YYYY/MM/DD/*_webobs-<id>) becomes a deletion
 * candidate only when none of these protections apply: manifest missing,
 * recorded pid still alive, an open file descriptor inside the run, unknown
 * timestamp, inside the retention window, or no raw artifacts. Deletion
 * removes only the names listed in rawNames, never the run directory itself.
 *
 * Safety properties enforced by the script:
 *  - invalid stateRoot / keepHours / limit arguments abort with ok:false and
 *    exit code 2 before anything is scanned or deleted;
 *  - open-fd detection compares against the canonical (realpath) state root,
 *    because /proc/<pid>/fd links always resolve to canonical paths;
 *  - reclaimedBytes counts every artifact actually removed, including those
 *    removed before a later artifact of the same run failed.
 *
 * @returns the script source, starting and ending with a newline.
 */
function nodeWebObserveGcNodeScript(): string {
  return String.raw`
const fs = require("fs");
const path = require("path");
const childProcess = require("child_process");

const stateRoot = process.argv[2];
const keepHours = Number(process.argv[3]);
const limit = Number(process.argv[4]);
const mode = process.argv[5] === "run" ? "run" : "plan";
const nodeId = process.argv[6];
const lane = process.argv[7];
const keepMs = keepHours * 60 * 60 * 1000;
const nowMs = Date.now();
const rawNames = ["samples.jsonl", "browser-process.jsonl", "network.jsonl", "console.jsonl", "artifacts.jsonl", "screenshots"];

// Fail closed on unusable arguments. A NaN keepHours makes every
// "ageMs < keepMs" comparison false, which would silently drop the retention
// window; a negative limit makes slice(0, limit) select all but the last
// candidates instead of none; an empty stateRoot resolves to the current
// working directory.
function argumentErrors() {
  const errors = [];
  if (typeof stateRoot !== "string" || stateRoot.length === 0) errors.push("stateRoot must be a non-empty path");
  if (!(keepHours >= 0)) errors.push("keepHours must be a number >= 0");
  if (!(limit >= 0)) errors.push("limit must be a number >= 0");
  return errors;
}

function jsonRead(file) {
  try { return JSON.parse(fs.readFileSync(file, "utf8")); } catch { return null; }
}

function parseTime(value) {
  if (typeof value !== "string") return 0;
  const ms = Date.parse(value);
  return Number.isFinite(ms) ? ms : 0;
}

// Run directory names start with a compact UTC stamp: YYYYMMDDTHHMMSSZ_.
function dirTimeMs(runDir) {
  const base = path.basename(runDir);
  const match = base.match(/^(\d{8})T(\d{6})Z_/);
  if (!match) return 0;
  const raw = match[1].slice(0, 4) + "-" + match[1].slice(4, 6) + "-" + match[1].slice(6, 8) + "T" + match[2].slice(0, 2) + ":" + match[2].slice(2, 4) + ":" + match[2].slice(4, 6) + "Z";
  return Date.parse(raw) || 0;
}

// The most recent timestamp found anywhere decides the age, so a run that is
// still being updated is never treated as old.
function newestKnownTimeMs(runDir, manifest, heartbeat) {
  return Math.max(
    dirTimeMs(runDir),
    parseTime(manifest && manifest.completedAt),
    parseTime(manifest && manifest.updatedAt),
    parseTime(manifest && manifest.startedAt),
    parseTime(manifest && manifest.pageProvenance && manifest.pageProvenance.observedAt),
    parseTime(heartbeat && heartbeat.completedAt),
    parseTime(heartbeat && heartbeat.forceStoppedAt),
    parseTime(heartbeat && heartbeat.updatedAt),
    parseTime(heartbeat && heartbeat.pageProvenance && heartbeat.pageProvenance.observedAt),
  );
}

// Disk usage of a file or tree; symlinks are not followed or counted.
function sizePath(target) {
  let total = 0;
  const stack = [target];
  while (stack.length > 0) {
    const current = stack.pop();
    let st;
    try { st = fs.lstatSync(current); } catch { continue; }
    if (st.isSymbolicLink()) continue;
    total += st.blocks ? st.blocks * 512 : st.size;
    if (!st.isDirectory()) continue;
    let entries;
    try { entries = fs.readdirSync(current); } catch { continue; }
    for (const entry of entries) stack.push(path.join(current, entry));
  }
  return total;
}

function human(bytes) {
  const units = ["B", "KiB", "MiB", "GiB", "TiB"];
  let value = bytes;
  let unit = 0;
  while (value >= 1024 && unit < units.length - 1) { value /= 1024; unit += 1; }
  return value.toFixed(unit === 0 ? 0 : 1) + units[unit];
}

function disk() {
  try {
    const line = childProcess.execFileSync("df", ["-Pk", "/"], { encoding: "utf8" }).trim().split("\n").slice(-1)[0];
    const parts = line.trim().split(/\s+/);
    const sizeBytes = Number(parts[1]) * 1024;
    const usedBytes = Number(parts[2]) * 1024;
    const availableBytes = Number(parts[3]) * 1024;
    const usePercent = Number(String(parts[4]).replace("%", ""));
    return { filesystem: parts[0], sizeBytes, usedBytes, availableBytes, usePercent, usedHuman: human(usedBytes), availableHuman: human(availableBytes) };
  } catch (error) {
    return { error: error instanceof Error ? error.message : String(error) };
  }
}

function safeReadDir(dir) {
  try { return fs.readdirSync(dir, { withFileTypes: true }); } catch { return []; }
}

// Walk <root>/YYYY/MM/DD and collect directories named *_webobs-<id>.
function findRunDirs(root) {
  const runs = [];
  for (const year of safeReadDir(root)) {
    const y = path.join(root, year.name);
    if (!year.isDirectory() || !/^\d{4}$/.test(year.name)) continue;
    for (const month of safeReadDir(y)) {
      const m = path.join(y, month.name);
      if (!month.isDirectory() || !/^\d{2}$/.test(month.name)) continue;
      for (const day of safeReadDir(m)) {
        const d = path.join(m, day.name);
        if (!day.isDirectory() || !/^\d{2}$/.test(day.name)) continue;
        for (const run of safeReadDir(d)) {
          const r = path.join(d, run.name);
          if (run.isDirectory() && /_webobs-[A-Za-z0-9_.-]+$/.test(run.name)) runs.push(r);
        }
      }
    }
  }
  return runs;
}

function pidObserverAlive(pid, runAbs) {
  if (!Number.isInteger(pid) || pid <= 1) return { alive: false, reason: null };
  try {
    const cmdline = fs.readFileSync("/proc/" + pid + "/cmdline", "utf8").replace(/\0/g, " ");
    if (cmdline.includes("observer-runner") || cmdline.includes(runAbs)) return { alive: true, reason: "pid-alive" };
    return { alive: false, reason: null };
  } catch {
    // The process exists but its cmdline cannot be read: assume it is ours.
    if (fs.existsSync("/proc/" + pid)) return { alive: true, reason: "pid-alive-unreadable" };
    return { alive: false, reason: null };
  }
}

// Run directories that some process currently holds an open fd inside.
// /proc/<pid>/fd links resolve to canonical paths, so they are matched against
// the realpath of the state root and mapped back onto rootAbs, which is the
// prefix findRunDirs() uses for the paths this set is compared with.
function openRunDirs(rootAbs) {
  let realRoot = rootAbs;
  try { realRoot = fs.realpathSync(rootAbs); } catch { realRoot = rootAbs; }
  const active = new Set();
  const procEntries = safeReadDir("/proc").filter((entry) => entry.isDirectory() && /^\d+$/.test(entry.name));
  for (const proc of procEntries) {
    const fdDir = path.join("/proc", proc.name, "fd");
    for (const fd of safeReadDir(fdDir)) {
      let target;
      try { target = fs.readlinkSync(path.join(fdDir, fd.name)).replace(/ \(deleted\)$/, ""); } catch { continue; }
      if (!target.startsWith(realRoot + path.sep)) continue;
      const rel = path.relative(realRoot, target).split(path.sep);
      if (rel.length >= 4) active.add(path.join(rootAbs, rel[0], rel[1], rel[2], rel[3]));
    }
  }
  return active;
}

function rawArtifacts(runDir) {
  const artifacts = [];
  for (const name of rawNames) {
    const target = path.join(runDir, name);
    if (!fs.existsSync(target)) continue;
    const bytes = sizePath(target);
    if (bytes > 0) artifacts.push({ name, path: target, bytes, human: human(bytes) });
  }
  return artifacts;
}

function removeArtifact(artifact) {
  const st = fs.lstatSync(artifact.path);
  if (st.isSymbolicLink()) throw new Error("refusing symlink: " + artifact.path);
  if (st.isDirectory()) fs.rmSync(artifact.path, { recursive: true, force: false });
  else fs.unlinkSync(artifact.path);
}

function main() {
  const rootAbs = path.resolve(stateRoot);
  const diskBefore = disk();
  const openDirs = openRunDirs(rootAbs);
  const candidates = [];
  const protectedRuns = [];

  for (const runDir of findRunDirs(rootAbs)) {
    const manifest = jsonRead(path.join(runDir, "manifest.json"));
    const heartbeat = jsonRead(path.join(runDir, "heartbeat.json"));
    const pid = Number((heartbeat && heartbeat.pid) || (manifest && manifest.pid) || 0);
    const pidState = pidObserverAlive(pid, runDir);
    const hasOpenFd = openDirs.has(runDir);
    const newestMs = newestKnownTimeMs(runDir, manifest, heartbeat);
    const ageMs = newestMs > 0 ? nowMs - newestMs : 0;
    const artifacts = rawArtifacts(runDir);
    const rawBytes = artifacts.reduce((sum, item) => sum + item.bytes, 0);
    const rel = path.relative(rootAbs, runDir);
    const reasons = [];
    if (!manifest) reasons.push("manifest-missing");
    if (pidState.alive) reasons.push(pidState.reason || "pid-alive");
    if (hasOpenFd) reasons.push("open-fd");
    if (newestMs <= 0) reasons.push("time-unknown");
    if (ageMs < keepMs) reasons.push("retention-window");
    if (rawBytes <= 0) reasons.push("no-raw-artifacts");
    const summary = {
      id: (manifest && manifest.jobId) || path.basename(runDir).match(/(webobs-[A-Za-z0-9_.-]+)$/)?.[1] || null,
      runDir: rel,
      status: heartbeat && heartbeat.status || null,
      pid: Number.isInteger(pid) && pid > 0 ? pid : null,
      newestAt: newestMs > 0 ? new Date(newestMs).toISOString() : null,
      ageHours: newestMs > 0 ? Number((ageMs / 3600000).toFixed(2)) : null,
      rawBytes,
      rawHuman: human(rawBytes),
      artifacts,
    };
    if (reasons.length > 0) protectedRuns.push({ ...summary, reasons });
    else candidates.push(summary);
  }

  // Largest runs first, so a small limit reclaims the most space.
  candidates.sort((a, b) => b.rawBytes - a.rawBytes);
  const selected = candidates.slice(0, limit);
  const selectedSet = new Set(selected.map((item) => item.runDir));
  const deferred = candidates.filter((item) => !selectedSet.has(item.runDir));
  let reclaimedBytes = 0;
  const deleted = [];
  const failures = [];

  if (mode === "run") {
    for (const item of selected) {
      try {
        for (const artifact of item.artifacts) {
          removeArtifact(artifact);
          // Count per artifact so bytes freed before a later failure in the
          // same run are still reported.
          reclaimedBytes += artifact.bytes;
        }
        deleted.push({ id: item.id, runDir: item.runDir, rawBytes: item.rawBytes, rawHuman: item.rawHuman, artifactCount: item.artifacts.length });
      } catch (error) {
        failures.push({ id: item.id, runDir: item.runDir, error: error instanceof Error ? error.message : String(error) });
      }
    }
  }

  const estimatedReclaimBytes = selected.reduce((sum, item) => sum + item.rawBytes, 0);
  const diskAfter = mode === "run" ? disk() : null;
  console.log(JSON.stringify({
    ok: failures.length === 0,
    command: "web-probe observe gc",
    node: nodeId,
    lane,
    stateRoot,
    mode,
    mutation: mode === "run",
    keepHours,
    limit,
    diskBefore,
    diskAfter,
    scannedRuns: candidates.length + protectedRuns.length,
    candidateCount: candidates.length,
    selectedCount: selected.length,
    deferredCount: deferred.length,
    protectedCount: protectedRuns.length,
    estimatedReclaimBytes,
    estimatedReclaimHuman: human(estimatedReclaimBytes),
    reclaimedBytes,
    reclaimedHuman: human(reclaimedBytes),
    candidates: selected.map((item) => ({ id: item.id, runDir: item.runDir, status: item.status, ageHours: item.ageHours, rawBytes: item.rawBytes, rawHuman: item.rawHuman, artifactCount: item.artifacts.length, artifacts: item.artifacts.map((artifact) => ({ name: artifact.name, bytes: artifact.bytes, human: artifact.human })) })),
    deleted,
    failures,
    protected: protectedRuns.slice(0, 50).map((item) => ({ id: item.id, runDir: item.runDir, status: item.status, ageHours: item.ageHours, rawBytes: item.rawBytes, rawHuman: item.rawHuman, reasons: item.reasons })),
    valuesRedacted: true,
  }, null, 2));
}

const invalidArguments = argumentErrors();
if (invalidArguments.length > 0) {
  // Nothing is scanned or deleted; exitCode (not process.exit) lets stdout flush.
  process.exitCode = 2;
  console.log(JSON.stringify({
    ok: false,
    command: "web-probe observe gc",
    node: nodeId,
    lane,
    stateRoot,
    mode,
    mutation: false,
    keepHours,
    limit,
    failures: invalidArguments.map((error) => ({ id: null, runDir: null, error })),
    valuesRedacted: true,
  }, null, 2));
} else {
  main();
}
`;
}

export function runNodeWebProbeObserveStart( options: NodeWebProbeObserveOptions, spec: HwlabRuntimeLaneSpec, @@ -1343,6 +1711,7 @@ export function runNodeWebProbeObserveStart( `UNIDESK_WEB_OBSERVE_SCREENSHOT_INTERVAL_MS=${shellQuote(String(options.screenshotIntervalMs))}`, `UNIDESK_WEB_OBSERVE_OBSERVER_REFRESH_INTERVAL_MS=${shellQuote(String(options.observerRefreshIntervalMs))}`, `UNIDESK_WEB_OBSERVE_MAX_SAMPLES=${shellQuote(String(options.maxSamples))}`, + `UNIDESK_WEB_OBSERVE_MAX_RUN_MS=${shellQuote(String(options.maxRunSeconds > 0 ? 
options.maxRunSeconds * 1000 : 0))}`, `UNIDESK_WEB_OBSERVE_VIEWPORT=${shellQuote(options.viewport)}`, `UNIDESK_WEB_OBSERVE_BROWSER_PROXY_MODE=${shellQuote(options.browserProxyMode)}`, `UNIDESK_WEB_OBSERVE_ALERT_THRESHOLDS_JSON=${shellQuote(JSON.stringify(alertThresholds))}`,