Merge pull request #1365 from pikasTech/fix/1352-jd01-observe-gc

Fix JD01 web observe GC and legacy cleanup
2026-07-01 11:40:33 +08:00
parent 09edf61fcf 1bc2dd3d69
commit e969e9c49a
9 changed files with 1448 additions and 21 deletions
@@ -75,6 +75,15 @@ gc:
      codex-queue-stats-verify: .state/codex-queue-stats-verify
      codex-queue-perf: .state/codex-queue-perf
      tmp: .state/tmp
+  legacyDockerImages:
+    enabled: true
+    minAgeHours: 12
+    keepPerRepository: 2
+    repositories:
+      - 127.0.0.1:5000/hwlab/web-probe-sentinel-${nodeLower}
+  legacyDockerRegistryVolumes:
+    enabled: true
+    requireK8sRegistryReady: true
  codexSessions:
    enabled: false
    keepHours: 72
@@ -12,6 +12,10 @@ export function hwlabNodeHelp(): Record<string, unknown> {
    examples: [
      "bun scripts/cli.ts hwlab nodes control-plane infra plan --node D601 --lane v03",
      "bun scripts/cli.ts hwlab nodes control-plane status --node D601 --lane v03",
+      "bun scripts/cli.ts hwlab nodes control-plane cleanup-runs --node JD01 --lane v03 --min-age-minutes 30 --limit 200 --dry-run",
+      "bun scripts/cli.ts hwlab nodes control-plane cleanup-released-pvs --node JD01 --lane v03 --limit 200 --dry-run",
+      "bun scripts/cli.ts hwlab nodes control-plane cleanup-legacy-docker-images --node JD01 --lane v03 --dry-run",
+      "bun scripts/cli.ts hwlab nodes control-plane cleanup-legacy-docker-registry-volume --node JD01 --lane v03 --dry-run",
      "bun scripts/cli.ts hwlab nodes git-mirror status --node G14 --lane v03",
      "bun scripts/cli.ts hwlab nodes hwpod-preinstall plan --node D601 --lane v03 --dry-run",
      "bun scripts/cli.ts hwlab nodes fake-model-provider plan --node D518 --lane v03 --provider fake-echo",
@@ -21,7 +25,7 @@ export function hwlabNodeHelp(): Record<string, unknown> {
      "bun scripts/cli.ts web-probe --help",
    ],
    actions: {
-      "control-plane": "YAML-first node-local CI/CD, git-mirror, public exposure, runtime-image, Argo and PipelineRun operations.",
+      "control-plane": "YAML-first node-local CI/CD, git-mirror, public exposure, runtime-image, Argo, PipelineRun and CI workspace retention operations.",
      "git-mirror": "Inspect or operate the selected node/lane source mirror.",
      "hwpod-preinstall": "Render YAML-first HWPOD preinstall configRefs, runtime mount targets, PM MDTODO source, and gateway profile status.",
      "fake-model-provider": "Materialize and operate YAML-declared fake Responses model providers for HWLAB/AgentRun sentinel checks.",
@@ -32,6 +36,9 @@ export function hwlabNodeHelp(): Record<string, unknown> {
    notes: [
      "`web-probe` is no longer under `hwlab nodes`; use `bun scripts/cli.ts web-probe ...`.",
      "`--node` and `--lane` remain data arguments resolved from YAML for node/lane operations.",
+      "control-plane cleanup-runs deletes terminal PipelineRuns; cleanup-released-pvs deletes only orphaned hwlab-ci PipelineRun PVCs with no active pod mounts and Released local-path/Delete PVs.",
+      "cleanup-legacy-docker-images is a transitional legacy-cache GC for Docker images matching YAML allowlisted repositories; it protects container-referenced images, does not run prune, and does not touch Docker volumes.",
+      "cleanup-legacy-docker-registry-volume removes only an exited legacy Docker registry container and its unique /var/lib/registry volume after the YAML-declared k8s node-local-registry is ready.",
      "`trigger-current --confirm --wait` is the one-command CICD path for current node/lane runtime publish.",
    ],
  };
@@ -63,6 +70,8 @@ export function hwlabNodeWebProbeHelp(): Record<string, unknown> {
      "bun scripts/cli.ts web-probe observe collect webobs-xxxx --view timeline --command-id cmd-xxxx",
      "bun scripts/cli.ts web-probe observe collect webobs-xxxx --view project-mdtodo-summary",
      "bun scripts/cli.ts web-probe observe analyze webobs-xxxx",
+      "bun scripts/cli.ts web-probe observe gc --node JD01 --lane v03 --dry-run",
+      "bun scripts/cli.ts web-probe observe gc --node JD01 --lane v03 --keep-hours 24 --confirm",
      "bun scripts/cli.ts web-probe sentinel plan --node D601 --lane v03 --dry-run",
      "bun scripts/cli.ts web-probe sentinel plan --node D601 --lane v03 --sentinel workbench-auth-session-switch-2users",
      "bun scripts/cli.ts web-probe sentinel publish-current --node JD01 --lane v03 --sentinel jd01-web-probe-sentinel --confirm --wait",
@@ -77,7 +86,7 @@ export function hwlabNodeWebProbeHelp(): Record<string, unknown> {
      "opencode-smoke": "Run the repo-owned OpenCode iframe/direct-host composer smoke and require DOM assistant text plus EventSource update/finish/idle evidence.",
      script: "Run caller-provided Playwright JS after CLI-managed /auth/login; scripts must not handle secrets themselves.",
      screenshot: "Capture a no-auth or public page through the selected node/lane remote browser and download PNG artifacts to the caller /tmp by default.",
-      observe: "Start, inspect, control, stop, collect, and analyze a long-running observer that writes JSONL artifacts.",
+      observe: "Start, inspect, control, stop, collect, analyze, and garbage-collect raw artifacts for long-running observers.",
      sentinel: "Render and operate the YAML-first web-probe sentinel wrapper, one-click publish, image, GitOps, dashboard verification, maintenance and report views.",
    },
    notes: [
@@ -85,6 +94,7 @@ export function hwlabNodeWebProbeHelp(): Record<string, unknown> {
      "`web-probe script` is an ad-hoc exploration escape hatch; repeated/high-frequency workflows must become `web-probe observe command` types or repo-owned web-probe commands.",
      "`web-probe opencode-smoke` is the repo-owned OpenCode smoke; prefer it over repeating one-off OpenCode Playwright snippets.",
      "observe is passive by default; user actions must be explicit observe command entries in control.jsonl.",
+      "observe gc keeps manifest, heartbeat, control/error logs and analysis reports, and only removes dead-run raw samples/browser/network/screenshot artifacts after YAML-configured retention.",
      "After observe start, prefer observe status|command|stop|collect|analyze <id> instead of repeating --node/--lane/--state-dir.",
      "collect views render bounded summaries from existing artifacts and do not create a second source of truth.",
      "analyze is offline-only: it reads artifact JSONL and writes analysis/report.md plus analysis/report.json.",
@@ -24,6 +24,7 @@ const screenshotIntervalMs = positiveInteger(process.env.UNIDESK_WEB_OBSERVE_SCR
 const screenshotCaptureTimeoutMs = boundedInteger(process.env.UNIDESK_WEB_OBSERVE_SCREENSHOT_CAPTURE_TIMEOUT_MS, 15000, 1000, 120000);
 const maxSamples = positiveInteger(process.env.UNIDESK_WEB_OBSERVE_MAX_SAMPLES, 0);
 const observerRefreshIntervalMs = positiveInteger(process.env.UNIDESK_WEB_OBSERVE_OBSERVER_REFRESH_INTERVAL_MS, 180000);
+const maxRunMs = positiveInteger(process.env.UNIDESK_WEB_OBSERVE_MAX_RUN_MS, 0);
 const viewport = parseViewport(process.env.UNIDESK_WEB_OBSERVE_VIEWPORT || "1440x900");
 const browserProxyMode = parseBrowserProxyMode(process.env.UNIDESK_WEB_OBSERVE_BROWSER_PROXY_MODE || "auto");
 const authLoginMaxAttempts = boundedInteger(process.env.UNIDESK_WEB_OBSERVE_AUTH_LOGIN_MAX_ATTEMPTS, 6, 1, 20);
@@ -128,6 +129,10 @@ try {
      await appendJsonl(files.control, controlRecord({ id: "max-samples", type: "stop", source: "sampler" }, "completed", { reason: "max-samples", maxSamples }));
      break;
    }
+    if (maxRunMs > 0 && Date.now() - startedAtMs >= maxRunMs) {
+      await appendJsonl(files.control, controlRecord({ id: "max-run-ms", type: "stop", source: "sampler" }, "completed", { reason: "max-run-ms", maxRunMs, elapsedMs: Date.now() - startedAtMs }));
+      break;
+    }
    await sleep(sampleIntervalMs);
  }
  if (browserFreezeBlocker) {
@@ -114,6 +114,7 @@ export function runSentinelQuickVerify(state: SentinelCicdState, reason: string,
    "--target-path", stringAt(scenario, "observeTargetPath"),
    "--sample-interval-ms", String(sampleIntervalMs),
    "--screenshot-interval-ms", String(numberAt(scenario, "screenshotIntervalMs")),
+    "--max-run-seconds", String(hardBudgetSeconds),
    "--command-timeout-seconds", "55",
  ];
  const viewport = stringAtNullable(scenario, "viewport");
@@ -298,7 +299,12 @@ export function runSentinelQuickVerify(state: SentinelCicdState, reason: string,
  const controlFindings = quickVerifyControlFindings(null, promptIndex, turnSummary, traceFrame);
  const artifactSummaryRecord = record(artifactSummary);
  const artifactFindings = Array.isArray(artifactSummaryRecord.findings) ? artifactSummaryRecord.findings.map(record) : [];
-  const findings = mergeFindingRecords(artifactFindings, controlFindings);
+  const cleanupStep = stopQuickVerifyObserver(state, observerId, "observe-stop-after-terminal");
+  const cleanupFindings = quickVerifyCleanupFindings(cleanupStep);
+  const findings = mergeFindingRecords(
+    mergeFindingRecords(artifactFindings, controlFindings),
+    cleanupFindings,
+  );
  const blockingFindings = findings.filter(isQuickVerifyBlockingFinding);
  const analysisWarnings = analysis.ok ? [] : ["quick verify analyze command returned non-zero but a readable analysis artifact was produced; targetValidation is using artifact severity plus control blockers."];
  const ok = record(artifactSummary).ok === true && controlFindings.length === 0 && blockingFindings.length === 0;
@@ -320,7 +326,7 @@ export function runSentinelQuickVerify(state: SentinelCicdState, reason: string,
    failure: controlFindings.length > 0 ? "quick-verify-no-business-turn" : blockingFindings.length > 0 ? "quick-verify-blocking-findings" : null,
    promptSource: prompts.summary,
    accountEnv: accountEnv.summary,
-    steps,
+    steps: [...steps, cleanupStep],
    analysis: artifactSummary,
    views: {
      summary: { renderedText: renderQuickVerifySummary({ runId, scenarioId, observerId, artifactSummary, steps, publicOrigin: stringAt(state.publicExposure, "publicBaseUrl") }) },
@@ -331,7 +337,7 @@ export function runSentinelQuickVerify(state: SentinelCicdState, reason: string,
    findings,
    screenshot: record(artifactSummary).screenshot,
    publicOrigin: stringAt(state.publicExposure, "publicBaseUrl"),
-    warnings: mergeWarnings(analysisWarnings, nonBlockingCanaryWarnings, elapsedWarnings()),
+    warnings: mergeWarnings(analysisWarnings, nonBlockingCanaryWarnings, cleanupFindings.length > 0 ? ["quick verify observer stop failed; runner lifecycle cleanup is a blocking finding."] : [], elapsedWarnings()),
    valuesRedacted: true,
  });
 }
@@ -466,14 +472,7 @@ function finalizeQuickVerifyFailure(state: SentinelCicdState, input: {
    ], 60);
    cleanupSteps.push({ phase: "observe-cancel-after-failure", ok: cancel.ok, result: cancel.result });
  }
-  const stop = runChildCli([
-    "web-probe", "observe", "stop", input.observerId,
-    "--node", state.spec.nodeId,
-    "--lane", state.spec.lane,
-    "--force",
-    "--command-timeout-seconds", "55",
-  ], 30);
-  cleanupSteps.push({ phase: "observe-stop-after-failure", ok: stop.ok, result: stop.result });
+  cleanupSteps.push(stopQuickVerifyObserver(state, input.observerId, "observe-stop-after-failure"));
  const analysis = runChildCli([
    "web-probe", "observe", "analyze", input.observerId,
    "--node", state.spec.nodeId,
@@ -534,6 +533,30 @@ function finalizeQuickVerifyFailure(state: SentinelCicdState, input: {
  };
 }

+function stopQuickVerifyObserver(state: SentinelCicdState, observerId: string, phase: string): Record<string, unknown> {
+  const stop = runChildCli([
+    "web-probe", "observe", "stop", observerId,
+    "--node", state.spec.nodeId,
+    "--lane", state.spec.lane,
+    "--force",
+    "--wait-ms", "55000",
+    "--command-timeout-seconds", "55",
+  ], 60);
+  return { phase, ok: stop.ok, result: stop.result };
+}
+
+function quickVerifyCleanupFindings(cleanupStep: Record<string, unknown>): Record<string, unknown>[] {
+  if (cleanupStep.ok === true) return [];
+  return [{
+    id: "quick-verify-observer-stop-failed",
+    severity: "red",
+    count: 1,
+    summary: "quick verify completed target analysis but failed to stop its observer runner; this can leak Chrome process trees on cadence runs.",
+    cleanupPhase: cleanupStep.phase,
+    valuesRedacted: true,
+  }];
+}
+
 function callSentinelService(state: SentinelCicdState, method: "GET" | "POST", pathWithQuery: string, body: Record<string, unknown> | null, timeoutSeconds: number): Record<string, unknown> {
  const namespace = stringAt(state.runtime, "namespace");
  const serviceName = stringAt(state.runtime, "serviceName");
@@ -526,6 +526,7 @@ function buildObserveCommandPlan(config: WebProbeSentinelServiceConfig, scenario
    "--target-path", targetPath,
    "--sample-interval-ms", String(numberAt(scenario, "sampleIntervalMs")),
    "--screenshot-interval-ms", String(numberAt(scenario, "screenshotIntervalMs")),
+    "--max-run-seconds", String(numberAt(scenario, "maxRunSeconds")),
    "--command-timeout-seconds", "55",
  ];
  const viewport = stringOrNull(scenario.viewport);
@@ -562,7 +563,17 @@ function buildObserveCommandPlan(config: WebProbeSentinelServiceConfig, scenario
    argv: ["bun", "scripts/cli.ts", "web-probe", "observe", "analyze", "<observerId>"],
    stdinSource: "none",
  };
-  return [start, ...commands, analyze];
+  const stop: CommandPlanStep = {
+    phase: "observe-stop",
+    argv: ["bun", "scripts/cli.ts", "web-probe", "observe", "stop", "<observerId>", "--node", config.node, "--lane", config.lane, "--force", "--wait-ms", "55000", "--command-timeout-seconds", "55"],
+    stdinSource: "none",
+  };
+  const gc: CommandPlanStep = {
+    phase: "observe-gc",
+    argv: ["bun", "scripts/cli.ts", "web-probe", "observe", "gc", "--node", config.node, "--lane", config.lane, "--confirm", "--command-timeout-seconds", "55"],
+    stdinSource: "none",
+  };
+  return [start, ...commands, analyze, stop, gc];
 }

 function inlinePromptText(item: Record<string, unknown>): string | null {
@@ -116,7 +116,7 @@ export interface NodeWebProbeScreenshotOptions {
  commandTimeoutSeconds: number;
 }

-export type NodeWebProbeObserveAction = "start" | "status" | "command" | "stop" | "collect" | "analyze";
+export type NodeWebProbeObserveAction = "start" | "status" | "command" | "stop" | "collect" | "analyze" | "gc";

 export type NodeWebProbeObserveCommandType =
  | "login"
@@ -176,10 +176,15 @@ export interface NodeWebProbeObserveOptions {
  screenshotIntervalMs: number;
  observerRefreshIntervalMs: number;
  maxSamples: number;
+  maxRunSeconds: number;
  commandTimeoutSeconds: number;
  waitMs: number;
  tailLines: number;
  maxFiles: number;
+  gcKeepHours: number;
+  gcLimit: number;
+  confirm: boolean;
+  dryRun: boolean;
  collectView: NodeWebProbeObserveCollectView;
  collectFile: string | null;
  collectFinding: string | null;
@@ -612,7 +617,7 @@ export async function runNodeDelegatedDomain(config: Config, domain: DelegatedNo
      if (scoped.originalArgs.includes("--full")) return withNodeRuntimeControlPlaneStatusFullRendered(result, scoped);
      return withNodeRuntimeControlPlaneStatusRendered(result, scoped);
    }
-    if (scoped.action === "apply" || scoped.action === "trigger-current" || scoped.action === "refresh" || scoped.action === "sync" || scoped.action === "runtime-migration" || scoped.action === "cleanup-runs") {
+    if (scoped.action === "apply" || scoped.action === "trigger-current" || scoped.action === "refresh" || scoped.action === "sync" || scoped.action === "runtime-migration" || scoped.action === "cleanup-runs" || scoped.action === "cleanup-released-pvs" || scoped.action === "cleanup-legacy-docker-images" || scoped.action === "cleanup-legacy-docker-registry-volume") {
      if (scoped.confirm && !scoped.dryRun && !scoped.wait) return startNodeDelegatedJob(scoped);
      return nodeRuntimeControlPlaneRun(scoped);
    }
@@ -209,7 +209,7 @@ export function nodeRuntimeUnsupportedAction(scoped: ReturnType<typeof parseNode
    lane: scoped.lane,
    mutation: false,
    degradedReason: "unsupported-node-scoped-runtime-action",
-    message: "node-scoped runtime currently supports plan/status/apply/refresh/sync/trigger-current/cleanup-runs/runtime-image/runtime-migration/source-workspace",
+    message: "node-scoped runtime currently supports plan/status/apply/refresh/sync/trigger-current/cleanup-runs/cleanup-released-pvs/cleanup-legacy-docker-images/cleanup-legacy-docker-registry-volume/runtime-image/runtime-migration/source-workspace",
    expected: nodeRuntimeExpected(scoped.spec),
  };
 }
@@ -219,8 +219,9 @@ export function parseNodeWebProbeObserveOptions(
    && observeActionRaw !== "stop"
    && observeActionRaw !== "collect"
    && observeActionRaw !== "analyze"
+    && observeActionRaw !== "gc"
  ) {
-    throw new Error("web-probe observe usage: observe start --node NODE --lane vNN [...]; observe status|command|stop|collect|analyze <id> [...]");
+    throw new Error("web-probe observe usage: observe start --node NODE --lane vNN [...]; observe status|command|stop|collect|analyze <id> [...]; observe gc --node NODE --lane vNN [--confirm]");
  }
  assertKnownOptions(args.slice(1), new Set([
    "--node",
@@ -233,6 +234,9 @@ export function parseNodeWebProbeObserveOptions(
    "--screenshot-interval-ms",
    "--observer-refresh-interval-ms",
    "--max-samples",
+    "--max-run-seconds",
+    "--keep-hours",
+    "--limit",
    "--command-timeout-seconds",
    "--wait-ms",
    "--tail-lines",
@@ -284,7 +288,7 @@ export function parseNodeWebProbeObserveOptions(
    "--workspace-root",
    "--workspace-root-ref",
    "--root",
-  ]), new Set(["--force", "--full", "--raw", "--text-stdin", "--require-composer-ready", "--wait-project-management-ready", "--blocking", "--non-blocking"]));
+  ]), new Set(["--force", "--full", "--raw", "--text-stdin", "--require-composer-ready", "--wait-project-management-ready", "--blocking", "--non-blocking", "--dry-run", "--confirm"]));
  const commandTypeRaw = optionValue(args, "--type") ?? null;
  const commandType = commandTypeRaw === null ? null : parseNodeWebProbeObserveCommandType(commandTypeRaw);
  const stateDir = optionValue(args, "--state-dir") ?? indexed?.stateDir ?? null;
@@ -320,9 +324,12 @@ export function parseNodeWebProbeObserveOptions(
  if (analyzeTailSamples !== null && (!Number.isInteger(analyzeTailSamples) || analyzeTailSamples < 0)) {
    throw new Error("unsafe web-probe observe --tail-samples: expected a non-negative integer; use 0 for all samples");
  }
-  if (observeActionRaw !== "start" && stateDir === null && jobId === null) {
+  if (observeActionRaw !== "start" && observeActionRaw !== "gc" && stateDir === null && jobId === null) {
    throw new Error("web-probe observe status|command|stop|collect|analyze requires --state-dir or --job-id");
  }
+  const confirm = args.includes("--confirm");
+  const dryRun = args.includes("--dry-run") || !confirm;
+  if (confirm && args.includes("--dry-run")) throw new Error("web-probe observe gc accepts only one of --confirm or --dry-run");
  const commandTextOption = optionValue(args, "--text") ?? null;
  const commandTextFromStdin = args.includes("--text-stdin");
  if (commandTextFromStdin && observeActionRaw !== "command") {
@@ -403,10 +410,15 @@ export function parseNodeWebProbeObserveOptions(
    screenshotIntervalMs: positiveIntegerOption(args, "--screenshot-interval-ms", 300000, 86_400_000),
    observerRefreshIntervalMs: positiveIntegerOption(args, "--observer-refresh-interval-ms", 180000, 86_400_000),
    maxSamples: positiveIntegerOption(args, "--max-samples", 0, 10_000_000),
+    maxRunSeconds: positiveIntegerOption(args, "--max-run-seconds", 0, 86_400),
    commandTimeoutSeconds: positiveIntegerOption(args, "--command-timeout-seconds", 55, 3600),
    waitMs: positiveIntegerOption(args, "--wait-ms", 0, 600000),
    tailLines: positiveIntegerOption(args, "--tail-lines", 5, 200),
    maxFiles: positiveIntegerOption(args, "--max-files", 80, 5000),
+    gcKeepHours: positiveIntegerOption(args, "--keep-hours", webObserveGcDefaultKeepHours(), 8760),
+    gcLimit: positiveIntegerOption(args, "--limit", 200, 5000),
+    confirm,
+    dryRun,
    collectView,
    collectFile,
    collectFinding,
@@ -555,6 +567,18 @@ export function assertKnownOptions(args: string[], valueOptions: Set<string>, fl
  }
 }

+function webObserveGcDefaultKeepHours(): number {
+  const configPath = "config/unidesk-cli.yaml";
+  const parsed = record(Bun.YAML.parse(readFileSync(rootPath(configPath), "utf8")) as unknown);
+  const gc = record(parsed.gc);
+  const scratch = record(gc.stateStaleScratch);
+  const keepHours = scratch.keepHours;
+  if (!Number.isInteger(keepHours) || keepHours < 0) {
+    throw new Error(`${configPath}#gc.stateStaleScratch.keepHours must be a non-negative integer`);
+  }
+  return keepHours;
+}
+
 export function runNodeWebProbe(options: NodeWebProbeOptions): Record<string, unknown> | RenderedCliResult {
  const lane = options.lane;
  if (!isHwlabRuntimeLane(lane)) throw new Error(`web-probe only supports HWLAB runtime lanes, got ${lane}`);
@@ -1305,9 +1329,353 @@ export function runNodeWebProbeObserve(
  if (options.observeAction === "command") return runNodeWebProbeObserveCommand(options, spec, false);
  if (options.observeAction === "stop") return runNodeWebProbeObserveCommand({ ...options, commandType: "stop" }, spec, true);
  if (options.observeAction === "collect") return runNodeWebProbeObserveCollect(options, spec);
+  if (options.observeAction === "gc") return runNodeWebProbeObserveGc(options, spec);
  return runNodeWebProbeObserveAnalyze(options, spec);
 }

+export function runNodeWebProbeObserveGc(options: NodeWebProbeObserveOptions, spec: HwlabRuntimeLaneSpec): Record<string, unknown> {
+  const stateRoot = `.state/web-observe/${safeWebObserveSegment(options.node)}/${safeWebObserveSegment(options.lane)}`;
+  const script = [
+    "set -eu",
+    `node - ${shellQuote(stateRoot)} ${shellQuote(String(options.gcKeepHours))} ${shellQuote(String(options.gcLimit))} ${shellQuote(options.confirm ? "run" : "plan")} ${shellQuote(options.node)} ${shellQuote(options.lane)} <<'UNIDESK_WEB_OBSERVE_GC'`,
+    nodeWebObserveGcNodeScript(),
+    "UNIDESK_WEB_OBSERVE_GC",
+  ].join("\n");
+  const result = runTransWorkspaceStdinScript(options.node, spec.workspace, script, options.commandTimeoutSeconds);
+  const payload = parseJsonObject(result.stdout);
+  const ok = result.exitCode === 0 && payload.ok !== false;
+  return {
+    ok,
+    status: ok ? (options.confirm ? "cleaned" : "planned") : "blocked",
+    command: `web-probe observe gc --node ${options.node} --lane ${options.lane}`,
+    node: options.node,
+    lane: options.lane,
+    workspace: spec.workspace,
+    stateRoot,
+    mode: options.confirm ? "confirmed-run" : "dry-run",
+    retention: {
+      source: "config/unidesk-cli.yaml#gc.stateStaleScratch.keepHours",
+      keepHours: options.gcKeepHours,
+      userOverride: options.gcKeepHours !== webObserveGcDefaultKeepHours(),
+    },
+    gc: options.full || options.raw ? payload : compactWebObserveGcPayload(payload),
+    result: ok
+      ? {
+        exitCode: result.exitCode,
+        timedOut: result.timedOut,
+        stdoutBytes: result.stdout.length,
+        stderrBytes: result.stderr.length,
+      }
+      : compactCommandResultWithStdoutTail(result),
+    valuesRedacted: true,
+  };
+}
+
+function compactWebObserveGcPayload(payload: Record<string, unknown>): Record<string, unknown> {
+  const candidates = Array.isArray(payload.candidates) ? payload.candidates.map(record) : [];
+  const protectedRuns = Array.isArray(payload.protected) ? payload.protected.map(record) : [];
+  const deleted = Array.isArray(payload.deleted) ? payload.deleted.map(record) : [];
+  const failures = Array.isArray(payload.failures) ? payload.failures.map(record) : [];
+  return {
+    ok: payload.ok ?? null,
+    mode: payload.mode ?? null,
+    mutation: payload.mutation ?? null,
+    keepHours: payload.keepHours ?? null,
+    limit: payload.limit ?? null,
+    diskBefore: payload.diskBefore ?? null,
+    diskAfter: payload.diskAfter ?? null,
+    scannedRuns: payload.scannedRuns ?? null,
+    candidateCount: payload.candidateCount ?? null,
+    selectedCount: payload.selectedCount ?? null,
+    deferredCount: payload.deferredCount ?? null,
+    protectedCount: payload.protectedCount ?? null,
+    estimatedReclaimBytes: payload.estimatedReclaimBytes ?? null,
+    estimatedReclaimHuman: payload.estimatedReclaimHuman ?? null,
+    reclaimedBytes: payload.reclaimedBytes ?? null,
+    reclaimedHuman: payload.reclaimedHuman ?? null,
+    candidates: candidates.slice(0, 20).map((item) => ({
+      id: item.id ?? null,
+      runDir: item.runDir ?? null,
+      status: item.status ?? null,
+      ageHours: item.ageHours ?? null,
+      rawHuman: item.rawHuman ?? null,
+      artifactCount: item.artifactCount ?? null,
+    })),
+    deleted: deleted.slice(0, 20).map((item) => ({
+      id: item.id ?? null,
+      runDir: item.runDir ?? null,
+      rawHuman: item.rawHuman ?? null,
+      artifactCount: item.artifactCount ?? null,
+    })),
+    failures: failures.slice(0, 20),
+    protected: protectedRuns.slice(0, 20).map((item) => ({
+      id: item.id ?? null,
+      runDir: item.runDir ?? null,
+      status: item.status ?? null,
+      ageHours: item.ageHours ?? null,
+      rawHuman: item.rawHuman ?? null,
+      reasons: item.reasons ?? null,
+    })),
+    disclosure: {
+      candidateRowsShown: Math.min(20, candidates.length),
+      deletedRowsShown: Math.min(20, deleted.length),
+      protectedRowsShown: Math.min(20, protectedRuns.length),
+      full: "rerun with --full or --raw for all candidates/artifacts",
+    },
+    valuesRedacted: true,
+  };
+}
+
+function nodeWebObserveGcNodeScript(): string {
+  return String.raw`
+const fs = require("fs");
+const path = require("path");
+const childProcess = require("child_process");
+
+const stateRoot = process.argv[2];
+const keepHours = Number(process.argv[3]);
+const limit = Number(process.argv[4]);
+const mode = process.argv[5] === "run" ? "run" : "plan";
+const nodeId = process.argv[6];
+const lane = process.argv[7];
+const keepMs = keepHours * 60 * 60 * 1000;
+const nowMs = Date.now();
+const rawNames = ["samples.jsonl", "browser-process.jsonl", "network.jsonl", "console.jsonl", "artifacts.jsonl", "screenshots"];
+
+function jsonRead(file) {
+  try { return JSON.parse(fs.readFileSync(file, "utf8")); } catch { return null; }
+}
+
+function parseTime(value) {
+  if (typeof value !== "string") return 0;
+  const ms = Date.parse(value);
+  return Number.isFinite(ms) ? ms : 0;
+}
+
+function dirTimeMs(runDir) {
+  const base = path.basename(runDir);
+  const match = base.match(/^(\d{8})T(\d{6})Z_/);
+  if (!match) return 0;
+  const raw = match[1].slice(0, 4) + "-" + match[1].slice(4, 6) + "-" + match[1].slice(6, 8) + "T" + match[2].slice(0, 2) + ":" + match[2].slice(2, 4) + ":" + match[2].slice(4, 6) + "Z";
+  return Date.parse(raw) || 0;
+}
+
+function newestKnownTimeMs(runDir, manifest, heartbeat) {
+  return Math.max(
+    dirTimeMs(runDir),
+    parseTime(manifest && manifest.completedAt),
+    parseTime(manifest && manifest.updatedAt),
+    parseTime(manifest && manifest.startedAt),
+    parseTime(manifest && manifest.pageProvenance && manifest.pageProvenance.observedAt),
+    parseTime(heartbeat && heartbeat.completedAt),
+    parseTime(heartbeat && heartbeat.forceStoppedAt),
+    parseTime(heartbeat && heartbeat.updatedAt),
+    parseTime(heartbeat && heartbeat.pageProvenance && heartbeat.pageProvenance.observedAt),
+  );
+}
+
+function sizePath(target) {
+  let total = 0;
+  const stack = [target];
+  while (stack.length > 0) {
+    const current = stack.pop();
+    let st;
+    try { st = fs.lstatSync(current); } catch { continue; }
+    if (st.isSymbolicLink()) continue;
+    total += st.blocks ? st.blocks * 512 : st.size;
+    if (!st.isDirectory()) continue;
+    let entries;
+    try { entries = fs.readdirSync(current); } catch { continue; }
+    for (const entry of entries) stack.push(path.join(current, entry));
+  }
+  return total;
+}
+
+function human(bytes) {
+  const units = ["B", "KiB", "MiB", "GiB", "TiB"];
+  let value = bytes;
+  let unit = 0;
+  while (value >= 1024 && unit < units.length - 1) { value /= 1024; unit += 1; }
+  return value.toFixed(unit === 0 ? 0 : 1) + units[unit];
+}
+
+function disk() {
+  try {
+    const line = childProcess.execFileSync("df", ["-Pk", "/"], { encoding: "utf8" }).trim().split("\n").slice(-1)[0];
+    const parts = line.trim().split(/\s+/);
+    const sizeBytes = Number(parts[1]) * 1024;
+    const usedBytes = Number(parts[2]) * 1024;
+    const availableBytes = Number(parts[3]) * 1024;
+    const usePercent = Number(String(parts[4]).replace("%", ""));
+    return { filesystem: parts[0], sizeBytes, usedBytes, availableBytes, usePercent, usedHuman: human(usedBytes), availableHuman: human(availableBytes) };
+  } catch (error) {
+    return { error: error instanceof Error ? error.message : String(error) };
+  }
+}
+
+function findRunDirs(root) {
+  const runs = [];
+  for (const year of safeReadDir(root)) {
+    const y = path.join(root, year.name);
+    if (!year.isDirectory() || !/^\d{4}$/.test(year.name)) continue;
+    for (const month of safeReadDir(y)) {
+      const m = path.join(y, month.name);
+      if (!month.isDirectory() || !/^\d{2}$/.test(month.name)) continue;
+      for (const day of safeReadDir(m)) {
+        const d = path.join(m, day.name);
+        if (!day.isDirectory() || !/^\d{2}$/.test(day.name)) continue;
+        for (const run of safeReadDir(d)) {
+          const r = path.join(d, run.name);
+          if (run.isDirectory() && /_webobs-[A-Za-z0-9_.-]+$/.test(run.name)) runs.push(r);
+        }
+      }
+    }
+  }
+  return runs;
+}
+
+function safeReadDir(dir) {
+  try { return fs.readdirSync(dir, { withFileTypes: true }); } catch { return []; }
+}
+
+function pidObserverAlive(pid, runAbs) {
+  if (!Number.isInteger(pid) || pid <= 1) return { alive: false, reason: null };
+  try {
+    const cmdline = fs.readFileSync("/proc/" + pid + "/cmdline", "utf8").replace(/\0/g, " ");
+    if (cmdline.includes("observer-runner") || cmdline.includes(runAbs)) return { alive: true, reason: "pid-alive" };
+    return { alive: false, reason: null };
+  } catch (error) {
+    if (fs.existsSync("/proc/" + pid)) return { alive: true, reason: "pid-alive-unreadable" };
+    return { alive: false, reason: null };
+  }
+}
+
+function openRunDirs(rootAbs) {
+  const active = new Set();
+  const procEntries = safeReadDir("/proc").filter((entry) => entry.isDirectory() && /^\d+$/.test(entry.name));
+  for (const proc of procEntries) {
+    const fdDir = path.join("/proc", proc.name, "fd");
+    for (const fd of safeReadDir(fdDir)) {
+      let target;
+      try { target = fs.readlinkSync(path.join(fdDir, fd.name)).replace(/ \(deleted\)$/, ""); } catch { continue; }
+      if (!target.startsWith(rootAbs + path.sep)) continue;
+      const rel = path.relative(rootAbs, target).split(path.sep);
+      if (rel.length >= 4) active.add(path.join(rootAbs, rel[0], rel[1], rel[2], rel[3]));
+    }
+  }
+  return active;
+}
+
+function rawArtifacts(runDir) {
+  const artifacts = [];
+  for (const name of rawNames) {
+    const target = path.join(runDir, name);
+    if (!fs.existsSync(target)) continue;
+    const bytes = sizePath(target);
+    if (bytes > 0) artifacts.push({ name, path: target, bytes, human: human(bytes) });
+  }
+  return artifacts;
+}
+
+function removeArtifact(artifact) {
+  const st = fs.lstatSync(artifact.path);
+  if (st.isSymbolicLink()) throw new Error("refusing symlink: " + artifact.path);
+  if (st.isDirectory()) fs.rmSync(artifact.path, { recursive: true, force: false });
+  else fs.unlinkSync(artifact.path);
+}
+
+const rootAbs = path.resolve(stateRoot);
+const diskBefore = disk();
+const openDirs = openRunDirs(rootAbs);
+const candidates = [];
+const protectedRuns = [];
+
+for (const runDir of findRunDirs(rootAbs)) {
+  const manifest = jsonRead(path.join(runDir, "manifest.json"));
+  const heartbeat = jsonRead(path.join(runDir, "heartbeat.json"));
+  const pid = Number((heartbeat && heartbeat.pid) || (manifest && manifest.pid) || 0);
+  const pidState = pidObserverAlive(pid, runDir);
+  const hasOpenFd = openDirs.has(runDir);
+  const newestMs = newestKnownTimeMs(runDir, manifest, heartbeat);
+  const ageMs = newestMs > 0 ? nowMs - newestMs : 0;
+  const artifacts = rawArtifacts(runDir);
+  const rawBytes = artifacts.reduce((sum, item) => sum + item.bytes, 0);
+  const rel = path.relative(rootAbs, runDir);
+  const reasons = [];
+  if (!manifest) reasons.push("manifest-missing");
+  if (pidState.alive) reasons.push(pidState.reason || "pid-alive");
+  if (hasOpenFd) reasons.push("open-fd");
+  if (newestMs <= 0) reasons.push("time-unknown");
+  if (ageMs < keepMs) reasons.push("retention-window");
+  if (rawBytes <= 0) reasons.push("no-raw-artifacts");
+  const summary = {
+    id: (manifest && manifest.jobId) || path.basename(runDir).match(/(webobs-[A-Za-z0-9_.-]+)$/)?.[1] || null,
+    runDir: rel,
+    status: heartbeat && heartbeat.status || null,
+    pid: Number.isInteger(pid) && pid > 0 ? pid : null,
+    newestAt: newestMs > 0 ? new Date(newestMs).toISOString() : null,
+    ageHours: newestMs > 0 ? Number((ageMs / 3600000).toFixed(2)) : null,
+    rawBytes,
+    rawHuman: human(rawBytes),
+    artifacts,
+  };
+  if (reasons.length > 0) protectedRuns.push({ ...summary, reasons });
+  else candidates.push(summary);
+}
+
+candidates.sort((a, b) => b.rawBytes - a.rawBytes);
+const selected = candidates.slice(0, limit);
+const selectedSet = new Set(selected.map((item) => item.runDir));
+const deferred = candidates.filter((item) => !selectedSet.has(item.runDir));
+let reclaimedBytes = 0;
+const deleted = [];
+const failures = [];
+
+if (mode === "run") {
+  for (const item of selected) {
+    const runDir = path.join(rootAbs, item.runDir);
+    try {
+      for (const artifact of item.artifacts) removeArtifact(artifact);
+      reclaimedBytes += item.rawBytes;
+      deleted.push({ id: item.id, runDir: item.runDir, rawBytes: item.rawBytes, rawHuman: item.rawHuman, artifactCount: item.artifacts.length });
+    } catch (error) {
+      failures.push({ id: item.id, runDir: item.runDir, error: error instanceof Error ? error.message : String(error) });
+    }
+  }
+}
+
+const estimatedReclaimBytes = selected.reduce((sum, item) => sum + item.rawBytes, 0);
+const diskAfter = mode === "run" ? disk() : null;
+console.log(JSON.stringify({
+  ok: failures.length === 0,
+  command: "web-probe observe gc",
+  node: nodeId,
+  lane,
+  stateRoot,
+  mode,
+  mutation: mode === "run",
+  keepHours,
+  limit,
+  diskBefore,
+  diskAfter,
+  scannedRuns: candidates.length + protectedRuns.length,
+  candidateCount: candidates.length,
+  selectedCount: selected.length,
+  deferredCount: deferred.length,
+  protectedCount: protectedRuns.length,
+  estimatedReclaimBytes,
+  estimatedReclaimHuman: human(estimatedReclaimBytes),
+  reclaimedBytes,
+  reclaimedHuman: human(reclaimedBytes),
+  candidates: selected.map((item) => ({ id: item.id, runDir: item.runDir, status: item.status, ageHours: item.ageHours, rawBytes: item.rawBytes, rawHuman: item.rawHuman, artifactCount: item.artifacts.length, artifacts: item.artifacts.map((artifact) => ({ name: artifact.name, bytes: artifact.bytes, human: artifact.human })) })),
+  deleted,
+  failures,
+  protected: protectedRuns.slice(0, Math.min(50, protectedRuns.length)).map((item) => ({ id: item.id, runDir: item.runDir, status: item.status, ageHours: item.ageHours, rawBytes: item.rawBytes, rawHuman: item.rawHuman, reasons: item.reasons })),
+  valuesRedacted: true,
+}, null, 2));
+`;
+}
+
 export function runNodeWebProbeObserveStart(
  options: NodeWebProbeObserveOptions,
  spec: HwlabRuntimeLaneSpec,
@@ -1343,6 +1711,7 @@ export function runNodeWebProbeObserveStart(
    `UNIDESK_WEB_OBSERVE_SCREENSHOT_INTERVAL_MS=${shellQuote(String(options.screenshotIntervalMs))}`,
    `UNIDESK_WEB_OBSERVE_OBSERVER_REFRESH_INTERVAL_MS=${shellQuote(String(options.observerRefreshIntervalMs))}`,
    `UNIDESK_WEB_OBSERVE_MAX_SAMPLES=${shellQuote(String(options.maxSamples))}`,
+    `UNIDESK_WEB_OBSERVE_MAX_RUN_MS=${shellQuote(String(options.maxRunSeconds > 0 ? options.maxRunSeconds * 1000 : 0))}`,
    `UNIDESK_WEB_OBSERVE_VIEWPORT=${shellQuote(options.viewport)}`,
    `UNIDESK_WEB_OBSERVE_BROWSER_PROXY_MODE=${shellQuote(options.browserProxyMode)}`,
    `UNIDESK_WEB_OBSERVE_ALERT_THRESHOLDS_JSON=${shellQuote(JSON.stringify(alertThresholds))}`,