Merge pull request #1365 from pikasTech/fix/1352-jd01-observe-gc

Fix JD01 web observe GC and legacy cleanup
This commit is contained in:
Lyon
2026-07-01 11:40:33 +08:00
committed by GitHub
9 changed files with 1448 additions and 21 deletions
+9
View File
@@ -75,6 +75,15 @@ gc:
codex-queue-stats-verify: .state/codex-queue-stats-verify
codex-queue-perf: .state/codex-queue-perf
tmp: .state/tmp
legacyDockerImages:
enabled: true
minAgeHours: 12
keepPerRepository: 2
repositories:
- 127.0.0.1:5000/hwlab/web-probe-sentinel-${nodeLower}
legacyDockerRegistryVolumes:
enabled: true
requireK8sRegistryReady: true
codexSessions:
enabled: false
keepHours: 72
+12 -2
View File
@@ -12,6 +12,10 @@ export function hwlabNodeHelp(): Record<string, unknown> {
examples: [
"bun scripts/cli.ts hwlab nodes control-plane infra plan --node D601 --lane v03",
"bun scripts/cli.ts hwlab nodes control-plane status --node D601 --lane v03",
"bun scripts/cli.ts hwlab nodes control-plane cleanup-runs --node JD01 --lane v03 --min-age-minutes 30 --limit 200 --dry-run",
"bun scripts/cli.ts hwlab nodes control-plane cleanup-released-pvs --node JD01 --lane v03 --limit 200 --dry-run",
"bun scripts/cli.ts hwlab nodes control-plane cleanup-legacy-docker-images --node JD01 --lane v03 --dry-run",
"bun scripts/cli.ts hwlab nodes control-plane cleanup-legacy-docker-registry-volume --node JD01 --lane v03 --dry-run",
"bun scripts/cli.ts hwlab nodes git-mirror status --node G14 --lane v03",
"bun scripts/cli.ts hwlab nodes hwpod-preinstall plan --node D601 --lane v03 --dry-run",
"bun scripts/cli.ts hwlab nodes fake-model-provider plan --node D518 --lane v03 --provider fake-echo",
@@ -21,7 +25,7 @@ export function hwlabNodeHelp(): Record<string, unknown> {
"bun scripts/cli.ts web-probe --help",
],
actions: {
"control-plane": "YAML-first node-local CI/CD, git-mirror, public exposure, runtime-image, Argo and PipelineRun operations.",
"control-plane": "YAML-first node-local CI/CD, git-mirror, public exposure, runtime-image, Argo, PipelineRun and CI workspace retention operations.",
"git-mirror": "Inspect or operate the selected node/lane source mirror.",
"hwpod-preinstall": "Render YAML-first HWPOD preinstall configRefs, runtime mount targets, PM MDTODO source, and gateway profile status.",
"fake-model-provider": "Materialize and operate YAML-declared fake Responses model providers for HWLAB/AgentRun sentinel checks.",
@@ -32,6 +36,9 @@ export function hwlabNodeHelp(): Record<string, unknown> {
notes: [
"`web-probe` is no longer under `hwlab nodes`; use `bun scripts/cli.ts web-probe ...`.",
"`--node` and `--lane` remain data arguments resolved from YAML for node/lane operations.",
"control-plane cleanup-runs deletes terminal PipelineRuns; cleanup-released-pvs deletes only orphaned hwlab-ci PipelineRun PVCs with no active pod mounts and Released local-path/Delete PVs.",
"cleanup-legacy-docker-images is a transitional legacy-cache GC for Docker images matching YAML allowlisted repositories; it protects container-referenced images, does not run prune, and does not touch Docker volumes.",
"cleanup-legacy-docker-registry-volume removes only an exited legacy Docker registry container and its unique /var/lib/registry volume after the YAML-declared k8s node-local-registry is ready.",
"`trigger-current --confirm --wait` is the one-command CICD path for current node/lane runtime publish.",
],
};
@@ -63,6 +70,8 @@ export function hwlabNodeWebProbeHelp(): Record<string, unknown> {
"bun scripts/cli.ts web-probe observe collect webobs-xxxx --view timeline --command-id cmd-xxxx",
"bun scripts/cli.ts web-probe observe collect webobs-xxxx --view project-mdtodo-summary",
"bun scripts/cli.ts web-probe observe analyze webobs-xxxx",
"bun scripts/cli.ts web-probe observe gc --node JD01 --lane v03 --dry-run",
"bun scripts/cli.ts web-probe observe gc --node JD01 --lane v03 --keep-hours 24 --confirm",
"bun scripts/cli.ts web-probe sentinel plan --node D601 --lane v03 --dry-run",
"bun scripts/cli.ts web-probe sentinel plan --node D601 --lane v03 --sentinel workbench-auth-session-switch-2users",
"bun scripts/cli.ts web-probe sentinel publish-current --node JD01 --lane v03 --sentinel jd01-web-probe-sentinel --confirm --wait",
@@ -77,7 +86,7 @@ export function hwlabNodeWebProbeHelp(): Record<string, unknown> {
"opencode-smoke": "Run the repo-owned OpenCode iframe/direct-host composer smoke and require DOM assistant text plus EventSource update/finish/idle evidence.",
script: "Run caller-provided Playwright JS after CLI-managed /auth/login; scripts must not handle secrets themselves.",
screenshot: "Capture a no-auth or public page through the selected node/lane remote browser and download PNG artifacts to the caller /tmp by default.",
observe: "Start, inspect, control, stop, collect, and analyze a long-running observer that writes JSONL artifacts.",
observe: "Start, inspect, control, stop, collect, analyze, and garbage-collect raw artifacts for long-running observers.",
sentinel: "Render and operate the YAML-first web-probe sentinel wrapper, one-click publish, image, GitOps, dashboard verification, maintenance and report views.",
},
notes: [
@@ -85,6 +94,7 @@ export function hwlabNodeWebProbeHelp(): Record<string, unknown> {
"`web-probe script` is an ad-hoc exploration escape hatch; repeated/high-frequency workflows must become `web-probe observe command` types or repo-owned web-probe commands.",
"`web-probe opencode-smoke` is the repo-owned OpenCode smoke; prefer it over repeating one-off OpenCode Playwright snippets.",
"observe is passive by default; user actions must be explicit observe command entries in control.jsonl.",
"observe gc keeps manifest, heartbeat, control/error logs and analysis reports, and only removes dead-run raw samples/browser/network/screenshot artifacts after YAML-configured retention.",
"After observe start, prefer observe status|command|stop|collect|analyze <id> instead of repeating --node/--lane/--state-dir.",
"collect views render bounded summaries from existing artifacts and do not create a second source of truth.",
"analyze is offline-only: it reads artifact JSONL and writes analysis/report.md plus analysis/report.json.",
@@ -24,6 +24,7 @@ const screenshotIntervalMs = positiveInteger(process.env.UNIDESK_WEB_OBSERVE_SCR
const screenshotCaptureTimeoutMs = boundedInteger(process.env.UNIDESK_WEB_OBSERVE_SCREENSHOT_CAPTURE_TIMEOUT_MS, 15000, 1000, 120000);
const maxSamples = positiveInteger(process.env.UNIDESK_WEB_OBSERVE_MAX_SAMPLES, 0);
const observerRefreshIntervalMs = positiveInteger(process.env.UNIDESK_WEB_OBSERVE_OBSERVER_REFRESH_INTERVAL_MS, 180000);
const maxRunMs = positiveInteger(process.env.UNIDESK_WEB_OBSERVE_MAX_RUN_MS, 0);
const viewport = parseViewport(process.env.UNIDESK_WEB_OBSERVE_VIEWPORT || "1440x900");
const browserProxyMode = parseBrowserProxyMode(process.env.UNIDESK_WEB_OBSERVE_BROWSER_PROXY_MODE || "auto");
const authLoginMaxAttempts = boundedInteger(process.env.UNIDESK_WEB_OBSERVE_AUTH_LOGIN_MAX_ATTEMPTS, 6, 1, 20);
@@ -128,6 +129,10 @@ try {
await appendJsonl(files.control, controlRecord({ id: "max-samples", type: "stop", source: "sampler" }, "completed", { reason: "max-samples", maxSamples }));
break;
}
if (maxRunMs > 0 && Date.now() - startedAtMs >= maxRunMs) {
await appendJsonl(files.control, controlRecord({ id: "max-run-ms", type: "stop", source: "sampler" }, "completed", { reason: "max-run-ms", maxRunMs, elapsedMs: Date.now() - startedAtMs }));
break;
}
await sleep(sampleIntervalMs);
}
if (browserFreezeBlocker) {
@@ -114,6 +114,7 @@ export function runSentinelQuickVerify(state: SentinelCicdState, reason: string,
"--target-path", stringAt(scenario, "observeTargetPath"),
"--sample-interval-ms", String(sampleIntervalMs),
"--screenshot-interval-ms", String(numberAt(scenario, "screenshotIntervalMs")),
"--max-run-seconds", String(hardBudgetSeconds),
"--command-timeout-seconds", "55",
];
const viewport = stringAtNullable(scenario, "viewport");
@@ -298,7 +299,12 @@ export function runSentinelQuickVerify(state: SentinelCicdState, reason: string,
const controlFindings = quickVerifyControlFindings(null, promptIndex, turnSummary, traceFrame);
const artifactSummaryRecord = record(artifactSummary);
const artifactFindings = Array.isArray(artifactSummaryRecord.findings) ? artifactSummaryRecord.findings.map(record) : [];
const findings = mergeFindingRecords(artifactFindings, controlFindings);
const cleanupStep = stopQuickVerifyObserver(state, observerId, "observe-stop-after-terminal");
const cleanupFindings = quickVerifyCleanupFindings(cleanupStep);
const findings = mergeFindingRecords(
mergeFindingRecords(artifactFindings, controlFindings),
cleanupFindings,
);
const blockingFindings = findings.filter(isQuickVerifyBlockingFinding);
const analysisWarnings = analysis.ok ? [] : ["quick verify analyze command returned non-zero but a readable analysis artifact was produced; targetValidation is using artifact severity plus control blockers."];
const ok = record(artifactSummary).ok === true && controlFindings.length === 0 && blockingFindings.length === 0;
@@ -320,7 +326,7 @@ export function runSentinelQuickVerify(state: SentinelCicdState, reason: string,
failure: controlFindings.length > 0 ? "quick-verify-no-business-turn" : blockingFindings.length > 0 ? "quick-verify-blocking-findings" : null,
promptSource: prompts.summary,
accountEnv: accountEnv.summary,
steps,
steps: [...steps, cleanupStep],
analysis: artifactSummary,
views: {
summary: { renderedText: renderQuickVerifySummary({ runId, scenarioId, observerId, artifactSummary, steps, publicOrigin: stringAt(state.publicExposure, "publicBaseUrl") }) },
@@ -331,7 +337,7 @@ export function runSentinelQuickVerify(state: SentinelCicdState, reason: string,
findings,
screenshot: record(artifactSummary).screenshot,
publicOrigin: stringAt(state.publicExposure, "publicBaseUrl"),
warnings: mergeWarnings(analysisWarnings, nonBlockingCanaryWarnings, elapsedWarnings()),
warnings: mergeWarnings(analysisWarnings, nonBlockingCanaryWarnings, cleanupFindings.length > 0 ? ["quick verify observer stop failed; runner lifecycle cleanup is a blocking finding."] : [], elapsedWarnings()),
valuesRedacted: true,
});
}
@@ -466,14 +472,7 @@ function finalizeQuickVerifyFailure(state: SentinelCicdState, input: {
], 60);
cleanupSteps.push({ phase: "observe-cancel-after-failure", ok: cancel.ok, result: cancel.result });
}
const stop = runChildCli([
"web-probe", "observe", "stop", input.observerId,
"--node", state.spec.nodeId,
"--lane", state.spec.lane,
"--force",
"--command-timeout-seconds", "55",
], 30);
cleanupSteps.push({ phase: "observe-stop-after-failure", ok: stop.ok, result: stop.result });
cleanupSteps.push(stopQuickVerifyObserver(state, input.observerId, "observe-stop-after-failure"));
const analysis = runChildCli([
"web-probe", "observe", "analyze", input.observerId,
"--node", state.spec.nodeId,
@@ -534,6 +533,30 @@ function finalizeQuickVerifyFailure(state: SentinelCicdState, input: {
};
}
function stopQuickVerifyObserver(state: SentinelCicdState, observerId: string, phase: string): Record<string, unknown> {
const stop = runChildCli([
"web-probe", "observe", "stop", observerId,
"--node", state.spec.nodeId,
"--lane", state.spec.lane,
"--force",
"--wait-ms", "55000",
"--command-timeout-seconds", "55",
], 60);
return { phase, ok: stop.ok, result: stop.result };
}
function quickVerifyCleanupFindings(cleanupStep: Record<string, unknown>): Record<string, unknown>[] {
if (cleanupStep.ok === true) return [];
return [{
id: "quick-verify-observer-stop-failed",
severity: "red",
count: 1,
summary: "quick verify completed target analysis but failed to stop its observer runner; this can leak Chrome process trees on cadence runs.",
cleanupPhase: cleanupStep.phase,
valuesRedacted: true,
}];
}
function callSentinelService(state: SentinelCicdState, method: "GET" | "POST", pathWithQuery: string, body: Record<string, unknown> | null, timeoutSeconds: number): Record<string, unknown> {
const namespace = stringAt(state.runtime, "namespace");
const serviceName = stringAt(state.runtime, "serviceName");
+12 -1
View File
@@ -526,6 +526,7 @@ function buildObserveCommandPlan(config: WebProbeSentinelServiceConfig, scenario
"--target-path", targetPath,
"--sample-interval-ms", String(numberAt(scenario, "sampleIntervalMs")),
"--screenshot-interval-ms", String(numberAt(scenario, "screenshotIntervalMs")),
"--max-run-seconds", String(numberAt(scenario, "maxRunSeconds")),
"--command-timeout-seconds", "55",
];
const viewport = stringOrNull(scenario.viewport);
@@ -562,7 +563,17 @@ function buildObserveCommandPlan(config: WebProbeSentinelServiceConfig, scenario
argv: ["bun", "scripts/cli.ts", "web-probe", "observe", "analyze", "<observerId>"],
stdinSource: "none",
};
return [start, ...commands, analyze];
const stop: CommandPlanStep = {
phase: "observe-stop",
argv: ["bun", "scripts/cli.ts", "web-probe", "observe", "stop", "<observerId>", "--node", config.node, "--lane", config.lane, "--force", "--wait-ms", "55000", "--command-timeout-seconds", "55"],
stdinSource: "none",
};
const gc: CommandPlanStep = {
phase: "observe-gc",
argv: ["bun", "scripts/cli.ts", "web-probe", "observe", "gc", "--node", config.node, "--lane", config.lane, "--confirm", "--command-timeout-seconds", "55"],
stdinSource: "none",
};
return [start, ...commands, analyze, stop, gc];
}
function inlinePromptText(item: Record<string, unknown>): string | null {
File diff suppressed because it is too large Load Diff
+7 -2
View File
@@ -116,7 +116,7 @@ export interface NodeWebProbeScreenshotOptions {
commandTimeoutSeconds: number;
}
export type NodeWebProbeObserveAction = "start" | "status" | "command" | "stop" | "collect" | "analyze";
export type NodeWebProbeObserveAction = "start" | "status" | "command" | "stop" | "collect" | "analyze" | "gc";
export type NodeWebProbeObserveCommandType =
| "login"
@@ -176,10 +176,15 @@ export interface NodeWebProbeObserveOptions {
screenshotIntervalMs: number;
observerRefreshIntervalMs: number;
maxSamples: number;
maxRunSeconds: number;
commandTimeoutSeconds: number;
waitMs: number;
tailLines: number;
maxFiles: number;
gcKeepHours: number;
gcLimit: number;
confirm: boolean;
dryRun: boolean;
collectView: NodeWebProbeObserveCollectView;
collectFile: string | null;
collectFinding: string | null;
@@ -612,7 +617,7 @@ export async function runNodeDelegatedDomain(config: Config, domain: DelegatedNo
if (scoped.originalArgs.includes("--full")) return withNodeRuntimeControlPlaneStatusFullRendered(result, scoped);
return withNodeRuntimeControlPlaneStatusRendered(result, scoped);
}
if (scoped.action === "apply" || scoped.action === "trigger-current" || scoped.action === "refresh" || scoped.action === "sync" || scoped.action === "runtime-migration" || scoped.action === "cleanup-runs") {
if (scoped.action === "apply" || scoped.action === "trigger-current" || scoped.action === "refresh" || scoped.action === "sync" || scoped.action === "runtime-migration" || scoped.action === "cleanup-runs" || scoped.action === "cleanup-released-pvs" || scoped.action === "cleanup-legacy-docker-images" || scoped.action === "cleanup-legacy-docker-registry-volume") {
if (scoped.confirm && !scoped.dryRun && !scoped.wait) return startNodeDelegatedJob(scoped);
return nodeRuntimeControlPlaneRun(scoped);
}
+1 -1
View File
@@ -209,7 +209,7 @@ export function nodeRuntimeUnsupportedAction(scoped: ReturnType<typeof parseNode
lane: scoped.lane,
mutation: false,
degradedReason: "unsupported-node-scoped-runtime-action",
message: "node-scoped runtime currently supports plan/status/apply/refresh/sync/trigger-current/cleanup-runs/runtime-image/runtime-migration/source-workspace",
message: "node-scoped runtime currently supports plan/status/apply/refresh/sync/trigger-current/cleanup-runs/cleanup-released-pvs/cleanup-legacy-docker-images/cleanup-legacy-docker-registry-volume/runtime-image/runtime-migration/source-workspace",
expected: nodeRuntimeExpected(scoped.spec),
};
}
+372 -3
View File
@@ -219,8 +219,9 @@ export function parseNodeWebProbeObserveOptions(
&& observeActionRaw !== "stop"
&& observeActionRaw !== "collect"
&& observeActionRaw !== "analyze"
&& observeActionRaw !== "gc"
) {
throw new Error("web-probe observe usage: observe start --node NODE --lane vNN [...]; observe status|command|stop|collect|analyze <id> [...]");
throw new Error("web-probe observe usage: observe start --node NODE --lane vNN [...]; observe status|command|stop|collect|analyze <id> [...]; observe gc --node NODE --lane vNN [--confirm]");
}
assertKnownOptions(args.slice(1), new Set([
"--node",
@@ -233,6 +234,9 @@ export function parseNodeWebProbeObserveOptions(
"--screenshot-interval-ms",
"--observer-refresh-interval-ms",
"--max-samples",
"--max-run-seconds",
"--keep-hours",
"--limit",
"--command-timeout-seconds",
"--wait-ms",
"--tail-lines",
@@ -284,7 +288,7 @@ export function parseNodeWebProbeObserveOptions(
"--workspace-root",
"--workspace-root-ref",
"--root",
]), new Set(["--force", "--full", "--raw", "--text-stdin", "--require-composer-ready", "--wait-project-management-ready", "--blocking", "--non-blocking"]));
]), new Set(["--force", "--full", "--raw", "--text-stdin", "--require-composer-ready", "--wait-project-management-ready", "--blocking", "--non-blocking", "--dry-run", "--confirm"]));
const commandTypeRaw = optionValue(args, "--type") ?? null;
const commandType = commandTypeRaw === null ? null : parseNodeWebProbeObserveCommandType(commandTypeRaw);
const stateDir = optionValue(args, "--state-dir") ?? indexed?.stateDir ?? null;
@@ -320,9 +324,12 @@ export function parseNodeWebProbeObserveOptions(
if (analyzeTailSamples !== null && (!Number.isInteger(analyzeTailSamples) || analyzeTailSamples < 0)) {
throw new Error("unsafe web-probe observe --tail-samples: expected a non-negative integer; use 0 for all samples");
}
if (observeActionRaw !== "start" && stateDir === null && jobId === null) {
if (observeActionRaw !== "start" && observeActionRaw !== "gc" && stateDir === null && jobId === null) {
throw new Error("web-probe observe status|command|stop|collect|analyze requires --state-dir or --job-id");
}
const confirm = args.includes("--confirm");
const dryRun = args.includes("--dry-run") || !confirm;
if (confirm && args.includes("--dry-run")) throw new Error("web-probe observe gc accepts only one of --confirm or --dry-run");
const commandTextOption = optionValue(args, "--text") ?? null;
const commandTextFromStdin = args.includes("--text-stdin");
if (commandTextFromStdin && observeActionRaw !== "command") {
@@ -403,10 +410,15 @@ export function parseNodeWebProbeObserveOptions(
screenshotIntervalMs: positiveIntegerOption(args, "--screenshot-interval-ms", 300000, 86_400_000),
observerRefreshIntervalMs: positiveIntegerOption(args, "--observer-refresh-interval-ms", 180000, 86_400_000),
maxSamples: positiveIntegerOption(args, "--max-samples", 0, 10_000_000),
maxRunSeconds: positiveIntegerOption(args, "--max-run-seconds", 0, 86_400),
commandTimeoutSeconds: positiveIntegerOption(args, "--command-timeout-seconds", 55, 3600),
waitMs: positiveIntegerOption(args, "--wait-ms", 0, 600000),
tailLines: positiveIntegerOption(args, "--tail-lines", 5, 200),
maxFiles: positiveIntegerOption(args, "--max-files", 80, 5000),
gcKeepHours: positiveIntegerOption(args, "--keep-hours", webObserveGcDefaultKeepHours(), 8760),
gcLimit: positiveIntegerOption(args, "--limit", 200, 5000),
confirm,
dryRun,
collectView,
collectFile,
collectFinding,
@@ -555,6 +567,18 @@ export function assertKnownOptions(args: string[], valueOptions: Set<string>, fl
}
}
function webObserveGcDefaultKeepHours(): number {
const configPath = "config/unidesk-cli.yaml";
const parsed = record(Bun.YAML.parse(readFileSync(rootPath(configPath), "utf8")) as unknown);
const gc = record(parsed.gc);
const scratch = record(gc.stateStaleScratch);
const keepHours = scratch.keepHours;
if (!Number.isInteger(keepHours) || keepHours < 0) {
throw new Error(`${configPath}#gc.stateStaleScratch.keepHours must be a non-negative integer`);
}
return keepHours;
}
export function runNodeWebProbe(options: NodeWebProbeOptions): Record<string, unknown> | RenderedCliResult {
const lane = options.lane;
if (!isHwlabRuntimeLane(lane)) throw new Error(`web-probe only supports HWLAB runtime lanes, got ${lane}`);
@@ -1305,9 +1329,353 @@ export function runNodeWebProbeObserve(
if (options.observeAction === "command") return runNodeWebProbeObserveCommand(options, spec, false);
if (options.observeAction === "stop") return runNodeWebProbeObserveCommand({ ...options, commandType: "stop" }, spec, true);
if (options.observeAction === "collect") return runNodeWebProbeObserveCollect(options, spec);
if (options.observeAction === "gc") return runNodeWebProbeObserveGc(options, spec);
return runNodeWebProbeObserveAnalyze(options, spec);
}
export function runNodeWebProbeObserveGc(options: NodeWebProbeObserveOptions, spec: HwlabRuntimeLaneSpec): Record<string, unknown> {
const stateRoot = `.state/web-observe/${safeWebObserveSegment(options.node)}/${safeWebObserveSegment(options.lane)}`;
const script = [
"set -eu",
`node - ${shellQuote(stateRoot)} ${shellQuote(String(options.gcKeepHours))} ${shellQuote(String(options.gcLimit))} ${shellQuote(options.confirm ? "run" : "plan")} ${shellQuote(options.node)} ${shellQuote(options.lane)} <<'UNIDESK_WEB_OBSERVE_GC'`,
nodeWebObserveGcNodeScript(),
"UNIDESK_WEB_OBSERVE_GC",
].join("\n");
const result = runTransWorkspaceStdinScript(options.node, spec.workspace, script, options.commandTimeoutSeconds);
const payload = parseJsonObject(result.stdout);
const ok = result.exitCode === 0 && payload.ok !== false;
return {
ok,
status: ok ? (options.confirm ? "cleaned" : "planned") : "blocked",
command: `web-probe observe gc --node ${options.node} --lane ${options.lane}`,
node: options.node,
lane: options.lane,
workspace: spec.workspace,
stateRoot,
mode: options.confirm ? "confirmed-run" : "dry-run",
retention: {
source: "config/unidesk-cli.yaml#gc.stateStaleScratch.keepHours",
keepHours: options.gcKeepHours,
userOverride: options.gcKeepHours !== webObserveGcDefaultKeepHours(),
},
gc: options.full || options.raw ? payload : compactWebObserveGcPayload(payload),
result: ok
? {
exitCode: result.exitCode,
timedOut: result.timedOut,
stdoutBytes: result.stdout.length,
stderrBytes: result.stderr.length,
}
: compactCommandResultWithStdoutTail(result),
valuesRedacted: true,
};
}
function compactWebObserveGcPayload(payload: Record<string, unknown>): Record<string, unknown> {
const candidates = Array.isArray(payload.candidates) ? payload.candidates.map(record) : [];
const protectedRuns = Array.isArray(payload.protected) ? payload.protected.map(record) : [];
const deleted = Array.isArray(payload.deleted) ? payload.deleted.map(record) : [];
const failures = Array.isArray(payload.failures) ? payload.failures.map(record) : [];
return {
ok: payload.ok ?? null,
mode: payload.mode ?? null,
mutation: payload.mutation ?? null,
keepHours: payload.keepHours ?? null,
limit: payload.limit ?? null,
diskBefore: payload.diskBefore ?? null,
diskAfter: payload.diskAfter ?? null,
scannedRuns: payload.scannedRuns ?? null,
candidateCount: payload.candidateCount ?? null,
selectedCount: payload.selectedCount ?? null,
deferredCount: payload.deferredCount ?? null,
protectedCount: payload.protectedCount ?? null,
estimatedReclaimBytes: payload.estimatedReclaimBytes ?? null,
estimatedReclaimHuman: payload.estimatedReclaimHuman ?? null,
reclaimedBytes: payload.reclaimedBytes ?? null,
reclaimedHuman: payload.reclaimedHuman ?? null,
candidates: candidates.slice(0, 20).map((item) => ({
id: item.id ?? null,
runDir: item.runDir ?? null,
status: item.status ?? null,
ageHours: item.ageHours ?? null,
rawHuman: item.rawHuman ?? null,
artifactCount: item.artifactCount ?? null,
})),
deleted: deleted.slice(0, 20).map((item) => ({
id: item.id ?? null,
runDir: item.runDir ?? null,
rawHuman: item.rawHuman ?? null,
artifactCount: item.artifactCount ?? null,
})),
failures: failures.slice(0, 20),
protected: protectedRuns.slice(0, 20).map((item) => ({
id: item.id ?? null,
runDir: item.runDir ?? null,
status: item.status ?? null,
ageHours: item.ageHours ?? null,
rawHuman: item.rawHuman ?? null,
reasons: item.reasons ?? null,
})),
disclosure: {
candidateRowsShown: Math.min(20, candidates.length),
deletedRowsShown: Math.min(20, deleted.length),
protectedRowsShown: Math.min(20, protectedRuns.length),
full: "rerun with --full or --raw for all candidates/artifacts",
},
valuesRedacted: true,
};
}
function nodeWebObserveGcNodeScript(): string {
return String.raw`
const fs = require("fs");
const path = require("path");
const childProcess = require("child_process");
const stateRoot = process.argv[2];
const keepHours = Number(process.argv[3]);
const limit = Number(process.argv[4]);
const mode = process.argv[5] === "run" ? "run" : "plan";
const nodeId = process.argv[6];
const lane = process.argv[7];
const keepMs = keepHours * 60 * 60 * 1000;
const nowMs = Date.now();
const rawNames = ["samples.jsonl", "browser-process.jsonl", "network.jsonl", "console.jsonl", "artifacts.jsonl", "screenshots"];
function jsonRead(file) {
try { return JSON.parse(fs.readFileSync(file, "utf8")); } catch { return null; }
}
function parseTime(value) {
if (typeof value !== "string") return 0;
const ms = Date.parse(value);
return Number.isFinite(ms) ? ms : 0;
}
function dirTimeMs(runDir) {
const base = path.basename(runDir);
const match = base.match(/^(\d{8})T(\d{6})Z_/);
if (!match) return 0;
const raw = match[1].slice(0, 4) + "-" + match[1].slice(4, 6) + "-" + match[1].slice(6, 8) + "T" + match[2].slice(0, 2) + ":" + match[2].slice(2, 4) + ":" + match[2].slice(4, 6) + "Z";
return Date.parse(raw) || 0;
}
function newestKnownTimeMs(runDir, manifest, heartbeat) {
return Math.max(
dirTimeMs(runDir),
parseTime(manifest && manifest.completedAt),
parseTime(manifest && manifest.updatedAt),
parseTime(manifest && manifest.startedAt),
parseTime(manifest && manifest.pageProvenance && manifest.pageProvenance.observedAt),
parseTime(heartbeat && heartbeat.completedAt),
parseTime(heartbeat && heartbeat.forceStoppedAt),
parseTime(heartbeat && heartbeat.updatedAt),
parseTime(heartbeat && heartbeat.pageProvenance && heartbeat.pageProvenance.observedAt),
);
}
function sizePath(target) {
let total = 0;
const stack = [target];
while (stack.length > 0) {
const current = stack.pop();
let st;
try { st = fs.lstatSync(current); } catch { continue; }
if (st.isSymbolicLink()) continue;
total += st.blocks ? st.blocks * 512 : st.size;
if (!st.isDirectory()) continue;
let entries;
try { entries = fs.readdirSync(current); } catch { continue; }
for (const entry of entries) stack.push(path.join(current, entry));
}
return total;
}
function human(bytes) {
const units = ["B", "KiB", "MiB", "GiB", "TiB"];
let value = bytes;
let unit = 0;
while (value >= 1024 && unit < units.length - 1) { value /= 1024; unit += 1; }
return value.toFixed(unit === 0 ? 0 : 1) + units[unit];
}
function disk() {
try {
const line = childProcess.execFileSync("df", ["-Pk", "/"], { encoding: "utf8" }).trim().split("\n").slice(-1)[0];
const parts = line.trim().split(/\s+/);
const sizeBytes = Number(parts[1]) * 1024;
const usedBytes = Number(parts[2]) * 1024;
const availableBytes = Number(parts[3]) * 1024;
const usePercent = Number(String(parts[4]).replace("%", ""));
return { filesystem: parts[0], sizeBytes, usedBytes, availableBytes, usePercent, usedHuman: human(usedBytes), availableHuman: human(availableBytes) };
} catch (error) {
return { error: error instanceof Error ? error.message : String(error) };
}
}
function findRunDirs(root) {
const runs = [];
for (const year of safeReadDir(root)) {
const y = path.join(root, year.name);
if (!year.isDirectory() || !/^\d{4}$/.test(year.name)) continue;
for (const month of safeReadDir(y)) {
const m = path.join(y, month.name);
if (!month.isDirectory() || !/^\d{2}$/.test(month.name)) continue;
for (const day of safeReadDir(m)) {
const d = path.join(m, day.name);
if (!day.isDirectory() || !/^\d{2}$/.test(day.name)) continue;
for (const run of safeReadDir(d)) {
const r = path.join(d, run.name);
if (run.isDirectory() && /_webobs-[A-Za-z0-9_.-]+$/.test(run.name)) runs.push(r);
}
}
}
}
return runs;
}
function safeReadDir(dir) {
try { return fs.readdirSync(dir, { withFileTypes: true }); } catch { return []; }
}
function pidObserverAlive(pid, runAbs) {
if (!Number.isInteger(pid) || pid <= 1) return { alive: false, reason: null };
try {
const cmdline = fs.readFileSync("/proc/" + pid + "/cmdline", "utf8").replace(/\0/g, " ");
if (cmdline.includes("observer-runner") || cmdline.includes(runAbs)) return { alive: true, reason: "pid-alive" };
return { alive: false, reason: null };
} catch (error) {
if (fs.existsSync("/proc/" + pid)) return { alive: true, reason: "pid-alive-unreadable" };
return { alive: false, reason: null };
}
}
function openRunDirs(rootAbs) {
const active = new Set();
const procEntries = safeReadDir("/proc").filter((entry) => entry.isDirectory() && /^\d+$/.test(entry.name));
for (const proc of procEntries) {
const fdDir = path.join("/proc", proc.name, "fd");
for (const fd of safeReadDir(fdDir)) {
let target;
try { target = fs.readlinkSync(path.join(fdDir, fd.name)).replace(/ \(deleted\)$/, ""); } catch { continue; }
if (!target.startsWith(rootAbs + path.sep)) continue;
const rel = path.relative(rootAbs, target).split(path.sep);
if (rel.length >= 4) active.add(path.join(rootAbs, rel[0], rel[1], rel[2], rel[3]));
}
}
return active;
}
function rawArtifacts(runDir) {
const artifacts = [];
for (const name of rawNames) {
const target = path.join(runDir, name);
if (!fs.existsSync(target)) continue;
const bytes = sizePath(target);
if (bytes > 0) artifacts.push({ name, path: target, bytes, human: human(bytes) });
}
return artifacts;
}
function removeArtifact(artifact) {
const st = fs.lstatSync(artifact.path);
if (st.isSymbolicLink()) throw new Error("refusing symlink: " + artifact.path);
if (st.isDirectory()) fs.rmSync(artifact.path, { recursive: true, force: false });
else fs.unlinkSync(artifact.path);
}
const rootAbs = path.resolve(stateRoot);
const diskBefore = disk();
const openDirs = openRunDirs(rootAbs);
const candidates = [];
const protectedRuns = [];
for (const runDir of findRunDirs(rootAbs)) {
const manifest = jsonRead(path.join(runDir, "manifest.json"));
const heartbeat = jsonRead(path.join(runDir, "heartbeat.json"));
const pid = Number((heartbeat && heartbeat.pid) || (manifest && manifest.pid) || 0);
const pidState = pidObserverAlive(pid, runDir);
const hasOpenFd = openDirs.has(runDir);
const newestMs = newestKnownTimeMs(runDir, manifest, heartbeat);
const ageMs = newestMs > 0 ? nowMs - newestMs : 0;
const artifacts = rawArtifacts(runDir);
const rawBytes = artifacts.reduce((sum, item) => sum + item.bytes, 0);
const rel = path.relative(rootAbs, runDir);
const reasons = [];
if (!manifest) reasons.push("manifest-missing");
if (pidState.alive) reasons.push(pidState.reason || "pid-alive");
if (hasOpenFd) reasons.push("open-fd");
if (newestMs <= 0) reasons.push("time-unknown");
if (ageMs < keepMs) reasons.push("retention-window");
if (rawBytes <= 0) reasons.push("no-raw-artifacts");
const summary = {
id: (manifest && manifest.jobId) || path.basename(runDir).match(/(webobs-[A-Za-z0-9_.-]+)$/)?.[1] || null,
runDir: rel,
status: heartbeat && heartbeat.status || null,
pid: Number.isInteger(pid) && pid > 0 ? pid : null,
newestAt: newestMs > 0 ? new Date(newestMs).toISOString() : null,
ageHours: newestMs > 0 ? Number((ageMs / 3600000).toFixed(2)) : null,
rawBytes,
rawHuman: human(rawBytes),
artifacts,
};
if (reasons.length > 0) protectedRuns.push({ ...summary, reasons });
else candidates.push(summary);
}
candidates.sort((a, b) => b.rawBytes - a.rawBytes);
const selected = candidates.slice(0, limit);
const selectedSet = new Set(selected.map((item) => item.runDir));
const deferred = candidates.filter((item) => !selectedSet.has(item.runDir));
let reclaimedBytes = 0;
const deleted = [];
const failures = [];
if (mode === "run") {
for (const item of selected) {
const runDir = path.join(rootAbs, item.runDir);
try {
for (const artifact of item.artifacts) removeArtifact(artifact);
reclaimedBytes += item.rawBytes;
deleted.push({ id: item.id, runDir: item.runDir, rawBytes: item.rawBytes, rawHuman: item.rawHuman, artifactCount: item.artifacts.length });
} catch (error) {
failures.push({ id: item.id, runDir: item.runDir, error: error instanceof Error ? error.message : String(error) });
}
}
}
const estimatedReclaimBytes = selected.reduce((sum, item) => sum + item.rawBytes, 0);
const diskAfter = mode === "run" ? disk() : null;
console.log(JSON.stringify({
ok: failures.length === 0,
command: "web-probe observe gc",
node: nodeId,
lane,
stateRoot,
mode,
mutation: mode === "run",
keepHours,
limit,
diskBefore,
diskAfter,
scannedRuns: candidates.length + protectedRuns.length,
candidateCount: candidates.length,
selectedCount: selected.length,
deferredCount: deferred.length,
protectedCount: protectedRuns.length,
estimatedReclaimBytes,
estimatedReclaimHuman: human(estimatedReclaimBytes),
reclaimedBytes,
reclaimedHuman: human(reclaimedBytes),
candidates: selected.map((item) => ({ id: item.id, runDir: item.runDir, status: item.status, ageHours: item.ageHours, rawBytes: item.rawBytes, rawHuman: item.rawHuman, artifactCount: item.artifacts.length, artifacts: item.artifacts.map((artifact) => ({ name: artifact.name, bytes: artifact.bytes, human: artifact.human })) })),
deleted,
failures,
protected: protectedRuns.slice(0, Math.min(50, protectedRuns.length)).map((item) => ({ id: item.id, runDir: item.runDir, status: item.status, ageHours: item.ageHours, rawBytes: item.rawBytes, rawHuman: item.rawHuman, reasons: item.reasons })),
valuesRedacted: true,
}, null, 2));
`;
}
export function runNodeWebProbeObserveStart(
options: NodeWebProbeObserveOptions,
spec: HwlabRuntimeLaneSpec,
@@ -1343,6 +1711,7 @@ export function runNodeWebProbeObserveStart(
`UNIDESK_WEB_OBSERVE_SCREENSHOT_INTERVAL_MS=${shellQuote(String(options.screenshotIntervalMs))}`,
`UNIDESK_WEB_OBSERVE_OBSERVER_REFRESH_INTERVAL_MS=${shellQuote(String(options.observerRefreshIntervalMs))}`,
`UNIDESK_WEB_OBSERVE_MAX_SAMPLES=${shellQuote(String(options.maxSamples))}`,
`UNIDESK_WEB_OBSERVE_MAX_RUN_MS=${shellQuote(String(options.maxRunSeconds > 0 ? options.maxRunSeconds * 1000 : 0))}`,
`UNIDESK_WEB_OBSERVE_VIEWPORT=${shellQuote(options.viewport)}`,
`UNIDESK_WEB_OBSERVE_BROWSER_PROXY_MODE=${shellQuote(options.browserProxyMode)}`,
`UNIDESK_WEB_OBSERVE_ALERT_THRESHOLDS_JSON=${shellQuote(JSON.stringify(alertThresholds))}`,