Fix JD01 web observe GC and legacy cleanup
This commit is contained in:
@@ -75,6 +75,15 @@ gc:
|
||||
codex-queue-stats-verify: .state/codex-queue-stats-verify
|
||||
codex-queue-perf: .state/codex-queue-perf
|
||||
tmp: .state/tmp
|
||||
legacyDockerImages:
|
||||
enabled: true
|
||||
minAgeHours: 12
|
||||
keepPerRepository: 2
|
||||
repositories:
|
||||
- 127.0.0.1:5000/hwlab/web-probe-sentinel-${nodeLower}
|
||||
legacyDockerRegistryVolumes:
|
||||
enabled: true
|
||||
requireK8sRegistryReady: true
|
||||
codexSessions:
|
||||
enabled: false
|
||||
keepHours: 72
|
||||
|
||||
@@ -12,6 +12,10 @@ export function hwlabNodeHelp(): Record<string, unknown> {
|
||||
examples: [
|
||||
"bun scripts/cli.ts hwlab nodes control-plane infra plan --node D601 --lane v03",
|
||||
"bun scripts/cli.ts hwlab nodes control-plane status --node D601 --lane v03",
|
||||
"bun scripts/cli.ts hwlab nodes control-plane cleanup-runs --node JD01 --lane v03 --min-age-minutes 30 --limit 200 --dry-run",
|
||||
"bun scripts/cli.ts hwlab nodes control-plane cleanup-released-pvs --node JD01 --lane v03 --limit 200 --dry-run",
|
||||
"bun scripts/cli.ts hwlab nodes control-plane cleanup-legacy-docker-images --node JD01 --lane v03 --dry-run",
|
||||
"bun scripts/cli.ts hwlab nodes control-plane cleanup-legacy-docker-registry-volume --node JD01 --lane v03 --dry-run",
|
||||
"bun scripts/cli.ts hwlab nodes git-mirror status --node G14 --lane v03",
|
||||
"bun scripts/cli.ts hwlab nodes hwpod-preinstall plan --node D601 --lane v03 --dry-run",
|
||||
"bun scripts/cli.ts hwlab nodes fake-model-provider plan --node D518 --lane v03 --provider fake-echo",
|
||||
@@ -21,7 +25,7 @@ export function hwlabNodeHelp(): Record<string, unknown> {
|
||||
"bun scripts/cli.ts web-probe --help",
|
||||
],
|
||||
actions: {
|
||||
"control-plane": "YAML-first node-local CI/CD, git-mirror, public exposure, runtime-image, Argo and PipelineRun operations.",
|
||||
"control-plane": "YAML-first node-local CI/CD, git-mirror, public exposure, runtime-image, Argo, PipelineRun and CI workspace retention operations.",
|
||||
"git-mirror": "Inspect or operate the selected node/lane source mirror.",
|
||||
"hwpod-preinstall": "Render YAML-first HWPOD preinstall configRefs, runtime mount targets, PM MDTODO source, and gateway profile status.",
|
||||
"fake-model-provider": "Materialize and operate YAML-declared fake Responses model providers for HWLAB/AgentRun sentinel checks.",
|
||||
@@ -32,6 +36,9 @@ export function hwlabNodeHelp(): Record<string, unknown> {
|
||||
notes: [
|
||||
"`web-probe` is no longer under `hwlab nodes`; use `bun scripts/cli.ts web-probe ...`.",
|
||||
"`--node` and `--lane` remain data arguments resolved from YAML for node/lane operations.",
|
||||
"control-plane cleanup-runs deletes terminal PipelineRuns; cleanup-released-pvs deletes only orphaned hwlab-ci PipelineRun PVCs with no active pod mounts and Released local-path/Delete PVs.",
|
||||
"cleanup-legacy-docker-images is a transitional legacy-cache GC for Docker images matching YAML allowlisted repositories; it protects container-referenced images, does not run prune, and does not touch Docker volumes.",
|
||||
"cleanup-legacy-docker-registry-volume removes only an exited legacy Docker registry container and its unique /var/lib/registry volume after the YAML-declared k8s node-local-registry is ready.",
|
||||
"`trigger-current --confirm --wait` is the one-command CICD path for current node/lane runtime publish.",
|
||||
],
|
||||
};
|
||||
@@ -63,6 +70,8 @@ export function hwlabNodeWebProbeHelp(): Record<string, unknown> {
|
||||
"bun scripts/cli.ts web-probe observe collect webobs-xxxx --view timeline --command-id cmd-xxxx",
|
||||
"bun scripts/cli.ts web-probe observe collect webobs-xxxx --view project-mdtodo-summary",
|
||||
"bun scripts/cli.ts web-probe observe analyze webobs-xxxx",
|
||||
"bun scripts/cli.ts web-probe observe gc --node JD01 --lane v03 --dry-run",
|
||||
"bun scripts/cli.ts web-probe observe gc --node JD01 --lane v03 --keep-hours 24 --confirm",
|
||||
"bun scripts/cli.ts web-probe sentinel plan --node D601 --lane v03 --dry-run",
|
||||
"bun scripts/cli.ts web-probe sentinel plan --node D601 --lane v03 --sentinel workbench-auth-session-switch-2users",
|
||||
"bun scripts/cli.ts web-probe sentinel publish-current --node JD01 --lane v03 --sentinel jd01-web-probe-sentinel --confirm --wait",
|
||||
@@ -77,7 +86,7 @@ export function hwlabNodeWebProbeHelp(): Record<string, unknown> {
|
||||
"opencode-smoke": "Run the repo-owned OpenCode iframe/direct-host composer smoke and require DOM assistant text plus EventSource update/finish/idle evidence.",
|
||||
script: "Run caller-provided Playwright JS after CLI-managed /auth/login; scripts must not handle secrets themselves.",
|
||||
screenshot: "Capture a no-auth or public page through the selected node/lane remote browser and download PNG artifacts to the caller /tmp by default.",
|
||||
observe: "Start, inspect, control, stop, collect, and analyze a long-running observer that writes JSONL artifacts.",
|
||||
observe: "Start, inspect, control, stop, collect, analyze, and garbage-collect raw artifacts for long-running observers.",
|
||||
sentinel: "Render and operate the YAML-first web-probe sentinel wrapper, one-click publish, image, GitOps, dashboard verification, maintenance and report views.",
|
||||
},
|
||||
notes: [
|
||||
@@ -85,6 +94,7 @@ export function hwlabNodeWebProbeHelp(): Record<string, unknown> {
|
||||
"`web-probe script` is an ad-hoc exploration escape hatch; repeated/high-frequency workflows must become `web-probe observe command` types or repo-owned web-probe commands.",
|
||||
"`web-probe opencode-smoke` is the repo-owned OpenCode smoke; prefer it over repeating one-off OpenCode Playwright snippets.",
|
||||
"observe is passive by default; user actions must be explicit observe command entries in control.jsonl.",
|
||||
"observe gc keeps manifest, heartbeat, control/error logs and analysis reports, and only removes dead-run raw samples/browser/network/screenshot artifacts after YAML-configured retention.",
|
||||
"After observe start, prefer observe status|command|stop|collect|analyze <id> instead of repeating --node/--lane/--state-dir.",
|
||||
"collect views render bounded summaries from existing artifacts and do not create a second source of truth.",
|
||||
"analyze is offline-only: it reads artifact JSONL and writes analysis/report.md plus analysis/report.json.",
|
||||
|
||||
@@ -24,6 +24,7 @@ const screenshotIntervalMs = positiveInteger(process.env.UNIDESK_WEB_OBSERVE_SCR
|
||||
const screenshotCaptureTimeoutMs = boundedInteger(process.env.UNIDESK_WEB_OBSERVE_SCREENSHOT_CAPTURE_TIMEOUT_MS, 15000, 1000, 120000);
|
||||
const maxSamples = positiveInteger(process.env.UNIDESK_WEB_OBSERVE_MAX_SAMPLES, 0);
|
||||
const observerRefreshIntervalMs = positiveInteger(process.env.UNIDESK_WEB_OBSERVE_OBSERVER_REFRESH_INTERVAL_MS, 180000);
|
||||
const maxRunMs = positiveInteger(process.env.UNIDESK_WEB_OBSERVE_MAX_RUN_MS, 0);
|
||||
const viewport = parseViewport(process.env.UNIDESK_WEB_OBSERVE_VIEWPORT || "1440x900");
|
||||
const browserProxyMode = parseBrowserProxyMode(process.env.UNIDESK_WEB_OBSERVE_BROWSER_PROXY_MODE || "auto");
|
||||
const authLoginMaxAttempts = boundedInteger(process.env.UNIDESK_WEB_OBSERVE_AUTH_LOGIN_MAX_ATTEMPTS, 6, 1, 20);
|
||||
@@ -128,6 +129,10 @@ try {
|
||||
await appendJsonl(files.control, controlRecord({ id: "max-samples", type: "stop", source: "sampler" }, "completed", { reason: "max-samples", maxSamples }));
|
||||
break;
|
||||
}
|
||||
if (maxRunMs > 0 && Date.now() - startedAtMs >= maxRunMs) {
|
||||
await appendJsonl(files.control, controlRecord({ id: "max-run-ms", type: "stop", source: "sampler" }, "completed", { reason: "max-run-ms", maxRunMs, elapsedMs: Date.now() - startedAtMs }));
|
||||
break;
|
||||
}
|
||||
await sleep(sampleIntervalMs);
|
||||
}
|
||||
if (browserFreezeBlocker) {
|
||||
|
||||
@@ -114,6 +114,7 @@ export function runSentinelQuickVerify(state: SentinelCicdState, reason: string,
|
||||
"--target-path", stringAt(scenario, "observeTargetPath"),
|
||||
"--sample-interval-ms", String(sampleIntervalMs),
|
||||
"--screenshot-interval-ms", String(numberAt(scenario, "screenshotIntervalMs")),
|
||||
"--max-run-seconds", String(hardBudgetSeconds),
|
||||
"--command-timeout-seconds", "55",
|
||||
];
|
||||
const viewport = stringAtNullable(scenario, "viewport");
|
||||
@@ -298,7 +299,12 @@ export function runSentinelQuickVerify(state: SentinelCicdState, reason: string,
|
||||
const controlFindings = quickVerifyControlFindings(null, promptIndex, turnSummary, traceFrame);
|
||||
const artifactSummaryRecord = record(artifactSummary);
|
||||
const artifactFindings = Array.isArray(artifactSummaryRecord.findings) ? artifactSummaryRecord.findings.map(record) : [];
|
||||
const findings = mergeFindingRecords(artifactFindings, controlFindings);
|
||||
const cleanupStep = stopQuickVerifyObserver(state, observerId, "observe-stop-after-terminal");
|
||||
const cleanupFindings = quickVerifyCleanupFindings(cleanupStep);
|
||||
const findings = mergeFindingRecords(
|
||||
mergeFindingRecords(artifactFindings, controlFindings),
|
||||
cleanupFindings,
|
||||
);
|
||||
const blockingFindings = findings.filter(isQuickVerifyBlockingFinding);
|
||||
const analysisWarnings = analysis.ok ? [] : ["quick verify analyze command returned non-zero but a readable analysis artifact was produced; targetValidation is using artifact severity plus control blockers."];
|
||||
const ok = record(artifactSummary).ok === true && controlFindings.length === 0 && blockingFindings.length === 0;
|
||||
@@ -320,7 +326,7 @@ export function runSentinelQuickVerify(state: SentinelCicdState, reason: string,
|
||||
failure: controlFindings.length > 0 ? "quick-verify-no-business-turn" : blockingFindings.length > 0 ? "quick-verify-blocking-findings" : null,
|
||||
promptSource: prompts.summary,
|
||||
accountEnv: accountEnv.summary,
|
||||
steps,
|
||||
steps: [...steps, cleanupStep],
|
||||
analysis: artifactSummary,
|
||||
views: {
|
||||
summary: { renderedText: renderQuickVerifySummary({ runId, scenarioId, observerId, artifactSummary, steps, publicOrigin: stringAt(state.publicExposure, "publicBaseUrl") }) },
|
||||
@@ -331,7 +337,7 @@ export function runSentinelQuickVerify(state: SentinelCicdState, reason: string,
|
||||
findings,
|
||||
screenshot: record(artifactSummary).screenshot,
|
||||
publicOrigin: stringAt(state.publicExposure, "publicBaseUrl"),
|
||||
warnings: mergeWarnings(analysisWarnings, nonBlockingCanaryWarnings, elapsedWarnings()),
|
||||
warnings: mergeWarnings(analysisWarnings, nonBlockingCanaryWarnings, cleanupFindings.length > 0 ? ["quick verify observer stop failed; runner lifecycle cleanup is a blocking finding."] : [], elapsedWarnings()),
|
||||
valuesRedacted: true,
|
||||
});
|
||||
}
|
||||
@@ -466,14 +472,7 @@ function finalizeQuickVerifyFailure(state: SentinelCicdState, input: {
|
||||
], 60);
|
||||
cleanupSteps.push({ phase: "observe-cancel-after-failure", ok: cancel.ok, result: cancel.result });
|
||||
}
|
||||
const stop = runChildCli([
|
||||
"web-probe", "observe", "stop", input.observerId,
|
||||
"--node", state.spec.nodeId,
|
||||
"--lane", state.spec.lane,
|
||||
"--force",
|
||||
"--command-timeout-seconds", "55",
|
||||
], 30);
|
||||
cleanupSteps.push({ phase: "observe-stop-after-failure", ok: stop.ok, result: stop.result });
|
||||
cleanupSteps.push(stopQuickVerifyObserver(state, input.observerId, "observe-stop-after-failure"));
|
||||
const analysis = runChildCli([
|
||||
"web-probe", "observe", "analyze", input.observerId,
|
||||
"--node", state.spec.nodeId,
|
||||
@@ -534,6 +533,30 @@ function finalizeQuickVerifyFailure(state: SentinelCicdState, input: {
|
||||
};
|
||||
}
|
||||
|
||||
function stopQuickVerifyObserver(state: SentinelCicdState, observerId: string, phase: string): Record<string, unknown> {
|
||||
const stop = runChildCli([
|
||||
"web-probe", "observe", "stop", observerId,
|
||||
"--node", state.spec.nodeId,
|
||||
"--lane", state.spec.lane,
|
||||
"--force",
|
||||
"--wait-ms", "55000",
|
||||
"--command-timeout-seconds", "55",
|
||||
], 60);
|
||||
return { phase, ok: stop.ok, result: stop.result };
|
||||
}
|
||||
|
||||
function quickVerifyCleanupFindings(cleanupStep: Record<string, unknown>): Record<string, unknown>[] {
|
||||
if (cleanupStep.ok === true) return [];
|
||||
return [{
|
||||
id: "quick-verify-observer-stop-failed",
|
||||
severity: "red",
|
||||
count: 1,
|
||||
summary: "quick verify completed target analysis but failed to stop its observer runner; this can leak Chrome process trees on cadence runs.",
|
||||
cleanupPhase: cleanupStep.phase,
|
||||
valuesRedacted: true,
|
||||
}];
|
||||
}
|
||||
|
||||
function callSentinelService(state: SentinelCicdState, method: "GET" | "POST", pathWithQuery: string, body: Record<string, unknown> | null, timeoutSeconds: number): Record<string, unknown> {
|
||||
const namespace = stringAt(state.runtime, "namespace");
|
||||
const serviceName = stringAt(state.runtime, "serviceName");
|
||||
|
||||
@@ -526,6 +526,7 @@ function buildObserveCommandPlan(config: WebProbeSentinelServiceConfig, scenario
|
||||
"--target-path", targetPath,
|
||||
"--sample-interval-ms", String(numberAt(scenario, "sampleIntervalMs")),
|
||||
"--screenshot-interval-ms", String(numberAt(scenario, "screenshotIntervalMs")),
|
||||
"--max-run-seconds", String(numberAt(scenario, "maxRunSeconds")),
|
||||
"--command-timeout-seconds", "55",
|
||||
];
|
||||
const viewport = stringOrNull(scenario.viewport);
|
||||
@@ -562,7 +563,17 @@ function buildObserveCommandPlan(config: WebProbeSentinelServiceConfig, scenario
|
||||
argv: ["bun", "scripts/cli.ts", "web-probe", "observe", "analyze", "<observerId>"],
|
||||
stdinSource: "none",
|
||||
};
|
||||
return [start, ...commands, analyze];
|
||||
const stop: CommandPlanStep = {
|
||||
phase: "observe-stop",
|
||||
argv: ["bun", "scripts/cli.ts", "web-probe", "observe", "stop", "<observerId>", "--node", config.node, "--lane", config.lane, "--force", "--wait-ms", "55000", "--command-timeout-seconds", "55"],
|
||||
stdinSource: "none",
|
||||
};
|
||||
const gc: CommandPlanStep = {
|
||||
phase: "observe-gc",
|
||||
argv: ["bun", "scripts/cli.ts", "web-probe", "observe", "gc", "--node", config.node, "--lane", config.lane, "--confirm", "--command-timeout-seconds", "55"],
|
||||
stdinSource: "none",
|
||||
};
|
||||
return [start, ...commands, analyze, stop, gc];
|
||||
}
|
||||
|
||||
function inlinePromptText(item: Record<string, unknown>): string | null {
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -116,7 +116,7 @@ export interface NodeWebProbeScreenshotOptions {
|
||||
commandTimeoutSeconds: number;
|
||||
}
|
||||
|
||||
export type NodeWebProbeObserveAction = "start" | "status" | "command" | "stop" | "collect" | "analyze";
|
||||
export type NodeWebProbeObserveAction = "start" | "status" | "command" | "stop" | "collect" | "analyze" | "gc";
|
||||
|
||||
export type NodeWebProbeObserveCommandType =
|
||||
| "login"
|
||||
@@ -176,10 +176,15 @@ export interface NodeWebProbeObserveOptions {
|
||||
screenshotIntervalMs: number;
|
||||
observerRefreshIntervalMs: number;
|
||||
maxSamples: number;
|
||||
maxRunSeconds: number;
|
||||
commandTimeoutSeconds: number;
|
||||
waitMs: number;
|
||||
tailLines: number;
|
||||
maxFiles: number;
|
||||
gcKeepHours: number;
|
||||
gcLimit: number;
|
||||
confirm: boolean;
|
||||
dryRun: boolean;
|
||||
collectView: NodeWebProbeObserveCollectView;
|
||||
collectFile: string | null;
|
||||
collectFinding: string | null;
|
||||
@@ -612,7 +617,7 @@ export async function runNodeDelegatedDomain(config: Config, domain: DelegatedNo
|
||||
if (scoped.originalArgs.includes("--full")) return withNodeRuntimeControlPlaneStatusFullRendered(result, scoped);
|
||||
return withNodeRuntimeControlPlaneStatusRendered(result, scoped);
|
||||
}
|
||||
if (scoped.action === "apply" || scoped.action === "trigger-current" || scoped.action === "refresh" || scoped.action === "sync" || scoped.action === "runtime-migration" || scoped.action === "cleanup-runs") {
|
||||
if (scoped.action === "apply" || scoped.action === "trigger-current" || scoped.action === "refresh" || scoped.action === "sync" || scoped.action === "runtime-migration" || scoped.action === "cleanup-runs" || scoped.action === "cleanup-released-pvs" || scoped.action === "cleanup-legacy-docker-images" || scoped.action === "cleanup-legacy-docker-registry-volume") {
|
||||
if (scoped.confirm && !scoped.dryRun && !scoped.wait) return startNodeDelegatedJob(scoped);
|
||||
return nodeRuntimeControlPlaneRun(scoped);
|
||||
}
|
||||
|
||||
@@ -209,7 +209,7 @@ export function nodeRuntimeUnsupportedAction(scoped: ReturnType<typeof parseNode
|
||||
lane: scoped.lane,
|
||||
mutation: false,
|
||||
degradedReason: "unsupported-node-scoped-runtime-action",
|
||||
message: "node-scoped runtime currently supports plan/status/apply/refresh/sync/trigger-current/cleanup-runs/runtime-image/runtime-migration/source-workspace",
|
||||
message: "node-scoped runtime currently supports plan/status/apply/refresh/sync/trigger-current/cleanup-runs/cleanup-released-pvs/cleanup-legacy-docker-images/cleanup-legacy-docker-registry-volume/runtime-image/runtime-migration/source-workspace",
|
||||
expected: nodeRuntimeExpected(scoped.spec),
|
||||
};
|
||||
}
|
||||
|
||||
@@ -219,8 +219,9 @@ export function parseNodeWebProbeObserveOptions(
|
||||
&& observeActionRaw !== "stop"
|
||||
&& observeActionRaw !== "collect"
|
||||
&& observeActionRaw !== "analyze"
|
||||
&& observeActionRaw !== "gc"
|
||||
) {
|
||||
throw new Error("web-probe observe usage: observe start --node NODE --lane vNN [...]; observe status|command|stop|collect|analyze <id> [...]");
|
||||
throw new Error("web-probe observe usage: observe start --node NODE --lane vNN [...]; observe status|command|stop|collect|analyze <id> [...]; observe gc --node NODE --lane vNN [--confirm]");
|
||||
}
|
||||
assertKnownOptions(args.slice(1), new Set([
|
||||
"--node",
|
||||
@@ -233,6 +234,9 @@ export function parseNodeWebProbeObserveOptions(
|
||||
"--screenshot-interval-ms",
|
||||
"--observer-refresh-interval-ms",
|
||||
"--max-samples",
|
||||
"--max-run-seconds",
|
||||
"--keep-hours",
|
||||
"--limit",
|
||||
"--command-timeout-seconds",
|
||||
"--wait-ms",
|
||||
"--tail-lines",
|
||||
@@ -284,7 +288,7 @@ export function parseNodeWebProbeObserveOptions(
|
||||
"--workspace-root",
|
||||
"--workspace-root-ref",
|
||||
"--root",
|
||||
]), new Set(["--force", "--full", "--raw", "--text-stdin", "--require-composer-ready", "--wait-project-management-ready", "--blocking", "--non-blocking"]));
|
||||
]), new Set(["--force", "--full", "--raw", "--text-stdin", "--require-composer-ready", "--wait-project-management-ready", "--blocking", "--non-blocking", "--dry-run", "--confirm"]));
|
||||
const commandTypeRaw = optionValue(args, "--type") ?? null;
|
||||
const commandType = commandTypeRaw === null ? null : parseNodeWebProbeObserveCommandType(commandTypeRaw);
|
||||
const stateDir = optionValue(args, "--state-dir") ?? indexed?.stateDir ?? null;
|
||||
@@ -320,9 +324,12 @@ export function parseNodeWebProbeObserveOptions(
|
||||
if (analyzeTailSamples !== null && (!Number.isInteger(analyzeTailSamples) || analyzeTailSamples < 0)) {
|
||||
throw new Error("unsafe web-probe observe --tail-samples: expected a non-negative integer; use 0 for all samples");
|
||||
}
|
||||
if (observeActionRaw !== "start" && stateDir === null && jobId === null) {
|
||||
if (observeActionRaw !== "start" && observeActionRaw !== "gc" && stateDir === null && jobId === null) {
|
||||
throw new Error("web-probe observe status|command|stop|collect|analyze requires --state-dir or --job-id");
|
||||
}
|
||||
const confirm = args.includes("--confirm");
|
||||
const dryRun = args.includes("--dry-run") || !confirm;
|
||||
if (confirm && args.includes("--dry-run")) throw new Error("web-probe observe gc accepts only one of --confirm or --dry-run");
|
||||
const commandTextOption = optionValue(args, "--text") ?? null;
|
||||
const commandTextFromStdin = args.includes("--text-stdin");
|
||||
if (commandTextFromStdin && observeActionRaw !== "command") {
|
||||
@@ -403,10 +410,15 @@ export function parseNodeWebProbeObserveOptions(
|
||||
screenshotIntervalMs: positiveIntegerOption(args, "--screenshot-interval-ms", 300000, 86_400_000),
|
||||
observerRefreshIntervalMs: positiveIntegerOption(args, "--observer-refresh-interval-ms", 180000, 86_400_000),
|
||||
maxSamples: positiveIntegerOption(args, "--max-samples", 0, 10_000_000),
|
||||
maxRunSeconds: positiveIntegerOption(args, "--max-run-seconds", 0, 86_400),
|
||||
commandTimeoutSeconds: positiveIntegerOption(args, "--command-timeout-seconds", 55, 3600),
|
||||
waitMs: positiveIntegerOption(args, "--wait-ms", 0, 600000),
|
||||
tailLines: positiveIntegerOption(args, "--tail-lines", 5, 200),
|
||||
maxFiles: positiveIntegerOption(args, "--max-files", 80, 5000),
|
||||
gcKeepHours: positiveIntegerOption(args, "--keep-hours", webObserveGcDefaultKeepHours(), 8760),
|
||||
gcLimit: positiveIntegerOption(args, "--limit", 200, 5000),
|
||||
confirm,
|
||||
dryRun,
|
||||
collectView,
|
||||
collectFile,
|
||||
collectFinding,
|
||||
@@ -555,6 +567,18 @@ export function assertKnownOptions(args: string[], valueOptions: Set<string>, fl
|
||||
}
|
||||
}
|
||||
|
||||
function webObserveGcDefaultKeepHours(): number {
|
||||
const configPath = "config/unidesk-cli.yaml";
|
||||
const parsed = record(Bun.YAML.parse(readFileSync(rootPath(configPath), "utf8")) as unknown);
|
||||
const gc = record(parsed.gc);
|
||||
const scratch = record(gc.stateStaleScratch);
|
||||
const keepHours = scratch.keepHours;
|
||||
if (!Number.isInteger(keepHours) || keepHours < 0) {
|
||||
throw new Error(`${configPath}#gc.stateStaleScratch.keepHours must be a non-negative integer`);
|
||||
}
|
||||
return keepHours;
|
||||
}
|
||||
|
||||
export function runNodeWebProbe(options: NodeWebProbeOptions): Record<string, unknown> | RenderedCliResult {
|
||||
const lane = options.lane;
|
||||
if (!isHwlabRuntimeLane(lane)) throw new Error(`web-probe only supports HWLAB runtime lanes, got ${lane}`);
|
||||
@@ -1305,9 +1329,353 @@ export function runNodeWebProbeObserve(
|
||||
if (options.observeAction === "command") return runNodeWebProbeObserveCommand(options, spec, false);
|
||||
if (options.observeAction === "stop") return runNodeWebProbeObserveCommand({ ...options, commandType: "stop" }, spec, true);
|
||||
if (options.observeAction === "collect") return runNodeWebProbeObserveCollect(options, spec);
|
||||
if (options.observeAction === "gc") return runNodeWebProbeObserveGc(options, spec);
|
||||
return runNodeWebProbeObserveAnalyze(options, spec);
|
||||
}
|
||||
|
||||
export function runNodeWebProbeObserveGc(options: NodeWebProbeObserveOptions, spec: HwlabRuntimeLaneSpec): Record<string, unknown> {
|
||||
const stateRoot = `.state/web-observe/${safeWebObserveSegment(options.node)}/${safeWebObserveSegment(options.lane)}`;
|
||||
const script = [
|
||||
"set -eu",
|
||||
`node - ${shellQuote(stateRoot)} ${shellQuote(String(options.gcKeepHours))} ${shellQuote(String(options.gcLimit))} ${shellQuote(options.confirm ? "run" : "plan")} ${shellQuote(options.node)} ${shellQuote(options.lane)} <<'UNIDESK_WEB_OBSERVE_GC'`,
|
||||
nodeWebObserveGcNodeScript(),
|
||||
"UNIDESK_WEB_OBSERVE_GC",
|
||||
].join("\n");
|
||||
const result = runTransWorkspaceStdinScript(options.node, spec.workspace, script, options.commandTimeoutSeconds);
|
||||
const payload = parseJsonObject(result.stdout);
|
||||
const ok = result.exitCode === 0 && payload.ok !== false;
|
||||
return {
|
||||
ok,
|
||||
status: ok ? (options.confirm ? "cleaned" : "planned") : "blocked",
|
||||
command: `web-probe observe gc --node ${options.node} --lane ${options.lane}`,
|
||||
node: options.node,
|
||||
lane: options.lane,
|
||||
workspace: spec.workspace,
|
||||
stateRoot,
|
||||
mode: options.confirm ? "confirmed-run" : "dry-run",
|
||||
retention: {
|
||||
source: "config/unidesk-cli.yaml#gc.stateStaleScratch.keepHours",
|
||||
keepHours: options.gcKeepHours,
|
||||
userOverride: options.gcKeepHours !== webObserveGcDefaultKeepHours(),
|
||||
},
|
||||
gc: options.full || options.raw ? payload : compactWebObserveGcPayload(payload),
|
||||
result: ok
|
||||
? {
|
||||
exitCode: result.exitCode,
|
||||
timedOut: result.timedOut,
|
||||
stdoutBytes: result.stdout.length,
|
||||
stderrBytes: result.stderr.length,
|
||||
}
|
||||
: compactCommandResultWithStdoutTail(result),
|
||||
valuesRedacted: true,
|
||||
};
|
||||
}
|
||||
|
||||
function compactWebObserveGcPayload(payload: Record<string, unknown>): Record<string, unknown> {
|
||||
const candidates = Array.isArray(payload.candidates) ? payload.candidates.map(record) : [];
|
||||
const protectedRuns = Array.isArray(payload.protected) ? payload.protected.map(record) : [];
|
||||
const deleted = Array.isArray(payload.deleted) ? payload.deleted.map(record) : [];
|
||||
const failures = Array.isArray(payload.failures) ? payload.failures.map(record) : [];
|
||||
return {
|
||||
ok: payload.ok ?? null,
|
||||
mode: payload.mode ?? null,
|
||||
mutation: payload.mutation ?? null,
|
||||
keepHours: payload.keepHours ?? null,
|
||||
limit: payload.limit ?? null,
|
||||
diskBefore: payload.diskBefore ?? null,
|
||||
diskAfter: payload.diskAfter ?? null,
|
||||
scannedRuns: payload.scannedRuns ?? null,
|
||||
candidateCount: payload.candidateCount ?? null,
|
||||
selectedCount: payload.selectedCount ?? null,
|
||||
deferredCount: payload.deferredCount ?? null,
|
||||
protectedCount: payload.protectedCount ?? null,
|
||||
estimatedReclaimBytes: payload.estimatedReclaimBytes ?? null,
|
||||
estimatedReclaimHuman: payload.estimatedReclaimHuman ?? null,
|
||||
reclaimedBytes: payload.reclaimedBytes ?? null,
|
||||
reclaimedHuman: payload.reclaimedHuman ?? null,
|
||||
candidates: candidates.slice(0, 20).map((item) => ({
|
||||
id: item.id ?? null,
|
||||
runDir: item.runDir ?? null,
|
||||
status: item.status ?? null,
|
||||
ageHours: item.ageHours ?? null,
|
||||
rawHuman: item.rawHuman ?? null,
|
||||
artifactCount: item.artifactCount ?? null,
|
||||
})),
|
||||
deleted: deleted.slice(0, 20).map((item) => ({
|
||||
id: item.id ?? null,
|
||||
runDir: item.runDir ?? null,
|
||||
rawHuman: item.rawHuman ?? null,
|
||||
artifactCount: item.artifactCount ?? null,
|
||||
})),
|
||||
failures: failures.slice(0, 20),
|
||||
protected: protectedRuns.slice(0, 20).map((item) => ({
|
||||
id: item.id ?? null,
|
||||
runDir: item.runDir ?? null,
|
||||
status: item.status ?? null,
|
||||
ageHours: item.ageHours ?? null,
|
||||
rawHuman: item.rawHuman ?? null,
|
||||
reasons: item.reasons ?? null,
|
||||
})),
|
||||
disclosure: {
|
||||
candidateRowsShown: Math.min(20, candidates.length),
|
||||
deletedRowsShown: Math.min(20, deleted.length),
|
||||
protectedRowsShown: Math.min(20, protectedRuns.length),
|
||||
full: "rerun with --full or --raw for all candidates/artifacts",
|
||||
},
|
||||
valuesRedacted: true,
|
||||
};
|
||||
}
|
||||
|
||||
function nodeWebObserveGcNodeScript(): string {
|
||||
return String.raw`
|
||||
const fs = require("fs");
|
||||
const path = require("path");
|
||||
const childProcess = require("child_process");
|
||||
|
||||
const stateRoot = process.argv[2];
|
||||
const keepHours = Number(process.argv[3]);
|
||||
const limit = Number(process.argv[4]);
|
||||
const mode = process.argv[5] === "run" ? "run" : "plan";
|
||||
const nodeId = process.argv[6];
|
||||
const lane = process.argv[7];
|
||||
const keepMs = keepHours * 60 * 60 * 1000;
|
||||
const nowMs = Date.now();
|
||||
const rawNames = ["samples.jsonl", "browser-process.jsonl", "network.jsonl", "console.jsonl", "artifacts.jsonl", "screenshots"];
|
||||
|
||||
function jsonRead(file) {
|
||||
try { return JSON.parse(fs.readFileSync(file, "utf8")); } catch { return null; }
|
||||
}
|
||||
|
||||
function parseTime(value) {
|
||||
if (typeof value !== "string") return 0;
|
||||
const ms = Date.parse(value);
|
||||
return Number.isFinite(ms) ? ms : 0;
|
||||
}
|
||||
|
||||
function dirTimeMs(runDir) {
|
||||
const base = path.basename(runDir);
|
||||
const match = base.match(/^(\d{8})T(\d{6})Z_/);
|
||||
if (!match) return 0;
|
||||
const raw = match[1].slice(0, 4) + "-" + match[1].slice(4, 6) + "-" + match[1].slice(6, 8) + "T" + match[2].slice(0, 2) + ":" + match[2].slice(2, 4) + ":" + match[2].slice(4, 6) + "Z";
|
||||
return Date.parse(raw) || 0;
|
||||
}
|
||||
|
||||
function newestKnownTimeMs(runDir, manifest, heartbeat) {
|
||||
return Math.max(
|
||||
dirTimeMs(runDir),
|
||||
parseTime(manifest && manifest.completedAt),
|
||||
parseTime(manifest && manifest.updatedAt),
|
||||
parseTime(manifest && manifest.startedAt),
|
||||
parseTime(manifest && manifest.pageProvenance && manifest.pageProvenance.observedAt),
|
||||
parseTime(heartbeat && heartbeat.completedAt),
|
||||
parseTime(heartbeat && heartbeat.forceStoppedAt),
|
||||
parseTime(heartbeat && heartbeat.updatedAt),
|
||||
parseTime(heartbeat && heartbeat.pageProvenance && heartbeat.pageProvenance.observedAt),
|
||||
);
|
||||
}
|
||||
|
||||
function sizePath(target) {
|
||||
let total = 0;
|
||||
const stack = [target];
|
||||
while (stack.length > 0) {
|
||||
const current = stack.pop();
|
||||
let st;
|
||||
try { st = fs.lstatSync(current); } catch { continue; }
|
||||
if (st.isSymbolicLink()) continue;
|
||||
total += st.blocks ? st.blocks * 512 : st.size;
|
||||
if (!st.isDirectory()) continue;
|
||||
let entries;
|
||||
try { entries = fs.readdirSync(current); } catch { continue; }
|
||||
for (const entry of entries) stack.push(path.join(current, entry));
|
||||
}
|
||||
return total;
|
||||
}
|
||||
|
||||
function human(bytes) {
|
||||
const units = ["B", "KiB", "MiB", "GiB", "TiB"];
|
||||
let value = bytes;
|
||||
let unit = 0;
|
||||
while (value >= 1024 && unit < units.length - 1) { value /= 1024; unit += 1; }
|
||||
return value.toFixed(unit === 0 ? 0 : 1) + units[unit];
|
||||
}
|
||||
|
||||
function disk() {
|
||||
try {
|
||||
const line = childProcess.execFileSync("df", ["-Pk", "/"], { encoding: "utf8" }).trim().split("\n").slice(-1)[0];
|
||||
const parts = line.trim().split(/\s+/);
|
||||
const sizeBytes = Number(parts[1]) * 1024;
|
||||
const usedBytes = Number(parts[2]) * 1024;
|
||||
const availableBytes = Number(parts[3]) * 1024;
|
||||
const usePercent = Number(String(parts[4]).replace("%", ""));
|
||||
return { filesystem: parts[0], sizeBytes, usedBytes, availableBytes, usePercent, usedHuman: human(usedBytes), availableHuman: human(availableBytes) };
|
||||
} catch (error) {
|
||||
return { error: error instanceof Error ? error.message : String(error) };
|
||||
}
|
||||
}
|
||||
|
||||
function findRunDirs(root) {
|
||||
const runs = [];
|
||||
for (const year of safeReadDir(root)) {
|
||||
const y = path.join(root, year.name);
|
||||
if (!year.isDirectory() || !/^\d{4}$/.test(year.name)) continue;
|
||||
for (const month of safeReadDir(y)) {
|
||||
const m = path.join(y, month.name);
|
||||
if (!month.isDirectory() || !/^\d{2}$/.test(month.name)) continue;
|
||||
for (const day of safeReadDir(m)) {
|
||||
const d = path.join(m, day.name);
|
||||
if (!day.isDirectory() || !/^\d{2}$/.test(day.name)) continue;
|
||||
for (const run of safeReadDir(d)) {
|
||||
const r = path.join(d, run.name);
|
||||
if (run.isDirectory() && /_webobs-[A-Za-z0-9_.-]+$/.test(run.name)) runs.push(r);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return runs;
|
||||
}
|
||||
|
||||
function safeReadDir(dir) {
|
||||
try { return fs.readdirSync(dir, { withFileTypes: true }); } catch { return []; }
|
||||
}
|
||||
|
||||
function pidObserverAlive(pid, runAbs) {
|
||||
if (!Number.isInteger(pid) || pid <= 1) return { alive: false, reason: null };
|
||||
try {
|
||||
const cmdline = fs.readFileSync("/proc/" + pid + "/cmdline", "utf8").replace(/\0/g, " ");
|
||||
if (cmdline.includes("observer-runner") || cmdline.includes(runAbs)) return { alive: true, reason: "pid-alive" };
|
||||
return { alive: false, reason: null };
|
||||
} catch (error) {
|
||||
if (fs.existsSync("/proc/" + pid)) return { alive: true, reason: "pid-alive-unreadable" };
|
||||
return { alive: false, reason: null };
|
||||
}
|
||||
}
|
||||
|
||||
function openRunDirs(rootAbs) {
|
||||
const active = new Set();
|
||||
const procEntries = safeReadDir("/proc").filter((entry) => entry.isDirectory() && /^\d+$/.test(entry.name));
|
||||
for (const proc of procEntries) {
|
||||
const fdDir = path.join("/proc", proc.name, "fd");
|
||||
for (const fd of safeReadDir(fdDir)) {
|
||||
let target;
|
||||
try { target = fs.readlinkSync(path.join(fdDir, fd.name)).replace(/ \(deleted\)$/, ""); } catch { continue; }
|
||||
if (!target.startsWith(rootAbs + path.sep)) continue;
|
||||
const rel = path.relative(rootAbs, target).split(path.sep);
|
||||
if (rel.length >= 4) active.add(path.join(rootAbs, rel[0], rel[1], rel[2], rel[3]));
|
||||
}
|
||||
}
|
||||
return active;
|
||||
}
|
||||
|
||||
function rawArtifacts(runDir) {
|
||||
const artifacts = [];
|
||||
for (const name of rawNames) {
|
||||
const target = path.join(runDir, name);
|
||||
if (!fs.existsSync(target)) continue;
|
||||
const bytes = sizePath(target);
|
||||
if (bytes > 0) artifacts.push({ name, path: target, bytes, human: human(bytes) });
|
||||
}
|
||||
return artifacts;
|
||||
}
|
||||
|
||||
function removeArtifact(artifact) {
|
||||
const st = fs.lstatSync(artifact.path);
|
||||
if (st.isSymbolicLink()) throw new Error("refusing symlink: " + artifact.path);
|
||||
if (st.isDirectory()) fs.rmSync(artifact.path, { recursive: true, force: false });
|
||||
else fs.unlinkSync(artifact.path);
|
||||
}
|
||||
|
||||
const rootAbs = path.resolve(stateRoot);
|
||||
const diskBefore = disk();
|
||||
const openDirs = openRunDirs(rootAbs);
|
||||
const candidates = [];
|
||||
const protectedRuns = [];
|
||||
|
||||
for (const runDir of findRunDirs(rootAbs)) {
|
||||
const manifest = jsonRead(path.join(runDir, "manifest.json"));
|
||||
const heartbeat = jsonRead(path.join(runDir, "heartbeat.json"));
|
||||
const pid = Number((heartbeat && heartbeat.pid) || (manifest && manifest.pid) || 0);
|
||||
const pidState = pidObserverAlive(pid, runDir);
|
||||
const hasOpenFd = openDirs.has(runDir);
|
||||
const newestMs = newestKnownTimeMs(runDir, manifest, heartbeat);
|
||||
const ageMs = newestMs > 0 ? nowMs - newestMs : 0;
|
||||
const artifacts = rawArtifacts(runDir);
|
||||
const rawBytes = artifacts.reduce((sum, item) => sum + item.bytes, 0);
|
||||
const rel = path.relative(rootAbs, runDir);
|
||||
const reasons = [];
|
||||
if (!manifest) reasons.push("manifest-missing");
|
||||
if (pidState.alive) reasons.push(pidState.reason || "pid-alive");
|
||||
if (hasOpenFd) reasons.push("open-fd");
|
||||
if (newestMs <= 0) reasons.push("time-unknown");
|
||||
if (ageMs < keepMs) reasons.push("retention-window");
|
||||
if (rawBytes <= 0) reasons.push("no-raw-artifacts");
|
||||
const summary = {
|
||||
id: (manifest && manifest.jobId) || path.basename(runDir).match(/(webobs-[A-Za-z0-9_.-]+)$/)?.[1] || null,
|
||||
runDir: rel,
|
||||
status: heartbeat && heartbeat.status || null,
|
||||
pid: Number.isInteger(pid) && pid > 0 ? pid : null,
|
||||
newestAt: newestMs > 0 ? new Date(newestMs).toISOString() : null,
|
||||
ageHours: newestMs > 0 ? Number((ageMs / 3600000).toFixed(2)) : null,
|
||||
rawBytes,
|
||||
rawHuman: human(rawBytes),
|
||||
artifacts,
|
||||
};
|
||||
if (reasons.length > 0) protectedRuns.push({ ...summary, reasons });
|
||||
else candidates.push(summary);
|
||||
}
|
||||
|
||||
candidates.sort((a, b) => b.rawBytes - a.rawBytes);
|
||||
const selected = candidates.slice(0, limit);
|
||||
const selectedSet = new Set(selected.map((item) => item.runDir));
|
||||
const deferred = candidates.filter((item) => !selectedSet.has(item.runDir));
|
||||
let reclaimedBytes = 0;
|
||||
const deleted = [];
|
||||
const failures = [];
|
||||
|
||||
if (mode === "run") {
|
||||
for (const item of selected) {
|
||||
const runDir = path.join(rootAbs, item.runDir);
|
||||
try {
|
||||
for (const artifact of item.artifacts) removeArtifact(artifact);
|
||||
reclaimedBytes += item.rawBytes;
|
||||
deleted.push({ id: item.id, runDir: item.runDir, rawBytes: item.rawBytes, rawHuman: item.rawHuman, artifactCount: item.artifacts.length });
|
||||
} catch (error) {
|
||||
failures.push({ id: item.id, runDir: item.runDir, error: error instanceof Error ? error.message : String(error) });
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
const estimatedReclaimBytes = selected.reduce((sum, item) => sum + item.rawBytes, 0);
|
||||
const diskAfter = mode === "run" ? disk() : null;
|
||||
console.log(JSON.stringify({
|
||||
ok: failures.length === 0,
|
||||
command: "web-probe observe gc",
|
||||
node: nodeId,
|
||||
lane,
|
||||
stateRoot,
|
||||
mode,
|
||||
mutation: mode === "run",
|
||||
keepHours,
|
||||
limit,
|
||||
diskBefore,
|
||||
diskAfter,
|
||||
scannedRuns: candidates.length + protectedRuns.length,
|
||||
candidateCount: candidates.length,
|
||||
selectedCount: selected.length,
|
||||
deferredCount: deferred.length,
|
||||
protectedCount: protectedRuns.length,
|
||||
estimatedReclaimBytes,
|
||||
estimatedReclaimHuman: human(estimatedReclaimBytes),
|
||||
reclaimedBytes,
|
||||
reclaimedHuman: human(reclaimedBytes),
|
||||
candidates: selected.map((item) => ({ id: item.id, runDir: item.runDir, status: item.status, ageHours: item.ageHours, rawBytes: item.rawBytes, rawHuman: item.rawHuman, artifactCount: item.artifacts.length, artifacts: item.artifacts.map((artifact) => ({ name: artifact.name, bytes: artifact.bytes, human: artifact.human })) })),
|
||||
deleted,
|
||||
failures,
|
||||
protected: protectedRuns.slice(0, Math.min(50, protectedRuns.length)).map((item) => ({ id: item.id, runDir: item.runDir, status: item.status, ageHours: item.ageHours, rawBytes: item.rawBytes, rawHuman: item.rawHuman, reasons: item.reasons })),
|
||||
valuesRedacted: true,
|
||||
}, null, 2));
|
||||
`;
|
||||
}
|
||||
|
||||
export function runNodeWebProbeObserveStart(
|
||||
options: NodeWebProbeObserveOptions,
|
||||
spec: HwlabRuntimeLaneSpec,
|
||||
@@ -1343,6 +1711,7 @@ export function runNodeWebProbeObserveStart(
|
||||
`UNIDESK_WEB_OBSERVE_SCREENSHOT_INTERVAL_MS=${shellQuote(String(options.screenshotIntervalMs))}`,
|
||||
`UNIDESK_WEB_OBSERVE_OBSERVER_REFRESH_INTERVAL_MS=${shellQuote(String(options.observerRefreshIntervalMs))}`,
|
||||
`UNIDESK_WEB_OBSERVE_MAX_SAMPLES=${shellQuote(String(options.maxSamples))}`,
|
||||
`UNIDESK_WEB_OBSERVE_MAX_RUN_MS=${shellQuote(String(options.maxRunSeconds > 0 ? options.maxRunSeconds * 1000 : 0))}`,
|
||||
`UNIDESK_WEB_OBSERVE_VIEWPORT=${shellQuote(options.viewport)}`,
|
||||
`UNIDESK_WEB_OBSERVE_BROWSER_PROXY_MODE=${shellQuote(options.browserProxyMode)}`,
|
||||
`UNIDESK_WEB_OBSERVE_ALERT_THRESHOLDS_JSON=${shellQuote(JSON.stringify(alertThresholds))}`,
|
||||
|
||||
Reference in New Issue
Block a user