From 99ed2e950237f6d109ed8166d8c766c8dbfe69dc Mon Sep 17 00:00:00 2001 From: Codex Date: Thu, 2 Jul 2026 21:14:05 +0000 Subject: [PATCH] fix(web-probe): isolate performance capture cdp timeouts --- ...-node-web-observe-runner-control-source.ts | 4 +- ...e-web-observe-runner-performance-source.ts | 62 ++++++++++++++++++- ...-node-web-observe-runner-runtime-source.ts | 46 +++++++++++--- .../hwlab-node-web-observe-runner-source.ts | 1 + 4 files changed, 101 insertions(+), 12 deletions(-) diff --git a/scripts/src/hwlab-node-web-observe-runner-control-source.ts b/scripts/src/hwlab-node-web-observe-runner-control-source.ts index 74bacd4e..0b37d351 100644 --- a/scripts/src/hwlab-node-web-observe-runner-control-source.ts +++ b/scripts/src/hwlab-node-web-observe-runner-control-source.ts @@ -31,6 +31,7 @@ export function nodeWebObserveRunnerControlSource(): string { } finally { stopCommandSampler(); activeCommandId = null; + activeCommandType = null; await writeHeartbeat({ status: terminalStatus }); } } @@ -79,7 +80,8 @@ function startCommandActiveSampler(command) { async function processCommand(command) { commandSeq += 1; activeCommandId = command.id; - await writeHeartbeat({ status: "running", activeCommandId }); + activeCommandType = command.type; + await writeHeartbeat({ status: "running", activeCommandId, activeCommandType }); await appendJsonl(files.control, controlRecord(command, "started", commandInputSummary(command))); switch (command.type) { case "login": return authenticate(context); diff --git a/scripts/src/hwlab-node-web-observe-runner-performance-source.ts b/scripts/src/hwlab-node-web-observe-runner-performance-source.ts index d56ef14a..be0cf785 100644 --- a/scripts/src/hwlab-node-web-observe-runner-performance-source.ts +++ b/scripts/src/hwlab-node-web-observe-runner-performance-source.ts @@ -246,6 +246,7 @@ async function capturePerformanceProfile(command) { const startedAt = new Date(startedAtMs).toISOString(); let pageClockStart = null; let stopped = null; + let captureError = null; try { pageClockStart = await withHardTimeout(targetPage.evaluate(() => ({ timeOrigin: Math.round(performance.timeOrigin || 0), now: Math.round(performance.now()), url: location.href, path: location.pathname, title: document.title, valuesRedacted: true })), timeoutMs, "performanceCapture page clock exceeded " + timeoutMs + "ms") .catch((error) => ({ error: errorSummary(error), valuesRedacted: true })); @@ -265,6 +266,8 @@ async function capturePerformanceProfile(command) { })); await sleep(durationMs); stopped = await withHardTimeout(session.send("Profiler.stop"), Math.max(timeoutMs, 5000), "Profiler.stop exceeded " + Math.max(timeoutMs, 5000) + "ms"); + } catch (error) { + captureError = error; } finally { if (session) { await withHardTimeout(session.send("Profiler.disable"), 1000, "Profiler.disable exceeded 1000ms").catch(() => {}); @@ -274,10 +277,67 @@ async function capturePerformanceProfile(command) { const completedAtMs = Date.now(); const afterDrain = await drainPagePerformanceEvents(targetPage, { reason: "performanceCapture-after", groupSeq: sampleSeq, pageRole: targetPageRole, targetPageId, pageEpoch: targetPageEpoch }) .catch((error) => ({ ok: false, error: errorSummary(error), count: 0, valuesRedacted: true })); + const summaryFile = path.join(captureDir, "summary.json"); + if (captureError) { + const failureKind = isTimeoutErrorMessage(captureError?.message) ? "performance-capture-cdp-timeout" : "performance-capture-failed"; + const summaryPayload = { + ok: false, + captureId, + type: "performance-cpu-profile", + commandId: command.id, + label: truncate(command.label || "", 200), + startedAt, + completedAt: new Date(completedAtMs).toISOString(), + durationMs: completedAtMs - startedAtMs, + requestedDurationMs: durationMs, + pageRole: targetPageRole, + pageId: targetPageId, + pageEpoch: targetPageEpoch, + pageClockStart, + currentUrl: pageUrl(targetPage), + beforeDrain, + afterDrain, + failureKind, + error: errorSummary(captureError), + valuesRedacted: true, + }; + await writeFile(summaryFile, JSON.stringify(summaryPayload, null, 2) + "\n", { mode: 0o600 }); + const summaryMeta = await fileMeta(summaryFile); + artifactSeq += 1; + const artifact = { + seq: artifactSeq, + sampleSeq, + ts: new Date().toISOString(), + kind: "performance-cpu-profile-failed", + captureId, + commandId: command.id, + summaryPath: summaryFile, + summaryByteCount: summaryMeta.byteCount, + summarySha256: summaryMeta.sha256, + pageRole: targetPageRole, + pageId: targetPageId, + durationMs: summaryPayload.durationMs, + failureKind, + valuesRedacted: true, + }; + await appendJsonl(files.artifacts, artifact); + await appendJsonl(files.performanceEvents, eventRecord("performance-capture-failed", { + captureId, + pageRole: targetPageRole, + pageId: targetPageId, + pageEpoch: targetPageEpoch, + artifact, + failureKind, + error: summaryPayload.error, + valuesRedacted: true, + })); + const wrapped = captureError instanceof Error ? captureError : new Error(String(captureError)); + wrapped.details = summaryPayload; + throw wrapped; + } const profile = stopped?.profile || null; const summary = summarizeCpuProfile(profile); const profileFile = path.join(captureDir, "profile.cpuprofile"); - const summaryFile = path.join(captureDir, "summary.json"); const summaryPayload = { ok: true, captureId, diff --git a/scripts/src/hwlab-node-web-observe-runner-runtime-source.ts b/scripts/src/hwlab-node-web-observe-runner-runtime-source.ts index f9eb716d..ec9eb4f9 100644 --- a/scripts/src/hwlab-node-web-observe-runner-runtime-source.ts +++ b/scripts/src/hwlab-node-web-observe-runner-runtime-source.ts @@ -160,6 +160,7 @@ async function collectBrowserProcessSample(reason) { async function enforceBrowserFreezePolicy(sample) { if (browserFreezePolicy.enabled !== true || browserFreezeBlocker) return; + const suppressRuntimeProbeFreeze = activeCommandType === "performanceCapture"; const processSummary = sample && typeof sample.process === "object" ? sample.process : {}; const growth = sample && typeof sample.growth === "object" ? sample.growth : {}; const totalRssMb = Number(processSummary.totalRssMb); @@ -228,7 +229,7 @@ async function enforceBrowserFreezePolicy(sample) { const responsiveness = pageMetric?.responsiveness && typeof pageMetric.responsiveness === "object" ? pageMetric.responsiveness : {}; const responsivenessLatencyMs = Number(responsiveness.latencyMs); if (responsiveness.timeout === true || (Number.isFinite(responsivenessLatencyMs) && responsivenessLatencyMs >= browserFreezePolicy.responsiveness.latencyBlockerMs)) { - const signal = recordBrowserFreezeSignal("playwright-responsiveness", sample, pageMetric, { + const detail = { rootCause: "frontend_browser_page_unresponsive_to_playwright", observed: { responsivenessLatencyMs: Number.isFinite(responsivenessLatencyMs) ? responsivenessLatencyMs : null, @@ -241,10 +242,15 @@ async function enforceBrowserFreezePolicy(sample) { windowMs: browserFreezePolicy.blockerWindowMs, valuesRedacted: true, }, - }); - if (signal.burst.length >= browserFreezePolicy.responsiveness.eventBlockerCount) { - await triggerBrowserFreezeBlocker(signal); - return; + }; + if (suppressRuntimeProbeFreeze) { + await appendBrowserFreezeSuppressedSignal("playwright-responsiveness", sample, pageMetric, detail, "performance-capture-active"); + } else { + const signal = recordBrowserFreezeSignal("playwright-responsiveness", sample, pageMetric, detail); + if (signal.burst.length >= browserFreezePolicy.responsiveness.eventBlockerCount) { + await triggerBrowserFreezeBlocker(signal); + return; + } } } const cdp = pageMetric?.cdp && typeof pageMetric.cdp === "object" ? pageMetric.cdp : {}; @@ -253,7 +259,7 @@ async function enforceBrowserFreezePolicy(sample) { const sessionTimeoutCount = calls.length === 0 ? Number(cdp.timeoutCount || 0) : 0; const metricTimeoutCount = metricTimeoutCalls.length + (Number.isFinite(sessionTimeoutCount) ? sessionTimeoutCount : 0); if (metricTimeoutCount > 0) { - const signal = recordBrowserFreezeSignal("cdp-metrics-timeout", sample, pageMetric, { + const detail = { rootCause: "frontend_browser_cdp_metrics_unresponsive", observed: { cdpMetricsTimeoutCount: metricTimeoutCount, @@ -265,15 +271,35 @@ async function enforceBrowserFreezePolicy(sample) { windowMs: browserFreezePolicy.blockerWindowMs, valuesRedacted: true, }, - }); - if (signal.burst.length >= browserFreezePolicy.cdp.metricsTimeoutBlockerCount) { - await triggerBrowserFreezeBlocker(signal); - return; + }; + if (suppressRuntimeProbeFreeze) { + await appendBrowserFreezeSuppressedSignal("cdp-metrics-timeout", sample, pageMetric, detail, "performance-capture-active"); + } else { + const signal = recordBrowserFreezeSignal("cdp-metrics-timeout", sample, pageMetric, detail); + if (signal.burst.length >= browserFreezePolicy.cdp.metricsTimeoutBlockerCount) { + await triggerBrowserFreezeBlocker(signal); + return; + } } } } } +async function appendBrowserFreezeSuppressedSignal(kind, sample, pageMetric, detail, reason) { + await appendJsonl(files.browserProcess, eventRecord("browser-freeze-signal-suppressed", { + kind, + reason, + activeCommandId, + activeCommandType, + rootCause: detail?.rootCause ?? null, + observed: detail?.observed ?? null, + threshold: detail?.threshold ?? null, + sample: browserProcessSampleRef(sample), + page: browserPageMetricRef(pageMetric), + valuesRedacted: true, + })).catch(() => {}); +} + function recordBrowserFreezeSignal(kind, sample, pageMetric, detail) { const tsMs = Date.parse(String(sample?.ts || "")); const signal = { diff --git a/scripts/src/hwlab-node-web-observe-runner-source.ts b/scripts/src/hwlab-node-web-observe-runner-source.ts index 6492d7a3..8d9aaac7 100644 --- a/scripts/src/hwlab-node-web-observe-runner-source.ts +++ b/scripts/src/hwlab-node-web-observe-runner-source.ts @@ -79,6 +79,7 @@ let sampleSeq = 0; let commandSeq = 0; let artifactSeq = 0; let activeCommandId = null; +let activeCommandType = null; let stopping = false; let terminalStatus = "starting"; let lastScreenshotAtMs = 0;