fix(web-probe): isolate performance capture cdp timeouts
This commit is contained in:
@@ -31,6 +31,7 @@ export function nodeWebObserveRunnerControlSource(): string {
|
||||
} finally {
|
||||
stopCommandSampler();
|
||||
activeCommandId = null;
|
||||
activeCommandType = null;
|
||||
await writeHeartbeat({ status: terminalStatus });
|
||||
}
|
||||
}
|
||||
@@ -79,7 +80,8 @@ function startCommandActiveSampler(command) {
|
||||
async function processCommand(command) {
|
||||
commandSeq += 1;
|
||||
activeCommandId = command.id;
|
||||
await writeHeartbeat({ status: "running", activeCommandId });
|
||||
activeCommandType = command.type;
|
||||
await writeHeartbeat({ status: "running", activeCommandId, activeCommandType });
|
||||
await appendJsonl(files.control, controlRecord(command, "started", commandInputSummary(command)));
|
||||
switch (command.type) {
|
||||
case "login": return authenticate(context);
|
||||
|
||||
@@ -246,6 +246,7 @@ async function capturePerformanceProfile(command) {
|
||||
const startedAt = new Date(startedAtMs).toISOString();
|
||||
let pageClockStart = null;
|
||||
let stopped = null;
|
||||
let captureError = null;
|
||||
try {
|
||||
pageClockStart = await withHardTimeout(targetPage.evaluate(() => ({ timeOrigin: Math.round(performance.timeOrigin || 0), now: Math.round(performance.now()), url: location.href, path: location.pathname, title: document.title, valuesRedacted: true })), timeoutMs, "performanceCapture page clock exceeded " + timeoutMs + "ms")
|
||||
.catch((error) => ({ error: errorSummary(error), valuesRedacted: true }));
|
||||
@@ -265,6 +266,8 @@ async function capturePerformanceProfile(command) {
|
||||
}));
|
||||
await sleep(durationMs);
|
||||
stopped = await withHardTimeout(session.send("Profiler.stop"), Math.max(timeoutMs, 5000), "Profiler.stop exceeded " + Math.max(timeoutMs, 5000) + "ms");
|
||||
} catch (error) {
|
||||
captureError = error;
|
||||
} finally {
|
||||
if (session) {
|
||||
await withHardTimeout(session.send("Profiler.disable"), 1000, "Profiler.disable exceeded 1000ms").catch(() => {});
|
||||
@@ -274,10 +277,67 @@ async function capturePerformanceProfile(command) {
|
||||
const completedAtMs = Date.now();
|
||||
const afterDrain = await drainPagePerformanceEvents(targetPage, { reason: "performanceCapture-after", groupSeq: sampleSeq, pageRole: targetPageRole, targetPageId, pageEpoch: targetPageEpoch })
|
||||
.catch((error) => ({ ok: false, error: errorSummary(error), count: 0, valuesRedacted: true }));
|
||||
const summaryFile = path.join(captureDir, "summary.json");
|
||||
if (captureError) {
|
||||
const failureKind = isTimeoutErrorMessage(captureError?.message) ? "performance-capture-cdp-timeout" : "performance-capture-failed";
|
||||
const summaryPayload = {
|
||||
ok: false,
|
||||
captureId,
|
||||
type: "performance-cpu-profile",
|
||||
commandId: command.id,
|
||||
label: truncate(command.label || "", 200),
|
||||
startedAt,
|
||||
completedAt: new Date(completedAtMs).toISOString(),
|
||||
durationMs: completedAtMs - startedAtMs,
|
||||
requestedDurationMs: durationMs,
|
||||
pageRole: targetPageRole,
|
||||
pageId: targetPageId,
|
||||
pageEpoch: targetPageEpoch,
|
||||
pageClockStart,
|
||||
currentUrl: pageUrl(targetPage),
|
||||
beforeDrain,
|
||||
afterDrain,
|
||||
failureKind,
|
||||
error: errorSummary(captureError),
|
||||
valuesRedacted: true,
|
||||
};
|
||||
await writeFile(summaryFile, JSON.stringify(summaryPayload, null, 2) + "\n", { mode: 0o600 });
|
||||
const summaryMeta = await fileMeta(summaryFile);
|
||||
artifactSeq += 1;
|
||||
const artifact = {
|
||||
seq: artifactSeq,
|
||||
sampleSeq,
|
||||
ts: new Date().toISOString(),
|
||||
kind: "performance-cpu-profile-failed",
|
||||
captureId,
|
||||
commandId: command.id,
|
||||
summaryPath: summaryFile,
|
||||
summaryByteCount: summaryMeta.byteCount,
|
||||
summarySha256: summaryMeta.sha256,
|
||||
pageRole: targetPageRole,
|
||||
pageId: targetPageId,
|
||||
durationMs: summaryPayload.durationMs,
|
||||
failureKind,
|
||||
valuesRedacted: true,
|
||||
};
|
||||
await appendJsonl(files.artifacts, artifact);
|
||||
await appendJsonl(files.performanceEvents, eventRecord("performance-capture-failed", {
|
||||
captureId,
|
||||
pageRole: targetPageRole,
|
||||
pageId: targetPageId,
|
||||
pageEpoch: targetPageEpoch,
|
||||
artifact,
|
||||
failureKind,
|
||||
error: summaryPayload.error,
|
||||
valuesRedacted: true,
|
||||
}));
|
||||
const wrapped = captureError instanceof Error ? captureError : new Error(String(captureError));
|
||||
wrapped.details = summaryPayload;
|
||||
throw wrapped;
|
||||
}
|
||||
const profile = stopped?.profile || null;
|
||||
const summary = summarizeCpuProfile(profile);
|
||||
const profileFile = path.join(captureDir, "profile.cpuprofile");
|
||||
const summaryFile = path.join(captureDir, "summary.json");
|
||||
const summaryPayload = {
|
||||
ok: true,
|
||||
captureId,
|
||||
|
||||
@@ -160,6 +160,7 @@ async function collectBrowserProcessSample(reason) {
|
||||
|
||||
async function enforceBrowserFreezePolicy(sample) {
|
||||
if (browserFreezePolicy.enabled !== true || browserFreezeBlocker) return;
|
||||
const suppressRuntimeProbeFreeze = activeCommandType === "performanceCapture";
|
||||
const processSummary = sample && typeof sample.process === "object" ? sample.process : {};
|
||||
const growth = sample && typeof sample.growth === "object" ? sample.growth : {};
|
||||
const totalRssMb = Number(processSummary.totalRssMb);
|
||||
@@ -228,7 +229,7 @@ async function enforceBrowserFreezePolicy(sample) {
|
||||
const responsiveness = pageMetric?.responsiveness && typeof pageMetric.responsiveness === "object" ? pageMetric.responsiveness : {};
|
||||
const responsivenessLatencyMs = Number(responsiveness.latencyMs);
|
||||
if (responsiveness.timeout === true || (Number.isFinite(responsivenessLatencyMs) && responsivenessLatencyMs >= browserFreezePolicy.responsiveness.latencyBlockerMs)) {
|
||||
const signal = recordBrowserFreezeSignal("playwright-responsiveness", sample, pageMetric, {
|
||||
const detail = {
|
||||
rootCause: "frontend_browser_page_unresponsive_to_playwright",
|
||||
observed: {
|
||||
responsivenessLatencyMs: Number.isFinite(responsivenessLatencyMs) ? responsivenessLatencyMs : null,
|
||||
@@ -241,10 +242,15 @@ async function enforceBrowserFreezePolicy(sample) {
|
||||
windowMs: browserFreezePolicy.blockerWindowMs,
|
||||
valuesRedacted: true,
|
||||
},
|
||||
});
|
||||
if (signal.burst.length >= browserFreezePolicy.responsiveness.eventBlockerCount) {
|
||||
await triggerBrowserFreezeBlocker(signal);
|
||||
return;
|
||||
};
|
||||
if (suppressRuntimeProbeFreeze) {
|
||||
await appendBrowserFreezeSuppressedSignal("playwright-responsiveness", sample, pageMetric, detail, "performance-capture-active");
|
||||
} else {
|
||||
const signal = recordBrowserFreezeSignal("playwright-responsiveness", sample, pageMetric, detail);
|
||||
if (signal.burst.length >= browserFreezePolicy.responsiveness.eventBlockerCount) {
|
||||
await triggerBrowserFreezeBlocker(signal);
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
const cdp = pageMetric?.cdp && typeof pageMetric.cdp === "object" ? pageMetric.cdp : {};
|
||||
@@ -253,7 +259,7 @@ async function enforceBrowserFreezePolicy(sample) {
|
||||
const sessionTimeoutCount = calls.length === 0 ? Number(cdp.timeoutCount || 0) : 0;
|
||||
const metricTimeoutCount = metricTimeoutCalls.length + (Number.isFinite(sessionTimeoutCount) ? sessionTimeoutCount : 0);
|
||||
if (metricTimeoutCount > 0) {
|
||||
const signal = recordBrowserFreezeSignal("cdp-metrics-timeout", sample, pageMetric, {
|
||||
const detail = {
|
||||
rootCause: "frontend_browser_cdp_metrics_unresponsive",
|
||||
observed: {
|
||||
cdpMetricsTimeoutCount: metricTimeoutCount,
|
||||
@@ -265,15 +271,35 @@ async function enforceBrowserFreezePolicy(sample) {
|
||||
windowMs: browserFreezePolicy.blockerWindowMs,
|
||||
valuesRedacted: true,
|
||||
},
|
||||
});
|
||||
if (signal.burst.length >= browserFreezePolicy.cdp.metricsTimeoutBlockerCount) {
|
||||
await triggerBrowserFreezeBlocker(signal);
|
||||
return;
|
||||
};
|
||||
if (suppressRuntimeProbeFreeze) {
|
||||
await appendBrowserFreezeSuppressedSignal("cdp-metrics-timeout", sample, pageMetric, detail, "performance-capture-active");
|
||||
} else {
|
||||
const signal = recordBrowserFreezeSignal("cdp-metrics-timeout", sample, pageMetric, detail);
|
||||
if (signal.burst.length >= browserFreezePolicy.cdp.metricsTimeoutBlockerCount) {
|
||||
await triggerBrowserFreezeBlocker(signal);
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Logs a "browser-freeze-signal-suppressed" event to the browser-process
// JSONL file instead of recording the signal toward a freeze blocker.
//
//   kind       - signal kind string copied into the record (the visible call
//                sites pass "playwright-responsiveness" / "cdp-metrics-timeout")
//   sample     - browser process sample; stored only as browserProcessSampleRef(sample)
//   pageMetric - per-page metric; stored only as browserPageMetricRef(pageMetric)
//   detail     - optional object; its rootCause / observed / threshold fields
//                are copied, each falling back to null when absent
//   reason     - why the signal was suppressed
//
// The currently active command id and type are captured from module state.
// A rejected write is intentionally ignored so that this bookkeeping never
// fails the caller; the function always resolves to undefined in that case.
async function appendBrowserFreezeSuppressedSignal(kind, sample, pageMetric, detail, reason) {
  const destination = files.browserProcess;

  // Missing detail (or missing fields) are normalised to explicit nulls.
  const rootCause = detail?.rootCause ?? null;
  const observed = detail?.observed ?? null;
  const threshold = detail?.threshold ?? null;

  const suppressedEvent = eventRecord("browser-freeze-signal-suppressed", {
    kind: kind,
    reason: reason,
    activeCommandId: activeCommandId,
    activeCommandType: activeCommandType,
    rootCause: rootCause,
    observed: observed,
    threshold: threshold,
    sample: browserProcessSampleRef(sample),
    page: browserPageMetricRef(pageMetric),
    valuesRedacted: true,
  });

  // Only the asynchronous write failure is swallowed; errors thrown while
  // building the record above still propagate, as before.
  const pendingWrite = appendJsonl(destination, suppressedEvent);
  await pendingWrite.catch(() => {});
}
|
||||
|
||||
function recordBrowserFreezeSignal(kind, sample, pageMetric, detail) {
|
||||
const tsMs = Date.parse(String(sample?.ts || ""));
|
||||
const signal = {
|
||||
|
||||
@@ -79,6 +79,7 @@ let sampleSeq = 0;
|
||||
let commandSeq = 0;
|
||||
let artifactSeq = 0;
|
||||
let activeCommandId = null;
|
||||
let activeCommandType = null;
|
||||
let stopping = false;
|
||||
let terminalStatus = "starting";
|
||||
let lastScreenshotAtMs = 0;
|
||||
|
||||
Reference in New Issue
Block a user