fix: mark web sentinel frontend freezes red
This commit is contained in:
@@ -180,6 +180,10 @@ export interface HwlabRuntimeWebProbeAlertThresholdsSpec {
|
||||
readonly visibleLoadingSlowMs: number;
|
||||
readonly turnTimingSampleSlackSeconds: number;
|
||||
readonly turnElapsedSevereTimeoutSeconds: number;
|
||||
readonly domEvaluateTimeoutRedCount: number;
|
||||
readonly domEvaluateTimeoutRedWindowMs: number;
|
||||
readonly screenshotTimeoutRedCount: number;
|
||||
readonly pageErrorRedCount: number;
|
||||
readonly uncommandedStateChangeCommandWindowMs: number;
|
||||
readonly scrollJumpCommandWindowMs: number;
|
||||
readonly scrollJumpFromY: number;
|
||||
@@ -1176,6 +1180,10 @@ function webProbeAlertThresholdsConfig(value: unknown, path: string): HwlabRunti
|
||||
visibleLoadingSlowMs: positiveNumberField(raw, "visibleLoadingSlowMs", path),
|
||||
turnTimingSampleSlackSeconds: positiveNumberField(raw, "turnTimingSampleSlackSeconds", path),
|
||||
turnElapsedSevereTimeoutSeconds: positiveNumberField(raw, "turnElapsedSevereTimeoutSeconds", path),
|
||||
domEvaluateTimeoutRedCount: positiveNumberField(raw, "domEvaluateTimeoutRedCount", path),
|
||||
domEvaluateTimeoutRedWindowMs: positiveNumberField(raw, "domEvaluateTimeoutRedWindowMs", path),
|
||||
screenshotTimeoutRedCount: positiveNumberField(raw, "screenshotTimeoutRedCount", path),
|
||||
pageErrorRedCount: positiveNumberField(raw, "pageErrorRedCount", path),
|
||||
uncommandedStateChangeCommandWindowMs: positiveNumberField(raw, "uncommandedStateChangeCommandWindowMs", path),
|
||||
scrollJumpCommandWindowMs: positiveNumberField(raw, "scrollJumpCommandWindowMs", path),
|
||||
scrollJumpFromY: positiveNumberField(raw, "scrollJumpFromY", path),
|
||||
|
||||
@@ -557,6 +557,10 @@ function parseAlertThresholds(value) {
|
||||
visibleLoadingSlowMs: requiredPositiveThreshold(raw, "visibleLoadingSlowMs"),
|
||||
turnTimingSampleSlackSeconds: requiredPositiveThreshold(raw, "turnTimingSampleSlackSeconds"),
|
||||
turnElapsedSevereTimeoutSeconds: requiredPositiveThreshold(raw, "turnElapsedSevereTimeoutSeconds"),
|
||||
domEvaluateTimeoutRedCount: requiredPositiveThreshold(raw, "domEvaluateTimeoutRedCount"),
|
||||
domEvaluateTimeoutRedWindowMs: requiredPositiveThreshold(raw, "domEvaluateTimeoutRedWindowMs"),
|
||||
screenshotTimeoutRedCount: requiredPositiveThreshold(raw, "screenshotTimeoutRedCount"),
|
||||
pageErrorRedCount: requiredPositiveThreshold(raw, "pageErrorRedCount"),
|
||||
uncommandedStateChangeCommandWindowMs: requiredPositiveThreshold(raw, "uncommandedStateChangeCommandWindowMs"),
|
||||
scrollJumpCommandWindowMs: requiredPositiveThreshold(raw, "scrollJumpCommandWindowMs"),
|
||||
scrollJumpFromY: requiredPositiveThreshold(raw, "scrollJumpFromY"),
|
||||
@@ -2725,6 +2729,7 @@ function buildFindings(samples, control, network, errors, sampleMetrics, promptN
|
||||
const findings = [];
|
||||
const effectiveApiDomLag = apiDomLag || buildApiDomLagReport(samples, network);
|
||||
if (commandFailures.length > 0) findings.push({ id: "observer-command-failed", severity: "red", summary: "observer control commands failed; analyze must surface command failure instead of hiding it in command artifacts", count: commandFailures.length, commands: commandFailures.slice(0, 20) });
|
||||
findings.push(...buildFrontendFreezeFindings(errors, control));
|
||||
findings.push(...buildControlledNavigationRootCauseFindings(control, manifest));
|
||||
findings.push(...buildSessionInvariantFindings(control, manifest));
|
||||
const commandTimes = control
|
||||
@@ -3023,6 +3028,198 @@ function buildFindings(samples, control, network, errors, sampleMetrics, promptN
|
||||
return findings;
|
||||
}
|
||||
|
||||
/**
 * Turns recorded error entries into red "frontend freeze" findings.
 *
 * Each error entry is normalised with `frontendFreezeErrorEvent`; entries that
 * cannot be classified, or that fall inside a window returned by
 * `stopCommandWindows`, are ignored. The remaining events are then checked
 * against four burst rules, always in this order: control-page DOM evaluate
 * timeouts, observer-page DOM evaluate timeouts, screenshot timeouts, and
 * page errors. A rule yields one finding when `firstBurst` finds enough events
 * inside the configured window.
 *
 * Thresholds are read from `alertThresholds` in the enclosing scope. Note that
 * the screenshot and page-error rules reuse `domEvaluateTimeoutRedWindowMs` as
 * their window; only their counts are rule specific.
 *
 * @param errors  Recorded error entries (may be null/undefined).
 * @param control Recorded control entries (may be null/undefined).
 * @returns Array of findings, possibly empty.
 */
function buildFrontendFreezeFindings(errors, control) {
  // Timestamps (ms) of completed sendPrompt commands, ascending; used to tag
  // each event with a prompt index.
  const promptTimes = (control || []).flatMap((entry) => {
    if (entry.type !== "sendPrompt" || entry.phase !== "completed") return [];
    const completedAt = Date.parse(entry.ts);
    return Number.isFinite(completedAt) ? [completedAt] : [];
  });
  promptTimes.sort((a, b) => a - b);

  const stopWindows = stopCommandWindows(control);
  const freezeEvents = (errors || []).flatMap((entry) => {
    const event = frontendFreezeErrorEvent(entry, promptTimes);
    return event && !errorInsideStopWindow(event, stopWindows) ? [event] : [];
  });

  const domTimeouts = freezeEvents.filter((event) => event.kind === "dom-evaluate-timeout");
  const sharedWindowMs = alertThresholds.domEvaluateTimeoutRedWindowMs;

  // Ordered rule table: the order here is the order findings are emitted in.
  const rules = [
    {
      id: "frontend-control-dom-evaluate-timeout-red",
      summary: "control page DOM evaluation timed out repeatedly; treat the browser page as frozen and keep the sentinel red instead of refreshing or falling back",
      // Events with no known page role are attributed to the control page.
      candidates: domTimeouts.filter((event) => event.pageRole === "control" || event.pageRole === null),
      thresholdCount: alertThresholds.domEvaluateTimeoutRedCount,
      windowMs: sharedWindowMs,
      pageRole: "control",
    },
    {
      id: "frontend-observer-dom-evaluate-timeout-red",
      summary: "observer page DOM evaluation timed out repeatedly; the observer page is frozen and later periodic refresh evidence must not clear this run",
      candidates: domTimeouts.filter((event) => event.pageRole === "observer"),
      thresholdCount: alertThresholds.domEvaluateTimeoutRedCount,
      windowMs: sharedWindowMs,
      pageRole: "observer",
    },
    {
      id: "frontend-screenshot-timeout-red",
      summary: "browser screenshot capture timed out repeatedly; this is freeze evidence and the sentinel must stay red until investigated",
      candidates: freezeEvents.filter((event) => event.kind === "screenshot-timeout"),
      thresholdCount: alertThresholds.screenshotTimeoutRedCount,
      windowMs: sharedWindowMs,
      pageRole: null,
    },
    {
      id: "frontend-page-error-red",
      summary: "browser pageerror entries exceeded the YAML threshold; page runtime exceptions are blocking when repeated in the observation window",
      candidates: freezeEvents.filter((event) => event.kind === "page-error"),
      thresholdCount: alertThresholds.pageErrorRedCount,
      windowMs: sharedWindowMs,
      pageRole: null,
    },
  ];

  const findings = [];
  for (const rule of rules) {
    const burst = firstBurst(rule.candidates, rule.thresholdCount, rule.windowMs);
    if (!burst) continue;
    findings.push(frontendFreezeBurstFinding({
      id: rule.id,
      summary: rule.summary,
      burst,
      thresholdCount: rule.thresholdCount,
      windowMs: rule.windowMs,
      pageRole: rule.pageRole,
    }));
  }
  return findings;
}
|
||||
|
||||
/**
 * Normalises one recorded error entry into a freeze-event record.
 *
 * Returns null when the entry has no parseable `ts` or when
 * `classifyFrontendFreezeError` does not recognise it. Context fields
 * (page role, page id, session ids, command id, sample sequence) are taken
 * from the entry itself first and from `error.details` second; the page role
 * additionally falls back to the prefix of the entry type.
 *
 * @param item        Recorded error entry (may be null/undefined).
 * @param promptTimes Prompt timestamps forwarded to `promptIndexForTs`.
 * @returns The normalised event object, or null.
 */
function frontendFreezeErrorEvent(item, promptTimes) {
  const details = objectValue(item?.error?.details);
  const rawMessage = item?.error?.message ?? item?.message ?? item?.error ?? "";
  const message = String(rawMessage);
  const type = String(item?.type || "");

  const tsMs = Date.parse(String(item?.ts || ""));
  if (!Number.isFinite(tsMs)) return null;

  const kind = classifyFrontendFreezeError(type, message);
  if (!kind) return null;

  // Entry-level value wins; error.details is the fallback source.
  const contextString = (key) => stringOrNull(item?.[key]) ?? stringOrNull(details[key]);

  return {
    ts: item.ts ?? null,
    tsMs,
    promptIndex: promptIndexForTs(promptTimes, item.ts),
    kind,
    type: item.type ?? null,
    pageRole: contextString("pageRole") ?? pageRoleFromErrorType(type),
    pageId: contextString("pageId"),
    routeSessionId: contextString("routeSessionId"),
    activeSessionId: contextString("activeSessionId"),
    commandId: contextString("commandId"),
    sampleSeq: numberOrNull(item?.sampleSeq ?? details.sampleSeq),
    timeoutMs: timeoutMsFromMessage(message),
    // Hash only non-empty messages; an empty message has nothing to identify.
    messageHash: message ? sha256(message) : null,
    preview: limitText(message, 240),
    valuesRedacted: true,
  };
}
|
||||
|
||||
/**
 * Infers the page role from an error type's prefix.
 *
 * @param type Error type; anything falsy is treated as an empty string.
 * @returns "control" for types starting with "control-", "observer" for types
 *          starting with "observer-" (both case-insensitive), otherwise null.
 */
function pageRoleFromErrorType(type) {
  const prefix = /^(?:(control)|(observer))-/iu.exec(String(type || ""));
  if (!prefix) return null;
  // Return the canonical lowercase role, not the matched text.
  return prefix[1] ? "control" : "observer";
}
|
||||
|
||||
/**
 * Classifies an error entry as a kind of frontend-freeze evidence.
 *
 * Checks run in priority order and the first match wins:
 *  1. "dom-evaluate-timeout": the message reports a sampleOnePage DOM
 *     evaluate overrun and the type is a control/observer sample error.
 *  2. "screenshot-timeout": type or message mentions a screenshot and the
 *     message mentions a timeout.
 *  3. "page-error": the type names a page error / uncaught / unhandled
 *     rejection, or the message starts with a built-in error name and colon
 *     (this last check is case-sensitive).
 *
 * @param type    Error type string.
 * @param message Error message; anything falsy is treated as empty.
 * @returns One of the kinds above, or null when nothing matches.
 */
function classifyFrontendFreezeError(type, message) {
  const text = String(message || "");

  const domEvaluateOverran = /sampleOnePage\s+DOM\s+evaluate\s+exceeded/iu.test(text);
  if (domEvaluateOverran && /(?:control|observer)-sample-error/iu.test(type)) {
    return "dom-evaluate-timeout";
  }

  const mentionsScreenshot = /screenshot|captureScreenshot|page\.screenshot/iu.test(type + " " + text);
  if (mentionsScreenshot && /timeout|timed\s*out|exceeded/iu.test(text)) {
    return "screenshot-timeout";
  }

  const typeNamesPageError = /pageerror|uncaught|unhandledrejection/iu.test(type);
  const messageLooksLikeException = /^(?:Error|TypeError|ReferenceError|RangeError|SyntaxError):/u.test(text);
  return typeNamesPageError || messageLooksLikeException ? "page-error" : null;
}
|
||||
|
||||
/**
 * Finds the earliest run of events that is dense enough to count as a burst.
 *
 * Events without a finite `tsMs` are dropped and the rest are ordered by
 * `tsMs` on a copy (the input array is not reordered). The function then
 * slides a window of exactly `thresholdCount` consecutive events over that
 * timeline and returns the first window whose first-to-last spread is at most
 * `windowMs`.
 *
 * @param events         Objects carrying a numeric `tsMs` (may be null).
 * @param thresholdCount Events required; floored and clamped to at least 1.
 * @param windowMs       Maximum spread in ms; clamped to at least 1.
 * @returns The matching slice (exactly the required number of events), or
 *          null when no window qualifies.
 */
function firstBurst(events, thresholdCount, windowMs) {
  const needed = Math.max(1, Math.floor(Number(thresholdCount || 0)));
  const spreadLimitMs = Math.max(1, Number(windowMs || 0));

  const timeline = (events || []).filter((entry) => Number.isFinite(entry?.tsMs));
  timeline.sort((left, right) => left.tsMs - right.tsMs);

  let head = 0;
  while (head + needed <= timeline.length) {
    const tail = head + needed - 1;
    const spreadMs = timeline[tail].tsMs - timeline[head].tsMs;
    if (spreadMs <= spreadLimitMs) return timeline.slice(head, tail + 1);
    head += 1;
  }
  return null;
}
|
||||
|
||||
/**
 * Builds the red finding object for one detected burst of freeze events.
 *
 * The finding always has severity "red" and fixed root-cause / next-action
 * fields. It summarises the burst (count, first and last timestamp, distinct
 * page and session ids via `uniqueStrings`, largest timeout via
 * `maxPresentNumber`) and embeds a per-event projection of the burst.
 *
 * @param options.id             Finding identifier.
 * @param options.summary        Human-readable summary text.
 * @param options.burst          Events making up the burst, in order.
 * @param options.thresholdCount Count threshold that was applied.
 * @param options.windowMs       Window (ms) that was applied.
 * @param options.pageRole       Page role the finding is attributed to, or null.
 * @returns The finding object.
 */
function frontendFreezeBurstFinding({ id, summary, burst, thresholdCount, windowMs, pageRole }) {
  const earliest = burst[0];
  const latest = burst[burst.length - 1];

  const collectIds = (key) => uniqueStrings(burst.map((entry) => entry[key]));
  const pageIds = collectIds("pageId");
  const routeSessionIds = collectIds("routeSessionId");
  const activeSessionIds = collectIds("activeSessionId");

  // Per-event projection: only the whitelisted fields are copied across.
  const projectEvent = (entry) => ({
    ts: entry.ts,
    promptIndex: entry.promptIndex,
    type: entry.type,
    pageRole: entry.pageRole,
    pageId: entry.pageId,
    routeSessionId: entry.routeSessionId,
    activeSessionId: entry.activeSessionId,
    commandId: entry.commandId,
    sampleSeq: entry.sampleSeq,
    timeoutMs: entry.timeoutMs,
    messageHash: entry.messageHash,
    preview: entry.preview,
    valuesRedacted: true,
  });

  return {
    id,
    severity: "red",
    summary,
    count: burst.length,
    thresholdCount,
    windowMs,
    firstAt: earliest?.ts ?? null,
    lastAt: latest?.ts ?? null,
    pageRole,
    pageIds,
    routeSessionIds,
    activeSessionIds,
    timeoutMsMax: maxPresentNumber(burst.map((entry) => entry.timeoutMs)),
    rootCause: "frontend_page_freeze_or_runtime_exception",
    rootCauseStatus: "confirmed-from-browser-observer-errors",
    rootCauseConfidence: "high",
    fallbackAllowed: false,
    observerRefreshMayNotClear: true,
    nextAction: "Keep this run red; do not auto-refresh, fallback, or mark healthy until OTel/browser evidence explains why the page stopped responding.",
    events: burst.map((entry) => projectEvent(entry)),
    valuesRedacted: true,
  };
}
|
||||
|
||||
/**
 * Computes the time windows around stop-like control commands.
 *
 * A control entry qualifies when its `type` (or, if that is empty, its
 * `command`) is exactly stop, forceStop, cancel or close, case-insensitively.
 * Each qualifying entry with a parseable `ts` yields a window from 1 second
 * before to 10 seconds after that timestamp.
 *
 * @param control Recorded control entries (may be null/undefined).
 * @returns Array of `{ fromMs, toMs }` windows in input order.
 */
function stopCommandWindows(control) {
  const stopLikeName = /^(?:stop|forceStop|cancel|close)$/iu;
  const windows = [];
  for (const entry of control || []) {
    const name = String(entry?.type || entry?.command || "");
    if (!stopLikeName.test(name)) continue;
    const atMs = Date.parse(String(entry?.ts || ""));
    if (!Number.isFinite(atMs)) continue;
    windows.push({ fromMs: atMs - 1000, toMs: atMs + 10000 });
  }
  return windows;
}
|
||||
|
||||
/**
 * Tells whether an event's timestamp falls inside any of the given windows.
 *
 * @param event   Object with a numeric `tsMs`.
 * @param windows Array of `{ fromMs, toMs }` (may be null/undefined).
 * @returns true when `tsMs` lies within at least one window, bounds inclusive.
 */
function errorInsideStopWindow(event, windows) {
  const covers = ({ fromMs, toMs }) => fromMs <= event.tsMs && event.tsMs <= toMs;
  return (windows || []).some(covers);
}
|
||||
|
||||
/**
 * Extracts a millisecond duration from an error message.
 *
 * A number that directly follows "exceeded", "timeout" or "timed out after"
 * is preferred; failing that, the first "<number>ms" anywhere in the text is
 * used. Only numbers with at least two digits are considered.
 *
 * @param value Message text; anything falsy is treated as empty.
 * @returns The duration as a number, or null when none is found.
 */
function timeoutMsFromMessage(value) {
  const text = String(value || "");
  // Ordered from most to least specific; the first pattern that hits wins.
  const patterns = [
    /\b(?:exceeded|timeout|timed\s*out\s*after)\s+(\d{2,})\s*ms\b/iu,
    /\b(\d{2,})\s*ms\b/iu,
  ];
  for (const pattern of patterns) {
    const hit = text.match(pattern);
    if (hit) return Number(hit[1]);
  }
  return null;
}
|
||||
|
||||
/**
 * Collects the distinct non-empty strings from a list, in first-seen order.
 *
 * @param values Candidate values; non-strings and empty strings are ignored
 *               (the list itself may be null/undefined).
 * @returns At most the first 12 distinct strings.
 */
function uniqueStrings(values) {
  const seen = new Set();
  for (const candidate of values || []) {
    if (typeof candidate === "string" && candidate.length > 0) seen.add(candidate);
  }
  return [...seen].slice(0, 12);
}
|
||||
|
||||
/**
 * Returns the largest finite number present in a list, or null when there is
 * none.
 *
 * Only real numbers and non-blank numeric strings count as "present".
 * null, undefined, blank strings, booleans and other values are skipped
 * rather than coerced: `Number("")` and `Number(false)` are both 0, which
 * would otherwise be reported as a genuine value (and could even beat a
 * negative maximum).
 *
 * The maximum is found in a single pass instead of `Math.max(...list)`, so
 * very large inputs cannot exceed the engine's argument-count limit.
 *
 * @param values Candidate values (the list itself may be null/undefined).
 * @returns The maximum finite number, or null if no entry qualifies.
 */
function maxPresentNumber(values) {
  let best = null;
  for (const item of values || []) {
    let candidate;
    if (typeof item === "number") {
      candidate = item;
    } else if (typeof item === "string" && item.trim() !== "") {
      candidate = Number(item);
    } else {
      continue;
    }
    // Drops NaN and +/-Infinity, including strings that do not parse.
    if (!Number.isFinite(candidate)) continue;
    if (best === null || candidate > best) best = candidate;
  }
  return best;
}
|
||||
|
||||
function buildRecentAnalysisWindow({ samples, control, network, consoleEvents, errors, manifest }) {
|
||||
const latestSampleMs = latestTimestampMs(samples);
|
||||
const windowMs = 5 * 60 * 1000;
|
||||
@@ -3745,6 +3942,13 @@ function buildRuntimeAlerts(samples, control, network, consoleEvents, errors) {
|
||||
ts: item.ts ?? null,
|
||||
promptIndex: promptIndexForTs(promptTimes, item.ts),
|
||||
type: item.type ?? null,
|
||||
pageRole: item.pageRole ?? item.error?.details?.pageRole ?? null,
|
||||
pageId: item.pageId ?? item.error?.details?.pageId ?? null,
|
||||
routeSessionId: item.routeSessionId ?? item.error?.details?.routeSessionId ?? null,
|
||||
activeSessionId: item.activeSessionId ?? item.error?.details?.activeSessionId ?? null,
|
||||
commandId: item.commandId ?? item.error?.details?.commandId ?? null,
|
||||
sampleSeq: item.sampleSeq ?? item.error?.details?.sampleSeq ?? null,
|
||||
timeoutMs: timeoutMsFromMessage(item.error?.message || item.message || item.error || ""),
|
||||
errorName: item.error?.name ?? item.name ?? null,
|
||||
messageHash: item.error?.message ? sha256(item.error.message) : item.message ? sha256(item.message) : null,
|
||||
preview: limitText(item.error?.message || item.message || item.error || "", 220)
|
||||
|
||||
@@ -4709,6 +4709,10 @@ function parseAlertThresholds(value) {
|
||||
visibleLoadingSlowMs: requiredPositiveThreshold(raw, "visibleLoadingSlowMs"),
|
||||
turnTimingSampleSlackSeconds: requiredPositiveThreshold(raw, "turnTimingSampleSlackSeconds"),
|
||||
turnElapsedSevereTimeoutSeconds: requiredPositiveThreshold(raw, "turnElapsedSevereTimeoutSeconds"),
|
||||
domEvaluateTimeoutRedCount: requiredPositiveThreshold(raw, "domEvaluateTimeoutRedCount"),
|
||||
domEvaluateTimeoutRedWindowMs: requiredPositiveThreshold(raw, "domEvaluateTimeoutRedWindowMs"),
|
||||
screenshotTimeoutRedCount: requiredPositiveThreshold(raw, "screenshotTimeoutRedCount"),
|
||||
pageErrorRedCount: requiredPositiveThreshold(raw, "pageErrorRedCount"),
|
||||
uncommandedStateChangeCommandWindowMs: requiredPositiveThreshold(raw, "uncommandedStateChangeCommandWindowMs"),
|
||||
scrollJumpCommandWindowMs: requiredPositiveThreshold(raw, "scrollJumpCommandWindowMs"),
|
||||
scrollJumpFromY: requiredPositiveThreshold(raw, "scrollJumpFromY"),
|
||||
|
||||
Reference in New Issue
Block a user