Merge pull request #1276 from pikasTech/fix/1274-sentinel-freeze-red

fix: mark web sentinel frontend freezes red
This commit is contained in:
Lyon
2026-06-30 09:24:02 +08:00
committed by GitHub
4 changed files with 228 additions and 0 deletions
+12
View File
@@ -206,6 +206,10 @@ lanes:
visibleLoadingSlowMs: 10000
turnTimingSampleSlackSeconds: 3
turnElapsedSevereTimeoutSeconds: 120
domEvaluateTimeoutRedCount: 2
domEvaluateTimeoutRedWindowMs: 30000
screenshotTimeoutRedCount: 2
pageErrorRedCount: 2
uncommandedStateChangeCommandWindowMs: 10000
scrollJumpCommandWindowMs: 8000
scrollJumpFromY: 250
@@ -595,6 +599,10 @@ lanes:
visibleLoadingSlowMs: 10000
turnTimingSampleSlackSeconds: 3
turnElapsedSevereTimeoutSeconds: 120
domEvaluateTimeoutRedCount: 2
domEvaluateTimeoutRedWindowMs: 30000
screenshotTimeoutRedCount: 2
pageErrorRedCount: 2
uncommandedStateChangeCommandWindowMs: 10000
scrollJumpCommandWindowMs: 8000
scrollJumpFromY: 250
@@ -914,6 +922,10 @@ lanes:
visibleLoadingSlowMs: 10000
turnTimingSampleSlackSeconds: 3
turnElapsedSevereTimeoutSeconds: 120
domEvaluateTimeoutRedCount: 2
domEvaluateTimeoutRedWindowMs: 30000
screenshotTimeoutRedCount: 2
pageErrorRedCount: 2
uncommandedStateChangeCommandWindowMs: 10000
scrollJumpCommandWindowMs: 8000
scrollJumpFromY: 250
+8
View File
@@ -180,6 +180,10 @@ export interface HwlabRuntimeWebProbeAlertThresholdsSpec {
readonly visibleLoadingSlowMs: number;
readonly turnTimingSampleSlackSeconds: number;
readonly turnElapsedSevereTimeoutSeconds: number;
readonly domEvaluateTimeoutRedCount: number;
readonly domEvaluateTimeoutRedWindowMs: number;
readonly screenshotTimeoutRedCount: number;
readonly pageErrorRedCount: number;
readonly uncommandedStateChangeCommandWindowMs: number;
readonly scrollJumpCommandWindowMs: number;
readonly scrollJumpFromY: number;
@@ -1176,6 +1180,10 @@ function webProbeAlertThresholdsConfig(value: unknown, path: string): HwlabRunti
visibleLoadingSlowMs: positiveNumberField(raw, "visibleLoadingSlowMs", path),
turnTimingSampleSlackSeconds: positiveNumberField(raw, "turnTimingSampleSlackSeconds", path),
turnElapsedSevereTimeoutSeconds: positiveNumberField(raw, "turnElapsedSevereTimeoutSeconds", path),
domEvaluateTimeoutRedCount: positiveNumberField(raw, "domEvaluateTimeoutRedCount", path),
domEvaluateTimeoutRedWindowMs: positiveNumberField(raw, "domEvaluateTimeoutRedWindowMs", path),
screenshotTimeoutRedCount: positiveNumberField(raw, "screenshotTimeoutRedCount", path),
pageErrorRedCount: positiveNumberField(raw, "pageErrorRedCount", path),
uncommandedStateChangeCommandWindowMs: positiveNumberField(raw, "uncommandedStateChangeCommandWindowMs", path),
scrollJumpCommandWindowMs: positiveNumberField(raw, "scrollJumpCommandWindowMs", path),
scrollJumpFromY: positiveNumberField(raw, "scrollJumpFromY", path),
@@ -557,6 +557,10 @@ function parseAlertThresholds(value) {
visibleLoadingSlowMs: requiredPositiveThreshold(raw, "visibleLoadingSlowMs"),
turnTimingSampleSlackSeconds: requiredPositiveThreshold(raw, "turnTimingSampleSlackSeconds"),
turnElapsedSevereTimeoutSeconds: requiredPositiveThreshold(raw, "turnElapsedSevereTimeoutSeconds"),
domEvaluateTimeoutRedCount: requiredPositiveThreshold(raw, "domEvaluateTimeoutRedCount"),
domEvaluateTimeoutRedWindowMs: requiredPositiveThreshold(raw, "domEvaluateTimeoutRedWindowMs"),
screenshotTimeoutRedCount: requiredPositiveThreshold(raw, "screenshotTimeoutRedCount"),
pageErrorRedCount: requiredPositiveThreshold(raw, "pageErrorRedCount"),
uncommandedStateChangeCommandWindowMs: requiredPositiveThreshold(raw, "uncommandedStateChangeCommandWindowMs"),
scrollJumpCommandWindowMs: requiredPositiveThreshold(raw, "scrollJumpCommandWindowMs"),
scrollJumpFromY: requiredPositiveThreshold(raw, "scrollJumpFromY"),
@@ -2725,6 +2729,7 @@ function buildFindings(samples, control, network, errors, sampleMetrics, promptN
const findings = [];
const effectiveApiDomLag = apiDomLag || buildApiDomLagReport(samples, network);
if (commandFailures.length > 0) findings.push({ id: "observer-command-failed", severity: "red", summary: "observer control commands failed; analyze must surface command failure instead of hiding it in command artifacts", count: commandFailures.length, commands: commandFailures.slice(0, 20) });
findings.push(...buildFrontendFreezeFindings(errors, control));
findings.push(...buildControlledNavigationRootCauseFindings(control, manifest));
findings.push(...buildSessionInvariantFindings(control, manifest));
const commandTimes = control
@@ -3023,6 +3028,198 @@ function buildFindings(samples, control, network, errors, sampleMetrics, promptN
return findings;
}
/**
 * Builds red findings for repeated browser-side failures recorded in `errors`.
 *
 * Error records are converted with `frontendFreezeErrorEvent`; records that
 * cannot be classified, or whose timestamp falls inside a window returned by
 * `stopCommandWindows(control)`, are ignored. Four detectors then run in a
 * fixed order (control DOM timeouts, observer DOM timeouts, screenshot
 * timeouts, page errors), each emitting at most one finding for the first
 * burst that `firstBurst` reports.
 *
 * Every detector uses `alertThresholds.domEvaluateTimeoutRedWindowMs` as its
 * time window; only the required event count differs per detector.
 *
 * @param errors  Error records from the run; a falsy value is treated as empty.
 * @param control Control command records; a falsy value is treated as empty.
 * @returns Array of finding objects (possibly empty), in detector order.
 */
function buildFrontendFreezeFindings(errors, control) {
  // Ascending timestamps of completed sendPrompt commands; passed through to
  // frontendFreezeErrorEvent for prompt attribution.
  const completedPrompts = (control || []).filter(
    (entry) => entry.type === "sendPrompt" && entry.phase === "completed",
  );
  const promptTimes = completedPrompts
    .map((entry) => Date.parse(entry.ts))
    .filter((ms) => Number.isFinite(ms));
  promptTimes.sort((left, right) => left - right);

  const stopWindows = stopCommandWindows(control);

  // Keep only classifiable events that are not inside a stop-command window.
  const events = [];
  (errors || []).forEach((raw) => {
    const event = frontendFreezeErrorEvent(raw, promptTimes);
    if (event && !errorInsideStopWindow(event, stopWindows)) events.push(event);
  });

  const ofKind = (kind) => events.filter((event) => event.kind === kind);
  const domTimeouts = ofKind("dom-evaluate-timeout");
  const windowMs = alertThresholds.domEvaluateTimeoutRedWindowMs;

  // One entry per detector; array order is the order findings are emitted in.
  const rules = [
    {
      id: "frontend-control-dom-evaluate-timeout-red",
      summary: "control page DOM evaluation timed out repeatedly; treat the browser page as frozen and keep the sentinel red instead of refreshing or falling back",
      // DOM timeouts without a resolved page role are counted with the control page.
      candidates: domTimeouts.filter((event) => event.pageRole === "control" || event.pageRole === null),
      thresholdCount: alertThresholds.domEvaluateTimeoutRedCount,
      pageRole: "control",
    },
    {
      id: "frontend-observer-dom-evaluate-timeout-red",
      summary: "observer page DOM evaluation timed out repeatedly; the observer page is frozen and later periodic refresh evidence must not clear this run",
      candidates: domTimeouts.filter((event) => event.pageRole === "observer"),
      thresholdCount: alertThresholds.domEvaluateTimeoutRedCount,
      pageRole: "observer",
    },
    {
      id: "frontend-screenshot-timeout-red",
      summary: "browser screenshot capture timed out repeatedly; this is freeze evidence and the sentinel must stay red until investigated",
      candidates: ofKind("screenshot-timeout"),
      thresholdCount: alertThresholds.screenshotTimeoutRedCount,
      pageRole: null,
    },
    {
      id: "frontend-page-error-red",
      summary: "browser pageerror entries exceeded the YAML threshold; page runtime exceptions are blocking when repeated in the observation window",
      candidates: ofKind("page-error"),
      thresholdCount: alertThresholds.pageErrorRedCount,
      pageRole: null,
    },
  ];

  const findings = [];
  for (const rule of rules) {
    const burst = firstBurst(rule.candidates, rule.thresholdCount, windowMs);
    if (!burst) continue;
    findings.push(frontendFreezeBurstFinding({
      id: rule.id,
      summary: rule.summary,
      burst,
      thresholdCount: rule.thresholdCount,
      windowMs,
      pageRole: rule.pageRole,
    }));
  }
  return findings;
}
/**
 * Converts one raw error record into a normalized freeze-evidence event.
 *
 * Returns null when the record's `ts` does not parse to a finite timestamp or
 * when `classifyFrontendFreezeError` does not recognize it. Identifier fields
 * are read from the record first and from `error.details` second; the page
 * role additionally falls back to the prefix of the record type.
 *
 * @param item        Raw error record (may be null/undefined).
 * @param promptTimes Prompt timestamps forwarded to `promptIndexForTs`.
 * @returns The normalized event object, or null when the record is not usable.
 */
function frontendFreezeErrorEvent(item, promptTimes) {
  const details = objectValue(item?.error?.details);
  // Message source priority: error.message, then message, then error itself.
  const message = String(item?.error?.message ?? item?.message ?? item?.error ?? "");
  const type = String(item?.type || "");
  const tsMs = Date.parse(String(item?.ts || ""));
  if (!Number.isFinite(tsMs)) {
    return null;
  }
  const kind = classifyFrontendFreezeError(type, message);
  if (!kind) {
    return null;
  }

  // Reads a string field from the record, falling back to error.details.
  const fromItemOrDetails = (key) => stringOrNull(item?.[key]) ?? stringOrNull(details[key]);

  return {
    ts: item.ts ?? null,
    tsMs,
    promptIndex: promptIndexForTs(promptTimes, item.ts),
    kind,
    type: item.type ?? null,
    pageRole: fromItemOrDetails("pageRole") ?? pageRoleFromErrorType(type),
    pageId: fromItemOrDetails("pageId"),
    routeSessionId: fromItemOrDetails("routeSessionId"),
    activeSessionId: fromItemOrDetails("activeSessionId"),
    commandId: fromItemOrDetails("commandId"),
    sampleSeq: numberOrNull(item?.sampleSeq ?? details.sampleSeq),
    timeoutMs: timeoutMsFromMessage(message),
    // An empty message yields no hash at all.
    messageHash: message ? sha256(message) : null,
    preview: limitText(message, 240),
    valuesRedacted: true,
  };
}
/**
 * Derives the page role from the prefix of an error record type.
 *
 * @param type Error record type; falsy values are treated as the empty string.
 * @returns "control" or "observer" when the type starts with that prefix
 *          followed by "-" (case-insensitive), otherwise null.
 */
function pageRoleFromErrorType(type) {
  const text = String(type || "");
  const prefixes = [
    { role: "control", pattern: /^control-/iu },
    { role: "observer", pattern: /^observer-/iu },
  ];
  const hit = prefixes.find((entry) => entry.pattern.test(text));
  return hit ? hit.role : null;
}
/**
 * Classifies an error record as one of the freeze-evidence kinds.
 *
 * Rules are checked in order and the first match wins:
 *  1. "dom-evaluate-timeout": the message mentions the sampleOnePage DOM
 *     evaluate timeout and the type contains control-/observer-sample-error.
 *  2. "screenshot-timeout": the type or message mentions a screenshot and the
 *     message mentions a timeout.
 *  3. "page-error": the type names a page error / uncaught / unhandled
 *     rejection, or the message starts with a built-in error name and a colon.
 *
 * @param type    Error record type.
 * @param message Error message; falsy values are treated as the empty string.
 * @returns The kind string, or null when no rule matches.
 */
function classifyFrontendFreezeError(type, message) {
  const text = String(message || "");

  const domEvaluateExceeded = /sampleOnePage\s+DOM\s+evaluate\s+exceeded/iu.test(text);
  if (domEvaluateExceeded && /(?:control|observer)-sample-error/iu.test(type)) {
    return "dom-evaluate-timeout";
  }

  const mentionsScreenshot = /screenshot|captureScreenshot|page\.screenshot/iu.test(type + " " + text);
  if (mentionsScreenshot && /timeout|timed\s*out|exceeded/iu.test(text)) {
    return "screenshot-timeout";
  }

  const typeIsPageError = /pageerror|uncaught|unhandledrejection/iu.test(type);
  // The message-prefix check is deliberately case-sensitive (no `i` flag).
  const messageIsRuntimeError = /^(?:Error|TypeError|ReferenceError|RangeError|SyntaxError):/u.test(text);
  if (typeIsPageError || messageIsRuntimeError) {
    return "page-error";
  }

  return null;
}
/**
 * Finds the earliest run of `thresholdCount` events that fits inside `windowMs`.
 *
 * Events without a finite `tsMs` are dropped, the rest are sorted by `tsMs`
 * on a copy (the input array is not reordered), and the first group of
 * consecutive events whose first-to-last span is at most the window is
 * returned. The returned burst always holds exactly the required number of
 * events, even if more events fall inside the same window.
 *
 * @param events         Events carrying a numeric `tsMs`; falsy means empty.
 * @param thresholdCount Required events per burst; floored, minimum 1.
 * @param windowMs       Maximum first-to-last span in milliseconds; minimum 1.
 * @returns The burst as a new array, or null when no such run exists.
 */
function firstBurst(events, thresholdCount, windowMs) {
  const needed = Math.max(1, Math.floor(Number(thresholdCount || 0)));
  const spanLimitMs = Math.max(1, Number(windowMs || 0));

  const timeline = (events || []).filter((event) => Number.isFinite(event?.tsMs));
  timeline.sort((left, right) => left.tsMs - right.tsMs);

  // A burst opens at `index` when the event `needed - 1` places later exists
  // and is still within the allowed span.
  const startIndex = timeline.findIndex((opener, index) => {
    const closer = timeline[index + needed - 1];
    return closer !== undefined && closer.tsMs - opener.tsMs <= spanLimitMs;
  });

  return startIndex === -1 ? null : timeline.slice(startIndex, startIndex + needed);
}
/**
 * Assembles the red finding object for one detected burst of freeze events.
 *
 * @param options.id             Finding identifier.
 * @param options.summary        Human-readable summary text.
 * @param options.burst          Events in the burst, in chronological order.
 * @param options.thresholdCount Event count that triggered the finding.
 * @param options.windowMs       Time window that was applied, in milliseconds.
 * @param options.pageRole       Page role reported on the finding (may be null).
 * @returns A finding with severity "red", the distinct page/session ids seen in
 *          the burst, the largest parsed timeout, fixed root-cause fields, and
 *          a per-event evidence list.
 */
function frontendFreezeBurstFinding({ id, summary, burst, thresholdCount, windowMs, pageRole }) {
  // Distinct non-empty string values of one field across the burst.
  const distinct = (key) => uniqueStrings(burst.map((event) => event[key]));

  const earliest = burst[0];
  const latest = burst[burst.length - 1];

  const finding = {
    id,
    severity: "red",
    summary,
    count: burst.length,
    thresholdCount,
    windowMs,
    firstAt: earliest?.ts ?? null,
    lastAt: latest?.ts ?? null,
    pageRole,
    pageIds: distinct("pageId"),
    routeSessionIds: distinct("routeSessionId"),
    activeSessionIds: distinct("activeSessionId"),
    timeoutMsMax: maxPresentNumber(burst.map((event) => event.timeoutMs)),
    rootCause: "frontend_page_freeze_or_runtime_exception",
    rootCauseStatus: "confirmed-from-browser-observer-errors",
    rootCauseConfidence: "high",
    fallbackAllowed: false,
    observerRefreshMayNotClear: true,
    nextAction: "Keep this run red; do not auto-refresh, fallback, or mark healthy until OTel/browser evidence explains why the page stopped responding.",
    // Evidence entries carry a fixed field subset; tsMs and kind are left out.
    events: burst.map((event) => {
      const {
        ts, promptIndex, type, pageRole: eventPageRole, pageId, routeSessionId,
        activeSessionId, commandId, sampleSeq, timeoutMs, messageHash, preview,
      } = event;
      return {
        ts,
        promptIndex,
        type,
        pageRole: eventPageRole,
        pageId,
        routeSessionId,
        activeSessionId,
        commandId,
        sampleSeq,
        timeoutMs,
        messageHash,
        preview,
        valuesRedacted: true,
      };
    }),
    valuesRedacted: true,
  };
  return finding;
}
/**
 * Computes the time window around each stop-like control command.
 *
 * A record counts when its `type` (or, if that is falsy, its `command`) is
 * exactly stop, forceStop, cancel or close, ignoring case. Records whose `ts`
 * does not parse are skipped.
 *
 * @param control Control command records; a falsy value is treated as empty.
 * @returns Array of `{ fromMs, toMs }`, spanning 1 second before to
 *          10 seconds after each matching command.
 */
function stopCommandWindows(control) {
  return (control || []).flatMap((entry) => {
    const commandName = String(entry?.type || entry?.command || "");
    if (!/^(?:stop|forceStop|cancel|close)$/iu.test(commandName)) {
      return [];
    }
    const atMs = Date.parse(String(entry?.ts || ""));
    if (!Number.isFinite(atMs)) {
      return [];
    }
    return [{ fromMs: atMs - 1000, toMs: atMs + 10000 }];
  });
}
/**
 * Tells whether an event's timestamp lies inside any of the given windows.
 *
 * @param event   Event carrying a numeric `tsMs`.
 * @param windows Array of `{ fromMs, toMs }`; a falsy value is treated as empty.
 * @returns true when `tsMs` is within at least one window, bounds included.
 */
function errorInsideStopWindow(event, windows) {
  const spans = windows || [];
  return spans.some(({ fromMs, toMs }) => fromMs <= event.tsMs && event.tsMs <= toMs);
}
/**
 * Extracts a millisecond duration from an error message.
 *
 * A number that directly follows "exceeded", "timeout" or "timed out after"
 * is preferred; failing that, the first standalone "<digits>ms" token is
 * used. Only numbers with at least two digits are considered.
 *
 * @param value Message text; falsy values are treated as the empty string.
 * @returns The duration as a number, or null when none is found.
 */
function timeoutMsFromMessage(value) {
  const text = String(value || "");
  const patterns = [
    /\b(?:exceeded|timeout|timed\s*out\s*after)\s+(\d{2,})\s*ms\b/iu,
    /\b(\d{2,})\s*ms\b/iu,
  ];
  for (const pattern of patterns) {
    const hit = text.match(pattern);
    if (hit) {
      return Number(hit[1]);
    }
  }
  return null;
}
/**
 * Returns the distinct non-empty strings of `values` in first-seen order,
 * capped at 12 entries. Non-string and empty-string items are ignored.
 *
 * @param values Candidate values; a falsy value is treated as empty.
 * @returns A new array of at most 12 unique strings.
 */
function uniqueStrings(values) {
  const distinct = new Set();
  (values || []).forEach((candidate) => {
    if (typeof candidate === "string" && candidate.length > 0) {
      distinct.add(candidate);
    }
  });
  return [...distinct].slice(0, 12);
}
/**
 * Returns the largest finite number among `values`.
 *
 * null and undefined entries are skipped; every other entry is converted with
 * `Number()` and kept only if the result is finite.
 *
 * @param values Candidate values; a falsy value is treated as empty.
 * @returns The maximum as a number, or null when nothing qualifies.
 */
function maxPresentNumber(values) {
  const finite = [];
  (values || []).forEach((raw) => {
    if (raw === null || raw === undefined) return;
    const asNumber = Number(raw);
    if (Number.isFinite(asNumber)) finite.push(asNumber);
  });
  if (finite.length === 0) {
    return null;
  }
  return finite.reduce((best, current) => Math.max(best, current));
}
function buildRecentAnalysisWindow({ samples, control, network, consoleEvents, errors, manifest }) {
const latestSampleMs = latestTimestampMs(samples);
const windowMs = 5 * 60 * 1000;
@@ -3745,6 +3942,13 @@ function buildRuntimeAlerts(samples, control, network, consoleEvents, errors) {
ts: item.ts ?? null,
promptIndex: promptIndexForTs(promptTimes, item.ts),
type: item.type ?? null,
pageRole: item.pageRole ?? item.error?.details?.pageRole ?? null,
pageId: item.pageId ?? item.error?.details?.pageId ?? null,
routeSessionId: item.routeSessionId ?? item.error?.details?.routeSessionId ?? null,
activeSessionId: item.activeSessionId ?? item.error?.details?.activeSessionId ?? null,
commandId: item.commandId ?? item.error?.details?.commandId ?? null,
sampleSeq: item.sampleSeq ?? item.error?.details?.sampleSeq ?? null,
timeoutMs: timeoutMsFromMessage(item.error?.message || item.message || item.error || ""),
errorName: item.error?.name ?? item.name ?? null,
messageHash: item.error?.message ? sha256(item.error.message) : item.message ? sha256(item.message) : null,
preview: limitText(item.error?.message || item.message || item.error || "", 220)
@@ -4709,6 +4709,10 @@ function parseAlertThresholds(value) {
visibleLoadingSlowMs: requiredPositiveThreshold(raw, "visibleLoadingSlowMs"),
turnTimingSampleSlackSeconds: requiredPositiveThreshold(raw, "turnTimingSampleSlackSeconds"),
turnElapsedSevereTimeoutSeconds: requiredPositiveThreshold(raw, "turnElapsedSevereTimeoutSeconds"),
domEvaluateTimeoutRedCount: requiredPositiveThreshold(raw, "domEvaluateTimeoutRedCount"),
domEvaluateTimeoutRedWindowMs: requiredPositiveThreshold(raw, "domEvaluateTimeoutRedWindowMs"),
screenshotTimeoutRedCount: requiredPositiveThreshold(raw, "screenshotTimeoutRedCount"),
pageErrorRedCount: requiredPositiveThreshold(raw, "pageErrorRedCount"),
uncommandedStateChangeCommandWindowMs: requiredPositiveThreshold(raw, "uncommandedStateChangeCommandWindowMs"),
scrollJumpCommandWindowMs: requiredPositiveThreshold(raw, "scrollJumpCommandWindowMs"),
scrollJumpFromY: requiredPositiveThreshold(raw, "scrollJumpFromY"),