From 1e66cb8de89d21ef62b830e3f2382dffe513ee6f Mon Sep 17 00:00:00 2001 From: Codex Date: Tue, 30 Jun 2026 01:22:46 +0000 Subject: [PATCH] fix: mark web sentinel frontend freezes red ---
/*
 Reviewer notes. This text sits in the patch commentary region (after the
 three-dash separator, before the diffstat) and is not part of any hunk.

 Purpose: adds four alert thresholds and uses them in the web-observe analyzer
 to emit severity "red" findings when browser errors repeat in a short window.

 config/hwlab-node-lanes.yaml
   Three hunks each add, after turnElapsedSevereTimeoutSeconds:
   domEvaluateTimeoutRedCount: 2, domEvaluateTimeoutRedWindowMs: 30000,
   screenshotTimeoutRedCount: 2, pageErrorRedCount: 2.

 scripts/src/hwlab-node-lanes.ts
   HwlabRuntimeWebProbeAlertThresholdsSpec gains the four readonly number
   fields; webProbeAlertThresholdsConfig reads each one with
   positiveNumberField(raw, key, path).

 scripts/src/hwlab-node-web-observe-analyzer-source.ts
   parseAlertThresholds reads the four keys with requiredPositiveThreshold.
   buildFindings now pushes the result of buildFrontendFreezeFindings(errors,
   control) right after the observer-command-failed check.

   buildFrontendFreezeFindings(errors, control)
     Collects timestamps of control items with type "sendPrompt" and phase
     "completed", maps every error through frontendFreezeErrorEvent, drops
     null results and events inside a stop window, then looks for four bursts:
     control-page DOM timeouts (pageRole "control" or null), observer-page DOM
     timeouts, screenshot timeouts and page errors. Each burst found becomes
     one finding. Thresholds are read from alertThresholds, which this patch
     does not define (presumably already in scope in the analyzer; confirm).
     NOTE(review): the screenshot and page-error bursts use their own count
     threshold but reuse domEvaluateTimeoutRedWindowMs as the time window;
     confirm that sharing the window is intended.

   frontendFreezeErrorEvent(item, promptTimes)
     Returns null when item.ts does not parse to a finite time or when
     classifyFrontendFreezeError returns no kind. Otherwise returns a
     normalized event: identifiers are taken from the item first and from
     error.details second; pageRole falls back to pageRoleFromErrorType(type);
     messageHash is sha256(message) for a non-empty message; preview is
     limitText(message, 240).

   pageRoleFromErrorType(type)
     "control" or "observer" when the type starts with control- or observer-
     (case-insensitive), otherwise null.

   classifyFrontendFreezeError(type, message)
     Checked in this order: "dom-evaluate-timeout" (message mentions the
     sampleOnePage DOM evaluate limit being exceeded and the type contains
     control-sample-error or observer-sample-error), "screenshot-timeout"
     (type or message mentions a screenshot and the message mentions a
     timeout), "page-error" (type mentions pageerror, uncaught or
     unhandledrejection, or the message starts with a standard error name and
     a colon). Anything else yields null.

   firstBurst(events, thresholdCount, windowMs)
     Sorts the events that have a finite tsMs in ascending order and returns
     the earliest run of exactly N consecutive events whose first-to-last span
     is at most the window, or null. N is floor(thresholdCount) with a minimum
     of 1; the window has a minimum of 1 ms.
     NOTE(review): because the run is sliced to exactly N events, the finding
     field count equals N even when more events fall inside the window.

   frontendFreezeBurstFinding({ id, summary, burst, thresholdCount, windowMs, pageRole })
     Builds the severity "red" finding: first and last timestamps, unique page
     and session ids, the largest timeoutMs, fixed rootCause and nextAction
     strings, and a per-event list.

   stopCommandWindows(control) and errorInsideStopWindow(event, windows)
     Control items whose type (or command, when type is empty) is stop,
     forceStop, cancel or close (case-insensitive) and whose ts parses produce
     a window from 1000 ms before to 10000 ms after that time. An event is
     inside a window when its tsMs lies within the bounds, inclusive.

   timeoutMsFromMessage(value)
     Returns the first number of two or more digits followed by ms that comes
     after exceeded, timeout or timed out after; failing that, any such number
     followed by ms; otherwise null.

   uniqueStrings(values) keeps non-empty strings, de-duplicates them and caps
   the list at 12. maxPresentNumber(values) returns the largest finite numeric
   entry, or null when there is none.

   Hunk under buildRuntimeAlerts: the per-error object gains pageRole, pageId,
   routeSessionId, activeSessionId, commandId and sampleSeq (from the item,
   else from item.error.details, else null) plus timeoutMs.

 scripts/src/hwlab-node-web-observe-runner-source.ts
   parseAlertThresholds reads the same four keys with requiredPositiveThreshold.

 Helpers referenced but not shown in this patch: objectValue, stringOrNull,
 numberOrNull, promptIndexForTs, sha256, limitText, positiveNumberField,
 requiredPositiveThreshold. Their behavior is assumed from their names and
 call sites only; TODO confirm against the full sources.
*/
config/hwlab-node-lanes.yaml | 12 ++ scripts/src/hwlab-node-lanes.ts | 8 + .../hwlab-node-web-observe-analyzer-source.ts | 204 ++++++++++++++++++ .../hwlab-node-web-observe-runner-source.ts | 4 + 4 files changed, 228 insertions(+) diff --git a/config/hwlab-node-lanes.yaml b/config/hwlab-node-lanes.yaml index 9f33c6fe..b579ab70 100644 --- a/config/hwlab-node-lanes.yaml +++ b/config/hwlab-node-lanes.yaml @@ -206,6 +206,10 @@ lanes: visibleLoadingSlowMs: 10000 turnTimingSampleSlackSeconds: 3 turnElapsedSevereTimeoutSeconds: 120 + domEvaluateTimeoutRedCount: 2 + domEvaluateTimeoutRedWindowMs: 30000 + screenshotTimeoutRedCount: 2 + pageErrorRedCount: 2 uncommandedStateChangeCommandWindowMs: 10000 scrollJumpCommandWindowMs: 8000 scrollJumpFromY: 250 @@ -595,6 +599,10 @@ lanes: visibleLoadingSlowMs: 10000 turnTimingSampleSlackSeconds: 3 turnElapsedSevereTimeoutSeconds: 120 + domEvaluateTimeoutRedCount: 2 + domEvaluateTimeoutRedWindowMs: 30000 + screenshotTimeoutRedCount: 2 + pageErrorRedCount: 2 uncommandedStateChangeCommandWindowMs: 10000 scrollJumpCommandWindowMs: 8000 scrollJumpFromY: 250 @@ -914,6 +922,10 @@ lanes: visibleLoadingSlowMs: 10000 turnTimingSampleSlackSeconds: 3 turnElapsedSevereTimeoutSeconds: 120 + domEvaluateTimeoutRedCount: 2 + domEvaluateTimeoutRedWindowMs: 30000 + screenshotTimeoutRedCount: 2 + pageErrorRedCount: 2 uncommandedStateChangeCommandWindowMs: 10000 scrollJumpCommandWindowMs: 8000 scrollJumpFromY: 250 diff --git a/scripts/src/hwlab-node-lanes.ts b/scripts/src/hwlab-node-lanes.ts index e6b84766..d32e4a64 100644 --- a/scripts/src/hwlab-node-lanes.ts +++ b/scripts/src/hwlab-node-lanes.ts @@ -180,6 +180,10 @@ export interface HwlabRuntimeWebProbeAlertThresholdsSpec { readonly visibleLoadingSlowMs: number; readonly turnTimingSampleSlackSeconds: number; readonly 
turnElapsedSevereTimeoutSeconds: number; + readonly domEvaluateTimeoutRedCount: number; + readonly domEvaluateTimeoutRedWindowMs: number; + readonly screenshotTimeoutRedCount: number; + readonly pageErrorRedCount: number; readonly uncommandedStateChangeCommandWindowMs: number; readonly scrollJumpCommandWindowMs: number; readonly scrollJumpFromY: number; @@ -1176,6 +1180,10 @@ function webProbeAlertThresholdsConfig(value: unknown, path: string): HwlabRunti visibleLoadingSlowMs: positiveNumberField(raw, "visibleLoadingSlowMs", path), turnTimingSampleSlackSeconds: positiveNumberField(raw, "turnTimingSampleSlackSeconds", path), turnElapsedSevereTimeoutSeconds: positiveNumberField(raw, "turnElapsedSevereTimeoutSeconds", path), + domEvaluateTimeoutRedCount: positiveNumberField(raw, "domEvaluateTimeoutRedCount", path), + domEvaluateTimeoutRedWindowMs: positiveNumberField(raw, "domEvaluateTimeoutRedWindowMs", path), + screenshotTimeoutRedCount: positiveNumberField(raw, "screenshotTimeoutRedCount", path), + pageErrorRedCount: positiveNumberField(raw, "pageErrorRedCount", path), uncommandedStateChangeCommandWindowMs: positiveNumberField(raw, "uncommandedStateChangeCommandWindowMs", path), scrollJumpCommandWindowMs: positiveNumberField(raw, "scrollJumpCommandWindowMs", path), scrollJumpFromY: positiveNumberField(raw, "scrollJumpFromY", path), diff --git a/scripts/src/hwlab-node-web-observe-analyzer-source.ts b/scripts/src/hwlab-node-web-observe-analyzer-source.ts index 0cd79651..fd5c5496 100644 --- a/scripts/src/hwlab-node-web-observe-analyzer-source.ts +++ b/scripts/src/hwlab-node-web-observe-analyzer-source.ts @@ -557,6 +557,10 @@ function parseAlertThresholds(value) { visibleLoadingSlowMs: requiredPositiveThreshold(raw, "visibleLoadingSlowMs"), turnTimingSampleSlackSeconds: requiredPositiveThreshold(raw, "turnTimingSampleSlackSeconds"), turnElapsedSevereTimeoutSeconds: requiredPositiveThreshold(raw, "turnElapsedSevereTimeoutSeconds"), + domEvaluateTimeoutRedCount: 
requiredPositiveThreshold(raw, "domEvaluateTimeoutRedCount"), + domEvaluateTimeoutRedWindowMs: requiredPositiveThreshold(raw, "domEvaluateTimeoutRedWindowMs"), + screenshotTimeoutRedCount: requiredPositiveThreshold(raw, "screenshotTimeoutRedCount"), + pageErrorRedCount: requiredPositiveThreshold(raw, "pageErrorRedCount"), uncommandedStateChangeCommandWindowMs: requiredPositiveThreshold(raw, "uncommandedStateChangeCommandWindowMs"), scrollJumpCommandWindowMs: requiredPositiveThreshold(raw, "scrollJumpCommandWindowMs"), scrollJumpFromY: requiredPositiveThreshold(raw, "scrollJumpFromY"), @@ -2725,6 +2729,7 @@ function buildFindings(samples, control, network, errors, sampleMetrics, promptN const findings = []; const effectiveApiDomLag = apiDomLag || buildApiDomLagReport(samples, network); if (commandFailures.length > 0) findings.push({ id: "observer-command-failed", severity: "red", summary: "observer control commands failed; analyze must surface command failure instead of hiding it in command artifacts", count: commandFailures.length, commands: commandFailures.slice(0, 20) }); + findings.push(...buildFrontendFreezeFindings(errors, control)); findings.push(...buildControlledNavigationRootCauseFindings(control, manifest)); findings.push(...buildSessionInvariantFindings(control, manifest)); const commandTimes = control @@ -3023,6 +3028,198 @@ function buildFindings(samples, control, network, errors, sampleMetrics, promptN return findings; } +function buildFrontendFreezeFindings(errors, control) { + const findings = []; + const promptTimes = (control || []) + .filter((item) => item.type === "sendPrompt" && item.phase === "completed") + .map((item) => Date.parse(item.ts)) + .filter(Number.isFinite) + .sort((a, b) => a - b); + const stopWindows = stopCommandWindows(control); + const events = (errors || []) + .map((item) => frontendFreezeErrorEvent(item, promptTimes)) + .filter((item) => item && !errorInsideStopWindow(item, stopWindows)); + const domEvents = 
events.filter((item) => item.kind === "dom-evaluate-timeout"); + const controlDomBurst = firstBurst( + domEvents.filter((item) => item.pageRole === "control" || item.pageRole === null), + alertThresholds.domEvaluateTimeoutRedCount, + alertThresholds.domEvaluateTimeoutRedWindowMs, + ); + if (controlDomBurst) findings.push(frontendFreezeBurstFinding({ + id: "frontend-control-dom-evaluate-timeout-red", + summary: "control page DOM evaluation timed out repeatedly; treat the browser page as frozen and keep the sentinel red instead of refreshing or falling back", + burst: controlDomBurst, + thresholdCount: alertThresholds.domEvaluateTimeoutRedCount, + windowMs: alertThresholds.domEvaluateTimeoutRedWindowMs, + pageRole: "control", + })); + const observerDomBurst = firstBurst( + domEvents.filter((item) => item.pageRole === "observer"), + alertThresholds.domEvaluateTimeoutRedCount, + alertThresholds.domEvaluateTimeoutRedWindowMs, + ); + if (observerDomBurst) findings.push(frontendFreezeBurstFinding({ + id: "frontend-observer-dom-evaluate-timeout-red", + summary: "observer page DOM evaluation timed out repeatedly; the observer page is frozen and later periodic refresh evidence must not clear this run", + burst: observerDomBurst, + thresholdCount: alertThresholds.domEvaluateTimeoutRedCount, + windowMs: alertThresholds.domEvaluateTimeoutRedWindowMs, + pageRole: "observer", + })); + const screenshotBurst = firstBurst( + events.filter((item) => item.kind === "screenshot-timeout"), + alertThresholds.screenshotTimeoutRedCount, + alertThresholds.domEvaluateTimeoutRedWindowMs, + ); + if (screenshotBurst) findings.push(frontendFreezeBurstFinding({ + id: "frontend-screenshot-timeout-red", + summary: "browser screenshot capture timed out repeatedly; this is freeze evidence and the sentinel must stay red until investigated", + burst: screenshotBurst, + thresholdCount: alertThresholds.screenshotTimeoutRedCount, + windowMs: alertThresholds.domEvaluateTimeoutRedWindowMs, + pageRole: null, 
+ })); + const pageErrors = events.filter((item) => item.kind === "page-error"); + const pageErrorBurst = firstBurst(pageErrors, alertThresholds.pageErrorRedCount, alertThresholds.domEvaluateTimeoutRedWindowMs); + if (pageErrorBurst) findings.push(frontendFreezeBurstFinding({ + id: "frontend-page-error-red", + summary: "browser pageerror entries exceeded the YAML threshold; page runtime exceptions are blocking when repeated in the observation window", + burst: pageErrorBurst, + thresholdCount: alertThresholds.pageErrorRedCount, + windowMs: alertThresholds.domEvaluateTimeoutRedWindowMs, + pageRole: null, + })); + return findings; +} + +function frontendFreezeErrorEvent(item, promptTimes) { + const details = objectValue(item?.error?.details); + const message = String(item?.error?.message ?? item?.message ?? item?.error ?? ""); + const type = String(item?.type || ""); + const tsMs = Date.parse(String(item?.ts || "")); + if (!Number.isFinite(tsMs)) return null; + const kind = classifyFrontendFreezeError(type, message); + if (!kind) return null; + return { + ts: item.ts ?? null, + tsMs, + promptIndex: promptIndexForTs(promptTimes, item.ts), + kind, + type: item.type ?? null, + pageRole: stringOrNull(item?.pageRole) ?? stringOrNull(details.pageRole) ?? pageRoleFromErrorType(type), + pageId: stringOrNull(item?.pageId) ?? stringOrNull(details.pageId), + routeSessionId: stringOrNull(item?.routeSessionId) ?? stringOrNull(details.routeSessionId), + activeSessionId: stringOrNull(item?.activeSessionId) ?? stringOrNull(details.activeSessionId), + commandId: stringOrNull(item?.commandId) ?? stringOrNull(details.commandId), + sampleSeq: numberOrNull(item?.sampleSeq ?? details.sampleSeq), + timeoutMs: timeoutMsFromMessage(message), + messageHash: message ? 
sha256(message) : null, + preview: limitText(message, 240), + valuesRedacted: true, + }; +} + +function pageRoleFromErrorType(type) { + const value = String(type || ""); + if (/^control-/iu.test(value)) return "control"; + if (/^observer-/iu.test(value)) return "observer"; + return null; +} + +function classifyFrontendFreezeError(type, message) { + const value = String(message || ""); + if (/sampleOnePage\s+DOM\s+evaluate\s+exceeded/iu.test(value) && /(?:control|observer)-sample-error/iu.test(type)) return "dom-evaluate-timeout"; + if (/screenshot|captureScreenshot|page\.screenshot/iu.test(type + " " + value) && /timeout|timed\s*out|exceeded/iu.test(value)) return "screenshot-timeout"; + if (/pageerror|uncaught|unhandledrejection/iu.test(type) || /^(?:Error|TypeError|ReferenceError|RangeError|SyntaxError):/u.test(value)) return "page-error"; + return null; +} + +function firstBurst(events, thresholdCount, windowMs) { + const count = Math.max(1, Math.floor(Number(thresholdCount || 0))); + const budgetMs = Math.max(1, Number(windowMs || 0)); + const sorted = (events || []).filter((item) => Number.isFinite(item?.tsMs)).sort((a, b) => a.tsMs - b.tsMs); + if (sorted.length < count) return null; + for (let start = 0; start <= sorted.length - count; start += 1) { + const end = start + count - 1; + if (sorted[end].tsMs - sorted[start].tsMs <= budgetMs) return sorted.slice(start, end + 1); + } + return null; +} + +function frontendFreezeBurstFinding({ id, summary, burst, thresholdCount, windowMs, pageRole }) { + const first = burst[0]; + const last = burst[burst.length - 1]; + const pageIds = uniqueStrings(burst.map((item) => item.pageId)); + const routeSessionIds = uniqueStrings(burst.map((item) => item.routeSessionId)); + const activeSessionIds = uniqueStrings(burst.map((item) => item.activeSessionId)); + return { + id, + severity: "red", + summary, + count: burst.length, + thresholdCount, + windowMs, + firstAt: first?.ts ?? null, + lastAt: last?.ts ?? 
null, + pageRole, + pageIds, + routeSessionIds, + activeSessionIds, + timeoutMsMax: maxPresentNumber(burst.map((item) => item.timeoutMs)), + rootCause: "frontend_page_freeze_or_runtime_exception", + rootCauseStatus: "confirmed-from-browser-observer-errors", + rootCauseConfidence: "high", + fallbackAllowed: false, + observerRefreshMayNotClear: true, + nextAction: "Keep this run red; do not auto-refresh, fallback, or mark healthy until OTel/browser evidence explains why the page stopped responding.", + events: burst.map((item) => ({ + ts: item.ts, + promptIndex: item.promptIndex, + type: item.type, + pageRole: item.pageRole, + pageId: item.pageId, + routeSessionId: item.routeSessionId, + activeSessionId: item.activeSessionId, + commandId: item.commandId, + sampleSeq: item.sampleSeq, + timeoutMs: item.timeoutMs, + messageHash: item.messageHash, + preview: item.preview, + valuesRedacted: true, + })), + valuesRedacted: true, + }; +} + +function stopCommandWindows(control) { + return (control || []) + .filter((item) => /^(?:stop|forceStop|cancel|close)$/iu.test(String(item?.type || item?.command || ""))) + .map((item) => { + const tsMs = Date.parse(String(item?.ts || "")); + return Number.isFinite(tsMs) ? { fromMs: tsMs - 1000, toMs: tsMs + 10000 } : null; + }) + .filter(Boolean); +} + +function errorInsideStopWindow(event, windows) { + return (windows || []).some((window) => event.tsMs >= window.fromMs && event.tsMs <= window.toMs); +} + +function timeoutMsFromMessage(value) { + const match = String(value || "").match(/\b(?:exceeded|timeout|timed\s*out\s*after)\s+(\d{2,})\s*ms\b/iu) + || String(value || "").match(/\b(\d{2,})\s*ms\b/iu); + return match ? 
Number(match[1]) : null; +} + +function uniqueStrings(values) { + return Array.from(new Set((values || []).filter((item) => typeof item === "string" && item.length > 0))).slice(0, 12); +} + +function maxPresentNumber(values) { + const numbers = (values || []).filter((item) => item !== null && item !== undefined && Number.isFinite(Number(item))).map((item) => Number(item)); + return numbers.length > 0 ? Math.max(...numbers) : null; +} + function buildRecentAnalysisWindow({ samples, control, network, consoleEvents, errors, manifest }) { const latestSampleMs = latestTimestampMs(samples); const windowMs = 5 * 60 * 1000; @@ -3745,6 +3942,13 @@ function buildRuntimeAlerts(samples, control, network, consoleEvents, errors) { ts: item.ts ?? null, promptIndex: promptIndexForTs(promptTimes, item.ts), type: item.type ?? null, + pageRole: item.pageRole ?? item.error?.details?.pageRole ?? null, + pageId: item.pageId ?? item.error?.details?.pageId ?? null, + routeSessionId: item.routeSessionId ?? item.error?.details?.routeSessionId ?? null, + activeSessionId: item.activeSessionId ?? item.error?.details?.activeSessionId ?? null, + commandId: item.commandId ?? item.error?.details?.commandId ?? null, + sampleSeq: item.sampleSeq ?? item.error?.details?.sampleSeq ?? null, + timeoutMs: timeoutMsFromMessage(item.error?.message || item.message || item.error || ""), errorName: item.error?.name ?? item.name ?? null, messageHash: item.error?.message ? sha256(item.error.message) : item.message ? 
sha256(item.message) : null, preview: limitText(item.error?.message || item.message || item.error || "", 220) diff --git a/scripts/src/hwlab-node-web-observe-runner-source.ts b/scripts/src/hwlab-node-web-observe-runner-source.ts index 43d9eeb4..fa595f21 100644 --- a/scripts/src/hwlab-node-web-observe-runner-source.ts +++ b/scripts/src/hwlab-node-web-observe-runner-source.ts @@ -4709,6 +4709,10 @@ function parseAlertThresholds(value) { visibleLoadingSlowMs: requiredPositiveThreshold(raw, "visibleLoadingSlowMs"), turnTimingSampleSlackSeconds: requiredPositiveThreshold(raw, "turnTimingSampleSlackSeconds"), turnElapsedSevereTimeoutSeconds: requiredPositiveThreshold(raw, "turnElapsedSevereTimeoutSeconds"), + domEvaluateTimeoutRedCount: requiredPositiveThreshold(raw, "domEvaluateTimeoutRedCount"), + domEvaluateTimeoutRedWindowMs: requiredPositiveThreshold(raw, "domEvaluateTimeoutRedWindowMs"), + screenshotTimeoutRedCount: requiredPositiveThreshold(raw, "screenshotTimeoutRedCount"), + pageErrorRedCount: requiredPositiveThreshold(raw, "pageErrorRedCount"), uncommandedStateChangeCommandWindowMs: requiredPositiveThreshold(raw, "uncommandedStateChangeCommandWindowMs"), scrollJumpCommandWindowMs: requiredPositiveThreshold(raw, "scrollJumpCommandWindowMs"), scrollJumpFromY: requiredPositiveThreshold(raw, "scrollJumpFromY"),