From 9e3747d9ed13c02b6d13caebacac71a92d2d2170 Mon Sep 17 00:00:00 2001 From: Codex Date: Tue, 30 Jun 2026 06:44:10 +0000 Subject: [PATCH] fix: surface OTel and sentinel visibility gaps --- .../hwlab-node-web-observe-analyzer-source.ts | 51 ++++++++++++++++++- scripts/src/hwlab-node-web-observe-collect.ts | 30 +++++++++-- .../platform-infra-observability/actions.ts | 5 ++ .../apply-status-scripts.ts | 1 + .../diagnose-code-agent-script.ts | 22 +++++++- .../platform-infra-observability/render.ts | 14 +++++ 6 files changed, 116 insertions(+), 7 deletions(-) diff --git a/scripts/src/hwlab-node-web-observe-analyzer-source.ts b/scripts/src/hwlab-node-web-observe-analyzer-source.ts index 54bede8f..efd07443 100644 --- a/scripts/src/hwlab-node-web-observe-analyzer-source.ts +++ b/scripts/src/hwlab-node-web-observe-analyzer-source.ts @@ -2743,7 +2743,7 @@ function buildFindings(samples, control, network, errors, sampleMetrics, promptN const effectiveApiDomLag = apiDomLag || buildApiDomLagReport(samples, network); if (commandFailures.length > 0) findings.push({ id: "observer-command-failed", severity: "red", summary: "observer control commands failed; analyze must surface command failure instead of hiding it in command artifacts", count: commandFailures.length, commands: commandFailures.slice(0, 20) }); findings.push(...buildFrontendFreezeFindings(errors, control)); - findings.push(...buildBrowserProcessFindings(browserProcess)); + findings.push(...buildBrowserProcessFindings(browserProcess, runtimeAlerts)); findings.push(...buildControlledNavigationRootCauseFindings(control, manifest)); findings.push(...buildSessionInvariantFindings(control, manifest)); const commandTimes = control @@ -3220,10 +3220,11 @@ function buildBrowserProcessReport(rows) { }; } -function buildBrowserProcessFindings(report) { +function buildBrowserProcessFindings(report, runtimeAlerts = null) { const summary = report?.summary || {}; if (!summary || Number(summary.sampleCount ?? 0) <= 0) return []; const findings = []; + const rootCauseSignals = browserRootCauseSignals(report, runtimeAlerts); const maxTotalRssMb = Number(summary.maxTotalRssMb ?? 0); const maxProcessRssMb = Number(summary.maxProcessRssMb ?? 0); if (maxTotalRssMb >= alertThresholds.browserTotalRssRedMb || maxProcessRssMb >= alertThresholds.browserProcessRssRedMb) { @@ -3240,6 +3241,7 @@ function buildBrowserProcessFindings(report) { rootCause: "frontend_browser_process_memory_pressure", rootCauseStatus: "confirmed-from-runner-process-rss", rootCauseConfidence: "high", + rootCauseSignals, fallbackAllowed: false, valuesRedacted: true, }); @@ -3260,6 +3262,7 @@ function buildBrowserProcessFindings(report) { rootCause: "frontend_browser_process_memory_leak_or_unbounded_render_growth", rootCauseStatus: "confirmed-from-runner-process-rss-growth", rootCauseConfidence: "high", + rootCauseSignals, fallbackAllowed: false, valuesRedacted: true, }); @@ -3285,6 +3288,7 @@ function buildBrowserProcessFindings(report) { rootCause: "frontend_browser_page_unresponsive_to_playwright", rootCauseStatus: "confirmed-from-cdp-runtime-evaluate", rootCauseConfidence: "high", + rootCauseSignals, fallbackAllowed: false, valuesRedacted: true, }); @@ -3303,6 +3307,7 @@ function buildBrowserProcessFindings(report) { rootCause: "frontend_browser_cdp_metrics_unresponsive", rootCauseStatus: "confirmed-from-cdp-metrics-timeouts", rootCauseConfidence: "high", + rootCauseSignals, fallbackAllowed: false, valuesRedacted: true, }); @@ -3310,6 +3315,32 @@ function buildBrowserProcessFindings(report) { return findings; } +function browserRootCauseSignals(report, runtimeAlerts) { + const browserSummary = report?.summary || {}; + const runtimeSummary = runtimeAlerts?.summary || {}; + const sampleCount = Number(browserSummary.sampleCount ?? 0); + const sessionListReadCount = Number(runtimeSummary.workbenchSessionListReadCount ?? 0); + const traceEventsReadCount = Number(runtimeSummary.workbenchTraceEventsReadCount ?? 0); + const webPerformanceBeaconFailureCount = Number(runtimeSummary.webPerformanceBeaconFailureCount ?? 0); + const eventSourceFailureCount = Number(runtimeSummary.workbenchEventSourceFailureCount ?? 0); + const suspectedFrontendRefreshStorm = sessionListReadCount >= Math.max(20, sampleCount * 2); + return { + suspectedFrontendRefreshStorm, + sessionListReadCount, + traceEventsReadCount, + webPerformanceBeaconFailureCount, + eventSourceFailureCount, + requestFailedCount: runtimeSummary.significantRequestFailedCount ?? runtimeSummary.requestFailedCount ?? 0, + httpErrorCount: runtimeSummary.httpErrorCount ?? 0, + topRequestFailedPaths: (runtimeAlerts?.networkSignificantRequestFailedByPath ?? runtimeAlerts?.networkRequestFailedByPath ?? []).slice(0, 5), + topHttpErrorPaths: (runtimeAlerts?.networkHttpErrorsByPath ?? []).slice(0, 5), + note: suspectedFrontendRefreshStorm + ? "suspected frontend refresh storm: session list reads exceed the sample-derived budget during a browser red finding" + : "root-cause signals are included so memory/responsiveness/CDP red findings can be correlated without manual grep", + valuesRedacted: true, + }; +} + function computeBrowserProcessGrowth(samples, windowMs) { const budgetMs = Math.max(1000, Number(windowMs || 0)); const sorted = (samples || []).filter((item) => Number.isFinite(item.tsMs)).sort((a, b) => a.tsMs - b.tsMs); @@ -4154,6 +4185,18 @@ function buildRuntimeAlerts(samples, control, network, consoleEvents, errors) { const requestFailed = naturalNetwork .filter((item) => item?.type === "requestfailed") .map((item) => networkAlertEvent(item, promptTimes)); + const workbenchSessionListReadCount = naturalNetwork + .filter((item) => urlPath(item?.url) === "/v1/workbench/sessions") + .length; + const workbenchTraceEventsReadCount = naturalNetwork + .filter((item) => /^\/v1\/workbench\/traces\/[^/]+\/events$/u.test(urlPath(item?.url))) + .length; + const webPerformanceBeaconFailureCount = naturalNetwork + .filter((item) => urlPath(item?.url) === "/v1/web-performance" && (item?.type === "requestfailed" || Number(item?.status) >= 400)) + .length; + const workbenchEventSourceFailureCount = naturalNetwork + .filter((item) => urlPath(item?.url) === "/v1/workbench/events" && (item?.type === "requestfailed" || Number(item?.status) >= 400)) + .length; const significantRequestFailed = requestFailed.filter( (item) => !isBenignLongLivedStreamClosureAlert(item) && !isObserverRefreshClosureAlert(item, observerRefreshTimes), ); @@ -4284,6 +4327,10 @@ function buildRuntimeAlerts(samples, control, network, consoleEvents, errors) { httpErrorCount: httpErrors.length, requestFailedCount: requestFailed.length, significantRequestFailedCount: significantRequestFailed.length, + workbenchSessionListReadCount, + workbenchTraceEventsReadCount, + webPerformanceBeaconFailureCount, + workbenchEventSourceFailureCount, benignLongLivedStreamClosureCount: requestFailed.length - significantRequestFailed.length, domDiagnosticSampleCount: domDiagnostics.length, domDiagnosticGroupCount: groupDomDiagnostics(domDiagnostics).length, diff --git a/scripts/src/hwlab-node-web-observe-collect.ts b/scripts/src/hwlab-node-web-observe-collect.ts index 0211a64c..7d75a64a 100644 --- a/scripts/src/hwlab-node-web-observe-collect.ts +++ b/scripts/src/hwlab-node-web-observe-collect.ts @@ -166,6 +166,26 @@ function samplesForTrace(items,traceId,maxSeq=null){ function latestSampleForTrace(items,traceId,maxSeq=null){ return samplesForTrace(items,traceId,maxSeq).slice(-1)[0]||null; } +function sampleHasTerminalForTrace(sample,traceId){ + if(!sample) return false; + const status=statusFor([sample],traceId); + if(terminalFinalStatus(status)) return true; + const final=finalResponseFor([sample],traceId); + if(final&&!final.empty) return true; + return traceEntries([sample],traceId).some((entry)=>terminalText(entry.text)||terminalFinalStatus(entry.item?.status)); +} +function bestTraceFrameSample(traceId,maxSeq=null){ + if(!traceId) return null; + const matched=samples.filter((sample)=>traceIdsFromSamples([sample]).includes(traceId)); + if(matched.length===0) return null; + const terminal=matched.filter((sample)=>sampleHasTerminalForTrace(sample,traceId)).slice(-1)[0]; + if(terminal) return terminal; + if(maxSeq!==null){ + const before=matched.filter((sample)=>{const seq=numOrNull(sample.seq); return seq===null||seq<=maxSeq;}).slice(-1)[0]; + if(before) return before; + } + return matched.slice(-1)[0]||null; +} function traceEntries(items,traceId){ const entries=[]; for(const sample of items){ @@ -345,9 +365,9 @@ function selectSample(rows){ if(requestedSampleSeq!==null){const exact=samples.find((sample)=>Number(sample.seq)===requestedSampleSeq); if(exact)return exact;} if(requestedTimestamp){const target=tsMs(requestedTimestamp); if(target!==null){const before=samples.filter((sample)=>{const ms=tsMs(sample.ts); return ms!==null&&ms<=target}).slice(-1)[0]; if(before)return before;}} const requestedRow=requestedTurn!==null?rows[requestedTurn-1]:null; - if(requestedRow?.traceId){const byTrace=latestSampleForTrace(samples,requestedRow.traceId,numOrNull(requestedRow.lastSeq)); if(byTrace)return byTrace;} + if(requestedRow?.traceId){const byTrace=bestTraceFrameSample(requestedRow.traceId,numOrNull(requestedRow.lastSeq)); if(byTrace)return byTrace;} if(requestedRow&&requestedRow.lastSeq!==null&&requestedRow.lastSeq!==undefined){const byTurn=samples.find((sample)=>Number(sample.seq)===Number(requestedRow.lastSeq)); if(byTurn)return byTurn;} - if(requestedTraceId){const byTrace=samples.filter((sample)=>traceIdsFromSamples([sample]).includes(requestedTraceId)).slice(-1)[0]; if(byTrace)return byTrace;} + if(requestedTraceId){const byTrace=bestTraceFrameSample(requestedTraceId); if(byTrace)return byTrace;} return samples[samples.length-1]||null; } function renderTraceFrame(sample,rows){ @@ -365,9 +385,11 @@ function renderTraceFrame(sample,rows){ const terminalProjectionVisible=status==='completed'&&finalResponse&&!finalResponse.empty&&turns.some((turn)=>String(turn?.status||'').toLowerCase()==='completed'||/completed|轮次完成|final response/iu.test(textOf(turn))); if(rowLines.length===0&&terminalProjectionVisible) rowLines.push('(trace rows collapsed; terminal turn and Final Response are visible)'); const missingRows=rowLines.length===0; + const laterTerminalSample=traceId?samples.filter((candidate)=>Number(candidate.seq??0)>Number(sample.seq??0)&&traceIdsFromSamples([candidate]).includes(traceId)&&sampleHasTerminalForTrace(candidate,traceId)).slice(-1)[0]||null:null; + const sampleNotice=laterTerminalSample?['TRACE SAMPLE NOTICE','current sample is non-terminal but a later terminal sample exists: seq='+(laterTerminalSample.seq??'-')+' ts='+(laterTerminalSample.ts||'-')+'; rerun with --sample-seq '+(laterTerminalSample.seq??'')+' or omit explicit sample selection.']:[]; const bodyRows=missingRows?['(无 trace rows;这是 blocker,不能当业务通过证据)',...traceFrameDiagnosticLines(sample,traceId,traceRows)]:rowLines; - const rendered=['Code Agent 耗时 '+(elapsed>=0?fmtDuration(elapsed):'-')+' 最近 '+(recent>=0?String(recent)+' 秒前':'-')+' ('+status+')','=======================================================','sample seq='+(sample.seq??'-')+' ts='+(sample.ts||'-')+' traceId='+(traceId||'-')+' routeSession='+(sample.routeSessionId||'-')+' activeSession='+(sample.activeSessionId||'-'),...bodyRows,'==========================','Final Response',finalResponse.preview||'(空内容)'].join('\\n'); - return {ok:!missingRows,renderedText:rendered,blocker:missingRows?'trace-rows-missing':null,sampleSeq:sample.seq??null,traceId,finalResponse,traceDiagnostic:missingRows?{pageRole:sample.pageRole||null,pageId:sample.pageId||null,traceRows:Array.isArray(sample.traceRows)?sample.traceRows.length:0,turns:Array.isArray(sample.turns)?sample.turns.length:0,messages:Array.isArray(sample.messages)?sample.messages.length:0,sampleTraceIds:traceIdsFromSamples([sample]).slice(0,12)}:null,valuesRedacted:true}; + const rendered=['Code Agent 耗时 '+(elapsed>=0?fmtDuration(elapsed):'-')+' 最近 '+(recent>=0?String(recent)+' 秒前':'-')+' ('+status+')','=======================================================','sample seq='+(sample.seq??'-')+' ts='+(sample.ts||'-')+' traceId='+(traceId||'-')+' routeSession='+(sample.routeSessionId||'-')+' activeSession='+(sample.activeSessionId||'-'),...sampleNotice,...bodyRows,'==========================','Final Response',finalResponse.preview||'(空内容)'].join('\\n'); + return {ok:!missingRows,renderedText:rendered,blocker:missingRows?'trace-rows-missing':null,sampleSeq:sample.seq??null,traceId,finalResponse,laterTerminalSample:laterTerminalSample?{seq:laterTerminalSample.seq??null,ts:laterTerminalSample.ts??null}:null,traceDiagnostic:missingRows?{pageRole:sample.pageRole||null,pageId:sample.pageId||null,traceRows:Array.isArray(sample.traceRows)?sample.traceRows.length:0,turns:Array.isArray(sample.turns)?sample.turns.length:0,messages:Array.isArray(sample.messages)?sample.messages.length:0,sampleTraceIds:traceIdsFromSamples([sample]).slice(0,12)}:null,valuesRedacted:true}; } function firstPresent(values){ for(const value of values){ diff --git a/scripts/src/platform-infra-observability/actions.ts b/scripts/src/platform-infra-observability/actions.ts index a580f763..1a78fbb4 100644 --- a/scripts/src/platform-infra-observability/actions.ts +++ b/scripts/src/platform-infra-observability/actions.ts @@ -120,6 +120,8 @@ export async function status(config: UniDeskConfig, options: CommonOptions): Pro return renderStatusTable({ ok: result.exitCode === 0 && summary?.ready === true, target, + backendEndpoint: `${observability.traceBackend.serviceName}.${target.namespace}.svc.cluster.local:${observability.traceBackend.httpPort}`, + collectorEndpoint: `${observability.collector.serviceName}.${target.namespace}.svc.cluster.local:${observability.collector.otlp.grpcPort}`, summary, remote: parsed === null ? compactCapture(result, { full: false }) : null, }); @@ -143,6 +145,8 @@ export async function status(config: UniDeskConfig, options: CommonOptions): Pro function renderStatusTable(input: { ok: boolean; target: ObservabilityTarget; + backendEndpoint: string; + collectorEndpoint: string; summary: Record | null; remote: Record | null; }): RenderedCliResult { @@ -167,6 +171,7 @@ function renderStatusTable(input: { `platform-infra observability status (${input.ok ? "ok" : "not-ok"})`, "", `target=${input.target.id} namespace=${input.target.namespace} ready=${textValue(input.summary?.ready)} route=${input.target.route}`, + `tempo=${input.backendEndpoint} collector=${input.collectorEndpoint} deployments=${deployments.length} pods=${pods.length} probes=${probes.length}`, "", "Deployments:", formatTable(["NAME", "READY", "AVAILABLE"], deployments.length > 0 ? deployments : [["-", "-", "-"]]), diff --git a/scripts/src/platform-infra-observability/apply-status-scripts.ts b/scripts/src/platform-infra-observability/apply-status-scripts.ts index 3f612cd9..70f54e49 100644 --- a/scripts/src/platform-infra-observability/apply-status-scripts.ts +++ b/scripts/src/platform-infra-observability/apply-status-scripts.ts @@ -87,6 +87,7 @@ export function compactDiagnoseCodeAgentResult(value: unknown): Record asPlainRecord(item) ?? {}); const http = asPlainRecord(input.result.http); const services = joinValues(input.result.services, 54); @@ -633,6 +635,14 @@ export function renderDiagnoseCodeAgentTable(input: { shortenEnd(textValue(candidate.summary ?? candidate.label), 80), ]); const httpRows = httpTableRows(http); + const expectedServices = asArray(observabilityGap?.expectedServices).map((item) => textValue(item)).filter((item) => item !== "-"); + const seenServices = new Set(asArray(observabilityGap?.seenServices).map((item) => textValue(item)).filter((item) => item !== "-")); + const missingServices = new Set(asArray(observabilityGap?.missingServices).map((item) => textValue(item)).filter((item) => item !== "-")); + const serviceRows = expectedServices.map((service) => [ + service, + textValue(servicePath?.[service] ?? (missingServices.has(service) ? "missing" : seenServices.has(service) ? "reached" : "-")), + missingServices.has(service) ? "missing" : seenServices.has(service) ? "seen" : "-", + ]); const queryClauses = asArray(input.query.queryClauses).map((item) => textValue(item)).filter((item) => item !== "-"); const requestedRunId = textValue(input.query.runId); const requestedCommandId = textValue(input.query.commandId); @@ -658,6 +668,10 @@ export function renderDiagnoseCodeAgentTable(input: { "Root causes:", formatTable(["CODE", "CONF", "SUMMARY"], rootRows.length > 0 ? rootRows : [["-", "-", "-"]]), "", + "Service trace coverage:", + formatTable(["SERVICE", "PATH", "SPAN"], serviceRows.length > 0 ? serviceRows : [["-", "-", "-"]]), + ` observabilityGap=${textValue(observabilityGap?.status)} missing=${joinValues(observabilityGap?.missingServices, 60)} seen=${joinValues(observabilityGap?.seenServices, 80)}`, + "", "HTTP:", formatTable(["METHOD", "ROUTE", "STATUS", "COUNT"], httpRows.length > 0 ? httpRows : [["-", "-", "-", "-"]]), "",