Merge pull request #1291 from pikasTech/fix/1287-otel-sentinel-visibility

fix: surface OTel and sentinel visibility gaps
This commit is contained in:
Lyon
2026-06-30 14:45:35 +08:00
committed by GitHub
6 changed files with 116 additions and 7 deletions
@@ -2743,7 +2743,7 @@ function buildFindings(samples, control, network, errors, sampleMetrics, promptN
const effectiveApiDomLag = apiDomLag || buildApiDomLagReport(samples, network);
if (commandFailures.length > 0) findings.push({ id: "observer-command-failed", severity: "red", summary: "observer control commands failed; analyze must surface command failure instead of hiding it in command artifacts", count: commandFailures.length, commands: commandFailures.slice(0, 20) });
findings.push(...buildFrontendFreezeFindings(errors, control));
findings.push(...buildBrowserProcessFindings(browserProcess));
findings.push(...buildBrowserProcessFindings(browserProcess, runtimeAlerts));
findings.push(...buildControlledNavigationRootCauseFindings(control, manifest));
findings.push(...buildSessionInvariantFindings(control, manifest));
const commandTimes = control
@@ -3220,10 +3220,11 @@ function buildBrowserProcessReport(rows) {
};
}
function buildBrowserProcessFindings(report) {
function buildBrowserProcessFindings(report, runtimeAlerts = null) {
const summary = report?.summary || {};
if (!summary || Number(summary.sampleCount ?? 0) <= 0) return [];
const findings = [];
const rootCauseSignals = browserRootCauseSignals(report, runtimeAlerts);
const maxTotalRssMb = Number(summary.maxTotalRssMb ?? 0);
const maxProcessRssMb = Number(summary.maxProcessRssMb ?? 0);
if (maxTotalRssMb >= alertThresholds.browserTotalRssRedMb || maxProcessRssMb >= alertThresholds.browserProcessRssRedMb) {
@@ -3240,6 +3241,7 @@ function buildBrowserProcessFindings(report) {
rootCause: "frontend_browser_process_memory_pressure",
rootCauseStatus: "confirmed-from-runner-process-rss",
rootCauseConfidence: "high",
rootCauseSignals,
fallbackAllowed: false,
valuesRedacted: true,
});
@@ -3260,6 +3262,7 @@ function buildBrowserProcessFindings(report) {
rootCause: "frontend_browser_process_memory_leak_or_unbounded_render_growth",
rootCauseStatus: "confirmed-from-runner-process-rss-growth",
rootCauseConfidence: "high",
rootCauseSignals,
fallbackAllowed: false,
valuesRedacted: true,
});
@@ -3285,6 +3288,7 @@ function buildBrowserProcessFindings(report) {
rootCause: "frontend_browser_page_unresponsive_to_playwright",
rootCauseStatus: "confirmed-from-cdp-runtime-evaluate",
rootCauseConfidence: "high",
rootCauseSignals,
fallbackAllowed: false,
valuesRedacted: true,
});
@@ -3303,6 +3307,7 @@ function buildBrowserProcessFindings(report) {
rootCause: "frontend_browser_cdp_metrics_unresponsive",
rootCauseStatus: "confirmed-from-cdp-metrics-timeouts",
rootCauseConfidence: "high",
rootCauseSignals,
fallbackAllowed: false,
valuesRedacted: true,
});
@@ -3310,6 +3315,32 @@ function buildBrowserProcessFindings(report) {
return findings;
}
/**
 * Builds the correlation signals attached to browser red findings.
 *
 * Reads `report.summary.sampleCount` plus several counters from
 * `runtimeAlerts.summary`, and the per-path failure lists on `runtimeAlerts`,
 * and flags a suspected frontend refresh storm when the session-list read
 * count reaches `max(20, sampleCount * 2)`.
 *
 * @param {object|null|undefined} report - Browser process report; only `summary.sampleCount` is read.
 * @param {object|null|undefined} runtimeAlerts - Runtime alerts report; `summary` counters and the
 *   `networkSignificantRequestFailedByPath` / `networkRequestFailedByPath` / `networkHttpErrorsByPath` lists are read.
 * @returns {object} Signal record with numeric counters, up to five entries per path list, a note, and `valuesRedacted: true`.
 */
function browserRootCauseSignals(report, runtimeAlerts) {
  // Missing or non-numeric counters normalize to 0. Without this a NaN sample
  // count turns the threshold into NaN and the storm check can never fire.
  const toCount = (value) => {
    const parsed = Number(value ?? 0);
    return Number.isFinite(parsed) ? parsed : 0;
  };
  // Path lists are sliced only when they really are arrays; any other shape
  // yields an empty list instead of throwing while findings are being built.
  const topFive = (value) => (Array.isArray(value) ? value.slice(0, 5) : []);

  const browserSummary = report?.summary || {};
  const runtimeSummary = runtimeAlerts?.summary || {};

  const sampleCount = toCount(browserSummary.sampleCount);
  const sessionListReadCount = toCount(runtimeSummary.workbenchSessionListReadCount);
  const traceEventsReadCount = toCount(runtimeSummary.workbenchTraceEventsReadCount);
  const webPerformanceBeaconFailureCount = toCount(runtimeSummary.webPerformanceBeaconFailureCount);
  const eventSourceFailureCount = toCount(runtimeSummary.workbenchEventSourceFailureCount);

  // Budget: two session-list reads per browser sample, never less than 20.
  const sessionListReadBudget = Math.max(20, sampleCount * 2);
  const suspectedFrontendRefreshStorm = sessionListReadCount >= sessionListReadBudget;

  return {
    suspectedFrontendRefreshStorm,
    sessionListReadCount,
    traceEventsReadCount,
    webPerformanceBeaconFailureCount,
    eventSourceFailureCount,
    // Prefer the filtered "significant" count; fall back to the raw count.
    requestFailedCount: toCount(runtimeSummary.significantRequestFailedCount ?? runtimeSummary.requestFailedCount),
    httpErrorCount: toCount(runtimeSummary.httpErrorCount),
    topRequestFailedPaths: topFive(
      runtimeAlerts?.networkSignificantRequestFailedByPath ?? runtimeAlerts?.networkRequestFailedByPath,
    ),
    topHttpErrorPaths: topFive(runtimeAlerts?.networkHttpErrorsByPath),
    note: suspectedFrontendRefreshStorm
      ? "suspected frontend refresh storm: session list reads exceed the sample-derived budget during a browser red finding"
      : "root-cause signals are included so memory/responsiveness/CDP red findings can be correlated without manual grep",
    valuesRedacted: true,
  };
}
function computeBrowserProcessGrowth(samples, windowMs) {
const budgetMs = Math.max(1000, Number(windowMs || 0));
const sorted = (samples || []).filter((item) => Number.isFinite(item.tsMs)).sort((a, b) => a.tsMs - b.tsMs);
@@ -4154,6 +4185,18 @@ function buildRuntimeAlerts(samples, control, network, consoleEvents, errors) {
const requestFailed = naturalNetwork
.filter((item) => item?.type === "requestfailed")
.map((item) => networkAlertEvent(item, promptTimes));
const workbenchSessionListReadCount = naturalNetwork
.filter((item) => urlPath(item?.url) === "/v1/workbench/sessions")
.length;
const workbenchTraceEventsReadCount = naturalNetwork
.filter((item) => /^\/v1\/workbench\/traces\/[^/]+\/events$/u.test(urlPath(item?.url)))
.length;
const webPerformanceBeaconFailureCount = naturalNetwork
.filter((item) => urlPath(item?.url) === "/v1/web-performance" && (item?.type === "requestfailed" || Number(item?.status) >= 400))
.length;
const workbenchEventSourceFailureCount = naturalNetwork
.filter((item) => urlPath(item?.url) === "/v1/workbench/events" && (item?.type === "requestfailed" || Number(item?.status) >= 400))
.length;
const significantRequestFailed = requestFailed.filter(
(item) => !isBenignLongLivedStreamClosureAlert(item) && !isObserverRefreshClosureAlert(item, observerRefreshTimes),
);
@@ -4284,6 +4327,10 @@ function buildRuntimeAlerts(samples, control, network, consoleEvents, errors) {
httpErrorCount: httpErrors.length,
requestFailedCount: requestFailed.length,
significantRequestFailedCount: significantRequestFailed.length,
workbenchSessionListReadCount,
workbenchTraceEventsReadCount,
webPerformanceBeaconFailureCount,
workbenchEventSourceFailureCount,
benignLongLivedStreamClosureCount: requestFailed.length - significantRequestFailed.length,
domDiagnosticSampleCount: domDiagnostics.length,
domDiagnosticGroupCount: groupDomDiagnostics(domDiagnostics).length,
+26 -4
View File
@@ -166,6 +166,26 @@ function samplesForTrace(items,traceId,maxSeq=null){
function latestSampleForTrace(items,traceId,maxSeq=null){
  /* Last entry of samplesForTrace(items,traceId,maxSeq); null when the list is empty or its last entry is falsy. */
  const matches=samplesForTrace(items,traceId,maxSeq).slice(-1);
  const newest=matches[0];
  if(newest) return newest;
  return null;
}
function sampleHasTerminalForTrace(sample,traceId){
  /*
   * Reports whether a single sample already shows a terminal state for traceId.
   * Checks are tried in order and the first hit wins:
   *   1. the status derived from the sample is terminal,
   *   2. the sample carries a final response that is not marked empty,
   *   3. any trace entry has terminal text or a terminal item status.
   * A missing sample is never terminal.
   */
  if(!sample){
    return false;
  }
  if(terminalFinalStatus(statusFor([sample],traceId))){
    return true;
  }
  const response=finalResponseFor([sample],traceId);
  const hasResponseBody=response&&!response.empty;
  if(hasResponseBody){
    return true;
  }
  const isTerminalEntry=(entry)=>terminalText(entry.text)||terminalFinalStatus(entry.item?.status);
  const entries=traceEntries([sample],traceId);
  return entries.some((entry)=>isTerminalEntry(entry));
}
function bestTraceFrameSample(traceId,maxSeq=null){
  /*
   * Picks the sample to render for traceId from the module-level samples list.
   * Preference order:
   *   1. the last sample that shows a terminal state for the trace (maxSeq is not applied here),
   *   2. when maxSeq is given, the last matching sample whose seq is unknown or <= maxSeq,
   *   3. the last matching sample.
   * Returns null when traceId is falsy or no sample mentions the trace.
   */
  if(!traceId){
    return null;
  }
  const candidates=samples.filter((candidate)=>traceIdsFromSamples([candidate]).includes(traceId));
  if(candidates.length===0){
    return null;
  }
  const terminalCandidates=candidates.filter((candidate)=>sampleHasTerminalForTrace(candidate,traceId));
  const lastTerminal=terminalCandidates[terminalCandidates.length-1];
  if(lastTerminal){
    return lastTerminal;
  }
  if(maxSeq!==null){
    const withinBound=candidates.filter((candidate)=>{
      const seq=numOrNull(candidate.seq);
      if(seq===null) return true;
      return seq<=maxSeq;
    });
    const lastWithinBound=withinBound[withinBound.length-1];
    if(lastWithinBound){
      return lastWithinBound;
    }
  }
  const lastCandidate=candidates[candidates.length-1];
  return lastCandidate||null;
}
function traceEntries(items,traceId){
const entries=[];
for(const sample of items){
@@ -345,9 +365,9 @@ function selectSample(rows){
if(requestedSampleSeq!==null){const exact=samples.find((sample)=>Number(sample.seq)===requestedSampleSeq); if(exact)return exact;}
if(requestedTimestamp){const target=tsMs(requestedTimestamp); if(target!==null){const before=samples.filter((sample)=>{const ms=tsMs(sample.ts); return ms!==null&&ms<=target}).slice(-1)[0]; if(before)return before;}}
const requestedRow=requestedTurn!==null?rows[requestedTurn-1]:null;
if(requestedRow?.traceId){const byTrace=latestSampleForTrace(samples,requestedRow.traceId,numOrNull(requestedRow.lastSeq)); if(byTrace)return byTrace;}
if(requestedRow?.traceId){const byTrace=bestTraceFrameSample(requestedRow.traceId,numOrNull(requestedRow.lastSeq)); if(byTrace)return byTrace;}
if(requestedRow&&requestedRow.lastSeq!==null&&requestedRow.lastSeq!==undefined){const byTurn=samples.find((sample)=>Number(sample.seq)===Number(requestedRow.lastSeq)); if(byTurn)return byTurn;}
if(requestedTraceId){const byTrace=samples.filter((sample)=>traceIdsFromSamples([sample]).includes(requestedTraceId)).slice(-1)[0]; if(byTrace)return byTrace;}
if(requestedTraceId){const byTrace=bestTraceFrameSample(requestedTraceId); if(byTrace)return byTrace;}
return samples[samples.length-1]||null;
}
function renderTraceFrame(sample,rows){
@@ -365,9 +385,11 @@ function renderTraceFrame(sample,rows){
const terminalProjectionVisible=status==='completed'&&finalResponse&&!finalResponse.empty&&turns.some((turn)=>String(turn?.status||'').toLowerCase()==='completed'||/completed|轮次完成|final response/iu.test(textOf(turn)));
if(rowLines.length===0&&terminalProjectionVisible) rowLines.push('(trace rows collapsed; terminal turn and Final Response are visible)');
const missingRows=rowLines.length===0;
const laterTerminalSample=traceId?samples.filter((candidate)=>Number(candidate.seq??0)>Number(sample.seq??0)&&traceIdsFromSamples([candidate]).includes(traceId)&&sampleHasTerminalForTrace(candidate,traceId)).slice(-1)[0]||null:null;
const sampleNotice=laterTerminalSample?['TRACE SAMPLE NOTICE','current sample is non-terminal but a later terminal sample exists: seq='+(laterTerminalSample.seq??'-')+' ts='+(laterTerminalSample.ts||'-')+'; rerun with --sample-seq '+(laterTerminalSample.seq??'<seq>')+' or omit explicit sample selection.']:[];
const bodyRows=missingRows?['(无 trace rows;这是 blocker,不能当业务通过证据)',...traceFrameDiagnosticLines(sample,traceId,traceRows)]:rowLines;
const rendered=['Code Agent 耗时 '+(elapsed>=0?fmtDuration(elapsed):'-')+' 最近 '+(recent>=0?String(recent)+' 秒前':'-')+' '+status+'','=======================================================','sample seq='+(sample.seq??'-')+' ts='+(sample.ts||'-')+' traceId='+(traceId||'-')+' routeSession='+(sample.routeSessionId||'-')+' activeSession='+(sample.activeSessionId||'-'),...bodyRows,'==========================','Final Response',finalResponse.preview||'(空内容)'].join('\\n');
return {ok:!missingRows,renderedText:rendered,blocker:missingRows?'trace-rows-missing':null,sampleSeq:sample.seq??null,traceId,finalResponse,traceDiagnostic:missingRows?{pageRole:sample.pageRole||null,pageId:sample.pageId||null,traceRows:Array.isArray(sample.traceRows)?sample.traceRows.length:0,turns:Array.isArray(sample.turns)?sample.turns.length:0,messages:Array.isArray(sample.messages)?sample.messages.length:0,sampleTraceIds:traceIdsFromSamples([sample]).slice(0,12)}:null,valuesRedacted:true};
const rendered=['Code Agent 耗时 '+(elapsed>=0?fmtDuration(elapsed):'-')+' 最近 '+(recent>=0?String(recent)+' 秒前':'-')+' '+status+'','=======================================================','sample seq='+(sample.seq??'-')+' ts='+(sample.ts||'-')+' traceId='+(traceId||'-')+' routeSession='+(sample.routeSessionId||'-')+' activeSession='+(sample.activeSessionId||'-'),...sampleNotice,...bodyRows,'==========================','Final Response',finalResponse.preview||'(空内容)'].join('\\n');
return {ok:!missingRows,renderedText:rendered,blocker:missingRows?'trace-rows-missing':null,sampleSeq:sample.seq??null,traceId,finalResponse,laterTerminalSample:laterTerminalSample?{seq:laterTerminalSample.seq??null,ts:laterTerminalSample.ts??null}:null,traceDiagnostic:missingRows?{pageRole:sample.pageRole||null,pageId:sample.pageId||null,traceRows:Array.isArray(sample.traceRows)?sample.traceRows.length:0,turns:Array.isArray(sample.turns)?sample.turns.length:0,messages:Array.isArray(sample.messages)?sample.messages.length:0,sampleTraceIds:traceIdsFromSamples([sample]).slice(0,12)}:null,valuesRedacted:true};
}
function firstPresent(values){
for(const value of values){
@@ -120,6 +120,8 @@ export async function status(config: UniDeskConfig, options: CommonOptions): Pro
return renderStatusTable({
ok: result.exitCode === 0 && summary?.ready === true,
target,
backendEndpoint: `${observability.traceBackend.serviceName}.${target.namespace}.svc.cluster.local:${observability.traceBackend.httpPort}`,
collectorEndpoint: `${observability.collector.serviceName}.${target.namespace}.svc.cluster.local:${observability.collector.otlp.grpcPort}`,
summary,
remote: parsed === null ? compactCapture(result, { full: false }) : null,
});
@@ -143,6 +145,8 @@ export async function status(config: UniDeskConfig, options: CommonOptions): Pro
function renderStatusTable(input: {
ok: boolean;
target: ObservabilityTarget;
backendEndpoint: string;
collectorEndpoint: string;
summary: Record<string, unknown> | null;
remote: Record<string, unknown> | null;
}): RenderedCliResult {
@@ -167,6 +171,7 @@ function renderStatusTable(input: {
`platform-infra observability status (${input.ok ? "ok" : "not-ok"})`,
"",
`target=${input.target.id} namespace=${input.target.namespace} ready=${textValue(input.summary?.ready)} route=${input.target.route}`,
`tempo=${input.backendEndpoint} collector=${input.collectorEndpoint} deployments=${deployments.length} pods=${pods.length} probes=${probes.length}`,
"",
"Deployments:",
formatTable(["NAME", "READY", "AVAILABLE"], deployments.length > 0 ? deployments : [["-", "-", "-"]]),
@@ -87,6 +87,7 @@ export function compactDiagnoseCodeAgentResult(value: unknown): Record<string, u
spanCount: source.spanCount ?? null,
services: source.services ?? null,
servicePath: source.servicePath ?? null,
observabilityGap: source.observabilityGap ?? null,
businessTraceIds: source.businessTraceIds ?? null,
identity: compactDiagnoseIdentity(source.identity),
agentrun: compactDiagnoseAgentRun(source.agentrun),
@@ -1986,8 +1986,26 @@ service_path = {
service: ("reached" if service in services else "missing")
for service in expected_services
}
service_path["complete"] = all(service in services for service in expected_services)
missing_services = [service for service in expected_services if service not in services]
service_path["complete"] = len(missing_services) == 0
observability_gap = {
"status": "complete" if len(missing_services) == 0 else "missing-service-spans",
"expectedServices": expected_services,
"seenServices": sorted(services),
"missingServices": missing_services,
"complete": len(missing_services) == 0,
}
if missing_services and ("hwlab-cloud-api" in services or identity.get("runId") not in (None, "") or identity.get("commandId") not in (None, "")):
candidates.insert(0, {
"code": "observability_gap_missing_service_spans",
"label": "observability gap",
"confidence": 0.76,
"summary": "The business trace is correlated to Code Agent context but is missing expected service spans; do not interpret the missing manager/runner spans as proof that those services were not involved.",
"evidence": observability_gap,
})
facts = []
if missing_services:
facts.append("observability gap: missing service spans " + ",".join(missing_services))
if http_summary.get("actorForbidden"):
facts.append("actor forbidden")
terminal_status = agentrun.get("terminalStatus")
@@ -2014,6 +2032,7 @@ summary = {
"actorForbidden": http_summary.get("actorForbidden"),
"terminalStatus": terminal_status,
"failureKind": agentrun.get("failureKind"),
"observabilityGap": observability_gap.get("status"),
},
}
evidence = {
@@ -2041,6 +2060,7 @@ payload = {
"spanCount": len(spans),
"services": services,
"servicePath": service_path,
"observabilityGap": observability_gap,
"businessTraceIds": business_trace_ids[:20],
"identity": identity,
"agentrun": {
@@ -613,6 +613,8 @@ export function renderDiagnoseCodeAgentTable(input: {
const projectionLag = asPlainRecord(input.result.projectionLag);
const summary = asPlainRecord(input.result.summary);
const evidence = asPlainRecord(input.result.evidence);
const observabilityGap = asPlainRecord(input.result.observabilityGap);
const servicePath = asPlainRecord(input.result.servicePath);
const rootCauses = asArray(input.result.rootCauseCandidates).map((item) => asPlainRecord(item) ?? {});
const http = asPlainRecord(input.result.http);
const services = joinValues(input.result.services, 54);
@@ -633,6 +635,14 @@ export function renderDiagnoseCodeAgentTable(input: {
shortenEnd(textValue(candidate.summary ?? candidate.label), 80),
]);
const httpRows = httpTableRows(http);
const expectedServices = asArray(observabilityGap?.expectedServices).map((item) => textValue(item)).filter((item) => item !== "-");
const seenServices = new Set(asArray(observabilityGap?.seenServices).map((item) => textValue(item)).filter((item) => item !== "-"));
const missingServices = new Set(asArray(observabilityGap?.missingServices).map((item) => textValue(item)).filter((item) => item !== "-"));
const serviceRows = expectedServices.map((service) => [
service,
textValue(servicePath?.[service] ?? (missingServices.has(service) ? "missing" : seenServices.has(service) ? "reached" : "-")),
missingServices.has(service) ? "missing" : seenServices.has(service) ? "seen" : "-",
]);
const queryClauses = asArray(input.query.queryClauses).map((item) => textValue(item)).filter((item) => item !== "-");
const requestedRunId = textValue(input.query.runId);
const requestedCommandId = textValue(input.query.commandId);
@@ -658,6 +668,10 @@ export function renderDiagnoseCodeAgentTable(input: {
"Root causes:",
formatTable(["CODE", "CONF", "SUMMARY"], rootRows.length > 0 ? rootRows : [["-", "-", "-"]]),
"",
"Service trace coverage:",
formatTable(["SERVICE", "PATH", "SPAN"], serviceRows.length > 0 ? serviceRows : [["-", "-", "-"]]),
` observabilityGap=${textValue(observabilityGap?.status)} missing=${joinValues(observabilityGap?.missingServices, 60)} seen=${joinValues(observabilityGap?.seenServices, 80)}`,
"",
"HTTP:",
formatTable(["METHOD", "ROUTE", "STATUS", "COUNT"], httpRows.length > 0 ? httpRows : [["-", "-", "-", "-"]]),
"",