Merge pull request #1291 from pikasTech/fix/1287-otel-sentinel-visibility
fix: surface OTel and sentinel visibility gaps
This commit is contained in:
@@ -2743,7 +2743,7 @@ function buildFindings(samples, control, network, errors, sampleMetrics, promptN
|
||||
const effectiveApiDomLag = apiDomLag || buildApiDomLagReport(samples, network);
|
||||
if (commandFailures.length > 0) findings.push({ id: "observer-command-failed", severity: "red", summary: "observer control commands failed; analyze must surface command failure instead of hiding it in command artifacts", count: commandFailures.length, commands: commandFailures.slice(0, 20) });
|
||||
findings.push(...buildFrontendFreezeFindings(errors, control));
|
||||
findings.push(...buildBrowserProcessFindings(browserProcess));
|
||||
findings.push(...buildBrowserProcessFindings(browserProcess, runtimeAlerts));
|
||||
findings.push(...buildControlledNavigationRootCauseFindings(control, manifest));
|
||||
findings.push(...buildSessionInvariantFindings(control, manifest));
|
||||
const commandTimes = control
|
||||
@@ -3220,10 +3220,11 @@ function buildBrowserProcessReport(rows) {
|
||||
};
|
||||
}
|
||||
|
||||
function buildBrowserProcessFindings(report) {
|
||||
function buildBrowserProcessFindings(report, runtimeAlerts = null) {
|
||||
const summary = report?.summary || {};
|
||||
if (!summary || Number(summary.sampleCount ?? 0) <= 0) return [];
|
||||
const findings = [];
|
||||
const rootCauseSignals = browserRootCauseSignals(report, runtimeAlerts);
|
||||
const maxTotalRssMb = Number(summary.maxTotalRssMb ?? 0);
|
||||
const maxProcessRssMb = Number(summary.maxProcessRssMb ?? 0);
|
||||
if (maxTotalRssMb >= alertThresholds.browserTotalRssRedMb || maxProcessRssMb >= alertThresholds.browserProcessRssRedMb) {
|
||||
@@ -3240,6 +3241,7 @@ function buildBrowserProcessFindings(report) {
|
||||
rootCause: "frontend_browser_process_memory_pressure",
|
||||
rootCauseStatus: "confirmed-from-runner-process-rss",
|
||||
rootCauseConfidence: "high",
|
||||
rootCauseSignals,
|
||||
fallbackAllowed: false,
|
||||
valuesRedacted: true,
|
||||
});
|
||||
@@ -3260,6 +3262,7 @@ function buildBrowserProcessFindings(report) {
|
||||
rootCause: "frontend_browser_process_memory_leak_or_unbounded_render_growth",
|
||||
rootCauseStatus: "confirmed-from-runner-process-rss-growth",
|
||||
rootCauseConfidence: "high",
|
||||
rootCauseSignals,
|
||||
fallbackAllowed: false,
|
||||
valuesRedacted: true,
|
||||
});
|
||||
@@ -3285,6 +3288,7 @@ function buildBrowserProcessFindings(report) {
|
||||
rootCause: "frontend_browser_page_unresponsive_to_playwright",
|
||||
rootCauseStatus: "confirmed-from-cdp-runtime-evaluate",
|
||||
rootCauseConfidence: "high",
|
||||
rootCauseSignals,
|
||||
fallbackAllowed: false,
|
||||
valuesRedacted: true,
|
||||
});
|
||||
@@ -3303,6 +3307,7 @@ function buildBrowserProcessFindings(report) {
|
||||
rootCause: "frontend_browser_cdp_metrics_unresponsive",
|
||||
rootCauseStatus: "confirmed-from-cdp-metrics-timeouts",
|
||||
rootCauseConfidence: "high",
|
||||
rootCauseSignals,
|
||||
fallbackAllowed: false,
|
||||
valuesRedacted: true,
|
||||
});
|
||||
@@ -3310,6 +3315,32 @@ function buildBrowserProcessFindings(report) {
|
||||
return findings;
|
||||
}
|
||||
|
||||
/**
 * Builds the correlation signals attached to each browser-process red finding.
 *
 * Reads counters from `report.summary` and `runtimeAlerts.summary` and the
 * per-path failure lists from `runtimeAlerts`, then flags a suspected frontend
 * refresh storm when session-list reads reach `max(20, sampleCount * 2)`.
 *
 * @param {object|null|undefined} report - Browser process report; only `summary.sampleCount` is read.
 * @param {object|null|undefined} runtimeAlerts - Runtime alerts report; may be null when unavailable.
 * @returns {object} Signal record with numeric counters, at most five entries per path list, and a note.
 */
function browserRootCauseSignals(report, runtimeAlerts) {
  const browserSummary = report?.summary || {};
  const runtimeSummary = runtimeAlerts?.summary || {};

  // Missing or non-numeric counters collapse to 0; otherwise a NaN sample count
  // would turn the budget into NaN and the storm comparison could never be true.
  const toCount = (value) => {
    const parsed = Number(value ?? 0);
    return Number.isFinite(parsed) ? parsed : 0;
  };

  // Picks the first non-nullish candidate (same precedence as a `??` chain) and
  // only slices it when it really is an array, so a malformed field cannot throw.
  const topPaths = (...candidates) => {
    const rows = candidates.find((candidate) => candidate !== null && candidate !== undefined);
    return Array.isArray(rows) ? rows.slice(0, 5) : [];
  };

  const sampleCount = toCount(browserSummary.sampleCount);
  const sessionListReadCount = toCount(runtimeSummary.workbenchSessionListReadCount);
  const traceEventsReadCount = toCount(runtimeSummary.workbenchTraceEventsReadCount);
  const webPerformanceBeaconFailureCount = toCount(runtimeSummary.webPerformanceBeaconFailureCount);
  const eventSourceFailureCount = toCount(runtimeSummary.workbenchEventSourceFailureCount);
  const suspectedFrontendRefreshStorm = sessionListReadCount >= Math.max(20, sampleCount * 2);

  return {
    suspectedFrontendRefreshStorm,
    sessionListReadCount,
    traceEventsReadCount,
    webPerformanceBeaconFailureCount,
    eventSourceFailureCount,
    // Prefer the filtered "significant" count; fall back to the raw count when it is absent.
    requestFailedCount: toCount(runtimeSummary.significantRequestFailedCount ?? runtimeSummary.requestFailedCount),
    httpErrorCount: toCount(runtimeSummary.httpErrorCount),
    topRequestFailedPaths: topPaths(runtimeAlerts?.networkSignificantRequestFailedByPath, runtimeAlerts?.networkRequestFailedByPath),
    topHttpErrorPaths: topPaths(runtimeAlerts?.networkHttpErrorsByPath),
    note: suspectedFrontendRefreshStorm
      ? "suspected frontend refresh storm: session list reads exceed the sample-derived budget during a browser red finding"
      : "root-cause signals are included so memory/responsiveness/CDP red findings can be correlated without manual grep",
    valuesRedacted: true,
  };
}
|
||||
|
||||
function computeBrowserProcessGrowth(samples, windowMs) {
|
||||
const budgetMs = Math.max(1000, Number(windowMs || 0));
|
||||
const sorted = (samples || []).filter((item) => Number.isFinite(item.tsMs)).sort((a, b) => a.tsMs - b.tsMs);
|
||||
@@ -4154,6 +4185,18 @@ function buildRuntimeAlerts(samples, control, network, consoleEvents, errors) {
|
||||
const requestFailed = naturalNetwork
|
||||
.filter((item) => item?.type === "requestfailed")
|
||||
.map((item) => networkAlertEvent(item, promptTimes));
|
||||
const workbenchSessionListReadCount = naturalNetwork
|
||||
.filter((item) => urlPath(item?.url) === "/v1/workbench/sessions")
|
||||
.length;
|
||||
const workbenchTraceEventsReadCount = naturalNetwork
|
||||
.filter((item) => /^\/v1\/workbench\/traces\/[^/]+\/events$/u.test(urlPath(item?.url)))
|
||||
.length;
|
||||
const webPerformanceBeaconFailureCount = naturalNetwork
|
||||
.filter((item) => urlPath(item?.url) === "/v1/web-performance" && (item?.type === "requestfailed" || Number(item?.status) >= 400))
|
||||
.length;
|
||||
const workbenchEventSourceFailureCount = naturalNetwork
|
||||
.filter((item) => urlPath(item?.url) === "/v1/workbench/events" && (item?.type === "requestfailed" || Number(item?.status) >= 400))
|
||||
.length;
|
||||
const significantRequestFailed = requestFailed.filter(
|
||||
(item) => !isBenignLongLivedStreamClosureAlert(item) && !isObserverRefreshClosureAlert(item, observerRefreshTimes),
|
||||
);
|
||||
@@ -4284,6 +4327,10 @@ function buildRuntimeAlerts(samples, control, network, consoleEvents, errors) {
|
||||
httpErrorCount: httpErrors.length,
|
||||
requestFailedCount: requestFailed.length,
|
||||
significantRequestFailedCount: significantRequestFailed.length,
|
||||
workbenchSessionListReadCount,
|
||||
workbenchTraceEventsReadCount,
|
||||
webPerformanceBeaconFailureCount,
|
||||
workbenchEventSourceFailureCount,
|
||||
benignLongLivedStreamClosureCount: requestFailed.length - significantRequestFailed.length,
|
||||
domDiagnosticSampleCount: domDiagnostics.length,
|
||||
domDiagnosticGroupCount: groupDomDiagnostics(domDiagnostics).length,
|
||||
|
||||
@@ -166,6 +166,26 @@ function samplesForTrace(items,traceId,maxSeq=null){
|
||||
/**
 * Returns the last sample that samplesForTrace yields for the given trace,
 * or null when nothing is yielded.
 */
function latestSampleForTrace(items,traceId,maxSeq=null){
  const scoped=samplesForTrace(items,traceId,maxSeq);
  const newest=scoped[scoped.length-1];
  return newest||null;
}
|
||||
/**
 * Reports whether a single sample already shows a terminal state for traceId.
 * Checks, in order: the status derived from the sample, a non-empty final
 * response, and finally any trace entry whose text or item status is terminal.
 */
function sampleHasTerminalForTrace(sample,traceId){
  if(!sample){
    return false;
  }
  /* 1) terminal status computed from this sample alone */
  if(terminalFinalStatus(statusFor([sample],traceId))){
    return true;
  }
  /* 2) a final response that is present and not flagged empty */
  const response=finalResponseFor([sample],traceId);
  if(response&&!response.empty){
    return true;
  }
  /* 3) any individual trace entry that looks terminal */
  const isTerminalEntry=(entry)=>terminalText(entry.text)||terminalFinalStatus(entry.item?.status);
  return traceEntries([sample],traceId).some(isTerminalEntry);
}
|
||||
/**
 * Chooses the sample to render for a trace from the enclosing samples list.
 * Preference order: the last sample that is terminal for the trace; else, when
 * maxSeq is given, the last sample whose seq is unknown or not above maxSeq;
 * else the last sample mentioning the trace. Returns null when traceId is
 * falsy or no sample mentions it.
 */
function bestTraceFrameSample(traceId,maxSeq=null){
  if(!traceId) return null;
  const lastOf=(list)=>list[list.length-1];
  const candidates=samples.filter((sample)=>traceIdsFromSamples([sample]).includes(traceId));
  if(candidates.length===0) return null;

  const terminalFrame=lastOf(candidates.filter((sample)=>sampleHasTerminalForTrace(sample,traceId)));
  if(terminalFrame) return terminalFrame;

  if(maxSeq!==null){
    const withinLimit=(sample)=>{
      const seq=numOrNull(sample.seq);
      return seq===null||seq<=maxSeq;
    };
    const bounded=lastOf(candidates.filter(withinLimit));
    if(bounded) return bounded;
  }

  return lastOf(candidates)||null;
}
|
||||
function traceEntries(items,traceId){
|
||||
const entries=[];
|
||||
for(const sample of items){
|
||||
@@ -345,9 +365,9 @@ function selectSample(rows){
|
||||
if(requestedSampleSeq!==null){const exact=samples.find((sample)=>Number(sample.seq)===requestedSampleSeq); if(exact)return exact;}
|
||||
if(requestedTimestamp){const target=tsMs(requestedTimestamp); if(target!==null){const before=samples.filter((sample)=>{const ms=tsMs(sample.ts); return ms!==null&&ms<=target}).slice(-1)[0]; if(before)return before;}}
|
||||
const requestedRow=requestedTurn!==null?rows[requestedTurn-1]:null;
|
||||
if(requestedRow?.traceId){const byTrace=latestSampleForTrace(samples,requestedRow.traceId,numOrNull(requestedRow.lastSeq)); if(byTrace)return byTrace;}
|
||||
if(requestedRow?.traceId){const byTrace=bestTraceFrameSample(requestedRow.traceId,numOrNull(requestedRow.lastSeq)); if(byTrace)return byTrace;}
|
||||
if(requestedRow&&requestedRow.lastSeq!==null&&requestedRow.lastSeq!==undefined){const byTurn=samples.find((sample)=>Number(sample.seq)===Number(requestedRow.lastSeq)); if(byTurn)return byTurn;}
|
||||
if(requestedTraceId){const byTrace=samples.filter((sample)=>traceIdsFromSamples([sample]).includes(requestedTraceId)).slice(-1)[0]; if(byTrace)return byTrace;}
|
||||
if(requestedTraceId){const byTrace=bestTraceFrameSample(requestedTraceId); if(byTrace)return byTrace;}
|
||||
return samples[samples.length-1]||null;
|
||||
}
|
||||
function renderTraceFrame(sample,rows){
|
||||
@@ -365,9 +385,11 @@ function renderTraceFrame(sample,rows){
|
||||
const terminalProjectionVisible=status==='completed'&&finalResponse&&!finalResponse.empty&&turns.some((turn)=>String(turn?.status||'').toLowerCase()==='completed'||/completed|轮次完成|final response/iu.test(textOf(turn)));
|
||||
if(rowLines.length===0&&terminalProjectionVisible) rowLines.push('(trace rows collapsed; terminal turn and Final Response are visible)');
|
||||
const missingRows=rowLines.length===0;
|
||||
const laterTerminalSample=traceId?samples.filter((candidate)=>Number(candidate.seq??0)>Number(sample.seq??0)&&traceIdsFromSamples([candidate]).includes(traceId)&&sampleHasTerminalForTrace(candidate,traceId)).slice(-1)[0]||null:null;
|
||||
const sampleNotice=laterTerminalSample?['TRACE SAMPLE NOTICE','current sample is non-terminal but a later terminal sample exists: seq='+(laterTerminalSample.seq??'-')+' ts='+(laterTerminalSample.ts||'-')+'; rerun with --sample-seq '+(laterTerminalSample.seq??'<seq>')+' or omit explicit sample selection.']:[];
|
||||
const bodyRows=missingRows?['(无 trace rows;这是 blocker,不能当业务通过证据)',...traceFrameDiagnosticLines(sample,traceId,traceRows)]:rowLines;
|
||||
const rendered=['Code Agent 耗时 '+(elapsed>=0?fmtDuration(elapsed):'-')+' 最近 '+(recent>=0?String(recent)+' 秒前':'-')+' ('+status+')','=======================================================','sample seq='+(sample.seq??'-')+' ts='+(sample.ts||'-')+' traceId='+(traceId||'-')+' routeSession='+(sample.routeSessionId||'-')+' activeSession='+(sample.activeSessionId||'-'),...bodyRows,'==========================','Final Response',finalResponse.preview||'(空内容)'].join('\\n');
|
||||
return {ok:!missingRows,renderedText:rendered,blocker:missingRows?'trace-rows-missing':null,sampleSeq:sample.seq??null,traceId,finalResponse,traceDiagnostic:missingRows?{pageRole:sample.pageRole||null,pageId:sample.pageId||null,traceRows:Array.isArray(sample.traceRows)?sample.traceRows.length:0,turns:Array.isArray(sample.turns)?sample.turns.length:0,messages:Array.isArray(sample.messages)?sample.messages.length:0,sampleTraceIds:traceIdsFromSamples([sample]).slice(0,12)}:null,valuesRedacted:true};
|
||||
const rendered=['Code Agent 耗时 '+(elapsed>=0?fmtDuration(elapsed):'-')+' 最近 '+(recent>=0?String(recent)+' 秒前':'-')+' ('+status+')','=======================================================','sample seq='+(sample.seq??'-')+' ts='+(sample.ts||'-')+' traceId='+(traceId||'-')+' routeSession='+(sample.routeSessionId||'-')+' activeSession='+(sample.activeSessionId||'-'),...sampleNotice,...bodyRows,'==========================','Final Response',finalResponse.preview||'(空内容)'].join('\\n');
|
||||
return {ok:!missingRows,renderedText:rendered,blocker:missingRows?'trace-rows-missing':null,sampleSeq:sample.seq??null,traceId,finalResponse,laterTerminalSample:laterTerminalSample?{seq:laterTerminalSample.seq??null,ts:laterTerminalSample.ts??null}:null,traceDiagnostic:missingRows?{pageRole:sample.pageRole||null,pageId:sample.pageId||null,traceRows:Array.isArray(sample.traceRows)?sample.traceRows.length:0,turns:Array.isArray(sample.turns)?sample.turns.length:0,messages:Array.isArray(sample.messages)?sample.messages.length:0,sampleTraceIds:traceIdsFromSamples([sample]).slice(0,12)}:null,valuesRedacted:true};
|
||||
}
|
||||
function firstPresent(values){
|
||||
for(const value of values){
|
||||
|
||||
@@ -120,6 +120,8 @@ export async function status(config: UniDeskConfig, options: CommonOptions): Pro
|
||||
return renderStatusTable({
|
||||
ok: result.exitCode === 0 && summary?.ready === true,
|
||||
target,
|
||||
backendEndpoint: `${observability.traceBackend.serviceName}.${target.namespace}.svc.cluster.local:${observability.traceBackend.httpPort}`,
|
||||
collectorEndpoint: `${observability.collector.serviceName}.${target.namespace}.svc.cluster.local:${observability.collector.otlp.grpcPort}`,
|
||||
summary,
|
||||
remote: parsed === null ? compactCapture(result, { full: false }) : null,
|
||||
});
|
||||
@@ -143,6 +145,8 @@ export async function status(config: UniDeskConfig, options: CommonOptions): Pro
|
||||
function renderStatusTable(input: {
|
||||
ok: boolean;
|
||||
target: ObservabilityTarget;
|
||||
backendEndpoint: string;
|
||||
collectorEndpoint: string;
|
||||
summary: Record<string, unknown> | null;
|
||||
remote: Record<string, unknown> | null;
|
||||
}): RenderedCliResult {
|
||||
@@ -167,6 +171,7 @@ function renderStatusTable(input: {
|
||||
`platform-infra observability status (${input.ok ? "ok" : "not-ok"})`,
|
||||
"",
|
||||
`target=${input.target.id} namespace=${input.target.namespace} ready=${textValue(input.summary?.ready)} route=${input.target.route}`,
|
||||
`tempo=${input.backendEndpoint} collector=${input.collectorEndpoint} deployments=${deployments.length} pods=${pods.length} probes=${probes.length}`,
|
||||
"",
|
||||
"Deployments:",
|
||||
formatTable(["NAME", "READY", "AVAILABLE"], deployments.length > 0 ? deployments : [["-", "-", "-"]]),
|
||||
|
||||
@@ -87,6 +87,7 @@ export function compactDiagnoseCodeAgentResult(value: unknown): Record<string, u
|
||||
spanCount: source.spanCount ?? null,
|
||||
services: source.services ?? null,
|
||||
servicePath: source.servicePath ?? null,
|
||||
observabilityGap: source.observabilityGap ?? null,
|
||||
businessTraceIds: source.businessTraceIds ?? null,
|
||||
identity: compactDiagnoseIdentity(source.identity),
|
||||
agentrun: compactDiagnoseAgentRun(source.agentrun),
|
||||
|
||||
@@ -1986,8 +1986,26 @@ service_path = {
|
||||
service: ("reached" if service in services else "missing")
|
||||
for service in expected_services
|
||||
}
|
||||
service_path["complete"] = all(service in services for service in expected_services)
|
||||
missing_services = [service for service in expected_services if service not in services]
|
||||
service_path["complete"] = len(missing_services) == 0
|
||||
observability_gap = {
|
||||
"status": "complete" if len(missing_services) == 0 else "missing-service-spans",
|
||||
"expectedServices": expected_services,
|
||||
"seenServices": sorted(services),
|
||||
"missingServices": missing_services,
|
||||
"complete": len(missing_services) == 0,
|
||||
}
|
||||
if missing_services and ("hwlab-cloud-api" in services or identity.get("runId") not in (None, "") or identity.get("commandId") not in (None, "")):
|
||||
candidates.insert(0, {
|
||||
"code": "observability_gap_missing_service_spans",
|
||||
"label": "observability gap",
|
||||
"confidence": 0.76,
|
||||
"summary": "The business trace is correlated to Code Agent context but is missing expected service spans; do not interpret the missing manager/runner spans as proof that those services were not involved.",
|
||||
"evidence": observability_gap,
|
||||
})
|
||||
facts = []
|
||||
if missing_services:
|
||||
facts.append("observability gap: missing service spans " + ",".join(missing_services))
|
||||
if http_summary.get("actorForbidden"):
|
||||
facts.append("actor forbidden")
|
||||
terminal_status = agentrun.get("terminalStatus")
|
||||
@@ -2014,6 +2032,7 @@ summary = {
|
||||
"actorForbidden": http_summary.get("actorForbidden"),
|
||||
"terminalStatus": terminal_status,
|
||||
"failureKind": agentrun.get("failureKind"),
|
||||
"observabilityGap": observability_gap.get("status"),
|
||||
},
|
||||
}
|
||||
evidence = {
|
||||
@@ -2041,6 +2060,7 @@ payload = {
|
||||
"spanCount": len(spans),
|
||||
"services": services,
|
||||
"servicePath": service_path,
|
||||
"observabilityGap": observability_gap,
|
||||
"businessTraceIds": business_trace_ids[:20],
|
||||
"identity": identity,
|
||||
"agentrun": {
|
||||
|
||||
@@ -613,6 +613,8 @@ export function renderDiagnoseCodeAgentTable(input: {
|
||||
const projectionLag = asPlainRecord(input.result.projectionLag);
|
||||
const summary = asPlainRecord(input.result.summary);
|
||||
const evidence = asPlainRecord(input.result.evidence);
|
||||
const observabilityGap = asPlainRecord(input.result.observabilityGap);
|
||||
const servicePath = asPlainRecord(input.result.servicePath);
|
||||
const rootCauses = asArray(input.result.rootCauseCandidates).map((item) => asPlainRecord(item) ?? {});
|
||||
const http = asPlainRecord(input.result.http);
|
||||
const services = joinValues(input.result.services, 54);
|
||||
@@ -633,6 +635,14 @@ export function renderDiagnoseCodeAgentTable(input: {
|
||||
shortenEnd(textValue(candidate.summary ?? candidate.label), 80),
|
||||
]);
|
||||
const httpRows = httpTableRows(http);
|
||||
const expectedServices = asArray(observabilityGap?.expectedServices).map((item) => textValue(item)).filter((item) => item !== "-");
|
||||
const seenServices = new Set(asArray(observabilityGap?.seenServices).map((item) => textValue(item)).filter((item) => item !== "-"));
|
||||
const missingServices = new Set(asArray(observabilityGap?.missingServices).map((item) => textValue(item)).filter((item) => item !== "-"));
|
||||
const serviceRows = expectedServices.map((service) => [
|
||||
service,
|
||||
textValue(servicePath?.[service] ?? (missingServices.has(service) ? "missing" : seenServices.has(service) ? "reached" : "-")),
|
||||
missingServices.has(service) ? "missing" : seenServices.has(service) ? "seen" : "-",
|
||||
]);
|
||||
const queryClauses = asArray(input.query.queryClauses).map((item) => textValue(item)).filter((item) => item !== "-");
|
||||
const requestedRunId = textValue(input.query.runId);
|
||||
const requestedCommandId = textValue(input.query.commandId);
|
||||
@@ -658,6 +668,10 @@ export function renderDiagnoseCodeAgentTable(input: {
|
||||
"Root causes:",
|
||||
formatTable(["CODE", "CONF", "SUMMARY"], rootRows.length > 0 ? rootRows : [["-", "-", "-"]]),
|
||||
"",
|
||||
"Service trace coverage:",
|
||||
formatTable(["SERVICE", "PATH", "SPAN"], serviceRows.length > 0 ? serviceRows : [["-", "-", "-"]]),
|
||||
` observabilityGap=${textValue(observabilityGap?.status)} missing=${joinValues(observabilityGap?.missingServices, 60)} seen=${joinValues(observabilityGap?.seenServices, 80)}`,
|
||||
"",
|
||||
"HTTP:",
|
||||
formatTable(["METHOD", "ROUTE", "STATUS", "COUNT"], httpRows.length > 0 ? httpRows : [["-", "-", "-", "-"]]),
|
||||
"",
|
||||
|
||||
Reference in New Issue
Block a user