fix: improve workbench triad diagnostics
This commit is contained in:
@@ -3039,6 +3039,44 @@ function workbenchTriadMismatchKind(row) {
|
||||
return "unknown";
|
||||
}
|
||||
|
||||
/**
 * Classifies the root cause of an inconsistent Workbench turn-state triad
 * (session rail status / turn card status / Final Response presence).
 *
 * Checks are ordered from most to least specific:
 *   1. a rail-vs-card mismatch group whose status tuple shows a completed rail
 *      next to a running card without a Final Response (stale session rail);
 *   2. any Final Response mismatch, either counted in `summary` or visible in
 *      a non rail/card group whose tuple contains `final=false`;
 *   3. otherwise a generic "triad not sealed" classification.
 *
 * @param {object|null|undefined} drilldown - Drilldown object; only `groups`
 *   (entries with `mismatchKind` and `statusTuple`) and `summary.mismatchKinds`
 *   are read. Missing or malformed parts are treated as empty.
 * @param {object|null} [summary={}] - Triad summary; only
 *   `cardFinalResponseMismatchCount` is read.
 * @returns {{rootCause: string, rootCauseStatus: string, rootCauseConfidence: string,
 *   dominantMismatchKind: string, summary: string, nextAction: string,
 *   sourceOfTruth: string, valuesRedacted: boolean}} Root-cause descriptor.
 */
function workbenchTriadRootCauseFromDrilldown(drilldown, summary = {}) {
  const groups = Array.isArray(drilldown?.groups) ? drilldown.groups : [];
  const tupleOf = (group) => String(group?.statusTuple || "");
  const isRailCardMismatch = (group) => group?.mismatchKind === "rail-card-status-mismatch";

  // Most specific case first: the rail still reports the previous completed
  // turn while the card of a newer turn is running.
  const hasStaleCompletedRail = groups.some(
    (group) => isRailCardMismatch(group) && tupleOf(group).includes("rail=completed,card=running,final=false"),
  );
  if (hasStaleCompletedRail) {
    return {
      rootCause: "workbench_session_rail_status_stale_after_new_running_turn",
      rootCauseStatus: "confirmed-from-dom-samples",
      rootCauseConfidence: "high",
      dominantMismatchKind: "rail-card-status-mismatch",
      summary: "Workbench session rail kept the previous completed terminal status while a newer turn card was running and Final Response was absent",
      nextAction: "Inspect HWLAB frontend session status authority/reducer, especially workbench-server-state sessionStatusAuthorityFromMessages and SessionRail sessionToSessionTab status input; session rail must derive from the latest active turn/message authority rather than the previous sealed terminal message.",
      sourceOfTruth: "latest durable Workbench turn/message projection for the active session",
      valuesRedacted: true,
    };
  }

  // A Final Response mismatch is taken from the summary counter, or from any
  // group that is not a pure rail/card mismatch but still reports final=false.
  const finalMismatchCount = Number(summary?.cardFinalResponseMismatchCount ?? 0);
  const hasFinalMismatch =
    finalMismatchCount > 0 ||
    groups.some((group) => /final=false/u.test(tupleOf(group)) && !isRailCardMismatch(group));
  if (hasFinalMismatch) {
    return {
      rootCause: "workbench_terminal_final_response_not_sealed",
      rootCauseStatus: "confirmed-from-dom-samples",
      rootCauseConfidence: "high",
      dominantMismatchKind: "completed-card-final-response-absent",
      summary: "Workbench terminal turn card did not expose a structured Final Response body",
      nextAction: "Inspect HWLAB terminal message/finalResponse projection contract before changing renderer fallback behavior.",
      sourceOfTruth: "durable Workbench terminal message projection",
      valuesRedacted: true,
    };
  }

  // Generic fallback: report the first mismatch kind listed by the drilldown.
  const mismatchKinds = Array.isArray(drilldown?.summary?.mismatchKinds) ? drilldown.summary.mismatchKinds : [];
  return {
    rootCause: "workbench_projection_state_triad_not_sealed",
    rootCauseStatus: "confirmed-from-dom-samples",
    rootCauseConfidence: "high",
    dominantMismatchKind: mismatchKinds[0] ?? "unknown",
    summary: "Workbench session rail status, turn card status, and Final Response body presence diverged from the allowed state tuples",
    nextAction: "Use drilldown.otelDrilldown.commands for the listed traceIds, then inspect staticSourceHints and add unit tests from unitTestReproHints before changing UI rendering.",
    sourceOfTruth: "durable Workbench projection/read model",
    valuesRedacted: true,
  };
}
|
||||
|
||||
function workbenchTriadTuple(row) {
|
||||
return [
|
||||
"rail=" + (row?.railStatus ?? "-"),
|
||||
@@ -3082,6 +3120,7 @@ function workbenchTriadStaticSourceHints() {
|
||||
|
||||
function workbenchTriadUnitTestReproHints() {
|
||||
return [
|
||||
"frontend reducer: when a second trace is running in the same session, session rail status must not stay on the previous completed terminal trace",
|
||||
"backend projector: terminal event must produce a single sealed turn tuple consumed by session list, session detail, messages and turn-status APIs",
|
||||
"backend read model: completed rail status must not coexist with running turn card or missing Final Response for the same trace",
|
||||
"frontend server-state merge: stale running/empty snapshots must not overwrite a sealed completed+Final Response turn",
|
||||
@@ -3194,6 +3233,7 @@ function normalizeWorkbenchTriadStatus(status, running = false) {
|
||||
if (running === true) return "running";
|
||||
if (!value) return null;
|
||||
if (/^(completed|complete|succeeded|success|finished|done|terminal|sealed)$/u.test(value)) return "completed";
|
||||
if (/^(failed|failure|error|blocked|timeout|canceled|cancelled|stale|thread-resume-failed|interrupted|expired|idle)$/u.test(value)) return "completed";
|
||||
if (/^(pending|running|active|busy|admitted|dispatching|executing|streaming|processing|queued|in-progress|creating)$/u.test(value)) return "running";
|
||||
return null;
|
||||
}
|
||||
@@ -3364,30 +3404,35 @@ function buildFindings(samples, control, network, errors, sampleMetrics, promptN
|
||||
...(Array.isArray(turnStateTriad.invalidFullTriads) ? turnStateTriad.invalidFullTriads : []),
|
||||
...(Array.isArray(turnStateTriad.cardFinalResponseMismatches) ? turnStateTriad.cardFinalResponseMismatches : [])
|
||||
];
|
||||
if (Number(turnStateTriadSummary.invalidRowCount ?? 0) > 0) findings.push({
|
||||
id: "workbench-turn-state-triad-inconsistent",
|
||||
severity: "red",
|
||||
summary: "Workbench session rail status, turn card status, and Final Response body presence diverged from the allowed running/running/absent or completed/completed/present tuples",
|
||||
count: turnStateTriadSummary.invalidRowCount,
|
||||
fullTriadCount: turnStateTriadSummary.fullTriadRowCount,
|
||||
invalidFullTriadCount: turnStateTriadSummary.invalidFullTriadCount,
|
||||
cardFinalResponseMismatchCount: turnStateTriadSummary.cardFinalResponseMismatchCount,
|
||||
legacyCollectorMissingCount: turnStateTriadSummary.collectorMissingRowCount,
|
||||
collectorMissingFields: Array.isArray(turnStateTriadSummary.collectorMissingFields) ? turnStateTriadSummary.collectorMissingFields : [],
|
||||
allowedTuples: [
|
||||
{ railStatus: "completed", cardStatus: "completed", finalResponsePresent: true },
|
||||
{ railStatus: "running", cardStatus: "running", finalResponsePresent: false }
|
||||
],
|
||||
samples: turnStateTriadRows.slice(0, 20),
|
||||
drilldown: turnStateTriad.drilldown ?? buildWorkbenchTurnStateTriadDrilldown(turnStateTriadRows),
|
||||
collectorMissingSamples: Array.isArray(turnStateTriad.collectorMissingRows) ? turnStateTriad.collectorMissingRows.slice(0, 10) : [],
|
||||
sourceOfTruth: "durable Workbench projection/read model; do not repair via DOM fallback or GET-side state mutation",
|
||||
nextAction: "Use drilldown.otelDrilldown.commands for the listed traceIds, then inspect staticSourceHints and add unit tests from unitTestReproHints before changing UI rendering.",
|
||||
rootCause: "workbench_projection_state_triad_not_sealed",
|
||||
rootCauseStatus: "confirmed-from-dom-samples",
|
||||
rootCauseConfidence: "high",
|
||||
valuesRedacted: true
|
||||
});
|
||||
if (Number(turnStateTriadSummary.invalidRowCount ?? 0) > 0) {
|
||||
const drilldown = turnStateTriad.drilldown ?? buildWorkbenchTurnStateTriadDrilldown(turnStateTriadRows);
|
||||
const rootCause = workbenchTriadRootCauseFromDrilldown(drilldown, turnStateTriadSummary);
|
||||
findings.push({
|
||||
id: "workbench-turn-state-triad-inconsistent",
|
||||
severity: "red",
|
||||
summary: rootCause.summary,
|
||||
count: turnStateTriadSummary.invalidRowCount,
|
||||
fullTriadCount: turnStateTriadSummary.fullTriadRowCount,
|
||||
invalidFullTriadCount: turnStateTriadSummary.invalidFullTriadCount,
|
||||
cardFinalResponseMismatchCount: turnStateTriadSummary.cardFinalResponseMismatchCount,
|
||||
legacyCollectorMissingCount: turnStateTriadSummary.collectorMissingRowCount,
|
||||
collectorMissingFields: Array.isArray(turnStateTriadSummary.collectorMissingFields) ? turnStateTriadSummary.collectorMissingFields : [],
|
||||
dominantMismatchKind: rootCause.dominantMismatchKind,
|
||||
allowedTuples: [
|
||||
{ railStatus: "completed", cardStatus: "completed", finalResponsePresent: true },
|
||||
{ railStatus: "running", cardStatus: "running", finalResponsePresent: false }
|
||||
],
|
||||
samples: turnStateTriadRows.slice(0, 20),
|
||||
drilldown,
|
||||
collectorMissingSamples: Array.isArray(turnStateTriad.collectorMissingRows) ? turnStateTriad.collectorMissingRows.slice(0, 10) : [],
|
||||
sourceOfTruth: rootCause.sourceOfTruth + "; do not repair via DOM fallback or GET-side state mutation",
|
||||
nextAction: rootCause.nextAction,
|
||||
rootCause: rootCause.rootCause,
|
||||
rootCauseStatus: rootCause.rootCauseStatus,
|
||||
rootCauseConfidence: rootCause.rootCauseConfidence,
|
||||
valuesRedacted: true
|
||||
});
|
||||
}
|
||||
const promptFailures = Array.isArray(promptNetwork?.rounds) ? promptNetwork.rounds.filter((item) => item.chatPostOk === false && !promptCommandHasAuthoritativeSubmitSideEffect(control, item)) : [];
|
||||
if (promptFailures.length > 0) findings.push({ id: "prompt-chat-submit-failed", severity: "red", summary: "sendPrompt command had no successful /v1/agent/chat or /v1/agent/chat/steer POST response in the sampling window", count: promptFailures.length, rounds: promptFailures.slice(0, 10) });
|
||||
const promptSteerRounds = Array.isArray(promptNetwork?.rounds) ? promptNetwork.rounds.filter((item) => item.steerUsed === true) : [];
|
||||
|
||||
@@ -0,0 +1,158 @@
|
||||
import assert from "node:assert/strict";
import { spawnSync } from "node:child_process";
import { mkdtemp, readFile, rm, writeFile } from "node:fs/promises";
import { tmpdir } from "node:os";
import { join } from "node:path";
import { test } from "bun:test";

import { nodeWebObserveAnalyzerSource } from "../hwlab-node-web-observe-analyzer-source";
|
||||
|
||||
// Alert thresholds handed to the analyzer as JSON through the
// UNIDESK_WEB_OBSERVE_ALERT_THRESHOLDS_JSON environment variable.
// NOTE(review): the limits look deliberately permissive (large timeouts,
// counts and RSS ceilings), presumably so unrelated alerts stay quiet in this
// test — confirm against the analyzer's threshold handling.
const alertThresholds = {
  sameOriginApiSlowMs: 60000,
  partialApiSlowMs: 60000,
  longLivedStreamOpenSlowMs: 60000,
  visibleLoadingSlowMs: 60000,
  turnTimingSampleSlackSeconds: 60,
  turnElapsedSevereTimeoutSeconds: 3600,
  domEvaluateTimeoutRedCount: 99,
  domEvaluateTimeoutRedWindowMs: 60000,
  screenshotTimeoutRedCount: 99,
  pageErrorRedCount: 99,
  browserProcessSampleIntervalMs: 1000,
  browserTotalRssRedMb: 999999,
  browserProcessRssRedMb: 999999,
  browserRssGrowthRedMb: 999999,
  browserRssGrowthWindowMs: 60000,
  playwrightResponsivenessRedMs: 60000,
  playwrightResponsivenessTimeoutRedCount: 99,
  cdpMetricsTimeoutRedCount: 99,
  uncommandedStateChangeCommandWindowMs: 1000,
  scrollJumpCommandWindowMs: 1000,
  scrollJumpFromY: 999999,
  scrollJumpToY: 999999,
  sessionRailFallbackRatio: 0.5,
};
|
||||
|
||||
// Browser-freeze policy handed to the analyzer as JSON through the
// UNIDESK_WEB_OBSERVE_BROWSER_FREEZE_POLICY_JSON environment variable.
// The policy is enabled but its kill section is disabled.
// NOTE(review): blocker limits look deliberately permissive, presumably so the
// freeze detector never trips during this test — confirm against the analyzer.
const browserFreezePolicy = {
  enabled: true,
  blockerWindowMs: 60000,
  memory: {
    totalRssBlockerMb: 999999,
    processRssBlockerMb: 999999,
    growthBlockerMb: 999999,
  },
  responsiveness: {
    latencyBlockerMs: 60000,
    eventBlockerCount: 99,
  },
  cdp: {
    metricsTimeoutBlockerCount: 99,
  },
  kill: {
    enabled: false,
    gracefulSignal: "SIGTERM",
    forceSignal: "SIGKILL",
    graceMs: 1000,
    pollIntervalMs: 100,
    exitCode: 124,
  },
};
|
||||
|
||||
/**
 * Builds one observe sample for session `ses_triage` with a single active
 * session-rail item in `railStatus` and the given turn cards.
 */
function workbenchTriadSample(seq: number, ts: string, railStatus: string, turns: Array<Record<string, unknown>>) {
  return {
    seq,
    ts,
    path: "/workbench/sessions/ses_triage",
    url: "https://hwlab.example.test/workbench/sessions/ses_triage",
    pageRole: "control",
    pageId: "control-test",
    routeSessionId: "ses_triage",
    activeSessionId: "ses_triage",
    sessionRail: {
      items: [{
        index: 0,
        active: true,
        status: railStatus,
        dataStatus: railStatus,
        running: false,
        dataRunning: "false",
        sessionId: "ses_triage",
        sessionIdPrefix: "ses_triage",
      }],
    },
    turns,
  };
}

test("observe analyzer classifies stale completed session rail when a newer turn is running", async () => {
  const stateDir = await mkdtemp(join(tmpdir(), "unidesk-web-observe-analyzer-"));
  try {
    const analyzerPath = join(stateDir, "analyze.mjs");
    const samplesPath = join(stateDir, "samples.jsonl");
    await writeFile(analyzerPath, nodeWebObserveAnalyzerSource(), { mode: 0o700 });

    const samples = [
      // Sample 1: rail says "completed" while a second, newer turn card is running.
      workbenchTriadSample(1, "2026-07-01T15:21:24.767Z", "completed", [
        {
          role: "agent",
          status: "completed",
          traceId: "trc_previous_completed",
          messageId: "msg_previous_completed_agent",
          finalResponsePresent: true,
          finalResponseTextBytes: 12,
        },
        {
          role: "agent",
          status: "running",
          traceId: "trc_running_new_turn",
          messageId: "msg_running_new_turn_agent",
          finalResponsePresent: false,
          finalResponseTextBytes: 0,
        },
      ]),
      // Sample 2: rail and card both "canceled" with a Final Response present;
      // the assertions below expect this sample not to add an invalid row.
      workbenchTriadSample(2, "2026-07-01T15:22:49.918Z", "canceled", [
        {
          role: "agent",
          status: "canceled",
          traceId: "trc_canceled_terminal",
          messageId: "msg_canceled_terminal_agent",
          finalResponsePresent: true,
          finalResponseTextBytes: 17,
        },
      ]),
    ];
    await writeFile(samplesPath, samples.map((sample) => JSON.stringify(sample)).join("\n") + "\n");

    const result = spawnSync("bun", [analyzerPath, stateDir], {
      cwd: join(import.meta.dir, "../../.."),
      env: {
        ...process.env,
        UNIDESK_WEB_OBSERVE_ANALYZE_TAIL_SAMPLES: "0",
        UNIDESK_WEB_OBSERVE_ALERT_THRESHOLDS_JSON: JSON.stringify(alertThresholds),
        UNIDESK_WEB_OBSERVE_BROWSER_FREEZE_POLICY_JSON: JSON.stringify(browserFreezePolicy),
      },
      encoding: "utf8",
    });
    assert.equal(result.status, 0, result.stderr || result.stdout);

    const report = JSON.parse(await readFile(join(stateDir, "analysis", "report.json"), "utf8"));
    const finding = report.findings.find((item: Record<string, unknown>) => item.id === "workbench-turn-state-triad-inconsistent");
    assert.equal(finding?.rootCause, "workbench_session_rail_status_stale_after_new_running_turn");
    assert.equal(finding?.dominantMismatchKind, "rail-card-status-mismatch");
    assert.match(String(finding?.summary), /previous completed terminal status/u);
    assert.equal(report.sampleMetrics.workbenchTurnStateTriad.summary.invalidRowCount, 1);
    assert.equal(report.sampleMetrics.workbenchTurnStateTriad.summary.cardFinalResponseMismatchCount, 0);
  } finally {
    // Remove the scratch state directory whether or not the assertions passed,
    // so repeated runs do not accumulate temp directories.
    await rm(stateDir, { recursive: true, force: true });
  }
}, 20_000);
|
||||
@@ -89,6 +89,7 @@ export function compactDiagnoseCodeAgentResult(value: unknown): Record<string, u
|
||||
servicePath: source.servicePath ?? null,
|
||||
observabilityGap: source.observabilityGap ?? null,
|
||||
businessTraceIds: source.businessTraceIds ?? null,
|
||||
businessTraceScope: source.businessTraceScope ?? null,
|
||||
identity: compactDiagnoseIdentity(source.identity),
|
||||
agentrun: compactDiagnoseAgentRun(source.agentrun),
|
||||
hwlabReadModel: source.hwlabReadModel ?? null,
|
||||
|
||||
@@ -1450,6 +1450,88 @@ def identity_from_spans(spans):
|
||||
identity[key] = preferred if preferred is not None else fallback
|
||||
return identity
|
||||
|
||||
def span_business_trace_id(item):
    """Return the business trace id carried by a span, or None.

    Looks at the span's ``attributes`` mapping (ignored when it is not a dict)
    and returns the first string value, stripped of surrounding whitespace,
    that starts with ``trc_`` among the keys ``traceId``,
    ``workbench.trace_id`` and ``workbench.turn_id``, checked in that order.
    """
    attrs = item.get("attributes")
    if not isinstance(attrs, dict):
        attrs = {}
    for key in ("traceId", "workbench.trace_id", "workbench.turn_id"):
        value = attrs.get(key)
        if isinstance(value, str) and value.strip().startswith("trc_"):
            return value.strip()
    return None
|
||||
|
||||
def span_identity_value(item, key):
    """Return attribute ``key`` of a span as a string, or None when unset.

    The span's ``attributes`` entry is ignored unless it is a dict. A value of
    ``None`` or the empty string counts as unset; any other value is converted
    with ``str``.
    """
    attrs = item.get("attributes")
    if not isinstance(attrs, dict):
        attrs = {}
    value = attrs.get(key)
    if value in (None, ""):
        return None
    return str(value)
|
||||
|
||||
def span_matches_scope_identity(item, identity):
    """Tell whether a span without a trace id belongs to the scoped identity.

    ``identity`` must be a dict; anything else yields False. The decision is
    made on the first of ``commandId`` / ``turnId`` that is set both in
    ``identity`` and on the span: the span matches when the two values are
    equal as strings. If neither key is set on both sides the span does not
    match.
    """
    if not isinstance(identity, dict):
        return False
    # Only strong per-turn identities are safe enough for no-trace-id spans.
    # runId/sessionId/runnerJobId can cover several turns in the same Workbench
    # session and caused terminal status leakage across business trace ids.
    for key in ("commandId", "turnId"):
        scoped = identity.get(key)
        if scoped in (None, ""):
            # Nothing to compare against for this key; no need to read the span.
            continue
        value = span_identity_value(item, key)
        if value in (None, ""):
            continue
        return str(scoped) == str(value)
    return False
|
||||
|
||||
def scoped_spans_for_business_trace(spans, business_trace_id):
    """Restrict ``spans`` to those belonging to one business trace id.

    Returns a ``(scoped_spans, scope_info)`` tuple. ``scope_info["mode"]`` is:

    * ``"unscoped-no-business-trace-id"`` when ``business_trace_id`` is None
      or empty: every span is returned;
    * ``"unscoped-no-exact-business-trace-span"`` when no span carries exactly
      that business trace id: every span is returned, and the other business
      trace ids seen are listed in ``crossBusinessTraceIds``;
    * ``"business-trace-scoped"`` otherwise: the result holds the spans that
      carry exactly that id plus the spans without any business trace id that
      ``span_matches_scope_identity`` accepts for the identity derived from
      the exact spans, sorted by ``(_start, _index)``.
    """
    if business_trace_id in (None, ""):
        return list(spans), {
            "mode": "unscoped-no-business-trace-id",
            "requestedBusinessTraceId": None,
            "totalSpanCount": len(spans),
            "scopedSpanCount": len(spans),
            "excludedDifferentTraceSpanCount": 0,
            "includedNoTraceIdentitySpanCount": 0,
            "crossBusinessTraceIds": [],
        }

    # Partition spans: exact id match, a different business trace id, or none.
    exact = []
    no_trace = []
    cross_ids = collections.Counter()
    for item in spans:
        trace_id = span_business_trace_id(item)
        if trace_id == business_trace_id:
            exact.append(item)
        elif trace_id:
            cross_ids[trace_id] += 1
        else:
            no_trace.append(item)
    cross_business_trace_ids = [
        {"traceId": key, "count": count} for key, count in cross_ids.most_common(8)
    ]

    if not exact:
        return list(spans), {
            "mode": "unscoped-no-exact-business-trace-span",
            "requestedBusinessTraceId": business_trace_id,
            "totalSpanCount": len(spans),
            "scopedSpanCount": len(spans),
            "excludedDifferentTraceSpanCount": 0,
            "includedNoTraceIdentitySpanCount": 0,
            "crossBusinessTraceIds": cross_business_trace_ids,
        }

    identity = identity_from_spans(exact)
    # `exact` and `no_trace` are disjoint by construction, so every no-trace
    # span only needs the identity check.
    adopted = [item for item in no_trace if span_matches_scope_identity(item, identity)]
    scoped = exact + adopted
    scoped.sort(key=lambda item: (item.get("_start", 0), item.get("_index", 0)))
    return scoped, {
        "mode": "business-trace-scoped",
        "requestedBusinessTraceId": business_trace_id,
        "totalSpanCount": len(spans),
        "scopedSpanCount": len(scoped),
        # NOTE(review): despite the key name this counts every span left out of
        # scope, i.e. spans of other business traces AND no-trace-id spans that
        # did not match the scope identity. Value kept for compatibility.
        "excludedDifferentTraceSpanCount": len(spans) - len(scoped),
        "includedNoTraceIdentitySpanCount": len(adopted),
        "scopeIdentity": {
            key: identity.get(key)
            for key in ("commandId", "turnId", "runId", "sessionId")
            if identity.get(key) not in (None, "")
        },
        "crossBusinessTraceIds": cross_business_trace_ids,
    }
|
||||
|
||||
def flatten_trace(trace_body):
|
||||
services = set()
|
||||
business_trace_ids = set()
|
||||
@@ -1752,6 +1834,20 @@ def root_cause_candidates(http_summary, agentrun, read_model, lag_summary, error
|
||||
"evidence": lag_summary.get("reasons", []),
|
||||
})
|
||||
failure_kind = str(agentrun.get("failureKind") or "")
|
||||
read_model_turn_status = read_model.get("turnStatus")
|
||||
status_conflict = terminal_status_conflicts(agentrun.get("terminalStatus"), read_model_turn_status)
|
||||
if status_conflict:
|
||||
candidates.append({
|
||||
"code": "otel_business_trace_terminal_conflict",
|
||||
"label": "OTel terminal conflict",
|
||||
"confidence": 0.86,
|
||||
"summary": "Scoped AgentRun terminal status conflicts with HWLAB read-model turn status for the requested business trace; prefer the HWLAB read-model status for Workbench RCA and inspect OTel correlation leakage.",
|
||||
"evidence": {
|
||||
"agentrunTerminalStatus": agentrun.get("terminalStatus"),
|
||||
"hwlabReadModelTurnStatus": read_model_turn_status,
|
||||
"turnStatusCounts": read_model.get("turnStatusCounts"),
|
||||
},
|
||||
})
|
||||
if failure_kind == "provider-auth-failed":
|
||||
candidates.append({
|
||||
"code": "provider_auth_failed",
|
||||
@@ -1765,7 +1861,7 @@ def root_cause_candidates(http_summary, agentrun, read_model, lag_summary, error
|
||||
"terminalCategory": agentrun.get("terminalCategory"),
|
||||
},
|
||||
})
|
||||
if agentrun.get("terminalStatus") in ("failed", "error", "timeout", "blocked", "cancelled"):
|
||||
if agentrun.get("terminalStatus") in ("failed", "error", "timeout", "blocked", "cancelled") and not status_conflict:
|
||||
candidates.append({
|
||||
"code": "agentrun_terminal_failed",
|
||||
"label": "AgentRun terminal failed",
|
||||
@@ -1790,6 +1886,19 @@ def root_cause_candidates(http_summary, agentrun, read_model, lag_summary, error
|
||||
"runnerProviderClassification": agentrun.get("runnerProviderClassification"),
|
||||
},
|
||||
})
|
||||
if agentrun.get("terminalStatus") in (None, "") and read_model_turn_status in ("completed", "failed", "error", "timeout", "blocked", "cancelled", "canceled"):
|
||||
candidates.append({
|
||||
"code": "hwlab_read_model_terminal_without_runner_spans",
|
||||
"label": "HWLAB read model terminal",
|
||||
"confidence": 0.7,
|
||||
"summary": "HWLAB read model shows a terminal turn status but scoped AgentRun manager/runner terminal spans are absent; treat missing runner spans as an observability gap, not proof that the turn is still running.",
|
||||
"evidence": {
|
||||
"turnStatus": read_model_turn_status,
|
||||
"turnStatusCounts": read_model.get("turnStatusCounts"),
|
||||
"sessionListReadCount": read_model.get("sessionListReadCount"),
|
||||
"sessionMessageReadCount": read_model.get("sessionMessageReadCount"),
|
||||
},
|
||||
})
|
||||
if error_spans and agentrun.get("terminalStatus") == "completed":
|
||||
candidates.append({
|
||||
"code": "tool_call_failures_recovered",
|
||||
@@ -1812,6 +1921,28 @@ def root_cause_candidates(http_summary, agentrun, read_model, lag_summary, error
|
||||
candidates.sort(key=lambda item: item.get("confidence", 0), reverse=True)
|
||||
return candidates
|
||||
|
||||
def comparable_terminal_status(value):
    """Normalize a status value so two status vocabularies can be compared.

    The value is stringified, stripped, lower-cased and has ``_`` replaced by
    ``-``. Falsy or blank input yields None. Known spellings are folded onto
    one canonical form (``cancelled`` -> ``canceled``;
    ``success``/``succeeded``/``complete`` -> ``completed``;
    ``error``/``failure`` -> ``failed``); anything else is returned as
    normalized.
    """
    text = str(value or "").strip().lower().replace("_", "-")
    if not text:
        return None
    aliases = {
        "cancelled": "canceled",
        "success": "completed",
        "succeeded": "completed",
        "complete": "completed",
        "error": "failed",
        "failure": "failed",
    }
    return aliases.get(text, text)
|
||||
|
||||
def terminal_status_conflicts(left, right):
    """Tell whether two statuses are both terminal and disagree.

    Both values are normalized with ``comparable_terminal_status`` first.
    Returns False when either side is missing or is not one of the recognized
    terminal statuses; otherwise returns True exactly when the two normalized
    statuses differ.
    """
    left_status = comparable_terminal_status(left)
    right_status = comparable_terminal_status(right)
    if left_status in (None, "") or right_status in (None, ""):
        return False
    terminal_values = {
        "completed",
        "failed",
        "timeout",
        "blocked",
        "canceled",
        "stale",
        "thread-resume-failed",
        "interrupted",
        "expired",
    }
    # A non-terminal status on either side is not a conflict.
    if left_status not in terminal_values or right_status not in terminal_values:
        return False
    return left_status != right_status
|
||||
|
||||
def candidate_score(trace_id, meta, trace_body, trace_rc, trace_err):
|
||||
parsed = parse_json(trace_body)
|
||||
meta_root_service = str(meta.get("rootServiceName") or "") if isinstance(meta, dict) else ""
|
||||
@@ -1837,13 +1968,15 @@ def candidate_score(trace_id, meta, trace_body, trace_rc, trace_err):
|
||||
"stderrTail": (trace_err or "")[-1000:],
|
||||
}, None
|
||||
flat = flatten_trace(parsed)
|
||||
spans = flat["spans"]
|
||||
services = set(flat["services"])
|
||||
all_spans = flat["spans"]
|
||||
spans, business_scope = scoped_spans_for_business_trace(all_spans, BUSINESS_TRACE_ID)
|
||||
services = set(str(item.get("service") or "") for item in spans)
|
||||
names = [str(item.get("name") or "") for item in spans]
|
||||
lowered_names = [name.lower() for name in names]
|
||||
identity = identity_from_spans(spans)
|
||||
agentrun = agentrun_summary(spans)
|
||||
error_spans = flat["errorSpans"]
|
||||
scoped_span_ids = set(id(item) for item in spans)
|
||||
error_spans = [item for item in flat["errorSpans"] if id(item) in scoped_span_ids]
|
||||
score = 0
|
||||
reasons = []
|
||||
def add(points, reason):
|
||||
@@ -1902,6 +2035,7 @@ def candidate_score(trace_id, meta, trace_body, trace_rc, trace_err):
|
||||
"identity": {key: identity.get(key) for key in ("runId", "commandId", "sessionId", "runnerJobId", "runnerId", "backendProfile") if identity.get(key) not in (None, "")},
|
||||
"terminalStatus": agentrun.get("terminalStatus"),
|
||||
"errorSpanCount": len(error_spans),
|
||||
"businessTraceScope": business_scope,
|
||||
"candidateQuality": candidate_quality,
|
||||
"lowConfidence": candidate_quality != "normal",
|
||||
"rootTraceName": meta_root_name or None,
|
||||
@@ -2043,10 +2177,12 @@ if not isinstance(trace_parsed, dict):
|
||||
raise SystemExit(1)
|
||||
|
||||
flat = flatten_trace(trace_parsed)
|
||||
spans = flat["spans"]
|
||||
services = flat["services"]
|
||||
all_spans = flat["spans"]
|
||||
spans, business_scope = scoped_spans_for_business_trace(all_spans, BUSINESS_TRACE_ID)
|
||||
services = sorted(set(str(item.get("service") or "") for item in spans))
|
||||
business_trace_ids = flat["businessTraceIds"]
|
||||
error_spans = flat["errorSpans"]
|
||||
scoped_span_ids = set(id(item) for item in spans)
|
||||
error_spans = [item for item in flat["errorSpans"] if id(item) in scoped_span_ids]
|
||||
idle_warning_spans = [item for item in spans if str(item.get("name") or "") == "codex_stdio.idle_warning"]
|
||||
http_summary = http_status_summary(spans)
|
||||
read_model = hwlab_read_model_summary(spans)
|
||||
@@ -2080,17 +2216,25 @@ if missing_services and ("hwlab-cloud-api" in services or identity.get("runId")
|
||||
facts = []
|
||||
if missing_services:
|
||||
facts.append("observability gap: missing service spans " + ",".join(missing_services))
|
||||
if business_scope.get("mode") == "business-trace-scoped" and business_scope.get("crossBusinessTraceIds"):
|
||||
facts.append("cross-business-trace spans excluded")
|
||||
if http_summary.get("actorForbidden"):
|
||||
facts.append("actor forbidden")
|
||||
terminal_status = agentrun.get("terminalStatus")
|
||||
if terminal_status in ("failed", "error", "timeout", "blocked", "cancelled"):
|
||||
read_model_turn_status = read_model.get("turnStatus")
|
||||
agentrun_read_model_status_conflict = terminal_status_conflicts(terminal_status, read_model_turn_status)
|
||||
if agentrun_read_model_status_conflict:
|
||||
facts.append("OTel AgentRun terminal status conflicts with HWLAB read model")
|
||||
if terminal_status in ("failed", "error", "timeout", "blocked", "cancelled") and not agentrun_read_model_status_conflict:
|
||||
failure_kind = agentrun.get("failureKind")
|
||||
if failure_kind:
|
||||
facts.append(f"AgentRun terminal failed ({failure_kind})")
|
||||
else:
|
||||
facts.append("AgentRun terminal failed")
|
||||
if terminal_status == "completed":
|
||||
if terminal_status == "completed" and not agentrun_read_model_status_conflict:
|
||||
facts.append("AgentRun completed")
|
||||
if (terminal_status in (None, "") or agentrun_read_model_status_conflict) and read_model_turn_status in ("completed", "failed", "error", "timeout", "blocked", "cancelled", "canceled"):
|
||||
facts.append("HWLAB read model terminal " + str(read_model_turn_status))
|
||||
if lag.get("status") in ("confirmed", "suspected"):
|
||||
facts.append("projection/read-model stale")
|
||||
if idle_warning_spans and terminal_status in (None, ""):
|
||||
@@ -2105,8 +2249,11 @@ summary = {
|
||||
"runnerProvider": agentrun.get("runnerProviderClassification"),
|
||||
"actorForbidden": http_summary.get("actorForbidden"),
|
||||
"terminalStatus": terminal_status,
|
||||
"readModelTurnStatus": read_model_turn_status,
|
||||
"agentrunReadModelStatusConflict": agentrun_read_model_status_conflict,
|
||||
"failureKind": agentrun.get("failureKind"),
|
||||
"observabilityGap": observability_gap.get("status"),
|
||||
"businessTraceScope": business_scope.get("mode"),
|
||||
},
|
||||
}
|
||||
evidence = {
|
||||
@@ -2136,6 +2283,7 @@ payload = {
|
||||
"servicePath": service_path,
|
||||
"observabilityGap": observability_gap,
|
||||
"businessTraceIds": business_trace_ids[:20],
|
||||
"businessTraceScope": business_scope,
|
||||
"identity": identity,
|
||||
"agentrun": {
|
||||
"terminalStatus": agentrun.get("terminalStatus"),
|
||||
|
||||
Reference in New Issue
Block a user