fix: improve workbench triad diagnostics

This commit is contained in:
Codex
2026-07-01 16:15:50 +00:00
parent 113b6809d1
commit 624736f336
4 changed files with 385 additions and 33 deletions
@@ -3039,6 +3039,44 @@ function workbenchTriadMismatchKind(row) {
return "unknown";
}
/**
 * Pick a root-cause classification for a Workbench triad inconsistency.
 *
 * Three cases are tried in priority order:
 *  1. a "rail-card-status-mismatch" group whose status tuple contains
 *     "rail=completed,card=running,final=false" (stale session rail);
 *  2. a Final Response mismatch, either counted in `summary` or visible as a
 *     "final=false" tuple on a group of any other mismatch kind;
 *  3. otherwise the generic "triad not sealed" classification, whose dominant
 *     mismatch kind is the first entry of `drilldown.summary.mismatchKinds`.
 *
 * @param {object} [drilldown] - Drilldown object; `groups` and
 *   `summary.mismatchKinds` are only used when they are arrays.
 * @param {object} [summary={}] - Triad summary; only
 *   `cardFinalResponseMismatchCount` is read.
 * @returns {{rootCause: string, rootCauseStatus: string, rootCauseConfidence: string,
 *   dominantMismatchKind: string, summary: string, nextAction: string,
 *   sourceOfTruth: string, valuesRedacted: boolean}} The selected classification.
 */
function workbenchTriadRootCauseFromDrilldown(drilldown, summary = {}) {
  const railCardKind = "rail-card-status-mismatch";

  // Every classification shares status/confidence/redaction; only the
  // remaining fields vary. Key order is kept stable for serialized reports.
  const classify = (rootCause, dominantMismatchKind, summaryText, nextAction, sourceOfTruth) => ({
    rootCause,
    rootCauseStatus: "confirmed-from-dom-samples",
    rootCauseConfidence: "high",
    dominantMismatchKind,
    summary: summaryText,
    nextAction,
    sourceOfTruth,
    valuesRedacted: true,
  });
  const tupleOf = (entry) => String(entry?.statusTuple || "");
  const entries = Array.isArray(drilldown?.groups) ? drilldown.groups : [];

  // Case 1: rail still shows "completed" while the card is running.
  const staleCompletedRail = entries.some((entry) => {
    if (entry?.mismatchKind !== railCardKind) return false;
    return tupleOf(entry).includes("rail=completed,card=running,final=false");
  });
  if (staleCompletedRail) {
    return classify(
      "workbench_session_rail_status_stale_after_new_running_turn",
      "rail-card-status-mismatch",
      "Workbench session rail kept the previous completed terminal status while a newer turn card was running and Final Response was absent",
      "Inspect HWLAB frontend session status authority/reducer, especially workbench-server-state sessionStatusAuthorityFromMessages and SessionRail sessionToSessionTab status input; session rail must derive from the latest active turn/message authority rather than the previous sealed terminal message.",
      "latest durable Workbench turn/message projection for the active session",
    );
  }

  // Case 2: Final Response missing, by count or by a non rail/card group.
  const countedFinalMismatches = Number(summary?.cardFinalResponseMismatchCount ?? 0);
  const finalResponseMismatch = countedFinalMismatches > 0
    || entries.some((entry) => tupleOf(entry).includes("final=false") && entry?.mismatchKind !== railCardKind);
  if (finalResponseMismatch) {
    return classify(
      "workbench_terminal_final_response_not_sealed",
      "completed-card-final-response-absent",
      "Workbench terminal turn card did not expose a structured Final Response body",
      "Inspect HWLAB terminal message/finalResponse projection contract before changing renderer fallback behavior.",
      "durable Workbench terminal message projection",
    );
  }

  // Case 3: generic fallback.
  const reportedKinds = Array.isArray(drilldown?.summary?.mismatchKinds) ? drilldown.summary.mismatchKinds : [];
  return classify(
    "workbench_projection_state_triad_not_sealed",
    reportedKinds[0] ?? "unknown",
    "Workbench session rail status, turn card status, and Final Response body presence diverged from the allowed state tuples",
    "Use drilldown.otelDrilldown.commands for the listed traceIds, then inspect staticSourceHints and add unit tests from unitTestReproHints before changing UI rendering.",
    "durable Workbench projection/read model",
  );
}
function workbenchTriadTuple(row) {
return [
"rail=" + (row?.railStatus ?? "-"),
@@ -3082,6 +3120,7 @@ function workbenchTriadStaticSourceHints() {
function workbenchTriadUnitTestReproHints() {
return [
"frontend reducer: when a second trace is running in the same session, session rail status must not stay on the previous completed terminal trace",
"backend projector: terminal event must produce a single sealed turn tuple consumed by session list, session detail, messages and turn-status APIs",
"backend read model: completed rail status must not coexist with running turn card or missing Final Response for the same trace",
"frontend server-state merge: stale running/empty snapshots must not overwrite a sealed completed+Final Response turn",
@@ -3194,6 +3233,7 @@ function normalizeWorkbenchTriadStatus(status, running = false) {
if (running === true) return "running";
if (!value) return null;
if (/^(completed|complete|succeeded|success|finished|done|terminal|sealed)$/u.test(value)) return "completed";
if (/^(failed|failure|error|blocked|timeout|canceled|cancelled|stale|thread-resume-failed|interrupted|expired|idle)$/u.test(value)) return "completed";
if (/^(pending|running|active|busy|admitted|dispatching|executing|streaming|processing|queued|in-progress|creating)$/u.test(value)) return "running";
return null;
}
@@ -3364,30 +3404,35 @@ function buildFindings(samples, control, network, errors, sampleMetrics, promptN
...(Array.isArray(turnStateTriad.invalidFullTriads) ? turnStateTriad.invalidFullTriads : []),
...(Array.isArray(turnStateTriad.cardFinalResponseMismatches) ? turnStateTriad.cardFinalResponseMismatches : [])
];
if (Number(turnStateTriadSummary.invalidRowCount ?? 0) > 0) findings.push({
id: "workbench-turn-state-triad-inconsistent",
severity: "red",
summary: "Workbench session rail status, turn card status, and Final Response body presence diverged from the allowed running/running/absent or completed/completed/present tuples",
count: turnStateTriadSummary.invalidRowCount,
fullTriadCount: turnStateTriadSummary.fullTriadRowCount,
invalidFullTriadCount: turnStateTriadSummary.invalidFullTriadCount,
cardFinalResponseMismatchCount: turnStateTriadSummary.cardFinalResponseMismatchCount,
legacyCollectorMissingCount: turnStateTriadSummary.collectorMissingRowCount,
collectorMissingFields: Array.isArray(turnStateTriadSummary.collectorMissingFields) ? turnStateTriadSummary.collectorMissingFields : [],
allowedTuples: [
{ railStatus: "completed", cardStatus: "completed", finalResponsePresent: true },
{ railStatus: "running", cardStatus: "running", finalResponsePresent: false }
],
samples: turnStateTriadRows.slice(0, 20),
drilldown: turnStateTriad.drilldown ?? buildWorkbenchTurnStateTriadDrilldown(turnStateTriadRows),
collectorMissingSamples: Array.isArray(turnStateTriad.collectorMissingRows) ? turnStateTriad.collectorMissingRows.slice(0, 10) : [],
sourceOfTruth: "durable Workbench projection/read model; do not repair via DOM fallback or GET-side state mutation",
nextAction: "Use drilldown.otelDrilldown.commands for the listed traceIds, then inspect staticSourceHints and add unit tests from unitTestReproHints before changing UI rendering.",
rootCause: "workbench_projection_state_triad_not_sealed",
rootCauseStatus: "confirmed-from-dom-samples",
rootCauseConfidence: "high",
valuesRedacted: true
});
if (Number(turnStateTriadSummary.invalidRowCount ?? 0) > 0) {
const drilldown = turnStateTriad.drilldown ?? buildWorkbenchTurnStateTriadDrilldown(turnStateTriadRows);
const rootCause = workbenchTriadRootCauseFromDrilldown(drilldown, turnStateTriadSummary);
findings.push({
id: "workbench-turn-state-triad-inconsistent",
severity: "red",
summary: rootCause.summary,
count: turnStateTriadSummary.invalidRowCount,
fullTriadCount: turnStateTriadSummary.fullTriadRowCount,
invalidFullTriadCount: turnStateTriadSummary.invalidFullTriadCount,
cardFinalResponseMismatchCount: turnStateTriadSummary.cardFinalResponseMismatchCount,
legacyCollectorMissingCount: turnStateTriadSummary.collectorMissingRowCount,
collectorMissingFields: Array.isArray(turnStateTriadSummary.collectorMissingFields) ? turnStateTriadSummary.collectorMissingFields : [],
dominantMismatchKind: rootCause.dominantMismatchKind,
allowedTuples: [
{ railStatus: "completed", cardStatus: "completed", finalResponsePresent: true },
{ railStatus: "running", cardStatus: "running", finalResponsePresent: false }
],
samples: turnStateTriadRows.slice(0, 20),
drilldown,
collectorMissingSamples: Array.isArray(turnStateTriad.collectorMissingRows) ? turnStateTriad.collectorMissingRows.slice(0, 10) : [],
sourceOfTruth: rootCause.sourceOfTruth + "; do not repair via DOM fallback or GET-side state mutation",
nextAction: rootCause.nextAction,
rootCause: rootCause.rootCause,
rootCauseStatus: rootCause.rootCauseStatus,
rootCauseConfidence: rootCause.rootCauseConfidence,
valuesRedacted: true
});
}
const promptFailures = Array.isArray(promptNetwork?.rounds) ? promptNetwork.rounds.filter((item) => item.chatPostOk === false && !promptCommandHasAuthoritativeSubmitSideEffect(control, item)) : [];
if (promptFailures.length > 0) findings.push({ id: "prompt-chat-submit-failed", severity: "red", summary: "sendPrompt command had no successful /v1/agent/chat or /v1/agent/chat/steer POST response in the sampling window", count: promptFailures.length, rounds: promptFailures.slice(0, 10) });
const promptSteerRounds = Array.isArray(promptNetwork?.rounds) ? promptNetwork.rounds.filter((item) => item.steerUsed === true) : [];
@@ -0,0 +1,158 @@
import assert from "node:assert/strict";
import { mkdtemp, readFile, rm, writeFile } from "node:fs/promises";
import { tmpdir } from "node:os";
import { join } from "node:path";
import { spawnSync } from "node:child_process";
import { test } from "bun:test";
import { nodeWebObserveAnalyzerSource } from "../hwlab-node-web-observe-analyzer-source";
// Alert thresholds handed to the spawned analyzer as JSON through the
// UNIDESK_WEB_OBSERVE_ALERT_THRESHOLDS_JSON environment variable (see the
// spawnSync call in the test in this file).
// NOTE(review): the values look deliberately large (60000 ms, 99 events,
// 999999 MB) — presumably so threshold-driven alerts stay quiet and only the
// triad finding under test is exercised; confirm against the analyzer.
const alertThresholds = {
sameOriginApiSlowMs: 60000,
partialApiSlowMs: 60000,
longLivedStreamOpenSlowMs: 60000,
visibleLoadingSlowMs: 60000,
turnTimingSampleSlackSeconds: 60,
turnElapsedSevereTimeoutSeconds: 3600,
domEvaluateTimeoutRedCount: 99,
domEvaluateTimeoutRedWindowMs: 60000,
screenshotTimeoutRedCount: 99,
pageErrorRedCount: 99,
browserProcessSampleIntervalMs: 1000,
browserTotalRssRedMb: 999999,
browserProcessRssRedMb: 999999,
browserRssGrowthRedMb: 999999,
browserRssGrowthWindowMs: 60000,
playwrightResponsivenessRedMs: 60000,
playwrightResponsivenessTimeoutRedCount: 99,
cdpMetricsTimeoutRedCount: 99,
uncommandedStateChangeCommandWindowMs: 1000,
scrollJumpCommandWindowMs: 1000,
scrollJumpFromY: 999999,
scrollJumpToY: 999999,
sessionRailFallbackRatio: 0.5,
};
// Browser-freeze policy handed to the spawned analyzer as JSON through the
// UNIDESK_WEB_OBSERVE_BROWSER_FREEZE_POLICY_JSON environment variable (see the
// spawnSync call in the test in this file).
// NOTE(review): blocker limits are set very high and kill.enabled is false —
// presumably so the policy never triggers during this test; confirm against
// the analyzer's policy handling.
const browserFreezePolicy = {
enabled: true,
blockerWindowMs: 60000,
memory: {
totalRssBlockerMb: 999999,
processRssBlockerMb: 999999,
growthBlockerMb: 999999,
},
responsiveness: {
latencyBlockerMs: 60000,
eventBlockerCount: 99,
},
cdp: {
metricsTimeoutBlockerCount: 99,
},
kill: {
enabled: false,
gracefulSignal: "SIGTERM",
forceSignal: "SIGKILL",
graceMs: 1000,
pollIntervalMs: 100,
exitCode: 124,
},
};
// Regression test for the triad root-cause classifier.
//
// Writes the generated analyzer script and two DOM samples into a temp state
// directory, runs the analyzer with bun, and checks report.json:
//  - sample 1: rail says "completed" while the newest turn card is "running"
//    with no Final Response -> must be classified as a stale session rail;
//  - sample 2: rail and card are both "canceled" with a Final Response present;
//    the final assertions expect exactly one invalid row overall, so this
//    sample must not be counted as a mismatch.
// The temp directory is removed in `finally` so repeated runs do not leak
// analyzer state into the OS temp folder.
test("observe analyzer classifies stale completed session rail when a newer turn is running", async () => {
  const stateDir = await mkdtemp(join(tmpdir(), "unidesk-web-observe-analyzer-"));
  try {
    const analyzerPath = join(stateDir, "analyze.mjs");
    const samplesPath = join(stateDir, "samples.jsonl");
    await writeFile(analyzerPath, nodeWebObserveAnalyzerSource(), { mode: 0o700 });
    await writeFile(samplesPath, [
      JSON.stringify({
        seq: 1,
        ts: "2026-07-01T15:21:24.767Z",
        path: "/workbench/sessions/ses_triage",
        url: "https://hwlab.example.test/workbench/sessions/ses_triage",
        pageRole: "control",
        pageId: "control-test",
        routeSessionId: "ses_triage",
        activeSessionId: "ses_triage",
        sessionRail: {
          items: [{
            index: 0,
            active: true,
            status: "completed",
            dataStatus: "completed",
            running: false,
            dataRunning: "false",
            sessionId: "ses_triage",
            sessionIdPrefix: "ses_triage",
          }],
        },
        turns: [
          {
            role: "agent",
            status: "completed",
            traceId: "trc_previous_completed",
            messageId: "msg_previous_completed_agent",
            finalResponsePresent: true,
            finalResponseTextBytes: 12,
          },
          {
            role: "agent",
            status: "running",
            traceId: "trc_running_new_turn",
            messageId: "msg_running_new_turn_agent",
            finalResponsePresent: false,
            finalResponseTextBytes: 0,
          },
        ],
      }),
      JSON.stringify({
        seq: 2,
        ts: "2026-07-01T15:22:49.918Z",
        path: "/workbench/sessions/ses_triage",
        url: "https://hwlab.example.test/workbench/sessions/ses_triage",
        pageRole: "control",
        pageId: "control-test",
        routeSessionId: "ses_triage",
        activeSessionId: "ses_triage",
        sessionRail: {
          items: [{
            index: 0,
            active: true,
            status: "canceled",
            dataStatus: "canceled",
            running: false,
            dataRunning: "false",
            sessionId: "ses_triage",
            sessionIdPrefix: "ses_triage",
          }],
        },
        turns: [{
          role: "agent",
          status: "canceled",
          traceId: "trc_canceled_terminal",
          messageId: "msg_canceled_terminal_agent",
          finalResponsePresent: true,
          finalResponseTextBytes: 17,
        }],
      }),
    ].join("\n") + "\n");
    const result = spawnSync("bun", [analyzerPath, stateDir], {
      cwd: join(import.meta.dir, "../../.."),
      env: {
        ...process.env,
        UNIDESK_WEB_OBSERVE_ANALYZE_TAIL_SAMPLES: "0",
        UNIDESK_WEB_OBSERVE_ALERT_THRESHOLDS_JSON: JSON.stringify(alertThresholds),
        UNIDESK_WEB_OBSERVE_BROWSER_FREEZE_POLICY_JSON: JSON.stringify(browserFreezePolicy),
      },
      encoding: "utf8",
    });
    // When the process could not be spawned at all, status is null and
    // stderr/stdout are empty; surface result.error so the failure is readable.
    const failureDetail = result.error?.message || result.stderr || result.stdout;
    assert.equal(result.status, 0, failureDetail);
    const report = JSON.parse(await readFile(join(stateDir, "analysis", "report.json"), "utf8"));
    const finding = report.findings.find((item: Record<string, unknown>) => item.id === "workbench-turn-state-triad-inconsistent");
    assert.equal(finding?.rootCause, "workbench_session_rail_status_stale_after_new_running_turn");
    assert.equal(finding?.dominantMismatchKind, "rail-card-status-mismatch");
    assert.match(String(finding?.summary), /previous completed terminal status/u);
    assert.equal(report.sampleMetrics.workbenchTurnStateTriad.summary.invalidRowCount, 1);
    assert.equal(report.sampleMetrics.workbenchTurnStateTriad.summary.cardFinalResponseMismatchCount, 0);
  } finally {
    await rm(stateDir, { recursive: true, force: true });
  }
}, 20_000);
@@ -89,6 +89,7 @@ export function compactDiagnoseCodeAgentResult(value: unknown): Record<string, u
servicePath: source.servicePath ?? null,
observabilityGap: source.observabilityGap ?? null,
businessTraceIds: source.businessTraceIds ?? null,
businessTraceScope: source.businessTraceScope ?? null,
identity: compactDiagnoseIdentity(source.identity),
agentrun: compactDiagnoseAgentRun(source.agentrun),
hwlabReadModel: source.hwlabReadModel ?? null,
@@ -1450,6 +1450,88 @@ def identity_from_spans(spans):
identity[key] = preferred if preferred is not None else fallback
return identity
def span_business_trace_id(item):
    """Return the business trace id carried by a span, or ``None``.

    Reads the span's ``attributes`` mapping and checks the keys ``traceId``,
    ``workbench.trace_id`` and ``workbench.turn_id`` in that order. The first
    value that is a string starting with ``trc_`` (after stripping surrounding
    whitespace) is returned stripped. ``None`` is returned when ``attributes``
    is not a dict or no key holds such a value.
    """
    attrs = item.get("attributes")
    if not isinstance(attrs, dict):
        return None
    for key in ("traceId", "workbench.trace_id", "workbench.turn_id"):
        value = attrs.get(key)
        if isinstance(value, str):
            trace_id = value.strip()
            if trace_id.startswith("trc_"):
                return trace_id
    return None
def span_identity_value(item, key):
    """Return attribute ``key`` of a span as a string, or ``None``.

    ``None`` is returned when the span's ``attributes`` is not a dict, or when
    the attribute is missing, ``None`` or the empty string. Any other value
    (including ``0`` or ``False``) is converted with ``str``.
    """
    attrs = item.get("attributes")
    if not isinstance(attrs, dict):
        return None
    value = attrs.get(key)
    if value in (None, ""):
        return None
    return str(value)
def span_matches_scope_identity(item, identity):
    """Tell whether a span without a business trace id belongs to ``identity``.

    ``identity`` must be a dict; anything else yields ``False``. The keys
    ``commandId`` and ``turnId`` are tried in that order, and the first key for
    which both the scope identity and the span carry a non-empty value decides
    the result by string equality. If neither key is present on both sides the
    span does not match.
    """
    if not isinstance(identity, dict):
        return False
    # Only strong per-turn identities are safe enough for no-trace-id spans.
    # runId/sessionId/runnerJobId can cover several turns in the same Workbench
    # session and caused terminal status leakage across business trace ids.
    for key in ("commandId", "turnId"):
        scoped = identity.get(key)
        if scoped in (None, ""):
            # Nothing to compare against for this key; skip the span lookup.
            continue
        value = span_identity_value(item, key)
        if value not in (None, ""):
            return str(scoped) == str(value)
    return False
def scoped_spans_for_business_trace(spans, business_trace_id):
    """Restrict ``spans`` to those belonging to one business trace id.

    Returns ``(scoped_spans, scope_info)``.

    * When ``business_trace_id`` is ``None`` or ``""`` all spans are returned
      (as a copy) with mode ``unscoped-no-business-trace-id``.
    * When no span carries exactly that business trace id, all spans are
      returned with mode ``unscoped-no-exact-business-trace-span``.
    * Otherwise the result contains the exactly matching spans plus spans that
      carry no business trace id but match the identity derived from the exact
      spans (see ``span_matches_scope_identity``), sorted by
      ``(_start, _index)``; mode is ``business-trace-scoped``.

    ``crossBusinessTraceIds`` lists up to 8 other business trace ids seen, most
    frequent first. ``excludedDifferentTraceSpanCount`` is computed as
    ``len(spans) - len(scoped)``, so it also counts no-trace-id spans that did
    not match the scope identity.
    """

    def unscoped(mode, requested, cross_business_trace_ids):
        # Shared shape for both "nothing was filtered" outcomes.
        return list(spans), {
            "mode": mode,
            "requestedBusinessTraceId": requested,
            "totalSpanCount": len(spans),
            "scopedSpanCount": len(spans),
            "excludedDifferentTraceSpanCount": 0,
            "includedNoTraceIdentitySpanCount": 0,
            "crossBusinessTraceIds": cross_business_trace_ids,
        }

    if business_trace_id in (None, ""):
        return unscoped("unscoped-no-business-trace-id", None, [])

    # Bucket every span exactly once: same trace, other trace, or no trace id.
    exact = []
    no_trace = []
    cross_ids = collections.Counter()
    for item in spans:
        trace_id = span_business_trace_id(item)
        if trace_id == business_trace_id:
            exact.append(item)
        elif trace_id:
            cross_ids[trace_id] += 1
        else:
            no_trace.append(item)
    cross_business_trace_ids = [
        {"traceId": key, "count": count} for key, count in cross_ids.most_common(8)
    ]
    if not exact:
        return unscoped("unscoped-no-exact-business-trace-span", business_trace_id, cross_business_trace_ids)

    identity = identity_from_spans(exact)
    # The buckets are disjoint, so no_trace spans never need de-duplication
    # against the exact matches.
    matched_no_trace = [item for item in no_trace if span_matches_scope_identity(item, identity)]
    scoped = exact + matched_no_trace
    scoped.sort(key=lambda item: (item.get("_start", 0), item.get("_index", 0)))
    return scoped, {
        "mode": "business-trace-scoped",
        "requestedBusinessTraceId": business_trace_id,
        "totalSpanCount": len(spans),
        "scopedSpanCount": len(scoped),
        "excludedDifferentTraceSpanCount": len(spans) - len(scoped),
        "includedNoTraceIdentitySpanCount": len(matched_no_trace),
        "scopeIdentity": {key: identity.get(key) for key in ("commandId", "turnId", "runId", "sessionId") if identity.get(key) not in (None, "")},
        "crossBusinessTraceIds": cross_business_trace_ids,
    }
def flatten_trace(trace_body):
services = set()
business_trace_ids = set()
@@ -1752,6 +1834,20 @@ def root_cause_candidates(http_summary, agentrun, read_model, lag_summary, error
"evidence": lag_summary.get("reasons", []),
})
failure_kind = str(agentrun.get("failureKind") or "")
read_model_turn_status = read_model.get("turnStatus")
status_conflict = terminal_status_conflicts(agentrun.get("terminalStatus"), read_model_turn_status)
if status_conflict:
candidates.append({
"code": "otel_business_trace_terminal_conflict",
"label": "OTel terminal conflict",
"confidence": 0.86,
"summary": "Scoped AgentRun terminal status conflicts with HWLAB read-model turn status for the requested business trace; prefer the HWLAB read-model status for Workbench RCA and inspect OTel correlation leakage.",
"evidence": {
"agentrunTerminalStatus": agentrun.get("terminalStatus"),
"hwlabReadModelTurnStatus": read_model_turn_status,
"turnStatusCounts": read_model.get("turnStatusCounts"),
},
})
if failure_kind == "provider-auth-failed":
candidates.append({
"code": "provider_auth_failed",
@@ -1765,7 +1861,7 @@ def root_cause_candidates(http_summary, agentrun, read_model, lag_summary, error
"terminalCategory": agentrun.get("terminalCategory"),
},
})
if agentrun.get("terminalStatus") in ("failed", "error", "timeout", "blocked", "cancelled"):
if agentrun.get("terminalStatus") in ("failed", "error", "timeout", "blocked", "cancelled") and not status_conflict:
candidates.append({
"code": "agentrun_terminal_failed",
"label": "AgentRun terminal failed",
@@ -1790,6 +1886,19 @@ def root_cause_candidates(http_summary, agentrun, read_model, lag_summary, error
"runnerProviderClassification": agentrun.get("runnerProviderClassification"),
},
})
if agentrun.get("terminalStatus") in (None, "") and read_model_turn_status in ("completed", "failed", "error", "timeout", "blocked", "cancelled", "canceled"):
candidates.append({
"code": "hwlab_read_model_terminal_without_runner_spans",
"label": "HWLAB read model terminal",
"confidence": 0.7,
"summary": "HWLAB read model shows a terminal turn status but scoped AgentRun manager/runner terminal spans are absent; treat missing runner spans as an observability gap, not proof that the turn is still running.",
"evidence": {
"turnStatus": read_model_turn_status,
"turnStatusCounts": read_model.get("turnStatusCounts"),
"sessionListReadCount": read_model.get("sessionListReadCount"),
"sessionMessageReadCount": read_model.get("sessionMessageReadCount"),
},
})
if error_spans and agentrun.get("terminalStatus") == "completed":
candidates.append({
"code": "tool_call_failures_recovered",
@@ -1812,6 +1921,28 @@ def root_cause_candidates(http_summary, agentrun, read_model, lag_summary, error
candidates.sort(key=lambda item: item.get("confidence", 0), reverse=True)
return candidates
def comparable_terminal_status(value):
    """Normalize a status value so two sources can be compared.

    The value is stringified, stripped, lower-cased and has ``_`` replaced by
    ``-``. Empty or falsy input yields ``None``. Known spellings are folded:
    ``cancelled`` -> ``canceled``; ``success``/``succeeded``/``complete`` ->
    ``completed``; ``error``/``failure`` -> ``failed``. Any other text is
    returned as normalized.
    """
    text = str(value or "").strip().lower().replace("_", "-")
    if not text:
        return None
    aliases = {
        "cancelled": "canceled",
        "success": "completed",
        "succeeded": "completed",
        "complete": "completed",
        "error": "failed",
        "failure": "failed",
    }
    return aliases.get(text, text)
def terminal_status_conflicts(left, right):
    """Return ``True`` when two terminal statuses disagree.

    Both values are normalized with ``comparable_terminal_status``. A conflict
    is reported only when both normalized values are recognized terminal
    statuses and they differ; a missing, empty or non-terminal value on either
    side yields ``False``.
    """
    terminal_values = {"completed", "failed", "timeout", "blocked", "canceled", "stale", "thread-resume-failed", "interrupted", "expired"}
    left_status = comparable_terminal_status(left)
    right_status = comparable_terminal_status(right)
    # Missing values normalize to None, which is not a member of the set, so
    # this membership test also covers the "status absent" case.
    if left_status not in terminal_values or right_status not in terminal_values:
        return False
    return left_status != right_status
def candidate_score(trace_id, meta, trace_body, trace_rc, trace_err):
parsed = parse_json(trace_body)
meta_root_service = str(meta.get("rootServiceName") or "") if isinstance(meta, dict) else ""
@@ -1837,13 +1968,15 @@ def candidate_score(trace_id, meta, trace_body, trace_rc, trace_err):
"stderrTail": (trace_err or "")[-1000:],
}, None
flat = flatten_trace(parsed)
spans = flat["spans"]
services = set(flat["services"])
all_spans = flat["spans"]
spans, business_scope = scoped_spans_for_business_trace(all_spans, BUSINESS_TRACE_ID)
services = set(str(item.get("service") or "") for item in spans)
names = [str(item.get("name") or "") for item in spans]
lowered_names = [name.lower() for name in names]
identity = identity_from_spans(spans)
agentrun = agentrun_summary(spans)
error_spans = flat["errorSpans"]
scoped_span_ids = set(id(item) for item in spans)
error_spans = [item for item in flat["errorSpans"] if id(item) in scoped_span_ids]
score = 0
reasons = []
def add(points, reason):
@@ -1902,6 +2035,7 @@ def candidate_score(trace_id, meta, trace_body, trace_rc, trace_err):
"identity": {key: identity.get(key) for key in ("runId", "commandId", "sessionId", "runnerJobId", "runnerId", "backendProfile") if identity.get(key) not in (None, "")},
"terminalStatus": agentrun.get("terminalStatus"),
"errorSpanCount": len(error_spans),
"businessTraceScope": business_scope,
"candidateQuality": candidate_quality,
"lowConfidence": candidate_quality != "normal",
"rootTraceName": meta_root_name or None,
@@ -2043,10 +2177,12 @@ if not isinstance(trace_parsed, dict):
raise SystemExit(1)
flat = flatten_trace(trace_parsed)
spans = flat["spans"]
services = flat["services"]
all_spans = flat["spans"]
spans, business_scope = scoped_spans_for_business_trace(all_spans, BUSINESS_TRACE_ID)
services = sorted(set(str(item.get("service") or "") for item in spans))
business_trace_ids = flat["businessTraceIds"]
error_spans = flat["errorSpans"]
scoped_span_ids = set(id(item) for item in spans)
error_spans = [item for item in flat["errorSpans"] if id(item) in scoped_span_ids]
idle_warning_spans = [item for item in spans if str(item.get("name") or "") == "codex_stdio.idle_warning"]
http_summary = http_status_summary(spans)
read_model = hwlab_read_model_summary(spans)
@@ -2080,17 +2216,25 @@ if missing_services and ("hwlab-cloud-api" in services or identity.get("runId")
facts = []
if missing_services:
facts.append("observability gap: missing service spans " + ",".join(missing_services))
if business_scope.get("mode") == "business-trace-scoped" and business_scope.get("crossBusinessTraceIds"):
facts.append("cross-business-trace spans excluded")
if http_summary.get("actorForbidden"):
facts.append("actor forbidden")
terminal_status = agentrun.get("terminalStatus")
if terminal_status in ("failed", "error", "timeout", "blocked", "cancelled"):
read_model_turn_status = read_model.get("turnStatus")
agentrun_read_model_status_conflict = terminal_status_conflicts(terminal_status, read_model_turn_status)
if agentrun_read_model_status_conflict:
facts.append("OTel AgentRun terminal status conflicts with HWLAB read model")
if terminal_status in ("failed", "error", "timeout", "blocked", "cancelled") and not agentrun_read_model_status_conflict:
failure_kind = agentrun.get("failureKind")
if failure_kind:
facts.append(f"AgentRun terminal failed ({failure_kind})")
else:
facts.append("AgentRun terminal failed")
if terminal_status == "completed":
if terminal_status == "completed" and not agentrun_read_model_status_conflict:
facts.append("AgentRun completed")
if (terminal_status in (None, "") or agentrun_read_model_status_conflict) and read_model_turn_status in ("completed", "failed", "error", "timeout", "blocked", "cancelled", "canceled"):
facts.append("HWLAB read model terminal " + str(read_model_turn_status))
if lag.get("status") in ("confirmed", "suspected"):
facts.append("projection/read-model stale")
if idle_warning_spans and terminal_status in (None, ""):
@@ -2105,8 +2249,11 @@ summary = {
"runnerProvider": agentrun.get("runnerProviderClassification"),
"actorForbidden": http_summary.get("actorForbidden"),
"terminalStatus": terminal_status,
"readModelTurnStatus": read_model_turn_status,
"agentrunReadModelStatusConflict": agentrun_read_model_status_conflict,
"failureKind": agentrun.get("failureKind"),
"observabilityGap": observability_gap.get("status"),
"businessTraceScope": business_scope.get("mode"),
},
}
evidence = {
@@ -2136,6 +2283,7 @@ payload = {
"servicePath": service_path,
"observabilityGap": observability_gap,
"businessTraceIds": business_trace_ids[:20],
"businessTraceScope": business_scope,
"identity": identity,
"agentrun": {
"terminalStatus": agentrun.get("terminalStatus"),