fix(monitor): clarify workbench read-model root causes
This commit is contained in:
@@ -19,7 +19,7 @@
|
||||
| 短名 | Web工作台 |
|
||||
| 层级 | L2 课题 |
|
||||
| 状态 | 已生效 |
|
||||
| 实现引用版本 | draft-2026-06-20-p0-long-running-workbench; draft-2026-06-20-p0-error-diagnostics; draft-2026-06-20-p0-passive-web-probe-observer; draft-2026-06-20-p1-view-local-timing-ticker; draft-2026-06-24-p0-no-ui-timing-fabrication; draft-2026-06-25-p0-web-caserun-e2e; draft-2026-06-25-p0-project-management-mdtodo; PJ2026-0104010803 唯一投影 draft-2026-06-20-p0-durable-facts-model; draft-2026-06-20-p1-zero-split-durable-realtime; draft-2026-06-20-p2-terminal-outbox-recovery; draft-2026-06-24-p0-aggregate-event-stream; draft-2026-06-25-p0-serve-session-aggregate-authority; draft-2026-06-25-p0-session-warm-runner-contract |
|
||||
| 实现引用版本 | draft-2026-06-20-p0-long-running-workbench; draft-2026-06-20-p0-error-diagnostics; draft-2026-06-20-p0-passive-web-probe-observer; draft-2026-06-20-p1-view-local-timing-ticker; draft-2026-06-24-p0-no-ui-timing-fabrication; draft-2026-06-25-p0-web-caserun-e2e; draft-2026-06-25-p0-project-management-mdtodo; PJ2026-0104010803 唯一投影 draft-2026-06-20-p0-durable-facts-model; draft-2026-06-20-p1-zero-split-durable-realtime; draft-2026-06-20-p2-terminal-outbox-recovery; draft-2026-06-24-p0-aggregate-event-stream; draft-2026-06-25-p0-serve-session-aggregate-authority; draft-2026-06-25-p0-session-warm-runner-contract; draft-2026-06-27-p0-workbench-read-model-contract |
|
||||
| 需求规格模板 | [ISO/IEC/IEEE 29148 需求规格模板](../../templates/iso-iec-ieee-29148-requirements-spec-template.md) |
|
||||
| 上级规格 | [PJ2026-0104 客户端](PJ2026-0104-client.md) |
|
||||
| 规格治理索引 | [规格治理](spec-governance.md) |
|
||||
@@ -410,6 +410,10 @@ Timeline Projection 只能从 messages、parts、turn status 和 trace events
|
||||
|
||||
[PJ2026-0104010803 Workbench唯一投影](PJ2026-0104010803-workbench-unique-projection.md) 要求 Web 工作台把 `sessionId`、`messageId`、`partId`、`turnId` 和 `traceId` 作为 reducer 合并键。REST snapshot、SSE event、submit optimistic 和 trace pagination 的任何 late response 都只能更新其自身 key 对应的事实,不得清空当前 selected session、覆盖已存在 message page、或用 list summary/localStorage/workspace snapshot 重建当前 timeline。running trace 在 terminal 前收到 `hasMore=false` 只表示当前 trace page 暂时追平,不表示 `fullTraceLoaded=true`;后续 SSE 或 REST catch-up 仍必须能追加 terminal event 和 final response。
|
||||
|
||||
Message page 在刷新、切换 session、SSE 重连和 REST 补洞后必须保持同一 turn timeline 顺序:同一轮用户输入应先于对应 assistant/agent terminal,跨轮次按 turn timeline/aggregate seq/事件时间排序,不得按 role、source table、投影写入批次或 `updatedAt desc` 形成 `UUAA`、`UUUAAA` 等用户消息聚簇。前端不得通过 DOM 后处理或本地排序掩盖 read model 聚簇;若 API 已经聚簇,修复点必须回到 WorkbenchReadModel 或 projection writer。
|
||||
|
||||
Session rail 的 title/preview 必须来自同一 durable message/part projection 的脱敏摘要,并随 session list/detail 返回稳定字段。仅当 read model 明确缺失 title/preview 时,UI 才能短暂展示 fallback `Session ses_*`;web-probe 与 OTel 必须把 fallback 数量、比例和示例暴露为 projection/read-model 问题,而不是把 fallback 当作正常标题。
|
||||
|
||||
Web 工作台严禁读侧推理。`turn.status`、`message.status`、`session.running`、`trace terminal`、`finalResponse` 和 `projectionStatus` 只能来自唯一 durable projection。Trace event row 的 `completed` 只表示该事件或工具行完成,不能终结 turn;message text 为空只表示无可展示正文,不能生成占位 final response;session list summary、workspace selected state、localStorage mirror、轮询耗时或 elapsed timeout 不能改写 running/terminal。若 API 返回缺字段、投影滞后或多字段矛盾,Web 只能展示 loading、degraded、unknown 或 blocker,并把问题暴露给投影层修复,不得在 reducer、selector、组件或测试 helper 中合成“看起来正确”的状态。
|
||||
|
||||
浏览器本地 now 只属于 view-layer render input。它不得进入 reducer action、store mutation、projection merge、lifecycle selector、session sorting、terminal 判定、transport diagnostic 或 fake-server fixture 事实生成。前端回归必须覆盖固定 projection timestamp 且无 SSE/API 更新时,运行中“最近”和“耗时”仍随 fake clock 前进;同时覆盖 terminal 消息耗时不随 fake clock 前进。
|
||||
|
||||
@@ -19,7 +19,7 @@
|
||||
| 短名 | Workbench唯一投影 |
|
||||
| 层级 | L4 专项规格切片 |
|
||||
| 状态 | 已生效 |
|
||||
| 实现引用版本 | draft-2026-06-20-p0-durable-facts-model; draft-2026-06-20-p1-view-local-timing-ticker; draft-2026-06-20-p1-zero-split-durable-realtime; draft-2026-06-20-p2-terminal-outbox-recovery; draft-2026-06-22-p1-workbench-redis-derived-cache; draft-2026-06-24-p0-no-ui-timing-fabrication; draft-2026-06-24-p0-aggregate-event-stream; draft-2026-06-24-p1-opencode-message-part-authority; draft-2026-06-25-p0-serve-session-aggregate-authority; draft-2026-06-25-p0-session-warm-runner-contract |
|
||||
| 实现引用版本 | draft-2026-06-20-p0-durable-facts-model; draft-2026-06-20-p1-view-local-timing-ticker; draft-2026-06-20-p1-zero-split-durable-realtime; draft-2026-06-20-p2-terminal-outbox-recovery; draft-2026-06-22-p1-workbench-redis-derived-cache; draft-2026-06-24-p0-no-ui-timing-fabrication; draft-2026-06-24-p0-aggregate-event-stream; draft-2026-06-24-p1-opencode-message-part-authority; draft-2026-06-25-p0-serve-session-aggregate-authority; draft-2026-06-25-p0-session-warm-runner-contract; draft-2026-06-27-p0-read-model-timeline-contract |
|
||||
| 需求规格模板 | [ISO/IEC/IEEE 29148 需求规格模板](../../templates/iso-iec-ieee-29148-requirements-spec-template.md) |
|
||||
| 上级规格 | [PJ2026-010401 Web工作台](PJ2026-010401-web-workbench.md) |
|
||||
| 关联规格 | [PJ2026-010403 API契约](PJ2026-010403-api-contract.md)、[PJ2026-01060505 Workbench性能](PJ2026-01060505-workbench-performance.md)、[PJ2026-010205 HWLAB接入](PJ2026-010205-hwlab-dispatch.md)、[PJ2026-0102 Agent编排](PJ2026-0102-agent-orchestration.md) |
|
||||
@@ -456,6 +456,10 @@ Read model 可以在组装完成后读取或写入 Redis 派生读缓存,但
|
||||
|
||||
`GET /v1/workbench/*` 必须纯读。若 projection 滞后,响应应输出 `projectionStatus`、`lastProjectedSeq`、`sourceRunId`、`sourceCommandId`、`blocker` 或等价 diagnostic 字段,不能在 GET 内做 read-through repair。
|
||||
|
||||
MessagePage 是 Workbench timeline 事实,不是 facts 表写入批次的直接 dump。WorkbenchReadModel 必须按 turn timeline、aggregate seq 或等价事件时间组装用户可见顺序;同一 turn 的 user message 必须位于对应 assistant/agent terminal message 之前,跨 turn 不得因为 user message 的 `projectedSeq/sourceSeq` 先整体写入、assistant terminal 后整体写入而输出 `UUAA`、`UUUAAA` 等 role cluster。`projectedSeq/sourceSeq/sourceEventId` 必须保留用于审计和幂等,但不得让 role/source bucket 或投影批次覆盖用户可见 timeline。
|
||||
|
||||
Session summary 的 title/preview 必须由 durable message/part projection 生成脱敏摘要,并随 session list/detail 作为显式字段返回。fallback `Session ses_*` 只表示 read model 缺摘要,是需要通过 OTel/session_list_read、web-probe finding 或 reprojection 暴露的问题,不是正常 UI 文案来源。
|
||||
|
||||
Read model 严禁读侧推理。它不得从 `result?.status ?? trace?.status`、最后一条 trace event `status=completed`、message text 是否为空、part/tool row 状态、session list summary、workspace selected state、localStorage mirror、轮询耗时或 elapsed timeout 推断 turn terminal、message terminal、trace terminal、session running、final response 或 projection caught-up。若 durable facts 中唯一投影对象缺少这些字段,Read model 必须返回未知、投影中、degraded 或 blocker 等显式 diagnostic,由 projection writer/finalizer 修复源头;不得在读取时临时合成“看起来正确”的状态。
|
||||
|
||||
Read model 读取到 sealed final response 时,必须把主消息正文、finalResponse、message status 和 turn terminal 从 sealed projection 输出;projection diagnostic、trace hydration diagnostic、transport diagnostic、result sync diagnostic 和 session health 只能作为独立字段或详情区数据输出。读取链路不得因为后续 turn snapshot 失败、trace event page 失败、SSE gap、realtime timeout 或 compat wrapper 失败而替换 sealed 主正文。
|
||||
@@ -468,6 +472,8 @@ Trace event page 必须按 `projectedSeq` 单调返回,响应范围必须合
|
||||
|
||||
Trace event page 的可见性必须以 durable turn/message/session 关系和 owner visibility 判断,不能只依赖 session 当前 `lastTraceId`。同一 session 后续产生新 trace 后,旧 trace 只要对应 turn、message 或 projection facts 对当前 actor 可见,`GET /v1/workbench/traces/{traceId}/events` 仍应返回 200 和 trace page;若 trace facts 缺页或 projection 未追平,应返回 projection blocker/degraded,而不是 actor 不可见 404。真正 archived/deleted/not-found 的生命周期必须来自 canonical lifecycle projection。
|
||||
|
||||
Trace event page continuation 的空窗口不得升级成 404。若 page 没有新 event、但 trace checkpoint 或 projection diagnostic 显示 read model 尚未追到 `traceLastSeq/lastProjectedSeq`,响应应为 `200` 空页并标记 `projectionStatus/projectionHealth=projecting` 或等价 degraded/stalled,暴露 `latestProjectedSeq` 或 `lastProjectedSeq`、`traceLastSeq`、`fullTraceLoaded=false` 和 `diagnostic.code=workbench_trace_events_missing`。`fullTraceLoaded=true` 只能表示本次 page 实际 `toProjectedSeq` 已追到 `traceLastSeq` 且没有更多 page,不能仅由 `hasMore=false` 或 `range.total` 推断。
|
||||
|
||||
Workbench facts/read model 的 PostgreSQL pool connect/query timeout 必须被转换为结构化 degraded/readiness/blocker 响应和日志,不能逃逸为 cloud-api 进程未捕获异常。Liveness 不应因单次 Workbench read query timeout 变为 EOF;readiness 或 route 响应应暴露 route、store method、timeout kind、runtime readiness、projection candidate count、相关 trace/run/command 摘要和 `valuesRedacted=true`。
|
||||
|
||||
### 6.5 WB-PROJ-REQ-005 compat wrapper
|
||||
|
||||
@@ -19,7 +19,7 @@
|
||||
| 短名 | API契约 |
|
||||
| 层级 | L2 课题 |
|
||||
| 状态 | 已生效 |
|
||||
| 实现引用版本 | draft-2026-06-20-p0-workbench-pure-read-api; draft-2026-06-20-p0-error-diagnostics; draft-2026-06-20-p1-view-local-timing-ticker; draft-2026-06-22-p1-workbench-redis-derived-cache; draft-2026-06-25-p0-web-caserun-e2e; draft-2026-06-25-p0-project-management-mdtodo; draft-2026-06-25-p0-mdtodo-web-active-editing-hwpod-source; PJ2026-0104010803 唯一投影 draft-2026-06-20-p0-durable-facts-model; draft-2026-06-20-p1-zero-split-durable-realtime; draft-2026-06-20-p2-terminal-outbox-recovery; draft-2026-06-24-p0-aggregate-event-stream; draft-2026-06-25-p0-serve-session-aggregate-authority; PJ2026-01050105 Web鉴权 draft-2026-06-18-p0-auth |
|
||||
| 实现引用版本 | draft-2026-06-20-p0-workbench-pure-read-api; draft-2026-06-20-p0-error-diagnostics; draft-2026-06-20-p1-view-local-timing-ticker; draft-2026-06-22-p1-workbench-redis-derived-cache; draft-2026-06-25-p0-web-caserun-e2e; draft-2026-06-25-p0-project-management-mdtodo; draft-2026-06-25-p0-mdtodo-web-active-editing-hwpod-source; draft-2026-06-27-p0-workbench-read-api-contract; PJ2026-0104010803 唯一投影 draft-2026-06-20-p0-durable-facts-model; draft-2026-06-20-p1-zero-split-durable-realtime; draft-2026-06-20-p2-terminal-outbox-recovery; draft-2026-06-24-p0-aggregate-event-stream; draft-2026-06-25-p0-serve-session-aggregate-authority; PJ2026-01050105 Web鉴权 draft-2026-06-18-p0-auth |
|
||||
| 需求规格模板 | [ISO/IEC/IEEE 29148 需求规格模板](../../templates/iso-iec-ieee-29148-requirements-spec-template.md) |
|
||||
| 上级规格 | [PJ2026-0104 客户端](PJ2026-0104-client.md) |
|
||||
| 关联规格 | [PJ2026-0104010803 Workbench唯一投影](PJ2026-0104010803-workbench-unique-projection.md)、[PJ2026-01060505 Workbench性能](PJ2026-01060505-workbench-performance.md) |
|
||||
@@ -282,8 +282,12 @@ Trace resource 必须支持按事件位置继续读取,例如 `sinceSeq` / `cu
|
||||
|
||||
所有 GET path 必须无业务副作用。需要 repair、finalize、sync 或 migration 时,应由后台 finalizer、显式 mutation 或持久化投影入口执行,不能隐藏在 workspace、conversation、turn 或 trace GET 中。所有分页响应必须包含稳定 cursor 或 seq 边界;trace event page 的分页 cursor 使用 HWLAB `projectedSeq`,并保留 AgentRun `sourceSeq/sourceEventId` 作为来源审计字段,二者不得重叠或互相替代。所有错误响应必须包含 route、status、code/blocker、redacted actor、target resource 和下一步建议。
|
||||
|
||||
`GET /v1/workbench/sessions/{sessionId}/messages` 返回的 MessagePage 必须是会话 timeline 顺序,而不是数据库写入批次顺序。排序键应优先体现 turn timeline/aggregate seq/事件时间;同一 turn 内用户 message 先于 assistant/agent terminal message;同一 session 多轮刷新后不得出现由 read model 排序造成的 role cluster。Message DTO 应保留 `projectedSeq/sourceSeq/sourceEventId` 供审计,但客户端不得把这些来源字段当作唯一排序修复点。
|
||||
|
||||
`GET /v1/workbench/traces/{traceId}/events` 的可见性必须以 trace 所属 turn/message/session durable facts 和当前 actor visibility 判断。历史 trace 不再是 session 当前 `lastTraceId` 时仍应可读;缺页或投影滞后应返回 projection blocker/degraded,而不是 `workbench_trace_not_found` 的 actor 不可见误导。
|
||||
|
||||
`GET /v1/workbench/traces/{traceId}/events` 的 continuation 空窗口属于正常分页状态或投影追平窗口,不得返回 404。若 actor 可见且 metadata/checkpoint 表明 trace 仍有未追平区间,API 应返回 `200`、`events=[]`、空 `range.fromProjectedSeq/toProjectedSeq`、原 `nextProjectedSeq`、`status=projecting` 或等价 diagnostic,并暴露 `projectionStatus`、`projectionHealth`、`lastProjectedSeq`、`traceLastSeq`、`fullTraceLoaded=false`、`diagnostic.code=workbench_trace_events_missing`。`404` 只用于真实不可见、已归档、删除或 actor 权限不匹配的 trace,不用于 no-new-events、lag 或 catch-up。
|
||||
|
||||
Cancel mutation 在转发 AgentRun cancel 前必须核验本地 sealed terminal 或上游 run/command state。若 command 已 terminal,API 应返回 already-terminal/no-op 或触发 terminal projection,不得把 completed/failed/blocked 等真实 terminal 改写为 canceled。Workbench read paths 仍保持纯读,不在 GET 中执行该核验或修复。
|
||||
|
||||
Workbench read-model dependency timeout 必须返回结构化 degraded/readiness/blocker 响应;PostgreSQL pool connect/query timeout 不得作为未捕获异常杀死 cloud-api 进程,也不得让 liveness 因单次 Workbench read query 变成 EOF。
|
||||
|
||||
@@ -19,7 +19,7 @@
|
||||
| 短名 | Web哨兵 |
|
||||
| 层级 | L3 子课题 |
|
||||
| 状态 | 已生效 |
|
||||
| 实现引用版本 | draft-2026-06-25-p0-web-probe-sentinel |
|
||||
| 实现引用版本 | draft-2026-06-25-p0-web-probe-sentinel; draft-2026-06-27-p0-workbench-read-model-rootcause |
|
||||
| Dashboard 实现引用版本 | draft-2026-06-26-p8-web-probe-sentinel-recovery |
|
||||
| 多实例实现引用版本 | draft-2026-06-26-p9-multi-web-probe-sentinel |
|
||||
| Monitor Web 聚合实现引用版本 | draft-2026-06-26-p10-monitor-web-aggregation |
|
||||
@@ -523,6 +523,8 @@ HWLAB runtime 发布 Pipeline 应在 Argo sync 前调用当前哨兵 `maintenanc
|
||||
|
||||
刷新/切换检查只检测同一 canary session 的消息/trace/final 投影顺序。受控切走和切回窗口内的 `session-route-changed` / `active-session-changed` 不构成业务异常;切回后仍存在 route/active mismatch、trace 丢失、final 缺失或 command failure 时沿用既有 blocker 规则。若同一 session 可见消息出现多个 user message cards 连续展示,且这些 user cards 之间缺少 assistant/agent/code-agent terminal 或 response card,`observe analyze` 必须产生 `workbench-message-order-user-clustered-after-navigation`,severity=`amber`,blocking=`false`,并记录 afterRound、canarySessionId、routeSessionId、activeSessionId、连续 user 数量、sentinel marker 范围、sample seq、traceId 列表、pageRole/pageId 和 redacted message order 摘要。
|
||||
|
||||
`observe analyze` 对 Workbench read-model 类问题必须输出稳定 rootCause code,便于 issue closeout、OTel drill-down 和 dashboard 过滤:消息 role cluster 使用 `session_message_role_clustered`;trace event page 404 使用 `trace_events_paging_contract_mismatch`;projection/read-model 长期落后使用 `projection_read_model_stale`;session rail fallback title/preview 使用 `session_title_fallback_from_facts`。这些 finding 必须保持脱敏,只输出 opaque id、hash、seq、count、ratio、traceId/sessionId 前缀或 redacted 摘要,不输出 Secret、完整 prompt 或 provider payload。
|
||||
|
||||
每一轮任务都必须需要工具调用。验收报告要证明十轮在同一 session 内完成,记录每轮 traceId、terminal status、tool-call evidence/count、耗时、慢 API、network/console/requestfailed finding、trace 顺序异常、terminal-not-last、session mismatch 和 final-response flicker。
|
||||
|
||||
`dsflash-go` 是 AgentRun backend profile。实现必须验证 profile-scoped SecretRef、config 和 `model-catalog.json` presence/fingerprint;缺失时结构化失败,不允许 fallback 到 `codex`、`deepseek`、`minimax-m3` 或其他 profile。
|
||||
|
||||
@@ -1527,10 +1527,10 @@ function buildSessionInvariantFindings(control, manifest = {}) {
|
||||
const traceIds = arrayStrings(messageOrder.traceIds).slice(0, 12);
|
||||
const findingId = stringOrNull(detail.findingId) ?? "workbench-message-order-user-clustered-after-navigation";
|
||||
const severity = stringOrNull(detail.severity) ?? "amber";
|
||||
const rootCause = "workbench-message-order-projection-drops-terminal-cards-after-navigation";
|
||||
const rootCauseStatus = "confirmed-from-controlled-refresh-dom;otel-read-model-comparison-required";
|
||||
const rootCause = "session_message_role_clustered";
|
||||
const rootCauseStatus = "confirmed-from-controlled-refresh-dom;check-otel-session_messages_read-role-sequence";
|
||||
const rootCauseConfidence = "medium";
|
||||
const nextAction = "Use OTel session_messages_read/session detail for the same canarySessionId and traceIds. If the read model is interleaved but DOM is clustered, fix Workbench renderer/hydration ordering; if the read model is already clustered, fix session message projection/read-model ordering.";
|
||||
const nextAction = "Use OTel session_messages_read/session detail for the same canarySessionId and traceIds. Compare roleSequencePrefix and adjacentSameRoleCount; if the read model is already clustered, fix Workbench projection/read-model timeline ordering before changing renderer code.";
|
||||
findings.push({
|
||||
id: findingId,
|
||||
severity,
|
||||
@@ -2373,10 +2373,10 @@ function buildFindings(samples, control, network, errors, sampleMetrics, promptN
|
||||
id: "session-rail-title-fallback-root-cause",
|
||||
severity: "red",
|
||||
summary: "INV-02 root cause visible: session rail is rendering fallback Session ses_* titles, so list projection/read model or rail binding is missing stable title/preview data before DOM render",
|
||||
rootCause: "session-list-title-projection-missing-or-not-bound",
|
||||
rootCause: "session_title_fallback_from_facts",
|
||||
rootCauseStatus: "confirmed-from-dom-session-rail",
|
||||
rootCauseConfidence: "high",
|
||||
nextAction: "Check OTel session_list_read fallbackTitleCount/fallbackTitleRatio for the same run; fix session list projection/read model title/firstUserMessagePreview before changing DOM fallback text.",
|
||||
nextAction: "Check OTel session_list_read fallbackTitleCount/fallbackTitleRatio and emptyPreviewCount for the same run; fix session list projection/read model title/preview before changing DOM fallback text.",
|
||||
count: sessionRailTitleSummary.overThresholdSampleCount ?? sessionRailTitleSummary.majorityFallbackSampleCount,
|
||||
thresholdRatio: alertThresholds.sessionRailFallbackRatio,
|
||||
maxFallbackRatio: sessionRailTitleSummary.maxFallbackRatio,
|
||||
@@ -2398,7 +2398,7 @@ function buildFindings(samples, control, network, errors, sampleMetrics, promptN
|
||||
id: "trace-events-page-read-404-root-cause",
|
||||
severity: "red",
|
||||
summary: "INV-07 root cause visible: /v1/workbench/traces/:traceId/events returned HTTP 404 for a trace event page read, so the failure is in the trace-events API paging/read-model contract before DOM rendering",
|
||||
rootCause: "trace-events-api-page-read-returned-404",
|
||||
rootCause: "trace_events_paging_contract_mismatch",
|
||||
rootCauseStatus: "confirmed-from-browser-network",
|
||||
rootCauseConfidence: "high",
|
||||
nextAction: "Use OTel trace_events_read for the same trace to compare sinceSeq/afterProjectedSeq, returnedEvents, range, totalEvents, hasMore and fullTraceLoaded; fix backend paging contract or add missing instrumentation before changing renderer behavior.",
|
||||
|
||||
@@ -1636,10 +1636,10 @@ def root_cause_candidates(http_summary, agentrun, read_model, lag_summary, error
|
||||
})
|
||||
if lag_summary.get("status") == "confirmed":
|
||||
candidates.append({
|
||||
"code": "hwlab_projection_stale",
|
||||
"label": "HWLAB projection stale",
|
||||
"code": "projection_read_model_stale",
|
||||
"label": "projection/read-model stale",
|
||||
"confidence": 0.94,
|
||||
"summary": "HWLAB projection stale: read model sequence is behind the caller/requested event window.",
|
||||
"summary": "Projection/read-model stale: read model sequence is behind the caller/requested event window.",
|
||||
"evidence": {
|
||||
"sourceEventCount": read_model.get("sourceEventCount"),
|
||||
"requestedSinceSeq": read_model.get("requestedSinceSeq"),
|
||||
@@ -1649,10 +1649,10 @@ def root_cause_candidates(http_summary, agentrun, read_model, lag_summary, error
|
||||
})
|
||||
elif lag_summary.get("status") == "suspected":
|
||||
candidates.append({
|
||||
"code": "hwlab_projection_lag_suspected",
|
||||
"label": "HWLAB projection stale",
|
||||
"code": "projection_read_model_lag_suspected",
|
||||
"label": "projection/read-model stale",
|
||||
"confidence": 0.64,
|
||||
"summary": "HWLAB projection stale is suspected because AgentRun is terminal but HWLAB read-model evidence is incomplete.",
|
||||
"summary": "Projection/read-model stale is suspected because AgentRun is terminal but HWLAB read-model evidence is incomplete.",
|
||||
"evidence": lag_summary.get("reasons", []),
|
||||
})
|
||||
failure_kind = str(agentrun.get("failureKind") or "")
|
||||
@@ -1976,7 +1976,7 @@ if terminal_status in ("failed", "error", "timeout", "blocked", "cancelled"):
|
||||
if terminal_status == "completed":
|
||||
facts.append("AgentRun completed")
|
||||
if lag.get("status") in ("confirmed", "suspected"):
|
||||
facts.append("HWLAB projection stale")
|
||||
facts.append("projection/read-model stale")
|
||||
if idle_warning_spans and terminal_status in (None, ""):
|
||||
facts.append("AgentRun runner idle warnings active")
|
||||
if not facts:
|
||||
|
||||
Reference in New Issue
Block a user