fix(code-queue): add active-run liveness diagnostics

This commit is contained in:
Codex
2026-05-19 03:48:17 +00:00
parent f36ea37548
commit 2a45bfe180
13 changed files with 561 additions and 17 deletions
+2 -2
View File
@@ -35,7 +35,7 @@ Typical targeted commands:
- Core API: `docker exec unidesk-backend-core` calls internal `GET /api/overview`, which must report `dbReady: true`, `pgdata.volumeName=unidesk_pgdata_10gb`, a positive PostgreSQL database byte count, and at least one online node; internal `GET /api/performance` must report component request statistics, internal operation statistics, PGDATA usage and Code Queue PostgreSQL storage metadata. - Core API: `docker exec unidesk-backend-core` calls internal `GET /api/overview`, which must report `dbReady: true`, `pgdata.volumeName=unidesk_pgdata_10gb`, a positive PostgreSQL database byte count, and at least one online node; internal `GET /api/performance` must report component request statistics, internal operation statistics, PGDATA usage and Code Queue PostgreSQL storage metadata.
- Provider self-connection: internal `GET /api/nodes` must contain `main-server` with `status: online`, `labels.providerGatewayVersion` equal to `src/components/provider-gateway/package.json`, `labels.providerGatewayUpgradePolicy: "always-enabled"`, `labels.providerGatewayRestartPolicyOk: true`, `labels.providerGatewayPidModeOk: true`, and `labels.providerGatewayRuntimeGuardOk: true`; internal `GET /api/nodes/system-status` must contain CPU/memory/disk samples plus a non-empty process resource list sorted by `memoryBytes` by default, where `memoryBytes` should use PSS when `/proc/[pid]/smaps_rollup` is available, otherwise `rssBytes - statm.shared` before raw RSS, and must retain `rssBytes` for diagnostics; internal `GET /api/nodes/docker-status` must contain a Docker snapshot for `main-server`; every running `provider-gateway` container visible in Docker snapshots must report `restartPolicy: "always"` and `pidMode: "host"`; public provider ingress `/health` must return ok. - Provider self-connection: internal `GET /api/nodes` must contain `main-server` with `status: online`, `labels.providerGatewayVersion` equal to `src/components/provider-gateway/package.json`, `labels.providerGatewayUpgradePolicy: "always-enabled"`, `labels.providerGatewayRestartPolicyOk: true`, `labels.providerGatewayPidModeOk: true`, and `labels.providerGatewayRuntimeGuardOk: true`; internal `GET /api/nodes/system-status` must contain CPU/memory/disk samples plus a non-empty process resource list sorted by `memoryBytes` by default, where `memoryBytes` should use PSS when `/proc/[pid]/smaps_rollup` is available, otherwise `rssBytes - statm.shared` before raw RSS, and must retain `rssBytes` for diagnostics; internal `GET /api/nodes/docker-status` must contain a Docker snapshot for `main-server`; every running `provider-gateway` container visible in Docker snapshots must report `restartPolicy: "always"` and `pidMode: "host"`; public provider ingress `/health` must return ok.
- Provider remote control: internal `/api/dispatch` must successfully complete a real `provider.upgrade` task in `mode: "plan"` so the upgrade path is validated without recreating the running gateway during E2E. - Provider remote control: internal `/api/dispatch` must successfully complete a real `provider.upgrade` task in `mode: "plan"` so the upgrade path is validated without recreating the running gateway during E2E.
- User services: internal `/api/microservices` must include `todo-note` and `oa-event-flow` on `main-server`, canonical `filebrowser` on `D518`, plus `k3sctl-adapter`, `code-queue`, `findjob`, `pipeline`, `met-nonlinear`, `claudeqq` and `filebrowser-d601` on `D601` with `public=false`; `/api/microservices/todo-note/health` must report `storage=postgres`, `/api/microservices/todo-note/proxy/api/instances` must expose the migrated Todo Note lists, and a temporary Todo Note list create/add/toggle/undo/delete cycle must succeed through the real provider-gateway proxy; `/api/microservices/oa-event-flow/health`, `/api/microservices/oa-event-flow/proxy/api/diagnostics`, `/api/microservices/oa-event-flow/proxy/api/events`, `/api/microservices/oa-event-flow/proxy/api/events?tags=service:pipeline` and `/api/microservices/oa-event-flow/proxy/api/stats/trace` must prove the independent OA event table、Pipeline bridge 和 stats center are reachable through UniDesk proxy; `/api/microservices/k3sctl-adapter/health` and `/api/microservices/k3sctl-adapter/proxy/api/control-plane` must expose the D601 `unidesk-k3s` control plane, `kubeApiProxy.mode=kubernetes-api-service-proxy`, D601 active Code Queue instance `servingHealthy=true`, `presentNodeIds` containing `D601`, `missingNodeIds=[]`, `status=healthy`, and `noFallback=true`; `/api/microservices/code-queue/health` must return the active Code Queue backend summary with default model `gpt-5.5`, `egressProxy.connected=true`, and `/api/microservices/code-queue/proxy/api/tasks/overview` must return queue state through backend-core -> k3sctl-adapter -> Kubernetes API service proxy -> k3s/k8s Service, not through a `serviceId=code-queue` provider-gateway direct task or `/api/code-queue-direct`; Code Queue raw prompt observation fields must preserve long prompt tails across create/list/detail/frontend paths, with any shortened text exposed only through explicit `*Preview` objects carrying `chars` and `truncated`; 
`/api/microservices/filebrowser/health`, `/api/microservices/filebrowser-d601/health` and `/api/microservices/filebrowser/proxy/` must prove File Browser health and WebUI access through UniDesk proxy; `/api/microservices/findjob/health` and `/api/microservices/findjob/proxy/api/summary` must succeed through the real provider-gateway proxy; `/api/microservices/findjob/proxy/api/jobs?__unideskArrayLimit=jobs:5` must return a bounded preview with `_unidesk.arrayLimits` metadata; `/api/microservices/pipeline/health`, `/api/microservices/pipeline/proxy/api/snapshot?__unideskArrayLimit=registry.components:8,runs:3` and `/api/microservices/pipeline/proxy/api/oa-event-flow/diagnostics` must return Pipeline health, registry/run previews and OA event-flow evidence; `/api/microservices/met-nonlinear/health`, `/api/microservices/met-nonlinear/proxy/api/queue`, `/api/microservices/met-nonlinear/proxy/api/projects?root=projects&limit=500`, `/api/microservices/met-nonlinear/proxy/api/projects?root=ex_projects&limit=500`, `/api/microservices/met-nonlinear/proxy/api/projects/config?path=<projectPath>` and `/api/microservices/met-nonlinear/proxy/api/images` must return the D601 TS backend health, queue/GPU policy, full project tree inputs, structured project detail and ready `met-nonlinear-ml:tf26` image status. 
- User services: internal `/api/microservices` must include `todo-note` and `oa-event-flow` on `main-server`, canonical `filebrowser` on `D518`, plus `k3sctl-adapter`, `code-queue`, `findjob`, `pipeline`, `met-nonlinear`, `claudeqq` and `filebrowser-d601` on `D601` with `public=false`; `/api/microservices/todo-note/health` must report `storage=postgres`, `/api/microservices/todo-note/proxy/api/instances` must expose the migrated Todo Note lists, and a temporary Todo Note list create/add/toggle/undo/delete cycle must succeed through the real provider-gateway proxy; `/api/microservices/oa-event-flow/health`, `/api/microservices/oa-event-flow/proxy/api/diagnostics`, `/api/microservices/oa-event-flow/proxy/api/events`, `/api/microservices/oa-event-flow/proxy/api/events?tags=service:pipeline` and `/api/microservices/oa-event-flow/proxy/api/stats/trace` must prove the independent OA event table、Pipeline bridge 和 stats center are reachable through UniDesk proxy; `/api/microservices/k3sctl-adapter/health` and `/api/microservices/k3sctl-adapter/proxy/api/control-plane` must expose the D601 `unidesk-k3s` control plane, `kubeApiProxy.mode=kubernetes-api-service-proxy`, D601 active Code Queue instance `servingHealthy=true`, `presentNodeIds` containing `D601`, `missingNodeIds=[]`, `status=healthy`, and `noFallback=true`; `/api/microservices/code-queue/health` must return the active Code Queue backend summary with default model `gpt-5.5`, `egressProxy.connected=true`, `queue.executionDiagnostics` containing DB active state, scheduler active slots, scheduler heartbeat and Trace/OA progress, and `/api/microservices/code-queue/proxy/api/tasks/overview` must return queue state through backend-core -> k3sctl-adapter -> Kubernetes API service proxy -> k3s/k8s Service, not through a `serviceId=code-queue` provider-gateway direct task or `/api/code-queue-direct`; Code Queue raw prompt observation fields must preserve long prompt tails across create/list/detail/frontend paths, with any 
shortened text exposed only through explicit `*Preview` objects carrying `chars` and `truncated`; `/api/microservices/filebrowser/health`, `/api/microservices/filebrowser-d601/health` and `/api/microservices/filebrowser/proxy/` must prove File Browser health and WebUI access through UniDesk proxy; `/api/microservices/findjob/health` and `/api/microservices/findjob/proxy/api/summary` must succeed through the real provider-gateway proxy; `/api/microservices/findjob/proxy/api/jobs?__unideskArrayLimit=jobs:5` must return a bounded preview with `_unidesk.arrayLimits` metadata; `/api/microservices/pipeline/health`, `/api/microservices/pipeline/proxy/api/snapshot?__unideskArrayLimit=registry.components:8,runs:3` and `/api/microservices/pipeline/proxy/api/oa-event-flow/diagnostics` must return Pipeline health, registry/run previews and OA event-flow evidence; `/api/microservices/met-nonlinear/health`, `/api/microservices/met-nonlinear/proxy/api/queue`, `/api/microservices/met-nonlinear/proxy/api/projects?root=projects&limit=500`, `/api/microservices/met-nonlinear/proxy/api/projects?root=ex_projects&limit=500`, `/api/microservices/met-nonlinear/proxy/api/projects/config?path=<projectPath>` and `/api/microservices/met-nonlinear/proxy/api/images` must return the D601 TS backend health, queue/GPU policy, full project tree inputs, structured project detail and ready `met-nonlinear-ml:tf26` image status. Code Queue liveness fixture checks are first-class E2E selections: `code-queue:active-run-heartbeat-visible`, `code-queue:trace-gap-not-stale`, `code-queue:stale-active-owner-expired`, `code-queue:control-plane-split-brain-diagnostics` and `code-queue:oa-publisher-degraded-visible`.
- ClaudeQQ availability: `/api/microservices/claudeqq/health` must only pass when `ready=true`, NapCat HTTP and WebSocket are connected, and `napcat.loginState=logged_in`; `/api/microservices/claudeqq/proxy/api/napcat/login` must show the same logged-in account state and `/api/microservices/claudeqq/proxy/api/events/recent` must prove the backend can read the persistent event cache. A QR-code-only or not-logged-in NapCat state must be treated as unhealthy. - ClaudeQQ availability: `/api/microservices/claudeqq/health` must only pass when `ready=true`, NapCat HTTP and WebSocket are connected, and `napcat.loginState=logged_in`; `/api/microservices/claudeqq/proxy/api/napcat/login` must show the same logged-in account state and `/api/microservices/claudeqq/proxy/api/events/recent` must prove the backend can read the persistent event cache. A QR-code-only or not-logged-in NapCat state must be treated as unhealthy.
- Database: the command writes an `unidesk_e2e_markers` row through `docker exec unidesk-database psql`, confirms provider state is stored in PostgreSQL, and checks Todo Note rows exist in `todo_note_instances` using the same named volume. - Database: the command writes an `unidesk_e2e_markers` row through `docker exec unidesk-database psql`, confirms provider state is stored in PostgreSQL, and checks Todo Note rows exist in `todo_note_instances` using the same named volume.
- Pipeline OA event flow: `microservice:pipeline-oa-event-flow` must prove both no-audit and monitor-audit runs are driven by OA events end to end. The event stream must show `node-finished` as a neutral fact with `pipeline:{pipelineId}` and `epoch:{runId}` tags, OA policy as the source of downstream/audit decisions, monitor decisions as OA control events, and runner control-result evidence. E2E must fail if delivery still depends on a legacy detail audit policy flag as policy authority, independent legacy audit-request points, a legacy batch completion gate, direct monitor-to-runner calls, or frontend/CLI writes to Pipeline `.state`. - Pipeline OA event flow: `microservice:pipeline-oa-event-flow` must prove both no-audit and monitor-audit runs are driven by OA events end to end. The event stream must show `node-finished` as a neutral fact with `pipeline:{pipelineId}` and `epoch:{runId}` tags, OA policy as the source of downstream/audit decisions, monitor decisions as OA control events, and runner control-result evidence. E2E must fail if delivery still depends on a legacy detail audit policy flag as policy authority, independent legacy audit-request points, a legacy batch completion gate, direct monitor-to-runner calls, or frontend/CLI writes to Pipeline `.state`.
@@ -55,7 +55,7 @@ Remote update records in the frontend are covered by the same rule: `provider.up
Provider operation availability is also covered by the structured rendering rule. `host.ssh` availability must be displayed as badges or equivalent controls derived from capabilities and `hostSsh*` labels, and remote update availability must be displayed from `provider.upgrade` capability plus the `always-enabled` policy; these fields must not require opening raw Provider JSON. Provider operation availability is also covered by the structured rendering rule. `host.ssh` availability must be displayed as badges or equivalent controls derived from capabilities and `hostSsh*` labels, and remote update availability must be displayed from `provider.upgrade` capability plus the `always-enabled` policy; these fields must not require opening raw Provider JSON.
User service pages are covered by the same rule. `Todo Note` must show lists, task tree, filters, reminder input, movement controls, undo/redo and metrics as controls; `OA Event Flow` must show health, live tag stream state, event table, tag filter presets and Trace stats table as controls; `Code Queue` must show queue cards, live transcript, model/cwd/max attempt inputs, judge decision, attempt table, append prompt, interrupt and retry controls; `File Browser` must show D518 as the default target, D601 as an alternate target, a screenshot export action, and an embedded upstream WebUI frame served through `/api/microservices/<id>/proxy/` with compact file rows that do not let material-icon fallback text cover file metadata; `FindJob` must show metrics, jobs and drafts as cards/tables; `Pipeline` must show component classes, React Flow graph nodes/edges, run cards, Gantt execution lines and OpenCode Trace timelines as controls; `MET Nonlinear` must show queue rows, GPU/image cards, a real path tree for the project library, structured project/job detail panels, project config preview, `data/` training state, model parameter count, metrics, progress bars, ETA, `epoch/h` speed and history diagnostics as controls; `ClaudeQQ` must show NapCat HTTP/WS/login badges, QR/login panel, event cache, subscriptions and message push controls; the full user-service config, summary, snapshot, jobs preview, drafts, OA events and run JSON can only appear after an explicit `查看原始JSON` click. User service pages are covered by the same rule. 
`Todo Note` must show lists, task tree, filters, reminder input, movement controls, undo/redo and metrics as controls; `OA Event Flow` must show health, live tag stream state, event table, tag filter presets and Trace stats table as controls; `Code Queue` must show queue cards, execution liveness diagnostics, live transcript, model/cwd/max attempt inputs, judge decision, attempt table, append prompt, interrupt and retry controls; `File Browser` must show D518 as the default target, D601 as an alternate target, a screenshot export action, and an embedded upstream WebUI frame served through `/api/microservices/<id>/proxy/` with compact file rows that do not let material-icon fallback text cover file metadata; `FindJob` must show metrics, jobs and drafts as cards/tables; `Pipeline` must show component classes, React Flow graph nodes/edges, run cards, Gantt execution lines and OpenCode Trace timelines as controls; `MET Nonlinear` must show queue rows, GPU/image cards, a real path tree for the project library, structured project/job detail panels, project config preview, `data/` training state, model parameter count, metrics, progress bars, ETA, `epoch/h` speed and history diagnostics as controls; `ClaudeQQ` must show NapCat HTTP/WS/login badges, QR/login panel, event cache, subscriptions and message push controls; the full user-service config, summary, snapshot, jobs preview, drafts, OA events and run JSON can only appear after an explicit `查看原始JSON` click.
## Public Boundary Rule ## Public Boundary Rule
+3 -1
View File
@@ -180,7 +180,9 @@ D601 上必须显式使用原生 k3s kubeconfig`KUBECONFIG=/etc/rancher/k3s/k
- 服务拆分语义:`code-queue-read` 只承载 GET/HEAD 查询、overview、任务详情、Trace/output/transcript、统计和只读健康,可多副本滚动更新;它必须设置 `CODE_QUEUE_SERVICE_ROLE=read``CODE_QUEUE_SCHEDULER_ENABLED=false`,且不得接受入队、queue 变更、已读、重试、移动、追加 prompt 或打断这类 mutation。`code-queue-write` 承载入队、queue 创建/合并/更新、已读、手动重试、移动等命令写入,初期保持单副本和 `CODE_QUEUE_SERVICE_ROLE=write`,只把命令和任务状态写入 PostgreSQL,不启动 agent 子进程。`code-queue-scheduler` 是唯一拥有 scheduler 和 active run 的执行服务,设置 `CODE_QUEUE_SERVICE_ROLE=scheduler``CODE_QUEUE_SCHEDULER_ENABLED=true`,负责从 PostgreSQL 热任务集轮询新写入任务、推进队列、启动 Codex/OpenCode、处理 running task 的 steer/interrupt、发送终态通知和暴露执行端 `/health`。普通 Service 负载均衡不得把 mutation 打到 read,也不得把 running task 控制打到 write。 - 服务拆分语义:`code-queue-read` 只承载 GET/HEAD 查询、overview、任务详情、Trace/output/transcript、统计和只读健康,可多副本滚动更新;它必须设置 `CODE_QUEUE_SERVICE_ROLE=read``CODE_QUEUE_SCHEDULER_ENABLED=false`,且不得接受入队、queue 变更、已读、重试、移动、追加 prompt 或打断这类 mutation。`code-queue-write` 承载入队、queue 创建/合并/更新、已读、手动重试、移动等命令写入,初期保持单副本和 `CODE_QUEUE_SERVICE_ROLE=write`,只把命令和任务状态写入 PostgreSQL,不启动 agent 子进程。`code-queue-scheduler` 是唯一拥有 scheduler 和 active run 的执行服务,设置 `CODE_QUEUE_SERVICE_ROLE=scheduler``CODE_QUEUE_SCHEDULER_ENABLED=true`,负责从 PostgreSQL 热任务集轮询新写入任务、推进队列、启动 Codex/OpenCode、处理 running task 的 steer/interrupt、发送终态通知和暴露执行端 `/health`。普通 Service 负载均衡不得把 mutation 打到 read,也不得把 running task 控制打到 write。
- 实例语义:D601 是当前唯一 active 执行节点,`code-queue-scheduler` 以一个 scheduler Pod 承载长生命周期 Codex/OpenCode 子进程并轮询主 PostgreSQL 中由 `code-queue-mgr` 写入的 queued/retry_wait 任务。D518 不属于当前 Code Queue k3s 拓扑;在没有原生 k3s-agent 与稳定 Kubernetes 网络前,不得把 D518 写回 `expectedNodeIds` 或恢复 `code-queue-d518` standby。D601 scheduler 默认关闭 `CODE_QUEUE_STARTUP_OA_BACKFILL_ENABLED`;历史 OA Trace/STEP 回填必须通过显式 `/api/oa/backfill` 运维动作触发,不能在每次 Pod 重启时自动批量发布旧事件。 - 实例语义:D601 是当前唯一 active 执行节点,`code-queue-scheduler` 以一个 scheduler Pod 承载长生命周期 Codex/OpenCode 子进程并轮询主 PostgreSQL 中由 `code-queue-mgr` 写入的 queued/retry_wait 任务。D518 不属于当前 Code Queue k3s 拓扑;在没有原生 k3s-agent 与稳定 Kubernetes 网络前,不得把 D518 写回 `expectedNodeIds` 或恢复 `code-queue-d518` standby。D601 scheduler 默认关闭 `CODE_QUEUE_STARTUP_OA_BACKFILL_ENABLED`;历史 OA Trace/STEP 回填必须通过显式 `/api/oa/backfill` 运维动作触发,不能在每次 Pod 重启时自动批量发布旧事件。
- 滚动更新边界:master `code-queue-mgr` 保证 D601 抖动或执行面滚动更新期间普通提交、queue 管理和历史读取仍可用;但当前 D601 scheduler Pod 内仍直接承载正在运行的 agent 子进程,scheduler Pod 被替换时 active task 仍会进入 restart-recovery/retry 语义,不能宣称 running task 零中断。真正的长期目标是继续把调度器和执行器拆开:scheduler 只负责 claim task 并创建 Kubernetes Job/Pod 或独立 workerrunner 把输出、状态、attempt、事件和通知写回 PostgreSQL/OA Event Flow/归档;只有这样 controller/scheduler 滚动更新才不会影响正在执行的任务。 - 滚动更新边界:master `code-queue-mgr` 保证 D601 抖动或执行面滚动更新期间普通提交、queue 管理和历史读取仍可用;但当前 D601 scheduler Pod 内仍直接承载正在运行的 agent 子进程,scheduler Pod 被替换时 active task 仍会进入 restart-recovery/retry 语义,不能宣称 running task 零中断。真正的长期目标是继续把调度器和执行器拆开:scheduler 只负责 claim task 并创建 Kubernetes Job/Pod 或独立 workerrunner 把输出、状态、attempt、事件和通知写回 PostgreSQL/OA Event Flow/归档;只有这样 controller/scheduler 滚动更新才不会影响正在执行的任务。
- Restart recoveryD601 scheduler 启动时必须把没有本地 active run`running`/`judging` 任务恢复为 `retry_wait` 并先写回 PostgreSQL,再开启新一轮 scheduler 轮询;同时必须清理 `queued`/`retry_wait`/terminal 任务残留的 `activeTurnId`,否则 PG 中残留的 running 或旧 turn id 会阻塞队列且不会被执行。health/overview 中的 `activeTaskIds` 只代表当前进程真实持有的 agent run;数据库里仍处于 `running`/`judging` 但没有本地 run 的任务只能作为 scheduler 侧 `orphanedActiveTaskIds` 暴露,不能计入 active run slot。主 server 直管 `code-queue-mgr` 只有 PostgreSQL 视角,不得把数据库中的 `running`/`judging` 误报为真实 active run;只能作为 `databaseActiveTaskIds`/`executionStateSource=postgres-control-plane` 这类控制面状态返回 - Active run livenessCode Queue 活性判断必须同时读取 PostgreSQL 任务状态、D601 scheduler 本地 active run/active slot/active queue、scheduler-owned heartbeat、Trace/OA 持久化进度和 OA publisher pending/lastError。scheduler heartbeat 至少包含 `taskId``attempt``activeTurnId`/`codexThreadId``owner`/`schedulerInstance``lastLocalHeartbeatAt``lastObservedAgentEventAt``lastPersistedTraceAt``outputMaxSeq``agentPort`;后续如拆出独立 runner,可以在同一结构上追加 worker/claim lease 字段。master `code-queue-mgr` `postgres-control-plane` 视图不能单独作为“任务已卡死/未执行”的依据;当 master 看到 `databaseActiveTaskCount>0`、本地 `activeRunSlotCount=0`,但存在新鲜 scheduler heartbeat 时,`executionDiagnostics.state` 必须报告 `split-brain`/`degraded` 而不是 `healthy`
- Trace gap 与 stale active: Trace/OA 长时间没有新 seq 或 publisher 有 pending/lastError 只说明持久化链路可能 degraded;只要 scheduler-owned `lastLocalHeartbeatAt` 仍新鲜,就必须归类为 trace gap,不得触发 stale retry。只有 PostgreSQL 仍为 `running`/`judging`、scheduler 本地没有 active run/slot/waiter,并且 owner heartbeat 已过期时,任务才允许进入 stale recovery candidate;缺失 heartbeat 是 degraded 诊断,不是自动恢复许可。
- Restart recovery: D601 scheduler 启动或 reconciliation 时只能由 scheduler-owned 恢复入口处理 stale active,必须留下 recovery reason、source/method 审计事件,并使用条件更新防止覆盖并发 owner 写入;恢复原因需要区分 user interrupt、admin stale recovery 和 service restart recovery。禁止裸改 production PostgreSQL 任务状态,禁止把 production scheduler/数据库作为破坏性测试对象。health/overview 中的 `activeTaskIds` 只代表当前进程真实持有的 agent run;数据库里仍处于 `running`/`judging` 但没有本地 run 的任务只能作为 scheduler 侧 `orphanedActiveTaskIds` 或 diagnostics 暴露,不能计入 active run slot。主 server 直管 `code-queue-mgr` 只有 PostgreSQL 视角,不得把数据库中的 `running`/`judging` 误报为真实 active run;只能作为 `databaseActiveTaskIds`/`executionStateSource=postgres-control-plane` 这类控制面状态返回。
- Transient dependency recoveryD601 scheduler/read/write 通过 provider egress 和 TCP gateway 访问主 PostgreSQL、OA Event Flow 与模型 API,必须把 `CONNECTION_CLOSED``CONNECT_TIMEOUT`、stale PostgreSQL client、provider egress 瞬时失败和 MiniMax judge provider 初始化失败视为可恢复运行时抖动。实现上应轮换失效数据库 client、重试或降级 judge provider 初始化、释放 active run slot 并继续扫描后续 queued/retry_wait 任务;不得因为一次连接关闭、一次 judge provider transient error 或滚动更新窗口让 scheduler 长期停止推进。 - Transient dependency recoveryD601 scheduler/read/write 通过 provider egress 和 TCP gateway 访问主 PostgreSQL、OA Event Flow 与模型 API,必须把 `CONNECTION_CLOSED``CONNECT_TIMEOUT`、stale PostgreSQL client、provider egress 瞬时失败和 MiniMax judge provider 初始化失败视为可恢复运行时抖动。实现上应轮换失效数据库 client、重试或降级 judge provider 初始化、释放 active run slot 并继续扫描后续 queued/retry_wait 任务;不得因为一次连接关闭、一次 judge provider transient error 或滚动更新窗口让 scheduler 长期停止推进。
- 部署引用:Code Queue 镜像仍复用 `src/components/microservices/code-queue/Dockerfile`Kubernetes 运行清单为 `src/components/microservices/k3sctl-adapter/k3s/code-queue.k8s.yaml``config.json` 对外记录 k3s manifest `src/components/microservices/k3sctl-adapter/k3s/code-queue.k3s.json`;主 server 根目录 `docker-compose.yml` 不包含 `code-queue` service,旧 D601 direct Compose 文件只作为迁移/本地诊断参考,不是正式运行入口。 - 部署引用:Code Queue 镜像仍复用 `src/components/microservices/code-queue/Dockerfile`Kubernetes 运行清单为 `src/components/microservices/k3sctl-adapter/k3s/code-queue.k8s.yaml``config.json` 对外记录 k3s manifest `src/components/microservices/k3sctl-adapter/k3s/code-queue.k3s.json`;主 server 根目录 `docker-compose.yml` 不包含 `code-queue` service,旧 D601 direct Compose 文件只作为迁移/本地诊断参考,不是正式运行入口。
- 主服务依赖映射:Code Queue 仍以主 PostgreSQL 为权威数据库,但 D601 k3s Pod 不能依赖公网直连 `74.48.78.17:15432/4255`。Pod 内 `DATABASE_URL``OA_EVENT_FLOW_BASE_URL` 必须指向集群内 `d601-tcp-egress-gateway` Service,再由该 gateway 通过 D601 provider-gateway egress proxy 的 HTTP CONNECT 转发到主 PostgreSQL 和 OA Event Flow;新增 TCP 依赖时扩展 `TCP_EGRESS_ROUTES`,不得在业务容器里新增一次性公网直连或 ad hoc 隧道。D601 active 实例的 `CODE_QUEUE_NOTIFY_CLAUDEQQ_BASE_URL` 必须使用集群内 ClaudeQQ Service `http://claudeqq.unidesk.svc.cluster.local:3290`,并把 `claudeqq`/`claudeqq.unidesk.svc.cluster.local` 加入 `NO_PROXY`,避免任务完成通知被默认出网代理错误转发。旧 `http://host.docker.internal:3290` 只允许作为迁移期诊断,不得作为 Code Queue k3s Pod 的正式通知路径。这些端口映射只服务受控节点运行时,必须用防火墙或等价策略限制来源,不得成为浏览器或任意公网客户端入口。 - 主服务依赖映射:Code Queue 仍以主 PostgreSQL 为权威数据库,但 D601 k3s Pod 不能依赖公网直连 `74.48.78.17:15432/4255`。Pod 内 `DATABASE_URL``OA_EVENT_FLOW_BASE_URL` 必须指向集群内 `d601-tcp-egress-gateway` Service,再由该 gateway 通过 D601 provider-gateway egress proxy 的 HTTP CONNECT 转发到主 PostgreSQL 和 OA Event Flow;新增 TCP 依赖时扩展 `TCP_EGRESS_ROUTES`,不得在业务容器里新增一次性公网直连或 ad hoc 隧道。D601 active 实例的 `CODE_QUEUE_NOTIFY_CLAUDEQQ_BASE_URL` 必须使用集群内 ClaudeQQ Service `http://claudeqq.unidesk.svc.cluster.local:3290`,并把 `claudeqq`/`claudeqq.unidesk.svc.cluster.local` 加入 `NO_PROXY`,避免任务完成通知被默认出网代理错误转发。旧 `http://host.docker.internal:3290` 只允许作为迁移期诊断,不得作为 Code Queue k3s Pod 的正式通知路径。这些端口映射只服务受控节点运行时,必须用防火墙或等价策略限制来源,不得成为浏览器或任意公网客户端入口。
+6
View File
@@ -47,3 +47,9 @@ frontend Bun server 必须提供同源 `/api/frontend-performance`,记录 webu
当最近失败请求集中出现 frontend `core_proxy` 502/503/504,路径为 `/api/microservices/code-queue/proxy/...` 的 overview、trace 或 summary,且 k3s/k8s Pod 仍在运行时,必须先运行 `bun scripts/cli.ts microservice diagnostics code-queue`,区分 provider-gateway online、WebSocket HTTP tunnel、k3sctl-adapter、Kubernetes API service proxy 和目标 Service 五段状态。provider tunnel 类失败必须记录响应 body/headers 中的 `requestId``stage``failureReason``x-unidesk-request-id``x-unidesk-tunnel-error`;如需主动验证错误结构,运行 `bun scripts/cli.ts microservice tunnel-self-test code-queue`,该自测应返回预期失败但 `ok=true` 的诊断结果。随后再继续判断“Kubernetes API service proxy 不可达”“Code Queue 进程不可达”和“Code Queue event loop 被热路径同步工作饿死”。如果 `debug health` 或 provider-gateway egress health 显示 `providerGatewayEgressProxyActiveTunnels` 持续偏高、`pendingTunnels` 非零或 `oldestTunnelAgeMs` 长时间增长,应先按 provider-gateway egress tunnel 生命周期排障,确认 `egress_tcp_open`、connect timeout、idle cleanup 与 core socket close 清理是否生效。排障顺序是同时查看 `/api/frontend-performance``/api/performance``k3sctl-adapter` `/api/control-plane`、Kubernetes Pod `/live``/health`、overview/trace-step curl、`kubectl top pod` 或 Docker stats、容器 `RestartCount`/`OOMKilled` 和 Code Queue 日志;如果 Pod 内 `/health` 也超时,应优先检查实时 output 发布、archive 读取、transcript 构建、统计计算、启动维护、历史 OA backfill 和远程 Provider 准备/SSH 子进程是否阻塞 event loop,而不是先调整 frontend 渲染或代理超时。Code Queue 默认不得在启动时自动执行历史 OA backfill 或通知表索引维护;显式 backfill 必须作为运维动作记录,并在运行期间并发证明 `/live``/health``/api/tasks/overview` 仍快速返回。涉及 D601 等远程 Provider 时,还要检查 `runCodeQueueSsh`/开发容器准备是否仍存在同步子进程、无 timeout 的 SSH、无上限 stdout/stderr 或 stale TUN 重建等待;修复后必须在远程准备探针运行期间并发证明 Pod `/health``/api/tasks/overview` 仍快速返回。 当最近失败请求集中出现 frontend `core_proxy` 502/503/504,路径为 `/api/microservices/code-queue/proxy/...` 的 overview、trace 或 summary,且 k3s/k8s Pod 仍在运行时,必须先运行 `bun scripts/cli.ts microservice diagnostics code-queue`,区分 provider-gateway online、WebSocket HTTP tunnel、k3sctl-adapter、Kubernetes API service proxy 和目标 Service 五段状态。provider tunnel 类失败必须记录响应 body/headers 中的 
`requestId``stage``failureReason``x-unidesk-request-id``x-unidesk-tunnel-error`;如需主动验证错误结构,运行 `bun scripts/cli.ts microservice tunnel-self-test code-queue`,该自测应返回预期失败但 `ok=true` 的诊断结果。随后再继续判断“Kubernetes API service proxy 不可达”“Code Queue 进程不可达”和“Code Queue event loop 被热路径同步工作饿死”。如果 `debug health` 或 provider-gateway egress health 显示 `providerGatewayEgressProxyActiveTunnels` 持续偏高、`pendingTunnels` 非零或 `oldestTunnelAgeMs` 长时间增长,应先按 provider-gateway egress tunnel 生命周期排障,确认 `egress_tcp_open`、connect timeout、idle cleanup 与 core socket close 清理是否生效。排障顺序是同时查看 `/api/frontend-performance``/api/performance``k3sctl-adapter` `/api/control-plane`、Kubernetes Pod `/live``/health`、overview/trace-step curl、`kubectl top pod` 或 Docker stats、容器 `RestartCount`/`OOMKilled` 和 Code Queue 日志;如果 Pod 内 `/health` 也超时,应优先检查实时 output 发布、archive 读取、transcript 构建、统计计算、启动维护、历史 OA backfill 和远程 Provider 准备/SSH 子进程是否阻塞 event loop,而不是先调整 frontend 渲染或代理超时。Code Queue 默认不得在启动时自动执行历史 OA backfill 或通知表索引维护;显式 backfill 必须作为运维动作记录,并在运行期间并发证明 `/live``/health``/api/tasks/overview` 仍快速返回。涉及 D601 等远程 Provider 时,还要检查 `runCodeQueueSsh`/开发容器准备是否仍存在同步子进程、无 timeout 的 SSH、无上限 stdout/stderr 或 stale TUN 重建等待;修复后必须在远程准备探针运行期间并发证明 Pod `/health``/api/tasks/overview` 仍快速返回。
Code Queue task 明明产出最终回复却反复 `retry_wait` 时,应优先用任务详情里的 latest attempt 字段核查 `terminalStatus``transportClosedBeforeTerminal``appServerExitCode``finalResponseChars``judge.raw._safetyOverride` 和 attempt output。OpenCode 远程任务中,`opencode completed status=completed exit=0` 加当前 attempt 非空 assistant 输出应对应 `terminalStatus=completed``transportClosedBeforeTerminal=false`;如果因为缺少 `step_finish` 事件仍触发 `_safetyOverride=terminal_not_completed`,说明协议终态归一化有回归。相反,当前 attempt 没有最终 assistant response 时即使 tool/read/bash 证据完整,也必须 retry,不能用旧 `task.finalResponse` 或 reasoning/tool evidence 代替可见最终回复。 Code Queue task 明明产出最终回复却反复 `retry_wait` 时,应优先用任务详情里的 latest attempt 字段核查 `terminalStatus``transportClosedBeforeTerminal``appServerExitCode``finalResponseChars``judge.raw._safetyOverride` 和 attempt output。OpenCode 远程任务中,`opencode completed status=completed exit=0` 加当前 attempt 非空 assistant 输出应对应 `terminalStatus=completed``transportClosedBeforeTerminal=false`;如果因为缺少 `step_finish` 事件仍触发 `_safetyOverride=terminal_not_completed`,说明协议终态归一化有回归。相反,当前 attempt 没有最终 assistant response 时即使 tool/read/bash 证据完整,也必须 retry,不能用旧 `task.finalResponse` 或 reasoning/tool evidence 代替可见最终回复。
### Code Queue Liveness
Code Queue 的“任务是否卡死”不能由单一控制面字段判断。排障必须同时看 PostgreSQL 中的 `running`/`judging` 任务、D601 scheduler 本地 active run/active slot/active queue、scheduler-owned heartbeat、Trace/OA 持久化进度和 OA publisher pending/lastError。master `code-queue-mgr` 的 `postgres-control-plane` 视图只证明数据库行存在;当它显示 `activeRunSlotCount=0` 但 D601 heartbeat 仍新鲜时,正确结论是 control-plane/execution-plane 分裂,diagnostics 应显示 `split-brain` 或 `degraded`,不能宣称任务未执行或卡死。
Trace/OA 长时间没有新 seq 但 scheduler heartbeat 正常时,应归类为 trace gap 或 publisher degraded,不得自动 retry。只有 scheduler 本地没有 active run,且对应 owner heartbeat 已过期时,才允许进入 stale recovery candidate;缺失 heartbeat 只能触发 degraded 诊断和人工确认。任何恢复入口都必须由 scheduler 执行,使用条件更新和审计事件区分 user interrupt、admin stale recovery 与 service restart recovery;禁止直接修改 production PostgreSQL 任务状态来“修复” active run。
@@ -0,0 +1,24 @@
import { CODE_QUEUE_LIVENESS_CHECK_NAMES, runCodeQueueLivenessFixtureChecks } from "./src/code-queue-liveness-fixtures";
/**
 * Collects every value supplied for a repeatable command-line option.
 *
 * Both `name value` and `name=value` spellings are accepted, and each value
 * may be a comma-separated list. Entries are trimmed and blank entries are
 * dropped.
 *
 * @param args - Raw argument list to scan.
 * @param name - Option name to look for, for example `--only`.
 * @returns All values found, in order of appearance; empty when the option
 *   does not occur in `args`.
 * @throws Error when the option has no value, when the next argument is
 *   another `--` flag, or when the value contains no non-blank entry.
 */
function optionValues(args: string[], name: string): string[] {
  const values: string[] = [];
  const inlinePrefix = `${name}=`;
  for (let index = 0; index < args.length; index += 1) {
    const arg = args[index];
    if (arg === undefined) continue;
    let raw: string | undefined;
    if (arg === name) {
      raw = args[index + 1];
      if (raw === undefined || raw.startsWith("--")) throw new Error(`${name} requires a check name`);
      // The following argument was consumed as this option's value.
      index += 1;
    } else if (arg.startsWith(inlinePrefix)) {
      raw = arg.slice(inlinePrefix.length);
    } else {
      continue;
    }
    const parsed = raw.split(",").map((item) => item.trim()).filter(Boolean);
    // A value such as "" or "," names nothing; rejecting it keeps a filter the
    // caller asked for from silently turning into "no filter".
    if (parsed.length === 0) throw new Error(`${name} requires a check name`);
    values.push(...parsed);
  }
  return values;
}
/**
 * Command-line entry point for the Code Queue liveness fixture checks.
 *
 * Reads the `--only` selections from the arguments after the script path,
 * rejects any name that is not listed in `CODE_QUEUE_LIVENESS_CHECK_NAMES`,
 * runs the fixture checks with that selection, prints the result as indented
 * JSON on stdout, and exits with status 1 when the result is not ok.
 *
 * @throws Error when `--only` is malformed or names an unknown check.
 */
function main(): void {
  const cliArgs = Bun.argv.slice(2);
  const selected = optionValues(cliArgs, "--only");

  const unrecognized: string[] = [];
  for (const candidate of selected) {
    if (!CODE_QUEUE_LIVENESS_CHECK_NAMES.includes(candidate as never)) unrecognized.push(candidate);
  }
  if (unrecognized.length !== 0) {
    throw new Error(`unknown Code Queue liveness check(s): ${unrecognized.join(", ")}`);
  }

  const outcome = runCodeQueueLivenessFixtureChecks(selected);
  const report = JSON.stringify(outcome, null, 2);
  process.stdout.write(`${report}\n`);
  if (outcome.ok) return;
  process.exit(1);
}
// Run the checks only when this file is executed directly as the entry script,
// not when it is imported by another module.
if (import.meta.main) main();
+8
View File
@@ -237,6 +237,8 @@ export function runChecks(config: UniDeskConfig, options: CheckOptions = default
fileItem("src/components/microservices/code-queue-mgr/src/prompt-observation.ts"), fileItem("src/components/microservices/code-queue-mgr/src/prompt-observation.ts"),
fileItem("scripts/src/deploy.ts"), fileItem("scripts/src/deploy.ts"),
fileItem("scripts/code-queue-issue3-regression-test.ts"), fileItem("scripts/code-queue-issue3-regression-test.ts"),
fileItem("scripts/code-queue-liveness-diagnostics-test.ts"),
fileItem("scripts/src/code-queue-liveness-fixtures.ts"),
fileItem("scripts/src/ci.ts"), fileItem("scripts/src/ci.ts"),
fileItem("scripts/src/e2e.ts"), fileItem("scripts/src/e2e.ts"),
fileItem("scripts/code-queue-prompt-observation-test.ts"), fileItem("scripts/code-queue-prompt-observation-test.ts"),
@@ -250,10 +252,16 @@ export function runChecks(config: UniDeskConfig, options: CheckOptions = default
items.push(commandItem("typescript:scripts", ["bunx", "tsc", "-p", "scripts/tsconfig.json", "--noEmit", "--pretty", "false"], 120_000)); items.push(commandItem("typescript:scripts", ["bunx", "tsc", "-p", "scripts/tsconfig.json", "--noEmit", "--pretty", "false"], 120_000));
items.push(commandItem("code-queue:prompt-observation-contract", ["bun", "scripts/code-queue-prompt-observation-test.ts"], 30_000)); items.push(commandItem("code-queue:prompt-observation-contract", ["bun", "scripts/code-queue-prompt-observation-test.ts"], 30_000));
items.push(commandItem("code-queue:issue3-diagnostics-and-image-preflight", ["bun", "scripts/code-queue-issue3-regression-test.ts"], 30_000)); items.push(commandItem("code-queue:issue3-diagnostics-and-image-preflight", ["bun", "scripts/code-queue-issue3-regression-test.ts"], 30_000));
items.push(commandItem("code-queue:active-run-heartbeat-visible", ["bun", "scripts/code-queue-liveness-diagnostics-test.ts", "--only", "code-queue:active-run-heartbeat-visible"], 30_000));
items.push(commandItem("code-queue:trace-gap-not-stale", ["bun", "scripts/code-queue-liveness-diagnostics-test.ts", "--only", "code-queue:trace-gap-not-stale"], 30_000));
items.push(commandItem("code-queue:stale-active-owner-expired", ["bun", "scripts/code-queue-liveness-diagnostics-test.ts", "--only", "code-queue:stale-active-owner-expired"], 30_000));
items.push(commandItem("code-queue:control-plane-split-brain-diagnostics", ["bun", "scripts/code-queue-liveness-diagnostics-test.ts", "--only", "code-queue:control-plane-split-brain-diagnostics"], 30_000));
items.push(commandItem("code-queue:oa-publisher-degraded-visible", ["bun", "scripts/code-queue-liveness-diagnostics-test.ts", "--only", "code-queue:oa-publisher-degraded-visible"], 30_000));
} else { } else {
items.push(skippedItem("typescript:scripts", "scripts TypeScript typecheck is opt-in", "--scripts-typecheck or --full")); items.push(skippedItem("typescript:scripts", "scripts TypeScript typecheck is opt-in", "--scripts-typecheck or --full"));
items.push(skippedItem("code-queue:prompt-observation-contract", "prompt observation contract is opt-in with script checks", "--scripts-typecheck or --full")); items.push(skippedItem("code-queue:prompt-observation-contract", "prompt observation contract is opt-in with script checks", "--scripts-typecheck or --full"));
items.push(skippedItem("code-queue:issue3-diagnostics-and-image-preflight", "Code Queue issue #3 regression fixtures are opt-in with script checks", "--scripts-typecheck or --full")); items.push(skippedItem("code-queue:issue3-diagnostics-and-image-preflight", "Code Queue issue #3 regression fixtures are opt-in with script checks", "--scripts-typecheck or --full"));
items.push(skippedItem("code-queue:liveness-diagnostics-fixtures", "Code Queue liveness diagnostics fixtures are opt-in with script checks", "--scripts-typecheck or --full"));
} }
if (options.logs) { if (options.logs) {
items.push(unifiedLogRotationItem()); items.push(unifiedLogRotationItem());
+259
View File
@@ -0,0 +1,259 @@
import { buildExecutionDiagnostics, buildSchedulerHeartbeat, staleRecoveryCandidate, taskHasTraceGapButFreshHeartbeat } from "../../src/components/microservices/code-queue/src/execution-diagnostics";
import type { ActiveRun } from "../../src/components/microservices/code-queue/src/code-agent/common";
import type { CodeQueueExecutionDiagnostics, QueueTask, SchedulerActiveRunHeartbeat, TaskStatus } from "../../src/components/microservices/code-queue/src/types";
/**
 * Names of the Code Queue liveness fixture checks.
 * `as const` keeps every entry as a string-literal type so the list can also
 * serve as the source of the check-name union below.
 */
export const CODE_QUEUE_LIVENESS_CHECK_NAMES = [
"code-queue:active-run-heartbeat-visible",
"code-queue:trace-gap-not-stale",
"code-queue:stale-active-owner-expired",
"code-queue:control-plane-split-brain-diagnostics",
"code-queue:oa-publisher-degraded-visible",
] as const;
/** Union of the literal check names listed in CODE_QUEUE_LIVENESS_CHECK_NAMES. */
type CodeQueueLivenessCheckName = typeof CODE_QUEUE_LIVENESS_CHECK_NAMES[number];
/** Outcome of one liveness fixture check. */
interface FixtureCheck {
// Which check produced this result.
name: CodeQueueLivenessCheckName;
// True when every assertion of the check passed.
ok: boolean;
// Evidence for the report: selected diagnostics fields on success, `{ error }` on failure.
detail: Record<string, unknown>;
}
// Fixed clock for all fixtures so results do not depend on wall-clock time.
// `now` is the evaluation instant handed to the diagnostics helpers.
const now = "2026-05-19T00:10:00.000Z";
// 10 seconds before `now`: used as a fresh heartbeat time.
const freshAt = "2026-05-19T00:09:50.000Z";
// 30 minutes before `now`: used as an old trace-persist time in the trace-gap fixture.
const oldTraceAt = "2026-05-18T23:40:00.000Z";
// 20 minutes before `now`: used as the heartbeat time in the expired-owner fixture.
const expiredAt = "2026-05-18T23:50:00.000Z";
/**
 * Throws when `condition` is falsy.
 *
 * @param condition Value that must be truthy for the check to pass.
 * @param message Human-readable description of the expectation.
 * @param detail Context appended to the error message as JSON.
 * @throws Error with the text `<message>: <detail as JSON>`.
 */
function assertCondition(condition: unknown, message: string, detail: Record<string, unknown> = {}): void {
  if (condition) return;
  const serializedDetail = JSON.stringify(detail);
  throw new Error(`${message}: ${serializedDetail}`);
}
/**
 * Builds a complete QueueTask fixture in queue "default".
 *
 * Status-dependent fields: `startedAt` is set only for "running"/"judging";
 * `currentAttempt` is 0 and `currentMode` is null for "queued", otherwise 1 and
 * "initial". Thread id, turn id and `outputMaxSeq` mirror the heartbeat when one
 * is supplied.
 *
 * @param id Task id; also used to derive the prompt text.
 * @param status Task status to stamp on the fixture.
 * @param heartbeat Optional scheduler heartbeat to attach to the task.
 * @returns The populated task fixture.
 */
function fixtureTask(id: string, status: TaskStatus, heartbeat: SchedulerActiveRunHeartbeat | null = null): QueueTask {
  const createdAt = "2026-05-19T00:00:00.000Z";
  const promptText = `${id} prompt`;
  const isQueued = status === "queued";
  const isExecuting = status === "running" || status === "judging";
  const task: QueueTask = {
    id,
    queueId: "default",
    queueEnteredAt: createdAt,
    prompt: promptText,
    basePrompt: promptText,
    referenceTaskIds: [],
    referenceInjection: null,
    providerId: "D601",
    cwd: "/workspace",
    model: "gpt-5.5",
    reasoningEffort: null,
    executionMode: "default",
    maxAttempts: 99,
    status,
    createdAt,
    updatedAt: createdAt,
    startedAt: isExecuting ? createdAt : null,
    finishedAt: null,
    readAt: null,
    currentAttempt: isQueued ? 0 : 1,
    currentMode: isQueued ? null : "initial",
    codexThreadId: heartbeat?.codexThreadId ?? null,
    activeTurnId: heartbeat?.activeTurnId ?? null,
    schedulerHeartbeat: heartbeat,
    finalResponse: "",
    outputMaxSeq: heartbeat?.outputMaxSeq ?? 0,
    lastError: null,
    lastJudge: null,
    judgeFailCount: 0,
    promptHistory: [],
    output: [],
    events: [],
    attempts: [],
    cancelRequested: false,
    nextPrompt: null,
    nextMode: null,
  };
  return task;
}
/**
 * Builds a scheduler heartbeat fixture for attempt 1 of `taskId`.
 *
 * All three timestamps (local heartbeat, last agent event, last persisted
 * trace) start out equal to `at`; `overrides` is applied last and wins.
 *
 * @param taskId Task the heartbeat belongs to.
 * @param at Timestamp used for every time field by default.
 * @param overrides Fields to replace on the default heartbeat.
 * @returns The heartbeat fixture.
 */
function heartbeat(taskId: string, at: string, overrides: Partial<SchedulerActiveRunHeartbeat> = {}): SchedulerActiveRunHeartbeat {
  const defaults: SchedulerActiveRunHeartbeat = {
    taskId,
    queueId: "default",
    attempt: 1,
    activeTurnId: "turn_fixture",
    codexThreadId: "thread_fixture",
    owner: "D601",
    schedulerInstance: "code-queue-scheduler-fixture",
    executionPlane: "scheduler-execution-plane",
    agentPort: "codex",
    status: "running",
    lastLocalHeartbeatAt: at,
    lastObservedAgentEventAt: at,
    lastPersistedTraceAt: at,
    outputMaxSeq: 10,
    source: "scheduler",
  };
  return { ...defaults, ...overrides };
}
/**
 * Builds an ActiveRun fixture for the "codex" port with a no-op `app.stop`.
 * All three liveness timestamps are set to the module-level `freshAt` instant.
 *
 * @param taskId Task the run belongs to.
 * @param queueId Queue the run belongs to; defaults to "default".
 * @returns The active-run fixture.
 */
function activeRun(taskId: string, queueId = "default"): ActiveRun {
  const livenessAt = freshAt;
  const run: ActiveRun = {
    taskId,
    queueId,
    app: { stop: () => undefined },
    port: "codex",
    threadId: "thread_fixture",
    turnId: "turn_fixture",
    startedAt: "2026-05-19T00:00:00.000Z",
    lastLocalHeartbeatAt: livenessAt,
    lastObservedAgentEventAt: livenessAt,
    lastPersistedTraceAt: livenessAt,
  };
  return run;
}
/**
 * Runs buildExecutionDiagnostics from the scheduler's point of view
 * (control plane "D601-code-queue-scheduler", source "scheduler-execution-plane").
 *
 * Slot count and active queue ids are derived from `activeRuns`; tasks whose
 * status is "running" or "judging" without a matching active run are reported
 * as orphaned.
 *
 * @param tasks Tasks representing the persisted task state.
 * @param activeRuns Runs the scheduler holds locally.
 * @param oaPublisher Optional OA publisher state, passed through as-is.
 * @returns The diagnostics produced by buildExecutionDiagnostics.
 */
function schedulerDiagnostics(tasks: QueueTask[], activeRuns: ActiveRun[] = [], oaPublisher: unknown = null): CodeQueueExecutionDiagnostics {
  const locallyActiveTaskIds = new Set(activeRuns.map((run) => run.taskId));
  const orphanedActiveTaskIds = tasks
    .filter((task) => task.status === "running" || task.status === "judging")
    .filter((task) => !locallyActiveTaskIds.has(task.id))
    .map((task) => task.id);
  const activeQueueIds = activeRuns.map((run) => run.queueId);
  return buildExecutionDiagnostics({
    now,
    controlPlane: "D601-code-queue-scheduler",
    executionStateSource: "scheduler-execution-plane",
    tasks,
    activeRuns,
    activeRunSlotCount: activeRuns.length,
    activeQueueIds,
    processingQueueIds: [],
    orphanedActiveTaskIds,
    oaPublisher: oaPublisher as never,
  });
}
/**
 * Check "code-queue:active-run-heartbeat-visible".
 *
 * A running task with a local active run and a heartbeat built at `freshAt`
 * must appear in both the heartbeat and scheduler active id lists, and the
 * diagnostics must surface `freshAt` as the last scheduler heartbeat.
 *
 * @throws Error from assertCondition when an expectation does not hold.
 */
function checkActiveRunHeartbeatVisible(): FixtureCheck {
  const task = fixtureTask("codex_fixture_active_1", "running");
  const run = activeRun(task.id);
  const heartbeatOptions = {
    now: freshAt,
    owner: "D601",
    schedulerInstance: "code-queue-scheduler-fixture",
    agentPort: "codex",
    lastObservedAgentEventAt: freshAt,
    lastPersistedTraceAt: freshAt,
  };
  task.schedulerHeartbeat = buildSchedulerHeartbeat(task, run, heartbeatOptions);

  const diagnostics = schedulerDiagnostics([task], [run]);
  const evidence = diagnostics as unknown as Record<string, unknown>;
  const { activeHeartbeatTaskIds, schedulerActiveTaskIds, lastSchedulerHeartbeatAt } = diagnostics;

  assertCondition(activeHeartbeatTaskIds.includes(task.id), "active heartbeat task id must be visible", evidence);
  assertCondition(schedulerActiveTaskIds.includes(task.id), "scheduler active task id must be visible", evidence);
  assertCondition(lastSchedulerHeartbeatAt === freshAt, "last scheduler heartbeat must be surfaced", evidence);

  return {
    name: "code-queue:active-run-heartbeat-visible",
    ok: true,
    detail: {
      state: diagnostics.state,
      schedulerActiveTaskIds,
      activeHeartbeatTaskIds,
      lastSchedulerHeartbeatAt,
      heartbeat: task.schedulerHeartbeat,
    },
  };
}
/**
 * Check "code-queue:trace-gap-not-stale".
 *
 * A running task without a local active run whose heartbeat is at `freshAt`
 * but whose last persisted trace is at `oldTraceAt` must be classified as a
 * trace gap: stale recovery is refused with reason "owner-heartbeat-fresh" and
 * the task is listed under traceGapNotStaleTaskIds, not under the stale
 * recovery candidates.
 *
 * @throws Error from assertCondition when an expectation does not hold.
 */
function checkTraceGapNotStale(): FixtureCheck {
  const taskId = "codex_fixture_trace_gap_1";
  const staleTraceHeartbeat = heartbeat(taskId, freshAt, {
    lastPersistedTraceAt: oldTraceAt,
    outputMaxSeq: 89112,
  });
  const task = fixtureTask(taskId, "running", staleTraceHeartbeat);

  const decision = staleRecoveryCandidate({ task, localActive: false, now });
  const hasTraceGap = taskHasTraceGapButFreshHeartbeat(task, now);
  const diagnostics = schedulerDiagnostics([task], []);
  const evidence = diagnostics as unknown as Record<string, unknown>;
  const { traceGapNotStaleTaskIds, staleRecoveryCandidateTaskIds } = diagnostics;

  assertCondition(hasTraceGap, "trace gap with fresh owner heartbeat should be classified", { decision, diagnostics });
  const retryBlocked = decision.allowed === false && decision.reason === "owner-heartbeat-fresh";
  assertCondition(retryBlocked, "fresh heartbeat must block stale retry", { decision });
  assertCondition(traceGapNotStaleTaskIds.includes(task.id), "diagnostics must expose trace gap as not stale", evidence);
  assertCondition(!staleRecoveryCandidateTaskIds.includes(task.id), "trace gap must not enter stale recovery candidates", evidence);

  return {
    name: "code-queue:trace-gap-not-stale",
    ok: true,
    detail: {
      decision,
      traceGapNotStaleTaskIds,
      staleRecoveryCandidateTaskIds,
    },
  };
}
/**
 * Check "code-queue:stale-active-owner-expired".
 *
 * A running task without a local active run whose heartbeat timestamps are all
 * at `expiredAt` must be allowed into stale recovery with reason
 * "owner-heartbeat-expired"; diagnostics must report state "stale-active" and
 * list the task as a stale recovery candidate.
 *
 * @throws Error from assertCondition when an expectation does not hold.
 */
function checkStaleActiveOwnerExpired(): FixtureCheck {
  const taskId = "codex_fixture_stale_1";
  const expiredHeartbeat = heartbeat(taskId, expiredAt, {
    lastObservedAgentEventAt: expiredAt,
    lastPersistedTraceAt: expiredAt,
  });
  const task = fixtureTask(taskId, "running", expiredHeartbeat);

  const decision = staleRecoveryCandidate({ task, localActive: false, now });
  const diagnostics = schedulerDiagnostics([task], []);
  const evidence = diagnostics as unknown as Record<string, unknown>;
  const { state, staleRecoveryCandidateTaskIds } = diagnostics;

  const recoveryAllowed = decision.allowed === true && decision.reason === "owner-heartbeat-expired";
  assertCondition(recoveryAllowed, "expired owner heartbeat should be the stale recovery gate", { decision });
  assertCondition(state === "stale-active", "diagnostics must mark stale active only after owner heartbeat expiry", evidence);
  assertCondition(staleRecoveryCandidateTaskIds.includes(task.id), "expired owner heartbeat should create a stale candidate", evidence);

  return {
    name: "code-queue:stale-active-owner-expired",
    ok: true,
    detail: {
      decision,
      state,
      staleRecoveryCandidateTaskIds,
    },
  };
}
/**
 * Check "code-queue:control-plane-split-brain-diagnostics".
 *
 * Diagnostics built as "master-code-queue-mgr" from "postgres-control-plane",
 * with no active runs but one running task carrying a heartbeat at `freshAt`,
 * must report state "split-brain" with zero scheduler slots and exactly one
 * database-active task.
 *
 * @throws Error from assertCondition when an expectation does not hold.
 */
function checkControlPlaneSplitBrainDiagnostics(): FixtureCheck {
  const taskId = "codex_fixture_split_1";
  const task = fixtureTask(taskId, "running", heartbeat(taskId, freshAt));
  const diagnostics = buildExecutionDiagnostics({
    now,
    controlPlane: "master-code-queue-mgr",
    executionStateSource: "postgres-control-plane",
    tasks: [task],
    activeRuns: [],
    activeRunSlotCount: 0,
    oaPublisher: null,
  });
  const evidence = diagnostics as unknown as Record<string, unknown>;

  const reportsSplitBrain = diagnostics.state === "split-brain" && diagnostics.splitBrain === true;
  assertCondition(reportsSplitBrain, "master postgres-control-plane must report split-brain when DB active has fresh scheduler heartbeat", evidence);
  const divergencePreserved = diagnostics.schedulerActiveRunSlotCount === 0 && diagnostics.databaseActiveTaskCount === 1;
  assertCondition(divergencePreserved, "split-brain fixture should preserve the exact control-plane divergence", evidence);

  const { state, splitBrain, executionStateSource, databaseActiveTaskIds, schedulerActiveRunSlotCount, heartbeatFreshTaskIds } = diagnostics;
  return {
    name: "code-queue:control-plane-split-brain-diagnostics",
    ok: true,
    detail: {
      state,
      splitBrain,
      executionStateSource,
      databaseActiveTaskIds,
      schedulerActiveRunSlotCount,
      heartbeatFreshTaskIds,
    },
  };
}
/**
 * Check "code-queue:oa-publisher-degraded-visible".
 *
 * With no tasks and no active runs, an OA publisher that has pending items and
 * a lastError must put the diagnostics in state "degraded", and the very same
 * publisher object must be exposed on the diagnostics.
 *
 * @throws Error from assertCondition when an expectation does not hold.
 */
function checkOaPublisherDegradedVisible(): FixtureCheck {
  const oaPublisher = { pending: 3, lastError: "fixture OA publish retry", lastPublishedAt: null };
  const diagnostics = schedulerDiagnostics([], [], oaPublisher);
  const evidence = diagnostics as unknown as Record<string, unknown>;

  assertCondition(diagnostics.state === "degraded", "OA publisher pending/lastError must degrade diagnostics", evidence);
  // Identity comparison: the publisher object itself must be passed through.
  assertCondition(diagnostics.oaPublisher === oaPublisher, "OA publisher detail must remain visible", evidence);

  const detail = {
    state: diagnostics.state,
    degraded: diagnostics.degraded,
    oaPublisher: diagnostics.oaPublisher as Record<string, unknown>,
  };
  return { name: "code-queue:oa-publisher-degraded-visible", ok: true, detail };
}
/**
 * Runs the Code Queue liveness fixture checks and aggregates their results.
 *
 * @param only Check names to run. Names are trimmed and blank entries ignored;
 *   an empty selection runs every check.
 * @returns `ok` is true when no check threw; `checks` holds one entry per
 *   executed check (failed ones carry `{ error }` as detail); `failures` lists
 *   the name and message of each check that threw.
 * @throws Error when the selection is non-empty but matches no known check.
 */
export function runCodeQueueLivenessFixtureChecks(only: string[] = []): { ok: boolean; checks: FixtureCheck[]; failures: Array<{ name: string; error: string }> } {
  // Trim before storing: previously names were only trimmed for the emptiness test,
  // so a padded name passed the filter but could never match a runner.
  const selected = new Set(only.map((name) => name.trim()).filter((name) => name.length > 0));
  const runners: Array<[CodeQueueLivenessCheckName, () => FixtureCheck]> = [
    ["code-queue:active-run-heartbeat-visible", checkActiveRunHeartbeatVisible],
    ["code-queue:trace-gap-not-stale", checkTraceGapNotStale],
    ["code-queue:stale-active-owner-expired", checkStaleActiveOwnerExpired],
    ["code-queue:control-plane-split-brain-diagnostics", checkControlPlaneSplitBrainDiagnostics],
    ["code-queue:oa-publisher-degraded-visible", checkOaPublisherDegradedVisible],
  ];
  const checks: FixtureCheck[] = [];
  const failures: Array<{ name: string; error: string }> = [];
  for (const [name, run] of runners) {
    if (selected.size > 0 && !selected.has(name)) continue;
    try {
      checks.push(run());
    } catch (error) {
      // A failing check is recorded in both lists; it must not stop the remaining checks.
      const message = error instanceof Error ? error.message : String(error);
      failures.push({ name, error: message });
      checks.push({ name, ok: false, detail: { error: message } });
    }
  }
  if (checks.length === 0) throw new Error(`no Code Queue liveness fixture checks matched: ${Array.from(selected).join(", ")}`);
  return { ok: failures.length === 0, checks, failures };
}
+58 -1
View File
@@ -197,6 +197,52 @@ function compactLastAssistant(value: unknown, full: boolean): Record<string, unk
}; };
} }
/**
 * Reduces a scheduler heartbeat to a fixed set of fields for compact output.
 *
 * @param value Raw heartbeat value; anything asRecord rejects yields null.
 * @returns An object holding the selected fields, each falling back to null
 *   when absent, or null when `value` is not a record.
 */
function compactSchedulerHeartbeat(value: unknown): Record<string, unknown> | null {
  const record = asRecord(value);
  if (record === null) return null;
  // Listed in output order.
  const fields = [
    "taskId",
    "attempt",
    "owner",
    "schedulerInstance",
    "agentPort",
    "activeTurnId",
    "codexThreadId",
    "lastLocalHeartbeatAt",
    "lastObservedAgentEventAt",
    "lastPersistedTraceAt",
    "outputMaxSeq",
  ];
  const compact: Record<string, unknown> = {};
  for (const field of fields) {
    compact[field] = record[field] ?? null;
  }
  return compact;
}
/**
 * Reduces execution diagnostics to a fixed set of fields for compact output.
 *
 * Scalar fields fall back to null and id/reason lists fall back to a new empty
 * array; `state` falls back to `health` before null.
 *
 * @param value Raw diagnostics value; anything asRecord rejects yields null.
 * @returns The compact diagnostics object, or null when `value` is not a record.
 */
function compactExecutionDiagnostics(value: unknown): Record<string, unknown> | null {
  const record = asRecord(value);
  if (record === null) return null;
  const scalar = (field: string): unknown => record[field] ?? null;
  const list = (field: string): unknown => record[field] ?? [];
  return {
    state: record.state ?? record.health ?? null,
    degraded: scalar("degraded"),
    splitBrain: scalar("splitBrain"),
    executionStateSource: scalar("executionStateSource"),
    controlPlane: scalar("controlPlane"),
    databaseActiveTaskCount: scalar("databaseActiveTaskCount"),
    databaseActiveTaskIds: list("databaseActiveTaskIds"),
    schedulerActiveRunSlotCount: scalar("schedulerActiveRunSlotCount"),
    schedulerActiveTaskIds: list("schedulerActiveTaskIds"),
    activeHeartbeatTaskIds: list("activeHeartbeatTaskIds"),
    heartbeatFreshTaskIds: list("heartbeatFreshTaskIds"),
    heartbeatExpiredTaskIds: list("heartbeatExpiredTaskIds"),
    heartbeatMissingTaskIds: list("heartbeatMissingTaskIds"),
    staleRecoveryCandidateTaskIds: list("staleRecoveryCandidateTaskIds"),
    traceGapTaskIds: list("traceGapTaskIds"),
    traceGapNotStaleTaskIds: list("traceGapNotStaleTaskIds"),
    lastSchedulerHeartbeatAt: scalar("lastSchedulerHeartbeatAt"),
    lastObservedAgentEventAt: scalar("lastObservedAgentEventAt"),
    lastPersistedTraceAt: scalar("lastPersistedTraceAt"),
    oaPublisher: scalar("oaPublisher"),
    reasons: list("reasons"),
  };
}
function compactToolSummary(value: unknown, full: boolean): Record<string, unknown> { function compactToolSummary(value: unknown, full: boolean): Record<string, unknown> {
const record = asRecord(value) ?? {}; const record = asRecord(value) ?? {};
const items = asArray(record.items).map((item) => { const items = asArray(record.items).map((item) => {
@@ -252,6 +298,7 @@ function compactSummary(summary: unknown, options: CodexTaskOptions, taskId: str
codexThreadId: record.codexThreadId ?? null, codexThreadId: record.codexThreadId ?? null,
activeTurnId: record.activeTurnId ?? null, activeTurnId: record.activeTurnId ?? null,
cancelRequested: record.cancelRequested ?? null, cancelRequested: record.cancelRequested ?? null,
schedulerHeartbeat: compactSchedulerHeartbeat(record.schedulerHeartbeat),
}, },
timing: record.timing ?? null, timing: record.timing ?? null,
createdAt: record.createdAt ?? null, createdAt: record.createdAt ?? null,
@@ -718,7 +765,12 @@ function requireMergeTargetQueueId(args: string[], command: string): string {
} }
function codeQueues(): unknown { function codeQueues(): unknown {
return unwrapCodexResponse(coreInternalFetch(codeQueueProxyPath("/api/queues"))); const response = unwrapCodexResponse(coreInternalFetch(codeQueueProxyPath("/api/queues")));
return {
upstream: response.upstream,
queues: response.body.queues ?? [],
queue: compactQueueMutationSummary(response.body.queue),
};
} }
function codexCreateQueue(queueId: string): unknown { function codexCreateQueue(queueId: string): unknown {
@@ -824,6 +876,7 @@ function compactTaskMutationResponse(task: unknown, options: CompactTaskMutation
maxAttempts: record.maxAttempts ?? null, maxAttempts: record.maxAttempts ?? null,
currentAttempt: record.currentAttempt ?? null, currentAttempt: record.currentAttempt ?? null,
cancelRequested: record.cancelRequested ?? null, cancelRequested: record.cancelRequested ?? null,
schedulerHeartbeat: compactSchedulerHeartbeat(record.schedulerHeartbeat),
createdAt: record.createdAt ?? null, createdAt: record.createdAt ?? null,
startedAt: record.startedAt ?? null, startedAt: record.startedAt ?? null,
updatedAt: record.updatedAt ?? null, updatedAt: record.updatedAt ?? null,
@@ -845,6 +898,10 @@ function compactQueueMutationSummary(value: unknown): Record<string, unknown> |
return { return {
activeQueueIds: record.activeQueueIds ?? null, activeQueueIds: record.activeQueueIds ?? null,
activeTaskIds: record.activeTaskIds ?? null, activeTaskIds: record.activeTaskIds ?? null,
databaseActiveTaskCount: record.databaseActiveTaskCount ?? null,
databaseActiveTaskIds: record.databaseActiveTaskIds ?? null,
schedulerHeartbeatStaleMs: record.schedulerHeartbeatStaleMs ?? null,
executionDiagnostics: compactExecutionDiagnostics(record.executionDiagnostics),
queuedTaskIds: record.queuedTaskIds ?? null, queuedTaskIds: record.queuedTaskIds ?? null,
counts: record.counts ?? null, counts: record.counts ?? null,
byQueue: Array.isArray(record.byQueue) ? record.byQueue : undefined, byQueue: Array.isArray(record.byQueue) ? record.byQueue : undefined,
+18
View File
@@ -3,6 +3,7 @@ import { connect } from "node:net";
import { join } from "node:path"; import { join } from "node:path";
import { chromium, type Page } from "playwright"; import { chromium, type Page } from "playwright";
import { createRouteRegistry, MODULES } from "../../src/components/frontend/src/navigation"; import { createRouteRegistry, MODULES } from "../../src/components/frontend/src/navigation";
import { CODE_QUEUE_LIVENESS_CHECK_NAMES, runCodeQueueLivenessFixtureChecks } from "./code-queue-liveness-fixtures";
import { runCommand } from "./command"; import { runCommand } from "./command";
import { type UniDeskConfig, repoRoot, rootPath } from "./config"; import { type UniDeskConfig, repoRoot, rootPath } from "./config";
import { boundedJsonDetail } from "./preview"; import { boundedJsonDetail } from "./preview";
@@ -166,11 +167,14 @@ const FRONTEND_CHECK_NAMES = [
"frontend:no-console-errors", "frontend:no-console-errors",
] as const; ] as const;
const CODE_QUEUE_FIXTURE_CHECK_NAMES = [...CODE_QUEUE_LIVENESS_CHECK_NAMES] as const;
const ALL_E2E_CHECK_NAMES = [ const ALL_E2E_CHECK_NAMES = [
...NETWORK_CHECK_NAMES, ...NETWORK_CHECK_NAMES,
...SERVICE_CHECK_NAMES, ...SERVICE_CHECK_NAMES,
...DATABASE_CHECK_NAMES, ...DATABASE_CHECK_NAMES,
...FRONTEND_CHECK_NAMES, ...FRONTEND_CHECK_NAMES,
...CODE_QUEUE_FIXTURE_CHECK_NAMES,
] as const; ] as const;
function uniqueText(values: string[]): string[] { function uniqueText(values: string[]): string[] {
@@ -552,6 +556,14 @@ function addSelectedCheck(checks: E2ECheck[], options: E2ERunOptions, name: stri
addCheck(checks, name, passed, detail); addCheck(checks, name, passed, detail);
} }
/**
 * Runs the Code Queue liveness fixtures requested by `options` and records each
 * result through addSelectedCheck.
 *
 * @param checks Accumulator that receives the check results.
 * @param options Run options used to decide which fixture checks are wanted.
 */
function codeQueueFixtureChecks(checks: E2ECheck[], options: E2ERunOptions): void {
  const wanted = CODE_QUEUE_FIXTURE_CHECK_NAMES.filter((name) => wantsCheck(options, name));
  const { checks: fixtureResults } = runCodeQueueLivenessFixtureChecks(wanted);
  fixtureResults.forEach((fixture) => {
    addSelectedCheck(checks, options, fixture.name, fixture.ok, fixture.detail);
  });
}
function safeTestId(value: string): string { function safeTestId(value: string): string {
return value.replace(/[^a-zA-Z0-9_-]/g, "_"); return value.replace(/[^a-zA-Z0-9_-]/g, "_");
} }
@@ -3420,8 +3432,14 @@ export async function runE2E(
const needDatabase = wantsPrefix(options, "database") const needDatabase = wantsPrefix(options, "database")
|| wantsCheck(options, "frontend:task-history-diagnostics"); || wantsCheck(options, "frontend:task-history-diagnostics");
const needFrontend = wantsPrefix(options, "frontend"); const needFrontend = wantsPrefix(options, "frontend");
const needCodeQueueFixtures = wantsAnyCheck(options, [...CODE_QUEUE_FIXTURE_CHECK_NAMES]);
const executedSections: string[] = []; const executedSections: string[] = [];
if (needCodeQueueFixtures) {
executedSections.push("code-queue-fixtures");
codeQueueFixtureChecks(checks, options);
}
if (needNetwork) { if (needNetwork) {
executedSections.push("network"); executedSections.push("network");
await exposureChecks(config, urls, checks, options); await exposureChecks(config, urls, checks, options);
+77 -2
View File
@@ -2169,6 +2169,81 @@ input:focus, select:focus, textarea:focus { border-color: var(--accent-2); }
.codex-trace-status-chip.service { .codex-trace-status-chip.service {
white-space: normal; white-space: normal;
} }
/* Liveness status chip tones: each variant sets only the border and text color. */
.codex-trace-status-chip.liveness.ok {
border-color: rgba(78, 183, 168, 0.50);
color: var(--accent-2);
}
.codex-trace-status-chip.liveness.warn {
border-color: rgba(215, 161, 58, 0.55);
color: #ffe0a2;
}
.codex-trace-status-chip.liveness.failed {
border-color: rgba(255, 98, 98, 0.58);
color: #ffb2b2;
}
/* Liveness panel body: stacked grid rows with an 8px gap. */
.codex-liveness-panel .panel-body {
display: grid;
gap: 8px;
}
/* Metric grid: five equal columns, each at least 130px wide. */
.codex-liveness-grid {
display: grid;
grid-template-columns: repeat(5, minmax(130px, 1fr));
gap: 8px;
min-width: 0;
}
/* Single metric card: label (span), value (strong) and optional hint (code) stacked vertically. */
.codex-liveness-metric {
display: grid;
gap: 4px;
min-width: 0;
padding: 8px;
border: 1px solid rgba(78, 183, 168, 0.22);
background:
linear-gradient(135deg, rgba(78, 183, 168, 0.08), rgba(255,255,255,0.015)),
rgba(0,0,0,0.16);
}
/* Tone modifiers override only the card border color. */
.codex-liveness-metric.warn {
border-color: rgba(215, 161, 58, 0.44);
}
.codex-liveness-metric.failed {
border-color: rgba(255, 98, 98, 0.46);
}
.codex-liveness-metric.ok {
border-color: rgba(78, 183, 168, 0.45);
}
/* Metric label: small, muted, uppercase. */
.codex-liveness-metric span {
color: var(--muted);
font-size: 10px;
letter-spacing: 0.12em;
text-transform: uppercase;
}
/* Metric value; overflow-wrap lets long values break instead of overflowing the card. */
.codex-liveness-metric strong {
min-width: 0;
color: var(--text);
font-size: 15px;
font-weight: 700;
overflow-wrap: anywhere;
}
/* Metric hint line (e.g. id lists or raw timestamps), muted and wrappable. */
.codex-liveness-metric code {
min-width: 0;
color: var(--muted);
font-size: 11px;
overflow-wrap: anywhere;
}
/* Diagnostic reasons: a wrapping row of small tags. */
.codex-liveness-reasons {
display: flex;
flex-wrap: wrap;
gap: 5px;
min-width: 0;
}
/* One reason tag; max-width plus overflow-wrap keeps long reasons inside the panel. */
.codex-liveness-reasons span {
max-width: 100%;
padding: 4px 7px;
border: 1px solid rgba(215, 161, 58, 0.28);
color: #ffe0a2;
background: rgba(215, 161, 58, 0.07);
font-size: 11px;
overflow-wrap: anywhere;
}
.codex-mark-all-read-btn { .codex-mark-all-read-btn {
border-color: rgba(78, 183, 168, 0.40); border-color: rgba(78, 183, 168, 0.40);
color: #bdece4; color: #bdece4;
@@ -5490,7 +5565,7 @@ input:focus, select:focus, textarea:focus { border-color: var(--accent-2); }
} }
@media (max-width: 1120px) { @media (max-width: 1120px) {
.metric-grid, .policy-grid, .security-board, .docker-metrics, .monitor-chart-grid, .monitor-summary-grid, .performance-metric-stack, .codex-load-test-grid, .baidu-doc-grid, .filebrowser-target-grid, .oa-flow-metrics { grid-template-columns: repeat(2, minmax(0, 1fr)); } .metric-grid, .policy-grid, .security-board, .docker-metrics, .monitor-chart-grid, .monitor-summary-grid, .performance-metric-stack, .codex-load-test-grid, .codex-liveness-grid, .baidu-doc-grid, .filebrowser-target-grid, .oa-flow-metrics { grid-template-columns: repeat(2, minmax(0, 1fr)); }
.pipeline-oa-guarantees { grid-template-columns: repeat(2, minmax(0, 1fr)); } .pipeline-oa-guarantees { grid-template-columns: repeat(2, minmax(0, 1fr)); }
.dispatch-form, .schedule-form { grid-template-columns: 1fr 1fr; } .dispatch-form, .schedule-form { grid-template-columns: 1fr 1fr; }
.dispatch-actions { align-items: center; } .dispatch-actions { align-items: center; }
@@ -5680,7 +5755,7 @@ input:focus, select:focus, textarea:focus { border-color: var(--accent-2); }
padding: 4px 9px; padding: 4px 9px;
white-space: nowrap; white-space: nowrap;
} }
.metric-grid, .policy-grid, .security-board, .dispatch-form, .schedule-form, .schedule-card-grid, .docker-metrics, .monitor-chart-grid, .monitor-summary-grid, .gateway-record-grid, .met-detail-kv, .code-queue-metrics, .codex-stats-summary-grid, .codex-form-grid, .baidu-doc-grid { grid-template-columns: 1fr; } .metric-grid, .policy-grid, .security-board, .dispatch-form, .schedule-form, .schedule-card-grid, .docker-metrics, .monitor-chart-grid, .monitor-summary-grid, .gateway-record-grid, .met-detail-kv, .code-queue-metrics, .codex-stats-summary-grid, .codex-liveness-grid, .codex-form-grid, .baidu-doc-grid { grid-template-columns: 1fr; }
.compact-row, .heartbeat-row, .log-row, .endpoint-list article, .volume-route, .findjob-hero, .pipeline-hero, .code-queue-hero, .claudeqq-login-card, .baidu-login-card, .baidu-pathbar { grid-template-columns: 1fr; align-items: start; } .compact-row, .heartbeat-row, .log-row, .endpoint-list article, .volume-route, .findjob-hero, .pipeline-hero, .code-queue-hero, .claudeqq-login-card, .baidu-login-card, .baidu-pathbar { grid-template-columns: 1fr; align-items: start; }
.codex-output-line { grid-template-columns: 1fr; } .codex-output-line { grid-template-columns: 1fr; }
.codex-transcript { min-height: 360px; } .codex-transcript { min-height: 360px; }
@@ -219,6 +219,34 @@ function activeTaskIds(queue: any): string[] {
return Array.isArray(queue?.activeTaskIds) ? queue.activeTaskIds.map((id: any) => String(id || "")).filter(Boolean) : [String(queue?.activeTaskId || "")].filter(Boolean); return Array.isArray(queue?.activeTaskIds) ? queue.activeTaskIds.map((id: any) => String(id || "")).filter(Boolean) : [String(queue?.activeTaskId || "")].filter(Boolean);
} }
/**
 * Normalizes an arbitrary value into a list of non-empty strings.
 *
 * @param value Any value; non-arrays produce an empty list.
 * @returns Each truthy item converted with String(); falsy items are dropped.
 */
function stringArray(value: any): string[] {
  if (!Array.isArray(value)) return [];
  const texts: string[] = [];
  for (const item of value) {
    const text = String(item || "");
    if (text) texts.push(text);
  }
  return texts;
}
/**
 * Renders a list of ids as a short label.
 *
 * @param value Any value; it is normalized with stringArray first.
 * @param limit Maximum number of ids shown before the rest is summarized.
 * @returns "--" when there are no ids, otherwise the first `limit` ids joined
 *   with " / ", followed by " +N" when N more ids were left out.
 */
function compactIdList(value: any, limit = 3): string {
  const ids = stringArray(value);
  if (ids.length === 0) return "--";
  const shown = ids.slice(0, limit).join(" / ");
  if (ids.length > limit) {
    const hiddenCount = ids.length - limit;
    return `${shown} +${hiddenCount}`;
  }
  return shown;
}
/**
 * Finds the execution diagnostics object for the page.
 *
 * Candidates are tried in order: the queue itself, then the health payload at
 * `body.queue`, `queue`, `body` and the top level. The first one objectRecord
 * accepts wins.
 *
 * @param queue Queue summary that may carry `executionDiagnostics`.
 * @param health Health response used as a fallback source.
 * @returns The first usable diagnostics record, or an empty object.
 */
function executionDiagnosticsFromQueue(queue: any, health: any): AnyRecord {
  const candidates = [
    queue?.executionDiagnostics,
    health?.body?.queue?.executionDiagnostics,
    health?.queue?.executionDiagnostics,
    health?.body?.executionDiagnostics,
    health?.executionDiagnostics,
  ];
  for (const candidate of candidates) {
    const record = objectRecord(candidate);
    if (record) return record;
  }
  return {};
}
/**
 * Maps a diagnostics state to a UI tone, ignoring letter case.
 *
 * @param state Diagnostics state; falsy values are treated as "unknown".
 * @returns "ok" for healthy, "failed" for split-brain or stale-active,
 *   "warn" for degraded, and "unknown" for anything else.
 */
function diagnosticsTone(state: any): string {
  const normalized = String(state || "unknown").toLowerCase();
  switch (normalized) {
    case "healthy":
      return "ok";
    case "split-brain":
    case "stale-active":
      return "failed";
    case "degraded":
      return "warn";
    default:
      return "unknown";
  }
}
const allQueuesId = "__all__"; const allQueuesId = "__all__";
const queueMobileMediaQuery = "(max-width: 760px)"; const queueMobileMediaQuery = "(max-width: 760px)";
const queueDesktopMediaQuery = "(min-width: 761px)"; const queueDesktopMediaQuery = "(min-width: 761px)";
@@ -1536,6 +1564,48 @@ function CodexStatsIcon() {
); );
} }
/**
 * Renders one liveness metric card: label, value and an optional hint.
 *
 * @param label Caption shown above the value.
 * @param value Main value; "--" is shown when it is null or undefined.
 * @param hint Secondary text; the hint element is omitted when falsy.
 * @param tone Optional tone appended to the card's class name.
 */
function LivenessMetric({ label, value, hint, tone }: AnyRecord) {
  const className = `codex-liveness-metric ${tone || ""}`;
  const labelNode = h("span", null, label);
  const valueNode = h("strong", null, value ?? "--");
  const hintNode = hint ? h("code", null, hint) : null;
  return h("div", { className }, labelNode, valueNode, hintNode);
}
/**
 * Renders the execution-liveness panel: a summary row of status chips, a grid
 * of metric cards, and up to three diagnostic reasons.
 *
 * Counts fall back from the diagnostics to the queue summary and finally to 0.
 *
 * @param diagnostics Execution diagnostics record to display.
 * @param queue Queue summary used as a fallback for counts and id lists.
 * @param onRaw Handler passed to the raw-data button.
 */
function CodeQueueLivenessPanel({ diagnostics, queue, onRaw }: AnyRecord) {
  const state = String(diagnostics?.state || diagnostics?.health || "unknown");
  const oaPublisher = objectRecord(diagnostics?.oaPublisher);
  const reasons = stringArray(diagnostics?.reasons).slice(0, 3);
  const tone = diagnosticsTone(state);

  const dbActiveCount = String(diagnostics?.databaseActiveTaskCount ?? queue?.databaseActiveTaskCount ?? 0);
  const schedulerSlotCount = String(diagnostics?.schedulerActiveRunSlotCount ?? queue?.activeRunSlotCount ?? 0);
  const freshCount = stringArray(diagnostics?.heartbeatFreshTaskIds).length;
  const expiredCount = stringArray(diagnostics?.heartbeatExpiredTaskIds).length;
  const traceGapCount = stringArray(diagnostics?.traceGapTaskIds).length;
  const traceGapNotStaleCount = stringArray(diagnostics?.traceGapNotStaleTaskIds).length;
  const staleCandidateCount = stringArray(diagnostics?.staleRecoveryCandidateTaskIds).length;
  const oaPendingLabel = `${Number(oaPublisher?.pending || 0)} pending`;
  const healthHint = diagnostics?.splitBrain ? "split-brain" : diagnostics?.degraded ? "degraded" : "ready";

  const chip = (label: string, text: string, extraClass = "") =>
    h("span", { className: `codex-trace-status-chip${extraClass}` }, h("b", null, label), text);
  const timestampMetric = (label: string, at: any) =>
    h(LivenessMetric, { label, value: fmtRelativeAge(at), hint: String(at || "--") });

  const summary = h("div", { className: "codex-trace-status" },
    chip("状态", state, ` liveness ${tone}`),
    chip("DB active", dbActiveCount),
    chip("scheduler slots", schedulerSlotCount),
    chip("heartbeat", `${freshCount} fresh / ${expiredCount} expired`),
    oaPublisher ? chip("OA", `${oaPendingLabel}${oaPublisher.lastError ? " / error" : ""}`) : null,
  );
  const actions = h(RawButton, { title: "Code Queue Execution Diagnostics", data: diagnostics, onOpen: onRaw, testId: "raw-code-queue-execution-diagnostics" });
  const eyebrow = `${String(diagnostics?.executionStateSource || queue?.executionStateSource || "unknown")} / ${String(diagnostics?.controlPlane || "code-queue")}`;

  const metricGrid = h("div", { className: "codex-liveness-grid", "data-testid": "codex-liveness-diagnostics" },
    h(LivenessMetric, { tone, label: "健康状态", value: state, hint: healthHint }),
    h(LivenessMetric, { label: "PostgreSQL active", value: dbActiveCount, hint: compactIdList(diagnostics?.databaseActiveTaskIds ?? queue?.databaseActiveTaskIds) }),
    h(LivenessMetric, { label: "Scheduler active", value: schedulerSlotCount, hint: compactIdList(diagnostics?.schedulerActiveTaskIds ?? queue?.activeTaskIds) }),
    h(LivenessMetric, { label: "Fresh heartbeat", value: String(freshCount), hint: compactIdList(diagnostics?.heartbeatFreshTaskIds) }),
    // The value counts every trace gap; tone and hint follow the "not stale" subset.
    h(LivenessMetric, { tone: traceGapNotStaleCount > 0 ? "warn" : "", label: "Trace gap", value: String(traceGapCount), hint: compactIdList(diagnostics?.traceGapNotStaleTaskIds) }),
    h(LivenessMetric, { tone: staleCandidateCount > 0 ? "failed" : "", label: "Stale candidates", value: String(staleCandidateCount), hint: compactIdList(diagnostics?.staleRecoveryCandidateTaskIds) }),
    timestampMetric("Last scheduler heartbeat", diagnostics?.lastSchedulerHeartbeatAt),
    timestampMetric("Last agent event", diagnostics?.lastObservedAgentEventAt),
    timestampMetric("Last trace persist", diagnostics?.lastPersistedTraceAt),
    h(LivenessMetric, { tone: oaPublisher?.lastError ? "warn" : "", label: "OA publisher", value: oaPendingLabel, hint: oaPublisher?.lastError ? shortText(oaPublisher.lastError, 90) : "ok" }),
  );
  const reasonList = reasons.length > 0
    ? h("div", { className: "codex-liveness-reasons" }, reasons.map((reason: string) => h("span", { key: reason }, reason)))
    : null;

  return h(Panel, {
    title: "执行活性",
    eyebrow,
    summary,
    actions,
    className: "codex-liveness-panel",
  },
    metricGrid,
    reasonList,
  );
}
function CodexStatsPanel({ stats, queueName: activeQueueName, onRaw }: AnyRecord) { function CodexStatsPanel({ stats, queueName: activeQueueName, onRaw }: AnyRecord) {
const rows = taskStatisticsRows(stats); const rows = taskStatisticsRows(stats);
const totals = taskStatisticsTotals(stats); const totals = taskStatisticsTotals(stats);
@@ -2112,6 +2182,7 @@ export function CodeQueuePage({ microservices, onRaw, apiBaseUrl = "/api", initi
const tasks = applyLocalReadStateToRows(taskRows(tasksData)); const tasks = applyLocalReadStateToRows(taskRows(tasksData));
const loadedUnreadTerminalTasks = tasks.filter(taskIsUnreadTerminal); const loadedUnreadTerminalTasks = tasks.filter(taskIsUnreadTerminal);
const queue = tasksData?.queue || health?.body?.queue || health?.queue || {}; const queue = tasksData?.queue || health?.body?.queue || health?.queue || {};
const executionDiagnostics = executionDiagnosticsFromQueue(queue, health);
const statistics = taskStatistics(tasksData, queue); const statistics = taskStatistics(tasksData, queue);
const pagination = taskPagination(tasksData); const pagination = taskPagination(tasksData);
const queueRows = knownQueueRows(queue, queueId); const queueRows = knownQueueRows(queue, queueId);
@@ -3828,6 +3899,7 @@ export function CodeQueuePage({ microservices, onRaw, apiBaseUrl = "/api", initi
h(UniDeskErrorBanner, { error, wide: true }), h(UniDeskErrorBanner, { error, wide: true }),
mergeDialog, mergeDialog,
h("div", { className: "codex-session-stage codex-session-stage-top" }, h("div", { className: "codex-session-stage codex-session-stage-top" },
h(CodeQueueLivenessPanel, { diagnostics: executionDiagnostics, queue, onRaw }),
sessionPanel, sessionPanel,
), ),
h("div", { className: "code-queue-layout" }, h("div", { className: "code-queue-layout" },
@@ -782,6 +782,13 @@ fn execution_diagnostics_from_tasks(tasks: &[TaskMeta], now: &str) -> Value {
if !trace_gap_not_stale_task_ids.is_empty() { if !trace_gap_not_stale_task_ids.is_empty() {
reasons.push("trace progress is stale while scheduler heartbeat is fresh; this is a trace gap, not stale active"); reasons.push("trace progress is stale while scheduler heartbeat is fresh; this is a trace gap, not stale active");
} }
let database_active_task_count = database_active_task_ids.len();
let scheduler_orphaned_active_task_ids = database_active_task_ids.clone();
let scheduler_orphaned_active_task_count = scheduler_orphaned_active_task_ids.len();
let active_heartbeat_count = active_heartbeat_task_ids.len();
let last_scheduler_heartbeat_at = max_timestamp(active_heartbeats.iter().map(|(_, heartbeat)| heartbeat_string_field(heartbeat, "lastLocalHeartbeatAt")).collect());
let last_observed_agent_event_at = max_timestamp(active_heartbeats.iter().map(|(_, heartbeat)| heartbeat_string_field(heartbeat, "lastObservedAgentEventAt")).collect());
let last_persisted_trace_at = max_timestamp(active_heartbeats.iter().map(|(_, heartbeat)| heartbeat_string_field(heartbeat, "lastPersistedTraceAt")).collect());
json!({ json!({
"state": state, "state": state,
@@ -791,16 +798,16 @@ fn execution_diagnostics_from_tasks(tasks: &[TaskMeta], now: &str) -> Value {
"executionStateSource": "postgres-control-plane", "executionStateSource": "postgres-control-plane",
"controlPlane": "master-code-queue-mgr", "controlPlane": "master-code-queue-mgr",
"databaseActiveTaskIds": database_active_task_ids, "databaseActiveTaskIds": database_active_task_ids,
"databaseActiveTaskCount": database_active_task_ids.len(), "databaseActiveTaskCount": database_active_task_count,
"schedulerActiveTaskIds": [], "schedulerActiveTaskIds": [],
"schedulerActiveTaskCount": 0, "schedulerActiveTaskCount": 0,
"schedulerActiveRunSlotCount": 0, "schedulerActiveRunSlotCount": 0,
"schedulerActiveQueueIds": [], "schedulerActiveQueueIds": [],
"schedulerProcessingQueueIds": [], "schedulerProcessingQueueIds": [],
"schedulerOrphanedActiveTaskIds": database_active_task_ids, "schedulerOrphanedActiveTaskIds": scheduler_orphaned_active_task_ids,
"schedulerOrphanedActiveTaskCount": database_active_task_ids.len(), "schedulerOrphanedActiveTaskCount": scheduler_orphaned_active_task_count,
"activeHeartbeatTaskIds": active_heartbeat_task_ids, "activeHeartbeatTaskIds": active_heartbeat_task_ids,
"activeHeartbeatCount": active_heartbeat_task_ids.len(), "activeHeartbeatCount": active_heartbeat_count,
"heartbeatFreshTaskIds": heartbeat_fresh_task_ids, "heartbeatFreshTaskIds": heartbeat_fresh_task_ids,
"heartbeatExpiredTaskIds": heartbeat_expired_task_ids, "heartbeatExpiredTaskIds": heartbeat_expired_task_ids,
"heartbeatMissingTaskIds": heartbeat_missing_task_ids, "heartbeatMissingTaskIds": heartbeat_missing_task_ids,
@@ -809,9 +816,9 @@ fn execution_diagnostics_from_tasks(tasks: &[TaskMeta], now: &str) -> Value {
"traceGapNotStaleTaskIds": trace_gap_not_stale_task_ids, "traceGapNotStaleTaskIds": trace_gap_not_stale_task_ids,
"schedulerHeartbeatStaleMs": SCHEDULER_HEARTBEAT_STALE_MS, "schedulerHeartbeatStaleMs": SCHEDULER_HEARTBEAT_STALE_MS,
"now": now, "now": now,
"lastSchedulerHeartbeatAt": max_timestamp(active_heartbeats.iter().map(|(_, heartbeat)| heartbeat_string_field(heartbeat, "lastLocalHeartbeatAt")).collect()), "lastSchedulerHeartbeatAt": last_scheduler_heartbeat_at,
"lastObservedAgentEventAt": max_timestamp(active_heartbeats.iter().map(|(_, heartbeat)| heartbeat_string_field(heartbeat, "lastObservedAgentEventAt")).collect()), "lastObservedAgentEventAt": last_observed_agent_event_at,
"lastPersistedTraceAt": max_timestamp(active_heartbeats.iter().map(|(_, heartbeat)| heartbeat_string_field(heartbeat, "lastPersistedTraceAt")).collect()), "lastPersistedTraceAt": last_persisted_trace_at,
"oaPublisher": Value::Null, "oaPublisher": Value::Null,
"reasons": reasons, "reasons": reasons,
"guidance": [ "guidance": [
@@ -1179,6 +1186,7 @@ fn queue_summary(state: &AppState) -> Result<Value, String> {
let active_tasks: Vec<TaskMeta> = active_rows.iter().map(|row| row_to_task(row, true)).collect(); let active_tasks: Vec<TaskMeta> = active_rows.iter().map(|row| row_to_task(row, true)).collect();
let mut database_active_task_ids: Vec<String> = active_tasks.iter().map(|task| task.id.clone()).collect(); let mut database_active_task_ids: Vec<String> = active_tasks.iter().map(|task| task.id.clone()).collect();
database_active_task_ids.sort(); database_active_task_ids.sort();
let database_active_task_count = database_active_task_ids.len();
let execution_diagnostics = execution_diagnostics_from_tasks(&active_tasks, &now_iso()); let execution_diagnostics = execution_diagnostics_from_tasks(&active_tasks, &now_iso());
if !queues.iter().any(|queue| queue.get("id").and_then(Value::as_str) == Some(DEFAULT_QUEUE_ID)) { if !queues.iter().any(|queue| queue.get("id").and_then(Value::as_str) == Some(DEFAULT_QUEUE_ID)) {
let now = now_iso(); let now = now_iso();
@@ -1215,7 +1223,7 @@ fn queue_summary(state: &AppState) -> Result<Value, String> {
"activeTaskIds": [], "activeTaskIds": [],
"activeTaskId": Value::Null, "activeTaskId": Value::Null,
"databaseActiveTaskIds": database_active_task_ids, "databaseActiveTaskIds": database_active_task_ids,
"databaseActiveTaskCount": database_active_task_ids.len(), "databaseActiveTaskCount": database_active_task_count,
"executionStateSource": "postgres-control-plane", "executionStateSource": "postgres-control-plane",
"executionDiagnostics": execution_diagnostics, "executionDiagnostics": execution_diagnostics,
"schedulerHeartbeatStaleMs": SCHEDULER_HEARTBEAT_STALE_MS, "schedulerHeartbeatStaleMs": SCHEDULER_HEARTBEAT_STALE_MS,
@@ -3501,9 +3501,7 @@ function taskIsRecoverableOrphanedActive(task: QueueTask, staleMs = orphanedActi
if (task.status !== "running" && task.status !== "judging") return false; if (task.status !== "running" && task.status !== "judging") return false;
if (activeTaskHasLocalRunSlotOrWaiter(task)) return false; if (activeTaskHasLocalRunSlotOrWaiter(task)) return false;
const decision = staleRecoveryCandidate({ task, localActive: false, heartbeatStaleMs: staleMs, now: nowIso() }); const decision = staleRecoveryCandidate({ task, localActive: false, heartbeatStaleMs: staleMs, now: nowIso() });
if (decision.allowed) return true; return decision.allowed;
if (decision.reason === "owner-heartbeat-missing") return orphanedActiveTaskAgeMs(task) >= staleMs;
return false;
} }
function schedulerReconcileStatus(tasks: QueueTask[] = state.tasks): JsonValue { function schedulerReconcileStatus(tasks: QueueTask[] = state.tasks): JsonValue {
@@ -256,6 +256,23 @@ function runQueueOrderingSelfTest(): JsonValue {
const orphanRunning = queueOrderTestTask("codex_4400_orphan_running", "running", "2026-05-11T13:00:00.000Z", "2026-05-11T13:00:00.000Z"); const orphanRunning = queueOrderTestTask("codex_4400_orphan_running", "running", "2026-05-11T13:00:00.000Z", "2026-05-11T13:00:00.000Z");
orphanRunning.queueId = "queue_orphan_recovery"; orphanRunning.queueId = "queue_orphan_recovery";
orphanRunning.activeTurnId = null; orphanRunning.activeTurnId = null;
orphanRunning.schedulerHeartbeat = {
taskId: orphanRunning.id,
queueId: orphanRunning.queueId,
attempt: 1,
activeTurnId: null,
codexThreadId: null,
owner: "self-test",
schedulerInstance: "self-test",
executionPlane: "scheduler-execution-plane",
agentPort: "codex",
status: "running",
lastLocalHeartbeatAt: "2026-05-11T13:00:00.000Z",
lastObservedAgentEventAt: null,
lastPersistedTraceAt: null,
outputMaxSeq: 0,
source: "scheduler",
};
const queuedBehindOrphan = queueOrderTestTask("codex_4401_queued", "queued", "2026-05-11T13:01:00.000Z", "2026-05-11T13:01:00.000Z"); const queuedBehindOrphan = queueOrderTestTask("codex_4401_queued", "queued", "2026-05-11T13:01:00.000Z", "2026-05-11T13:01:00.000Z");
queuedBehindOrphan.queueId = "queue_orphan_recovery"; queuedBehindOrphan.queueId = "queue_orphan_recovery";
const originalMaxActiveQueues = ctx().config.maxActiveQueues; const originalMaxActiveQueues = ctx().config.maxActiveQueues;