fix(code-queue): add active-run liveness diagnostics
This commit is contained in:
@@ -35,7 +35,7 @@ Typical targeted commands:
|
|||||||
- Core API: `docker exec unidesk-backend-core` calls internal `GET /api/overview`, which must report `dbReady: true`, `pgdata.volumeName=unidesk_pgdata_10gb`, a positive PostgreSQL database byte count, and at least one online node; internal `GET /api/performance` must report component request statistics, internal operation statistics, PGDATA usage and Code Queue PostgreSQL storage metadata.
|
- Core API: `docker exec unidesk-backend-core` calls internal `GET /api/overview`, which must report `dbReady: true`, `pgdata.volumeName=unidesk_pgdata_10gb`, a positive PostgreSQL database byte count, and at least one online node; internal `GET /api/performance` must report component request statistics, internal operation statistics, PGDATA usage and Code Queue PostgreSQL storage metadata.
|
||||||
- Provider self-connection: internal `GET /api/nodes` must contain `main-server` with `status: online`, `labels.providerGatewayVersion` equal to `src/components/provider-gateway/package.json`, `labels.providerGatewayUpgradePolicy: "always-enabled"`, `labels.providerGatewayRestartPolicyOk: true`, `labels.providerGatewayPidModeOk: true`, and `labels.providerGatewayRuntimeGuardOk: true`; internal `GET /api/nodes/system-status` must contain CPU/memory/disk samples plus a non-empty process resource list sorted by `memoryBytes` by default, where `memoryBytes` should use PSS when `/proc/[pid]/smaps_rollup` is available, otherwise `rssBytes - statm.shared` before raw RSS, and must retain `rssBytes` for diagnostics; internal `GET /api/nodes/docker-status` must contain a Docker snapshot for `main-server`; every running `provider-gateway` container visible in Docker snapshots must report `restartPolicy: "always"` and `pidMode: "host"`; public provider ingress `/health` must return ok.
|
- Provider self-connection: internal `GET /api/nodes` must contain `main-server` with `status: online`, `labels.providerGatewayVersion` equal to `src/components/provider-gateway/package.json`, `labels.providerGatewayUpgradePolicy: "always-enabled"`, `labels.providerGatewayRestartPolicyOk: true`, `labels.providerGatewayPidModeOk: true`, and `labels.providerGatewayRuntimeGuardOk: true`; internal `GET /api/nodes/system-status` must contain CPU/memory/disk samples plus a non-empty process resource list sorted by `memoryBytes` by default, where `memoryBytes` should use PSS when `/proc/[pid]/smaps_rollup` is available, otherwise `rssBytes - statm.shared` before raw RSS, and must retain `rssBytes` for diagnostics; internal `GET /api/nodes/docker-status` must contain a Docker snapshot for `main-server`; every running `provider-gateway` container visible in Docker snapshots must report `restartPolicy: "always"` and `pidMode: "host"`; public provider ingress `/health` must return ok.
|
||||||
- Provider remote control: internal `/api/dispatch` must successfully complete a real `provider.upgrade` task in `mode: "plan"` so the upgrade path is validated without recreating the running gateway during E2E.
|
- Provider remote control: internal `/api/dispatch` must successfully complete a real `provider.upgrade` task in `mode: "plan"` so the upgrade path is validated without recreating the running gateway during E2E.
|
||||||
- User services: internal `/api/microservices` must include `todo-note` and `oa-event-flow` on `main-server`, canonical `filebrowser` on `D518`, plus `k3sctl-adapter`, `code-queue`, `findjob`, `pipeline`, `met-nonlinear`, `claudeqq` and `filebrowser-d601` on `D601` with `public=false`; `/api/microservices/todo-note/health` must report `storage=postgres`, `/api/microservices/todo-note/proxy/api/instances` must expose the migrated Todo Note lists, and a temporary Todo Note list create/add/toggle/undo/delete cycle must succeed through the real provider-gateway proxy; `/api/microservices/oa-event-flow/health`, `/api/microservices/oa-event-flow/proxy/api/diagnostics`, `/api/microservices/oa-event-flow/proxy/api/events`, `/api/microservices/oa-event-flow/proxy/api/events?tags=service:pipeline` and `/api/microservices/oa-event-flow/proxy/api/stats/trace` must prove the independent OA event table、Pipeline bridge 和 stats center are reachable through UniDesk proxy; `/api/microservices/k3sctl-adapter/health` and `/api/microservices/k3sctl-adapter/proxy/api/control-plane` must expose the D601 `unidesk-k3s` control plane, `kubeApiProxy.mode=kubernetes-api-service-proxy`, D601 active Code Queue instance `servingHealthy=true`, `presentNodeIds` containing `D601`, `missingNodeIds=[]`, `status=healthy`, and `noFallback=true`; `/api/microservices/code-queue/health` must return the active Code Queue backend summary with default model `gpt-5.5`, `egressProxy.connected=true`, and `/api/microservices/code-queue/proxy/api/tasks/overview` must return queue state through backend-core -> k3sctl-adapter -> Kubernetes API service proxy -> k3s/k8s Service, not through a `serviceId=code-queue` provider-gateway direct task or `/api/code-queue-direct`; Code Queue raw prompt observation fields must preserve long prompt tails across create/list/detail/frontend paths, with any shortened text exposed only through explicit `*Preview` objects carrying `chars` and `truncated`; `/api/microservices/filebrowser/health`, `/api/microservices/filebrowser-d601/health` and `/api/microservices/filebrowser/proxy/` must prove File Browser health and WebUI access through UniDesk proxy; `/api/microservices/findjob/health` and `/api/microservices/findjob/proxy/api/summary` must succeed through the real provider-gateway proxy; `/api/microservices/findjob/proxy/api/jobs?__unideskArrayLimit=jobs:5` must return a bounded preview with `_unidesk.arrayLimits` metadata; `/api/microservices/pipeline/health`, `/api/microservices/pipeline/proxy/api/snapshot?__unideskArrayLimit=registry.components:8,runs:3` and `/api/microservices/pipeline/proxy/api/oa-event-flow/diagnostics` must return Pipeline health, registry/run previews and OA event-flow evidence; `/api/microservices/met-nonlinear/health`, `/api/microservices/met-nonlinear/proxy/api/queue`, `/api/microservices/met-nonlinear/proxy/api/projects?root=projects&limit=500`, `/api/microservices/met-nonlinear/proxy/api/projects?root=ex_projects&limit=500`, `/api/microservices/met-nonlinear/proxy/api/projects/config?path=<projectPath>` and `/api/microservices/met-nonlinear/proxy/api/images` must return the D601 TS backend health, queue/GPU policy, full project tree inputs, structured project detail and ready `met-nonlinear-ml:tf26` image status.
|
- User services: internal `/api/microservices` must include `todo-note` and `oa-event-flow` on `main-server`, canonical `filebrowser` on `D518`, plus `k3sctl-adapter`, `code-queue`, `findjob`, `pipeline`, `met-nonlinear`, `claudeqq` and `filebrowser-d601` on `D601` with `public=false`; `/api/microservices/todo-note/health` must report `storage=postgres`, `/api/microservices/todo-note/proxy/api/instances` must expose the migrated Todo Note lists, and a temporary Todo Note list create/add/toggle/undo/delete cycle must succeed through the real provider-gateway proxy; `/api/microservices/oa-event-flow/health`, `/api/microservices/oa-event-flow/proxy/api/diagnostics`, `/api/microservices/oa-event-flow/proxy/api/events`, `/api/microservices/oa-event-flow/proxy/api/events?tags=service:pipeline` and `/api/microservices/oa-event-flow/proxy/api/stats/trace` must prove the independent OA event table、Pipeline bridge 和 stats center are reachable through UniDesk proxy; `/api/microservices/k3sctl-adapter/health` and `/api/microservices/k3sctl-adapter/proxy/api/control-plane` must expose the D601 `unidesk-k3s` control plane, `kubeApiProxy.mode=kubernetes-api-service-proxy`, D601 active Code Queue instance `servingHealthy=true`, `presentNodeIds` containing `D601`, `missingNodeIds=[]`, `status=healthy`, and `noFallback=true`; `/api/microservices/code-queue/health` must return the active Code Queue backend summary with default model `gpt-5.5`, `egressProxy.connected=true`, `queue.executionDiagnostics` containing DB active state, scheduler active slots, scheduler heartbeat and Trace/OA progress, and `/api/microservices/code-queue/proxy/api/tasks/overview` must return queue state through backend-core -> k3sctl-adapter -> Kubernetes API service proxy -> k3s/k8s Service, not through a `serviceId=code-queue` provider-gateway direct task or `/api/code-queue-direct`; Code Queue raw prompt observation fields must preserve long prompt tails across create/list/detail/frontend paths, with any shortened text exposed only through explicit `*Preview` objects carrying `chars` and `truncated`; `/api/microservices/filebrowser/health`, `/api/microservices/filebrowser-d601/health` and `/api/microservices/filebrowser/proxy/` must prove File Browser health and WebUI access through UniDesk proxy; `/api/microservices/findjob/health` and `/api/microservices/findjob/proxy/api/summary` must succeed through the real provider-gateway proxy; `/api/microservices/findjob/proxy/api/jobs?__unideskArrayLimit=jobs:5` must return a bounded preview with `_unidesk.arrayLimits` metadata; `/api/microservices/pipeline/health`, `/api/microservices/pipeline/proxy/api/snapshot?__unideskArrayLimit=registry.components:8,runs:3` and `/api/microservices/pipeline/proxy/api/oa-event-flow/diagnostics` must return Pipeline health, registry/run previews and OA event-flow evidence; `/api/microservices/met-nonlinear/health`, `/api/microservices/met-nonlinear/proxy/api/queue`, `/api/microservices/met-nonlinear/proxy/api/projects?root=projects&limit=500`, `/api/microservices/met-nonlinear/proxy/api/projects?root=ex_projects&limit=500`, `/api/microservices/met-nonlinear/proxy/api/projects/config?path=<projectPath>` and `/api/microservices/met-nonlinear/proxy/api/images` must return the D601 TS backend health, queue/GPU policy, full project tree inputs, structured project detail and ready `met-nonlinear-ml:tf26` image status. Code Queue liveness fixture checks are first-class E2E selections: `code-queue:active-run-heartbeat-visible`, `code-queue:trace-gap-not-stale`, `code-queue:stale-active-owner-expired`, `code-queue:control-plane-split-brain-diagnostics` and `code-queue:oa-publisher-degraded-visible`.
|
||||||
- ClaudeQQ availability: `/api/microservices/claudeqq/health` must only pass when `ready=true`, NapCat HTTP and WebSocket are connected, and `napcat.loginState=logged_in`; `/api/microservices/claudeqq/proxy/api/napcat/login` must show the same logged-in account state and `/api/microservices/claudeqq/proxy/api/events/recent` must prove the backend can read the persistent event cache. A QR-code-only or not-logged-in NapCat state must be treated as unhealthy.
|
- ClaudeQQ availability: `/api/microservices/claudeqq/health` must only pass when `ready=true`, NapCat HTTP and WebSocket are connected, and `napcat.loginState=logged_in`; `/api/microservices/claudeqq/proxy/api/napcat/login` must show the same logged-in account state and `/api/microservices/claudeqq/proxy/api/events/recent` must prove the backend can read the persistent event cache. A QR-code-only or not-logged-in NapCat state must be treated as unhealthy.
|
||||||
- Database: the command writes an `unidesk_e2e_markers` row through `docker exec unidesk-database psql`, confirms provider state is stored in PostgreSQL, and checks Todo Note rows exist in `todo_note_instances` using the same named volume.
|
- Database: the command writes an `unidesk_e2e_markers` row through `docker exec unidesk-database psql`, confirms provider state is stored in PostgreSQL, and checks Todo Note rows exist in `todo_note_instances` using the same named volume.
|
||||||
- Pipeline OA event flow: `microservice:pipeline-oa-event-flow` must prove both no-audit and monitor-audit runs are driven by OA events end to end. The event stream must show `node-finished` as a neutral fact with `pipeline:{pipelineId}` and `epoch:{runId}` tags, OA policy as the source of downstream/audit decisions, monitor decisions as OA control events, and runner control-result evidence. E2E must fail if delivery still depends on a legacy detail audit policy flag as policy authority, independent legacy audit-request points, a legacy batch completion gate, direct monitor-to-runner calls, or frontend/CLI writes to Pipeline `.state`.
|
- Pipeline OA event flow: `microservice:pipeline-oa-event-flow` must prove both no-audit and monitor-audit runs are driven by OA events end to end. The event stream must show `node-finished` as a neutral fact with `pipeline:{pipelineId}` and `epoch:{runId}` tags, OA policy as the source of downstream/audit decisions, monitor decisions as OA control events, and runner control-result evidence. E2E must fail if delivery still depends on a legacy detail audit policy flag as policy authority, independent legacy audit-request points, a legacy batch completion gate, direct monitor-to-runner calls, or frontend/CLI writes to Pipeline `.state`.
|
||||||
@@ -55,7 +55,7 @@ Remote update records in the frontend are covered by the same rule: `provider.up
|
|||||||
|
|
||||||
Provider operation availability is also covered by the structured rendering rule. `host.ssh` availability must be displayed as badges or equivalent controls derived from capabilities and `hostSsh*` labels, and remote update availability must be displayed from `provider.upgrade` capability plus the `always-enabled` policy; these fields must not require opening raw Provider JSON.
|
Provider operation availability is also covered by the structured rendering rule. `host.ssh` availability must be displayed as badges or equivalent controls derived from capabilities and `hostSsh*` labels, and remote update availability must be displayed from `provider.upgrade` capability plus the `always-enabled` policy; these fields must not require opening raw Provider JSON.
|
||||||
|
|
||||||
User service pages are covered by the same rule. `Todo Note` must show lists, task tree, filters, reminder input, movement controls, undo/redo and metrics as controls; `OA Event Flow` must show health, live tag stream state, event table, tag filter presets and Trace stats table as controls; `Code Queue` must show queue cards, live transcript, model/cwd/max attempt inputs, judge decision, attempt table, append prompt, interrupt and retry controls; `File Browser` must show D518 as the default target, D601 as an alternate target, a screenshot export action, and an embedded upstream WebUI frame served through `/api/microservices/<id>/proxy/` with compact file rows that do not let material-icon fallback text cover file metadata; `FindJob` must show metrics, jobs and drafts as cards/tables; `Pipeline` must show component classes, React Flow graph nodes/edges, run cards, Gantt execution lines and OpenCode Trace timelines as controls; `MET Nonlinear` must show queue rows, GPU/image cards, a real path tree for the project library, structured project/job detail panels, project config preview, `data/` training state, model parameter count, metrics, progress bars, ETA, `epoch/h` speed and history diagnostics as controls; `ClaudeQQ` must show NapCat HTTP/WS/login badges, QR/login panel, event cache, subscriptions and message push controls; the full user-service config, summary, snapshot, jobs preview, drafts, OA events and run JSON can only appear after an explicit `查看原始JSON` click.
|
User service pages are covered by the same rule. `Todo Note` must show lists, task tree, filters, reminder input, movement controls, undo/redo and metrics as controls; `OA Event Flow` must show health, live tag stream state, event table, tag filter presets and Trace stats table as controls; `Code Queue` must show queue cards, execution liveness diagnostics, live transcript, model/cwd/max attempt inputs, judge decision, attempt table, append prompt, interrupt and retry controls; `File Browser` must show D518 as the default target, D601 as an alternate target, a screenshot export action, and an embedded upstream WebUI frame served through `/api/microservices/<id>/proxy/` with compact file rows that do not let material-icon fallback text cover file metadata; `FindJob` must show metrics, jobs and drafts as cards/tables; `Pipeline` must show component classes, React Flow graph nodes/edges, run cards, Gantt execution lines and OpenCode Trace timelines as controls; `MET Nonlinear` must show queue rows, GPU/image cards, a real path tree for the project library, structured project/job detail panels, project config preview, `data/` training state, model parameter count, metrics, progress bars, ETA, `epoch/h` speed and history diagnostics as controls; `ClaudeQQ` must show NapCat HTTP/WS/login badges, QR/login panel, event cache, subscriptions and message push controls; the full user-service config, summary, snapshot, jobs preview, drafts, OA events and run JSON can only appear after an explicit `查看原始JSON` click.
|
||||||
|
|
||||||
## Public Boundary Rule
|
## Public Boundary Rule
|
||||||
|
|
||||||
|
|||||||
@@ -180,7 +180,9 @@ D601 上必须显式使用原生 k3s kubeconfig:`KUBECONFIG=/etc/rancher/k3s/k
|
|||||||
- 服务拆分语义:`code-queue-read` 只承载 GET/HEAD 查询、overview、任务详情、Trace/output/transcript、统计和只读健康,可多副本滚动更新;它必须设置 `CODE_QUEUE_SERVICE_ROLE=read` 与 `CODE_QUEUE_SCHEDULER_ENABLED=false`,且不得接受入队、queue 变更、已读、重试、移动、追加 prompt 或打断这类 mutation。`code-queue-write` 承载入队、queue 创建/合并/更新、已读、手动重试、移动等命令写入,初期保持单副本和 `CODE_QUEUE_SERVICE_ROLE=write`,只把命令和任务状态写入 PostgreSQL,不启动 agent 子进程。`code-queue-scheduler` 是唯一拥有 scheduler 和 active run 的执行服务,设置 `CODE_QUEUE_SERVICE_ROLE=scheduler` 与 `CODE_QUEUE_SCHEDULER_ENABLED=true`,负责从 PostgreSQL 热任务集轮询新写入任务、推进队列、启动 Codex/OpenCode、处理 running task 的 steer/interrupt、发送终态通知和暴露执行端 `/health`。普通 Service 负载均衡不得把 mutation 打到 read,也不得把 running task 控制打到 write。
|
- 服务拆分语义:`code-queue-read` 只承载 GET/HEAD 查询、overview、任务详情、Trace/output/transcript、统计和只读健康,可多副本滚动更新;它必须设置 `CODE_QUEUE_SERVICE_ROLE=read` 与 `CODE_QUEUE_SCHEDULER_ENABLED=false`,且不得接受入队、queue 变更、已读、重试、移动、追加 prompt 或打断这类 mutation。`code-queue-write` 承载入队、queue 创建/合并/更新、已读、手动重试、移动等命令写入,初期保持单副本和 `CODE_QUEUE_SERVICE_ROLE=write`,只把命令和任务状态写入 PostgreSQL,不启动 agent 子进程。`code-queue-scheduler` 是唯一拥有 scheduler 和 active run 的执行服务,设置 `CODE_QUEUE_SERVICE_ROLE=scheduler` 与 `CODE_QUEUE_SCHEDULER_ENABLED=true`,负责从 PostgreSQL 热任务集轮询新写入任务、推进队列、启动 Codex/OpenCode、处理 running task 的 steer/interrupt、发送终态通知和暴露执行端 `/health`。普通 Service 负载均衡不得把 mutation 打到 read,也不得把 running task 控制打到 write。
|
||||||
- 实例语义:D601 是当前唯一 active 执行节点,`code-queue-scheduler` 以一个 scheduler Pod 承载长生命周期 Codex/OpenCode 子进程并轮询主 PostgreSQL 中由 `code-queue-mgr` 写入的 queued/retry_wait 任务。D518 不属于当前 Code Queue k3s 拓扑;在没有原生 k3s-agent 与稳定 Kubernetes 网络前,不得把 D518 写回 `expectedNodeIds` 或恢复 `code-queue-d518` standby。D601 scheduler 默认关闭 `CODE_QUEUE_STARTUP_OA_BACKFILL_ENABLED`;历史 OA Trace/STEP 回填必须通过显式 `/api/oa/backfill` 运维动作触发,不能在每次 Pod 重启时自动批量发布旧事件。
|
- 实例语义:D601 是当前唯一 active 执行节点,`code-queue-scheduler` 以一个 scheduler Pod 承载长生命周期 Codex/OpenCode 子进程并轮询主 PostgreSQL 中由 `code-queue-mgr` 写入的 queued/retry_wait 任务。D518 不属于当前 Code Queue k3s 拓扑;在没有原生 k3s-agent 与稳定 Kubernetes 网络前,不得把 D518 写回 `expectedNodeIds` 或恢复 `code-queue-d518` standby。D601 scheduler 默认关闭 `CODE_QUEUE_STARTUP_OA_BACKFILL_ENABLED`;历史 OA Trace/STEP 回填必须通过显式 `/api/oa/backfill` 运维动作触发,不能在每次 Pod 重启时自动批量发布旧事件。
|
||||||
- 滚动更新边界:master `code-queue-mgr` 保证 D601 抖动或执行面滚动更新期间普通提交、queue 管理和历史读取仍可用;但当前 D601 scheduler Pod 内仍直接承载正在运行的 agent 子进程,scheduler Pod 被替换时 active task 仍会进入 restart-recovery/retry 语义,不能宣称 running task 零中断。真正的长期目标是继续把调度器和执行器拆开:scheduler 只负责 claim task 并创建 Kubernetes Job/Pod 或独立 worker,runner 把输出、状态、attempt、事件和通知写回 PostgreSQL/OA Event Flow/归档;只有这样 controller/scheduler 滚动更新才不会影响正在执行的任务。
|
- 滚动更新边界:master `code-queue-mgr` 保证 D601 抖动或执行面滚动更新期间普通提交、queue 管理和历史读取仍可用;但当前 D601 scheduler Pod 内仍直接承载正在运行的 agent 子进程,scheduler Pod 被替换时 active task 仍会进入 restart-recovery/retry 语义,不能宣称 running task 零中断。真正的长期目标是继续把调度器和执行器拆开:scheduler 只负责 claim task 并创建 Kubernetes Job/Pod 或独立 worker,runner 把输出、状态、attempt、事件和通知写回 PostgreSQL/OA Event Flow/归档;只有这样 controller/scheduler 滚动更新才不会影响正在执行的任务。
|
||||||
- Restart recovery:D601 scheduler 启动时必须把没有本地 active run 的 `running`/`judging` 任务恢复为 `retry_wait` 并先写回 PostgreSQL,再开启新一轮 scheduler 轮询;同时必须清理 `queued`/`retry_wait`/terminal 任务残留的 `activeTurnId`,否则 PG 中残留的 running 或旧 turn id 会阻塞队列且不会被执行。health/overview 中的 `activeTaskIds` 只代表当前进程真实持有的 agent run;数据库里仍处于 `running`/`judging` 但没有本地 run 的任务只能作为 scheduler 侧 `orphanedActiveTaskIds` 暴露,不能计入 active run slot。主 server 直管 `code-queue-mgr` 只有 PostgreSQL 视角,不得把数据库中的 `running`/`judging` 误报为真实 active run;只能作为 `databaseActiveTaskIds`/`executionStateSource=postgres-control-plane` 这类控制面状态返回。
|
- Active run liveness:Code Queue 活性判断必须同时读取 PostgreSQL 任务状态、D601 scheduler 本地 active run/active slot/active queue、scheduler-owned heartbeat、Trace/OA 持久化进度和 OA publisher pending/lastError。scheduler heartbeat 至少包含 `taskId`、`attempt`、`activeTurnId`/`codexThreadId`、`owner`/`schedulerInstance`、`lastLocalHeartbeatAt`、`lastObservedAgentEventAt`、`lastPersistedTraceAt`、`outputMaxSeq` 与 `agentPort`;后续如拆出独立 runner,可以在同一结构上追加 worker/claim lease 字段。master `code-queue-mgr` 的 `postgres-control-plane` 视图不能单独作为“任务已卡死/未执行”的依据;当 master 看到 `databaseActiveTaskCount>0`、本地 `activeRunSlotCount=0`,但存在新鲜 scheduler heartbeat 时,`executionDiagnostics.state` 必须报告 `split-brain`/`degraded` 而不是 `healthy`。
|
||||||
|
- Trace gap 与 stale active:Trace/OA 长时间没有新 seq 或 publisher 有 pending/lastError 只说明持久化链路可能 degraded;只要 scheduler-owned `lastLocalHeartbeatAt` 仍新鲜,就必须归类为 trace gap,不得触发 stale retry。只有 PostgreSQL 仍为 `running`/`judging`、scheduler 本地没有 active run/slot/waiter,并且 owner heartbeat 已过期时,任务才允许进入 stale recovery candidate;缺失 heartbeat 是 degraded 诊断,不是自动恢复许可。
|
||||||
|
- Restart recovery:D601 scheduler 启动或 reconciliation 时只能由 scheduler-owned 恢复入口处理 stale active,必须留下 recovery reason、source/method 审计事件,并使用条件更新防止覆盖并发 owner 写入;恢复原因需要区分 user interrupt、admin stale recovery 和 service restart recovery。禁止裸改 production PostgreSQL 任务状态,禁止把 production scheduler/数据库作为破坏性测试对象。health/overview 中的 `activeTaskIds` 只代表当前进程真实持有的 agent run;数据库里仍处于 `running`/`judging` 但没有本地 run 的任务只能作为 scheduler 侧 `orphanedActiveTaskIds` 或 diagnostics 暴露,不能计入 active run slot。主 server 直管 `code-queue-mgr` 只有 PostgreSQL 视角,不得把数据库中的 `running`/`judging` 误报为真实 active run;只能作为 `databaseActiveTaskIds`/`executionStateSource=postgres-control-plane` 这类控制面状态返回。
|
||||||
- Transient dependency recovery:D601 scheduler/read/write 通过 provider egress 和 TCP gateway 访问主 PostgreSQL、OA Event Flow 与模型 API,必须把 `CONNECTION_CLOSED`、`CONNECT_TIMEOUT`、stale PostgreSQL client、provider egress 瞬时失败和 MiniMax judge provider 初始化失败视为可恢复运行时抖动。实现上应轮换失效数据库 client、重试或降级 judge provider 初始化、释放 active run slot 并继续扫描后续 queued/retry_wait 任务;不得因为一次连接关闭、一次 judge provider transient error 或滚动更新窗口让 scheduler 长期停止推进。
|
- Transient dependency recovery:D601 scheduler/read/write 通过 provider egress 和 TCP gateway 访问主 PostgreSQL、OA Event Flow 与模型 API,必须把 `CONNECTION_CLOSED`、`CONNECT_TIMEOUT`、stale PostgreSQL client、provider egress 瞬时失败和 MiniMax judge provider 初始化失败视为可恢复运行时抖动。实现上应轮换失效数据库 client、重试或降级 judge provider 初始化、释放 active run slot 并继续扫描后续 queued/retry_wait 任务;不得因为一次连接关闭、一次 judge provider transient error 或滚动更新窗口让 scheduler 长期停止推进。
|
||||||
- 部署引用:Code Queue 镜像仍复用 `src/components/microservices/code-queue/Dockerfile`,Kubernetes 运行清单为 `src/components/microservices/k3sctl-adapter/k3s/code-queue.k8s.yaml`,`config.json` 对外记录 k3s manifest `src/components/microservices/k3sctl-adapter/k3s/code-queue.k3s.json`;主 server 根目录 `docker-compose.yml` 不包含 `code-queue` service,旧 D601 direct Compose 文件只作为迁移/本地诊断参考,不是正式运行入口。
|
- 部署引用:Code Queue 镜像仍复用 `src/components/microservices/code-queue/Dockerfile`,Kubernetes 运行清单为 `src/components/microservices/k3sctl-adapter/k3s/code-queue.k8s.yaml`,`config.json` 对外记录 k3s manifest `src/components/microservices/k3sctl-adapter/k3s/code-queue.k3s.json`;主 server 根目录 `docker-compose.yml` 不包含 `code-queue` service,旧 D601 direct Compose 文件只作为迁移/本地诊断参考,不是正式运行入口。
|
||||||
- 主服务依赖映射:Code Queue 仍以主 PostgreSQL 为权威数据库,但 D601 k3s Pod 不能依赖公网直连 `74.48.78.17:15432/4255`。Pod 内 `DATABASE_URL` 和 `OA_EVENT_FLOW_BASE_URL` 必须指向集群内 `d601-tcp-egress-gateway` Service,再由该 gateway 通过 D601 provider-gateway egress proxy 的 HTTP CONNECT 转发到主 PostgreSQL 和 OA Event Flow;新增 TCP 依赖时扩展 `TCP_EGRESS_ROUTES`,不得在业务容器里新增一次性公网直连或 ad hoc 隧道。D601 active 实例的 `CODE_QUEUE_NOTIFY_CLAUDEQQ_BASE_URL` 必须使用集群内 ClaudeQQ Service `http://claudeqq.unidesk.svc.cluster.local:3290`,并把 `claudeqq`/`claudeqq.unidesk.svc.cluster.local` 加入 `NO_PROXY`,避免任务完成通知被默认出网代理错误转发。旧 `http://host.docker.internal:3290` 只允许作为迁移期诊断,不得作为 Code Queue k3s Pod 的正式通知路径。这些端口映射只服务受控节点运行时,必须用防火墙或等价策略限制来源,不得成为浏览器或任意公网客户端入口。
|
- 主服务依赖映射:Code Queue 仍以主 PostgreSQL 为权威数据库,但 D601 k3s Pod 不能依赖公网直连 `74.48.78.17:15432/4255`。Pod 内 `DATABASE_URL` 和 `OA_EVENT_FLOW_BASE_URL` 必须指向集群内 `d601-tcp-egress-gateway` Service,再由该 gateway 通过 D601 provider-gateway egress proxy 的 HTTP CONNECT 转发到主 PostgreSQL 和 OA Event Flow;新增 TCP 依赖时扩展 `TCP_EGRESS_ROUTES`,不得在业务容器里新增一次性公网直连或 ad hoc 隧道。D601 active 实例的 `CODE_QUEUE_NOTIFY_CLAUDEQQ_BASE_URL` 必须使用集群内 ClaudeQQ Service `http://claudeqq.unidesk.svc.cluster.local:3290`,并把 `claudeqq`/`claudeqq.unidesk.svc.cluster.local` 加入 `NO_PROXY`,避免任务完成通知被默认出网代理错误转发。旧 `http://host.docker.internal:3290` 只允许作为迁移期诊断,不得作为 Code Queue k3s Pod 的正式通知路径。这些端口映射只服务受控节点运行时,必须用防火墙或等价策略限制来源,不得成为浏览器或任意公网客户端入口。
|
||||||
|
|||||||
@@ -47,3 +47,9 @@ frontend Bun server 必须提供同源 `/api/frontend-performance`,记录 webu
|
|||||||
当最近失败请求集中出现 frontend `core_proxy` 502/503/504,路径为 `/api/microservices/code-queue/proxy/...` 的 overview、trace 或 summary,且 k3s/k8s Pod 仍在运行时,必须先运行 `bun scripts/cli.ts microservice diagnostics code-queue`,区分 provider-gateway online、WebSocket HTTP tunnel、k3sctl-adapter、Kubernetes API service proxy 和目标 Service 五段状态。provider tunnel 类失败必须记录响应 body/headers 中的 `requestId`、`stage`、`failureReason`、`x-unidesk-request-id` 和 `x-unidesk-tunnel-error`;如需主动验证错误结构,运行 `bun scripts/cli.ts microservice tunnel-self-test code-queue`,该自测应返回预期失败但 `ok=true` 的诊断结果。随后再继续判断“Kubernetes API service proxy 不可达”“Code Queue 进程不可达”和“Code Queue event loop 被热路径同步工作饿死”。如果 `debug health` 或 provider-gateway egress health 显示 `providerGatewayEgressProxyActiveTunnels` 持续偏高、`pendingTunnels` 非零或 `oldestTunnelAgeMs` 长时间增长,应先按 provider-gateway egress tunnel 生命周期排障,确认 `egress_tcp_open`、connect timeout、idle cleanup 与 core socket close 清理是否生效。排障顺序是同时查看 `/api/frontend-performance`、`/api/performance`、`k3sctl-adapter` `/api/control-plane`、Kubernetes Pod `/live`、`/health`、overview/trace-step curl、`kubectl top pod` 或 Docker stats、容器 `RestartCount`/`OOMKilled` 和 Code Queue 日志;如果 Pod 内 `/health` 也超时,应优先检查实时 output 发布、archive 读取、transcript 构建、统计计算、启动维护、历史 OA backfill 和远程 Provider 准备/SSH 子进程是否阻塞 event loop,而不是先调整 frontend 渲染或代理超时。Code Queue 默认不得在启动时自动执行历史 OA backfill 或通知表索引维护;显式 backfill 必须作为运维动作记录,并在运行期间并发证明 `/live`、`/health` 与 `/api/tasks/overview` 仍快速返回。涉及 D601 等远程 Provider 时,还要检查 `runCodeQueueSsh`/开发容器准备是否仍存在同步子进程、无 timeout 的 SSH、无上限 stdout/stderr 或 stale TUN 重建等待;修复后必须在远程准备探针运行期间并发证明 Pod `/health` 与 `/api/tasks/overview` 仍快速返回。
|
当最近失败请求集中出现 frontend `core_proxy` 502/503/504,路径为 `/api/microservices/code-queue/proxy/...` 的 overview、trace 或 summary,且 k3s/k8s Pod 仍在运行时,必须先运行 `bun scripts/cli.ts microservice diagnostics code-queue`,区分 provider-gateway online、WebSocket HTTP tunnel、k3sctl-adapter、Kubernetes API service proxy 和目标 Service 五段状态。provider tunnel 类失败必须记录响应 body/headers 中的 `requestId`、`stage`、`failureReason`、`x-unidesk-request-id` 和 `x-unidesk-tunnel-error`;如需主动验证错误结构,运行 `bun scripts/cli.ts microservice tunnel-self-test code-queue`,该自测应返回预期失败但 `ok=true` 的诊断结果。随后再继续判断“Kubernetes API service proxy 不可达”“Code Queue 进程不可达”和“Code Queue event loop 被热路径同步工作饿死”。如果 `debug health` 或 provider-gateway egress health 显示 `providerGatewayEgressProxyActiveTunnels` 持续偏高、`pendingTunnels` 非零或 `oldestTunnelAgeMs` 长时间增长,应先按 provider-gateway egress tunnel 生命周期排障,确认 `egress_tcp_open`、connect timeout、idle cleanup 与 core socket close 清理是否生效。排障顺序是同时查看 `/api/frontend-performance`、`/api/performance`、`k3sctl-adapter` `/api/control-plane`、Kubernetes Pod `/live`、`/health`、overview/trace-step curl、`kubectl top pod` 或 Docker stats、容器 `RestartCount`/`OOMKilled` 和 Code Queue 日志;如果 Pod 内 `/health` 也超时,应优先检查实时 output 发布、archive 读取、transcript 构建、统计计算、启动维护、历史 OA backfill 和远程 Provider 准备/SSH 子进程是否阻塞 event loop,而不是先调整 frontend 渲染或代理超时。Code Queue 默认不得在启动时自动执行历史 OA backfill 或通知表索引维护;显式 backfill 必须作为运维动作记录,并在运行期间并发证明 `/live`、`/health` 与 `/api/tasks/overview` 仍快速返回。涉及 D601 等远程 Provider 时,还要检查 `runCodeQueueSsh`/开发容器准备是否仍存在同步子进程、无 timeout 的 SSH、无上限 stdout/stderr 或 stale TUN 重建等待;修复后必须在远程准备探针运行期间并发证明 Pod `/health` 与 `/api/tasks/overview` 仍快速返回。
|
||||||
|
|
||||||
Code Queue task 明明产出最终回复却反复 `retry_wait` 时,应优先用任务详情里的 latest attempt 字段核查 `terminalStatus`、`transportClosedBeforeTerminal`、`appServerExitCode`、`finalResponseChars`、`judge.raw._safetyOverride` 和 attempt output。OpenCode 远程任务中,`opencode completed status=completed exit=0` 加当前 attempt 非空 assistant 输出应对应 `terminalStatus=completed`、`transportClosedBeforeTerminal=false`;如果因为缺少 `step_finish` 事件仍触发 `_safetyOverride=terminal_not_completed`,说明协议终态归一化有回归。相反,当前 attempt 没有最终 assistant response 时即使 tool/read/bash 证据完整,也必须 retry,不能用旧 `task.finalResponse` 或 reasoning/tool evidence 代替可见最终回复。
|
Code Queue task 明明产出最终回复却反复 `retry_wait` 时,应优先用任务详情里的 latest attempt 字段核查 `terminalStatus`、`transportClosedBeforeTerminal`、`appServerExitCode`、`finalResponseChars`、`judge.raw._safetyOverride` 和 attempt output。OpenCode 远程任务中,`opencode completed status=completed exit=0` 加当前 attempt 非空 assistant 输出应对应 `terminalStatus=completed`、`transportClosedBeforeTerminal=false`;如果因为缺少 `step_finish` 事件仍触发 `_safetyOverride=terminal_not_completed`,说明协议终态归一化有回归。相反,当前 attempt 没有最终 assistant response 时即使 tool/read/bash 证据完整,也必须 retry,不能用旧 `task.finalResponse` 或 reasoning/tool evidence 代替可见最终回复。
|
||||||
|
|
||||||
|
### Code Queue Liveness
|
||||||
|
|
||||||
|
Code Queue 的“任务是否卡死”不能由单一控制面字段判断。排障必须同时看 PostgreSQL 中的 `running`/`judging` 任务、D601 scheduler 本地 active run/active slot/active queue、scheduler-owned heartbeat、Trace/OA 持久化进度和 OA publisher pending/lastError。master `code-queue-mgr` 的 `postgres-control-plane` 视图只证明数据库行存在;当它显示 `activeRunSlotCount=0` 但 D601 heartbeat 仍新鲜时,正确结论是 control-plane/execution-plane 分裂,diagnostics 应显示 `split-brain` 或 `degraded`,不能宣称任务未执行或卡死。
|
||||||
|
|
||||||
|
Trace/OA 长时间没有新 seq 但 scheduler heartbeat 正常时,应归类为 trace gap 或 publisher degraded,不得自动 retry。只有 scheduler 本地没有 active run,且对应 owner heartbeat 已过期时,才允许进入 stale recovery candidate;缺失 heartbeat 只能触发 degraded 诊断和人工确认。任何恢复入口都必须由 scheduler 执行,使用条件更新和审计事件区分 user interrupt、admin stale recovery 与 service restart recovery;禁止直接修改 production PostgreSQL 任务状态来“修复” active run。
|
||||||
|
|||||||
@@ -0,0 +1,24 @@
|
|||||||
|
import { CODE_QUEUE_LIVENESS_CHECK_NAMES, runCodeQueueLivenessFixtureChecks } from "./src/code-queue-liveness-fixtures";
|
||||||
|
|
||||||
|
function optionValues(args: string[], name: string): string[] {
|
||||||
|
const values: string[] = [];
|
||||||
|
for (let index = 0; index < args.length; index += 1) {
|
||||||
|
if (args[index] !== name) continue;
|
||||||
|
const raw = args[index + 1];
|
||||||
|
if (raw === undefined || raw.startsWith("--")) throw new Error(`${name} requires a check name`);
|
||||||
|
values.push(...raw.split(",").map((item) => item.trim()).filter(Boolean));
|
||||||
|
index += 1;
|
||||||
|
}
|
||||||
|
return values;
|
||||||
|
}
|
||||||
|
|
||||||
|
function main(): void {
|
||||||
|
const only = optionValues(Bun.argv.slice(2), "--only");
|
||||||
|
const unknown = only.filter((name) => !CODE_QUEUE_LIVENESS_CHECK_NAMES.includes(name as never));
|
||||||
|
if (unknown.length > 0) throw new Error(`unknown Code Queue liveness check(s): ${unknown.join(", ")}`);
|
||||||
|
const result = runCodeQueueLivenessFixtureChecks(only);
|
||||||
|
process.stdout.write(`${JSON.stringify(result, null, 2)}\n`);
|
||||||
|
if (!result.ok) process.exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (import.meta.main) main();
|
||||||
@@ -237,6 +237,8 @@ export function runChecks(config: UniDeskConfig, options: CheckOptions = default
|
|||||||
fileItem("src/components/microservices/code-queue-mgr/src/prompt-observation.ts"),
|
fileItem("src/components/microservices/code-queue-mgr/src/prompt-observation.ts"),
|
||||||
fileItem("scripts/src/deploy.ts"),
|
fileItem("scripts/src/deploy.ts"),
|
||||||
fileItem("scripts/code-queue-issue3-regression-test.ts"),
|
fileItem("scripts/code-queue-issue3-regression-test.ts"),
|
||||||
|
fileItem("scripts/code-queue-liveness-diagnostics-test.ts"),
|
||||||
|
fileItem("scripts/src/code-queue-liveness-fixtures.ts"),
|
||||||
fileItem("scripts/src/ci.ts"),
|
fileItem("scripts/src/ci.ts"),
|
||||||
fileItem("scripts/src/e2e.ts"),
|
fileItem("scripts/src/e2e.ts"),
|
||||||
fileItem("scripts/code-queue-prompt-observation-test.ts"),
|
fileItem("scripts/code-queue-prompt-observation-test.ts"),
|
||||||
@@ -250,10 +252,16 @@ export function runChecks(config: UniDeskConfig, options: CheckOptions = default
|
|||||||
items.push(commandItem("typescript:scripts", ["bunx", "tsc", "-p", "scripts/tsconfig.json", "--noEmit", "--pretty", "false"], 120_000));
|
items.push(commandItem("typescript:scripts", ["bunx", "tsc", "-p", "scripts/tsconfig.json", "--noEmit", "--pretty", "false"], 120_000));
|
||||||
items.push(commandItem("code-queue:prompt-observation-contract", ["bun", "scripts/code-queue-prompt-observation-test.ts"], 30_000));
|
items.push(commandItem("code-queue:prompt-observation-contract", ["bun", "scripts/code-queue-prompt-observation-test.ts"], 30_000));
|
||||||
items.push(commandItem("code-queue:issue3-diagnostics-and-image-preflight", ["bun", "scripts/code-queue-issue3-regression-test.ts"], 30_000));
|
items.push(commandItem("code-queue:issue3-diagnostics-and-image-preflight", ["bun", "scripts/code-queue-issue3-regression-test.ts"], 30_000));
|
||||||
|
items.push(commandItem("code-queue:active-run-heartbeat-visible", ["bun", "scripts/code-queue-liveness-diagnostics-test.ts", "--only", "code-queue:active-run-heartbeat-visible"], 30_000));
|
||||||
|
items.push(commandItem("code-queue:trace-gap-not-stale", ["bun", "scripts/code-queue-liveness-diagnostics-test.ts", "--only", "code-queue:trace-gap-not-stale"], 30_000));
|
||||||
|
items.push(commandItem("code-queue:stale-active-owner-expired", ["bun", "scripts/code-queue-liveness-diagnostics-test.ts", "--only", "code-queue:stale-active-owner-expired"], 30_000));
|
||||||
|
items.push(commandItem("code-queue:control-plane-split-brain-diagnostics", ["bun", "scripts/code-queue-liveness-diagnostics-test.ts", "--only", "code-queue:control-plane-split-brain-diagnostics"], 30_000));
|
||||||
|
items.push(commandItem("code-queue:oa-publisher-degraded-visible", ["bun", "scripts/code-queue-liveness-diagnostics-test.ts", "--only", "code-queue:oa-publisher-degraded-visible"], 30_000));
|
||||||
} else {
|
} else {
|
||||||
items.push(skippedItem("typescript:scripts", "scripts TypeScript typecheck is opt-in", "--scripts-typecheck or --full"));
|
items.push(skippedItem("typescript:scripts", "scripts TypeScript typecheck is opt-in", "--scripts-typecheck or --full"));
|
||||||
items.push(skippedItem("code-queue:prompt-observation-contract", "prompt observation contract is opt-in with script checks", "--scripts-typecheck or --full"));
|
items.push(skippedItem("code-queue:prompt-observation-contract", "prompt observation contract is opt-in with script checks", "--scripts-typecheck or --full"));
|
||||||
items.push(skippedItem("code-queue:issue3-diagnostics-and-image-preflight", "Code Queue issue #3 regression fixtures are opt-in with script checks", "--scripts-typecheck or --full"));
|
items.push(skippedItem("code-queue:issue3-diagnostics-and-image-preflight", "Code Queue issue #3 regression fixtures are opt-in with script checks", "--scripts-typecheck or --full"));
|
||||||
|
items.push(skippedItem("code-queue:liveness-diagnostics-fixtures", "Code Queue liveness diagnostics fixtures are opt-in with script checks", "--scripts-typecheck or --full"));
|
||||||
}
|
}
|
||||||
if (options.logs) {
|
if (options.logs) {
|
||||||
items.push(unifiedLogRotationItem());
|
items.push(unifiedLogRotationItem());
|
||||||
|
|||||||
@@ -0,0 +1,259 @@
|
|||||||
|
import { buildExecutionDiagnostics, buildSchedulerHeartbeat, staleRecoveryCandidate, taskHasTraceGapButFreshHeartbeat } from "../../src/components/microservices/code-queue/src/execution-diagnostics";
|
||||||
|
import type { ActiveRun } from "../../src/components/microservices/code-queue/src/code-agent/common";
|
||||||
|
import type { CodeQueueExecutionDiagnostics, QueueTask, SchedulerActiveRunHeartbeat, TaskStatus } from "../../src/components/microservices/code-queue/src/types";
|
||||||
|
|
||||||
|
export const CODE_QUEUE_LIVENESS_CHECK_NAMES = [
|
||||||
|
"code-queue:active-run-heartbeat-visible",
|
||||||
|
"code-queue:trace-gap-not-stale",
|
||||||
|
"code-queue:stale-active-owner-expired",
|
||||||
|
"code-queue:control-plane-split-brain-diagnostics",
|
||||||
|
"code-queue:oa-publisher-degraded-visible",
|
||||||
|
] as const;
|
||||||
|
|
||||||
|
type CodeQueueLivenessCheckName = typeof CODE_QUEUE_LIVENESS_CHECK_NAMES[number];
|
||||||
|
|
||||||
|
interface FixtureCheck {
|
||||||
|
name: CodeQueueLivenessCheckName;
|
||||||
|
ok: boolean;
|
||||||
|
detail: Record<string, unknown>;
|
||||||
|
}
|
||||||
|
|
||||||
|
const now = "2026-05-19T00:10:00.000Z";
|
||||||
|
const freshAt = "2026-05-19T00:09:50.000Z";
|
||||||
|
const oldTraceAt = "2026-05-18T23:40:00.000Z";
|
||||||
|
const expiredAt = "2026-05-18T23:50:00.000Z";
|
||||||
|
|
||||||
|
function assertCondition(condition: unknown, message: string, detail: Record<string, unknown> = {}): void {
|
||||||
|
if (!condition) throw new Error(`${message}: ${JSON.stringify(detail)}`);
|
||||||
|
}
|
||||||
|
|
||||||
|
function fixtureTask(id: string, status: TaskStatus, heartbeat: SchedulerActiveRunHeartbeat | null = null): QueueTask {
|
||||||
|
return {
|
||||||
|
id,
|
||||||
|
queueId: "default",
|
||||||
|
queueEnteredAt: "2026-05-19T00:00:00.000Z",
|
||||||
|
prompt: `${id} prompt`,
|
||||||
|
basePrompt: `${id} prompt`,
|
||||||
|
referenceTaskIds: [],
|
||||||
|
referenceInjection: null,
|
||||||
|
providerId: "D601",
|
||||||
|
cwd: "/workspace",
|
||||||
|
model: "gpt-5.5",
|
||||||
|
reasoningEffort: null,
|
||||||
|
executionMode: "default",
|
||||||
|
maxAttempts: 99,
|
||||||
|
status,
|
||||||
|
createdAt: "2026-05-19T00:00:00.000Z",
|
||||||
|
updatedAt: "2026-05-19T00:00:00.000Z",
|
||||||
|
startedAt: status === "running" || status === "judging" ? "2026-05-19T00:00:00.000Z" : null,
|
||||||
|
finishedAt: null,
|
||||||
|
readAt: null,
|
||||||
|
currentAttempt: status === "queued" ? 0 : 1,
|
||||||
|
currentMode: status === "queued" ? null : "initial",
|
||||||
|
codexThreadId: heartbeat?.codexThreadId ?? null,
|
||||||
|
activeTurnId: heartbeat?.activeTurnId ?? null,
|
||||||
|
schedulerHeartbeat: heartbeat,
|
||||||
|
finalResponse: "",
|
||||||
|
outputMaxSeq: heartbeat?.outputMaxSeq ?? 0,
|
||||||
|
lastError: null,
|
||||||
|
lastJudge: null,
|
||||||
|
judgeFailCount: 0,
|
||||||
|
promptHistory: [],
|
||||||
|
output: [],
|
||||||
|
events: [],
|
||||||
|
attempts: [],
|
||||||
|
cancelRequested: false,
|
||||||
|
nextPrompt: null,
|
||||||
|
nextMode: null,
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
function heartbeat(taskId: string, at: string, overrides: Partial<SchedulerActiveRunHeartbeat> = {}): SchedulerActiveRunHeartbeat {
|
||||||
|
return {
|
||||||
|
taskId,
|
||||||
|
queueId: "default",
|
||||||
|
attempt: 1,
|
||||||
|
activeTurnId: "turn_fixture",
|
||||||
|
codexThreadId: "thread_fixture",
|
||||||
|
owner: "D601",
|
||||||
|
schedulerInstance: "code-queue-scheduler-fixture",
|
||||||
|
executionPlane: "scheduler-execution-plane",
|
||||||
|
agentPort: "codex",
|
||||||
|
status: "running",
|
||||||
|
lastLocalHeartbeatAt: at,
|
||||||
|
lastObservedAgentEventAt: at,
|
||||||
|
lastPersistedTraceAt: at,
|
||||||
|
outputMaxSeq: 10,
|
||||||
|
source: "scheduler",
|
||||||
|
...overrides,
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
function activeRun(taskId: string, queueId = "default"): ActiveRun {
|
||||||
|
return {
|
||||||
|
taskId,
|
||||||
|
queueId,
|
||||||
|
app: { stop: () => undefined },
|
||||||
|
port: "codex",
|
||||||
|
threadId: "thread_fixture",
|
||||||
|
turnId: "turn_fixture",
|
||||||
|
startedAt: "2026-05-19T00:00:00.000Z",
|
||||||
|
lastLocalHeartbeatAt: freshAt,
|
||||||
|
lastObservedAgentEventAt: freshAt,
|
||||||
|
lastPersistedTraceAt: freshAt,
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
function schedulerDiagnostics(tasks: QueueTask[], activeRuns: ActiveRun[] = [], oaPublisher: unknown = null): CodeQueueExecutionDiagnostics {
|
||||||
|
return buildExecutionDiagnostics({
|
||||||
|
now,
|
||||||
|
controlPlane: "D601-code-queue-scheduler",
|
||||||
|
executionStateSource: "scheduler-execution-plane",
|
||||||
|
tasks,
|
||||||
|
activeRuns,
|
||||||
|
activeRunSlotCount: activeRuns.length,
|
||||||
|
activeQueueIds: activeRuns.map((run) => run.queueId),
|
||||||
|
processingQueueIds: [],
|
||||||
|
orphanedActiveTaskIds: tasks.filter((task) => (task.status === "running" || task.status === "judging") && !activeRuns.some((run) => run.taskId === task.id)).map((task) => task.id),
|
||||||
|
oaPublisher: oaPublisher as never,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
function checkActiveRunHeartbeatVisible(): FixtureCheck {
|
||||||
|
const task = fixtureTask("codex_fixture_active_1", "running");
|
||||||
|
const run = activeRun(task.id);
|
||||||
|
task.schedulerHeartbeat = buildSchedulerHeartbeat(task, run, {
|
||||||
|
now: freshAt,
|
||||||
|
owner: "D601",
|
||||||
|
schedulerInstance: "code-queue-scheduler-fixture",
|
||||||
|
agentPort: "codex",
|
||||||
|
lastObservedAgentEventAt: freshAt,
|
||||||
|
lastPersistedTraceAt: freshAt,
|
||||||
|
});
|
||||||
|
const diagnostics = schedulerDiagnostics([task], [run]);
|
||||||
|
assertCondition(diagnostics.activeHeartbeatTaskIds.includes(task.id), "active heartbeat task id must be visible", diagnostics as unknown as Record<string, unknown>);
|
||||||
|
assertCondition(diagnostics.schedulerActiveTaskIds.includes(task.id), "scheduler active task id must be visible", diagnostics as unknown as Record<string, unknown>);
|
||||||
|
assertCondition(diagnostics.lastSchedulerHeartbeatAt === freshAt, "last scheduler heartbeat must be surfaced", diagnostics as unknown as Record<string, unknown>);
|
||||||
|
return {
|
||||||
|
name: "code-queue:active-run-heartbeat-visible",
|
||||||
|
ok: true,
|
||||||
|
detail: {
|
||||||
|
state: diagnostics.state,
|
||||||
|
schedulerActiveTaskIds: diagnostics.schedulerActiveTaskIds,
|
||||||
|
activeHeartbeatTaskIds: diagnostics.activeHeartbeatTaskIds,
|
||||||
|
lastSchedulerHeartbeatAt: diagnostics.lastSchedulerHeartbeatAt,
|
||||||
|
heartbeat: task.schedulerHeartbeat,
|
||||||
|
},
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
function checkTraceGapNotStale(): FixtureCheck {
|
||||||
|
const task = fixtureTask("codex_fixture_trace_gap_1", "running", heartbeat("codex_fixture_trace_gap_1", freshAt, {
|
||||||
|
lastPersistedTraceAt: oldTraceAt,
|
||||||
|
outputMaxSeq: 89112,
|
||||||
|
}));
|
||||||
|
const decision = staleRecoveryCandidate({ task, localActive: false, now });
|
||||||
|
const hasTraceGap = taskHasTraceGapButFreshHeartbeat(task, now);
|
||||||
|
const diagnostics = schedulerDiagnostics([task], []);
|
||||||
|
assertCondition(hasTraceGap, "trace gap with fresh owner heartbeat should be classified", { decision, diagnostics });
|
||||||
|
assertCondition(decision.allowed === false && decision.reason === "owner-heartbeat-fresh", "fresh heartbeat must block stale retry", { decision });
|
||||||
|
assertCondition(diagnostics.traceGapNotStaleTaskIds.includes(task.id), "diagnostics must expose trace gap as not stale", diagnostics as unknown as Record<string, unknown>);
|
||||||
|
assertCondition(!diagnostics.staleRecoveryCandidateTaskIds.includes(task.id), "trace gap must not enter stale recovery candidates", diagnostics as unknown as Record<string, unknown>);
|
||||||
|
return {
|
||||||
|
name: "code-queue:trace-gap-not-stale",
|
||||||
|
ok: true,
|
||||||
|
detail: {
|
||||||
|
decision,
|
||||||
|
traceGapNotStaleTaskIds: diagnostics.traceGapNotStaleTaskIds,
|
||||||
|
staleRecoveryCandidateTaskIds: diagnostics.staleRecoveryCandidateTaskIds,
|
||||||
|
},
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
function checkStaleActiveOwnerExpired(): FixtureCheck {
|
||||||
|
const task = fixtureTask("codex_fixture_stale_1", "running", heartbeat("codex_fixture_stale_1", expiredAt, {
|
||||||
|
lastObservedAgentEventAt: expiredAt,
|
||||||
|
lastPersistedTraceAt: expiredAt,
|
||||||
|
}));
|
||||||
|
const decision = staleRecoveryCandidate({ task, localActive: false, now });
|
||||||
|
const diagnostics = schedulerDiagnostics([task], []);
|
||||||
|
assertCondition(decision.allowed === true && decision.reason === "owner-heartbeat-expired", "expired owner heartbeat should be the stale recovery gate", { decision });
|
||||||
|
assertCondition(diagnostics.state === "stale-active", "diagnostics must mark stale active only after owner heartbeat expiry", diagnostics as unknown as Record<string, unknown>);
|
||||||
|
assertCondition(diagnostics.staleRecoveryCandidateTaskIds.includes(task.id), "expired owner heartbeat should create a stale candidate", diagnostics as unknown as Record<string, unknown>);
|
||||||
|
return {
|
||||||
|
name: "code-queue:stale-active-owner-expired",
|
||||||
|
ok: true,
|
||||||
|
detail: {
|
||||||
|
decision,
|
||||||
|
state: diagnostics.state,
|
||||||
|
staleRecoveryCandidateTaskIds: diagnostics.staleRecoveryCandidateTaskIds,
|
||||||
|
},
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
function checkControlPlaneSplitBrainDiagnostics(): FixtureCheck {
|
||||||
|
const task = fixtureTask("codex_fixture_split_1", "running", heartbeat("codex_fixture_split_1", freshAt));
|
||||||
|
const diagnostics = buildExecutionDiagnostics({
|
||||||
|
now,
|
||||||
|
controlPlane: "master-code-queue-mgr",
|
||||||
|
executionStateSource: "postgres-control-plane",
|
||||||
|
tasks: [task],
|
||||||
|
activeRuns: [],
|
||||||
|
activeRunSlotCount: 0,
|
||||||
|
oaPublisher: null,
|
||||||
|
});
|
||||||
|
assertCondition(diagnostics.state === "split-brain" && diagnostics.splitBrain === true, "master postgres-control-plane must report split-brain when DB active has fresh scheduler heartbeat", diagnostics as unknown as Record<string, unknown>);
|
||||||
|
assertCondition(diagnostics.schedulerActiveRunSlotCount === 0 && diagnostics.databaseActiveTaskCount === 1, "split-brain fixture should preserve the exact control-plane divergence", diagnostics as unknown as Record<string, unknown>);
|
||||||
|
return {
|
||||||
|
name: "code-queue:control-plane-split-brain-diagnostics",
|
||||||
|
ok: true,
|
||||||
|
detail: {
|
||||||
|
state: diagnostics.state,
|
||||||
|
splitBrain: diagnostics.splitBrain,
|
||||||
|
executionStateSource: diagnostics.executionStateSource,
|
||||||
|
databaseActiveTaskIds: diagnostics.databaseActiveTaskIds,
|
||||||
|
schedulerActiveRunSlotCount: diagnostics.schedulerActiveRunSlotCount,
|
||||||
|
heartbeatFreshTaskIds: diagnostics.heartbeatFreshTaskIds,
|
||||||
|
},
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
function checkOaPublisherDegradedVisible(): FixtureCheck {
|
||||||
|
const oaPublisher = { pending: 3, lastError: "fixture OA publish retry", lastPublishedAt: null };
|
||||||
|
const diagnostics = schedulerDiagnostics([], [], oaPublisher);
|
||||||
|
assertCondition(diagnostics.state === "degraded", "OA publisher pending/lastError must degrade diagnostics", diagnostics as unknown as Record<string, unknown>);
|
||||||
|
assertCondition(diagnostics.oaPublisher === oaPublisher, "OA publisher detail must remain visible", diagnostics as unknown as Record<string, unknown>);
|
||||||
|
return {
|
||||||
|
name: "code-queue:oa-publisher-degraded-visible",
|
||||||
|
ok: true,
|
||||||
|
detail: {
|
||||||
|
state: diagnostics.state,
|
||||||
|
degraded: diagnostics.degraded,
|
||||||
|
oaPublisher: diagnostics.oaPublisher as Record<string, unknown>,
|
||||||
|
},
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
export function runCodeQueueLivenessFixtureChecks(only: string[] = []): { ok: boolean; checks: FixtureCheck[]; failures: Array<{ name: string; error: string }> } {
|
||||||
|
const selected = new Set(only.filter((name) => name.trim().length > 0));
|
||||||
|
const runners: Array<[CodeQueueLivenessCheckName, () => FixtureCheck]> = [
|
||||||
|
["code-queue:active-run-heartbeat-visible", checkActiveRunHeartbeatVisible],
|
||||||
|
["code-queue:trace-gap-not-stale", checkTraceGapNotStale],
|
||||||
|
["code-queue:stale-active-owner-expired", checkStaleActiveOwnerExpired],
|
||||||
|
["code-queue:control-plane-split-brain-diagnostics", checkControlPlaneSplitBrainDiagnostics],
|
||||||
|
["code-queue:oa-publisher-degraded-visible", checkOaPublisherDegradedVisible],
|
||||||
|
];
|
||||||
|
const checks: FixtureCheck[] = [];
|
||||||
|
const failures: Array<{ name: string; error: string }> = [];
|
||||||
|
for (const [name, run] of runners) {
|
||||||
|
if (selected.size > 0 && !selected.has(name)) continue;
|
||||||
|
try {
|
||||||
|
checks.push(run());
|
||||||
|
} catch (error) {
|
||||||
|
failures.push({ name, error: error instanceof Error ? error.message : String(error) });
|
||||||
|
checks.push({ name, ok: false, detail: { error: error instanceof Error ? error.message : String(error) } });
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (checks.length === 0) throw new Error(`no Code Queue liveness fixture checks matched: ${Array.from(selected).join(", ")}`);
|
||||||
|
return { ok: failures.length === 0, checks, failures };
|
||||||
|
}
|
||||||
@@ -197,6 +197,52 @@ function compactLastAssistant(value: unknown, full: boolean): Record<string, unk
|
|||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
function compactSchedulerHeartbeat(value: unknown): Record<string, unknown> | null {
|
||||||
|
const record = asRecord(value);
|
||||||
|
if (record === null) return null;
|
||||||
|
return {
|
||||||
|
taskId: record.taskId ?? null,
|
||||||
|
attempt: record.attempt ?? null,
|
||||||
|
owner: record.owner ?? null,
|
||||||
|
schedulerInstance: record.schedulerInstance ?? null,
|
||||||
|
agentPort: record.agentPort ?? null,
|
||||||
|
activeTurnId: record.activeTurnId ?? null,
|
||||||
|
codexThreadId: record.codexThreadId ?? null,
|
||||||
|
lastLocalHeartbeatAt: record.lastLocalHeartbeatAt ?? null,
|
||||||
|
lastObservedAgentEventAt: record.lastObservedAgentEventAt ?? null,
|
||||||
|
lastPersistedTraceAt: record.lastPersistedTraceAt ?? null,
|
||||||
|
outputMaxSeq: record.outputMaxSeq ?? null,
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
function compactExecutionDiagnostics(value: unknown): Record<string, unknown> | null {
|
||||||
|
const record = asRecord(value);
|
||||||
|
if (record === null) return null;
|
||||||
|
return {
|
||||||
|
state: record.state ?? record.health ?? null,
|
||||||
|
degraded: record.degraded ?? null,
|
||||||
|
splitBrain: record.splitBrain ?? null,
|
||||||
|
executionStateSource: record.executionStateSource ?? null,
|
||||||
|
controlPlane: record.controlPlane ?? null,
|
||||||
|
databaseActiveTaskCount: record.databaseActiveTaskCount ?? null,
|
||||||
|
databaseActiveTaskIds: record.databaseActiveTaskIds ?? [],
|
||||||
|
schedulerActiveRunSlotCount: record.schedulerActiveRunSlotCount ?? null,
|
||||||
|
schedulerActiveTaskIds: record.schedulerActiveTaskIds ?? [],
|
||||||
|
activeHeartbeatTaskIds: record.activeHeartbeatTaskIds ?? [],
|
||||||
|
heartbeatFreshTaskIds: record.heartbeatFreshTaskIds ?? [],
|
||||||
|
heartbeatExpiredTaskIds: record.heartbeatExpiredTaskIds ?? [],
|
||||||
|
heartbeatMissingTaskIds: record.heartbeatMissingTaskIds ?? [],
|
||||||
|
staleRecoveryCandidateTaskIds: record.staleRecoveryCandidateTaskIds ?? [],
|
||||||
|
traceGapTaskIds: record.traceGapTaskIds ?? [],
|
||||||
|
traceGapNotStaleTaskIds: record.traceGapNotStaleTaskIds ?? [],
|
||||||
|
lastSchedulerHeartbeatAt: record.lastSchedulerHeartbeatAt ?? null,
|
||||||
|
lastObservedAgentEventAt: record.lastObservedAgentEventAt ?? null,
|
||||||
|
lastPersistedTraceAt: record.lastPersistedTraceAt ?? null,
|
||||||
|
oaPublisher: record.oaPublisher ?? null,
|
||||||
|
reasons: record.reasons ?? [],
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
function compactToolSummary(value: unknown, full: boolean): Record<string, unknown> {
|
function compactToolSummary(value: unknown, full: boolean): Record<string, unknown> {
|
||||||
const record = asRecord(value) ?? {};
|
const record = asRecord(value) ?? {};
|
||||||
const items = asArray(record.items).map((item) => {
|
const items = asArray(record.items).map((item) => {
|
||||||
@@ -252,6 +298,7 @@ function compactSummary(summary: unknown, options: CodexTaskOptions, taskId: str
|
|||||||
codexThreadId: record.codexThreadId ?? null,
|
codexThreadId: record.codexThreadId ?? null,
|
||||||
activeTurnId: record.activeTurnId ?? null,
|
activeTurnId: record.activeTurnId ?? null,
|
||||||
cancelRequested: record.cancelRequested ?? null,
|
cancelRequested: record.cancelRequested ?? null,
|
||||||
|
schedulerHeartbeat: compactSchedulerHeartbeat(record.schedulerHeartbeat),
|
||||||
},
|
},
|
||||||
timing: record.timing ?? null,
|
timing: record.timing ?? null,
|
||||||
createdAt: record.createdAt ?? null,
|
createdAt: record.createdAt ?? null,
|
||||||
@@ -718,7 +765,12 @@ function requireMergeTargetQueueId(args: string[], command: string): string {
|
|||||||
}
|
}
|
||||||
|
|
||||||
function codeQueues(): unknown {
|
function codeQueues(): unknown {
|
||||||
return unwrapCodexResponse(coreInternalFetch(codeQueueProxyPath("/api/queues")));
|
const response = unwrapCodexResponse(coreInternalFetch(codeQueueProxyPath("/api/queues")));
|
||||||
|
return {
|
||||||
|
upstream: response.upstream,
|
||||||
|
queues: response.body.queues ?? [],
|
||||||
|
queue: compactQueueMutationSummary(response.body.queue),
|
||||||
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
function codexCreateQueue(queueId: string): unknown {
|
function codexCreateQueue(queueId: string): unknown {
|
||||||
@@ -824,6 +876,7 @@ function compactTaskMutationResponse(task: unknown, options: CompactTaskMutation
|
|||||||
maxAttempts: record.maxAttempts ?? null,
|
maxAttempts: record.maxAttempts ?? null,
|
||||||
currentAttempt: record.currentAttempt ?? null,
|
currentAttempt: record.currentAttempt ?? null,
|
||||||
cancelRequested: record.cancelRequested ?? null,
|
cancelRequested: record.cancelRequested ?? null,
|
||||||
|
schedulerHeartbeat: compactSchedulerHeartbeat(record.schedulerHeartbeat),
|
||||||
createdAt: record.createdAt ?? null,
|
createdAt: record.createdAt ?? null,
|
||||||
startedAt: record.startedAt ?? null,
|
startedAt: record.startedAt ?? null,
|
||||||
updatedAt: record.updatedAt ?? null,
|
updatedAt: record.updatedAt ?? null,
|
||||||
@@ -845,6 +898,10 @@ function compactQueueMutationSummary(value: unknown): Record<string, unknown> |
|
|||||||
return {
|
return {
|
||||||
activeQueueIds: record.activeQueueIds ?? null,
|
activeQueueIds: record.activeQueueIds ?? null,
|
||||||
activeTaskIds: record.activeTaskIds ?? null,
|
activeTaskIds: record.activeTaskIds ?? null,
|
||||||
|
databaseActiveTaskCount: record.databaseActiveTaskCount ?? null,
|
||||||
|
databaseActiveTaskIds: record.databaseActiveTaskIds ?? null,
|
||||||
|
schedulerHeartbeatStaleMs: record.schedulerHeartbeatStaleMs ?? null,
|
||||||
|
executionDiagnostics: compactExecutionDiagnostics(record.executionDiagnostics),
|
||||||
queuedTaskIds: record.queuedTaskIds ?? null,
|
queuedTaskIds: record.queuedTaskIds ?? null,
|
||||||
counts: record.counts ?? null,
|
counts: record.counts ?? null,
|
||||||
byQueue: Array.isArray(record.byQueue) ? record.byQueue : undefined,
|
byQueue: Array.isArray(record.byQueue) ? record.byQueue : undefined,
|
||||||
|
|||||||
@@ -3,6 +3,7 @@ import { connect } from "node:net";
|
|||||||
import { join } from "node:path";
|
import { join } from "node:path";
|
||||||
import { chromium, type Page } from "playwright";
|
import { chromium, type Page } from "playwright";
|
||||||
import { createRouteRegistry, MODULES } from "../../src/components/frontend/src/navigation";
|
import { createRouteRegistry, MODULES } from "../../src/components/frontend/src/navigation";
|
||||||
|
import { CODE_QUEUE_LIVENESS_CHECK_NAMES, runCodeQueueLivenessFixtureChecks } from "./code-queue-liveness-fixtures";
|
||||||
import { runCommand } from "./command";
|
import { runCommand } from "./command";
|
||||||
import { type UniDeskConfig, repoRoot, rootPath } from "./config";
|
import { type UniDeskConfig, repoRoot, rootPath } from "./config";
|
||||||
import { boundedJsonDetail } from "./preview";
|
import { boundedJsonDetail } from "./preview";
|
||||||
@@ -166,11 +167,14 @@ const FRONTEND_CHECK_NAMES = [
|
|||||||
"frontend:no-console-errors",
|
"frontend:no-console-errors",
|
||||||
] as const;
|
] as const;
|
||||||
|
|
||||||
|
const CODE_QUEUE_FIXTURE_CHECK_NAMES = [...CODE_QUEUE_LIVENESS_CHECK_NAMES] as const;
|
||||||
|
|
||||||
const ALL_E2E_CHECK_NAMES = [
|
const ALL_E2E_CHECK_NAMES = [
|
||||||
...NETWORK_CHECK_NAMES,
|
...NETWORK_CHECK_NAMES,
|
||||||
...SERVICE_CHECK_NAMES,
|
...SERVICE_CHECK_NAMES,
|
||||||
...DATABASE_CHECK_NAMES,
|
...DATABASE_CHECK_NAMES,
|
||||||
...FRONTEND_CHECK_NAMES,
|
...FRONTEND_CHECK_NAMES,
|
||||||
|
...CODE_QUEUE_FIXTURE_CHECK_NAMES,
|
||||||
] as const;
|
] as const;
|
||||||
|
|
||||||
function uniqueText(values: string[]): string[] {
|
function uniqueText(values: string[]): string[] {
|
||||||
@@ -552,6 +556,14 @@ function addSelectedCheck(checks: E2ECheck[], options: E2ERunOptions, name: stri
|
|||||||
addCheck(checks, name, passed, detail);
|
addCheck(checks, name, passed, detail);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
function codeQueueFixtureChecks(checks: E2ECheck[], options: E2ERunOptions): void {
|
||||||
|
const selected = CODE_QUEUE_FIXTURE_CHECK_NAMES.filter((name) => wantsCheck(options, name));
|
||||||
|
const result = runCodeQueueLivenessFixtureChecks(selected);
|
||||||
|
for (const check of result.checks) {
|
||||||
|
addSelectedCheck(checks, options, check.name, check.ok, check.detail);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
function safeTestId(value: string): string {
|
function safeTestId(value: string): string {
|
||||||
return value.replace(/[^a-zA-Z0-9_-]/g, "_");
|
return value.replace(/[^a-zA-Z0-9_-]/g, "_");
|
||||||
}
|
}
|
||||||
@@ -3420,8 +3432,14 @@ export async function runE2E(
|
|||||||
const needDatabase = wantsPrefix(options, "database")
|
const needDatabase = wantsPrefix(options, "database")
|
||||||
|| wantsCheck(options, "frontend:task-history-diagnostics");
|
|| wantsCheck(options, "frontend:task-history-diagnostics");
|
||||||
const needFrontend = wantsPrefix(options, "frontend");
|
const needFrontend = wantsPrefix(options, "frontend");
|
||||||
|
const needCodeQueueFixtures = wantsAnyCheck(options, [...CODE_QUEUE_FIXTURE_CHECK_NAMES]);
|
||||||
const executedSections: string[] = [];
|
const executedSections: string[] = [];
|
||||||
|
|
||||||
|
if (needCodeQueueFixtures) {
|
||||||
|
executedSections.push("code-queue-fixtures");
|
||||||
|
codeQueueFixtureChecks(checks, options);
|
||||||
|
}
|
||||||
|
|
||||||
if (needNetwork) {
|
if (needNetwork) {
|
||||||
executedSections.push("network");
|
executedSections.push("network");
|
||||||
await exposureChecks(config, urls, checks, options);
|
await exposureChecks(config, urls, checks, options);
|
||||||
|
|||||||
@@ -2169,6 +2169,81 @@ input:focus, select:focus, textarea:focus { border-color: var(--accent-2); }
|
|||||||
.codex-trace-status-chip.service {
|
.codex-trace-status-chip.service {
|
||||||
white-space: normal;
|
white-space: normal;
|
||||||
}
|
}
|
||||||
|
.codex-trace-status-chip.liveness.ok {
|
||||||
|
border-color: rgba(78, 183, 168, 0.50);
|
||||||
|
color: var(--accent-2);
|
||||||
|
}
|
||||||
|
.codex-trace-status-chip.liveness.warn {
|
||||||
|
border-color: rgba(215, 161, 58, 0.55);
|
||||||
|
color: #ffe0a2;
|
||||||
|
}
|
||||||
|
.codex-trace-status-chip.liveness.failed {
|
||||||
|
border-color: rgba(255, 98, 98, 0.58);
|
||||||
|
color: #ffb2b2;
|
||||||
|
}
|
||||||
|
.codex-liveness-panel .panel-body {
|
||||||
|
display: grid;
|
||||||
|
gap: 8px;
|
||||||
|
}
|
||||||
|
.codex-liveness-grid {
|
||||||
|
display: grid;
|
||||||
|
grid-template-columns: repeat(5, minmax(130px, 1fr));
|
||||||
|
gap: 8px;
|
||||||
|
min-width: 0;
|
||||||
|
}
|
||||||
|
.codex-liveness-metric {
|
||||||
|
display: grid;
|
||||||
|
gap: 4px;
|
||||||
|
min-width: 0;
|
||||||
|
padding: 8px;
|
||||||
|
border: 1px solid rgba(78, 183, 168, 0.22);
|
||||||
|
background:
|
||||||
|
linear-gradient(135deg, rgba(78, 183, 168, 0.08), rgba(255,255,255,0.015)),
|
||||||
|
rgba(0,0,0,0.16);
|
||||||
|
}
|
||||||
|
.codex-liveness-metric.warn {
|
||||||
|
border-color: rgba(215, 161, 58, 0.44);
|
||||||
|
}
|
||||||
|
.codex-liveness-metric.failed {
|
||||||
|
border-color: rgba(255, 98, 98, 0.46);
|
||||||
|
}
|
||||||
|
.codex-liveness-metric.ok {
|
||||||
|
border-color: rgba(78, 183, 168, 0.45);
|
||||||
|
}
|
||||||
|
.codex-liveness-metric span {
|
||||||
|
color: var(--muted);
|
||||||
|
font-size: 10px;
|
||||||
|
letter-spacing: 0.12em;
|
||||||
|
text-transform: uppercase;
|
||||||
|
}
|
||||||
|
.codex-liveness-metric strong {
|
||||||
|
min-width: 0;
|
||||||
|
color: var(--text);
|
||||||
|
font-size: 15px;
|
||||||
|
font-weight: 700;
|
||||||
|
overflow-wrap: anywhere;
|
||||||
|
}
|
||||||
|
.codex-liveness-metric code {
|
||||||
|
min-width: 0;
|
||||||
|
color: var(--muted);
|
||||||
|
font-size: 11px;
|
||||||
|
overflow-wrap: anywhere;
|
||||||
|
}
|
||||||
|
.codex-liveness-reasons {
|
||||||
|
display: flex;
|
||||||
|
flex-wrap: wrap;
|
||||||
|
gap: 5px;
|
||||||
|
min-width: 0;
|
||||||
|
}
|
||||||
|
.codex-liveness-reasons span {
|
||||||
|
max-width: 100%;
|
||||||
|
padding: 4px 7px;
|
||||||
|
border: 1px solid rgba(215, 161, 58, 0.28);
|
||||||
|
color: #ffe0a2;
|
||||||
|
background: rgba(215, 161, 58, 0.07);
|
||||||
|
font-size: 11px;
|
||||||
|
overflow-wrap: anywhere;
|
||||||
|
}
|
||||||
.codex-mark-all-read-btn {
|
.codex-mark-all-read-btn {
|
||||||
border-color: rgba(78, 183, 168, 0.40);
|
border-color: rgba(78, 183, 168, 0.40);
|
||||||
color: #bdece4;
|
color: #bdece4;
|
||||||
@@ -5490,7 +5565,7 @@ input:focus, select:focus, textarea:focus { border-color: var(--accent-2); }
|
|||||||
}
|
}
|
||||||
|
|
||||||
@media (max-width: 1120px) {
|
@media (max-width: 1120px) {
|
||||||
.metric-grid, .policy-grid, .security-board, .docker-metrics, .monitor-chart-grid, .monitor-summary-grid, .performance-metric-stack, .codex-load-test-grid, .baidu-doc-grid, .filebrowser-target-grid, .oa-flow-metrics { grid-template-columns: repeat(2, minmax(0, 1fr)); }
|
.metric-grid, .policy-grid, .security-board, .docker-metrics, .monitor-chart-grid, .monitor-summary-grid, .performance-metric-stack, .codex-load-test-grid, .codex-liveness-grid, .baidu-doc-grid, .filebrowser-target-grid, .oa-flow-metrics { grid-template-columns: repeat(2, minmax(0, 1fr)); }
|
||||||
.pipeline-oa-guarantees { grid-template-columns: repeat(2, minmax(0, 1fr)); }
|
.pipeline-oa-guarantees { grid-template-columns: repeat(2, minmax(0, 1fr)); }
|
||||||
.dispatch-form, .schedule-form { grid-template-columns: 1fr 1fr; }
|
.dispatch-form, .schedule-form { grid-template-columns: 1fr 1fr; }
|
||||||
.dispatch-actions { align-items: center; }
|
.dispatch-actions { align-items: center; }
|
||||||
@@ -5680,7 +5755,7 @@ input:focus, select:focus, textarea:focus { border-color: var(--accent-2); }
|
|||||||
padding: 4px 9px;
|
padding: 4px 9px;
|
||||||
white-space: nowrap;
|
white-space: nowrap;
|
||||||
}
|
}
|
||||||
.metric-grid, .policy-grid, .security-board, .dispatch-form, .schedule-form, .schedule-card-grid, .docker-metrics, .monitor-chart-grid, .monitor-summary-grid, .gateway-record-grid, .met-detail-kv, .code-queue-metrics, .codex-stats-summary-grid, .codex-form-grid, .baidu-doc-grid { grid-template-columns: 1fr; }
|
.metric-grid, .policy-grid, .security-board, .dispatch-form, .schedule-form, .schedule-card-grid, .docker-metrics, .monitor-chart-grid, .monitor-summary-grid, .gateway-record-grid, .met-detail-kv, .code-queue-metrics, .codex-stats-summary-grid, .codex-liveness-grid, .codex-form-grid, .baidu-doc-grid { grid-template-columns: 1fr; }
|
||||||
.compact-row, .heartbeat-row, .log-row, .endpoint-list article, .volume-route, .findjob-hero, .pipeline-hero, .code-queue-hero, .claudeqq-login-card, .baidu-login-card, .baidu-pathbar { grid-template-columns: 1fr; align-items: start; }
|
.compact-row, .heartbeat-row, .log-row, .endpoint-list article, .volume-route, .findjob-hero, .pipeline-hero, .code-queue-hero, .claudeqq-login-card, .baidu-login-card, .baidu-pathbar { grid-template-columns: 1fr; align-items: start; }
|
||||||
.codex-output-line { grid-template-columns: 1fr; }
|
.codex-output-line { grid-template-columns: 1fr; }
|
||||||
.codex-transcript { min-height: 360px; }
|
.codex-transcript { min-height: 360px; }
|
||||||
|
|||||||
@@ -219,6 +219,34 @@ function activeTaskIds(queue: any): string[] {
|
|||||||
return Array.isArray(queue?.activeTaskIds) ? queue.activeTaskIds.map((id: any) => String(id || "")).filter(Boolean) : [String(queue?.activeTaskId || "")].filter(Boolean);
|
return Array.isArray(queue?.activeTaskIds) ? queue.activeTaskIds.map((id: any) => String(id || "")).filter(Boolean) : [String(queue?.activeTaskId || "")].filter(Boolean);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
function stringArray(value: any): string[] {
|
||||||
|
return Array.isArray(value) ? value.map((item: any) => String(item || "")).filter(Boolean) : [];
|
||||||
|
}
|
||||||
|
|
||||||
|
function compactIdList(value: any, limit = 3): string {
|
||||||
|
const ids = stringArray(value);
|
||||||
|
if (ids.length === 0) return "--";
|
||||||
|
const visible = ids.slice(0, limit).join(" / ");
|
||||||
|
return ids.length > limit ? `${visible} +${ids.length - limit}` : visible;
|
||||||
|
}
|
||||||
|
|
||||||
|
function executionDiagnosticsFromQueue(queue: any, health: any): AnyRecord {
|
||||||
|
return objectRecord(queue?.executionDiagnostics)
|
||||||
|
|| objectRecord(health?.body?.queue?.executionDiagnostics)
|
||||||
|
|| objectRecord(health?.queue?.executionDiagnostics)
|
||||||
|
|| objectRecord(health?.body?.executionDiagnostics)
|
||||||
|
|| objectRecord(health?.executionDiagnostics)
|
||||||
|
|| {};
|
||||||
|
}
|
||||||
|
|
||||||
|
function diagnosticsTone(state: any): string {
|
||||||
|
const value = String(state || "unknown").toLowerCase();
|
||||||
|
if (value === "healthy") return "ok";
|
||||||
|
if (value === "split-brain" || value === "stale-active") return "failed";
|
||||||
|
if (value === "degraded") return "warn";
|
||||||
|
return "unknown";
|
||||||
|
}
|
||||||
|
|
||||||
const allQueuesId = "__all__";
|
const allQueuesId = "__all__";
|
||||||
const queueMobileMediaQuery = "(max-width: 760px)";
|
const queueMobileMediaQuery = "(max-width: 760px)";
|
||||||
const queueDesktopMediaQuery = "(min-width: 761px)";
|
const queueDesktopMediaQuery = "(min-width: 761px)";
|
||||||
@@ -1536,6 +1564,48 @@ function CodexStatsIcon() {
|
|||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
function LivenessMetric({ label, value, hint, tone }: AnyRecord) {
|
||||||
|
return h("div", { className: `codex-liveness-metric ${tone || ""}` },
|
||||||
|
h("span", null, label),
|
||||||
|
h("strong", null, value ?? "--"),
|
||||||
|
hint ? h("code", null, hint) : null,
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
function CodeQueueLivenessPanel({ diagnostics, queue, onRaw }: AnyRecord) {
|
||||||
|
const state = String(diagnostics?.state || diagnostics?.health || "unknown");
|
||||||
|
const oaPublisher = objectRecord(diagnostics?.oaPublisher);
|
||||||
|
const reasons = stringArray(diagnostics?.reasons).slice(0, 3);
|
||||||
|
const tone = diagnosticsTone(state);
|
||||||
|
return h(Panel, {
|
||||||
|
title: "执行活性",
|
||||||
|
eyebrow: `${String(diagnostics?.executionStateSource || queue?.executionStateSource || "unknown")} / ${String(diagnostics?.controlPlane || "code-queue")}`,
|
||||||
|
summary: h("div", { className: "codex-trace-status" },
|
||||||
|
h("span", { className: `codex-trace-status-chip liveness ${tone}` }, h("b", null, "状态"), state),
|
||||||
|
h("span", { className: "codex-trace-status-chip" }, h("b", null, "DB active"), String(diagnostics?.databaseActiveTaskCount ?? queue?.databaseActiveTaskCount ?? 0)),
|
||||||
|
h("span", { className: "codex-trace-status-chip" }, h("b", null, "scheduler slots"), String(diagnostics?.schedulerActiveRunSlotCount ?? queue?.activeRunSlotCount ?? 0)),
|
||||||
|
h("span", { className: "codex-trace-status-chip" }, h("b", null, "heartbeat"), `${stringArray(diagnostics?.heartbeatFreshTaskIds).length} fresh / ${stringArray(diagnostics?.heartbeatExpiredTaskIds).length} expired`),
|
||||||
|
oaPublisher ? h("span", { className: "codex-trace-status-chip" }, h("b", null, "OA"), `${Number(oaPublisher.pending || 0)} pending${oaPublisher.lastError ? " / error" : ""}`) : null,
|
||||||
|
),
|
||||||
|
actions: h(RawButton, { title: "Code Queue Execution Diagnostics", data: diagnostics, onOpen: onRaw, testId: "raw-code-queue-execution-diagnostics" }),
|
||||||
|
className: "codex-liveness-panel",
|
||||||
|
},
|
||||||
|
h("div", { className: "codex-liveness-grid", "data-testid": "codex-liveness-diagnostics" },
|
||||||
|
h(LivenessMetric, { tone, label: "健康状态", value: state, hint: diagnostics?.splitBrain ? "split-brain" : diagnostics?.degraded ? "degraded" : "ready" }),
|
||||||
|
h(LivenessMetric, { label: "PostgreSQL active", value: String(diagnostics?.databaseActiveTaskCount ?? queue?.databaseActiveTaskCount ?? 0), hint: compactIdList(diagnostics?.databaseActiveTaskIds ?? queue?.databaseActiveTaskIds) }),
|
||||||
|
h(LivenessMetric, { label: "Scheduler active", value: String(diagnostics?.schedulerActiveRunSlotCount ?? queue?.activeRunSlotCount ?? 0), hint: compactIdList(diagnostics?.schedulerActiveTaskIds ?? queue?.activeTaskIds) }),
|
||||||
|
h(LivenessMetric, { label: "Fresh heartbeat", value: String(stringArray(diagnostics?.heartbeatFreshTaskIds).length), hint: compactIdList(diagnostics?.heartbeatFreshTaskIds) }),
|
||||||
|
h(LivenessMetric, { tone: stringArray(diagnostics?.traceGapNotStaleTaskIds).length > 0 ? "warn" : "", label: "Trace gap", value: String(stringArray(diagnostics?.traceGapTaskIds).length), hint: compactIdList(diagnostics?.traceGapNotStaleTaskIds) }),
|
||||||
|
h(LivenessMetric, { tone: stringArray(diagnostics?.staleRecoveryCandidateTaskIds).length > 0 ? "failed" : "", label: "Stale candidates", value: String(stringArray(diagnostics?.staleRecoveryCandidateTaskIds).length), hint: compactIdList(diagnostics?.staleRecoveryCandidateTaskIds) }),
|
||||||
|
h(LivenessMetric, { label: "Last scheduler heartbeat", value: fmtRelativeAge(diagnostics?.lastSchedulerHeartbeatAt), hint: String(diagnostics?.lastSchedulerHeartbeatAt || "--") }),
|
||||||
|
h(LivenessMetric, { label: "Last agent event", value: fmtRelativeAge(diagnostics?.lastObservedAgentEventAt), hint: String(diagnostics?.lastObservedAgentEventAt || "--") }),
|
||||||
|
h(LivenessMetric, { label: "Last trace persist", value: fmtRelativeAge(diagnostics?.lastPersistedTraceAt), hint: String(diagnostics?.lastPersistedTraceAt || "--") }),
|
||||||
|
h(LivenessMetric, { tone: oaPublisher?.lastError ? "warn" : "", label: "OA publisher", value: `${Number(oaPublisher?.pending || 0)} pending`, hint: oaPublisher?.lastError ? shortText(oaPublisher.lastError, 90) : "ok" }),
|
||||||
|
),
|
||||||
|
reasons.length > 0 ? h("div", { className: "codex-liveness-reasons" }, reasons.map((reason: string) => h("span", { key: reason }, reason))) : null,
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
function CodexStatsPanel({ stats, queueName: activeQueueName, onRaw }: AnyRecord) {
|
function CodexStatsPanel({ stats, queueName: activeQueueName, onRaw }: AnyRecord) {
|
||||||
const rows = taskStatisticsRows(stats);
|
const rows = taskStatisticsRows(stats);
|
||||||
const totals = taskStatisticsTotals(stats);
|
const totals = taskStatisticsTotals(stats);
|
||||||
@@ -2112,6 +2182,7 @@ export function CodeQueuePage({ microservices, onRaw, apiBaseUrl = "/api", initi
|
|||||||
const tasks = applyLocalReadStateToRows(taskRows(tasksData));
|
const tasks = applyLocalReadStateToRows(taskRows(tasksData));
|
||||||
const loadedUnreadTerminalTasks = tasks.filter(taskIsUnreadTerminal);
|
const loadedUnreadTerminalTasks = tasks.filter(taskIsUnreadTerminal);
|
||||||
const queue = tasksData?.queue || health?.body?.queue || health?.queue || {};
|
const queue = tasksData?.queue || health?.body?.queue || health?.queue || {};
|
||||||
|
const executionDiagnostics = executionDiagnosticsFromQueue(queue, health);
|
||||||
const statistics = taskStatistics(tasksData, queue);
|
const statistics = taskStatistics(tasksData, queue);
|
||||||
const pagination = taskPagination(tasksData);
|
const pagination = taskPagination(tasksData);
|
||||||
const queueRows = knownQueueRows(queue, queueId);
|
const queueRows = knownQueueRows(queue, queueId);
|
||||||
@@ -3828,6 +3899,7 @@ export function CodeQueuePage({ microservices, onRaw, apiBaseUrl = "/api", initi
|
|||||||
h(UniDeskErrorBanner, { error, wide: true }),
|
h(UniDeskErrorBanner, { error, wide: true }),
|
||||||
mergeDialog,
|
mergeDialog,
|
||||||
h("div", { className: "codex-session-stage codex-session-stage-top" },
|
h("div", { className: "codex-session-stage codex-session-stage-top" },
|
||||||
|
h(CodeQueueLivenessPanel, { diagnostics: executionDiagnostics, queue, onRaw }),
|
||||||
sessionPanel,
|
sessionPanel,
|
||||||
),
|
),
|
||||||
h("div", { className: "code-queue-layout" },
|
h("div", { className: "code-queue-layout" },
|
||||||
|
|||||||
@@ -782,6 +782,13 @@ fn execution_diagnostics_from_tasks(tasks: &[TaskMeta], now: &str) -> Value {
|
|||||||
if !trace_gap_not_stale_task_ids.is_empty() {
|
if !trace_gap_not_stale_task_ids.is_empty() {
|
||||||
reasons.push("trace progress is stale while scheduler heartbeat is fresh; this is a trace gap, not stale active");
|
reasons.push("trace progress is stale while scheduler heartbeat is fresh; this is a trace gap, not stale active");
|
||||||
}
|
}
|
||||||
|
let database_active_task_count = database_active_task_ids.len();
|
||||||
|
let scheduler_orphaned_active_task_ids = database_active_task_ids.clone();
|
||||||
|
let scheduler_orphaned_active_task_count = scheduler_orphaned_active_task_ids.len();
|
||||||
|
let active_heartbeat_count = active_heartbeat_task_ids.len();
|
||||||
|
let last_scheduler_heartbeat_at = max_timestamp(active_heartbeats.iter().map(|(_, heartbeat)| heartbeat_string_field(heartbeat, "lastLocalHeartbeatAt")).collect());
|
||||||
|
let last_observed_agent_event_at = max_timestamp(active_heartbeats.iter().map(|(_, heartbeat)| heartbeat_string_field(heartbeat, "lastObservedAgentEventAt")).collect());
|
||||||
|
let last_persisted_trace_at = max_timestamp(active_heartbeats.iter().map(|(_, heartbeat)| heartbeat_string_field(heartbeat, "lastPersistedTraceAt")).collect());
|
||||||
|
|
||||||
json!({
|
json!({
|
||||||
"state": state,
|
"state": state,
|
||||||
@@ -791,16 +798,16 @@ fn execution_diagnostics_from_tasks(tasks: &[TaskMeta], now: &str) -> Value {
|
|||||||
"executionStateSource": "postgres-control-plane",
|
"executionStateSource": "postgres-control-plane",
|
||||||
"controlPlane": "master-code-queue-mgr",
|
"controlPlane": "master-code-queue-mgr",
|
||||||
"databaseActiveTaskIds": database_active_task_ids,
|
"databaseActiveTaskIds": database_active_task_ids,
|
||||||
"databaseActiveTaskCount": database_active_task_ids.len(),
|
"databaseActiveTaskCount": database_active_task_count,
|
||||||
"schedulerActiveTaskIds": [],
|
"schedulerActiveTaskIds": [],
|
||||||
"schedulerActiveTaskCount": 0,
|
"schedulerActiveTaskCount": 0,
|
||||||
"schedulerActiveRunSlotCount": 0,
|
"schedulerActiveRunSlotCount": 0,
|
||||||
"schedulerActiveQueueIds": [],
|
"schedulerActiveQueueIds": [],
|
||||||
"schedulerProcessingQueueIds": [],
|
"schedulerProcessingQueueIds": [],
|
||||||
"schedulerOrphanedActiveTaskIds": database_active_task_ids,
|
"schedulerOrphanedActiveTaskIds": scheduler_orphaned_active_task_ids,
|
||||||
"schedulerOrphanedActiveTaskCount": database_active_task_ids.len(),
|
"schedulerOrphanedActiveTaskCount": scheduler_orphaned_active_task_count,
|
||||||
"activeHeartbeatTaskIds": active_heartbeat_task_ids,
|
"activeHeartbeatTaskIds": active_heartbeat_task_ids,
|
||||||
"activeHeartbeatCount": active_heartbeat_task_ids.len(),
|
"activeHeartbeatCount": active_heartbeat_count,
|
||||||
"heartbeatFreshTaskIds": heartbeat_fresh_task_ids,
|
"heartbeatFreshTaskIds": heartbeat_fresh_task_ids,
|
||||||
"heartbeatExpiredTaskIds": heartbeat_expired_task_ids,
|
"heartbeatExpiredTaskIds": heartbeat_expired_task_ids,
|
||||||
"heartbeatMissingTaskIds": heartbeat_missing_task_ids,
|
"heartbeatMissingTaskIds": heartbeat_missing_task_ids,
|
||||||
@@ -809,9 +816,9 @@ fn execution_diagnostics_from_tasks(tasks: &[TaskMeta], now: &str) -> Value {
|
|||||||
"traceGapNotStaleTaskIds": trace_gap_not_stale_task_ids,
|
"traceGapNotStaleTaskIds": trace_gap_not_stale_task_ids,
|
||||||
"schedulerHeartbeatStaleMs": SCHEDULER_HEARTBEAT_STALE_MS,
|
"schedulerHeartbeatStaleMs": SCHEDULER_HEARTBEAT_STALE_MS,
|
||||||
"now": now,
|
"now": now,
|
||||||
"lastSchedulerHeartbeatAt": max_timestamp(active_heartbeats.iter().map(|(_, heartbeat)| heartbeat_string_field(heartbeat, "lastLocalHeartbeatAt")).collect()),
|
"lastSchedulerHeartbeatAt": last_scheduler_heartbeat_at,
|
||||||
"lastObservedAgentEventAt": max_timestamp(active_heartbeats.iter().map(|(_, heartbeat)| heartbeat_string_field(heartbeat, "lastObservedAgentEventAt")).collect()),
|
"lastObservedAgentEventAt": last_observed_agent_event_at,
|
||||||
"lastPersistedTraceAt": max_timestamp(active_heartbeats.iter().map(|(_, heartbeat)| heartbeat_string_field(heartbeat, "lastPersistedTraceAt")).collect()),
|
"lastPersistedTraceAt": last_persisted_trace_at,
|
||||||
"oaPublisher": Value::Null,
|
"oaPublisher": Value::Null,
|
||||||
"reasons": reasons,
|
"reasons": reasons,
|
||||||
"guidance": [
|
"guidance": [
|
||||||
@@ -1179,6 +1186,7 @@ fn queue_summary(state: &AppState) -> Result<Value, String> {
|
|||||||
let active_tasks: Vec<TaskMeta> = active_rows.iter().map(|row| row_to_task(row, true)).collect();
|
let active_tasks: Vec<TaskMeta> = active_rows.iter().map(|row| row_to_task(row, true)).collect();
|
||||||
let mut database_active_task_ids: Vec<String> = active_tasks.iter().map(|task| task.id.clone()).collect();
|
let mut database_active_task_ids: Vec<String> = active_tasks.iter().map(|task| task.id.clone()).collect();
|
||||||
database_active_task_ids.sort();
|
database_active_task_ids.sort();
|
||||||
|
let database_active_task_count = database_active_task_ids.len();
|
||||||
let execution_diagnostics = execution_diagnostics_from_tasks(&active_tasks, &now_iso());
|
let execution_diagnostics = execution_diagnostics_from_tasks(&active_tasks, &now_iso());
|
||||||
if !queues.iter().any(|queue| queue.get("id").and_then(Value::as_str) == Some(DEFAULT_QUEUE_ID)) {
|
if !queues.iter().any(|queue| queue.get("id").and_then(Value::as_str) == Some(DEFAULT_QUEUE_ID)) {
|
||||||
let now = now_iso();
|
let now = now_iso();
|
||||||
@@ -1215,7 +1223,7 @@ fn queue_summary(state: &AppState) -> Result<Value, String> {
|
|||||||
"activeTaskIds": [],
|
"activeTaskIds": [],
|
||||||
"activeTaskId": Value::Null,
|
"activeTaskId": Value::Null,
|
||||||
"databaseActiveTaskIds": database_active_task_ids,
|
"databaseActiveTaskIds": database_active_task_ids,
|
||||||
"databaseActiveTaskCount": database_active_task_ids.len(),
|
"databaseActiveTaskCount": database_active_task_count,
|
||||||
"executionStateSource": "postgres-control-plane",
|
"executionStateSource": "postgres-control-plane",
|
||||||
"executionDiagnostics": execution_diagnostics,
|
"executionDiagnostics": execution_diagnostics,
|
||||||
"schedulerHeartbeatStaleMs": SCHEDULER_HEARTBEAT_STALE_MS,
|
"schedulerHeartbeatStaleMs": SCHEDULER_HEARTBEAT_STALE_MS,
|
||||||
|
|||||||
@@ -3501,9 +3501,7 @@ function taskIsRecoverableOrphanedActive(task: QueueTask, staleMs = orphanedActi
|
|||||||
if (task.status !== "running" && task.status !== "judging") return false;
|
if (task.status !== "running" && task.status !== "judging") return false;
|
||||||
if (activeTaskHasLocalRunSlotOrWaiter(task)) return false;
|
if (activeTaskHasLocalRunSlotOrWaiter(task)) return false;
|
||||||
const decision = staleRecoveryCandidate({ task, localActive: false, heartbeatStaleMs: staleMs, now: nowIso() });
|
const decision = staleRecoveryCandidate({ task, localActive: false, heartbeatStaleMs: staleMs, now: nowIso() });
|
||||||
if (decision.allowed) return true;
|
return decision.allowed;
|
||||||
if (decision.reason === "owner-heartbeat-missing") return orphanedActiveTaskAgeMs(task) >= staleMs;
|
|
||||||
return false;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
function schedulerReconcileStatus(tasks: QueueTask[] = state.tasks): JsonValue {
|
function schedulerReconcileStatus(tasks: QueueTask[] = state.tasks): JsonValue {
|
||||||
|
|||||||
@@ -256,6 +256,23 @@ function runQueueOrderingSelfTest(): JsonValue {
|
|||||||
const orphanRunning = queueOrderTestTask("codex_4400_orphan_running", "running", "2026-05-11T13:00:00.000Z", "2026-05-11T13:00:00.000Z");
|
const orphanRunning = queueOrderTestTask("codex_4400_orphan_running", "running", "2026-05-11T13:00:00.000Z", "2026-05-11T13:00:00.000Z");
|
||||||
orphanRunning.queueId = "queue_orphan_recovery";
|
orphanRunning.queueId = "queue_orphan_recovery";
|
||||||
orphanRunning.activeTurnId = null;
|
orphanRunning.activeTurnId = null;
|
||||||
|
orphanRunning.schedulerHeartbeat = {
|
||||||
|
taskId: orphanRunning.id,
|
||||||
|
queueId: orphanRunning.queueId,
|
||||||
|
attempt: 1,
|
||||||
|
activeTurnId: null,
|
||||||
|
codexThreadId: null,
|
||||||
|
owner: "self-test",
|
||||||
|
schedulerInstance: "self-test",
|
||||||
|
executionPlane: "scheduler-execution-plane",
|
||||||
|
agentPort: "codex",
|
||||||
|
status: "running",
|
||||||
|
lastLocalHeartbeatAt: "2026-05-11T13:00:00.000Z",
|
||||||
|
lastObservedAgentEventAt: null,
|
||||||
|
lastPersistedTraceAt: null,
|
||||||
|
outputMaxSeq: 0,
|
||||||
|
source: "scheduler",
|
||||||
|
};
|
||||||
const queuedBehindOrphan = queueOrderTestTask("codex_4401_queued", "queued", "2026-05-11T13:01:00.000Z", "2026-05-11T13:01:00.000Z");
|
const queuedBehindOrphan = queueOrderTestTask("codex_4401_queued", "queued", "2026-05-11T13:01:00.000Z", "2026-05-11T13:01:00.000Z");
|
||||||
queuedBehindOrphan.queueId = "queue_orphan_recovery";
|
queuedBehindOrphan.queueId = "queue_orphan_recovery";
|
||||||
const originalMaxActiveQueues = ctx().config.maxActiveQueues;
|
const originalMaxActiveQueues = ctx().config.maxActiveQueues;
|
||||||
|
|||||||
Reference in New Issue
Block a user