From d01d0ca5092fc83a35041f3cc363dfd865de9941 Mon Sep 17 00:00:00 2001 From: Codex Date: Sun, 17 May 2026 06:52:02 +0000 Subject: [PATCH 01/15] chore: pin decision center lock source --- config.json | 4 ++-- deploy.json | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/config.json b/config.json index 09b1c009..027150e1 100644 --- a/config.json +++ b/config.json @@ -547,7 +547,7 @@ "description": "k3sctl-adapter 是 UniDesk 直管的 k3s 控制平面适配微服务,连接 D601 原生 k3s/k3sctl 控制平面,并通过 k3s 标准服务路由代理 D601 上的代管微服务。", "repository": { "url": "https://github.com/pikasTech/unidesk", - "commitId": "d74439ecba7849e1c7305d57a65007458d8b0671", + "commitId": "d769c6e56c49881dfd65120fa634782c468e38b9", "dockerfile": "src/components/microservices/k3sctl-adapter/Dockerfile", "composeFile": "src/components/microservices/k3sctl-adapter/docker-compose.d601.yml", "composeService": "k3sctl-adapter", @@ -707,7 +707,7 @@ "description": "Decision Center 是由 D601 k3s 控制面代管的决策权威记录服务,用于沉淀会议记录、决议、目标、问题分级、停放事项和证据;参谋对话仍使用 Codex 原生会话。", "repository": { "url": "https://github.com/pikasTech/unidesk", - "commitId": "d74439ecba7849e1c7305d57a65007458d8b0671", + "commitId": "d769c6e56c49881dfd65120fa634782c468e38b9", "dockerfile": "src/components/microservices/decision-center/Dockerfile", "composeFile": "src/components/microservices/k3sctl-adapter/k3s/decision-center.k3s.json", "composeService": "decision-center", diff --git a/deploy.json b/deploy.json index ee98940a..50a2ac2e 100644 --- a/deploy.json +++ b/deploy.json @@ -44,7 +44,7 @@ { "id": "k3sctl-adapter", "repo": "https://github.com/pikasTech/unidesk", - "commitId": "d74439ecba7849e1c7305d57a65007458d8b0671" + "commitId": "d769c6e56c49881dfd65120fa634782c468e38b9" }, { "id": "code-queue", @@ -59,7 +59,7 @@ { "id": "decision-center", "repo": "https://github.com/pikasTech/unidesk", - "commitId": "d74439ecba7849e1c7305d57a65007458d8b0671" + "commitId": "d769c6e56c49881dfd65120fa634782c468e38b9" } ] } From 
f1d8cb921053c38390b05330d98a89c40de67b35 Mon Sep 17 00:00:00 2001 From: Codex Date: Sun, 17 May 2026 06:53:03 +0000 Subject: [PATCH 02/15] Stabilize provider HTTP tunnel diagnostics --- docs/reference/cli.md | 2 +- docs/reference/microservices.md | 2 + docs/reference/observability.md | 2 +- docs/reference/provider-gateway.md | 2 + scripts/cli.ts | 2 + scripts/src/microservices.ts | 10 +- scripts/src/remote.ts | 6 +- src/components/backend-core/src/egress-tcp.ts | 8 + src/components/backend-core/src/index.ts | 13 +- .../backend-core/src/microservice-proxy.ts | 394 ++++++++++++++++-- src/components/backend-core/src/types.ts | 9 +- .../microservices/k3sctl-adapter/src/index.ts | 118 ++++++ 12 files changed, 518 insertions(+), 50 deletions(-) diff --git a/docs/reference/cli.md b/docs/reference/cli.md index 2195a373..49caaad6 100644 --- a/docs/reference/cli.md +++ b/docs/reference/cli.md @@ -113,7 +113,7 @@ bun scripts/cli.ts ssh D601 glob --root /home/ubuntu/pikapython --pattern '**/*- `--main-server-ip` 是一个全局前缀,必须放在需要透传的命令同一次调用中,例如 `bun scripts/cli.ts --main-server-ip 74.48.78.17 debug health`。默认传输是公网 frontend:本地 CLI 读取本仓库 `config.json` 中的 frontend 登录账号密码,登录 `http://:/` 获取 HttpOnly session cookie,然后通过 frontend 的 `/api/*` 同源代理访问 backend-core 内网 API;因此计算节点只需要能访问公网 frontend,不需要主 server SSH key,也不需要打开 backend-core REST API 或 PostgreSQL 端口。 -默认 frontend 传输支持 `debug health`、`debug dispatch`、`debug task`、`microservice list/status/health/proxy`、`decision upload/list/show/health`、`codex task `、`codex output `、`codex judge --attempt N` 和 `ssh `。其中 `ssh` 的 remote frontend 传输使用 `host.ssh` dispatch 执行有界远端命令,适合 `ssh D601 hostname` 和 `ssh D601 skills` 这类自测;交互式登录 shell 仍应在主 server 本机 CLI 使用,或显式切换到旧 SSH 传输后在主 server 上执行。frontend 远程透传不会流式转发本地 stdin,因此 `ssh py < script.py`、`ssh apply-patch < patch.diff` 这类 stdin-backed helper 必须在主 server 本机运行,或显式切换到 `--main-server-transport ssh`。若确实需要旧行为,可使用 `--main-server-key ` 或 `--main-server-transport ssh`,这时 CLI 会通过 SSH 登录主 server 的 
`--main-server-root` 目录执行同一个 `bun scripts/cli.ts `。 +默认 frontend 传输支持 `debug health`、`debug dispatch`、`debug task`、`microservice list/status/health/diagnostics/tunnel-self-test/proxy`、`decision upload/list/show/health`、`codex task `、`codex output `、`codex judge --attempt N` 和 `ssh `。其中 `ssh` 的 remote frontend 传输使用 `host.ssh` dispatch 执行有界远端命令,适合 `ssh D601 hostname` 和 `ssh D601 skills` 这类自测;交互式登录 shell 仍应在主 server 本机 CLI 使用,或显式切换到旧 SSH 传输后在主 server 上执行。frontend 远程透传不会流式转发本地 stdin,因此 `ssh py < script.py`、`ssh apply-patch < patch.diff` 这类 stdin-backed helper 必须在主 server 本机运行,或显式切换到 `--main-server-transport ssh`。若确实需要旧行为,可使用 `--main-server-key ` 或 `--main-server-transport ssh`,这时 CLI 会通过 SSH 登录主 server 的 `--main-server-root` 目录执行同一个 `bun scripts/cli.ts `。 计算节点可以用该入口测试自身的远程升级闭环,而不需要在计算节点公开 core REST API 或 database。标准顺序是:先运行 `bun scripts/cli.ts --main-server-ip 74.48.78.17 debug health` 确认主 server 看到当前 Provider 在线,且该 Provider labels 中 `unideskCapabilities` 包含 `host.ssh`、`hostSshConfigured=true`、`hostSshKeyPresent=true`;再运行 `bun scripts/cli.ts --main-server-ip 74.48.78.17 debug dispatch provider.upgrade --mode schedule --wait-ms 15000` 触发真实 `provider.upgrade`;随后再次运行 `debug health` 确认节点重新上线;最后运行 `bun scripts/cli.ts --main-server-ip 74.48.78.17 debug dispatch host.ssh --wait-ms 15000` 和 `bun scripts/cli.ts --main-server-ip 74.48.78.17 ssh hostname` 验证 SSH 透传能力。provider-gateway 新部署或升级后没有完成这组 remote CLI 自测,不能视为交付完成。 diff --git a/docs/reference/microservices.md b/docs/reference/microservices.md index 138f11fb..1adcba79 100644 --- a/docs/reference/microservices.md +++ b/docs/reference/microservices.md @@ -316,6 +316,8 @@ ClaudeQQ 的业务源码和持久化数据仍在 D601,但正式运行由 k3s - `bun scripts/cli.ts microservice health oa-event-flow`、`bun scripts/cli.ts microservice proxy oa-event-flow /api/diagnostics --raw` 与 `bun scripts/cli.ts microservice proxy oa-event-flow '/api/events?tags=service:code-queue&limit=20' --raw`:验证统一 OA 事件流、事件表、tag 查询和统计中心。 - `bun scripts/cli.ts microservice health 
k3sctl-adapter` 与 `bun scripts/cli.ts microservice proxy k3sctl-adapter /api/control-plane --raw`:验证 D601 `unidesk-k3s` 控制面 adapter、manifest、D601 scheduler/read/write 实例状态、`presentNodeIds` 包含 `D601`、`missingNodeIds=[]` 和 no-fallback 运行路径。 - `bun scripts/cli.ts microservice health code-queue` 与 `bun scripts/cli.ts microservice proxy code-queue /api/tasks/overview`:验证 Code Queue 经过 backend-core -> k3sctl-adapter -> k3s Service proxy 的单一路径,其中 `/health` 指向 `code-queue-scheduler`,overview/详情只读请求指向 `code-queue-read`,写入类请求指向 `code-queue-write`;输出不得出现 `serviceId=code-queue` 的 provider-gateway `microservice.http` 业务代理任务,写入、追加 prompt、打断和 readAt/未读状态都必须由 backend 写入 PostgreSQL,frontend 不得用本地存储伪造成功状态。 +- `bun scripts/cli.ts microservice diagnostics code-queue`:拆分 k3sctl-managed 链路健康,返回 `providerGateway`、`httpTunnel`、`k3sctlAdapter`、`kubernetesApiServiceProxy` 和 `targetService` 五段状态。该命令仍通过 backend-core 用户服务代理访问,不允许浏览器或 CLI 绕到 k3s、NodePort、Pod IP 或 D601 本机业务端口。 +- `bun scripts/cli.ts microservice tunnel-self-test code-queue`:触发一次预期失败的 provider HTTP tunnel 请求,用于确认失败响应包含 `requestId`、`stage`、`x-unidesk-request-id` 和 `x-unidesk-tunnel-error`;该自测只访问 provider 侧无效 loopback 端口,不创建 Code Queue 队列,也不绕过正式 backend-core 入口。 - `bun scripts/cli.ts microservice health filebrowser`、`bun scripts/cli.ts microservice health filebrowser-d601` 与 `bun scripts/cli.ts microservice proxy filebrowser / --max-body-bytes 2000`:验证 D518 主 File Browser 和 D601 备用 File Browser 私有代理链路;浏览器 WebUI 必须通过 `/api/microservices/filebrowser/proxy/` 或 `/api/microservices/filebrowser-d601/proxy/` 访问,不得直接开放 `4251` 公网端口。 - `bun scripts/cli.ts --main-server-ip 74.48.78.17 microservice health findjob`:在计算节点或其他非主 server 主机上通过公网 frontend remote CLI 进行同一验证,不需要主 server SSH key。 diff --git a/docs/reference/observability.md b/docs/reference/observability.md index 6f3dab7a..a108f6a4 100644 --- a/docs/reference/observability.md +++ b/docs/reference/observability.md @@ -32,6 +32,6 @@ frontend Bun server 必须提供同源 
`/api/frontend-performance`,记录 webu 性能优化必须先用这些指标锁定慢操作名称、路径、耗时和代理层级,再改后端查询或前后端通信策略;不得只凭主观体感改 UI。Code Queue 这类控制面页面出现 `core_proxy`、`GET /api/microservices/code-queue/proxy/api/tasks/overview`、`POST /api/microservices/code-queue/proxy/api/tasks//read` 等超过 1s 的慢操作时,应保留优化前后的性能面板证据,并同时记录 live API 耗时、容器内存、`/health` 存储摘要和是否仍通过 PostgreSQL/append-only archive 重建历史数据。短 TTL cache、warmup 或页面内存缓存只能作为重复请求抖动保护,性能证据必须证明数据库索引/聚合、分页和渐进式披露本身已把核心路径降到目标内,不能用长缓存遮蔽慢 SQL 或全量 JSON 物化。 -当最近失败请求集中出现 frontend `core_proxy` 502,路径为 `/api/microservices/code-queue/proxy/...` 的 overview、trace 或 summary,且 k3s/k8s Pod 仍在运行时,必须区分“Kubernetes API service proxy 不可达”“Code Queue 进程不可达”和“Code Queue event loop 被热路径同步工作饿死”。排障顺序是同时查看 `/api/frontend-performance`、`/api/performance`、`k3sctl-adapter` `/api/control-plane`、Kubernetes Pod `/live`、`/health`、overview/trace-step curl、`kubectl top pod` 或 Docker stats、容器 `RestartCount`/`OOMKilled` 和 Code Queue 日志;如果 Pod 内 `/health` 也超时,应优先检查实时 output 发布、archive 读取、transcript 构建、统计计算、启动维护、历史 OA backfill 和远程 Provider 准备/SSH 子进程是否阻塞 event loop,而不是先调整 frontend 渲染或代理超时。Code Queue 默认不得在启动时自动执行历史 OA backfill 或通知表索引维护;显式 backfill 必须作为运维动作记录,并在运行期间并发证明 `/live`、`/health` 与 `/api/tasks/overview` 仍快速返回。涉及 D601 等远程 Provider 时,还要检查 `runCodeQueueSsh`/开发容器准备是否仍存在同步子进程、无 timeout 的 SSH、无上限 stdout/stderr 或 stale TUN 重建等待;修复后必须在远程准备探针运行期间并发证明 Pod `/health` 与 `/api/tasks/overview` 仍快速返回。 +当最近失败请求集中出现 frontend `core_proxy` 502/503/504,路径为 `/api/microservices/code-queue/proxy/...` 的 overview、trace 或 summary,且 k3s/k8s Pod 仍在运行时,必须先运行 `bun scripts/cli.ts microservice diagnostics code-queue`,区分 provider-gateway online、WebSocket HTTP tunnel、k3sctl-adapter、Kubernetes API service proxy 和目标 Service 五段状态。provider tunnel 类失败必须记录响应 body/headers 中的 `requestId`、`stage`、`failureReason`、`x-unidesk-request-id` 和 `x-unidesk-tunnel-error`;如需主动验证错误结构,运行 `bun scripts/cli.ts microservice tunnel-self-test code-queue`,该自测应返回预期失败但 `ok=true` 的诊断结果。随后再继续判断“Kubernetes API service proxy 不可达”“Code Queue 进程不可达”和“Code Queue 
event loop 被热路径同步工作饿死”。排障顺序是同时查看 `/api/frontend-performance`、`/api/performance`、`k3sctl-adapter` `/api/control-plane`、Kubernetes Pod `/live`、`/health`、overview/trace-step curl、`kubectl top pod` 或 Docker stats、容器 `RestartCount`/`OOMKilled` 和 Code Queue 日志;如果 Pod 内 `/health` 也超时,应优先检查实时 output 发布、archive 读取、transcript 构建、统计计算、启动维护、历史 OA backfill 和远程 Provider 准备/SSH 子进程是否阻塞 event loop,而不是先调整 frontend 渲染或代理超时。Code Queue 默认不得在启动时自动执行历史 OA backfill 或通知表索引维护;显式 backfill 必须作为运维动作记录,并在运行期间并发证明 `/live`、`/health` 与 `/api/tasks/overview` 仍快速返回。涉及 D601 等远程 Provider 时,还要检查 `runCodeQueueSsh`/开发容器准备是否仍存在同步子进程、无 timeout 的 SSH、无上限 stdout/stderr 或 stale TUN 重建等待;修复后必须在远程准备探针运行期间并发证明 Pod `/health` 与 `/api/tasks/overview` 仍快速返回。 Code Queue task 明明产出最终回复却反复 `retry_wait` 时,应优先用任务详情里的 latest attempt 字段核查 `terminalStatus`、`transportClosedBeforeTerminal`、`appServerExitCode`、`finalResponseChars`、`judge.raw._safetyOverride` 和 attempt output。OpenCode 远程任务中,`opencode completed status=completed exit=0` 加当前 attempt 非空 assistant 输出应对应 `terminalStatus=completed`、`transportClosedBeforeTerminal=false`;如果因为缺少 `step_finish` 事件仍触发 `_safetyOverride=terminal_not_completed`,说明协议终态归一化有回归。相反,当前 attempt 没有最终 assistant response 时即使 tool/read/bash 证据完整,也必须 retry,不能用旧 `task.finalResponse` 或 reasoning/tool evidence 代替可见最终回复。 diff --git a/docs/reference/provider-gateway.md b/docs/reference/provider-gateway.md index ead0e4f7..3290151d 100644 --- a/docs/reference/provider-gateway.md +++ b/docs/reference/provider-gateway.md @@ -92,6 +92,8 @@ provider ingress 是唯一允许公网暴露的 provider 连接接口,当前 backend-core 下发目标 service id、节点本机 `targetBaseUrl`、path、query、method、request body、timeout 和可选 JSON 数组裁剪参数;provider-gateway 支持 `GET`、`HEAD`、`POST`、`PUT`、`PATCH`、`DELETE`,但最终允许方法必须由每个用户服务的 `backend.allowedMethods` 显式配置。provider-gateway 只允许访问 `http://127.0.0.1`、`http://localhost`、`http://host.docker.internal` 这些节点本地地址;主 server 内置 Todo Note 后端可使用 Compose 服务名 `http://todo-note:4211`。`deployment.mode=k3sctl-managed` 的 Code Queue 不得通过 
provider-gateway 直连业务容器,正式路径只能是 backend-core -> provider WebSocket HTTP tunnel -> `k3sctl-adapter` -> Kubernetes native Service/DNS,必要时显式 fallback 到 Kubernetes API service proxy -> k3s/k8s Service。该能力不打开 provider-gateway 入站端口,也不替代业务仓库自身 Dockerfile/docker-compose。 +backend-core 必须把 provider WebSocket HTTP tunnel 的失败分类到响应 body 和 headers:失败响应至少包含 `requestId`、`providerId`、`serviceId`、`stage`、`failureReason` 或 provider result,并带 `x-unidesk-request-id` 与 `x-unidesk-tunnel-error`。`GET`/`HEAD` 非 stream 请求允许短超时分层重试;`POST`、`PATCH`、`PUT`、`DELETE` 这类可能产生副作用的请求不得自动重复。Provider 重连时 backend-core 必须先确认 close 事件来自当前 active socket,旧 socket 被新 socket 替换后的迟到 close 不得清理新连接上的 tunnel waiter,也不得把节点误标 offline。 + 超大 JSON 响应可以使用 `jsonArrayLimits` 在 provider-gateway 返回前裁剪指定数组,并在响应体中写入 `_unidesk.arrayLimits` 元数据,便于 UniDesk frontend 预览列表而不展示裸 JSON。长期应优先推动业务后端提供分页 API;裁剪只是 UniDesk 集成层的展示保护。 ## Egress Proxy diff --git a/scripts/cli.ts b/scripts/cli.ts index badc1d8c..05441f06 100644 --- a/scripts/cli.ts +++ b/scripts/cli.ts @@ -45,6 +45,8 @@ function help(): unknown { { command: "microservice list", description: "List UniDesk-managed user services and their provider/runtime mapping." }, { command: "microservice status ", description: "Show one user service config, repository reference, backend mapping, and runtime status." }, { command: "microservice health ", description: "Probe one user service through backend-core -> provider-gateway HTTP proxy." }, + { command: "microservice diagnostics ", description: "Split k3sctl-managed proxy health into provider-gateway, HTTP tunnel, adapter, Kubernetes API service proxy, and target Service checks." }, + { command: "microservice tunnel-self-test ", description: "Trigger an expected provider HTTP tunnel failure and verify requestId/stage diagnostics are returned." 
}, { command: "microservice proxy [--method GET|POST|PUT|PATCH|DELETE] [--raw] [--max-body-bytes N]", description: "Access a private user-service backend path through the same frontend-only proxy used by WebUI; large bodies are summarized unless --raw is set." }, { command: "decision upload [--title text] [--type meeting|decision] [--level G0|G1|G2|G3|P0|P1|P2|P3|none] [--status active|blocked|parked|done] [--linked-goal-id id] [--evidence url]", description: "Upload a meeting note or decision record through backend-core -> decision-center user-service proxy." }, { command: "decision list [--type ...] [--status ...] [--level ...] [--linked-goal-id id] [--limit N]", description: "List Decision Center records through the user-service proxy." }, diff --git a/scripts/src/microservices.ts b/scripts/src/microservices.ts index d9422d4e..24954060 100644 --- a/scripts/src/microservices.ts +++ b/scripts/src/microservices.ts @@ -95,10 +95,18 @@ export async function runMicroserviceCommand(_config: UniDeskConfig, args: strin const id = requireId(idArg, "microservice health"); return coreInternalFetch(`/api/microservices/${encodeId(id)}/health`); } + if (action === "diagnostics") { + const id = requireId(idArg, "microservice diagnostics"); + return coreInternalFetch(`/api/microservices/${encodeId(id)}/diagnostics`); + } + if (action === "tunnel-self-test") { + const id = requireId(idArg, "microservice tunnel-self-test"); + return coreInternalFetch(`/api/microservices/${encodeId(id)}/tunnel-self-test`); + } if (action === "proxy") { const id = requireId(idArg, "microservice proxy"); const path = requireProxyPath(pathArg); return summarizeMicroserviceProxyResponse(coreInternalFetch(`/api/microservices/${encodeId(id)}/proxy${path}`, { method: methodOption(args) }), args); } - throw new Error("microservice command must be one of: list, status, health, proxy"); + throw new Error("microservice command must be one of: list, status, health, diagnostics, tunnel-self-test, proxy"); } 
diff --git a/scripts/src/remote.ts b/scripts/src/remote.ts index bf3f508e..d09b52a1 100644 --- a/scripts/src/remote.ts +++ b/scripts/src/remote.ts @@ -455,7 +455,7 @@ async function remoteMicroservice(session: FrontendSession, args: string[]): Pro if (action === "list") { return { transport: "frontend", response: await frontendJson(session, "/api/microservices", undefined, 12_000) }; } - if ((action === "status" || action === "health") && id !== undefined) { + if ((action === "status" || action === "health" || action === "diagnostics" || action === "tunnel-self-test") && id !== undefined) { return { transport: "frontend", response: await frontendJson(session, `/api/microservices/${encodeURIComponent(id)}/${action}`, undefined, 18_000), @@ -468,7 +468,7 @@ async function remoteMicroservice(session: FrontendSession, args: string[]): Pro response: summarizeMicroserviceProxyResponse(response, args), }; } - throw new Error("remote microservice command must be: microservice list | status | health | proxy "); + throw new Error("remote microservice command must be: microservice list | status | health | diagnostics | tunnel-self-test | proxy "); } async function remoteCodeQueue(session: FrontendSession, args: string[]): Promise { @@ -559,7 +559,7 @@ async function runRemoteCliOverFrontend(options: RemoteCliOptions, config: UniDe emitRemoteJson(name, { transport: "frontend", baseUrl: session.baseUrl, - commands: ["debug health", "debug dispatch", "debug task", "ssh ", "ssh skills", "microservice list", "microservice status ", "microservice health ", "microservice proxy ", "decision upload ", "decision list", "decision show ", "codex task ", "codex judge --attempt N", "network perf"], + commands: ["debug health", "debug dispatch", "debug task", "ssh ", "ssh skills", "microservice list", "microservice status ", "microservice health ", "microservice diagnostics ", "microservice tunnel-self-test ", "microservice proxy ", "decision upload ", "decision list", "decision show ", 
"codex task ", "codex judge --attempt N", "network perf"], }); return 0; } diff --git a/src/components/backend-core/src/egress-tcp.ts b/src/components/backend-core/src/egress-tcp.ts index 83e49f7a..dedb45fd 100644 --- a/src/components/backend-core/src/egress-tcp.ts +++ b/src/components/backend-core/src/egress-tcp.ts @@ -93,3 +93,11 @@ export function closeEgressTcpConnectionsForProvider(providerId: string): void { connection.socket.destroy(); } } + +export function closeEgressTcpConnectionsForSocket(provider: ProviderSocket): void { + for (const [key, connection] of ctx.activeEgressTcpConnections) { + if (connection.provider !== provider) continue; + ctx.activeEgressTcpConnections.delete(key); + connection.socket.destroy(); + } +} diff --git a/src/components/backend-core/src/index.ts b/src/components/backend-core/src/index.ts index bf4a0ca8..10080bfe 100644 --- a/src/components/backend-core/src/index.ts +++ b/src/components/backend-core/src/index.ts @@ -9,7 +9,7 @@ import { recordRequestPerformance, withPerformanceOperation, getPerformance } fr import { handleProviderMessage, markProviderOffline, markStaleProvidersOffline } from "./provider-registry"; import { markStaleTasksFailed, dispatchTask } from "./task-dispatcher"; import { handleSshClientMessage, sshRoute } from "./ssh-bridge"; -import { closeEgressTcpConnectionsForProvider } from "./egress-tcp"; +import { closeEgressTcpConnectionsForProvider, closeEgressTcpConnectionsForSocket } from "./egress-tcp"; import { scheduledTaskRoute, runDueScheduledTasks, recoverScheduledRuns } from "./scheduler"; import { microserviceRoute, getMicroservices } from "./microservice-proxy"; import { getOverview, codexQueueLoadTest } from "./overview"; @@ -171,17 +171,18 @@ const providerServer = Bun.serve({ const providerId = ws.data.providerId; logger("warn", "provider_socket_close", { providerId: providerId ?? 
null }); if (providerId !== undefined) { + if (ctx.activeProviders.get(providerId) !== ws) { + closeEgressTcpConnectionsForSocket(ws); + logger("info", "provider_socket_close_ignored_replaced", { providerId }); + return; + } closeEgressTcpConnectionsForProvider(providerId); for (const [requestId, waiter] of ctx.httpTunnelWaiters) { if (requestId.startsWith(`${providerId}:`)) { ctx.httpTunnelWaiters.delete(requestId); - waiter(null); + waiter(null, "provider-disconnected"); } } - if (ctx.activeProviders.get(providerId) !== ws) { - logger("info", "provider_socket_close_ignored_replaced", { providerId }); - return; - } markProviderOffline(providerId).catch((error) => logger("error", "provider_offline_mark_failed", { providerId, error: errorToJson(error) })); } }, diff --git a/src/components/backend-core/src/microservice-proxy.ts b/src/components/backend-core/src/microservice-proxy.ts index ee4951e4..a37ec9b6 100644 --- a/src/components/backend-core/src/microservice-proxy.ts +++ b/src/components/backend-core/src/microservice-proxy.ts @@ -1,6 +1,6 @@ import type { JsonValue } from "../../shared/src/index"; import { ctx, config, logger } from "./context"; -import type { MicroserviceConfig, MicroserviceProxyCacheEntry, MicroserviceHealthAssessment, MicroserviceAvailabilityEntry, RawTaskRow } from "./types"; +import type { HttpTunnelFailureReason, MicroserviceConfig, MicroserviceProxyCacheEntry, MicroserviceHealthAssessment, MicroserviceAvailabilityEntry, RawTaskRow } from "./types"; import { jsonResponse, errorToJson, compactJson, isPlainRecord, truncateText } from "./http"; import { createAndSendTask, waitForTaskTerminal, providerSupports } from "./task-dispatcher"; import { getNodes, getNodeDockerStatuses } from "./db"; @@ -12,6 +12,7 @@ import { getNodes, getNodeDockerStatuses } from "./db"; const microserviceProxyMaxBodyTextLength = 8 * 1024 * 1024; const microserviceAvailabilityTtlMs = 30_000; const codeQueueOverviewPathFallbackStaleMs = 30_000; +const 
providerHttpTunnelMaxAttempts = 3; const microserviceForwardRequestHeaders = [ "accept", "content-type", @@ -456,6 +457,13 @@ function responseFromMicroserviceCache(entry: MicroserviceProxyCacheEntry, state }); } +function isMicroserviceTransientFailureResponse(response: Response): boolean { + if (response.status !== 502 && response.status !== 503 && response.status !== 504) return false; + return response.headers.get("x-unidesk-transient-error") === "true" + || response.headers.get("x-unidesk-tunnel-error") !== null + || response.headers.get("x-unidesk-upstream-proxy-mode") === "provider-gateway-http-fetch"; +} + function readMicroserviceCache(key: string): Response | null { const entry = ctx.microserviceProxyCache.get(key); if (entry === undefined) return null; @@ -626,43 +634,248 @@ async function k3sctlAdapterMicroserviceResponse( return fetchMicroserviceUpstreamResponse(adapter, method, adapterTargetPath, proxyOptions, requestHeaders, bodyText, abortSignal); } +async function k3sctlManagedDiagnosticsResponse(service: MicroserviceConfig): Promise { + const adapterServiceId = service.deployment.adapterServiceId ?? "k3sctl-adapter"; + const adapter = microserviceById(adapterServiceId); + const checkedAt = new Date().toISOString(); + const providerId = adapter?.providerId ?? 
service.providerId; + const providerOnline = ctx.activeProviders.has(providerId); + const providerTunnelCapable = await providerSupports(providerId, "microservice.http.tunnel"); + if (adapter === null) { + return jsonResponse({ + ok: false, + serviceId: service.id, + checkedAt, + requestPath: "/diagnostics", + checks: { + providerGateway: { ok: providerOnline, providerId, online: providerOnline }, + httpTunnel: { ok: providerTunnelCapable, providerId, capable: providerTunnelCapable }, + k3sctlAdapter: { ok: false, serviceId: adapterServiceId, error: `k3sctl adapter microservice not found: ${adapterServiceId}` }, + kubernetesApiServiceProxy: { ok: false, skipped: true }, + targetService: { ok: false, skipped: true }, + }, + }, 502); + } + + const k3sServiceId = service.id === "code-queue" + ? codeQueueK3sServiceIdForRequest("GET", service.backend.healthPath) + : service.deployment.k3sServiceId ?? service.id; + const adapterPath = `/api/services/${encodeURIComponent(k3sServiceId)}/diagnostics`; + const response = await fetchMicroserviceUpstreamResponse( + adapter, + "GET", + adapterPath, + { query: "", jsonArrayLimits: {} }, + { accept: "application/json" }, + "", + ); + const contentType = response.headers.get("content-type") ?? "application/json; charset=utf-8"; + const bodyText = await response.text(); + let adapterBody: JsonValue = bodyText; + try { + adapterBody = JSON.parse(bodyText) as JsonValue; + } catch { + adapterBody = bodyText.slice(0, 4000); + } + const bodyRecord = isPlainRecord(adapterBody) ? adapterBody : {}; + const adapterChecks = isPlainRecord(bodyRecord.checks) ? 
bodyRecord.checks : {}; + const checks = { + providerGateway: { + ok: providerOnline, + providerId, + online: providerOnline, + activeSocketCount: ctx.activeProviders.size, + }, + httpTunnel: { + ok: response.ok && response.headers.get("x-unidesk-proxy-mode") === "provider-ws-http-tunnel", + providerId, + capable: providerTunnelCapable, + requestId: response.headers.get("x-unidesk-request-id") ?? null, + attempts: response.headers.get("x-unidesk-http-tunnel-attempts") ?? null, + upstreamProxyMode: response.headers.get("x-unidesk-upstream-proxy-mode") ?? null, + proxyStatus: response.status, + }, + k3sctlAdapter: { + ok: response.ok, + serviceId: adapter.id, + providerId: adapter.providerId, + status: response.status, + contentType, + }, + kubernetesApiServiceProxy: compactJson(adapterChecks.kubernetesApiServiceProxy ?? { ok: false, skipped: true }), + targetService: compactJson(adapterChecks.targetService ?? adapterChecks.managedService ?? { ok: false, skipped: true }), + } satisfies Record; + const httpTunnelCheck = checks.httpTunnel as Record; + return jsonResponse({ + ok: response.ok && providerOnline && httpTunnelCheck.ok === true, + serviceId: service.id, + k3sServiceId, + checkedAt, + path: service.backend.healthPath, + chain: "CLI/frontend -> backend-core -> provider-gateway HTTP tunnel -> k3sctl-adapter -> Kubernetes API service proxy -> k3s Service", + checks, + adapter: adapterBody, + }, response.ok ? 200 : response.status); +} + +async function microserviceTunnelSelfTestResponse(service: MicroserviceConfig): Promise { + const tunnelService = isK3sctlManagedMicroservice(service) + ? microserviceById(service.deployment.adapterServiceId ?? "k3sctl-adapter") + : service; + if (tunnelService === null) { + return jsonResponse({ ok: false, serviceId: service.id, error: "tunnel service not found", adapterServiceId: service.deployment.adapterServiceId ?? 
null }, 502); + } + if (!(await providerSupports(tunnelService.providerId, "microservice.http.tunnel"))) { + return jsonResponse({ + ok: false, + serviceId: service.id, + providerId: tunnelService.providerId, + error: `provider does not declare microservice.http.tunnel capability: ${tunnelService.providerId}`, + }, 409); + } + const probeService = { + ...tunnelService, + backend: { + ...tunnelService.backend, + nodeBaseUrl: "http://127.0.0.1:1", + timeoutMs: 1000, + }, + }; + const response = await providerHttpTunnelMicroserviceResponse( + probeService, + "GET", + "/", + { query: "", jsonArrayLimits: {} }, + { accept: "application/json" }, + "", + ); + const headers = { + requestId: response.headers.get("x-unidesk-request-id"), + tunnelError: response.headers.get("x-unidesk-tunnel-error"), + providerId: response.headers.get("x-unidesk-provider-id"), + serviceId: response.headers.get("x-unidesk-service-id"), + transient: response.headers.get("x-unidesk-transient-error"), + }; + const bodyText = await response.text(); + let body: JsonValue = bodyText.slice(0, 4000); + try { + body = JSON.parse(bodyText) as JsonValue; + } catch { + // Keep bounded text body for malformed JSON diagnostics. + } + const bodyRecord = isPlainRecord(body) ? 
body : {}; + const hasRequestId = typeof bodyRecord.requestId === "string" && bodyRecord.requestId.length > 0; + const hasStage = typeof bodyRecord.stage === "string" && bodyRecord.stage.length > 0; + const ok = response.status === 502 && hasRequestId && hasStage && headers.requestId === bodyRecord.requestId; + return jsonResponse({ + ok, + serviceId: service.id, + tunnelServiceId: tunnelService.id, + providerId: tunnelService.providerId, + expectedFailure: true, + status: response.status, + checks: { + expectedStatus: response.status === 502, + bodyHasRequestId: hasRequestId, + bodyHasStage: hasStage, + headerHasRequestId: typeof headers.requestId === "string" && headers.requestId.length > 0, + headerHasTunnelError: typeof headers.tunnelError === "string" && headers.tunnelError.length > 0, + }, + headers, + body, + }, ok ? 200 : 502); +} + function providerHttpTunnelRequestId(providerId: string): string { return `${providerId}:http_${Date.now()}_${Math.random().toString(16).slice(2)}`; } +function canRetryProviderHttpTunnel(method: string, targetPath: string): boolean { + const normalizedMethod = method.toUpperCase(); + if (normalizedMethod !== "GET" && normalizedMethod !== "HEAD") return false; + return !targetPath.endsWith("/stream"); +} + +function providerHttpTunnelWaitMs(service: MicroserviceConfig, attempt: number, retryable: boolean): number { + const baseTimeoutMs = Math.max(1000, service.backend.timeoutMs); + if (!retryable) return baseTimeoutMs + 3000; + if (attempt === 1) return Math.min(baseTimeoutMs + 3000, Math.max(5000, Math.floor(baseTimeoutMs * 0.45))); + if (attempt === 2) return Math.min(baseTimeoutMs + 3000, Math.max(6000, Math.floor(baseTimeoutMs * 0.7))); + return baseTimeoutMs + 3000; +} + +function tunnelErrorBody( + service: MicroserviceConfig, + requestId: string, + error: string, + stage: string, + status: number, + extra: Record = {}, +): Response { + const response = jsonResponse({ + ok: false, + error, + stage, + providerId: 
service.providerId, + serviceId: service.id, + requestId, + ...extra, + }, status); + response.headers.set("x-unidesk-request-id", requestId); + response.headers.set("x-unidesk-provider-id", service.providerId); + response.headers.set("x-unidesk-service-id", service.id); + response.headers.set("x-unidesk-tunnel-error", stage); + if (status === 502 || status === 503 || status === 504) response.headers.set("x-unidesk-transient-error", "true"); + return response; +} + +function providerHttpTunnelFailureStatus(reason: HttpTunnelFailureReason | null): number { + if (reason === "aborted") return 499; + if (reason === "provider-disconnected") return 503; + if (reason === "send-failed") return 502; + return 504; +} + +function tunnelFailureRetryable(reason: HttpTunnelFailureReason | null): boolean { + return reason === "timeout" || reason === "provider-disconnected" || reason === "send-failed"; +} + async function waitForProviderHttpTunnelResponse( providerId: string, requestId: string, timeoutMs: number, abortSignal?: AbortSignal, -): Promise<{ providerId: string; requestId: string; ok: boolean; result: JsonValue } | null> { +): Promise<{ message: { providerId: string; requestId: string; ok: boolean; result: JsonValue } | null; reason: HttpTunnelFailureReason | null }> { return await new Promise((resolve) => { let settled = false; let abortHandler: (() => void) | null = null; - const timer = setTimeout(() => settle(null), Math.max(1, timeoutMs)); - const settle = (message: { providerId: string; requestId: string; ok: boolean; result: JsonValue } | null): void => { + const timer = setTimeout(() => settle(null, "timeout"), Math.max(1, timeoutMs)); + const settle = ( + message: { providerId: string; requestId: string; ok: boolean; result: JsonValue } | null, + reason: HttpTunnelFailureReason | null = null, + ): void => { if (settled) return; settled = true; clearTimeout(timer); if (abortHandler !== null) abortSignal?.removeEventListener("abort", abortHandler); 
ctx.httpTunnelWaiters.delete(requestId); - resolve(message); + resolve({ message, reason }); }; - abortHandler = () => settle(null); + abortHandler = () => settle(null, "aborted"); if (abortSignal !== undefined) { if (abortSignal.aborted) { - settle(null); + settle(null, "aborted"); return; } abortSignal.addEventListener("abort", abortHandler, { once: true }); } - ctx.httpTunnelWaiters.set(requestId, (message) => { + ctx.httpTunnelWaiters.set(requestId, (message, reason) => { if (message !== null && message.providerId !== providerId) { logger("warn", "http_tunnel_provider_mismatch", { requestId, expectedProviderId: providerId, actualProviderId: message.providerId }); - settle(null); + settle(null, "provider-mismatch"); return; } - settle(message); + settle(message, reason ?? null); }); }); } @@ -676,32 +889,116 @@ async function providerHttpTunnelMicroserviceResponse( bodyText: string, abortSignal?: AbortSignal, ): Promise { - const socket = ctx.activeProviders.get(service.providerId); - if (socket === undefined) return jsonResponse({ ok: false, error: `provider is offline: ${service.providerId}` }, 503); - const requestId = providerHttpTunnelRequestId(service.providerId); - const timeoutMs = service.backend.timeoutMs + 3000; - const waiter = waitForProviderHttpTunnelResponse(service.providerId, requestId, timeoutMs, abortSignal); - socket.send(JSON.stringify({ - type: "http_tunnel_request", - requestId, - payload: { - source: "microservice-frontend-proxy", - serviceId: service.id, - method, - targetBaseUrl: service.backend.nodeBaseUrl, - path: targetPath, - query: proxyOptions.query, - requestHeaders, - bodyText, - jsonArrayLimits: proxyOptions.jsonArrayLimits, - timeoutMs: service.backend.timeoutMs, - cacheTtlMs: providerMicroserviceCacheTtlMs(service.id, targetPath), - }, - })); - const message = await waiter; - if (message === null) return jsonResponse({ ok: false, error: "provider HTTP tunnel timed out or disconnected", providerId: service.providerId, 
requestId }, 504); - if (!message.ok) return jsonResponse({ ok: false, error: "provider HTTP tunnel failed", providerId: service.providerId, requestId, result: message.result }, 502); - return responseFromProviderMicroserviceResult(dockerStatusRecord(message.result), "provider-ws-http-tunnel"); + const retryable = canRetryProviderHttpTunnel(method, targetPath); + const maxAttempts = retryable ? providerHttpTunnelMaxAttempts : 1; + const attempts: JsonValue[] = []; + let lastRequestId = ""; + for (let attempt = 1; attempt <= maxAttempts; attempt += 1) { + const socket = ctx.activeProviders.get(service.providerId); + const requestId = providerHttpTunnelRequestId(service.providerId); + lastRequestId = requestId; + if (socket === undefined) { + attempts.push({ attempt, requestId, ok: false, reason: "provider-offline" }); + return tunnelErrorBody(service, requestId, `provider is offline: ${service.providerId}`, "provider-gateway-online", 503, { + retryable, + attempts, + }); + } + const timeoutMs = providerHttpTunnelWaitMs(service, attempt, retryable); + const startedAt = Date.now(); + const waiter = waitForProviderHttpTunnelResponse(service.providerId, requestId, timeoutMs, abortSignal); + try { + socket.send(JSON.stringify({ + type: "http_tunnel_request", + requestId, + payload: { + source: "microservice-frontend-proxy", + serviceId: service.id, + method, + targetBaseUrl: service.backend.nodeBaseUrl, + path: targetPath, + query: proxyOptions.query, + requestHeaders, + bodyText, + jsonArrayLimits: proxyOptions.jsonArrayLimits, + timeoutMs: service.backend.timeoutMs, + cacheTtlMs: providerMicroserviceCacheTtlMs(service.id, targetPath), + }, + })); + } catch (error) { + ctx.httpTunnelWaiters.get(requestId)?.(null, "send-failed"); + const durationMs = Date.now() - startedAt; + attempts.push({ attempt, requestId, ok: false, reason: "send-failed", durationMs, error: errorToJson(error) }); + if (attempt < maxAttempts) { + logger("warn", "http_tunnel_send_retry", { 
providerId: service.providerId, serviceId: service.id, requestId, attempt, maxAttempts, error: errorToJson(error) }); + await Bun.sleep(Math.min(500, 75 * attempt)); + continue; + } + return tunnelErrorBody(service, requestId, "provider HTTP tunnel send failed", "http-tunnel-send", 502, { + retryable, + attempts, + detail: errorToJson(error), + }); + } + const { message, reason } = await waiter; + const durationMs = Date.now() - startedAt; + if (message === null) { + attempts.push({ attempt, requestId, ok: false, reason: reason ?? "timeout", durationMs, timeoutMs }); + if (retryable && tunnelFailureRetryable(reason) && attempt < maxAttempts) { + logger("warn", "http_tunnel_retry", { + providerId: service.providerId, + serviceId: service.id, + requestId, + attempt, + maxAttempts, + reason: reason ?? "timeout", + durationMs, + timeoutMs, + }); + await Bun.sleep(Math.min(750, 100 * attempt)); + continue; + } + return tunnelErrorBody( + service, + requestId, + "provider HTTP tunnel timed out or disconnected", + reason === "provider-disconnected" ? "http-tunnel-provider-disconnected" : reason === "aborted" ? "client-aborted" : "http-tunnel-wait", + providerHttpTunnelFailureStatus(reason), + { retryable, attempts, timeoutMs, failureReason: reason ?? "timeout" }, + ); + } + attempts.push({ attempt, requestId, ok: message.ok, durationMs }); + if (!message.ok) { + const result = dockerStatusRecord(message.result); + const resultError = typeof result.error === "string" ? 
result.error : "provider HTTP tunnel failed"; + logger("warn", "http_tunnel_provider_error", { + providerId: service.providerId, + serviceId: service.id, + requestId, + attempt, + maxAttempts, + durationMs, + result: compactJson(result), + }); + if (retryable && attempt < maxAttempts) { + await Bun.sleep(Math.min(750, 100 * attempt)); + continue; + } + return tunnelErrorBody(service, requestId, "provider HTTP tunnel failed", "provider-gateway-http-fetch", 502, { + retryable, + attempts, + result: message.result, + providerError: resultError, + }); + } + const response = responseFromProviderMicroserviceResult(dockerStatusRecord(message.result), "provider-ws-http-tunnel"); + response.headers.set("x-unidesk-request-id", requestId); + response.headers.set("x-unidesk-http-tunnel-attempt", String(attempt)); + response.headers.set("x-unidesk-http-tunnel-attempts", String(attempts.length)); + response.headers.set("x-unidesk-provider-id", service.providerId); + return response; + } + return tunnelErrorBody(service, lastRequestId, "provider HTTP tunnel exhausted attempts", "http-tunnel-wait", 504, { retryable, attempts }); } async function fetchMicroserviceUpstreamResponse( @@ -901,14 +1198,29 @@ export async function microserviceRoute(req: Request, url: URL): Promise" }, 404); + if (targetPath.length === 0) return jsonResponse({ ok: false, error: "microservice route must be /status, /health, /diagnostics, /tunnel-self-test, or /proxy/" }, 404); if (suffix === "health" && method !== "GET" && method !== "HEAD") { return jsonResponse({ ok: false, error: "microservice health only supports GET/HEAD" }, 405); } + if (suffix === "diagnostics" && method !== "GET" && method !== "HEAD") { + return jsonResponse({ ok: false, error: "microservice diagnostics only supports GET/HEAD" }, 405); + } + if (suffix === "tunnel-self-test" && method !== "GET" && method !== "HEAD") { + return jsonResponse({ ok: false, error: "microservice tunnel self-test only supports GET/HEAD" }, 405); + } if 
(!isMicroserviceMethodAllowed(service, method)) { return jsonResponse({ ok: false, error: "microservice method is not allowed", serviceId, method, allowedMethods: service.backend.allowedMethods }, 405); } + if (suffix === "diagnostics") { + if (!isK3sctlManagedMicroservice(service)) return strictMicroserviceHealthResponse(service, method === "HEAD"); + return k3sctlManagedDiagnosticsResponse(service); + } + if (suffix === "tunnel-self-test") return microserviceTunnelSelfTestResponse(service); if (!isMicroservicePathAllowed(service, targetPath)) { return jsonResponse({ ok: false, error: "microservice path is not allowed", serviceId, targetPath }, 403); } @@ -951,6 +1263,14 @@ export async function microserviceRoute(req: Request, url: URL): Promise 0) { const snapshot = await cacheableResponseSnapshot(response); rememberMicroserviceCache(cacheKey, cacheTtlMs, snapshot); diff --git a/src/components/backend-core/src/types.ts b/src/components/backend-core/src/types.ts index 681160b9..f887c182 100644 --- a/src/components/backend-core/src/types.ts +++ b/src/components/backend-core/src/types.ts @@ -168,6 +168,13 @@ export interface EgressTcpConnection { provider: ProviderSocket; } +export type HttpTunnelFailureReason = + | "timeout" + | "aborted" + | "provider-disconnected" + | "provider-mismatch" + | "send-failed"; + export interface MicroserviceProxyCacheEntry { expiresAt: number; staleExpiresAt: number; @@ -193,6 +200,6 @@ export type HttpTunnelWaiter = (message: { requestId: string; ok: boolean; result: JsonValue; -} | null) => void; +} | null, reason?: HttpTunnelFailureReason) => void; export type LoggerFn = (level: "debug" | "info" | "warn" | "error", message: string, data?: JsonValue) => void; diff --git a/src/components/microservices/k3sctl-adapter/src/index.ts b/src/components/microservices/k3sctl-adapter/src/index.ts index 22e50554..91dd0839 100644 --- a/src/components/microservices/k3sctl-adapter/src/index.ts +++ 
b/src/components/microservices/k3sctl-adapter/src/index.ts @@ -723,6 +723,30 @@ function parseCurlHeaderBody(output: Buffer): { status: number; contentType: str return { status: Number.isFinite(status) ? status : 502, contentType, bodyText }; } +function bodyPreview(bodyText: string, contentType: string): JsonValue { + if (contentType.toLowerCase().includes("json")) { + try { + return JSON.parse(bodyText) as JsonValue; + } catch { + return bodyText.slice(0, 2000); + } + } + return bodyText.slice(0, 2000); +} + +function responseProbeRecord(response: Response, bodyText: string, startedAt: number): JsonRecord { + const contentType = response.headers.get("content-type") ?? "application/octet-stream"; + return { + ok: response.ok, + status: response.ok ? "healthy" : "unhealthy", + upstreamStatus: response.status, + contentType, + proxyMode: response.headers.get("x-unidesk-proxy-mode") ?? "", + durationMs: Date.now() - startedAt, + body: bodyPreview(bodyText, contentType), + }; +} + async function kubeApiServiceProxyResponse( service: ManagedService, req: Request, @@ -733,6 +757,25 @@ async function kubeApiServiceProxyResponse( return kubeApiProxyResponse(service, req, serviceProxyApiPath(service, targetPath), query, timeoutMs); } +async function kubeApiServiceProxyProbe(service: ManagedService, targetPath: string, timeoutMs: number): Promise { + const startedAt = Date.now(); + try { + const request = new Request("http://k3sctl-adapter.local/diagnostics", { method: "GET", headers: { accept: "application/json" } }); + const response = await kubeApiServiceProxyResponse(service, request, targetPath, "", timeoutMs); + const bodyText = await response.text(); + return responseProbeRecord(response, bodyText, startedAt); + } catch (error) { + return { + ok: false, + status: "unhealthy", + upstreamStatus: null, + proxyMode: "kubernetes-api-service-proxy", + durationMs: Date.now() - startedAt, + error: errorToJson(error), + }; + } +} + async function nativeServiceProxyResponse( 
service: ManagedService, req: Request, @@ -1116,6 +1159,74 @@ async function controlPlaneSnapshot(): Promise { }; } +async function serviceDiagnostics(service: ManagedService): Promise { + const checkedAt = new Date().toISOString(); + const healthPath = activeEndpoint(service).healthPath; + const healthTimeoutMs = Math.max(500, Math.min(config.healthTimeoutMs, 5000)); + const kubernetesApiServiceProxy = await kubeApiServiceProxyProbe(service, healthPath, healthTimeoutMs); + const targetServiceStartedAt = Date.now(); + let targetService: JsonRecord; + try { + const nativeRequest = new Request("http://k3sctl-adapter.local/diagnostics", { method: "GET", headers: { accept: "application/json" } }); + const native = await nativeServiceProxyResponse(service, nativeRequest.clone(), healthPath, "", healthTimeoutMs); + const response = native ?? await kubeApiServiceProxyResponse(service, nativeRequest, healthPath, "", healthTimeoutMs); + const bodyText = await response.text(); + targetService = responseProbeRecord(response, bodyText, targetServiceStartedAt); + } catch (error) { + targetService = { + ok: false, + status: "unhealthy", + upstreamStatus: null, + proxyMode: "k3sctl-service-health", + durationMs: Date.now() - targetServiceStartedAt, + error: errorToJson(error), + }; + } + const managedService = await serviceStatus(service).then((status) => ({ + ok: status.healthy === true, + status: String(status.status ?? ""), + servingHealthy: status.servingHealthy === true, + topologyComplete: status.topologyComplete === true, + activeInstanceId: String(status.activeInstanceId ?? ""), + active: status.active ?? null, + missingNodeIds: Array.isArray(status.missingNodeIds) ? 
status.missingNodeIds as JsonValue : [], + } satisfies JsonRecord)).catch((error) => ({ + ok: false, + status: "unhealthy", + error: errorToJson(error), + } satisfies JsonRecord)); + const kubernetesApiServiceProxyOk = kubernetesApiServiceProxy.ok === true; + const targetServiceOk = targetService.ok === true; + const checks = { + k3sctlAdapter: { + ok: true, + nodeId: config.nodeId, + clusterId: config.clusterId, + startedAt, + }, + kubernetesApiServiceProxy: { + ...kubernetesApiServiceProxy, + configured: kubeClient !== null, + kubeconfigPath: config.kubeconfigPath, + connectHost: config.kubeApiConnectHost, + }, + targetService, + managedService, + } satisfies Record; + const ok = kubernetesApiServiceProxyOk && targetServiceOk; + return { + ok, + service: "k3sctl-adapter", + serviceId: service.id, + namespace: service.namespace, + checkedAt, + healthPath, + route: service.route, + noFallback: true, + checks, + }; +} + function forwardHeaders(request: Request): Headers { const headers = new Headers(); for (const name of ["accept", "content-type", "x-requested-with"]) { @@ -1165,6 +1276,13 @@ async function route(req: Request): Promise { const status = await serviceStatus(service); return req.method === "HEAD" ? new Response(null, { status: status.healthy === true ? 200 : 503 }) : jsonResponse({ ok: status.healthy === true, managedService: status }, status.healthy === true ? 200 : 503); } + const diagnosticsMatch = url.pathname.match(/^\/api\/services\/([^/]+)\/diagnostics$/u); + if (diagnosticsMatch !== null && (req.method === "GET" || req.method === "HEAD")) { + const service = serviceById(decodeURIComponent(diagnosticsMatch[1] ?? "")); + if (service === null) return jsonResponse({ ok: false, error: "managed service not found" }, 404); + const diagnostics = await serviceDiagnostics(service); + return req.method === "HEAD" ? new Response(null, { status: diagnostics.ok === true ? 200 : 503 }) : jsonResponse(diagnostics, diagnostics.ok === true ? 
200 : 503); + } const proxyMatch = url.pathname.match(/^\/api\/services\/([^/]+)\/proxy(\/.*)$/u); if (proxyMatch !== null) { const service = serviceById(decodeURIComponent(proxyMatch[1] ?? "")); From aff2d9ba267c321be3f2dff144592a575babf507 Mon Sep 17 00:00:00 2001 From: Codex Date: Sun, 17 May 2026 06:48:35 +0000 Subject: [PATCH 03/15] Allow CI perf gate to fall back to any task --- scripts/ci-code-queue-read-perf.ts | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/ci-code-queue-read-perf.ts b/scripts/ci-code-queue-read-perf.ts index c7642a89..80fe6b14 100644 --- a/scripts/ci-code-queue-read-perf.ts +++ b/scripts/ci-code-queue-read-perf.ts @@ -111,6 +111,7 @@ async function candidateTasks(url: string): Promise { ...tasks.filter((task) => terminalStatus(task.status) && ((task.stepCount ?? 0) === 0 || task.stepCount === null)), ...tasks.filter((task) => terminalStatus(task.status)), ...tasks.filter((task) => !terminalStatus(task.status) && task.status !== "queued" && task.status !== "running" && task.status !== "judging"), + ...tasks, ]; const seen = new Set(); return ordered.filter((task) => { @@ -148,7 +149,7 @@ async function traceSeq(url: string, taskId: string, timeoutMs: number): Promise async function traceTarget(url: string): Promise<{ taskId: string; skippedTaskIds: string[]; selection: JsonValue }> { const tasks = await candidateTasks(url); - if (tasks.length === 0) throw new Error("Code Queue CI perf could not find a terminal task id in the production PostgreSQL task table"); + if (tasks.length === 0) throw new Error("Code Queue CI perf could not find a task id in the production PostgreSQL task table"); const target = tasks[0]; if (target === undefined) throw new Error("Code Queue CI perf could not select a task from the production PostgreSQL task table"); return { taskId: target.id, skippedTaskIds: tasks.slice(1).map((task) => task.id), selection: target as unknown as JsonValue }; From c0a6f5cdbb2167ff37f83063d8fd68a7bb9a11c4 
Mon Sep 17 00:00:00 2001 From: Codex Date: Sun, 17 May 2026 06:52:21 +0000 Subject: [PATCH 04/15] Fix CI wait success reporting --- scripts/src/ci.ts | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/src/ci.ts b/scripts/src/ci.ts index 81c50e48..31800cd5 100644 --- a/scripts/src/ci.ts +++ b/scripts/src/ci.ts @@ -278,8 +278,9 @@ function run(options: CiOptions): Record { `kubectl get pipelinerun/${shellQuote(name)} -n unidesk-ci -o json`, "exit 124", ].join("\n"))) : null; + const waitSucceeded = wait === null || wait.exitCode === 0 || wait.stdout.trimStart().startsWith("True\tSucceeded\t"); return { - ok: wait === null || wait.exitCode === 0, + ok: waitSucceeded, pipelineRun: name, namespace: "unidesk-ci", repoUrl: options.repoUrl, From 6dfdfc3fb85005b30be97f0cc8b2272261fecf65 Mon Sep 17 00:00:00 2001 From: Codex Date: Sun, 17 May 2026 07:10:47 +0000 Subject: [PATCH 05/15] Accept structured tunnel self-test timeouts --- src/components/backend-core/src/microservice-proxy.ts | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/components/backend-core/src/microservice-proxy.ts b/src/components/backend-core/src/microservice-proxy.ts index a37ec9b6..75c4e01b 100644 --- a/src/components/backend-core/src/microservice-proxy.ts +++ b/src/components/backend-core/src/microservice-proxy.ts @@ -766,7 +766,8 @@ async function microserviceTunnelSelfTestResponse(service: MicroserviceConfig): const bodyRecord = isPlainRecord(body) ? 
body : {}; const hasRequestId = typeof bodyRecord.requestId === "string" && bodyRecord.requestId.length > 0; const hasStage = typeof bodyRecord.stage === "string" && bodyRecord.stage.length > 0; - const ok = response.status === 502 && hasRequestId && hasStage && headers.requestId === bodyRecord.requestId; + const expectedStatus = response.status === 502 || response.status === 504; + const ok = expectedStatus && hasRequestId && hasStage && headers.requestId === bodyRecord.requestId; return jsonResponse({ ok, serviceId: service.id, @@ -775,7 +776,7 @@ async function microserviceTunnelSelfTestResponse(service: MicroserviceConfig): expectedFailure: true, status: response.status, checks: { - expectedStatus: response.status === 502, + expectedStatus, bodyHasRequestId: hasRequestId, bodyHasStage: hasStage, headerHasRequestId: typeof headers.requestId === "string" && headers.requestId.length > 0, From 20a7d35722fc87093be3c381afc93fbc18cae4b8 Mon Sep 17 00:00:00 2001 From: Codex Date: Sun, 17 May 2026 07:44:13 +0000 Subject: [PATCH 06/15] Tune CI read service startup window --- .../k3sctl-adapter/k3s/ci/unidesk-ci.pipeline.yaml | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/components/microservices/k3sctl-adapter/k3s/ci/unidesk-ci.pipeline.yaml b/src/components/microservices/k3sctl-adapter/k3s/ci/unidesk-ci.pipeline.yaml index d247e69c..404b4338 100644 --- a/src/components/microservices/k3sctl-adapter/k3s/ci/unidesk-ci.pipeline.yaml +++ b/src/components/microservices/k3sctl-adapter/k3s/ci/unidesk-ci.pipeline.yaml @@ -382,6 +382,13 @@ spec: periodSeconds: 5 timeoutSeconds: 3 failureThreshold: 20 + startupProbe: + httpGet: + path: /live + port: http + periodSeconds: 5 + timeoutSeconds: 3 + failureThreshold: 84 livenessProbe: httpGet: path: /live @@ -434,7 +441,7 @@ spec: -H "Content-Type: application/apply-patch+yaml" \ --data-binary @/tmp/code-queue-ci-read-01 \ 
"$kube_api/api/v1/namespaces/$kube_namespace/services/code-queue-ci-read?fieldManager=unidesk-ci&force=true" >/dev/null - deadline=$((SECONDS + 180)) + deadline=$((SECONDS + 420)) while [ "$SECONDS" -lt "$deadline" ]; do status="$(kube GET "$kube_api/apis/apps/v1/namespaces/$kube_namespace/deployments/code-queue-ci-read")" replicas="$(printf '%s' "$status" | jq -r '.spec.replicas // 1')" From c704ed5cf466ea80dba7ad45430fffb0be9019d8 Mon Sep 17 00:00:00 2001 From: Codex Date: Sun, 17 May 2026 07:49:11 +0000 Subject: [PATCH 07/15] Route Code Queue control reads to scheduler --- src/components/backend-core/src/microservice-proxy.ts | 1 + 1 file changed, 1 insertion(+) diff --git a/src/components/backend-core/src/microservice-proxy.ts b/src/components/backend-core/src/microservice-proxy.ts index 75c4e01b..3fcb2b4f 100644 --- a/src/components/backend-core/src/microservice-proxy.ts +++ b/src/components/backend-core/src/microservice-proxy.ts @@ -381,6 +381,7 @@ function isK3sctlManagedMicroservice(service: MicroserviceConfig): boolean { function codeQueueK3sServiceIdForRequest(method: string, targetPath: string): string { const normalizedMethod = method.toUpperCase(); if (targetPath === "/" || targetPath === "/health" || targetPath === "/live" || targetPath === "/api/dev-ready") return "code-queue-scheduler"; + if (targetPath === "/api/queues" || targetPath === "/api/tasks/overview") return "code-queue-scheduler"; if (targetPath === "/api/oa/backfill" || targetPath === "/api/notifications/claudeqq/drain" || targetPath === "/api/notifications/claudeqq/backfill") return "code-queue-scheduler"; if (targetPath === "/api/judge/probe" || targetPath === "/api/judge/self-test" || targetPath === "/api/queue-order/self-test" || targetPath === "/api/reference-injection/self-test" || targetPath === "/api/trace-port/self-test") return "code-queue-scheduler"; if (/^\/api\/tasks\/[^/]+\/(?:steer|interrupt)$/u.test(targetPath)) return "code-queue-scheduler"; From 
df8f6a93d5629a67227ba62d95557d70297b5e15 Mon Sep 17 00:00:00 2001 From: Codex Date: Sun, 17 May 2026 07:56:24 +0000 Subject: [PATCH 08/15] Avoid early retry for provider tunnel waits --- src/components/backend-core/src/microservice-proxy.ts | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/components/backend-core/src/microservice-proxy.ts b/src/components/backend-core/src/microservice-proxy.ts index 3fcb2b4f..92489b75 100644 --- a/src/components/backend-core/src/microservice-proxy.ts +++ b/src/components/backend-core/src/microservice-proxy.ts @@ -800,9 +800,6 @@ function canRetryProviderHttpTunnel(method: string, targetPath: string): boolean function providerHttpTunnelWaitMs(service: MicroserviceConfig, attempt: number, retryable: boolean): number { const baseTimeoutMs = Math.max(1000, service.backend.timeoutMs); - if (!retryable) return baseTimeoutMs + 3000; - if (attempt === 1) return Math.min(baseTimeoutMs + 3000, Math.max(5000, Math.floor(baseTimeoutMs * 0.45))); - if (attempt === 2) return Math.min(baseTimeoutMs + 3000, Math.max(6000, Math.floor(baseTimeoutMs * 0.7))); return baseTimeoutMs + 3000; } @@ -946,7 +943,7 @@ async function providerHttpTunnelMicroserviceResponse( const durationMs = Date.now() - startedAt; if (message === null) { attempts.push({ attempt, requestId, ok: false, reason: reason ?? 
"timeout", durationMs, timeoutMs }); - if (retryable && tunnelFailureRetryable(reason) && attempt < maxAttempts) { + if (retryable && reason !== "timeout" && tunnelFailureRetryable(reason) && attempt < maxAttempts) { logger("warn", "http_tunnel_retry", { providerId: service.providerId, serviceId: service.id, From 7be078786be5808eec16f02fbdf2b196fbac314d Mon Sep 17 00:00:00 2001 From: Codex Date: Sun, 17 May 2026 07:57:43 +0000 Subject: [PATCH 09/15] Reset CI read service between runs --- .../k3s/ci/unidesk-ci.pipeline.yaml | 33 +++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/src/components/microservices/k3sctl-adapter/k3s/ci/unidesk-ci.pipeline.yaml b/src/components/microservices/k3sctl-adapter/k3s/ci/unidesk-ci.pipeline.yaml index 404b4338..6e3964e6 100644 --- a/src/components/microservices/k3sctl-adapter/k3s/ci/unidesk-ci.pipeline.yaml +++ b/src/components/microservices/k3sctl-adapter/k3s/ci/unidesk-ci.pipeline.yaml @@ -270,6 +270,37 @@ spec: shift curl -fsS --cacert "$kube_ca" -H "Authorization: Bearer $kube_token" -X "$method" "$@" } + delete_if_exists() { + local path="$1" + local code + code="$(curl -sS -o /tmp/unidesk-ci-delete-response -w "%{http_code}" --cacert "$kube_ca" -H "Authorization: Bearer $kube_token" -X DELETE "$kube_api/$path")" + if [ "$code" = "200" ] || [ "$code" = "202" ] || [ "$code" = "404" ]; then + return 0 + fi + cat /tmp/unidesk-ci-delete-response >&2 + return 1 + } + wait_deleted() { + local path="$1" + local deadline=$((SECONDS + 120)) + local code + while [ "$SECONDS" -lt "$deadline" ]; do + code="$(curl -sS -o /tmp/unidesk-ci-get-response -w "%{http_code}" --cacert "$kube_ca" -H "Authorization: Bearer $kube_token" "$kube_api/$path")" + if [ "$code" = "404" ]; then + return 0 + fi + if [ "$code" != "200" ]; then + cat /tmp/unidesk-ci-get-response >&2 + return 1 + fi + sleep 2 + done + echo "timeout waiting for $path deletion" >&2 + return 1 + } + delete_if_exists 
"apis/apps/v1/namespaces/$kube_namespace/deployments/code-queue-ci-read" + delete_if_exists "api/v1/namespaces/$kube_namespace/services/code-queue-ci-read" + wait_deleted "apis/apps/v1/namespaces/$kube_namespace/deployments/code-queue-ci-read" cat >/tmp/code-queue-ci-read-deployment.yaml < Date: Sun, 17 May 2026 06:54:30 +0000 Subject: [PATCH 10/15] fix: guard code queue claim move race --- docs/reference/cli.md | 2 +- .../microservices/code-queue/src/index.ts | 448 ++++++++++++++++-- .../microservices/code-queue/src/queue-api.ts | 39 +- .../code-queue/src/self-tests.ts | 44 +- 4 files changed, 488 insertions(+), 45 deletions(-) diff --git a/docs/reference/cli.md b/docs/reference/cli.md index 49caaad6..0165f35c 100644 --- a/docs/reference/cli.md +++ b/docs/reference/cli.md @@ -26,7 +26,7 @@ UniDesk 的统一 CLI 入口是根目录 `scripts/cli.ts`,运行方式固定 - `codex task --trace --tail|--from-start|--after-seq N|--before-seq N --limit N` 按页拉取 Code Queue 的逻辑 trace;响应会返回 `nextAfterSeq`、`previousBeforeSeq`、`hasMore`、`hasBefore` 和下一页/上一页命令,默认 `--trace` 取最新一页,需要完整 prompt/最后 response 时加 `--full`。 - `codex output --tail|--from-start|--after-seq N|--before-seq N --limit N [--full-text]` 按原始 output seq 分页读取底层记录;当 trace 行提示 `commandOmittedLines`、`bodyOmittedLines` 或 `rawSeqs` 时,用该命令按 seq 补取完整信息,默认仍有单条文本预览上限,显式 `--full-text` 才返回该页全文。 - `codex judge --attempt N [--dry-run] [--include-prompt]` 通过 Code Queue 私有代理按指定 attempt 单步复现 judge;后端会从 PostgreSQL task JSON 与 output 归档重建该 attempt 在真实队列 worker 中的 `QueueTask`/`CodexRunResult`,再调用同一套 judge prompt builder 和 MiniMax 请求路径。默认会真实调用 MiniMax,`--dry-run` 只返回 prompt/payload 大小、attempt 窗口和重建来源诊断,`--include-prompt` 仅用于本地深度排查。 -- Code Queue 多队列 lane 由 `codex` 命令命名空间管理:`queues` 列表、`queue create ` 创建、`queue merge --into ` 合并、`move --queue ` 迁移;同一个 queue 内部串行执行,不同 queue 之间并行执行。合并会移动任务归属并自动删除源 queue 记录,只保留合并后的目标 queue;合并后的目标 queue 按任务原 `queueEnteredAt`/`createdAt` 时间顺序串行。迁移 queued/retry_wait 任务后会立即调度目标 queue。 +- Code Queue 多队列 lane 由 `codex` 命令命名空间管理:`queues` 
列表、`queue create ` 创建、`queue merge --into ` 合并、`move --queue ` 迁移;同一个 queue 内部串行执行,不同 queue 之间并行执行。迁移只允许尚未被 scheduler claim 的 `queued`/`retry_wait` 任务,必须满足 `startedAt=null`、`currentAttempt=0` 且没有 active thread/turn;已进入 `running`/`judging` 或已有 claim 标记的任务返回 409,不得被 move/merge 回写成 queued。合并会移动可迁移任务归属并自动删除源 queue 记录,只保留合并后的目标 queue;若 source 或 target queue 存在 active/claimed 任务,合并整体返回 409。合并后的目标 queue 按任务原 `queueEnteredAt`/`createdAt` 时间顺序串行,成功迁移 queued/retry_wait 任务后会立即调度目标 queue。 - `job list` 与 `job status` 查询 `.state/jobs/` 文件系统状态,是异步命令的可观测入口。 - `debug health`、`debug dispatch` 与 `debug task` 走真实内部 core、WebSocket、数据库、provider、系统指标、Docker 状态和 Host SSH 维护桥流程,只用于开发调试,不写入 `TEST.md` 的正式验收步骤。 - `e2e run [--only pattern[,pattern...]] [--skip pattern[,pattern...]]` 使用 publicHost 派生的公开 frontend/provider ingress URL,并通过 Docker 内网验证 core API、PostgreSQL、provider self-connection、系统指标曲线、Docker 状态快照、provider.upgrade 预检和 Playwright 前端页面,是交付前的自动化 E2E 门禁;CLI 默认输出 check 状态摘要,完整诊断写入 `resultPath`,日常迭代应优先用 `--only` / `--skip` 跑最小必要集合。 diff --git a/src/components/microservices/code-queue/src/index.ts b/src/components/microservices/code-queue/src/index.ts index fe758aa6..571186ba 100644 --- a/src/components/microservices/code-queue/src/index.ts +++ b/src/components/microservices/code-queue/src/index.ts @@ -128,7 +128,7 @@ import { readOaTraceStatsForTaskAttempts, readOaTraceStatsForTasks, } from "./oa-events"; -import { configureSelfTests, runJudgeInfraSelfTest, runQueueOrderingSelfTest, runReferenceInjectionSelfTest, runTracePortSelfTest } from "./self-tests"; +import { configureSelfTests, runJudgeInfraSelfTest, runQueueClaimMoveSelfTest, runQueueOrderingSelfTest, runReferenceInjectionSelfTest, runTracePortSelfTest } from "./self-tests"; import { codexToolLifecycleStartedBeforeIn, configureTaskView, @@ -1185,7 +1185,12 @@ function updateNextSeqFromTasks(): void { state.nextSeq = nextSeq; } -async function upsertTaskToDatabase(client: SqlExecutor, task: QueueTask): Promise { +interface 
UpsertTaskOptions { + claimQueueId?: string | null; +} + +async function upsertTaskToDatabase(client: SqlExecutor, task: QueueTask, options: UpsertTaskOptions = {}): Promise { + const claimQueueId = options.claimQueueId ?? null; const rows = await client>` INSERT INTO unidesk_code_queue_tasks ( id, @@ -1292,9 +1297,62 @@ async function upsertTaskToDatabase(client: SqlExecutor, task: QueueTask): Promi END, true ) + WHERE ( + ${claimQueueId === null} + OR ( + unidesk_code_queue_tasks.queue_id = ${claimQueueId} + AND ( + ( + unidesk_code_queue_tasks.status = 'queued' + AND unidesk_code_queue_tasks.started_at IS NULL + AND unidesk_code_queue_tasks.current_attempt = 0 + AND unidesk_code_queue_tasks.codex_thread_id IS NULL + AND unidesk_code_queue_tasks.active_turn_id IS NULL + ) + OR ( + unidesk_code_queue_tasks.status = 'retry_wait' + AND unidesk_code_queue_tasks.active_turn_id IS NULL + ) + ) + ) + ) + AND NOT ( + EXCLUDED.status IN ('queued', 'retry_wait') + AND EXCLUDED.started_at IS NULL + AND EXCLUDED.current_attempt = 0 + AND EXCLUDED.codex_thread_id IS NULL + AND EXCLUDED.active_turn_id IS NULL + AND ( + unidesk_code_queue_tasks.status IN ('running', 'judging') + OR unidesk_code_queue_tasks.started_at IS NOT NULL + OR unidesk_code_queue_tasks.current_attempt > 0 + OR unidesk_code_queue_tasks.codex_thread_id IS NOT NULL + OR unidesk_code_queue_tasks.active_turn_id IS NOT NULL + ) + ) RETURNING read_at `; + if (rows.length === 0) { + const current = await client` + SELECT id, queue_id, status, started_at, current_attempt, codex_thread_id, active_turn_id + FROM unidesk_code_queue_tasks + WHERE id = ${task.id} + LIMIT 1 + `; + logger("warn", "database_task_stale_unclaimed_write_rejected", { + taskId: task.id, + attemptedQueueId: queueIdOf(task), + attemptedStatus: task.status, + attemptedStartedAt: task.startedAt, + attemptedCurrentAttempt: task.currentAttempt, + attemptedCodexThreadId: task.codexThreadId, + attemptedActiveTurnId: task.activeTurnId, + current: 
databaseStatusRowJson(current[0] ?? null), + }); + return false; + } task.readAt = timestampToIso(rows[0]?.read_at ?? null); + return true; } async function upsertQueueToDatabase(client: SqlExecutor, queue: QueueRecord): Promise { @@ -1347,6 +1405,16 @@ interface DatabaseTaskRow { task_json: unknown; } +interface DatabaseTaskStatusRow { + id: string; + queue_id: string; + status: TaskStatus; + started_at: Date | string | null; + current_attempt: number | string | null; + codex_thread_id: string | null; + active_turn_id: string | null; +} + interface DatabaseQueueRow { id: string; name: string; @@ -1376,6 +1444,161 @@ function normalizeDatabaseTaskRows(rows: DatabaseTaskRow[], source: string): Que return tasks.sort((left, right) => (timestampMs(left.createdAt) ?? 0) - (timestampMs(right.createdAt) ?? 0) || left.id.localeCompare(right.id)); } +function databaseStatusRowJson(row: DatabaseTaskStatusRow | null): JsonValue { + if (row === null) return null; + return { + id: row.id, + queueId: safeQueueId(row.queue_id), + status: row.status, + startedAt: timestampToIso(row.started_at), + currentAttempt: Number(row.current_attempt ?? 0), + codexThreadId: row.codex_thread_id, + activeTurnId: row.active_turn_id, + }; +} + +function taskIsUnclaimedMovable(task: QueueTask): boolean { + return (task.status === "queued" || task.status === "retry_wait") + && task.startedAt === null + && task.currentAttempt === 0 + && task.codexThreadId === null + && task.activeTurnId === null; +} + +function databaseTaskMoveBlocker(row: DatabaseTaskStatusRow | null): string { + if (row === null) return "task not found"; + if (row.status !== "queued" && row.status !== "retry_wait") return `status=${row.status}`; + if (row.started_at !== null) return "task already has started_at"; + if (Number(row.current_attempt ?? 0) !== 0) return `task already has current_attempt=${Number(row.current_attempt ?? 
0)}`; + if (row.codex_thread_id !== null) return "task already has codex_thread_id"; + if (row.active_turn_id !== null) return "task already has active_turn_id"; + return ""; +} + +function taskMoveBlocker(task: QueueTask): string { + if (activeRunForTask(task) !== null) return "task has an active agent run"; + if (processingQueues.has(queueIdOf(task))) return "queue processor is currently active"; + if (activeRunSlotReservations.has(queueIdOf(task))) return "queue is reserving an active run slot"; + if (activeRunSlotWaiters.some((waiter) => waiter.taskId === task.id || waiter.queueId === queueIdOf(task))) return "queue is waiting for an active run slot"; + if (task.status !== "queued" && task.status !== "retry_wait") return `status=${task.status}`; + if (!taskIsUnclaimedMovable(task)) return "task has already been claimed"; + return ""; +} + +function reconcileHotTaskFromDatabase(task: QueueTask): QueueTask { + const existing = findTask(task.id); + if (existing === null) return rememberHotTask(task); + if (activeRunForTask(existing) !== null) return existing; + Object.assign(existing, task); + return existing; +} + +function taskHasClaimMarkers(task: QueueTask): boolean { + return task.status === "running" + || task.status === "judging" + || task.startedAt !== null + || task.currentAttempt > 0 + || task.codexThreadId !== null + || task.activeTurnId !== null; +} + +function shouldPreferHotTaskOverDatabase(hotTask: QueueTask, databaseTask: QueueTask): boolean { + if (activeRunForTask(hotTask) !== null) return true; + if (taskIsUnclaimedMovable(hotTask) && taskHasClaimMarkers(databaseTask)) return false; + const hotUpdatedAt = timestampMs(hotTask.updatedAt) ?? 0; + const databaseUpdatedAt = timestampMs(databaseTask.updatedAt) ?? 
0; + return hotUpdatedAt >= databaseUpdatedAt; +} + +async function deleteTaskFromDatabase(taskId: string): Promise { + if (!databaseReady) return; + await sql` + DELETE FROM unidesk_code_queue_tasks + WHERE id = ${taskId} + `; +} + +async function claimTaskInDatabase(task: QueueTask, expectedQueueId: string): Promise { + if (!databaseReady) return true; + const claimed = await sql.begin(async (client) => await upsertTaskToDatabase(client, task, { claimQueueId: expectedQueueId })); + if (claimed) return true; + const databaseTask = await loadTaskFromDatabase(task.id); + if (databaseTask !== null) reconcileHotTaskFromDatabase(databaseTask); + logger("warn", "task_claim_conflict", { + taskId: task.id, + expectedQueueId, + attemptedQueueId: queueIdOf(task), + attemptedStatus: task.status, + attemptedCurrentAttempt: task.currentAttempt, + }); + return false; +} + +async function runDatabaseClaimMoveSelfTest(): Promise { + if (!databaseReady) return null; + const suffix = String(Date.now()); + const taskId = `codex_claim_move_db_${suffix}`; + const queuedAt = nowIso(); + const sourceQueueId = `claim_move_db_source_${suffix}`; + const targetQueueId = `claim_move_db_target_${suffix}`; + const before = state.tasks.slice(); + const beforeQueues = state.queues.slice(); + await deleteTaskFromDatabase(taskId); + try { + const queuedTask = normalizeTask({ + ...createTask({ prompt: "claim/move DB race self-test", queueId: sourceQueueId }), + id: taskId, + queueId: sourceQueueId, + queueEnteredAt: queuedAt, + createdAt: queuedAt, + updatedAt: queuedAt, + output: [], + }); + await sql.begin(async (client) => { + await upsertQueueToDatabase(client, { id: sourceQueueId, name: sourceQueueId, createdAt: queuedAt, updatedAt: queuedAt }); + await upsertTaskToDatabase(client, queuedTask); + }); + const staleHotTask = normalizeTask(JSON.parse(JSON.stringify(queuedTask)) as QueueTask); + const claimedTask = normalizeTask(JSON.parse(JSON.stringify(queuedTask)) as QueueTask); + const 
claimedAt = nowIso(); + claimedTask.status = "running"; + claimedTask.startedAt = claimedAt; + claimedTask.currentAttempt = 1; + claimedTask.currentMode = "initial"; + claimedTask.updatedAt = claimedAt; + const claimed = await claimTaskInDatabase(claimedTask, sourceQueueId); + if (!claimed) throw new Error("database claim self-test failed to claim queued task"); + state.tasks.splice(0, state.tasks.length, staleHotTask); + const response = await moveTaskToQueue(staleHotTask, new Request(`http://code-queue.local/api/tasks/${taskId}/move`, { + method: "POST", + body: JSON.stringify({ queueId: targetQueueId }), + headers: { "content-type": "application/json" }, + })); + const after = await loadTaskFromDatabase(taskId); + const body = await response.json() as Record; + if (response.status !== 409) throw new Error(`database stale move should return 409, got ${response.status}`); + if (after === null) throw new Error("database self-test task disappeared after stale move"); + if (after.status !== "running") throw new Error(`database self-test task status changed to ${after.status}`); + if (queueIdOf(after) !== sourceQueueId) throw new Error(`database self-test task queue changed to ${queueIdOf(after)}`); + if (after.currentAttempt !== 1 || after.startedAt === null) throw new Error("database self-test task claim markers were lost"); + return { + ok: true, + taskId, + moveStatus: response.status, + databaseStatus: after.status, + databaseQueueId: queueIdOf(after), + currentAttempt: after.currentAttempt, + startedAt: after.startedAt, + response: body as JsonValue, + } as unknown as JsonValue; + } finally { + await deleteTaskFromDatabase(taskId); + await deleteDatabaseQueues([sourceQueueId, targetQueueId]); + state.tasks.splice(0, state.tasks.length, ...before); + state.queues.splice(0, state.queues.length, ...beforeQueues); + } +} + async function loadPrunedDatabaseTaskRows(where: "all" | "hot"): Promise { return await sql` SELECT id, updated_at, status, read_at, task_json @@ 
-1674,12 +1897,21 @@ function rememberHotTask(task: QueueTask): QueueTask { } async function findTaskForRead(taskId: string): Promise { - return findTask(taskId) ?? await loadTaskFromDatabase(taskId); + const hotTask = findTask(taskId); + if (!databaseReady) return hotTask; + const databaseTask = await loadTaskFromDatabase(taskId); + if (hotTask === null) return databaseTask; + if (databaseTask === null) return hotTask; + return shouldPreferHotTaskOverDatabase(hotTask, databaseTask) ? hotTask : databaseTask; } async function findTaskForMutation(taskId: string): Promise { - const task = findTask(taskId) ?? await loadTaskFromDatabase(taskId); - return task === null ? null : rememberHotTask(task); + const hotTask = findTask(taskId); + if (!databaseReady) return hotTask; + const databaseTask = await loadTaskFromDatabase(taskId); + if (databaseTask === null) return hotTask; + if (hotTask === null) return rememberHotTask(databaseTask); + return shouldPreferHotTaskOverDatabase(hotTask, databaseTask) ? 
hotTask : reconcileHotTaskFromDatabase(databaseTask); } async function loadNextSeqFromDatabase(): Promise { @@ -1703,6 +1935,7 @@ async function flushDirtyTasksToDatabase(force = false): Promise { dirtyDatabaseTaskIds.clear(); dirtyDatabaseQueueIds.clear(); databaseFlushInFlight = true; + const rejectedTaskIds: string[] = []; try { await sql.begin(async (client) => { for (const id of queueIds) { @@ -1711,7 +1944,7 @@ async function flushDirtyTasksToDatabase(force = false): Promise { } for (const id of ids) { const task = state.tasks.find((item) => item.id === id); - if (task !== undefined) await upsertTaskToDatabase(client, task); + if (task !== undefined && !await upsertTaskToDatabase(client, task)) rejectedTaskIds.push(id); } }); databaseLastError = null; @@ -1723,6 +1956,10 @@ async function flushDirtyTasksToDatabase(force = false): Promise { databaseFlushInFlight = false; if (dirtyDatabaseTaskIds.size > 0 || dirtyDatabaseQueueIds.size > 0) scheduleDatabaseFlush(); } + for (const id of rejectedTaskIds) { + const databaseTask = await loadTaskFromDatabase(id); + if (databaseTask !== null) reconcileHotTaskFromDatabase(databaseTask); + } } async function initDatabasePersistence(): Promise { @@ -2446,6 +2683,7 @@ configureSelfTests({ defaultQueueId, enqueueActiveRunSlotWaiter, injectReferencedTaskContext, + moveTaskToQueueForTest: (task, req) => moveTaskToQueue(task, req, { bypassRoleCheck: true }), nextRunnableTaskFrom, normalizeTask, nowIso, @@ -2454,6 +2692,8 @@ configureSelfTests({ queuedStatusReason, removeActiveRunSlotWaiter, resolveReasoningEffort, + runDatabaseClaimMoveSelfTest, + tasks: () => state.tasks, updateProcessingFlag, }); @@ -2979,7 +3219,8 @@ function failTaskForFallbackRetryLimit(task: QueueTask, judge: JudgeResult | nul } async function runTask(task: QueueTask): Promise { - logger("info", "task_processor_start", { taskId: task.id, queueId: queueIdOf(task), providerId: task.providerId, executionMode: task.executionMode, cwd: task.cwd, maxAttempts: 
task.maxAttempts, model: task.model, agentPort: codeAgentPortForModel(task.model), promptPreview: safePreview(task.prompt, 240) }); + const claimQueueId = queueIdOf(task); + logger("info", "task_processor_start", { taskId: task.id, queueId: claimQueueId, providerId: task.providerId, executionMode: task.executionMode, cwd: task.cwd, maxAttempts: task.maxAttempts, model: task.model, agentPort: codeAgentPortForModel(task.model), promptPreview: safePreview(task.prompt, 240) }); if (task.status === "retry_wait" && task.lastJudge?.source === "fallback" && task.lastJudge.decision === "retry" && fallbackJudgeRetryCount(task) >= fallbackJudgeRetryLimit) { failTaskForFallbackRetryLimit(task, task.lastJudge); return; @@ -3010,6 +3251,11 @@ async function runTask(task: QueueTask): Promise { task.readAt = null; task.finishedAt = null; task.updatedAt = startedAt; + if (!await claimTaskInDatabase(task, claimQueueId)) { + releaseRunSlot(); + return; + } + publishTaskOaEvent(task, "claim"); logger("info", "task_run_start", { taskId: task.id, queueId: queueIdOf(task), attempt: task.currentAttempt, mode, providerId: task.providerId, executionMode: task.executionMode, cwd: task.cwd, maxAttempts: task.maxAttempts, model: task.model, agentPort: codeAgentPortForModel(task.model), freshRecovery: needsFreshRecoveryPrompt }); const attemptStartOutput = appendOutput(task, "system", `attempt ${task.currentAttempt}/${task.maxAttempts} queue=${queueIdOf(task)} provider=${task.providerId} executionMode=${task.executionMode} cwd=${task.cwd} mode=${mode} model=${task.model} port=${codeAgentPortForModel(task.model)}\n`, "queue"); @@ -3997,7 +4243,9 @@ function queueMergeBlocker(queueId: string): string | null { if (activeRunSlotReservations.has(queueId)) return "queue is reserving an active run slot"; if (activeRunSlotWaiters.some((waiter) => waiter.queueId === queueId)) return "queue is waiting for an active run slot"; const activeTask = state.tasks.find((task) => queueIdOf(task) === queueId && 
(task.status === "running" || task.status === "judging")); - return activeTask === undefined ? null : `task ${activeTask.id} is ${activeTask.status}`; + if (activeTask !== undefined) return `task ${activeTask.id} is ${activeTask.status}`; + const claimedPendingTask = state.tasks.find((task) => queueIdOf(task) === queueId && (task.status === "queued" || task.status === "retry_wait") && !taskIsUnclaimedMovable(task)); + return claimedPendingTask === undefined ? null : `task ${claimedPendingTask.id} has already been claimed`; } function parseSourceQueueIds(record: Record, targetQueueId: string): string[] { @@ -4020,27 +4268,117 @@ function parseSourceQueueIds(record: Record, targetQueueId: str return ids; } -async function mergeDatabaseQueueTasks(sourceQueueIds: string[], targetQueueId: string): Promise { - if (!databaseReady || sourceQueueIds.length === 0) return []; - const rows = await sql>` - UPDATE unidesk_code_queue_tasks - SET - queue_id = ${targetQueueId}, - task_json = jsonb_set( - jsonb_set( - task_json, - '{queueId}', - to_jsonb(${targetQueueId}::text), +async function mergeDatabaseQueueTasks(sourceQueueIds: string[], targetQueueId: string, mergedAt: string): Promise<{ movedTaskIds: string[]; blocker: DatabaseTaskStatusRow | null }> { + if (!databaseReady || sourceQueueIds.length === 0) return { movedTaskIds: [], blocker: null }; + return await sql.begin(async (client) => { + const mergeQueueIds = Array.from(new Set([targetQueueId, ...sourceQueueIds])); + const lockedRows = await client` + SELECT id, queue_id, status, started_at, current_attempt, codex_thread_id, active_turn_id + FROM unidesk_code_queue_tasks + WHERE queue_id IN ${client(mergeQueueIds)} + ORDER BY updated_at DESC, id DESC + FOR UPDATE + `; + const blocker = lockedRows.find((row) => { + return row.status === "running" + || row.status === "judging" + || ( + (row.status === "queued" || row.status === "retry_wait") + && ( + row.started_at !== null + || Number(row.current_attempt ?? 
0) > 0 + || row.codex_thread_id !== null + || row.active_turn_id !== null + ) + ); + }) ?? null; + if (blocker !== null) return { movedTaskIds: [], blocker }; + const rows = await client>` + UPDATE unidesk_code_queue_tasks + SET + queue_id = ${targetQueueId}, + updated_at = ${mergedAt}, + task_json = jsonb_set( + jsonb_set( + jsonb_set( + task_json, + '{queueId}', + to_jsonb(${targetQueueId}::text), + true + ), + '{queueEnteredAt}', + to_jsonb(COALESCE(NULLIF(task_json->>'queueEnteredAt', ''), to_char(created_at AT TIME ZONE 'UTC', 'YYYY-MM-DD"T"HH24:MI:SS.MS"Z"'))::text), + true + ), + '{updatedAt}', + to_jsonb(${mergedAt}::text), true - ), - '{queueEnteredAt}', - to_jsonb(COALESCE(NULLIF(task_json->>'queueEnteredAt', ''), to_char(created_at AT TIME ZONE 'UTC', 'YYYY-MM-DD"T"HH24:MI:SS.MS"Z"'))::text), - true - ) - WHERE queue_id IN ${sql(sourceQueueIds)} - RETURNING id - `; - return rows.map((row) => row.id); + ) + WHERE queue_id IN ${client(sourceQueueIds)} + AND ( + status IN ('succeeded', 'failed', 'canceled') + OR ( + status IN ('queued', 'retry_wait') + AND started_at IS NULL + AND current_attempt = 0 + AND codex_thread_id IS NULL + AND active_turn_id IS NULL + ) + ) + RETURNING id + `; + return { movedTaskIds: rows.map((row) => row.id), blocker: null }; + }); +} + +async function moveDatabaseTaskToQueue(taskId: string, targetQueueId: string, movedAt: string): Promise<{ ok: boolean; row: DatabaseTaskStatusRow | null; previousQueueId: string | null; blocker: string }> { + if (!databaseReady) return { ok: true, row: null, previousQueueId: null, blocker: "" }; + return await sql.begin(async (client) => { + const rows = await client` + SELECT id, queue_id, status, started_at, current_attempt, codex_thread_id, active_turn_id + FROM unidesk_code_queue_tasks + WHERE id = ${taskId} + LIMIT 1 + FOR UPDATE + `; + const row = rows[0] ?? 
null; + const blocker = databaseTaskMoveBlocker(row); + if (blocker.length > 0) return { ok: false, row, previousQueueId: row === null ? null : safeQueueId(row.queue_id), blocker }; + const previousQueueId = safeQueueId(row?.queue_id); + const updated = await client` + UPDATE unidesk_code_queue_tasks + SET + queue_id = ${targetQueueId}, + updated_at = ${movedAt}, + task_json = jsonb_set( + jsonb_set( + jsonb_set( + task_json, + '{queueId}', + to_jsonb(${targetQueueId}::text), + true + ), + '{queueEnteredAt}', + to_jsonb(${movedAt}::text), + true + ), + '{updatedAt}', + to_jsonb(${movedAt}::text), + true + ) + WHERE id = ${taskId} + AND status IN ('queued', 'retry_wait') + AND started_at IS NULL + AND current_attempt = 0 + AND codex_thread_id IS NULL + AND active_turn_id IS NULL + RETURNING id, queue_id, status, started_at, current_attempt, codex_thread_id, active_turn_id + `; + const updatedRow = updated[0] ?? null; + return updatedRow === null + ? { ok: false, row, previousQueueId, blocker: "conditional update matched no rows" } + : { ok: true, row: updatedRow, previousQueueId, blocker: "" }; + }); } async function deleteDatabaseQueues(queueIds: string[]): Promise { @@ -4097,6 +4435,17 @@ async function mergeQueues(targetQueueIdValue: string | null, req: Request): Pro } const mergedAt = nowIso(); + const databaseMerge = await mergeDatabaseQueueTasks(sourceQueueIds, targetQueueId, mergedAt); + if (databaseMerge.blocker !== null) { + const blockerQueueId = safeQueueId(databaseMerge.blocker.queue_id); + const databaseTask = await loadTaskFromDatabase(databaseMerge.blocker.id); + if (databaseTask !== null) reconcileHotTaskFromDatabase(databaseTask); + return jsonResponse({ + ok: false, + error: `cannot merge queue ${blockerQueueId}: task ${databaseMerge.blocker.id} is already claimed (${databaseTaskMoveBlocker(databaseMerge.blocker) || databaseMerge.blocker.status})`, + blocker: databaseStatusRowJson(databaseMerge.blocker), + }, 409); + } const targetQueue = 
ensureQueue(targetQueueId); const sourceQueues = sourceQueueIds.map((id) => queueSnapshot(id, mergedAt)); targetQueue.updatedAt = mergedAt; @@ -4109,11 +4458,11 @@ async function mergeQueues(targetQueueIdValue: string | null, req: Request): Pro if (!sourceSet.has(previousQueueId)) continue; task.queueEnteredAt = taskQueueEnteredAt(task); task.queueId = targetQueueId; + task.updatedAt = mergedAt; hotMovedTasks.push(task); markTaskDirty(task.id); publishTaskOaEvent(task, "queue-merged"); } - const databaseMovedTaskIds = await mergeDatabaseQueueTasks(sourceQueueIds, targetQueueId); const deletedSourceQueues = deleteQueuesFromState(sourceQueueIds); const databaseDeletedQueueIds = await deleteDatabaseQueues(sourceQueueIds); persistState(false); @@ -4125,14 +4474,14 @@ async function mergeQueues(targetQueueIdValue: string | null, req: Request): Pro sourceQueueIds, deletedSourceQueueIds: deletedSourceQueues.map((queue) => queue.id), hotMovedTaskCount: hotMovedTasks.length, - databaseMovedTaskCount: databaseReady ? databaseMovedTaskIds.length : null, + databaseMovedTaskCount: databaseReady ? databaseMerge.movedTaskIds.length : null, databaseDeletedQueueIds: databaseReady ? databaseDeletedQueueIds : null, }); for (const id of mergeQueueIds) mergingQueues.delete(id); scheduleQueue(targetQueueId); await flushDirtyTasksToDatabase(true); const tasks = await loadAllTasksForRead(); - const movedIdSet = new Set(databaseReady ? databaseMovedTaskIds : hotMovedTasks.map((task) => task.id)); + const movedIdSet = new Set(databaseReady ? databaseMerge.movedTaskIds : hotMovedTasks.map((task) => task.id)); const orderedMovedTaskIds = tasks .filter((task) => movedIdSet.has(task.id)) .sort(compareTaskQueueOrder) @@ -4145,7 +4494,7 @@ async function mergeQueues(targetQueueIdValue: string | null, req: Request): Pro ok: true, targetQueueId, sourceQueueIds, - mergedTaskCount: databaseReady ? databaseMovedTaskIds.length : hotMovedTasks.length, + mergedTaskCount: databaseReady ? 
databaseMerge.movedTaskIds.length : hotMovedTasks.length, movedTaskIds: orderedMovedTaskIds.slice(0, 500), targetTaskOrder: targetTaskOrder.slice(0, 500), order: "merged tasks keep their original queueEnteredAt/createdAt ordering; source queue records are deleted after merge", @@ -4161,17 +4510,37 @@ async function mergeQueues(targetQueueIdValue: string | null, req: Request): Pro } } -async function moveTaskToQueue(task: QueueTask, req: Request): Promise { - if (!serviceRoleAllowsWrite(config.serviceRole)) return readOnlyRejectResponse(req.method, `/api/tasks/${task.id}/move`); - if (task.status === "running" || task.status === "judging") { - return jsonResponse({ ok: false, error: `cannot move active task ${task.id} while status=${task.status}`, task: taskForResponse(task) }, 409); - } +async function moveTaskToQueue(task: QueueTask, req: Request, options: { bypassRoleCheck?: boolean } = {}): Promise { + if (options.bypassRoleCheck !== true && !serviceRoleAllowsWrite(config.serviceRole)) return readOnlyRejectResponse(req.method, `/api/tasks/${task.id}/move`); const body = await readJson(req); const record = typeof body === "object" && body !== null && !Array.isArray(body) ? body as Record : {}; const queueId = normalizeQueueId(record.queueId ?? record.id); - const previousQueueId = queueIdOf(task); - const queue = ensureQueue(queueId); const movedAt = nowIso(); + const hotBlocker = taskMoveBlocker(task); + if (hotBlocker.length > 0) { + const databaseTask = databaseReady ? await loadTaskFromDatabase(task.id) : null; + if (databaseTask !== null) task = reconcileHotTaskFromDatabase(databaseTask); + return jsonResponse({ + ok: false, + error: `cannot move task ${task.id}: ${hotBlocker}`, + task: taskForResponse(task), + databaseTask: databaseTask === null ? null : taskForResponse(databaseTask), + }, 409); + } + const databaseMove = await moveDatabaseTaskToQueue(task.id, queueId, movedAt); + if (!databaseMove.ok) { + const databaseTask = databaseReady ? 
await loadTaskFromDatabase(task.id) : null; + if (databaseTask !== null) task = reconcileHotTaskFromDatabase(databaseTask); + return jsonResponse({ + ok: false, + error: `cannot move task ${task.id}: ${databaseMove.blocker}`, + blocker: databaseStatusRowJson(databaseMove.row), + task: taskForResponse(task), + databaseTask: databaseTask === null ? null : taskForResponse(databaseTask), + }, databaseMove.row === null ? 404 : 409); + } + const previousQueueId = databaseMove.previousQueueId ?? queueIdOf(task); + const queue = ensureQueue(queueId); queue.updatedAt = movedAt; markQueueDirty(queue.id); task.queueId = queueId; @@ -4301,6 +4670,7 @@ async function route(req: Request): Promise { if (url.pathname === "/api/judge/probe" && (req.method === "GET" || req.method === "POST")) return await runJudgeProbe(); if (url.pathname === "/api/judge/self-test" && (req.method === "GET" || req.method === "POST")) return jsonResponse(runJudgeInfraSelfTest()); if (url.pathname === "/api/queue-order/self-test" && (req.method === "GET" || req.method === "POST")) return jsonResponse(runQueueOrderingSelfTest()); + if (url.pathname === "/api/queue-claim-move/self-test" && (req.method === "GET" || req.method === "POST")) return jsonResponse(await runQueueClaimMoveSelfTest()); if (url.pathname === "/api/reference-injection/self-test" && (req.method === "GET" || req.method === "POST")) return jsonResponse(await runReferenceInjectionSelfTest()); if (url.pathname === "/api/trace-port/self-test" && (req.method === "GET" || req.method === "POST")) return jsonResponse(runTracePortSelfTest()); if (url.pathname === "/api/oa/backfill" && (req.method === "GET" || req.method === "POST")) return jsonResponse(await backfillOaTraceStats(url)); diff --git a/src/components/microservices/code-queue/src/queue-api.ts b/src/components/microservices/code-queue/src/queue-api.ts index 82a99661..442c6ed9 100644 --- a/src/components/microservices/code-queue/src/queue-api.ts +++ 
b/src/components/microservices/code-queue/src/queue-api.ts @@ -444,7 +444,8 @@ async function loadAllTasksForRead(): Promise { const tasks = await ctx().loadTasksFromDatabase("all"); const byId = new Map(tasks.map((task) => [task.id, task])); for (const active of ctx().tasks()) { - byId.set(active.id, active); + const databaseTask = byId.get(active.id); + if (databaseTask === undefined || preferHotTaskForRead(active, databaseTask)) byId.set(active.id, active); } ctx().runGarbageCollection(); return Array.from(byId.values()).sort((left, right) => (timestampMs(left.createdAt) ?? 0) - (timestampMs(right.createdAt) ?? 0) || left.id.localeCompare(right.id)); @@ -581,6 +582,30 @@ function activePriority(task: QueueTask): number { return statusRank[task.status] ?? 9; } +function taskHasClaimMarkers(task: QueueTask): boolean { + return task.status === "running" + || task.status === "judging" + || task.startedAt !== null + || task.currentAttempt > 0 + || task.codexThreadId !== null + || task.activeTurnId !== null; +} + +function taskIsUnclaimedQueued(task: QueueTask): boolean { + return (task.status === "queued" || task.status === "retry_wait") + && task.startedAt === null + && task.currentAttempt === 0 + && task.codexThreadId === null + && task.activeTurnId === null; +} + +function preferHotTaskForRead(hotTask: QueueTask, databaseTask: QueueTask): boolean { + const hotActiveRun = Array.from(ctx().activeRuns.values()).some((run) => run.taskId === hotTask.id); + if (hotActiveRun) return true; + if (taskIsUnclaimedQueued(hotTask) && taskHasClaimMarkers(databaseTask)) return false; + return taskUpdatedSortValue(hotTask) >= taskUpdatedSortValue(databaseTask); +} + function taskUpdatedSortValue(task: QueueTask): number { const time = Date.parse(task.updatedAt || task.createdAt); return Number.isFinite(time) ? 
time : 0; @@ -1051,7 +1076,8 @@ async function databaseTasksOverviewResponse(url: URL): Promise const byId = new Map(); for (const task of loadedTasks) byId.set(task.id, task); for (const task of ctx().tasks()) { - if (seenIds.has(task.id) || byId.has(task.id)) byId.set(task.id, task); + const databaseTask = byId.get(task.id); + if ((seenIds.has(task.id) || databaseTask !== undefined) && (databaseTask === undefined || preferHotTaskForRead(task, databaseTask))) byId.set(task.id, task); } const rowsSource = orderedIds .map((id) => byId.get(id) ?? null) @@ -1063,8 +1089,13 @@ async function databaseTasksOverviewResponse(url: URL): Promise let selectedTask: QueueTask | null = null; for (const id of selectedCandidates) { if (id.length === 0) continue; - const candidate = ctx().tasks().find((task) => task.id === id) - ?? await ctx().loadTaskFromDatabase(id); + const hotCandidate = ctx().tasks().find((task) => task.id === id) ?? null; + const databaseCandidate = await ctx().loadTaskFromDatabase(id); + const candidate = hotCandidate === null + ? databaseCandidate + : databaseCandidate === null || preferHotTaskForRead(hotCandidate, databaseCandidate) + ? 
hotCandidate + : databaseCandidate; if (candidate !== null && taskMatchesQueueFilter(candidate, queueId)) { selectedTask = candidate; break; diff --git a/src/components/microservices/code-queue/src/self-tests.ts b/src/components/microservices/code-queue/src/self-tests.ts index 76c9e44e..82d40cc9 100644 --- a/src/components/microservices/code-queue/src/self-tests.ts +++ b/src/components/microservices/code-queue/src/self-tests.ts @@ -17,6 +17,7 @@ export interface SelfTestsContext { defaultQueueId: string; enqueueActiveRunSlotWaiter: (task: QueueTask) => ActiveRunSlotWaiter; injectReferencedTaskContext: (request: QueueTaskRequest, finder?: (id: string) => QueueTask | null | Promise, injectedAt?: string) => Promise; + moveTaskToQueueForTest: (task: QueueTask, req: Request) => Promise; nextRunnableTaskFrom: (queueId: string, tasks?: QueueTask[]) => QueueTask | null; normalizeTask: (task: QueueTask) => QueueTask; nowIso: () => string; @@ -25,6 +26,8 @@ export interface SelfTestsContext { queuedStatusReason: (task: QueueTask, tasks?: QueueTask[]) => QueuedStatusReason | null; removeActiveRunSlotWaiter: (waiter: ActiveRunSlotWaiter) => void; resolveReasoningEffort: (model: string, explicit?: string | null) => string | null; + runDatabaseClaimMoveSelfTest?: () => Promise; + tasks: () => QueueTask[]; updateProcessingFlag: () => void; } @@ -192,6 +195,45 @@ function queueOrderTestTask(id: string, status: TaskStatus, createdAt: string, q return ctx().normalizeTask(task); } +async function runQueueClaimMoveSelfTest(): Promise { + const at = "2026-05-17T06:09:46.702Z"; + const task = queueOrderTestTask("codex_claim_move_self_test", "running", at, at); + task.queueId = "claim_move_source"; + task.startedAt = at; + task.currentAttempt = 1; + task.currentMode = "initial"; + task.codexThreadId = "thread_claim_move_self_test"; + task.activeTurnId = "turn_claim_move_self_test"; + task.updatedAt = at; + const before = ctx().tasks().slice(); + ctx().tasks().push(task); + try { + const 
response = await ctx().moveTaskToQueueForTest(task, new Request("http://code-queue.local/api/tasks/codex_claim_move_self_test/move", { + method: "POST", + body: JSON.stringify({ queueId: "claim_move_target" }), + headers: { "content-type": "application/json" }, + })); + const body = await response.json() as Record; + assertReferenceTest(response.status === 409, "moving a claimed/running task must return 409"); + assertReferenceTest(task.queueId === "claim_move_source", "running task queueId must remain unchanged after rejected move"); + assertReferenceTest(task.status === "running", "running task status must remain running after rejected move"); + assertReferenceTest(task.currentAttempt === 1, "running task currentAttempt must remain claimed after rejected move"); + const databaseRace = await ctx().runDatabaseClaimMoveSelfTest?.() ?? null; + return { + ok: true, + cases: [ + { name: "move_running_task_returns_409", ok: true, status: response.status }, + { name: "rejected_move_preserves_queue", ok: true, queueId: task.queueId }, + { name: "rejected_move_preserves_claim_markers", ok: true, status: task.status, currentAttempt: task.currentAttempt, startedAt: task.startedAt }, + ...(databaseRace === null ? 
[] : [{ name: "database_claim_blocks_stale_move", ok: true, result: databaseRace }]), + ], + response: body as JsonValue, + }; + } finally { + ctx().tasks().splice(0, ctx().tasks().length, ...before); + } +} + function runQueueOrderingSelfTest(): JsonValue { const activeRetry = queueOrderTestTask("codex_4000_active", "retry_wait", "2026-05-11T09:00:00.000Z", "2026-05-11T09:00:00.000Z"); const movedOlderCreated = queueOrderTestTask("codex_3999_moved", "queued", "2026-05-11T08:00:00.000Z", "2026-05-11T08:00:00.000Z") as QueueTask & { queueEnteredAt?: string }; @@ -476,4 +518,4 @@ function runJudgeInfraSelfTest(): JsonValue { }; } -export { runJudgeInfraSelfTest, runQueueOrderingSelfTest, runReferenceInjectionSelfTest, runTracePortSelfTest }; +export { runJudgeInfraSelfTest, runQueueClaimMoveSelfTest, runQueueOrderingSelfTest, runReferenceInjectionSelfTest, runTracePortSelfTest }; From c8e291f5fdcc60cc3077a8645e585cbbdac438eb Mon Sep 17 00:00:00 2001 From: Codex Date: Sun, 17 May 2026 06:57:53 +0000 Subject: [PATCH 11/15] test: exercise code queue claim move race --- src/components/microservices/code-queue/src/index.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/components/microservices/code-queue/src/index.ts b/src/components/microservices/code-queue/src/index.ts index 571186ba..c270a997 100644 --- a/src/components/microservices/code-queue/src/index.ts +++ b/src/components/microservices/code-queue/src/index.ts @@ -1573,7 +1573,7 @@ async function runDatabaseClaimMoveSelfTest(): Promise { method: "POST", body: JSON.stringify({ queueId: targetQueueId }), headers: { "content-type": "application/json" }, - })); + }), { bypassRoleCheck: true }); const after = await loadTaskFromDatabase(taskId); const body = await response.json() as Record; if (response.status !== 409) throw new Error(`database stale move should return 409, got ${response.status}`); From 57402f28c07d54c5e2a649305d5b8d4b08c3e844 Mon Sep 17 00:00:00 2001 From: Codex Date: Sun, 17 
May 2026 08:07:32 +0000 Subject: [PATCH 12/15] fix(cli): bound diagnostics and add swap management --- AGENTS.md | 7 +- TEST.md | 6 +- docs/reference/cli.md | 11 +- docs/reference/deployment.md | 8 + docs/reference/observability.md | 10 +- scripts/cli.ts | 25 ++- scripts/src/command.ts | 26 ++- scripts/src/docker.ts | 36 +++- scripts/src/jobs.ts | 70 +++++++- scripts/src/microservices.ts | 87 ++++++++- scripts/src/output.ts | 19 +- scripts/src/remote.ts | 60 ++++++- scripts/src/swap.ts | 303 ++++++++++++++++++++++++++++++++ 13 files changed, 618 insertions(+), 50 deletions(-) create mode 100644 scripts/src/swap.ts diff --git a/AGENTS.md b/AGENTS.md index 17c64a48..2f1c856a 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -27,8 +27,9 @@ UniDesk 是一个以主 server 为统一入口的分布式工作平台;本文 - `bun scripts/cli.ts config show`:校验并展示根目录 `config.json`,配置来源规则见 `docs/reference/config.md`。 - `bun scripts/cli.ts check`:运行配置、TypeScript、文件存在性和 Docker Compose 配置检查,测试入口见 `TEST.md`。 - `bun scripts/cli.ts server start`:以异步 job 启动 database、backend-core、frontend、provider-gateway 和主 server 用户服务,部署规则见 `docs/reference/deployment.md`。 -- `bun scripts/cli.ts server status`:查询固定端口、容器状态、健康检查和访问 URL,判定标准见 `docs/reference/deployment.md`。 -- `bun scripts/cli.ts server logs`:分页返回文件日志与 Docker 日志尾部,日志规则见 `docs/reference/observability.md`。 +- `bun scripts/cli.ts server status`:查询固定端口、swap 摘要、容器状态、健康检查和访问 URL,判定标准见 `docs/reference/deployment.md`。 +- `bun scripts/cli.ts server swap status|ensure [--path /swapfile] [--size 2GiB] [--dry-run]`:以 JSON 查看或幂等创建主 server swapfile,`ensure` 输出 before/after、动作、持久化状态和 degraded/failed 详情,规则见 `docs/reference/deployment.md`。 +- `bun scripts/cli.ts server logs [--tail-bytes N]`:只返回文件日志与 Docker 日志尾部并带截断元数据,日志规则见 `docs/reference/observability.md`。 - `bun scripts/cli.ts server rebuild `:以 build-first、Compose lock、no-deps force-recreate 和 post-up validation 的异步 job 重建主 server Compose 内单个服务;Code Queue 部署在 D601,规则见 `docs/reference/deployment.md`。 - `bun scripts/cli.ts provider attach 
[--master-server URL] [--up] [--force]`:在新增计算节点上生成两项配置的 provider-gateway 挂载包;默认只需要主 server URL(默认 `http://74.48.78.17/`)和唯一 Provider ID,生成的 Compose 固定 Docker socket、`pid: "host"`、`restart: always`、只读 `/workspace`、SSH 维护私钥挂载和 loopback egress proxy 端口,规则见 `docs/reference/provider-gateway.md`。 - `bun scripts/cli.ts ssh [ssh-like args...]`:通过 provider-gateway 的 Host SSH / WSL SSH 维护桥打开近似原生 ssh 的交互会话或远端命令,并在远端 PATH 注入 `apply_patch`、`glob` 与 `skill-discover`;`apply-patch`、`py`、`skills`、结构化 `find`、`glob` 和 `argv` 子命令用于避免远端补丁、Python stdin、skill 发现与常用只读命令的嵌套转义问题,使用规则见 `docs/reference/cli.md` 和 `docs/reference/provider-gateway.md`。 @@ -40,7 +41,7 @@ UniDesk 是一个以主 server 为统一入口的分布式工作平台;本文 - `bun scripts/cli.ts codex task `:按 Code Queue 任务 ID 查询初始 prompt、最后 assistant message、工具调用摘要、attempt/judge/error 和耗时,便于新任务引用历史 session。 - `bun scripts/cli.ts codex judge --attempt [--dry-run]`:按指定 task/attempt 用与队列 worker 相同的上下文构建和 MiniMax judge 调用路径单步复现完成判定;`--dry-run` 只输出 prompt/payload 诊断。 - `bun scripts/cli.ts server stop`:以异步 job 停止固定 Compose 项目中的全部 UniDesk 服务,停止后用 `server status` 复核。 -- `bun scripts/cli.ts job list` / `bun scripts/cli.ts job status latest`:查询 `.state/jobs/` 中的异步任务状态,job 机制见 `docs/reference/cli.md`。 +- `bun scripts/cli.ts job list [--limit N]` / `bun scripts/cli.ts job status latest [--tail-bytes N]`:分页查询 `.state/jobs/` 中的异步任务状态,状态输出只读日志尾部并保留完整日志路径,job 机制见 `docs/reference/cli.md`。 - `bun scripts/cli.ts debug health` / `bun scripts/cli.ts debug dispatch` / `bun scripts/cli.ts debug task`:通过 Docker 内网 core、真实 HTTP、WebSocket、系统指标、Docker 状态和 Host SSH 维护桥流程调试健康检查、任务下发与任务结果,调试规则见 `docs/reference/cli.md`。 - `bun scripts/cli.ts e2e run [--only pattern[,pattern...]] [--skip pattern[,pattern...]]`:支持按 check/prefix/wildcard 选择性执行公网 frontend/provider ingress、内网 core/database、provider-gateway 自接入与 Playwright 验证;日常迭代先跑当前问题对应的最小检查集,最终交付再跑全量回归,验收规则见 `docs/reference/e2e.md`。 diff --git a/TEST.md b/TEST.md index 3e0d091d..cb4ef064 100644 --- a/TEST.md +++ b/TEST.md @@ -2,7 +2,7 @@ ## T1 
CLI 可观测性与配置校验 -阅读 `AGENTS.md`(本项目 `AGENTS.md` 同时承担 `SKILL.md` 对 `scripts/cli.ts` 的解释职责),然后用 cli 手动测试以下内容:运行 `bun scripts/cli.ts help`、`bun scripts/cli.ts config show`、`bun scripts/cli.ts check`,确认每条命令都有 JSON 输出、失败时包含错误对象、`config.json` 是唯一配置来源,且 TypeScript 检查覆盖 `scripts/` 与 `src/components/`。 +阅读 `AGENTS.md`(本项目 `AGENTS.md` 同时承担 `SKILL.md` 对 `scripts/cli.ts` 的解释职责),然后用 cli 手动测试以下内容:运行 `bun scripts/cli.ts help`、`bun scripts/cli.ts config show`、`bun scripts/cli.ts check`,确认每条命令都有 JSON 输出、失败时包含错误对象、`config.json` 是唯一配置来源,且 TypeScript 检查覆盖 `scripts/` 与 `src/components/`;运行 `set -o pipefail; bun scripts/cli.ts server status | head -1`,确认下游 pipe 关闭时不会打印 Bun EPIPE stack trace。 ## T2 Docker 栈异步启动 @@ -10,7 +10,7 @@ ## T3 主 server 自接入 Provider Gateway -阅读 `AGENTS.md`(本项目 `AGENTS.md` 同时承担 `SKILL.md` 对 `scripts/cli.ts` 的解释职责),然后用 cli 手动测试以下内容:运行 `bun scripts/cli.ts server status` 和 `bun scripts/cli.ts debug health`,确认面向浏览器的公网入口只有 frontend 与 provider ingress,backend-core 显示为 Docker 内部端口,database/OA Event Flow 若因 D601 Code Queue 映射宿主端口也必须显示为受限宿主端口,且 `network.restrictedHostAccess.allowedSourceCidrs` 已生成来源限制,`/api/nodes` 中存在 `main-server` provider,状态为 `online`,`/api/nodes/system-status` 中存在 CPU/内存/硬盘采样,`/api/nodes/docker-status` 中存在 `main-server` 的 Docker 快照,且 provider 标签中能看到 Docker socket 可用性。 +阅读 `AGENTS.md`(本项目 `AGENTS.md` 同时承担 `SKILL.md` 对 `scripts/cli.ts` 的解释职责),然后用 cli 手动测试以下内容:运行 `bun scripts/cli.ts server status`、`bun scripts/cli.ts server swap status` 和 `bun scripts/cli.ts debug health`,确认 `server status` 包含 `swap` 摘要,`server swap status` 快速返回 total memory、active swaps、`/etc/fstab` 持久化状态和 warning;面向浏览器的公网入口只有 frontend 与 provider ingress,backend-core 显示为 Docker 内部端口,database/OA Event Flow 若因 D601 Code Queue 映射宿主端口也必须显示为受限宿主端口,且 `network.restrictedHostAccess.allowedSourceCidrs` 已生成来源限制,`/api/nodes` 中存在 `main-server` provider,状态为 `online`,`/api/nodes/system-status` 中存在 CPU/内存/硬盘采样,`/api/nodes/docker-status` 中存在 `main-server` 的 Docker 快照,且 provider 标签中能看到 Docker socket 可用性。若 
`swap.warning` 非空,先运行 `bun scripts/cli.ts server swap ensure --dry-run` 审查动作,再谨慎执行 `bun scripts/cli.ts server swap ensure --size 2GiB`,确认输出包含 `before`/`after`、`actions`、`errors` 和 `status=ok|degraded`;已有 swap 时 ensure 必须 no-op。 ## T4 前端控制台连通 @@ -22,7 +22,7 @@ ## T6 日志第一现场验证 -阅读 `AGENTS.md`(本项目 `AGENTS.md` 同时承担 `SKILL.md` 对 `scripts/cli.ts` 的解释职责),然后用 cli 手动测试以下内容:运行 `bun scripts/cli.ts server logs --tail-bytes 20000`,实际读取输出中列出的 `logs/{YYYYMMDD}/` 文件,确认 backend-core、frontend、provider-gateway、database 都有实时日志;backend-core 与 Code Queue/Codex app-server 日志必须按 `logs/{YYYYMMDD}/{startStamp}_{YYYYMMDD}_{HH}_{service}.jsonl` 小时切片,默认日志族总量不得超过 `1GiB`,超过后会删除最旧切片;日志不得只有启动行,错误日志必须包含可定位的错误消息或 stack。 +阅读 `AGENTS.md`(本项目 `AGENTS.md` 同时承担 `SKILL.md` 对 `scripts/cli.ts` 的解释职责),然后用 cli 手动测试以下内容:运行 `bun scripts/cli.ts server logs --tail-bytes 20000`,确认输出包含 `policy`、每个日志文件的 `sizeBytes/tailBytes/truncated` 和 Docker logs 的 tail 元数据,实际读取输出中列出的 `logs/{YYYYMMDD}/` 文件,确认 backend-core、frontend、provider-gateway、database 都有实时日志;运行 `bun scripts/cli.ts job list --limit 5` 和 `bun scripts/cli.ts job status latest --tail-bytes 20000`,确认 job 列表分页、状态输出只含 stdout/stderr 尾部且保留完整日志路径;backend-core 与 Code Queue/Codex app-server 日志必须按 `logs/{YYYYMMDD}/{startStamp}_{YYYYMMDD}_{HH}_{service}.jsonl` 小时切片,默认日志族总量不得超过 `1GiB`,超过后会删除最旧切片;日志不得只有启动行,错误日志必须包含可定位的错误消息或 stack。 ## T7 停止与端口释放 diff --git a/docs/reference/cli.md b/docs/reference/cli.md index 0165f35c..f2d3364c 100644 --- a/docs/reference/cli.md +++ b/docs/reference/cli.md @@ -10,8 +10,9 @@ UniDesk 的统一 CLI 入口是根目录 `scripts/cli.ts`,运行方式固定 - `check` 执行配置校验、文件存在性检查、`scripts/` TypeScript 检查、`src/components/` TypeScript 检查和 Docker Compose 配置检查。 - `server start` 创建异步 job,在后台执行 Docker 构建和启动;命令本身只负责返回 job id、日志路径和启动命令。 - `server stop` 创建异步 job,在后台停止固定 Compose project 中的全部 UniDesk 服务。 -- `server status` 查询公开端口、受限宿主端口、内部端口、Compose 容器、core/frontend/provider/database 健康检查和访问 URL;D601 Code Queue 使用的 PostgreSQL/OA Event Flow host mapping 必须出现在受限宿主端口而不是无条件公开入口中。 -- `server 
logs` 返回 `logs/` 文件日志和 Docker 容器日志的尾部,默认限制输出大小,避免日志爆炸。 +- `server status` 查询公开端口、受限宿主端口、内部端口、主机 swap 摘要、Compose 容器、core/frontend/provider/database 健康检查和访问 URL;D601 Code Queue 使用的 PostgreSQL/OA Event Flow host mapping 必须出现在受限宿主端口而不是无条件公开入口中。低内存主 server 上 `swap.warning` 非空时,先执行 `server swap status` 或 `server swap ensure`。 +- `server swap status|ensure [--path /swapfile] [--size 2GiB] [--dry-run]` 是主 server swap 管理入口。`status` 仅读 `/proc/meminfo`、`/proc/swaps` 和 `/etc/fstab` 并返回 JSON;`ensure` 在已有任何 active swap 时只报告 no-op,在无 active swap 时创建固定 swapfile、`chmod 600`、`mkswap`、`swapon` 并尽量写入 `/etc/fstab`。输出必须包含 `before`、`after`、total memory、active swap、持久化状态、关键动作和错误详情;若 swap 已启用但 fstab 写入失败,状态为 `degraded`,调用者需按返回的 detail 修复持久化。 +- `server logs` 返回 `logs/` 文件日志和 Docker 容器日志的尾部,默认限制输出大小,避免日志爆炸。实现必须只读取文件末尾字节,不得为了 tail 先把巨大日志完整读入 CLI 内存。 - `server rebuild ` 创建异步 job,先构建目标服务镜像,随后在 `.state/locks/server-compose.lock` 串行保护下用 `--no-deps --force-recreate` 替换目标 service 并等待容器 `healthy/running`;该命令用于替代手工删除容器的兜底流程,其中 `todo-note`、`project-manager`、`baidu-netdisk` 和 `oa-event-flow` 只重建主 server 承载的对应后端,不会重建或删除 database 命名卷。Code Queue 部署在 D601,不再由 `server rebuild` 管理。 - `provider attach [--master-server URL] [--up] [--force]` 在新计算节点生成两项配置的 provider-gateway 挂载包:`.state/provider-.env` 默认只包含 `UNIDESK_MASTER_SERVER` 与 `PROVIDER_ID`,`provider-.yml` 固定 Docker socket、`pid: "host"`、`restart: always`、只读 `/workspace` 和 SSH 维护私钥挂载;`--up` 会立即执行生成的 `docker compose up -d --build`。 - `ssh [ssh-like args...]` 通过 backend-core 内网 WebSocket broker 和 provider-gateway 的 Host SSH / WSL SSH 维护桥连接目标节点;无后续参数时进入远端登录 shell,有后续参数时按 ssh 远端命令体验执行并返回远端 exit code。 @@ -27,7 +28,7 @@ UniDesk 的统一 CLI 入口是根目录 `scripts/cli.ts`,运行方式固定 - `codex output --tail|--from-start|--after-seq N|--before-seq N --limit N [--full-text]` 按原始 output seq 分页读取底层记录;当 trace 行提示 `commandOmittedLines`、`bodyOmittedLines` 或 `rawSeqs` 时,用该命令按 seq 补取完整信息,默认仍有单条文本预览上限,显式 `--full-text` 才返回该页全文。 - `codex judge --attempt N [--dry-run] [--include-prompt]` 通过 
Code Queue 私有代理按指定 attempt 单步复现 judge;后端会从 PostgreSQL task JSON 与 output 归档重建该 attempt 在真实队列 worker 中的 `QueueTask`/`CodexRunResult`,再调用同一套 judge prompt builder 和 MiniMax 请求路径。默认会真实调用 MiniMax,`--dry-run` 只返回 prompt/payload 大小、attempt 窗口和重建来源诊断,`--include-prompt` 仅用于本地深度排查。 - Code Queue 多队列 lane 由 `codex` 命令命名空间管理:`queues` 列表、`queue create ` 创建、`queue merge --into ` 合并、`move --queue ` 迁移;同一个 queue 内部串行执行,不同 queue 之间并行执行。迁移只允许尚未被 scheduler claim 的 `queued`/`retry_wait` 任务,必须满足 `startedAt=null`、`currentAttempt=0` 且没有 active thread/turn;已进入 `running`/`judging` 或已有 claim 标记的任务返回 409,不得被 move/merge 回写成 queued。合并会移动可迁移任务归属并自动删除源 queue 记录,只保留合并后的目标 queue;若 source 或 target queue 存在 active/claimed 任务,合并整体返回 409。合并后的目标 queue 按任务原 `queueEnteredAt`/`createdAt` 时间顺序串行,成功迁移 queued/retry_wait 任务后会立即调度目标 queue。 -- `job list` 与 `job status` 查询 `.state/jobs/` 文件系统状态,是异步命令的可观测入口。 +- `job list [--limit N] [--include-command]` 与 `job status [--tail-bytes N]` 查询 `.state/jobs/` 文件系统状态,是异步命令的可观测入口。`job list` 默认只返回最新 50 条摘要;`job status` 默认只返回 stdout/stderr 末尾 12000 字节,并带 `tailPolicy` 与完整日志路径。 - `debug health`、`debug dispatch` 与 `debug task` 走真实内部 core、WebSocket、数据库、provider、系统指标、Docker 状态和 Host SSH 维护桥流程,只用于开发调试,不写入 `TEST.md` 的正式验收步骤。 - `e2e run [--only pattern[,pattern...]] [--skip pattern[,pattern...]]` 使用 publicHost 派生的公开 frontend/provider ingress URL,并通过 Docker 内网验证 core API、PostgreSQL、provider self-connection、系统指标曲线、Docker 状态快照、provider.upgrade 预检和 Playwright 前端页面,是交付前的自动化 E2E 门禁;CLI 默认输出 check 状态摘要,完整诊断写入 `resultPath`,日常迭代应优先用 `--only` / `--skip` 跑最小必要集合。 @@ -43,7 +44,9 @@ UniDesk 的统一 CLI 入口是根目录 `scripts/cli.ts`,运行方式固定 每条命令的最外层 JSON 包含 `ok`、`command` 和 `data` 或 `error`。失败时 CLI 设置非零退出码,但仍然输出 JSON 错误对象;错误对象应包含 `name`、`message` 和可用的 `stack`。 -`microservice proxy` 是面向人工验证的私有后端读取入口。正式写入型用户服务操作由 frontend 同源代理或 E2E 直接调用 backend-core 完成,并由 config 中的 `allowedMethods` 限制;CLI `proxy` 默认仍作为 GET/HEAD 读取验证入口,必要时可显式加 `--method POST|PUT|PATCH|DELETE` 调用无需自定义请求体的受控调试/自测端点,例如 `bun scripts/cli.ts 
microservice proxy baidu-netdisk /api/self-test --method POST --raw`。为了避免 Pipeline snapshot 这类超大业务 JSON 造成 CLI 输出爆炸,响应 body 超过默认阈值时会返回 `bodyOmitted=true`、`bodyPreview`、`bodyBytes` 和 `rawHint`;需要完整 body 时显式添加 `--raw`,或用 `--max-body-bytes ` 调整预览阈值。正式 frontend 展示仍应优先使用业务控件和 `__unideskArrayLimit` 这类展示级裁剪参数,而不是默认倾倒完整 JSON。 +诊断命令默认采用渐进披露:`server logs`、`job list/status`、`codex task/trace/output` 和 `microservice proxy` 都必须有默认条数、字节数或文本预览上限;用户显式传 `--limit`、`--tail-bytes`、`--full-text` 或 `--full` 才扩大单次输出。CLI stdout 遇到下游 pipe 关闭的 `EPIPE` 必须安静退出,不得打印 Bun stack trace。 + +`microservice proxy` 是面向人工验证的私有后端读取入口。正式写入型用户服务操作由 frontend 同源代理或 E2E 直接调用 backend-core 完成,并由 config 中的 `allowedMethods` 限制;CLI `proxy` 默认仍作为 GET/HEAD 读取验证入口,必要时可显式加 `--method POST|PUT|PATCH|DELETE` 调用无需自定义请求体的受控调试/自测端点,例如 `bun scripts/cli.ts microservice proxy baidu-netdisk /api/self-test --method POST --raw`。为了避免 Pipeline snapshot 这类超大业务 JSON 造成 CLI 输出爆炸,响应 body 超过默认阈值时会返回 `bodyOmitted=true`、`bodyPreview`、`bodyBytes` 和 `rawHint`;`--raw` 仍受默认硬限额保护,需要完整 body 时显式添加 `--raw --full`,或用 `--max-body-bytes ` 调整预览阈值。正式 frontend 展示仍应优先使用业务控件和 `__unideskArrayLimit` 这类展示级裁剪参数,而不是默认倾倒完整 JSON。 `network perf` 用于生成组网性能前后对比数据。标准 Code Queue overview 读路径基准命令是 `bun scripts/cli.ts network perf --service code-queue --path /api/tasks/overview?limit=30 --count 30 --concurrency 1 --label before`,远程主 server 可用 `bun scripts/cli.ts --main-server-ip 74.48.78.17 network perf ...`。输出包含成功/失败数、状态码分布、`x-unidesk-cache`、`x-unidesk-proxy-mode`、`x-unidesk-upstream-proxy-mode` 分布和 min/p50/p90/p95/max;provider-gateway 长连接数据面验收应看到 `proxyModeCounts.provider-ws-http-tunnel`,adapter native Service 数据面验收应看到 upstream proxy mode 为 `kubernetes-native-service`,若出现 `kubernetes-api-service-proxy` 必须结合 `/api/control-plane.nativeServiceProxy.failedServices` 解释 fallback 原因。 diff --git a/docs/reference/deployment.md b/docs/reference/deployment.md index 5f50a2c8..04ad3f0c 100644 --- a/docs/reference/deployment.md +++ b/docs/reference/deployment.md @@ -28,6 +28,14 @@ 
Compose v2 安装后仍然必须遵守 UniDesk 的服务控制入口:全栈生 版本化用户服务部署优先使用 `bun scripts/cli.ts deploy apply`。`deploy.json` 只声明服务 `id`、`repo` 和 `commitId`;目标节点、Dockerfile、Compose、Kubernetes manifest、健康检查和代理路径继续来自 `config.json` 与现有 manifest。部署必须遵循 target-side build:服务部署到哪台 target,就在哪台 target 从 remote commit 导出源码、一次性代理构建镜像并部署;不得把中心构建镜像作为默认分发路径,也不得用 `docker commit` 或脏 worktree 作为部署输入。完整规则见 `docs/reference/deploy.md`。 +## Main Server Swap + +主 server 可能运行在约 2 GiB 内存的小规格机器上,短时 Docker build、Codex/control-plane 调查和日志读取会触发 global OOM。主 server 必须通过 `bun scripts/cli.ts server swap status` 暴露当前 memory/swap 状态,并在 `server status` 的 `swap` 字段中给出同一摘要。 + +缺少 active swap 时,正式修复入口是 `bun scripts/cli.ts server swap ensure [--path /swapfile] [--size 2GiB]`。该命令必须幂等:已有任何 active swap 时只返回 no-op 状态;无 swap 时创建固定 swapfile、设置 `0600`、执行 `mkswap` 与 `swapon`,并尽量把 ` none swap sw 0 0` 写入 `/etc/fstab`。如果当前环境允许 `swapon` 但不允许写 `/etc/fstab`,命令返回 `status=degraded`,并在 JSON 的 `errors`/`actions` 中说明下一步;不得静默假装持久化完成。 + +swap 管理不能被强塞进所有热路径。`server start/status` 可以暴露 warning 或摘要,但不会自动创建 swap;需要变更主机 swap 时必须显式运行 `server swap ensure`,并用返回的 `before`/`after` 和 `fstab.persisted` 作为验收记录。 + ## Start And Stop `bun scripts/cli.ts server start` 与 `bun scripts/cli.ts server stop` 都是异步 job。启动 job 只执行固定 Compose project 的 `up -d --build --remove-orphans`,不得先 `down`,避免在 provider-gateway 旧容器或网络冲突时把长驻控制面容器先删掉又启动失败;停止 job 才允许执行 `down --remove-orphans`。启动和停止流程都禁止删除 Docker named volume。所有会改变主 server Compose 状态的 job 必须通过 `.state/locks/server-compose.lock` 串行化;连续 `server rebuild` 命令只代表连续创建异步 job,不能代表第一个 job 已结束,实际容器变更仍必须由 Compose lock 串行执行。 diff --git a/docs/reference/observability.md b/docs/reference/observability.md index a108f6a4..e4edd79f 100644 --- a/docs/reference/observability.md +++ b/docs/reference/observability.md @@ -4,7 +4,7 @@ UniDesk 的可观测性优先级高于静默成功。CLI、服务日志、Docker ## CLI Logs -异步 job 的 stdout 和 stderr 位于 `.state/jobs/`。`job status` 会返回有限尾部,避免输出爆炸,同时保留完整日志文件路径便于继续排查。 +异步 job 的 stdout 和 stderr 位于 `.state/jobs/`。`job list` 默认只返回最新 50 
条摘要;`job status` 会返回有限尾部,避免输出爆炸,同时保留完整日志文件路径便于继续排查。实现必须只读取日志尾部字节,不得先把完整 job 日志读入 CLI 内存。 ## Service Logs @@ -18,7 +18,13 @@ UniDesk 的可观测性优先级高于静默成功。CLI、服务日志、Docker ## Log Access -`bun scripts/cli.ts server logs` 同时读取文件日志和 Docker logs 尾部。文件日志是服务崩溃时的第一现场,Docker logs 是容器启动失败和 stdout/stderr 的辅助来源。 +`bun scripts/cli.ts server logs` 同时读取文件日志和 Docker logs 尾部。文件日志是服务崩溃时的第一现场,Docker logs 是容器启动失败和 stdout/stderr 的辅助来源。默认输出必须包含 tail 字节数、是否截断和完整文件路径;扩大读取范围只能通过显式 `--tail-bytes N`,且 CLI 会对单次 tail 设置硬上限。 + +## Diagnostic Output Limits + +所有诊断型 CLI 输出必须优先摘要化、尾部化或分页化,禁止默认倾倒大 JSON、全量日志、全量 trace 或 `.state`/`logs` 宽泛搜索结果。当前硬限额入口包括:`server logs` 默认 3000 bytes tail、`job list` 默认 50 条、`job status` 默认 12000 bytes tail、`codex task/trace/output` 默认分页与文本预览、`microservice proxy` 默认 body 预览且 `--raw` 仍受硬限额保护。确实需要完整响应时必须显式使用对应的 `--full`、`--full-text`、`--tail-bytes` 或 `--limit` 参数,并在验收记录中说明为什么需要扩大输出。 + +CLI 写 stdout/stderr 遇到下游 pipe 关闭的 `EPIPE` 必须安静退出,不能打印 Bun stack trace。常见验证命令是 `set -o pipefail; bun scripts/cli.ts server status | head -1`,应只看到第一行 JSON 而无额外错误噪声。 ## Task Liveness diff --git a/scripts/cli.ts b/scripts/cli.ts index 05441f06..3da35bc2 100644 --- a/scripts/cli.ts +++ b/scripts/cli.ts @@ -3,7 +3,7 @@ import { debugDispatch, debugHealth, debugTask, isDebugDispatchCommand, type Deb import { isRebuildableService, rebuildService, stackLogs, stackStatus, startStack, stopStack } from "./src/docker"; import { parseE2ERunOptions, runE2E } from "./src/e2e"; import { emitError, emitJson } from "./src/output"; -import { jobWithTail, listJobs, readJob, runJob } from "./src/jobs"; +import { jobWithTail, listJobs, listJobsSummary, readJob, runJob } from "./src/jobs"; import { runChecks } from "./src/check"; import { runSsh } from "./src/ssh"; import { extractRemoteCliOptions, runRemoteCli } from "./src/remote"; @@ -15,6 +15,7 @@ import { runProviderCommand } from "./src/provider-attach"; import { runScheduleCommand } from "./src/schedules"; import { parseNetworkPerfOptions, runNetworkPerf } from 
"./src/network-perf"; import { runCiCommand } from "./src/ci"; +import { runSwapCommand } from "./src/swap"; const remoteOptions = extractRemoteCliOptions(process.argv.slice(2)); const args = remoteOptions.args; @@ -32,6 +33,7 @@ function help(): unknown { { command: "server start", description: "Fire-and-forget build/start for database, backend-core, frontend, provider gateway, and managed main-server user services." }, { command: "server stop", description: "Fire-and-forget docker-compose down for the fixed UniDesk stack." }, { command: "server status", description: "Show fixed ports, containers, service health, and public URLs." }, + { command: "server swap status|ensure [--path /swapfile] [--size 2GiB] [--dry-run]", description: "Inspect or idempotently create host swap for low-memory main-server operation." }, { command: "server logs [--tail-bytes N]", description: "Return bounded tails from file logs and docker logs." }, { command: "server rebuild ", description: "Build first, then serialize, force-recreate, and validate one Compose service." }, { command: "provider attach [--master-server URL] [--up] [--force]", description: "Generate the minimal external provider-gateway env/compose bundle; only master server URL and provider id are required." }, @@ -47,7 +49,7 @@ function help(): unknown { { command: "microservice health ", description: "Probe one user service through backend-core -> provider-gateway HTTP proxy." }, { command: "microservice diagnostics ", description: "Split k3sctl-managed proxy health into provider-gateway, HTTP tunnel, adapter, Kubernetes API service proxy, and target Service checks." }, { command: "microservice tunnel-self-test ", description: "Trigger an expected provider HTTP tunnel failure and verify requestId/stage diagnostics are returned." 
}, - { command: "microservice proxy [--method GET|POST|PUT|PATCH|DELETE] [--raw] [--max-body-bytes N]", description: "Access a private user-service backend path through the same frontend-only proxy used by WebUI; large bodies are summarized unless --raw is set." }, + { command: "microservice proxy [--method GET|POST|PUT|PATCH|DELETE] [--raw] [--full] [--max-body-bytes N]", description: "Access a private user-service backend path through the same frontend-only proxy used by WebUI; large bodies are summarized unless --full is explicit." }, { command: "decision upload [--title text] [--type meeting|decision] [--level G0|G1|G2|G3|P0|P1|P2|P3|none] [--status active|blocked|parked|done] [--linked-goal-id id] [--evidence url]", description: "Upload a meeting note or decision record through backend-core -> decision-center user-service proxy." }, { command: "decision list [--type ...] [--status ...] [--level ...] [--linked-goal-id id] [--limit N]", description: "List Decision Center records through the user-service proxy." }, { command: "decision show ", description: "Show one Decision Center record." }, @@ -59,7 +61,7 @@ function help(): unknown { { command: "codex output [--tail|--from-start|--after-seq N|--before-seq N --limit N] [--full-text]", description: "Fetch paged raw Code Queue output records by seq when a trace row has omitted command/output text." }, { command: "codex judge --attempt N [--dry-run] [--include-prompt]", description: "Replay one stored Code Queue attempt through the same judge context builder and MiniMax judge call path used by the live queue worker." }, { command: "codex (queues | queue create | queue merge --into | move --queue )", description: "List/create/merge Code Queue lanes and move a queued task; merge preserves task queue time order and deletes the source queue record." }, - { command: "job list", description: "List async jobs from .state/jobs." 
}, + { command: "job list [--limit N] [--include-command]", description: "List async jobs from .state/jobs with a bounded default page." }, { command: "job status [--tail-bytes N]", description: "Show job state with bounded stdout/stderr tails." }, { command: "debug health", description: "Probe internal core, nodes, system/Docker status, frontend, provider ingress, and public boundary." }, { command: "debug dispatch [providerId] [docker.ps|provider.upgrade|host.ssh|microservice.http|echo] [--wait-ms N]", description: "Submit a real internal-core dispatch request for CLI debugging." }, @@ -80,6 +82,10 @@ function numberOption(name: string, defaultValue: number): number { return value; } +function boundedNumberOption(name: string, defaultValue: number, maxValue: number): number { + return Math.min(numberOption(name, defaultValue), maxValue); +} + function stringOption(name: string): string | undefined { const index = args.indexOf(name); if (index === -1) return undefined; @@ -172,8 +178,15 @@ async function main(): Promise { emitJson(commandName, await stackStatus(config)); return; } + if (sub === "swap") { + const result = runSwapCommand(args.slice(2)); + const ok = (result as { ok?: unknown }).ok !== false; + emitJson(commandName, result, ok); + if (!ok) process.exitCode = 1; + return; + } if (sub === "logs") { - emitJson(commandName, stackLogs(config, numberOption("--tail-bytes", 3000))); + emitJson(commandName, stackLogs(config, boundedNumberOption("--tail-bytes", 3000, 500_000))); return; } if (sub === "rebuild") { @@ -227,12 +240,12 @@ async function main(): Promise { if (top === "job") { if (sub === "list") { - emitJson(commandName, { jobs: listJobs() }); + emitJson(commandName, listJobsSummary({ limit: boundedNumberOption("--limit", 50, 500), includeCommand: args.includes("--include-command") })); return; } if (sub === "status") { const id = third === "latest" || third === undefined ? 
latestJobId() : third; - emitJson(commandName, { job: jobWithTail(readJob(id), numberOption("--tail-bytes", 12000)) }); + emitJson(commandName, { job: jobWithTail(readJob(id), boundedNumberOption("--tail-bytes", 12000, 500_000)) }); return; } } diff --git a/scripts/src/command.ts b/scripts/src/command.ts index 468992dd..5f7a2c1b 100644 --- a/scripts/src/command.ts +++ b/scripts/src/command.ts @@ -1,5 +1,5 @@ import { spawn, spawnSync } from "node:child_process"; -import { createWriteStream, existsSync, readFileSync } from "node:fs"; +import { closeSync, createWriteStream, existsSync, openSync, readSync, statSync } from "node:fs"; export interface CommandResult { command: string[]; @@ -7,20 +7,26 @@ export interface CommandResult { exitCode: number | null; stdout: string; stderr: string; + signal: NodeJS.Signals | null; + timedOut: boolean; } -export function runCommand(command: string[], cwd: string): CommandResult { +export function runCommand(command: string[], cwd: string, options: { timeoutMs?: number } = {}): CommandResult { const result = spawnSync(command[0], command.slice(1), { cwd, encoding: "utf8", maxBuffer: 1024 * 1024 * 8, + timeout: options.timeoutMs, }); + const error = result.error as (Error & { code?: string }) | undefined; return { command, cwd, exitCode: result.status, stdout: result.stdout ?? "", - stderr: result.stderr ?? result.error?.message ?? "", + stderr: result.stderr ?? error?.message ?? 
"", + signal: result.signal, + timedOut: error?.code === "ETIMEDOUT", }; } @@ -50,6 +56,16 @@ export async function runCommandToFiles(command: string[], cwd: string, stdoutFi export function tailFile(path: string, maxBytes = 8192): string { if (!existsSync(path)) return ""; - const content = readFileSync(path); - return content.subarray(Math.max(0, content.length - maxBytes)).toString("utf8"); + const safeMaxBytes = Math.max(0, Math.floor(maxBytes)); + if (safeMaxBytes === 0) return ""; + const size = statSync(path).size; + const bytesToRead = Math.min(size, safeMaxBytes); + const buffer = Buffer.alloc(bytesToRead); + const fd = openSync(path, "r"); + try { + readSync(fd, buffer, 0, bytesToRead, size - bytesToRead); + } finally { + closeSync(fd); + } + return buffer.toString("utf8"); } diff --git a/scripts/src/docker.ts b/scripts/src/docker.ts index 29631cf0..70bd7bd6 100644 --- a/scripts/src/docker.ts +++ b/scripts/src/docker.ts @@ -1,8 +1,9 @@ -import { chmodSync, existsSync, mkdirSync, readFileSync, readdirSync, writeFileSync } from "node:fs"; +import { chmodSync, existsSync, mkdirSync, readFileSync, readdirSync, statSync, writeFileSync } from "node:fs"; import { basename, dirname, join, resolve } from "node:path"; import { commandOk, runCommand, tailFile } from "./command"; import { type UniDeskConfig, repoRoot, rootPath } from "./config"; import { startJob } from "./jobs"; +import { swapStatus } from "./swap"; export interface ComposeRuntimeEnv { envFile: string; @@ -414,6 +415,7 @@ export async function stackStatus(config: UniDeskConfig): Promise { const overview = dockerExecJson("unidesk-backend-core", "fetch('http://127.0.0.1:8080/api/overview').then(r=>r.json()).then(j=>console.log(JSON.stringify({ok:true,status:200,body:j}))).catch(e=>{console.log(JSON.stringify({ok:false,error:String(e)}));process.exit(1)})"); return { runtimeEnv, + swap: swapStatus(), publicPorts: fixedPorts(config), blockedPublicPorts: [ { name: "backend-core-rest", port: 
config.network.core.port, listening: isPortListening(config.network.core.port), expected: "not-listening" }, @@ -478,11 +480,37 @@ export function stackLogs(config: UniDeskConfig, tailBytes: number): unknown { const allFiles = listLogFiles(logRoot); const currentFiles = allFiles.filter((path) => basename(path).startsWith(runtimeEnv.logPrefix)); const selectedFiles = (currentFiles.length > 0 ? currentFiles : allFiles.slice(-12)).slice(-12); - const files = selectedFiles.map((path) => ({ path, name: basename(path), tail: tailFile(path, tailBytes) })); + const files = selectedFiles.map((path) => { + const sizeBytes = existsSync(path) ? statSync(path).size : 0; + const truncated = sizeBytes > tailBytes; + return { path, name: basename(path), sizeBytes, tailBytes, truncated, tail: tailFile(path, tailBytes) }; + }); const containerNames = ["unidesk-database", "unidesk-backend-core", "unidesk-frontend", "unidesk-provider-gateway-main", "todo-note-backend", "project-manager-backend", "baidu-netdisk-backend", "oa-event-flow-backend"]; const docker = containerNames.map((name) => { const result = runCommand(["docker", "logs", "--tail", "40", name], repoRoot); - return { name, exitCode: result.exitCode, stdoutTail: result.stdout.slice(-tailBytes), stderrTail: result.stderr.slice(-tailBytes) }; + return { + name, + exitCode: result.exitCode, + tailBytes, + stdoutBytes: Buffer.byteLength(result.stdout, "utf8"), + stderrBytes: Buffer.byteLength(result.stderr, "utf8"), + stdoutTruncated: Buffer.byteLength(result.stdout, "utf8") > tailBytes, + stderrTruncated: Buffer.byteLength(result.stderr, "utf8") > tailBytes, + stdoutTail: result.stdout.slice(-tailBytes), + stderrTail: result.stderr.slice(-tailBytes), + }; }); - return { logRoot, runtimeEnv, files, docker }; + return { + logRoot, + runtimeEnv, + policy: { + defaultTailBytes: 3000, + requestedTailBytes: tailBytes, + selectedFileLimit: 12, + dockerTailLines: 40, + disclosure: "server logs returns tails only; increase with 
--tail-bytes for a larger bounded tail, and inspect listed paths directly for full logs.", + }, + files, + docker, + }; } diff --git a/scripts/src/jobs.ts b/scripts/src/jobs.ts index ed87cc73..101ffcf4 100644 --- a/scripts/src/jobs.ts +++ b/scripts/src/jobs.ts @@ -1,5 +1,5 @@ import { spawn, spawnSync } from "node:child_process"; -import { existsSync, mkdirSync, readFileSync, readdirSync, writeFileSync } from "node:fs"; +import { existsSync, mkdirSync, readFileSync, readdirSync, statSync, writeFileSync } from "node:fs"; import { join } from "node:path"; import { repoRoot, rootPath } from "./config"; import { runCommandToFiles, tailFile } from "./command"; @@ -141,6 +141,70 @@ export async function runJob(id: string): Promise { return job; } -export function jobWithTail(job: JobRecord, maxBytes = 12000): JobRecord & { stdoutTail: string; stderrTail: string } { - return { ...job, stdoutTail: tailFile(job.stdoutFile, maxBytes), stderrTail: tailFile(job.stderrFile, maxBytes) }; +export function jobWithTail(job: JobRecord, maxBytes = 12000): JobRecord & { + tailPolicy: { + requestedTailBytes: number; + stdoutBytes: number; + stderrBytes: number; + stdoutTruncated: boolean; + stderrTruncated: boolean; + fullLogPaths: { stdoutFile: string; stderrFile: string }; + }; + stdoutTail: string; + stderrTail: string; +} { + const stdoutBytes = existsSync(job.stdoutFile) ? statSync(job.stdoutFile).size : 0; + const stderrBytes = existsSync(job.stderrFile) ? 
statSync(job.stderrFile).size : 0; + return { + ...job, + tailPolicy: { + requestedTailBytes: maxBytes, + stdoutBytes, + stderrBytes, + stdoutTruncated: stdoutBytes > maxBytes, + stderrTruncated: stderrBytes > maxBytes, + fullLogPaths: { stdoutFile: job.stdoutFile, stderrFile: job.stderrFile }, + }, + stdoutTail: tailFile(job.stdoutFile, maxBytes), + stderrTail: tailFile(job.stderrFile, maxBytes), + }; +} + +export interface JobListOptions { + limit?: number; + includeCommand?: boolean; +} + +export function listJobsSummary(options: JobListOptions = {}): unknown { + const limit = Math.max(1, Math.floor(options.limit ?? 50)); + const jobs = listJobs(); + const returned = jobs.slice(0, limit).map((job) => ({ + id: job.id, + name: job.name, + status: job.status, + runner: job.runner, + runnerPid: job.runnerPid ?? null, + runnerContainer: job.runnerContainer ?? null, + createdAt: job.createdAt, + startedAt: job.startedAt, + finishedAt: job.finishedAt, + exitCode: job.exitCode, + note: job.note, + stdoutFile: job.stdoutFile, + stderrFile: job.stderrFile, + ...(options.includeCommand === true ? { command: job.command, cwd: job.cwd } : {}), + })); + return { + jobs: returned, + total: jobs.length, + returned: returned.length, + limit, + truncated: jobs.length > returned.length, + disclosure: { + defaultLimit: 50, + nextCommand: jobs.length > returned.length ? 
`bun scripts/cli.ts job list --limit ${Math.min(jobs.length, limit * 2)}` : null, + includeCommandCommand: "bun scripts/cli.ts job list --include-command", + statusCommand: "bun scripts/cli.ts job status --tail-bytes 12000", + }, + }; } diff --git a/scripts/src/microservices.ts b/scripts/src/microservices.ts index 24954060..e0932f27 100644 --- a/scripts/src/microservices.ts +++ b/scripts/src/microservices.ts @@ -2,18 +2,51 @@ import { runCommand } from "./command"; import { type UniDeskConfig, repoRoot } from "./config"; import { jsonByteLength, previewJson } from "./preview"; -export function coreInternalFetch(path: string, init?: { method?: string; body?: unknown }): unknown { +export function coreInternalFetch(path: string, init?: { method?: string; body?: unknown; maxResponseBytes?: number }): unknown { if (!path.startsWith("/")) throw new Error("core internal path must start with /"); + const maxResponseBytes = Math.max(1024, Math.floor(init?.maxResponseBytes ?? 5_000_000)); const code = ` const res = await fetch(${JSON.stringify(`http://127.0.0.1:8080${path}`)}, ${JSON.stringify({ method: init?.method ?? "GET", headers: init?.body === undefined ? undefined : { "content-type": "application/json" }, body: init?.body === undefined ? 
undefined : JSON.stringify(init.body), })}); - const text = await res.text(); + const maxResponseBytes = ${JSON.stringify(maxResponseBytes)}; + const reader = res.body?.getReader(); + const chunks = []; + let bytes = 0; + let responseTruncated = false; + if (reader) { + while (true) { + const { done, value } = await reader.read(); + if (done) break; + if (bytes + value.byteLength > maxResponseBytes) { + const keep = Math.max(0, maxResponseBytes - bytes); + if (keep > 0) { + chunks.push(value.slice(0, keep)); + bytes += keep; + } + responseTruncated = true; + try { await reader.cancel(); } catch {} + break; + } + chunks.push(value); + bytes += value.byteLength; + } + } + const buffer = new Uint8Array(bytes); + let offset = 0; + for (const chunk of chunks) { + buffer.set(chunk, offset); + offset += chunk.byteLength; + } + const text = new TextDecoder().decode(buffer); let body = null; - try { body = text ? JSON.parse(text) : null; } catch { body = { text }; } - console.log(JSON.stringify({ ok: res.ok, status: res.status, body })); + try { body = text && !responseTruncated ? 
JSON.parse(text) : null; } catch { body = { text }; } + if (responseTruncated) { + body = { _unideskResponseTruncated: true, maxResponseBytes, bytesRead: bytes, contentLength: res.headers.get("content-length"), textPreview: text }; + } + console.log(JSON.stringify({ ok: res.ok, status: res.status, responseTruncated, responseBytesRead: bytes, responseContentLength: res.headers.get("content-length"), body })); `; const result = runCommand(["docker", "exec", "unidesk-backend-core", "bun", "-e", code], repoRoot); if (result.exitCode !== 0) { @@ -50,6 +83,11 @@ function numberOption(args: string[], name: string, defaultValue: number): numbe return value; } +function cappedNumberOption(args: string[], name: string, defaultValue: number, maxValue: number): number { + const value = numberOption(args, name, defaultValue); + return Math.min(value, maxValue); +} + function stringOption(args: string[], name: string): string | undefined { const index = args.indexOf(name); if (index === -1) return undefined; @@ -58,6 +96,10 @@ function stringOption(args: string[], name: string): string | undefined { return raw; } +function hasFlag(args: string[], name: string): boolean { + return args.includes(name); +} + function methodOption(args: string[]): string { const method = (stringOption(args, "--method") ?? "GET").toUpperCase(); if (!["GET", "HEAD", "POST", "DELETE", "PUT", "PATCH"].includes(method)) throw new Error(`unsupported --method ${method}`); @@ -65,13 +107,34 @@ function methodOption(args: string[]): string { } export function summarizeMicroserviceProxyResponse(response: unknown, args: string[]): unknown { - if (args.includes("--raw")) return response; - const maxBodyBytes = numberOption(args, "--max-body-bytes", 60_000); + const full = args.includes("--full"); + const raw = args.includes("--raw"); + const maxBodyBytes = full ? numberOption(args, "--max-body-bytes", 5_000_000) : cappedNumberOption(args, "--max-body-bytes", raw ? 
120_000 : 60_000, 500_000); if (typeof response !== "object" || response === null || Array.isArray(response)) return response; const record = response as Record; if (!("body" in record)) return response; + if (record.responseTruncated === true) { + return { + ...record, + bodyOmitted: true, + bodyMaxBytes: maxBodyBytes, + rawHint: "The upstream response exceeded the CLI collection cap before JSON parsing; re-run with --raw --full and a specific --max-body-bytes only when the full body is required.", + }; + } const bodyBytes = jsonByteLength(record.body); - if (bodyBytes <= maxBodyBytes) return response; + if (bodyBytes <= maxBodyBytes) { + if (!raw || full) return response; + return { + ...record, + outputPolicy: { + rawRequested: true, + bounded: true, + maxBodyBytes, + bodyBytes, + fullCommand: "Re-run with --raw --full to allow the complete body.", + }, + }; + } const rest = { ...record }; delete rest.body; return { @@ -80,7 +143,9 @@ export function summarizeMicroserviceProxyResponse(response: unknown, args: stri bodyBytes, bodyMaxBytes: maxBodyBytes, bodyPreview: previewJson(record.body, { maxDepth: 3, maxArrayItems: 3, maxObjectKeys: 16, maxStringLength: 320 }), - rawHint: "Re-run with --raw for the full body, or add/tighten __unideskArrayLimit=: in the proxied path.", + rawHint: raw && !full + ? "The --raw response exceeded the default hard limit; re-run with --raw --full for the complete body, or add/tighten __unideskArrayLimit=: in the proxied path." 
+ : "Re-run with --raw --full for the complete body, or add/tighten __unideskArrayLimit=: in the proxied path.", }; } @@ -106,7 +171,11 @@ export async function runMicroserviceCommand(_config: UniDeskConfig, args: strin if (action === "proxy") { const id = requireId(idArg, "microservice proxy"); const path = requireProxyPath(pathArg); - return summarizeMicroserviceProxyResponse(coreInternalFetch(`/api/microservices/${encodeId(id)}/proxy${path}`, { method: methodOption(args) }), args); + const full = hasFlag(args, "--full"); + const raw = hasFlag(args, "--raw"); + const maxBodyBytes = full ? numberOption(args, "--max-body-bytes", 5_000_000) : cappedNumberOption(args, "--max-body-bytes", raw ? 120_000 : 60_000, 500_000); + const maxResponseBytes = full ? Math.min(Math.max(maxBodyBytes, 120_000), 5_000_000) : Math.min(Math.max(maxBodyBytes * 3, 240_000), 1_500_000); + return summarizeMicroserviceProxyResponse(coreInternalFetch(`/api/microservices/${encodeId(id)}/proxy${path}`, { method: methodOption(args), maxResponseBytes }), args); } throw new Error("microservice command must be one of: list, status, health, diagnostics, tunnel-self-test, proxy"); } diff --git a/scripts/src/output.ts b/scripts/src/output.ts index addfde5c..f3427535 100644 --- a/scripts/src/output.ts +++ b/scripts/src/output.ts @@ -5,6 +5,20 @@ export interface JsonEnvelope { error?: unknown; } +function isEpipe(error: unknown): boolean { + return typeof error === "object" && error !== null && "code" in error && (error as { code?: unknown }).code === "EPIPE"; +} + +process.stdout.on("error", (error) => { + if (isEpipe(error)) process.exit(0); + throw error; +}); + +process.stderr.on("error", (error) => { + if (isEpipe(error)) process.exit(0); + throw error; +}); + export function emitJson(command: string, data: T, ok = true): void { const envelope: JsonEnvelope = { ok, command, data }; safeStdoutWrite(`${JSON.stringify(envelope, null, 2)}\n`); @@ -22,10 +36,7 @@ function safeStdoutWrite(text: 
string): void { try { process.stdout.write(text); } catch (error) { - if (typeof error === "object" && error !== null && "code" in error && (error as { code?: unknown }).code === "EPIPE") { - process.exitCode = 0; - return; - } + if (isEpipe(error)) process.exit(0); throw error; } } diff --git a/scripts/src/remote.ts b/scripts/src/remote.ts index d09b52a1..bfccb177 100644 --- a/scripts/src/remote.ts +++ b/scripts/src/remote.ts @@ -27,6 +27,9 @@ interface FetchJsonResult { status?: number; body?: unknown; error?: string; + responseTruncated?: boolean; + responseBytesRead?: number; + responseContentLength?: string | null; } const hostOptions = new Set(["--main-server-ip", "--main-server", "--server"]); @@ -172,19 +175,54 @@ function frontendBaseUrl(host: string, config: UniDeskConfig): string { return `http://${host}:${config.network.frontend.port}`; } -async function readJson(url: string, init?: RequestInit, timeoutMs = 8000): Promise { +async function readJson(url: string, init?: RequestInit, timeoutMs = 8000, maxResponseBytes = 5_000_000): Promise { const controller = new AbortController(); const timer = setTimeout(() => controller.abort(), timeoutMs); try { const res = await fetch(url, { ...init, signal: controller.signal }); - const text = await res.text(); + const reader = res.body?.getReader(); + const chunks: Uint8Array[] = []; + let bytes = 0; + let responseTruncated = false; + if (reader !== undefined) { + while (true) { + const { done, value } = await reader.read(); + if (done) break; + if (bytes + value.byteLength > maxResponseBytes) { + const keep = Math.max(0, maxResponseBytes - bytes); + if (keep > 0) { + chunks.push(value.slice(0, keep)); + bytes += keep; + } + responseTruncated = true; + try { + await reader.cancel(); + } catch { + // Ignore cancel failures after the bounded preview has been collected. 
+ } + break; + } + chunks.push(value); + bytes += value.byteLength; + } + } + const buffer = new Uint8Array(bytes); + let offset = 0; + for (const chunk of chunks) { + buffer.set(chunk, offset); + offset += chunk.byteLength; + } + const text = new TextDecoder().decode(buffer); let body: unknown = null; try { - body = text.length > 0 ? JSON.parse(text) : null; + body = text.length > 0 && !responseTruncated ? JSON.parse(text) : null; } catch { body = { text }; } - return { ok: res.ok, status: res.status, body }; + if (responseTruncated) { + body = { _unideskResponseTruncated: true, maxResponseBytes, bytesRead: bytes, contentLength: res.headers.get("content-length"), textPreview: text }; + } + return { ok: res.ok, status: res.status, body, responseTruncated, responseBytesRead: bytes, responseContentLength: res.headers.get("content-length") }; } catch (error) { return { ok: false, error: error instanceof Error ? error.message : String(error) }; } finally { @@ -208,11 +246,11 @@ async function loginFrontend(host: string, config: UniDeskConfig): Promise { +async function frontendJson(session: FrontendSession, path: string, init?: RequestInit, timeoutMs = 8000, maxResponseBytes = 5_000_000): Promise { const headers = new Headers(init?.headers); headers.set("cookie", session.cookie); if (init?.body !== undefined && !headers.has("content-type")) headers.set("content-type", "application/json"); - return readJson(`${session.baseUrl}${path}`, { ...init, headers }, timeoutMs); + return readJson(`${session.baseUrl}${path}`, { ...init, headers }, timeoutMs, maxResponseBytes); } function stringOption(args: string[], name: string): string | undefined { @@ -231,6 +269,10 @@ function numberOption(args: string[], name: string, defaultValue: number): numbe return value; } +function cappedNumberOption(args: string[], name: string, defaultValue: number, maxValue: number): number { + return Math.min(numberOption(args, name, defaultValue), maxValue); +} + function jsonOption(args: 
string[], name: string): Record | undefined { const raw = stringOption(args, name); if (raw === undefined) return undefined; @@ -462,7 +504,11 @@ async function remoteMicroservice(session: FrontendSession, args: string[]): Pro }; } if (action === "proxy" && id !== undefined && path !== undefined && path.startsWith("/")) { - const response = await frontendJson(session, `/api/microservices/${encodeURIComponent(id)}/proxy${path}`, undefined, 24_000); + const full = args.includes("--full"); + const raw = args.includes("--raw"); + const maxBodyBytes = full ? numberOption(args, "--max-body-bytes", 5_000_000) : cappedNumberOption(args, "--max-body-bytes", raw ? 120_000 : 60_000, 500_000); + const maxResponseBytes = full ? Math.min(Math.max(maxBodyBytes, 120_000), 5_000_000) : Math.min(Math.max(maxBodyBytes * 3, 240_000), 1_500_000); + const response = await frontendJson(session, `/api/microservices/${encodeURIComponent(id)}/proxy${path}`, undefined, 24_000, maxResponseBytes); return { transport: "frontend", response: summarizeMicroserviceProxyResponse(response, args), diff --git a/scripts/src/swap.ts b/scripts/src/swap.ts new file mode 100644 index 00000000..390c86db --- /dev/null +++ b/scripts/src/swap.ts @@ -0,0 +1,303 @@ +import { accessSync, constants, existsSync, readFileSync, statSync } from "node:fs"; +import { runCommand } from "./command"; +import { repoRoot } from "./config"; + +const defaultSwapPath = "/swapfile"; +const defaultSwapSizeBytes = 2 * 1024 * 1024 * 1024; + +export interface SwapArea { + filename: string; + type: string; + sizeBytes: number; + usedBytes: number; + priority: number | null; +} + +export interface SwapMemoryStatus { + totalBytes: number; + availableBytes: number | null; + swapTotalBytes: number; + swapFreeBytes: number; +} + +export interface SwapStatus { + memory: SwapMemoryStatus; + activeSwaps: SwapArea[]; + configuredPath: string; + configuredPathExists: boolean; + configuredPathMode: string | null; + configuredPathSizeBytes: 
number | null; + configuredPathActive: boolean; + fstab: { + path: string; + writable: boolean; + persisted: boolean; + matchingLine: string | null; + error: string | null; + }; + warning: string | null; +} + +export interface SwapEnsureResult { + ok: boolean; + status: "ok" | "degraded" | "failed"; + requested: { + path: string; + sizeBytes: number; + }; + before: SwapStatus; + after: SwapStatus; + actions: Array<{ action: string; ok: boolean; detail?: unknown }>; + errors: Array<{ action: string; message: string; detail?: unknown }>; +} + +function shellQuote(value: string): string { + return `'${value.replace(/'/g, `'\\''`)}'`; +} + +function parseByteCount(value: string): number { + const raw = value.trim(); + if (/^\d+$/u.test(raw)) return Number(raw); + const match = raw.match(/^([0-9]+(?:\.[0-9]+)?)([KMGTPE]?i?B?)$/iu); + if (!match) return 0; + const amount = Number(match[1]); + const unit = match[2].toUpperCase(); + const powers: Record = { + K: 1, + KB: 1, + KIB: 1, + M: 2, + MB: 2, + MIB: 2, + G: 3, + GB: 3, + GIB: 3, + T: 4, + TB: 4, + TIB: 4, + P: 5, + PB: 5, + PIB: 5, + E: 6, + EB: 6, + EIB: 6, + }; + return Math.round(amount * (1024 ** (powers[unit] ?? 0))); +} + +function parseMeminfo(): SwapMemoryStatus { + const raw = readFileSync("/proc/meminfo", "utf8"); + const values = new Map(); + for (const line of raw.split("\n")) { + const match = line.match(/^([^:]+):\s+(\d+)\s+kB/u); + if (match) values.set(match[1], Number(match[2]) * 1024); + } + return { + totalBytes: values.get("MemTotal") ?? 0, + availableBytes: values.get("MemAvailable") ?? null, + swapTotalBytes: values.get("SwapTotal") ?? 0, + swapFreeBytes: values.get("SwapFree") ?? 
0, + }; +} + +function parseSwaps(): SwapArea[] { + if (!existsSync("/proc/swaps")) return []; + const lines = readFileSync("/proc/swaps", "utf8").trim().split("\n").slice(1); + return lines.map((line) => line.trim().split(/\s+/u)).filter((parts) => parts.length >= 5).map(([filename, type, sizeKiB, usedKiB, priority]) => ({ + filename, + type, + sizeBytes: Number(sizeKiB) * 1024, + usedBytes: Number(usedKiB) * 1024, + priority: Number.isFinite(Number(priority)) ? Number(priority) : null, + })); +} + +function fileMode(path: string): string | null { + if (!existsSync(path)) return null; + return (statSync(path).mode & 0o777).toString(8).padStart(3, "0"); +} + +function fstabStatus(path: string): SwapStatus["fstab"] { + const fstabPath = "/etc/fstab"; + try { + const raw = existsSync(fstabPath) ? readFileSync(fstabPath, "utf8") : ""; + let writable = false; + try { + accessSync(fstabPath, constants.W_OK); + writable = true; + } catch { + writable = false; + } + const matchingLine = raw.split("\n").find((line) => { + const trimmed = line.trim(); + if (trimmed.length === 0 || trimmed.startsWith("#")) return false; + const parts = trimmed.split(/\s+/u); + return parts[0] === path && parts[2] === "swap"; + }) ?? null; + return { + path: fstabPath, + writable, + persisted: matchingLine !== null, + matchingLine, + error: null, + }; + } catch (error) { + return { + path: fstabPath, + writable: false, + persisted: false, + matchingLine: null, + error: error instanceof Error ? error.message : String(error), + }; + } +} + +export function swapStatus(path = defaultSwapPath): SwapStatus { + const memory = parseMeminfo(); + const activeSwaps = parseSwaps(); + const configuredPathExists = existsSync(path); + const configuredPathSizeBytes = configuredPathExists ? statSync(path).size : null; + const configuredPathActive = activeSwaps.some((swap) => swap.filename === path); + const warning = memory.swapTotalBytes > 0 ? 
null : "swap is not active; low-memory main servers are at risk of global OOM during builds or diagnostics"; + return { + memory, + activeSwaps, + configuredPath: path, + configuredPathExists, + configuredPathMode: fileMode(path), + configuredPathSizeBytes, + configuredPathActive, + fstab: fstabStatus(path), + warning, + }; +} + +function pushAction( + actions: SwapEnsureResult["actions"], + errors: SwapEnsureResult["errors"], + action: string, + command: string[], +): boolean { + const result = runCommand(command, repoRoot, { timeoutMs: 120_000 }); + const ok = result.exitCode === 0; + const detail = { + command, + exitCode: result.exitCode, + stdoutTail: result.stdout.slice(-1200), + stderrTail: result.stderr.slice(-1200), + timedOut: result.timedOut, + }; + actions.push({ action, ok, detail }); + if (!ok) { + errors.push({ + action, + message: result.stderr.trim() || result.stdout.trim() || `command failed with exit code ${result.exitCode}`, + detail, + }); + } + return ok; +} + +function ensureFstabLine(path: string): { ok: boolean; action: string; detail: unknown } { + const line = `${path} none swap sw 0 0`; + const script = [ + "set -euo pipefail", + "touch /etc/fstab", + `grep -Eq '^${path.replace(/[.*+?^${}()|[\]\\]/g, "\\$&")}[[:space:]]+[^[:space:]]+[[:space:]]+swap[[:space:]]' /etc/fstab || printf '%s\\n' ${shellQuote(line)} >> /etc/fstab`, + ].join("\n"); + const result = runCommand(["bash", "-lc", script], repoRoot, { timeoutMs: 30_000 }); + return { + ok: result.exitCode === 0, + action: "persist-fstab", + detail: { + command: ["bash", "-lc", script], + exitCode: result.exitCode, + stdoutTail: result.stdout.slice(-1200), + stderrTail: result.stderr.slice(-1200), + timedOut: result.timedOut, + }, + }; +} + +function parseSizeOption(args: string[], defaultBytes: number): number { + const index = args.indexOf("--size"); + const raw = index === -1 ? 
undefined : args[index + 1]; + if (raw === undefined) return defaultBytes; + const bytes = parseByteCount(raw); + if (!Number.isFinite(bytes) || bytes <= 0) throw new Error("--size must be a positive byte count such as 2GiB or 4096M"); + return bytes; +} + +function parsePathOption(args: string[], defaultPath: string): string { + const index = args.indexOf("--path"); + if (index === -1) return defaultPath; + const raw = args[index + 1]; + if (raw === undefined || !raw.startsWith("/")) throw new Error("--path must be an absolute path"); + return raw; +} + +function hasFlag(args: string[], name: string): boolean { + return args.includes(name); +} + +export function runSwapCommand(args: string[]): unknown { + const [action = "status"] = args; + const path = parsePathOption(args, defaultSwapPath); + if (action === "status") return swapStatus(path); + if (action === "ensure") { + const sizeBytes = parseSizeOption(args, defaultSwapSizeBytes); + const dryRun = hasFlag(args, "--dry-run"); + const before = swapStatus(path); + const actions: SwapEnsureResult["actions"] = []; + const errors: SwapEnsureResult["errors"] = []; + if (before.memory.swapTotalBytes > 0) { + actions.push({ action: "noop-existing-swap", ok: true, detail: { activeSwaps: before.activeSwaps } }); + const after = swapStatus(path); + return { ok: true, status: "ok", requested: { path, sizeBytes }, before, after, actions, errors } satisfies SwapEnsureResult; + } + if (dryRun) { + actions.push({ action: "dry-run", ok: true, detail: { wouldCreate: path, sizeBytes, wouldPersistFstab: true } }); + const after = swapStatus(path); + return { ok: true, status: "degraded", requested: { path, sizeBytes }, before, after, actions, errors } satisfies SwapEnsureResult; + } + if (!existsSync(path)) { + const sizeMiB = Math.ceil(sizeBytes / 1024 / 1024); + const allocated = pushAction(actions, errors, "allocate-swapfile", ["fallocate", "-l", `${sizeMiB}M`, path]); + if (!allocated) pushAction(actions, errors, 
"allocate-swapfile-dd-fallback", ["dd", "if=/dev/zero", `of=${path}`, "bs=1M", `count=${sizeMiB}`, "status=none"]); + } else { + const existingBytes = statSync(path).size; + if (existingBytes < sizeBytes) { + const sizeMiB = Math.ceil(sizeBytes / 1024 / 1024); + const resized = pushAction(actions, errors, "resize-existing-swapfile", ["fallocate", "-l", `${sizeMiB}M`, path]); + if (!resized) pushAction(actions, errors, "resize-existing-swapfile-dd-fallback", ["dd", "if=/dev/zero", `of=${path}`, "bs=1M", `count=${sizeMiB}`, "status=none"]); + } else { + actions.push({ action: "reuse-existing-swapfile-path", ok: true, detail: { path, sizeBytes: existingBytes } }); + } + } + pushAction(actions, errors, "chmod-600", ["chmod", "600", path]); + pushAction(actions, errors, "mkswap", ["mkswap", path]); + pushAction(actions, errors, "swapon", ["swapon", path]); + const persist = ensureFstabLine(path); + actions.push({ action: persist.action, ok: persist.ok, detail: persist.detail }); + if (!persist.ok) { + errors.push({ + action: persist.action, + message: "swap is active but /etc/fstab could not be updated; rerun ensure as root or add the returned fstab line manually", + detail: persist.detail, + }); + } + const after = swapStatus(path); + const swapActive = after.memory.swapTotalBytes > 0; + const status = swapActive && after.fstab.persisted ? "ok" : swapActive ? 
"degraded" : "failed"; + return { + ok: status !== "failed", + status, + requested: { path, sizeBytes }, + before, + after, + actions, + errors, + } satisfies SwapEnsureResult; + } + throw new Error("server swap command must be one of: status, ensure"); +} From 0ed5c3150934e1713b7a2b5b9fcb455723e2ff46 Mon Sep 17 00:00:00 2001 From: Codex Date: Sun, 17 May 2026 08:14:09 +0000 Subject: [PATCH 13/15] Make Code Queue read role database startup read-only --- .../microservices/code-queue/src/index.ts | 180 ++++++++++-------- 1 file changed, 96 insertions(+), 84 deletions(-) diff --git a/src/components/microservices/code-queue/src/index.ts b/src/components/microservices/code-queue/src/index.ts index c270a997..9662b2d9 100644 --- a/src/components/microservices/code-queue/src/index.ts +++ b/src/components/microservices/code-queue/src/index.ts @@ -283,6 +283,10 @@ function serviceRoleAllowsScheduler(role: CodeQueueServiceRole): boolean { return role === "combined" || role === "scheduler"; } +function serviceRoleReadOnly(role: CodeQueueServiceRole): boolean { + return role === "read"; +} + function envList(name: string, fallback: string[]): string[] { const raw = process.env[name]; const source = raw === undefined || raw.length === 0 ? 
fallback.join(",") : raw; @@ -1158,6 +1162,7 @@ function runGarbageCollection(): void { } function scheduleDatabaseFlush(delayMs = config.databaseFlushIntervalMs): void { + if (serviceRoleReadOnly(config.serviceRole)) return; if (!databaseReady || (dirtyDatabaseTaskIds.size === 0 && dirtyDatabaseQueueIds.size === 0) || shutdownRequested) return; if (databaseFlushTimer !== null) return; databaseFlushTimer = setTimeout(() => { @@ -1924,6 +1929,11 @@ async function loadNextSeqFromDatabase(): Promise { } async function flushDirtyTasksToDatabase(force = false): Promise { + if (serviceRoleReadOnly(config.serviceRole)) { + dirtyDatabaseTaskIds.clear(); + dirtyDatabaseQueueIds.clear(); + return; + } if (!databaseReady) return; if (databaseFlushInFlight && !force) { scheduleDatabaseFlush(); @@ -1964,89 +1974,91 @@ async function flushDirtyTasksToDatabase(force = false): Promise { async function initDatabasePersistence(): Promise { logger("info", "database_persistence_init_start", { databaseUrl: redactDatabaseUrl(config.databaseUrl) }); - await sql` - CREATE TABLE IF NOT EXISTS unidesk_code_queue_tasks ( - id TEXT PRIMARY KEY, - queue_id TEXT NOT NULL DEFAULT 'default', - status TEXT NOT NULL, - provider_id TEXT NOT NULL DEFAULT 'main-server', - execution_mode TEXT NOT NULL DEFAULT 'default', - model TEXT NOT NULL, - cwd TEXT NOT NULL, - prompt TEXT NOT NULL, - base_prompt TEXT NOT NULL DEFAULT '', - reference_task_ids JSONB NOT NULL DEFAULT '[]'::jsonb, - reference_injection JSONB, - reasoning_effort TEXT, - max_attempts INTEGER NOT NULL, - current_attempt INTEGER NOT NULL DEFAULT 0, - current_mode TEXT, - codex_thread_id TEXT, - active_turn_id TEXT, - created_at TIMESTAMPTZ NOT NULL, - updated_at TIMESTAMPTZ NOT NULL, - started_at TIMESTAMPTZ, - finished_at TIMESTAMPTZ, - read_at TIMESTAMPTZ, - last_error TEXT, - last_judge JSONB, - output_count INTEGER NOT NULL DEFAULT 0, - event_count INTEGER NOT NULL DEFAULT 0, - attempt_count INTEGER NOT NULL DEFAULT 0, - 
last_output_seq BIGINT NOT NULL DEFAULT 0, - task_json JSONB NOT NULL - ) - `; - await sql` - CREATE TABLE IF NOT EXISTS unidesk_code_queue_queues ( - id TEXT PRIMARY KEY, - name TEXT NOT NULL DEFAULT '', - created_at TIMESTAMPTZ NOT NULL, - updated_at TIMESTAMPTZ NOT NULL - ) - `; - await sql` - CREATE TABLE IF NOT EXISTS unidesk_code_queue_workdirs ( - provider_id TEXT NOT NULL, - execution_mode TEXT NOT NULL DEFAULT 'default', - path TEXT NOT NULL, - created_at TIMESTAMPTZ NOT NULL, - updated_at TIMESTAMPTZ NOT NULL, - PRIMARY KEY (provider_id, execution_mode, path) - ) - `; - await sql` - CREATE TABLE IF NOT EXISTS unidesk_code_queue_notifications ( - id TEXT PRIMARY KEY, - kind TEXT NOT NULL, - dedup_key TEXT NOT NULL, - target TEXT NOT NULL, - message TEXT NOT NULL, - created_at TIMESTAMPTZ NOT NULL, - updated_at TIMESTAMPTZ NOT NULL, - attempts INTEGER NOT NULL DEFAULT 0, - next_attempt_at TIMESTAMPTZ NOT NULL, - last_error TEXT, - sent_at TIMESTAMPTZ - ) - `; - await sql`ALTER TABLE unidesk_code_queue_tasks ADD COLUMN IF NOT EXISTS queue_id TEXT NOT NULL DEFAULT 'default'`; - await sql`ALTER TABLE unidesk_code_queue_tasks ADD COLUMN IF NOT EXISTS provider_id TEXT NOT NULL DEFAULT 'main-server'`; - await sql`ALTER TABLE unidesk_code_queue_tasks ADD COLUMN IF NOT EXISTS execution_mode TEXT NOT NULL DEFAULT 'default'`; - await sql`ALTER TABLE unidesk_code_queue_tasks ADD COLUMN IF NOT EXISTS base_prompt TEXT NOT NULL DEFAULT ''`; - await sql`ALTER TABLE unidesk_code_queue_tasks ADD COLUMN IF NOT EXISTS reference_task_ids JSONB NOT NULL DEFAULT '[]'::jsonb`; - await sql`ALTER TABLE unidesk_code_queue_tasks ADD COLUMN IF NOT EXISTS reference_injection JSONB`; - await sql`ALTER TABLE unidesk_code_queue_tasks ADD COLUMN IF NOT EXISTS read_at TIMESTAMPTZ`; - await sql` - UPDATE unidesk_code_queue_tasks - SET read_at = NULLIF(task_json->>'readAt', '')::timestamptz - WHERE read_at IS NULL - AND status IN ('succeeded', 'failed', 'canceled') - AND 
COALESCE(task_json->>'readAt', '') <> '' - AND (task_json->>'readAt') ~ '^\\d{4}-\\d{2}-\\d{2}T' - `; - await sql`ALTER TABLE unidesk_code_queue_queues ADD COLUMN IF NOT EXISTS name TEXT NOT NULL DEFAULT ''`; - await sql`ALTER TABLE unidesk_code_queue_workdirs ADD COLUMN IF NOT EXISTS execution_mode TEXT NOT NULL DEFAULT 'default'`; + if (!serviceRoleReadOnly(config.serviceRole)) { + await sql` + CREATE TABLE IF NOT EXISTS unidesk_code_queue_tasks ( + id TEXT PRIMARY KEY, + queue_id TEXT NOT NULL DEFAULT 'default', + status TEXT NOT NULL, + provider_id TEXT NOT NULL DEFAULT 'main-server', + execution_mode TEXT NOT NULL DEFAULT 'default', + model TEXT NOT NULL, + cwd TEXT NOT NULL, + prompt TEXT NOT NULL, + base_prompt TEXT NOT NULL DEFAULT '', + reference_task_ids JSONB NOT NULL DEFAULT '[]'::jsonb, + reference_injection JSONB, + reasoning_effort TEXT, + max_attempts INTEGER NOT NULL, + current_attempt INTEGER NOT NULL DEFAULT 0, + current_mode TEXT, + codex_thread_id TEXT, + active_turn_id TEXT, + created_at TIMESTAMPTZ NOT NULL, + updated_at TIMESTAMPTZ NOT NULL, + started_at TIMESTAMPTZ, + finished_at TIMESTAMPTZ, + read_at TIMESTAMPTZ, + last_error TEXT, + last_judge JSONB, + output_count INTEGER NOT NULL DEFAULT 0, + event_count INTEGER NOT NULL DEFAULT 0, + attempt_count INTEGER NOT NULL DEFAULT 0, + last_output_seq BIGINT NOT NULL DEFAULT 0, + task_json JSONB NOT NULL + ) + `; + await sql` + CREATE TABLE IF NOT EXISTS unidesk_code_queue_queues ( + id TEXT PRIMARY KEY, + name TEXT NOT NULL DEFAULT '', + created_at TIMESTAMPTZ NOT NULL, + updated_at TIMESTAMPTZ NOT NULL + ) + `; + await sql` + CREATE TABLE IF NOT EXISTS unidesk_code_queue_workdirs ( + provider_id TEXT NOT NULL, + execution_mode TEXT NOT NULL DEFAULT 'default', + path TEXT NOT NULL, + created_at TIMESTAMPTZ NOT NULL, + updated_at TIMESTAMPTZ NOT NULL, + PRIMARY KEY (provider_id, execution_mode, path) + ) + `; + await sql` + CREATE TABLE IF NOT EXISTS unidesk_code_queue_notifications ( + id TEXT 
PRIMARY KEY, + kind TEXT NOT NULL, + dedup_key TEXT NOT NULL, + target TEXT NOT NULL, + message TEXT NOT NULL, + created_at TIMESTAMPTZ NOT NULL, + updated_at TIMESTAMPTZ NOT NULL, + attempts INTEGER NOT NULL DEFAULT 0, + next_attempt_at TIMESTAMPTZ NOT NULL, + last_error TEXT, + sent_at TIMESTAMPTZ + ) + `; + await sql`ALTER TABLE unidesk_code_queue_tasks ADD COLUMN IF NOT EXISTS queue_id TEXT NOT NULL DEFAULT 'default'`; + await sql`ALTER TABLE unidesk_code_queue_tasks ADD COLUMN IF NOT EXISTS provider_id TEXT NOT NULL DEFAULT 'main-server'`; + await sql`ALTER TABLE unidesk_code_queue_tasks ADD COLUMN IF NOT EXISTS execution_mode TEXT NOT NULL DEFAULT 'default'`; + await sql`ALTER TABLE unidesk_code_queue_tasks ADD COLUMN IF NOT EXISTS base_prompt TEXT NOT NULL DEFAULT ''`; + await sql`ALTER TABLE unidesk_code_queue_tasks ADD COLUMN IF NOT EXISTS reference_task_ids JSONB NOT NULL DEFAULT '[]'::jsonb`; + await sql`ALTER TABLE unidesk_code_queue_tasks ADD COLUMN IF NOT EXISTS reference_injection JSONB`; + await sql`ALTER TABLE unidesk_code_queue_tasks ADD COLUMN IF NOT EXISTS read_at TIMESTAMPTZ`; + await sql` + UPDATE unidesk_code_queue_tasks + SET read_at = NULLIF(task_json->>'readAt', '')::timestamptz + WHERE read_at IS NULL + AND status IN ('succeeded', 'failed', 'canceled') + AND COALESCE(task_json->>'readAt', '') <> '' + AND (task_json->>'readAt') ~ '^\\d{4}-\\d{2}-\\d{2}T' + `; + await sql`ALTER TABLE unidesk_code_queue_queues ADD COLUMN IF NOT EXISTS name TEXT NOT NULL DEFAULT ''`; + await sql`ALTER TABLE unidesk_code_queue_workdirs ADD COLUMN IF NOT EXISTS execution_mode TEXT NOT NULL DEFAULT 'default'`; + } const countRows = await sql>`SELECT COUNT(*) AS count FROM unidesk_code_queue_tasks`; const hotTasks = await loadTasksFromDatabase("hot"); @@ -2091,7 +2103,7 @@ async function initDatabasePersistence(): Promise { } } ensureDefaultWorkdirRecords(); - await upsertWorkdirsToDatabase(sortedWorkdirRecords()); + if (!serviceRoleReadOnly(config.serviceRole)) 
await upsertWorkdirsToDatabase(sortedWorkdirRecords()); databaseReady = true; if (config.serviceRole === "combined" || config.serviceRole === "scheduler") scheduleStartupDatabaseMaintenance(); runGarbageCollection(); From dd448c7218223f0275f3e3e9012b76693e1aa938 Mon Sep 17 00:00:00 2001 From: Codex Date: Sun, 17 May 2026 08:39:03 +0000 Subject: [PATCH 14/15] Bound provider egress tunnel lifecycle --- docs/reference/observability.md | 2 +- docs/reference/provider-gateway.md | 4 +- src/components/backend-core/src/egress-tcp.ts | 49 ++++++- src/components/backend-core/src/types.ts | 2 + src/components/provider-gateway/package.json | 2 +- .../provider-gateway/src/egress-proxy.ts | 121 +++++++++++++++--- 6 files changed, 154 insertions(+), 26 deletions(-) diff --git a/docs/reference/observability.md b/docs/reference/observability.md index e4edd79f..e2be8653 100644 --- a/docs/reference/observability.md +++ b/docs/reference/observability.md @@ -38,6 +38,6 @@ frontend Bun server 必须提供同源 `/api/frontend-performance`,记录 webu 性能优化必须先用这些指标锁定慢操作名称、路径、耗时和代理层级,再改后端查询或前后端通信策略;不得只凭主观体感改 UI。Code Queue 这类控制面页面出现 `core_proxy`、`GET /api/microservices/code-queue/proxy/api/tasks/overview`、`POST /api/microservices/code-queue/proxy/api/tasks//read` 等超过 1s 的慢操作时,应保留优化前后的性能面板证据,并同时记录 live API 耗时、容器内存、`/health` 存储摘要和是否仍通过 PostgreSQL/append-only archive 重建历史数据。短 TTL cache、warmup 或页面内存缓存只能作为重复请求抖动保护,性能证据必须证明数据库索引/聚合、分页和渐进式披露本身已把核心路径降到目标内,不能用长缓存遮蔽慢 SQL 或全量 JSON 物化。 -当最近失败请求集中出现 frontend `core_proxy` 502/503/504,路径为 `/api/microservices/code-queue/proxy/...` 的 overview、trace 或 summary,且 k3s/k8s Pod 仍在运行时,必须先运行 `bun scripts/cli.ts microservice diagnostics code-queue`,区分 provider-gateway online、WebSocket HTTP tunnel、k3sctl-adapter、Kubernetes API service proxy 和目标 Service 五段状态。provider tunnel 类失败必须记录响应 body/headers 中的 `requestId`、`stage`、`failureReason`、`x-unidesk-request-id` 和 `x-unidesk-tunnel-error`;如需主动验证错误结构,运行 `bun scripts/cli.ts microservice tunnel-self-test code-queue`,该自测应返回预期失败但 `ok=true` 
的诊断结果。随后再继续判断“Kubernetes API service proxy 不可达”“Code Queue 进程不可达”和“Code Queue event loop 被热路径同步工作饿死”。排障顺序是同时查看 `/api/frontend-performance`、`/api/performance`、`k3sctl-adapter` `/api/control-plane`、Kubernetes Pod `/live`、`/health`、overview/trace-step curl、`kubectl top pod` 或 Docker stats、容器 `RestartCount`/`OOMKilled` 和 Code Queue 日志;如果 Pod 内 `/health` 也超时,应优先检查实时 output 发布、archive 读取、transcript 构建、统计计算、启动维护、历史 OA backfill 和远程 Provider 准备/SSH 子进程是否阻塞 event loop,而不是先调整 frontend 渲染或代理超时。Code Queue 默认不得在启动时自动执行历史 OA backfill 或通知表索引维护;显式 backfill 必须作为运维动作记录,并在运行期间并发证明 `/live`、`/health` 与 `/api/tasks/overview` 仍快速返回。涉及 D601 等远程 Provider 时,还要检查 `runCodeQueueSsh`/开发容器准备是否仍存在同步子进程、无 timeout 的 SSH、无上限 stdout/stderr 或 stale TUN 重建等待;修复后必须在远程准备探针运行期间并发证明 Pod `/health` 与 `/api/tasks/overview` 仍快速返回。 +当最近失败请求集中出现 frontend `core_proxy` 502/503/504,路径为 `/api/microservices/code-queue/proxy/...` 的 overview、trace 或 summary,且 k3s/k8s Pod 仍在运行时,必须先运行 `bun scripts/cli.ts microservice diagnostics code-queue`,区分 provider-gateway online、WebSocket HTTP tunnel、k3sctl-adapter、Kubernetes API service proxy 和目标 Service 五段状态。provider tunnel 类失败必须记录响应 body/headers 中的 `requestId`、`stage`、`failureReason`、`x-unidesk-request-id` 和 `x-unidesk-tunnel-error`;如需主动验证错误结构,运行 `bun scripts/cli.ts microservice tunnel-self-test code-queue`,该自测应返回预期失败但 `ok=true` 的诊断结果。随后再继续判断“Kubernetes API service proxy 不可达”“Code Queue 进程不可达”和“Code Queue event loop 被热路径同步工作饿死”。如果 `debug health` 或 provider-gateway egress health 显示 `providerGatewayEgressProxyActiveTunnels` 持续偏高、`pendingTunnels` 非零或 `oldestTunnelAgeMs` 长时间增长,应先按 provider-gateway egress tunnel 生命周期排障,确认 `egress_tcp_open`、connect timeout、idle cleanup 与 core socket close 清理是否生效。排障顺序是同时查看 `/api/frontend-performance`、`/api/performance`、`k3sctl-adapter` `/api/control-plane`、Kubernetes Pod `/live`、`/health`、overview/trace-step curl、`kubectl top pod` 或 Docker stats、容器 `RestartCount`/`OOMKilled` 和 Code Queue 日志;如果 Pod 内 `/health` 也超时,应优先检查实时 output 发布、archive 读取、transcript 
构建、统计计算、启动维护、历史 OA backfill 和远程 Provider 准备/SSH 子进程是否阻塞 event loop,而不是先调整 frontend 渲染或代理超时。Code Queue 默认不得在启动时自动执行历史 OA backfill 或通知表索引维护;显式 backfill 必须作为运维动作记录,并在运行期间并发证明 `/live`、`/health` 与 `/api/tasks/overview` 仍快速返回。涉及 D601 等远程 Provider 时,还要检查 `runCodeQueueSsh`/开发容器准备是否仍存在同步子进程、无 timeout 的 SSH、无上限 stdout/stderr 或 stale TUN 重建等待;修复后必须在远程准备探针运行期间并发证明 Pod `/health` 与 `/api/tasks/overview` 仍快速返回。 Code Queue task 明明产出最终回复却反复 `retry_wait` 时,应优先用任务详情里的 latest attempt 字段核查 `terminalStatus`、`transportClosedBeforeTerminal`、`appServerExitCode`、`finalResponseChars`、`judge.raw._safetyOverride` 和 attempt output。OpenCode 远程任务中,`opencode completed status=completed exit=0` 加当前 attempt 非空 assistant 输出应对应 `terminalStatus=completed`、`transportClosedBeforeTerminal=false`;如果因为缺少 `step_finish` 事件仍触发 `_safetyOverride=terminal_not_completed`,说明协议终态归一化有回归。相反,当前 attempt 没有最终 assistant response 时即使 tool/read/bash 证据完整,也必须 retry,不能用旧 `task.finalResponse` 或 reasoning/tool evidence 代替可见最终回复。 diff --git a/docs/reference/provider-gateway.md b/docs/reference/provider-gateway.md index 3290151d..f20c2fee 100644 --- a/docs/reference/provider-gateway.md +++ b/docs/reference/provider-gateway.md @@ -100,10 +100,12 @@ backend-core 必须把 provider WebSocket HTTP tunnel 的失败分类到响应 b provider-gateway 可以提供 egress HTTP CONNECT 代理,用于让 Code Queue、Pipeline runner、target-side Docker build 等节点侧执行环境通过既有 provider WebSocket 通道出网。代理默认监听容器内 `0.0.0.0:18789`,节点部署必须只发布为宿主 loopback `127.0.0.1:18789->18789/tcp`,不得开放公网端口;普通 Docker 执行容器可通过同一私有 Docker network 访问 provider-gateway 容器名,k3s/k8s Pod 必须通过显式 Kubernetes Service 暴露同节点 provider-gateway 私有 endpoint,例如 D601 Code Queue 使用 selector 指向 hostNetwork 桥接 Pod 的 `d601-provider-egress-proxy.unidesk.svc.cluster.local:18789`,不得把固定 Docker bridge IP、手工 EndpointSlice 或该 egress Service 当作业务 HTTP 入口。代理只负责把本地 CONNECT/absolute HTTP 请求转换为 `egress_tcp_open`、`egress_tcp_data`、`egress_tcp_close` 消息;backend-core 在主 server 侧建立真实 TCP 连接并把数据回传,避免 D601 等计算节点本地网络不可达时卡死 Codex/Git/NPM/apt/Playwright。 
-该能力属于 provider-gateway 通道能力,register/heartbeat 的 `unideskCapabilities` 必须包含 `network.egress-proxy`,labels 必须上报 `providerGatewayEgressProxy*` 状态。不得再为某个用户服务单独注册伪 provider 来实现出网代理;否则节点列表会出现虚假 provider,且代理、统计、升级路径会形成多套通道。代理健康检查使用 `GET /__unidesk/egress-proxy/health`,返回 `connected`、`providerId`、`activeTunnels` 和监听端口;业务服务自己的 `/health` 应把该结果作为排障证据透出。 +该能力属于 provider-gateway 通道能力,register/heartbeat 的 `unideskCapabilities` 必须包含 `network.egress-proxy`,labels 必须上报 `providerGatewayEgressProxy*` 状态。不得再为某个用户服务单独注册伪 provider 来实现出网代理;否则节点列表会出现虚假 provider,且代理、统计、升级路径会形成多套通道。代理健康检查使用 `GET /__unidesk/egress-proxy/health`,返回 `connected`、`providerId`、`activeTunnels`、`pendingTunnels`、`oldestTunnelAgeMs`、`openTimeoutMs`、`idleTimeoutMs` 和监听端口;业务服务自己的 `/health` 应把该结果作为排障证据透出。 egress proxy 的长期边界是“统一 provider 通道,不引入第二控制面”。backend-core 只接受在线 provider socket 上的 `egress_tcp_*` 消息,并在该 socket 关闭时销毁全部对应 TCP relay;provider-gateway 只维护本地 HTTP proxy 与 WebSocket 消息映射,不保存业务状态,不参与任务调度、统计或节点注册以外的控制面。执行容器、用户服务、Pipeline runner 和 provider-side deploy build 不允许直接连接 backend-core provider ingress,也不允许携带 provider token 自行注册;需要出网时只能连接同节点 provider-gateway 的私有 proxy endpoint。当前 k3s/k8s Code Queue 通过 `d601-provider-egress-proxy` Kubernetes Service 连接 D601 provider-gateway egress endpoint,这是 Pod 内的出网入口,不是业务 HTTP 代理入口,也不能替代 Kubernetes API service proxy。部署构建同样不得新建 SSH SOCKS、公网 master proxy 或宿主全局代理;构建脚本只能把 provider-gateway WS egress 作为短生命周期环境变量和 Docker build-arg 注入,并配合目标节点本地 BuildKit/image cache 避免重复下载大依赖层。 +egress tunnel 必须有生命周期边界:provider-gateway 发出 `egress_tcp_open` 后如果主 server 未在 `openTimeoutMs` 内返回 `egress_tcp_opened` 或 close,必须主动关闭本地 client 并向 core 发送 `egress_tcp_close`;provider-gateway 与 backend-core 都必须对长时间无数据的 relay 执行 idle 清理,避免 provider WebSocket 抖动、TCP connect 卡住或上游未关闭时留下 stale tunnel。排障时如果 `activeTunnels` 持续增长、`pendingTunnels` 非零或 `oldestTunnelAgeMs` 明显超过业务请求耗时,应先看 provider-gateway 与 backend-core egress 清理日志,再判断 Code Queue、PostgreSQL 或 OA Event Flow 本身是否慢。 + 故障语义必须显式,不允许静默 fallback。provider-gateway 到 
backend-core 的 WebSocket 未连接时,本地 proxy 必须返回 503;执行容器不能自动绕过到 D601 本地直连公网、外部公共代理或主 server 公网 HTTP 端口。`NO_PROXY` 只用于 PostgreSQL、OA Event Flow、ClaudeQQ、frontend/backend-core 内网代理、provider-gateway health 等明确内网链路,不能把 GitHub、模型 API、npm registry 等外部目标加入绕过列表。`hyueapi.com` 是明确的模型 API 例外:该上游会拒绝 provider-gateway egress proxy 出口,Code Queue 必须用 `CODE_QUEUE_EGRESS_PROXY_NO_PROXY` / `NO_PROXY` 将 `hyueapi.com,.hyueapi.com` 配成直连,其它模型 API 仍不得默认绕过 proxy。验收必须同时证明 provider-gateway labels、业务服务 `/health` 和执行容器内 `curl -I https://...` 都走同一 proxy path,hyueapi 例外则以 Code Queue `/health.egressProxy.noProxy` 和目标任务成功完成作为证据。 ## Gateway Version Metadata diff --git a/src/components/backend-core/src/egress-tcp.ts b/src/components/backend-core/src/egress-tcp.ts index dedb45fd..b9cad80d 100644 --- a/src/components/backend-core/src/egress-tcp.ts +++ b/src/components/backend-core/src/egress-tcp.ts @@ -23,6 +23,9 @@ function isValidEgressPort(port: number): boolean { return Number.isInteger(port) && port > 0 && port <= 65_535; } +const egressTcpConnectTimeoutMs = 15_000; +const egressTcpIdleTimeoutMs = 600_000; + function sendEgressClose(provider: ProviderSocket, connectionId: string, error?: string): void { const message: CoreEgressTcpCloseMessage = error === undefined ? 
{ type: "egress_tcp_close", connectionId } @@ -30,13 +33,31 @@ function sendEgressClose(provider: ProviderSocket, connectionId: string, error?: wsSendJson(provider, message); } +function clearConnectionTimers(connection: EgressTcpConnection): void { + if (connection.connectTimer !== null) clearTimeout(connection.connectTimer); + if (connection.idleTimer !== null) clearTimeout(connection.idleTimer); + connection.connectTimer = null; + connection.idleTimer = null; +} + function closeEgressTcpConnection(providerId: string, connectionId: string, error?: string): void { const key = egressTcpKey(providerId, connectionId); const connection = ctx.activeEgressTcpConnections.get(key); if (connection === undefined) return; ctx.activeEgressTcpConnections.delete(key); + clearConnectionTimers(connection); connection.socket.destroy(); - if (error !== undefined) sendEgressClose(connection.provider, connectionId, error); + if (error !== undefined) { + logger("warn", "egress_tcp_connection_closed", { providerId, connectionId, error }); + sendEgressClose(connection.provider, connectionId, error); + } +} + +function refreshConnectionIdle(connection: EgressTcpConnection): void { + if (connection.idleTimer !== null) clearTimeout(connection.idleTimer); + connection.idleTimer = setTimeout(() => { + closeEgressTcpConnection(connection.providerId, connection.connectionId, "egress tcp idle timeout"); + }, egressTcpIdleTimeoutMs); } export function handleEgressTcpOpen(ws: ProviderSocket, message: ProviderEgressTcpOpenMessage): void { @@ -49,13 +70,32 @@ export function handleEgressTcpOpen(ws: ProviderSocket, message: ProviderEgressT const key = egressTcpKey(message.providerId, message.connectionId); closeEgressTcpConnection(message.providerId, message.connectionId); const socket = connectTcp({ host, port }); - const connection: EgressTcpConnection = { providerId: message.providerId, connectionId: message.connectionId, socket, provider: ws }; + const connection: EgressTcpConnection = { + 
providerId: message.providerId, + connectionId: message.connectionId, + socket, + provider: ws, + connectTimer: null, + idleTimer: null, + }; ctx.activeEgressTcpConnections.set(key, connection); + connection.connectTimer = setTimeout(() => { + closeEgressTcpConnection(message.providerId, message.connectionId, "egress tcp connect timeout"); + }, egressTcpConnectTimeoutMs); + refreshConnectionIdle(connection); socket.on("connect", () => { + if (ctx.activeEgressTcpConnections.get(key) !== connection) return; + if (connection.connectTimer !== null) { + clearTimeout(connection.connectTimer); + connection.connectTimer = null; + } + refreshConnectionIdle(connection); const opened: CoreEgressTcpOpenedMessage = { type: "egress_tcp_opened", connectionId: message.connectionId }; wsSendJson(ws, opened); }); socket.on("data", (chunk) => { + if (ctx.activeEgressTcpConnections.get(key) !== connection) return; + refreshConnectionIdle(connection); const data: CoreEgressTcpDataMessage = { type: "egress_tcp_data", connectionId: message.connectionId, @@ -67,11 +107,13 @@ export function handleEgressTcpOpen(ws: ProviderSocket, message: ProviderEgressT socket.on("close", () => { if (ctx.activeEgressTcpConnections.get(key) !== connection) return; ctx.activeEgressTcpConnections.delete(key); + clearConnectionTimers(connection); sendEgressClose(ws, message.connectionId); }); socket.on("error", (error) => { if (ctx.activeEgressTcpConnections.get(key) !== connection) return; ctx.activeEgressTcpConnections.delete(key); + clearConnectionTimers(connection); sendEgressClose(ws, message.connectionId, error.message); }); } @@ -79,6 +121,7 @@ export function handleEgressTcpOpen(ws: ProviderSocket, message: ProviderEgressT export function handleEgressTcpData(message: ProviderEgressTcpDataMessage): void { const connection = ctx.activeEgressTcpConnections.get(egressTcpKey(message.providerId, message.connectionId)); if (connection === undefined) return; + refreshConnectionIdle(connection); 
connection.socket.write(Buffer.from(message.data, message.encoding === "base64" ? "base64" : "utf8")); } @@ -90,6 +133,7 @@ export function closeEgressTcpConnectionsForProvider(providerId: string): void { for (const [key, connection] of ctx.activeEgressTcpConnections) { if (connection.providerId !== providerId) continue; ctx.activeEgressTcpConnections.delete(key); + clearConnectionTimers(connection); connection.socket.destroy(); } } @@ -98,6 +142,7 @@ export function closeEgressTcpConnectionsForSocket(provider: ProviderSocket): vo for (const [key, connection] of ctx.activeEgressTcpConnections) { if (connection.provider !== provider) continue; ctx.activeEgressTcpConnections.delete(key); + clearConnectionTimers(connection); connection.socket.destroy(); } } diff --git a/src/components/backend-core/src/types.ts b/src/components/backend-core/src/types.ts index f887c182..c3c34480 100644 --- a/src/components/backend-core/src/types.ts +++ b/src/components/backend-core/src/types.ts @@ -166,6 +166,8 @@ export interface EgressTcpConnection { connectionId: string; socket: Socket; provider: ProviderSocket; + connectTimer: ReturnType<typeof setTimeout> | null; + idleTimer: ReturnType<typeof setTimeout> | null; } export type HttpTunnelFailureReason = diff --git a/src/components/provider-gateway/package.json b/src/components/provider-gateway/package.json index a21be47c..41c1b161 100644 --- a/src/components/provider-gateway/package.json +++ b/src/components/provider-gateway/package.json @@ -1,6 +1,6 @@ { "name": "@unidesk/provider-gateway", - "version": "0.2.21", + "version": "0.2.22", "private": true, "type": "module", "scripts": { diff --git a/src/components/provider-gateway/src/egress-proxy.ts b/src/components/provider-gateway/src/egress-proxy.ts index 278dd54e..848ff07d 100644 --- a/src/components/provider-gateway/src/egress-proxy.ts +++ b/src/components/provider-gateway/src/egress-proxy.ts @@ -29,7 +29,12 @@ interface Tunnel { method: string; opened: boolean; pending: Buffer[]; + pendingBytes: number; closed:
boolean; + createdAt: number; + lastActivityAt: number; + openTimer: ReturnType<typeof setTimeout> | null; + idleTimer: ReturnType<typeof setTimeout> | null; } export interface ProviderEgressProxyHandle { @@ -111,6 +116,10 @@ function proxyUrlFor(host: string, port: number): string { return `http://${host}:${port}`; } +const tunnelOpenTimeoutMs = 15_000; +const tunnelIdleTimeoutMs = 600_000; +const maxPendingBytes = 4 * 1024 * 1024; + export function startProviderEgressProxy(options: ProviderEgressProxyOptions): ProviderEgressProxyHandle { const proxyUrl = proxyUrlFor(options.listenHost, options.listenPort); const tunnels = new Map(); @@ -118,25 +127,73 @@ export function startProviderEgressProxy(options: ProviderEgressProxyOptions): P const send = (message: EgressToCoreMessage): boolean => options.sendToCore(message); - const status = (): Record => ({ - enabled: true, - providerId: options.providerId, - connected: options.isCoreConnected(), - proxyUrl, - listenHost: options.listenHost, - listenPort: options.listenPort, - activeTunnels: tunnels.size, - channel: "provider-gateway", - }); + const status = (): Record => { + const now = Date.now(); + const tunnelList = Array.from(tunnels.values()); + const ages = tunnelList.map((tunnel) => now - tunnel.createdAt); + return { + enabled: true, + providerId: options.providerId, + connected: options.isCoreConnected(), + proxyUrl, + listenHost: options.listenHost, + listenPort: options.listenPort, + activeTunnels: tunnels.size, + pendingTunnels: tunnelList.filter((tunnel) => !tunnel.opened).length, + oldestTunnelAgeMs: ages.length > 0 ?
Math.max(...ages) : 0, + openTimeoutMs: tunnelOpenTimeoutMs, + idleTimeoutMs: tunnelIdleTimeoutMs, + maxPendingBytes, + channel: "provider-gateway", + }; + }; + + const clearTunnelTimers = (tunnel: Tunnel): void => { + if (tunnel.openTimer !== null) clearTimeout(tunnel.openTimer); + if (tunnel.idleTimer !== null) clearTimeout(tunnel.idleTimer); + tunnel.openTimer = null; + tunnel.idleTimer = null; + }; + + const destroyTunnel = (tunnel: Tunnel, notifyCore: boolean, error?: string): void => { + tunnels.delete(tunnel.id); + tunnel.closed = true; + clearTunnelTimers(tunnel); + tunnel.pending.splice(0); + tunnel.pendingBytes = 0; + if (!tunnel.client.destroyed) tunnel.client.destroy(); + if (notifyCore) send({ type: "egress_tcp_close", providerId: options.providerId, connectionId: tunnel.id, at: nowIso() }); + if (error !== undefined) { + options.logger("warn", "egress_proxy_tunnel_closed", { + connectionId: tunnel.id, + opened: tunnel.opened, + ageMs: Date.now() - tunnel.createdAt, + error, + }); + } + }; const closeTunnel = (id: string, error?: string): void => { const tunnel = tunnels.get(id); if (tunnel === undefined) return; - tunnels.delete(id); - tunnel.closed = true; - if (!tunnel.client.destroyed) tunnel.client.destroy(); - send({ type: "egress_tcp_close", providerId: options.providerId, connectionId: id, at: nowIso() }); - if (error !== undefined) options.logger("warn", "egress_proxy_tunnel_closed", { connectionId: id, error }); + destroyTunnel(tunnel, true, error); + }; + + const refreshTunnelIdle = (tunnel: Tunnel): void => { + if (tunnel.closed) return; + tunnel.lastActivityAt = Date.now(); + if (tunnel.idleTimer !== null) clearTimeout(tunnel.idleTimer); + tunnel.idleTimer = setTimeout(() => closeTunnel(tunnel.id, "egress proxy idle timeout"), tunnelIdleTimeoutMs); + }; + + const queuePendingChunk = (tunnel: Tunnel, chunk: Buffer): boolean => { + if (tunnel.closed) return false; + tunnel.pending.push(chunk); + tunnel.pendingBytes += chunk.byteLength; + 
refreshTunnelIdle(tunnel); + if (tunnel.pendingBytes <= maxPendingBytes) return true; + closeTunnel(tunnel.id, "egress proxy pending buffer exceeded"); + return false; }; const handleCoreMessage = (message: EgressFromCoreMessage): boolean => { @@ -144,24 +201,29 @@ export function startProviderEgressProxy(options: ProviderEgressProxyOptions): P if (message.type === "egress_tcp_opened") { if (tunnel === undefined || tunnel.closed) return true; tunnel.opened = true; + if (tunnel.openTimer !== null) { + clearTimeout(tunnel.openTimer); + tunnel.openTimer = null; + } + refreshTunnelIdle(tunnel); if (tunnel.method === "CONNECT") { tunnel.client.write("HTTP/1.1 200 Connection Established\r\nProxy-Agent: UniDesk-ProviderGateway\r\n\r\n"); } for (const chunk of tunnel.pending.splice(0)) { send({ type: "egress_tcp_data", providerId: options.providerId, connectionId: tunnel.id, data: chunk.toString("base64"), encoding: "base64", at: nowIso() }); } + tunnel.pendingBytes = 0; return true; } if (message.type === "egress_tcp_data") { if (tunnel === undefined || tunnel.client.destroyed) return true; + refreshTunnelIdle(tunnel); tunnel.client.write(Buffer.from(message.data, message.encoding === "base64" ? 
"base64" : "utf8")); return true; } if (message.type === "egress_tcp_close") { if (tunnel !== undefined) { - tunnels.delete(message.connectionId); - tunnel.closed = true; - if (!tunnel.client.destroyed) tunnel.client.destroy(); + destroyTunnel(tunnel, false, message.error); } if (message.error !== undefined && message.error.length > 0) { options.logger("warn", "egress_proxy_remote_close", { connectionId: message.connectionId, error: message.error }); @@ -222,14 +284,28 @@ export function startProviderEgressProxy(options: ProviderEgressProxyOptions): P fail("400 Bad Request", "unsupported proxy target\n"); return; } - const tunnel: Tunnel = { id, client, method: parsed.method, opened: false, pending: [], closed: false }; + const createdAt = Date.now(); + const tunnel: Tunnel = { + id, + client, + method: parsed.method, + opened: false, + pending: [], + pendingBytes: 0, + closed: false, + createdAt, + lastActivityAt: createdAt, + openTimer: null, + idleTimer: null, + }; tunnels.set(id, tunnel); client.on("data", (nextChunk) => { const nextBuffer = Buffer.isBuffer(nextChunk) ? 
nextChunk : Buffer.from(nextChunk); if (!tunnel.opened) { - tunnel.pending.push(nextBuffer); + queuePendingChunk(tunnel, nextBuffer); return; } + refreshTunnelIdle(tunnel); send({ type: "egress_tcp_data", providerId: options.providerId, connectionId: id, data: nextBuffer.toString("base64"), encoding: "base64", at: nowIso() }); }); client.on("close", () => closeTunnel(id)); @@ -238,10 +314,13 @@ export function startProviderEgressProxy(options: ProviderEgressProxyOptions): P if (!opened) { tunnels.delete(id); tunnel.closed = true; + clearTunnelTimers(tunnel); fail("503 Service Unavailable", "provider-gateway core channel is not connected\n"); return; } - if (firstPayload !== null) tunnel.pending.push(firstPayload); + tunnel.openTimer = setTimeout(() => closeTunnel(id, "egress proxy open timeout"), tunnelOpenTimeoutMs); + refreshTunnelIdle(tunnel); + if (firstPayload !== null) queuePendingChunk(tunnel, firstPayload); }); client.on("error", () => undefined); }); From 95a75565bd885ae981157ea8c08f082698e7ff8c Mon Sep 17 00:00:00 2001 From: Codex Date: Sun, 17 May 2026 11:19:30 +0000 Subject: [PATCH 15/15] Run CI read service from checked out source --- .../k3sctl-adapter/k3s/ci/unidesk-ci.pipeline.yaml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/components/microservices/k3sctl-adapter/k3s/ci/unidesk-ci.pipeline.yaml b/src/components/microservices/k3sctl-adapter/k3s/ci/unidesk-ci.pipeline.yaml index 6e3964e6..7f37047d 100644 --- a/src/components/microservices/k3sctl-adapter/k3s/ci/unidesk-ci.pipeline.yaml +++ b/src/components/microservices/k3sctl-adapter/k3s/ci/unidesk-ci.pipeline.yaml @@ -436,11 +436,17 @@ spec: limits: memory: 1Gi volumeMounts: + - name: source + mountPath: /app - name: state mountPath: /var/lib/unidesk/code-queue-ci - name: logs mountPath: /var/log/unidesk volumes: + - name: source + hostPath: + path: "$(workspaces.source.path)/repo" + type: Directory - name: state emptyDir: {} - name: logs