From 89fe767623efd0b427bbb81c08d9cef644953133 Mon Sep 17 00:00:00 2001 From: Codex Date: Fri, 15 May 2026 04:52:16 +0000 Subject: [PATCH] fix: stabilize code queue proxy and egress --- config.json | 2 +- docs/reference/microservices.md | 16 +- .../microservices/code-queue/Dockerfile | 1 + .../microservices/code-queue/package.json | 5 + .../code-queue/src/dev-containers.ts | 16 +- .../microservices/code-queue/src/index.ts | 3 +- .../microservices/code-queue/src/judge.ts | 2 +- .../microservices/code-queue/src/oa-events.ts | 146 ++++++++++ .../code-queue/src/provider-runtime.ts | 250 +++++++++++++++--- .../microservices/code-queue/src/task-view.ts | 44 ++- .../microservices/code-queue/src/types.ts | 1 + 11 files changed, 426 insertions(+), 60 deletions(-) diff --git a/config.json b/config.json index 8cd0dba6..95d1e435 100644 --- a/config.json +++ b/config.json @@ -516,7 +516,7 @@ "containerName": "code-queue-backend" }, "backend": { - "nodeBaseUrl": "http://host.docker.internal:4222", + "nodeBaseUrl": "http://code-queue:4222", "nodeBindHost": "127.0.0.1", "nodePort": 4222, "proxyMode": "provider-gateway-http", diff --git a/docs/reference/microservices.md b/docs/reference/microservices.md index 32a3a098..7ee4aa4e 100644 --- a/docs/reference/microservices.md +++ b/docs/reference/microservices.md @@ -9,7 +9,7 @@ UniDesk 用户服务是挂载到 UniDesk 核心服务上的、面向用户使用 - 用户服务后端端口默认只绑定计算节点本机地址,例如 `127.0.0.1:`,不得直接暴露公网。 - 浏览器只访问 UniDesk frontend;frontend 通过同源 `/api/microservices/*` 代理到 backend-core,backend-core 再通过目标 provider-gateway 的 `microservice.http` 能力访问计算节点本机后端。 - backend-core REST API、database 和计算节点用户服务后端都不得新增公网端口;公网入口仍只有 frontend 和 provider ingress。 -- `microservice.http` 只允许 provider-gateway 访问 `http://127.0.0.1`、`http://localhost`、`http://host.docker.internal` 这类节点本地地址;主 server 内置用户服务可使用同一 Compose 网络内的显式服务名,例如 `todo-note:4211`,D601 Code Queue 等计算节点服务必须使用节点本机映射。backend-core 还必须用 `allowedPathPrefixes` 和 `allowedMethods` 同时限制可代理路径和 HTTP 方法。 +- `microservice.http` 只允许 provider-gateway 访问 `http://127.0.0.1`、`http://localhost`、`http://host.docker.internal` 这类节点本地地址,或明确登记为同一私有 Docker network 内的服务名;主 server 内置用户服务可使用同一 Compose 网络内的显式服务名,例如 `todo-note:4211`,D601 Code Queue 必须使用 provider-gateway network 内的 `code-queue:4222`。backend-core 还必须用 `allowedPathPrefixes` 和 `allowedMethods` 同时限制可代理路径和 HTTP 方法。 ## Config Contract @@ -126,17 +126,18 @@ Baidu Netdisk 在 UniDesk 语境中按纯后端服务管理:不得暴露百度 当前 Code Queue 作为 `id=code-queue` 的用户服务登记在 `config.json`,长期部署在 D601,并接入统一 `oa-event-flow` 发布 Trace/STEP 事实事件与读取统计中心: -- Provider:`D601`,由 D601 provider-gateway 通过 `microservice.http` 访问 D601 本机映射 `http://host.docker.internal:4222`;容器对 D601 host 只绑定 `127.0.0.1:4222`,不得直接暴露公网。 +- Provider:`D601`,由 D601 provider-gateway 通过 `microservice.http` 访问同一 provider-gateway Docker network 内的 `http://code-queue:4222`;`code-queue-backend` 仍可对 D601 host 绑定 `127.0.0.1:4222` 作为节点本地维护入口,但 UniDesk 正式代理链路不得依赖 `host.docker.internal:4222`,避免 Linux/Docker Desktop loopback 绑定差异导致 frontend 502。 - 代码引用:`https://github.com/pikasTech/unidesk` 与配置中的 `repository.commitId`;服务源码位于 `src/components/microservices/code-queue`,属于 UniDesk 自有控制面组件。 - 部署引用:UniDesk 仓库中的 `src/components/microservices/code-queue/docker-compose.d601.yml`,Dockerfile 为 `src/components/microservices/code-queue/Dockerfile`,容器名为 `code-queue-backend`;主 server 根目录 `docker-compose.yml` 不再包含 `code-queue` service。 - 主服务依赖映射:D601 Code Queue 仍以主 PostgreSQL 为权威数据库,`DATABASE_URL` 必须指向主 server 受限端口映射;`OA_EVENT_FLOW_BASE_URL` 必须指向主 server OA Event Flow 受限端口映射;`CODE_QUEUE_NOTIFY_CLAUDEQQ_BASE_URL` 在 D601 上直接使用本机 ClaudeQQ 映射 `http://host.docker.internal:3290`。这些端口映射只服务 D601 运行时,必须用防火墙或等价策略限制来源,不得成为浏览器或任意公网客户端入口。 -- 默认出网代理:D601 `code-queue-backend` 必须默认把 `HTTP_PROXY`、`HTTPS_PROXY` 和 `ALL_PROXY` 注入给 Codex/OpenCode、`git`、`curl`、`npm` 等任务子进程;代理上游必须是 D601 provider-gateway 暴露在 provider-gateway Docker 网络内的 egress HTTP CONNECT 端口,而不是 Code Queue 自建伪 provider WebSocket 或交互 shell 临时 `export`。Code Queue Compose 必须加入 provider-gateway 网络,并通过 `CODE_QUEUE_EGRESS_PROXY_URL` 指向 `http://unidesk-provider-gateway-D601:18789`;provider-gateway 再复用已注册的 provider WebSocket 通道,把 TCP open/data/close 消息转发给主 server backend-core 出网,不依赖 D601 本地直连公网。`NO_PROXY` 必须覆盖 `localhost`、`127.0.0.1`、`host.docker.internal`、provider-gateway 容器名、主 server 地址和 UniDesk 内部服务名,避免 PostgreSQL、OA Event Flow、ClaudeQQ、microservice health 等内网链路绕远或递归进入代理;`/health` 必须暴露 egress proxy 的 `enabled`、`connected`、`proxyUrl`、`channel=provider-gateway` 和上游 provider-gateway health,作为 Codex 网络卡死排障的第一证据。 +- 默认出网代理:D601 `code-queue-backend` 必须默认把 `HTTP_PROXY`、`HTTPS_PROXY` 和 `ALL_PROXY` 注入给 Codex/OpenCode、`git`、`curl`、`npm` 等任务子进程;代理上游必须是 D601 provider-gateway 暴露在 provider-gateway Docker 网络内的 egress HTTP CONNECT 端口,而不是 Code Queue 自建伪 provider WebSocket 或交互 shell 临时 `export`。Code Queue Compose 必须加入 provider-gateway 网络,并通过 `CODE_QUEUE_EGRESS_PROXY_URL` 指向 `http://unidesk-provider-gateway-D601:18789`;provider-gateway 再复用已注册的 provider WebSocket 通道,把 TCP open/data/close 消息转发给主 server backend-core 出网,不依赖 D601 本地直连公网。`NO_PROXY` 必须覆盖 `localhost`、`127.0.0.1`、`host.docker.internal`、provider-gateway 容器名、主 server 地址和 UniDesk 内部服务名,避免 PostgreSQL、OA Event Flow、ClaudeQQ、microservice health 等内网链路绕远或递归进入代理;`/health` 必须暴露 egress proxy 的 `enabled`、`connected`、`proxyUrl`、`channel=provider-gateway` 和上游 provider-gateway health,作为 Codex 网络卡死排障的第一证据。远程开发/执行容器不得只依赖这些环境变量,必须在容器网络层用 TUN 默认路由和 OUTPUT 防火墙强制外网流量只能经 master TUN 出口。 - 出网代理无 fallback 纪律:Code Queue 的运行时配置只允许一个默认出网路径,即 provider-gateway egress proxy;不得在代码中同时保留 Code Queue 自建 WebSocket proxy、临时 shell proxy、D601 本地直连公网、主 server direct HTTP proxy 等隐式分支。Compose 层必须显式设置大小写 `HTTP_PROXY`、`HTTPS_PROXY`、`ALL_PROXY` 和 `NO_PROXY`,服务启动后再把同一组变量写入 `process.env`,确保 service 自检、Codex/OpenCode app-server、任务 shell、`git`、`curl`、`npm` 使用一致路径。任何新增网络 fallback 都必须先进入本参考文档并配套 `/health` 可见状态,否则视为残留旧路径。 -- 上线纪律:Code Queue 相关的前端或后端改进必须在同一任务内正式上线并验证公网 frontend 或 live API,不能只停留在源码、构建产物或“后续再上线”。重建 `frontend` 只替换无状态 WebUI 容器,不会触碰 D601 `code-queue-backend`、PostgreSQL 队列或运行中 Codex thread,不能以“可能影响长期任务”为由延迟前端上线;`code-queue-backend` 本身带有 restart-recovery,允许按 D601 Compose 重启/替换,停止、重启或重建后必须从持久化状态恢复运行中和排队任务。修改 Code Queue 自身时不得等待当前 Code Queue task 结束、等待 queue idle 或等待 `0 running` 后才重启;这会等待自己退出形成自锁。应直接执行 D601 `docker compose -f src/components/microservices/code-queue/docker-compose.d601.yml up -d --build --force-recreate code-queue` 或等价 build-first 路径,并用恢复后的 live API 或公网 frontend 证明任务和队列仍可读可继续。 -- 更名与灾备恢复:旧版 Codex 队列服务名只允许作为兼容诊断和一次性迁移来源;`code-queue-backend` 容器自身 `/health` 正常但 `microservice health code-queue` 返回 `microservice not found`、或服务目录仍只出现旧服务 ID 时,优先判定为 backend-core 仍加载旧 `MICROSERVICES_JSON`,必须刷新 `.state/docker-compose.env` 并显式重建/重建替换 `backend-core`,随后用 `microservice list` 验证 `id=code-queue`、`nodeBaseUrl=http://host.docker.internal:4222` 和容器摘要。若更名后 `unidesk_code_queue_*` 为空而历史 `unidesk_codex_queue_*` 表仍有队列数据,恢复前必须先停止 `code-queue-backend`,备份 `.state/code-queue` 与当前 `unidesk_code_queue_*` 表,再把历史本地状态目录合并到 `.state/code-queue/`,并用 `docker exec -i unidesk-database psql ...` 这类保持 stdin 的方式把 `unidesk_codex_queue_tasks`、`unidesk_codex_queue_queues` 和 `unidesk_codex_queue_notifications` 迁移到对应 `unidesk_code_queue_*` 表;不得在确认 `/api/tasks`、`/api/queues` 和 output archive 可读前删除历史本地状态目录或旧 PostgreSQL 表。迁移完成后只允许在 D601 用 `docker compose -f src/components/microservices/code-queue/docker-compose.d601.yml up -d --build code-queue` 启动目标服务;禁止再通过主 server Compose 启动旧 `code-queue` service。 +- 上线纪律:Code Queue 相关的前端或后端改进必须在同一任务内正式上线并验证公网 frontend 或 live API,不能只停留在源码、构建产物或“后续再上线”。重建 `frontend` 只替换无状态 WebUI 容器,不会触碰 D601 `code-queue-backend`、PostgreSQL 队列或运行中 Codex thread,不能以“可能影响长期任务”为由延迟前端上线;`code-queue-backend` 本身带有 restart-recovery,允许按 D601 Compose 重启/替换,停止、重启或重建后必须从持久化状态恢复运行中和排队任务。修改 Code Queue 自身时不得等待当前 Code Queue task 结束、等待 queue idle 或等待 `0 running` 后才重启;这会等待自己退出形成自锁。应通过 D601 上的 `~/cq-deploy` symlink 执行 `cd ~/cq-deploy && CODE_QUEUE_ENV_FILE=.state/code-queue-d601.env docker compose -f src/components/microservices/code-queue/docker-compose.d601.yml up -d --build --force-recreate code-queue` 或等价 build-first 路径,并用恢复后的 live API 或公网 frontend 证明任务和队列仍可读可继续;不要在 provider-gateway Host SSH 命令中使用 `/home/ubuntu/unidesk-code-queue-deploy` 全路径触发 provider-gateway 自保护误判。 +- 更名与灾备恢复:旧版 Codex 队列服务名只允许作为兼容诊断和一次性迁移来源;`code-queue-backend` 容器自身 `/health` 正常但 `microservice health code-queue` 返回 `microservice not found`、或服务目录仍只出现旧服务 ID 时,优先判定为 backend-core 仍加载旧 `MICROSERVICES_JSON`,必须刷新 `.state/docker-compose.env` 并显式重建/重建替换 `backend-core`,随后用 `microservice list` 验证 `id=code-queue`、`nodeBaseUrl=http://code-queue:4222` 和容器摘要。若更名后 `unidesk_code_queue_*` 为空而历史 `unidesk_codex_queue_*` 表仍有队列数据,恢复前必须先停止 `code-queue-backend`,备份 `.state/code-queue` 与当前 `unidesk_code_queue_*` 表,再把历史本地状态目录合并到 `.state/code-queue/`,并用 `docker exec -i unidesk-database psql ...` 这类保持 stdin 的方式把 `unidesk_codex_queue_tasks`、`unidesk_codex_queue_queues` 和 `unidesk_codex_queue_notifications` 迁移到对应 `unidesk_code_queue_*` 表;不得在确认 `/api/tasks`、`/api/queues` 和 output archive 可读前删除历史本地状态目录或旧 PostgreSQL 表。迁移完成后只允许在 D601 用 `docker compose -f src/components/microservices/code-queue/docker-compose.d601.yml up -d --build code-queue` 启动目标服务;禁止再通过主 server Compose 启动旧 `code-queue` service。 - Codex 认证:容器只从 D601 的 `/home/ubuntu/.codex/config.toml` 同步 Codex provider 配置到 D601 `.state/code-queue/codex-home`,并通过 D601 `.state/code-queue-d601.env` 透传 `OPENAI_API_KEY`、`CRS_OAI_KEY` 等 provider 所需变量;这些 provider 环境变量不得写入仓库,必须由 D601 Compose env-file 注入,确保容器重建和重启后不会丢失认证。新增 provider 的 `env_key` 时必须增加同类运行时透传和 Compose env 持久化,禁止把 Codex 或 MiniMax 密钥写入仓库文件。Code Queue 容器必须只读挂载 D601 host 的 SSH 目录到 `/root/.ssh`(默认 `/home/ubuntu/.ssh`),让容器内 `git push`、`ssh -T git@github.com` 与 host 使用同一套 GitHub SSH key/known_hosts;不得把私钥复制进镜像或仓库。 -- Develop-ready 镜像:Code Queue 镜像必须在启动前预装 UniDesk/Pipeline 调试所需工具,至少包含 `codex`、`bun`、`node`、`npm`/`npx`、`git`、`rg`、`curl`、`python3`/`pip3`、`docker`、`docker compose`、`docker-compose`、`jq`、`ssh`、`rsync`、`make`、`gcc`/`g++`、`tar`、`gzip` 和 `unzip`;不得依赖 Codex 任务运行时再 `apt-get install` 这些基础环境。 -- 远程开发容器与任务执行 Provider:Code Queue 必须能通过 live API 拉起 D601 等计算节点上的开发容器,入口为 `POST /api/dev-containers//start`,默认 Provider 为 `D601`。该流程由 Code Queue 调用 UniDesk `ssh ` 维护桥在目标节点创建 `unidesk-codex-dev-`,并在 Code Queue 所在节点与开发容器之间建立 `ssh -w` TUN 点对点链路;服务所在节点负责对开发容器的 TUN 源地址做 NAT/MASQUERADE,开发容器默认路由和 DNS 改走该 TUN,从而让 `ping google.com`、DNS、HTTP(S) 等出网都经主 server 全局代理,而不是依赖 D601 本地网络。提交 Code Queue 任务时必须支持选择执行 Provider:`D601` 在 D601 `code-queue-backend` 容器中本机执行,默认工作目录为 `/workspace`;其他 Provider 在对应 `unidesk-codex-dev-` 容器中执行,默认工作目录为 `/home/ubuntu`,可按任务覆盖 `cwd`。远程任务启动前必须自动复用或拉起该 Provider 的开发容器、同步 Codex 配置和允许的运行时 provider 环境变量,并通过同一 master TUN/NAT 链路出网;目标 host 存在 `/mnt` 时,开发容器必须挂载 host `/mnt:/mnt`,确保 D601 这类 WSL 节点的 Windows 盘符路径如 `/mnt/f/Work/ConStart` 在任务容器内可见,避免 agent 因缺少真实工作区而搜索到无关项目。TUN 建立必须幂等处理 stale 状态:启动前清理旧 `tun`、默认路由和旧 tunnel SSH 进程,缺失旧设备不能导致失败,冷启动运行时准备要有有界但足够的 timeout。验收必须保留三类日志:容器直连 `google.com` 在建隧道前失败、容器建隧道后 `ping google.com` 成功、服务所在节点上对应 `UNIDESK-CODEX-DEV-` NAT 链或 `tun` 计数在 ping 前后增长;涉及 WSL 工作区任务时还必须在开发容器内验证目标 `/mnt/...` 路径可读。开发容器代理密钥只生成到 `.state/code-queue/dev-proxy/` 与目标节点用户目录,不得提交到仓库。 +- Develop-ready 镜像:Code Queue 镜像必须在启动前预装 UniDesk/Pipeline 调试所需工具,至少包含 `codex`、`bun`、`node`、`npm`/`npx`、`git`、`rg`、`curl`、`python3`/`pip3`、`docker`、`docker compose`、`docker-compose`、`jq`、`ssh`、`rsync`、`make`、`gcc`/`g++`、`iptables`、`tar`、`gzip` 和 `unzip`;不得依赖 Codex 任务运行时再 `apt-get install` 这些基础环境。 +- 远程开发容器与任务执行 Provider:Code Queue 必须能通过 live API 拉起 D601 等计算节点上的开发容器,入口为 `POST /api/dev-containers//start`,默认 Provider 为 `D601`。该流程由 Code Queue 调用 UniDesk `ssh ` 维护桥在目标节点创建 `unidesk-codex-dev-`,并在 Code Queue 所在节点与开发容器之间建立 `ssh -w` TUN 点对点链路;服务所在节点负责对开发容器的 TUN 源地址做 NAT/MASQUERADE,开发容器默认路由和 DNS 改走该 TUN,从而让 `ping google.com`、DNS、HTTP(S) 等出网都经主 server 全局代理,而不是依赖 D601 本地网络。提交 Code Queue 任务时必须支持选择执行 Provider:`D601` 在 D601 `code-queue-backend` 容器中本机执行,默认工作目录为 `/workspace`;其他 Provider 在对应 `unidesk-codex-dev-` 容器中执行,默认工作目录为 `/home/ubuntu`,可按任务覆盖 `cwd`。远程任务启动前必须自动复用或拉起该 Provider 的开发容器、同步 Codex 配置和允许的运行时 provider 环境变量,并通过同一 master TUN/NAT 链路出网;目标 host 存在 `/mnt` 时,开发容器必须挂载 host `/mnt:/mnt`,确保 D601 这类 WSL 节点的 Windows 盘符路径如 `/mnt/f/Work/ConStart` 在任务容器内可见,避免 agent 因缺少真实工作区而搜索到无关项目。TUN 建立必须幂等处理 stale 状态:启动前清理旧 `tun`、默认路由、旧 tunnel SSH 进程和旧 OUTPUT 跳转,缺失旧设备不能导致失败,冷启动运行时准备要有有界但足够的 timeout。TUN 建立后必须创建 `UD-CQ-EGRESS-` OUTPUT 链,规则只允许 loopback、既有连接、`tun` 出口以及到 master server 的 SSH tunnel 控制连接,随后 reject 其他 IPv4/IPv6 出站包;这条网络层封口是开发/执行容器的权威外网边界,不能用 `HTTP_PROXY`/`NO_PROXY` 环境变量替代,容器镜像也必须使用已解析出的唯一 `unidesk-code-queue:` 或显式 `image`,缺失时直接失败,禁止 provider-gateway image、`latest` 或其他隐式镜像 fallback。验收必须保留三类日志:容器建隧道后 `ping google.com` 成功、强制指定原 Docker 网卡直连外网被 `sealed_direct_ping=blocked_expected` 拦截、服务所在节点上对应 `UNIDESK-CODEX-DEV-` NAT 链或 `tun` 计数在 ping 前后增长;涉及 WSL 工作区任务时还必须在开发容器内验证目标 `/mnt/...` 路径可读。`GET /api/dev-containers//status` 必须展示默认路由、`route_8_8_8_8`、`egressFirewallChain` 和 OUTPUT 链跳转。开发容器代理密钥只生成到 `.state/code-queue/dev-proxy/` 与目标节点用户目录,不得提交到仓库。 +- 远程维护桥调用:Code Queue 已迁移到 D601 后,`code-queue-backend` 容器内没有主 server 的 `unidesk-backend-core` 容器,不能再把 `bun scripts/cli.ts ssh ...` 实现为本地 `docker exec unidesk-backend-core`。Code Queue 后端发起的 provider 维护命令必须通过主 server frontend `/api/dispatch` 进入 backend-core,再由目标 provider-gateway 执行 `host.ssh`;需要传递脚本时必须使用 base64 临时文件,超过 Host SSH 单命令长度上限时分块上传到目标 `/tmp` 后再执行,避免恢复到本地 Docker broker、交互 stdin 或手工 shell fallback。 - 远程 Provider 准备不得阻塞控制面:Code Queue 在请求处理、队列调度、远程开发容器准备、Host SSH/WSL SSH 透传、Codex/OpenCode 启动和日志导出路径中,禁止使用会长时间占用 Bun event loop 的同步子进程调用,例如针对远程 Provider 的 `spawnSync`、`execSync` 或 `execFileSync`。远程命令必须通过异步子进程执行,带显式 timeout、超时 kill、stdout/stderr 上限和任务 output 进度记录;远程准备失败只能让对应任务进入失败或 retry,不能让 `POST /api/tasks`、SSE `/api/events`、`/health`、overview 或前端 direct proxy 等控制面请求等待远程 SSH 结束。凡是改动 D601/远程 Provider 准备、`api/dev-containers/*`、任务入队启动或 `runCodeQueueSsh` 等路径,验收必须在一个远程 SSH/status/start 探针运行期间并发验证容器直连 `/health` 和 `/api/tasks/overview` 仍能在 1s 内返回,证明远程超时不会复发为全站刷新卡死。 - OpenCode 远程执行:`minimax-m2.7` 走 OpenCode JSON event port 时,本地和远程命令都必须显式执行 `opencode run ...`;远程 Docker exec 不得退化成 `exec run ...`,否则会在目标容器内变成 `bash: exec: run: not found`。OpenCode JSON stream 的终态判定以“当前进程退出码 + 当前 attempt 的最终 assistant response”为准:`exit=0` 且当前 attempt 产生非空最终回复时,即使上游没有发 `step_finish` 事件,也应视为正常 terminal;非零退出、无当前最终回复或传输关闭才进入 retry。每个 attempt 的 `finalResponse` 必须只来自当前 OpenCode/Codex turn,禁止在当前 turn 未产出最终回复时回退复用 task 上一次 `finalResponse`,否则会把旧任务内容误判为本轮完成。 - Codex 控制:服务内部启动 `codex app-server --listen stdio://`,用 JSON-RPC 调用 `thread/start`、`turn/start`、`turn/steer` 和 `turn/interrupt`,并监听 `turn/completed`、assistant delta、reasoning delta、command output delta、file diff delta 等通知生成前端可轮询的 transcript。 @@ -147,6 +148,7 @@ Baidu Netdisk 在 UniDesk 语境中按纯后端服务管理:不得暴露百度 - 内存优化过程与防回归:Code Queue 已迁移到 D601,但内存治理仍必须按“PostgreSQL 权威源优先、进程热状态最小化、容器硬上限兜底”的顺序设计。长期可复用的优化路径是:先确认任务、queue、readAt、promptHistory、active session 和通知 outbox 均可从 PostgreSQL 恢复;再把历史任务列表、详情、统计、Trace/output 和 `/health` 的只读查询改为 PostgreSQL 直读或聚合查询;随后只把 `queued`、`running`、`judging`、`retry_wait` 等调度必需任务载入 Bun 堆,并在 PostgreSQL 查询侧裁剪 hot `output`/`events`;最后用 dirty-only flush、append-only 输出归档、Codex SQLite 小批量导出、`bun --smol`、`mem_limit=600m`、`memswap_limit=1536m`、`NODE_OPTIONS=--max-old-space-size=768` 和 cgroup memory watchdog 作为运行时防线。PostgreSQL 到进程的单次读取足够快,不能为了减少 SQL 查询把全部历史 `task_json`、Trace、output 或统计摘要常驻内存;任何新增缓存都必须有默认较小的环境变量上限、明确淘汰策略、可从 PostgreSQL 或 append-only 归档重建,且不得影响重启恢复。新增或修改 `/api/tasks`、overview、stats、summary、transcript、output、trace、health、flush、scheduler 和通知路径时,禁止在常规请求中调用会物化全量历史任务 JSON 的代码,禁止启动后无条件重写全量历史 task JSON,禁止用未设上限的 `Map`/数组保存历史 output/event/Trace,`CODE_QUEUE_MAX_ACTIVE_QUEUES=0` 表示不按 queue 数量设置全局排队上限;如显式设置为正数,必须同时说明内存预算并补充内存压测验收。memory watchdog 必须以 cgroup working set 为主要判断,且在 swap 仍有余量时不得提前杀掉唯一 active run;否则 TypeScript/Playwright 这类短时高内存验证会被错误中断并让 retry 队列反复震荡。 - 列表/详情延迟优化原则:Code Queue 控制面交互的长期目标是常规历史规模下首屏、`GET /api/tasks/overview`、`POST /api/tasks//read` 和分页加载均在 1s 内完成;性能面板出现十几秒级 `code_queue_direct_proxy` 或 `core_proxy` 慢操作时,必须优先按后端查询形态和前后端通信策略定位,不能把问题归因于 React 渲染后只改 UI。后端优化顺序是:先为 queue、status、updated/created 时间、readAt/terminal unread 和常用筛选条件补齐 PostgreSQL 索引;再用 SQL `COUNT`、`GROUP BY`、条件聚合和分页 ID 查询生成 queue/status/stats/unread 摘要;随后按 ID 轻量加载当前页、selected、active 和 unread priority task,禁止为了列表或已读操作解析完整 Trace、output archive、Codex transcript 或物化全量历史 `task_json`。`read`/`read-all` 这类 mutation 必须是 SQL-only 更新并返回最小 patch/queue 计数,不能触发 overview 全量重算或重载所有任务;启动 warm 只能预热小体积聚合和索引路径,不得把历史任务作为常驻缓存。允许 frontend/backend 代理使用秒级、严格有界、mutation 自动失效的 overview micro-cache 来吸收重复刷新,但 cache 只能作为抖动保护,不能替代数据库索引、聚合查询和分页披露,也不能让 stale readAt/queue/status 状态跨设备可见。 - Trace/实时输出热路径防回归:Code Queue 的 `appendOutput`、output archive append、`publishTaskEvent`、SSE `/api/events`、任务列表、overview、task meta 和 `/health` 都属于热路径,必须保持 O(1) 或明确小常数上界;这些路径不得同步调用完整 transcript 构建器、`taskFullOutput`、output archive 全量读取、Codex session/log 文件解析、完整 `task_json` 物化或任何会随历史输出长度增长的统计。输出追加时必须增量维护轻量持久化指标,至少包括 `stepCount`、`llmStepCount`、`outputMaxSeq` 或等价字段;列表、overview、meta、SSE 事件和 `/health` 只能读取这些指标或小体积 SQL 聚合。完整 Trace、`trace-summary`、`trace-steps`、`trace-step`、transcript/output 详情允许在显式详情请求中解析归档,但必须分页或有界、使用短 TTL 或容量受限缓存,并在 archive append 后失效。若 frontend 性能面板出现 Code Queue direct proxy 502、`/api/tasks/overview`/trace 接口成批超时,或容器内 `/health` 在 active output 持续追加时也卡住,优先按 Bun event-loop starvation/backpressure 排查,而不是先改 React 渲染;修复必须证明热路径不再随 output/archive 历史线性增长。 +- Trace STEP 权威来源:`GET /api/tasks//trace-steps` 与 `GET /api/tasks//trace-step` 必须直接从 `oa-event-flow` 读取 `trace-step-created` 事实事件,并在响应中暴露 `source=oa-event-flow`;不得在 OA 事件缺失、读取失败或本地 transcript 数量更多/更少时静默回退到 `task-transcript`、Codex session JSONL、output archive 或内存热状态。OA 事件不可用应显式失败或返回空事实集,避免 STEP 计数和执行过程摘要重新形成双路径分叉。 - 完成判定:app-server `turn/completed` 的 `turn.status=completed|interrupted|failed` 只代表 Codex turn 已结束;即使 `completed` 也必须把原始任务、当前 attempt 的 assistant 最终回复、command/file-change 事件、stderr tail 和 current attempt events 组成 execution record 交给 judge 判断是否真的完成。MiniMax judge 输入必须做有界压缩,保留终态、最终回复、关键错误/命令/部署证据和摘要计数,避免长 transcript 让 MiniMax 请求超时;默认 `UNIDESK_CODE_QUEUE_MINIMAX_JUDGE_TIMEOUT_MS=90000`。MiniMax judge 之前和之后都必须保留少量协议级硬门禁:明确用户 interrupt 判为 fail;当前 attempt `terminalStatus=failed|null`、传输在终态前关闭、或当前 attempt 最终回复为空时判为 retry;这些门禁只保护“本轮 turn 是否可被验收”的事实,不得发明业务实现要求。协议门禁通过后,配置了 `UNIDESK_CODE_QUEUE_MINIMAX_API_KEY` 且 MiniMax 可用时,MiniMax `MiniMax-M2.7` 对业务是否 `complete|retry|fail` 的判定是权威结果;当且仅当 MiniMax LLM 调用失效(未配置、额度/限流/网络/超时不可用、JSON 去噪与 repair 全部耗尽、或返回超预算反馈且修复耗尽)时,才允许启用非 LLM/fallback 判断。MiniMax 返回必须先做 JSON 去噪,支持去除 Markdown fence、`json` 标签和从夹杂文本中提取平衡 JSON object;如果去噪后仍无法解析,服务必须把解析错误和上一轮去噪前原始回答反馈给 MiniMax 做 JSON repair 重试,重试次数由 `UNIDESK_CODE_QUEUE_MINIMAX_JUDGE_REPAIR_ATTEMPTS` 控制,默认 `2`,耗尽后才进入 fallback,并在 fallback 原因、task JSON、attempt summary 和 TraceView 中保留 MiniMax 失败阶段、是否超时、耗时、prompt/payload 大小、HTTP 状态、错误名和响应预览。 - Judge 权威边界:MiniMax 成功返回可解析、预算内的 judge JSON 后,Code Queue 不得用旧 attempt 的 429/exceeded retry limit 证据、历史 output 字符串、面向特定任务的正则或 `hardCompletionBlockers`/`retryRequiredReasons` 覆盖一次协议有效的完成判定;尤其不能因为 attempt 1 的限流中断仍在历史输出里,就禁止 MiniMax 把 attempt 2 的正常完成判为 `complete`。允许的本地 safety override 必须限定为协议事实和系统交付纪律:用户显式打断、当前 attempt 未正常终止、当前 attempt 没有最终 assistant response、最终回复停在并发文件确认而非交付、或 runtime/UI/service 变更承认未部署验证。所有 override 都必须写入 `_safetyOverride`、生成紧凑 continuation prompt,并由自测或 judge probe 覆盖;不得把业务猜测伪装成本地硬门禁。 - Retry/推进语义:`retry` 不是新开一个独立任务或完全新 session;只要已有 `codexThreadId`,服务必须 `thread/resume` 原 thread 并 append 一个继续执行 prompt。continuation/judge feedback prompt 只应携带本轮缺口、恢复原因、验收要求和有界原始任务摘要,禁止重新注入完整引用上下文、历史 transcript 或长 JSON;服务重启恢复类 feedback 尤其必须保持短 prompt,依赖现有 thread 上文继续。超长 prompt 必须在 prompt 合成源头解决:每个 feedback/recovery/judge 生成器都要从结构化字段选择必要信息、去重合并缺口并提供按需查询入口,禁止先合成超长 prompt 再在末端用 substring/safePreview 一刀切硬截断;硬截断会静默丢失验收信息,风险高于长 prompt 本身。若 MiniMax `continuePrompt` 超出预算,必须要求 MiniMax 基于原始 judge 输入重新合成紧凑反馈,repair 耗尽后才可进入 fallback;不得把已生成的长 prompt 截尾后发送给 Codex。若 MiniMax 成功返回了预算内 `continuePrompt`,必须原样使用该反馈,不得再用 71-Freq、`period_sum/mpu_read_num`、`mpu_read_num`、历史限流中断等字符串识别把它覆盖成“简洁原始需求 continuation”。只有 judge 判定 `complete` 后,队列 worker 才把当前任务标为成功并推进下一个 queued/retry_wait 任务。非 LLM/fallback 判定产生的 `retry` 最多累计 `3` 次;达到上限后当前任务必须转为 `failed` 并记录原因,worker 继续推进后续 queued/retry_wait 任务,避免 fallback safety override 或硬编码判断造成无限循环。 diff --git a/src/components/microservices/code-queue/Dockerfile b/src/components/microservices/code-queue/Dockerfile index 058eacd2..cf0ee39a 100644 --- a/src/components/microservices/code-queue/Dockerfile +++ b/src/components/microservices/code-queue/Dockerfile @@ -16,6 +16,7 @@ RUN apt-get update \ git \ gzip \ iproute2 \ + iptables \ iputils-ping \ jq \ make \ diff --git a/src/components/microservices/code-queue/package.json b/src/components/microservices/code-queue/package.json index 662f5bed..9cfac0ab 100644 --- a/src/components/microservices/code-queue/package.json +++ b/src/components/microservices/code-queue/package.json @@ -8,5 +8,10 @@ }, "dependencies": { "postgres": "latest" + }, + "devDependencies": { + "@types/bun": "latest", + "@types/node": "latest", + "typescript": "latest" } } diff --git a/src/components/microservices/code-queue/src/dev-containers.ts b/src/components/microservices/code-queue/src/dev-containers.ts index 8ee79f0e..e8fc1551 100644 --- a/src/components/microservices/code-queue/src/dev-containers.ts +++ b/src/components/microservices/code-queue/src/dev-containers.ts @@ -82,10 +82,13 @@ async function startDevContainerPlan(plan: DevContainerPlan, options: { const before = await run("main-server", ctx().masterProxyEvidenceScript(plan), 15_000, "master-proxy-evidence-before-ping"); const ping = await run(plan.providerId, ctx().devContainerPingScript(plan), 20_000, "remote-container-ping-google"); const after = await run("main-server", ctx().masterProxyEvidenceScript(plan), 15_000, "master-proxy-evidence-after-ping"); - verification.pingGoogleOk = ping.exitCode === 0 && /0% packet loss|1 packets received|1 received/iu.test(ping.stdout); + verification.pingGoogleOk = ping.exitCode === 0 && /0% packet loss|1 packets received|1 received|bytes from/iu.test(ping.stdout); verification.directPingEvidence = commands.find((command) => command.name === "remote-tunnel-start")?.stdout.includes("direct_ping=failed_expected") === true ? `direct ${plan.providerId} container ping failed before tunnel` : "direct ping did not fail before tunnel"; + verification.sealedEgressEvidence = ping.stdout.includes("sealed_direct_ping=blocked_expected") + ? `direct ${plan.providerId} container egress blocked after tunnel seal` + : "sealed direct egress block was not observed"; verification.masterProxyEvidenceBefore = ctx().safePreview(before.stdout, 2000); verification.pingGoogleLog = ping.stdout; verification.masterProxyEvidenceAfter = ctx().safePreview(after.stdout, 2000); @@ -156,13 +159,14 @@ export async function startDevContainer(req: Request, providerFromPath: string | containerWorkdir: plan.containerWorkdir, }, masterProxy: { - mode: "ssh-tun-nat", + mode: "ssh-tun-nat-sealed", masterHost: plan.masterHost, tunId: plan.tunId, tunName: plan.tunName, serverIp: plan.serverIp, clientIp: plan.clientIp, natChain: plan.natChain, + egressFirewallChain: plan.egressFirewallChain, }, verification: result.verification, commands: result.commands, @@ -175,11 +179,12 @@ export async function startDevContainer(req: Request, providerFromPath: string | providerId, containerName: plan.containerName, masterProxy: { - mode: "ssh-tun-nat", + mode: "ssh-tun-nat-sealed", masterHost: plan.masterHost, tunName: plan.tunName, clientIp: plan.clientIp, natChain: plan.natChain, + egressFirewallChain: plan.egressFirewallChain, }, }, 500); } @@ -193,7 +198,7 @@ export async function devContainerStatus(providerFromPath: string | null): Promi CONTAINER=${ctx().shellQuote(plan.containerName)} docker inspect "$CONTAINER" --format 'container={{.Name}} state={{.State.Status}} image={{.Config.Image}} workdir={{ index .Config.Labels "unidesk.workdir" }} started={{.State.StartedAt}}' 2>/dev/null || true if docker inspect "$CONTAINER" >/dev/null 2>&1; then - docker exec "$CONTAINER" bash -lc 'export PATH=/tmp/unidesk-tools:$PATH; echo default=$(ip route show default | head -1); echo resolv=$(tr "\\n" " " /dev/null || true' || true + docker exec "$CONTAINER" bash -lc 'export PATH=/tmp/unidesk-tools:$PATH; source /tmp/unidesk-dev-egress.env 2>/dev/null || true; echo default=$(ip route show default | head -1); echo route_8_8_8_8=$(ip route get 8.8.8.8 | head -1); echo resolv=$(tr "\\n" " " /dev/null || true; if [ -n "\${EGRESS_CHAIN:-}" ]; then iptables -S "$EGRESS_CHAIN" 2>/dev/null || true; iptables -S OUTPUT 2>/dev/null | grep "$EGRESS_CHAIN" || true; fi' || true fi`; commands.push(await ctx().runCodeQueueSsh(providerId, statusScript, 15_000, "remote-container-status")); commands.push(await ctx().runCodeQueueSsh("main-server", ctx().masterProxyEvidenceScript(plan), 15_000, "master-proxy-status")); @@ -202,11 +207,12 @@ fi`; providerId, containerName: plan.containerName, masterProxy: { - mode: "ssh-tun-nat", + mode: "ssh-tun-nat-sealed", masterHost: plan.masterHost, tunName: plan.tunName, clientIp: plan.clientIp, natChain: plan.natChain, + egressFirewallChain: plan.egressFirewallChain, }, commands, }); diff --git a/src/components/microservices/code-queue/src/index.ts b/src/components/microservices/code-queue/src/index.ts index b4c3e4a1..552d56ca 100644 --- a/src/components/microservices/code-queue/src/index.ts +++ b/src/components/microservices/code-queue/src/index.ts @@ -3777,7 +3777,7 @@ async function route(req: Request): Promise { defaultProviderId: plan.providerId, startEndpoint: `/api/dev-containers/${encodeURIComponent(plan.providerId)}/start`, statusEndpoint: `/api/dev-containers/${encodeURIComponent(plan.providerId)}/status`, - masterProxyMode: "ssh-tun-nat", + masterProxyMode: "ssh-tun-nat-sealed", defaultPlan: { containerName: plan.containerName, image: plan.image, @@ -3787,6 +3787,7 @@ async function route(req: Request): Promise { serverIp: plan.serverIp, clientIp: plan.clientIp, natChain: plan.natChain, + egressFirewallChain: plan.egressFirewallChain, }, }); } diff --git a/src/components/microservices/code-queue/src/judge.ts b/src/components/microservices/code-queue/src/judge.ts index f764c1b1..70d2a4ab 100644 --- a/src/components/microservices/code-queue/src/judge.ts +++ b/src/components/microservices/code-queue/src/judge.ts @@ -118,7 +118,7 @@ function fallbackJudge(result: CodexRunResult, minimaxError?: string | JudgeFail } export const retryInstruction = "这是同一个 Codex thread 的 continuation,不是新任务。请基于上文继续完成原始任务;只做最小必要状态核查,避免从头重新摸索、避免重复已经完成的修改。"; -const codeQueueD601DeployCommand = "bun scripts/cli.ts ssh D601 'cd /home/ubuntu/unidesk-code-queue-deploy && CODE_QUEUE_ENV_FILE=/home/ubuntu/unidesk-code-queue-deploy/.state/code-queue-d601.env docker compose -f src/components/microservices/code-queue/docker-compose.d601.yml up -d --build --force-recreate code-queue'"; +const codeQueueD601DeployCommand = "bun scripts/cli.ts ssh D601 'cd ~/cq-deploy && CODE_QUEUE_ENV_FILE=.state/code-queue-d601.env docker compose -f src/components/microservices/code-queue/docker-compose.d601.yml up -d --build --force-recreate code-queue'"; const codeQueueRestartSafetyGuidance = `Code Queue 服务具备 restart-recovery,允许在任何时候重启、重建或替换 D601 的 \`code-queue-backend\`;当当前任务修改 Code Queue 自身时,禁止等待当前 Code Queue task 退出或等待队列归零后再重启,因为这会等待自己退出形成自锁。正确做法是在 D601 直接执行 \`${codeQueueD601DeployCommand}\` 或等价 no-deps force-recreate,并在恢复后用 live health/task 查询验证。`; export function explicitUserInterrupt(task: QueueTask, result: CodexRunResult): boolean { diff --git a/src/components/microservices/code-queue/src/oa-events.ts b/src/components/microservices/code-queue/src/oa-events.ts index c8e9d77a..7e362929 100644 --- a/src/components/microservices/code-queue/src/oa-events.ts +++ b/src/components/microservices/code-queue/src/oa-events.ts @@ -28,6 +28,20 @@ export interface OaTraceStats extends JsonRecord { source: "oa-event-flow"; } +export interface OaTraceStepSummary { + eventSequence: number; + seq: number; + at: string; + kind: string; + title: string; + status: string; + summaryLines: string[]; + rawSeqs: number[]; + scopeId: string; + attemptIndex: number | null; + source: "oa-event-flow"; +} + const postTimeoutMs = 2500; let context: OaEventContext | null = null; @@ -119,6 +133,19 @@ function recordString(record: Record | null, keys: string[]): s return ""; } +function asJsonRecord(value: unknown): JsonRecord { + return typeof value === "object" && value !== null && !Array.isArray(value) ? value as JsonRecord : {}; +} + +function stringValue(value: unknown, fallback = ""): string { + return typeof value === "string" ? value : fallback; +} + +function positiveInteger(value: unknown): number | null { + const parsed = Number(value); + return Number.isInteger(parsed) && parsed > 0 ? Math.floor(parsed) : null; +} + export function outputTraceKind(output: LiveOutput): "read" | "edit" | "run" | "error" | "message" | "system" { if (output.channel === "diff") return "edit"; if (output.channel === "error") return "error"; @@ -338,6 +365,125 @@ export async function readOaTraceStatsForTaskAttempts(taskId: string, attemptInd return readOaTraceStatsForScopeIds([taskScopeId(taskId), ...uniqueAttempts.map((index) => taskAttemptScopeId(taskId, index))]); } +function eventPayloadRecord(event: unknown): JsonRecord { + if (typeof event !== "object" || event === null || Array.isArray(event)) return {}; + return asJsonRecord((event as Record).payload); +} + +function stringList(value: unknown, maxItems = 8): string[] { + if (!Array.isArray(value)) return []; + return value.map((item) => String(item ?? "").trimEnd()).filter((item) => item.length > 0).slice(0, maxItems); +} + +function numberList(value: unknown, fallback: number): number[] { + if (!Array.isArray(value)) return [fallback]; + const values = value.map((item) => Number(item)).filter((item) => Number.isFinite(item)).map((item) => Math.floor(item)); + return values.length > 0 ? values : [fallback]; +} + +function commandLifecycleStatus(payload: JsonRecord, title: string, summaryLines: string[]): string { + const source = [title, ...summaryLines].join("\n"); + const status = /\bstatus=([A-Za-z0-9_-]+)/u.exec(source)?.[1]; + if (status !== undefined && status.length > 0) return status; + const method = stringValue(payload.method); + if (method.length > 0) return method; + return stringValue(payload.status); +} + +function traceStepFromEvent(event: unknown): OaTraceStepSummary | null { + if (typeof event !== "object" || event === null || Array.isArray(event)) return null; + const record = event as Record; + const payload = eventPayloadRecord(record); + const seq = Number(payload.seq ?? payload.stepSeq ?? payload.outputSeq); + if (!Number.isFinite(seq)) return null; + const eventSequence = Number(record.sequence ?? 0); + const title = stringValue(payload.title, stringValue(payload.method, stringValue(payload.kind))).slice(0, 300); + const summaryLines = stringList(payload.summaryLines); + return { + eventSequence: Number.isFinite(eventSequence) ? Math.floor(eventSequence) : 0, + seq: Math.floor(seq), + at: stringValue(record.createdAt, stringValue(payload.createdAt, ctx().nowIso())), + kind: stringValue(payload.kind, stringValue(payload.stepKind, stringValue(payload.channel, "system"))), + title, + status: commandLifecycleStatus(payload, title, summaryLines).slice(0, 120), + summaryLines, + rawSeqs: numberList(payload.rawSeqs, Math.floor(seq)), + scopeId: stringValue(payload.scopeId, stringValue(payload.attemptScopeId)), + attemptIndex: positiveInteger(payload.attemptIndex), + source: "oa-event-flow", + }; +} + +function eventNextAfterSeq(body: Record, events: unknown[], fallback: number): number { + const bodyNext = Number(body.nextAfterSeq); + const eventNext = events.reduce((max, event) => { + if (typeof event !== "object" || event === null || Array.isArray(event)) return max; + const sequence = Number((event as Record).sequence); + return Number.isFinite(sequence) ? Math.max(max, Math.floor(sequence)) : max; + }, fallback); + return Number.isFinite(bodyNext) && bodyNext > fallback ? Math.max(eventNext, Math.floor(bodyNext)) : eventNext; +} + +const traceStepReadPageLimit = 500; +const traceStepReadMaxPages = 40; +const traceStepReadAttempts = 3; +const traceStepReadTimeoutMs = 10000; + +async function waitMs(ms: number): Promise { + await new Promise((resolve) => setTimeout(resolve, ms)); +} + +async function fetchOaTraceStepPage(url: URL): Promise> { + let lastError: unknown = null; + for (let attempt = 1; attempt <= traceStepReadAttempts; attempt += 1) { + const controller = new AbortController(); + const timer = setTimeout(() => controller.abort(), traceStepReadTimeoutMs); + try { + const response = await fetch(url, { signal: controller.signal }); + if (!response.ok) throw new Error(`status=${response.status}`); + return await response.json() as Record; + } catch (error) { + lastError = error; + if (attempt < traceStepReadAttempts) await waitMs(250 * attempt); + } finally { + clearTimeout(timer); + } + } + throw lastError instanceof Error ? lastError : new Error(String(lastError ?? "unknown error")); +} + +export async function readOaTraceStepsForTask(taskId: string, attemptIndex: number | null = null): Promise { + const cleanTaskId = String(taskId || "").trim(); + if (cleanTaskId.length === 0) return []; + const runtime = ctx(); + const tags = [`task:${cleanTaskId}`, "trace"]; + if (attemptIndex !== null && Number.isInteger(attemptIndex) && attemptIndex > 0) tags.push(`attempt:${Math.floor(attemptIndex)}`); + const bySeq = new Map(); + let afterSeq = 0; + for (let page = 0; page < traceStepReadMaxPages; page += 1) { + const url = new URL(`${runtime.baseUrl}/api/events`); + url.searchParams.set("tags", tags.join(",")); + url.searchParams.set("type", "trace-step-created"); + url.searchParams.set("limit", String(traceStepReadPageLimit)); + url.searchParams.set("afterSeq", String(afterSeq)); + try { + const body = await fetchOaTraceStepPage(url); + const events = Array.isArray(body.events) ? body.events : []; + for (const event of events) { + const step = traceStepFromEvent(event); + if (step !== null) bySeq.set(step.seq, { ...(bySeq.get(step.seq) ?? {}), ...step }); + } + const nextAfterSeq = eventNextAfterSeq(body, events, afterSeq); + if (events.length < traceStepReadPageLimit || nextAfterSeq <= afterSeq) break; + afterSeq = nextAfterSeq; + } catch (error) { + runtime.logger("warn", "oa_trace_steps_read_failed", { taskId: cleanTaskId, attemptIndex, page, error: error instanceof Error ? error.message : String(error) }); + throw error; + } + } + return Array.from(bySeq.values()).sort((left, right) => left.seq - right.seq); +} + function statNumber(stats: OaTraceStats | null | undefined, key: string): number | null { const value = Number(stats?.[key]); return Number.isFinite(value) && value >= 0 ? Math.floor(value) : null; diff --git a/src/components/microservices/code-queue/src/provider-runtime.ts b/src/components/microservices/code-queue/src/provider-runtime.ts index 2afc0d9b..b2171db9 100644 --- a/src/components/microservices/code-queue/src/provider-runtime.ts +++ b/src/components/microservices/code-queue/src/provider-runtime.ts @@ -118,7 +118,7 @@ function buildDevContainerPlan(providerId: string, body: Record ? imageFromBody : ctx().config.devContainerImage.length > 0 ? ctx().config.devContainerImage - : "unidesk-code-queue:latest"; + : `unidesk-code-queue:${safeProvider.toLowerCase()}`; const workdir = workdirFromBody.length > 0 ? workdirFromBody : ctx().config.devContainerWorkdir; const thirdOctet = providerId === "D601" ? 6 : Math.max(1, Math.min(250, Math.floor(tunId / 64))); const baseOctet = providerId === "D601" ? 0 : (tunId % 64) * 4; @@ -138,6 +138,7 @@ function buildDevContainerPlan(providerId: string, body: Record serverIp: `10.214.${thirdOctet}.${serverLastOctet}`, clientIp: `10.214.${thirdOctet}.${clientLastOctet}`, natChain: safeDockerName(`UNIDESK-CODEX-DEV-${safeProvider}`).toUpperCase().slice(0, 28), + egressFirewallChain: safeDockerName(`UD-CQ-EGRESS-${safeProvider}`).toUpperCase().slice(0, 28), keyDir: `/home/ubuntu/.unidesk/codex-dev-proxy/${safeProvider}`, masterKeyPath: resolve(ctx().config.defaultWorkdir, ".state/code-queue/dev-proxy", safeProvider, "id_ed25519"), }; @@ -151,18 +152,27 @@ function appendLimited(buffer: string, chunk: Buffer | string, maxBytes: number) return { value: buffer + text.slice(0, remaining), truncated: true }; } -function runCodeQueueSsh(providerId: string, script: string, timeoutMs: number, name: string): Promise { - const started = Date.now(); +interface CliRunResult { + code: number | null; + signal: NodeJS.Signals | null; + stdout: string; + stderr: string; + timedOut: boolean; + truncated: boolean; + spawnError: Error | null; +} + +function runCliCommand(args: string[], timeoutMs: number): Promise { const maxBuffer = 8 * 1024 * 1024; - return new Promise((resolveLog) => { + return new Promise((resolveRun) => { let stdout = ""; let stderr = ""; let truncated = false; let timedOut = false; let spawnError: Error | null = null; - const child = spawn("bun", ["scripts/cli.ts", "ssh", providerId, "bash -s"], { + const child = spawn("bun", args, { cwd: ctx().config.defaultWorkdir, - stdio: ["pipe", "pipe", "pipe"], + stdio: ["ignore", "pipe", "pipe"], }); const timer = setTimeout(() => { timedOut = true; @@ -183,28 +193,152 @@ function runCodeQueueSsh(providerId: string, script: string, timeoutMs: number, child.on("error", (error) => { spawnError = error; }); - child.stdin.on("error", () => undefined); child.on("close", (code, signal) => { clearTimeout(timer); - const errorParts: string[] = []; - if (stderr.length > 0) errorParts.push(stderr); - if (timedOut) errorParts.push(`Error: command timed out after ${timeoutMs}ms`); - if (spawnError !== null) errorParts.push(`${spawnError.name}: ${spawnError.message}`); - if (truncated) errorParts.push(`output truncated at ${maxBuffer} bytes`); - resolveLog({ - name, - providerId, - exitCode: timedOut && code === 0 ? null : code, + resolveRun({ + code: timedOut && code === 0 ? null : code, signal: signal as NodeJS.Signals | null, - durationMs: Date.now() - started, stdout, - stderr: errorParts.join("\n").trim(), + stderr, + timedOut, + truncated, + spawnError, }); }); - child.stdin.end(script); }); } +function buildFrontendHostSshArgs(providerId: string, command: string, timeoutMs: number): string[] { + const waitMs = Math.max(timeoutMs + 5_000, 20_000); + return [ + "scripts/cli.ts", + "--main-server-ip", + ctx().config.devContainerMasterHost, + "debug", + "dispatch", + providerId, + "host.ssh", + "--wait-ms", + String(waitMs), + "--payload-json", + JSON.stringify({ source: "code-queue", mode: "exec", command, timeoutMs }), + ]; +} + +function parseFrontendHostSshResult(run: CliRunResult): { stdout: string; stderr: string; exitCode: number | null } { + let stdout = run.stdout; + let stderr = run.stderr; + let exitCode = run.code; + try { + const parsed = JSON.parse(run.stdout) as { + ok?: boolean; + error?: unknown; + data?: { + dispatch?: unknown; + wait?: { + task?: { + status?: string; + result?: { + stdout?: string; + stderr?: string; + exitCode?: number; + }; + }; + }; + }; + }; + const task = parsed.data?.wait?.task; + const result = task?.result; + if (result !== undefined) { + stdout = typeof result.stdout === "string" ? result.stdout : ""; + stderr = [typeof result.stderr === "string" ? result.stderr : "", run.stderr].filter((part) => part.length > 0).join("\n"); + exitCode = typeof result.exitCode === "number" ? result.exitCode : task?.status === "succeeded" ? 0 : 1; + } else if (parsed.ok === false || task?.status === "failed") { + stdout = ""; + stderr = [JSON.stringify(parsed.error ?? task ?? parsed.data?.dispatch ?? parsed), run.stderr].filter((part) => part.length > 0).join("\n"); + exitCode = 1; + } + } catch { + // Keep raw stdout/stderr when the CLI itself failed before returning dispatch JSON. + } + return { stdout, stderr, exitCode }; +} + +async function runFrontendHostSsh(providerId: string, command: string, timeoutMs: number): Promise { + const run = await runCliCommand(buildFrontendHostSshArgs(providerId, command, timeoutMs), Math.max(timeoutMs + 15_000, 30_000)); + const parsed = parseFrontendHostSshResult(run); + return { ...run, parsedStdout: parsed.stdout, parsedStderr: parsed.stderr, parsedExitCode: parsed.exitCode }; +} + +function encodedScriptCommand(encodedScript: string): string { + return [ + "set -euo pipefail", + "tmp=$(mktemp /tmp/unidesk-code-queue-ssh.XXXXXX)", + "cleanup(){ rm -f \"$tmp\"; }", + "trap cleanup EXIT", + "base64 -d > \"$tmp\" <<'UNIDESK_CODE_QUEUE_SCRIPT'", + encodedScript, + "UNIDESK_CODE_QUEUE_SCRIPT", + "bash \"$tmp\"", + ].join("\n"); +} + +function commandLogFromRun(name: string, providerId: string, started: number, run: CliRunResult & { parsedStdout: string; parsedStderr: string; parsedExitCode: number | null }): DevContainerCommandLog { + const errorParts: string[] = []; + if (run.parsedStderr.length > 0) errorParts.push(run.parsedStderr); + if (run.timedOut) errorParts.push(`Error: command timed out after ${Date.now() - started}ms`); + if (run.spawnError !== null) errorParts.push(`${run.spawnError.name}: ${run.spawnError.message}`); + if (run.truncated) errorParts.push("output truncated at 8388608 bytes"); + return { + name, + providerId, + exitCode: run.parsedExitCode, + signal: run.signal, + durationMs: Date.now() - started, + stdout: run.parsedStdout, + stderr: errorParts.join("\n").trim(), + }; +} + +async function uploadAndRunLargeScript(providerId: string, encodedScript: string, timeoutMs: number, name: string, started: number): Promise { + const token = `${Date.now()}-${Math.random().toString(16).slice(2)}`.replace(/[^A-Za-z0-9.-]/gu, "-"); + const b64Path = `/tmp/unidesk-code-queue-${token}.b64`; + const scriptPath = `/tmp/unidesk-code-queue-${token}.sh`; + const init = await runFrontendHostSsh(providerId, `set -euo pipefail\numask 077\n: > ${shellQuote(b64Path)}`, 15_000); + if (init.parsedExitCode !== 0) return commandLogFromRun(name, providerId, started, init); + const chunkSize = 1800; + for (let offset = 0; offset < encodedScript.length; offset += chunkSize) { + const chunk = encodedScript.slice(offset, offset + chunkSize); + const appendCommand = [ + "set -euo pipefail", + `cat >> ${shellQuote(b64Path)} <<'UNIDESK_CODE_QUEUE_CHUNK'`, + chunk, + "UNIDESK_CODE_QUEUE_CHUNK", + ].join("\n"); + const append = await runFrontendHostSsh(providerId, appendCommand, 15_000); + if (append.parsedExitCode !== 0) return commandLogFromRun(name, providerId, started, append); + } + const runCommand = [ + "set -uo pipefail", + `base64 -d ${shellQuote(b64Path)} > ${shellQuote(scriptPath)}`, + `rm -f ${shellQuote(b64Path)}`, + `bash ${shellQuote(scriptPath)}`, + "code=$?", + `rm -f ${shellQuote(scriptPath)}`, + "exit \"$code\"", + ].join("\n"); + const run = await runFrontendHostSsh(providerId, runCommand, timeoutMs); + return commandLogFromRun(name, providerId, started, run); +} + +function runCodeQueueSsh(providerId: string, script: string, timeoutMs: number, name: string): Promise { + const started = Date.now(); + const encodedScript = Buffer.from(script, "utf8").toString("base64"); + const inlineCommand = encodedScriptCommand(encodedScript); + if (inlineCommand.length > 3_400) return uploadAndRunLargeScript(providerId, encodedScript, timeoutMs, name, started); + return runFrontendHostSsh(providerId, inlineCommand, timeoutMs).then((run) => commandLogFromRun(name, providerId, started, run)); +} + function throwIfCommandFailed(command: DevContainerCommandLog): void { if (command.exitCode === 0) return; throw new Error(`${command.name} failed on ${command.providerId ?? "local"} exit=${command.exitCode} stderr=${ctx().safePreview(command.stderr, 1000)}`); @@ -295,8 +429,6 @@ function remoteContainerStartScript(plan: DevContainerPlan, forceRecreate: boole return `set -euo pipefail CONTAINER=${shellQuote(plan.containerName)} REQUESTED_IMAGE=${shellQuote(plan.image)} -CODEX_IMAGE=unidesk-code-queue:latest -FALLBACK_IMAGE=${shellQuote(`unidesk_provider-gateway:${plan.providerId.toLowerCase()}`)} KEY_DIR=${shellQuote(plan.keyDir)} WORKDIR=${shellQuote(plan.workdir)} CONTAINER_WORKDIR=${shellQuote(plan.containerWorkdir)} @@ -306,16 +438,9 @@ SSH_MOUNT_ARGS=() if [ -d "$SSH_DIR" ]; then SSH_MOUNT_ARGS=(-v "$SSH_DIR":/root/.ssh:ro); fi HOST_PATH_MOUNT_ARGS=() if [ -d /mnt ]; then HOST_PATH_MOUNT_ARGS+=(-v /mnt:/mnt); fi -if ! docker image inspect "$IMAGE" >/dev/null 2>&1 && docker image inspect "$CODEX_IMAGE" >/dev/null 2>&1; then - IMAGE="$CODEX_IMAGE" -fi if ! docker image inspect "$IMAGE" >/dev/null 2>&1; then - if docker image inspect "$FALLBACK_IMAGE" >/dev/null 2>&1; then - IMAGE="$FALLBACK_IMAGE" - else - echo "missing requested image $REQUESTED_IMAGE, codex image $CODEX_IMAGE and fallback $FALLBACK_IMAGE" >&2 - exit 1 - fi + echo "missing dev container image $REQUESTED_IMAGE" >&2 + exit 1 fi test -r "$KEY_DIR/id_ed25519" test -r "$KEY_DIR/known_hosts" @@ -336,12 +461,15 @@ cid=$(docker run -d \\ --name "$CONTAINER" \\ --hostname ${shellQuote(`codex-dev-${plan.providerId}`)} \\ --user root \\ + --no-healthcheck \\ --cap-add NET_ADMIN \\ --device /dev/net/tun \\ --add-host host.docker.internal:host-gateway \\ --label unidesk.role=codex-dev \\ --label unidesk.provider=${shellQuote(plan.providerId)} \\ --label "unidesk.workdir=$WORKDIR" \\ + --label com.docker.compose.project=unidesk-code-queue-dev \\ + --label com.docker.compose.service=codex-dev \\ --env-file "$KEY_DIR/codex-env" \\ -e CODEX_HOME=${shellQuote(plan.remoteCodexHome)} \\ -e CODEX_INTERNAL_ORIGINATOR_OVERRIDE=unidesk_code_queue \\ @@ -362,6 +490,7 @@ cid=$(docker run -d \\ "$IMAGE" \\ bash -lc 'mkdir -p /tmp/unidesk-tools; ln -sf /usr/local/bin/busybox /tmp/unidesk-tools/ip; ln -sf /usr/local/bin/busybox /tmp/unidesk-tools/ping; export PATH=/tmp/unidesk-tools:$PATH; echo ready; sleep infinity') docker exec "$CONTAINER" bash -lc 'mkdir -p /tmp/unidesk-tools; ln -sf /usr/local/bin/busybox /tmp/unidesk-tools/ip; ln -sf /usr/local/bin/busybox /tmp/unidesk-tools/ping; export PATH=/tmp/unidesk-tools:$PATH; echo container=$(hostname); pwd; ip route show default; ls -ld /dev/net/tun /run/unidesk-dev-proxy/id_ed25519 /var/lib/unidesk/code-queue/codex-home /var/lib/unidesk/code-queue/opencode-xdg; command -v ssh; command -v ip; command -v ping' +docker exec "$CONTAINER" bash -lc 'command -v iptables; iptables --version' echo "remote_container_ready container=$CONTAINER cid=$cid image=$IMAGE workdir=$WORKDIR"`; } @@ -393,14 +522,21 @@ mkdir -p /tmp/unidesk-tools ln -sf /usr/local/bin/busybox /tmp/unidesk-tools/ip ln -sf /usr/local/bin/busybox /tmp/unidesk-tools/ping export PATH=/tmp/unidesk-tools:$PATH -MASTER=${plan.masterHost} -TUN_ID=${plan.tunId} -TUN=${plan.tunName} -CLIENT_IP=${plan.clientIp} -SERVER_IP=${plan.serverIp} +MASTER=${shellQuote(plan.masterHost)} +TUN_ID=${shellQuote(String(plan.tunId))} +TUN=${shellQuote(plan.tunName)} +CLIENT_IP=${shellQuote(plan.clientIp)} +SERVER_IP=${shellQuote(plan.serverIp)} +EGRESS_CHAIN=${shellQuote(plan.egressFirewallChain)} KNOWN_HOSTS=/tmp/unidesk-dev-proxy-known_hosts cp /run/unidesk-dev-proxy/known_hosts "$KNOWN_HOSTS" chmod 600 "$KNOWN_HOSTS" +MASTER_IP=$(getent ahostsv4 "$MASTER" 2>/dev/null | awk '{print $1; exit}' || true) +if [ -z "$MASTER_IP" ]; then MASTER_IP="$MASTER"; fi +case "$MASTER_IP" in + *.*.*.*) ;; + *) echo "cannot resolve IPv4 master host: $MASTER" >&2; exit 1 ;; +esac DEFAULT_LINE=$(ip route show default | head -1) DEFAULT_GW=$(printf '%s\\n' "$DEFAULT_LINE" | sed -n 's/^default via \\([^ ]*\\) dev \\([^ ]*\\).*/\\1/p') DEFAULT_DEV=$(printf '%s\\n' "$DEFAULT_LINE" | sed -n 's/^default via \\([^ ]*\\) dev \\([^ ]*\\).*/\\2/p') @@ -415,6 +551,10 @@ else fi if [ -z "$ORIG_GW" ] || [ -z "$ORIG_DEV" ]; then echo "cannot detect docker bridge route: default=$DEFAULT_LINE link=$LINK_ROUTE" >&2; exit 1; fi ( ping -c 1 -W 3 google.com >/tmp/direct-ping.log 2>&1 && echo direct_ping=unexpected_ok ) || echo direct_ping=failed_expected +if ! command -v iptables >/dev/null 2>&1; then + echo "iptables is required for sealed dev-container egress" >&2 + exit 1 +fi if command -v pkill >/dev/null 2>&1; then pkill -f "ssh .* -w $TUN_ID:$TUN_ID .*$MASTER" >/dev/null 2>&1 || true elif command -v busybox >/dev/null 2>&1; then @@ -425,7 +565,11 @@ elif command -v busybox >/dev/null 2>&1; then done fi ip link delete "$TUN" >/dev/null 2>&1 || true -ip route replace $MASTER/32 via "$ORIG_GW" dev "$ORIG_DEV" +while iptables -w -D OUTPUT -j "$EGRESS_CHAIN" >/dev/null 2>&1; do :; done +if command -v ip6tables >/dev/null 2>&1; then + while ip6tables -w -D OUTPUT -j "\${EGRESS_CHAIN}6" >/dev/null 2>&1; do :; done +fi +ip route replace "$MASTER_IP/32" via "$ORIG_GW" dev "$ORIG_DEV" ip route replace default via "$ORIG_GW" dev "$ORIG_DEV" SSH_LOG=/tmp/unidesk-dev-proxy-ssh.log rm -f "$SSH_LOG" @@ -445,7 +589,37 @@ ip addr replace $CLIENT_IP peer $SERVER_IP dev "$TUN" ip link set "$TUN" up ip route replace default via $SERVER_IP dev "$TUN" printf 'nameserver 8.8.8.8\\nnameserver 1.1.1.1\\noptions timeout:2 attempts:2\\n' > /etc/resolv.conf -echo "container_tunnel_ready orig=$ORIG_GW/$ORIG_DEV route_to_master=$(ip route get $MASTER | head -1) default=$(ip route show default | head -1) dns=$(tr '\\n' ' ' /dev/null || true +iptables -w -F "$EGRESS_CHAIN" +iptables -w -A "$EGRESS_CHAIN" -o lo -j RETURN +iptables -w -A "$EGRESS_CHAIN" -m conntrack --ctstate ESTABLISHED,RELATED -j RETURN +iptables -w -A "$EGRESS_CHAIN" -o "$TUN" -j RETURN +iptables -w -A "$EGRESS_CHAIN" -d "$MASTER_IP/32" -o "$ORIG_DEV" -p tcp --dport 22 -j RETURN +iptables -w -A "$EGRESS_CHAIN" -j REJECT --reject-with icmp-port-unreachable +iptables -w -I OUTPUT 1 -j "$EGRESS_CHAIN" +if command -v ip6tables >/dev/null 2>&1; then + CHAIN6="\${EGRESS_CHAIN}6" + ip6tables -w -N "$CHAIN6" 2>/dev/null || true + ip6tables -w -F "$CHAIN6" + ip6tables -w -A "$CHAIN6" -o lo -j RETURN + ip6tables -w -A "$CHAIN6" -m conntrack --ctstate ESTABLISHED,RELATED -j RETURN + ip6tables -w -A "$CHAIN6" -d ::1/128 -j RETURN + ip6tables -w -A "$CHAIN6" -d fc00::/7 -j RETURN + ip6tables -w -A "$CHAIN6" -d fe80::/10 -j RETURN + ip6tables -w -A "$CHAIN6" -j REJECT + ip6tables -w -I OUTPUT 1 -j "$CHAIN6" +fi +cat > /tmp/unidesk-dev-egress.env </dev/null || true; echo route=$(ip route show default | head -1); echo route_8_8_8_8=$(ip route get 8.8.8.8 | head -1); echo resolv=$(tr "\\n" " " /tmp/sealed-direct-ping.log 2>&1 && echo sealed_direct_ping=unexpected_ok ) || echo sealed_direct_ping=blocked_expected; fi; ping -c 1 -W 5 google.com'`; } function windowsNativeBridgeServerSource(): string { diff --git a/src/components/microservices/code-queue/src/task-view.ts b/src/components/microservices/code-queue/src/task-view.ts index 7b58afe4..235c600f 100644 --- a/src/components/microservices/code-queue/src/task-view.ts +++ b/src/components/microservices/code-queue/src/task-view.ts @@ -22,6 +22,7 @@ import { codeAgentPortForModel, codeAgentPortInfo, codeExecutionModeInfo, extrac import { currentTaskPromptMarker, resolvedReferenceContextTitle, stripCodeQueueEnvironmentHint, userPromptForDisplay } from "./prompts"; import { outputArchiveSignature, taskFullOutput } from "./task-output"; import { retryPrompt } from "./judge"; +import { readOaTraceStepsForTask, type OaTraceStepSummary } from "./oa-events"; export interface TaskViewContext { config: Pick; @@ -1716,6 +1717,37 @@ function traceLineVisibleInTraceView(line: TranscriptLine): boolean { return line.kind !== "system" || traceSystemLineIsError(line); } +function oaTraceStepKind(kind: string): TranscriptKind { + const normalized = String(kind || "").trim().toLowerCase(); + if (normalized === "read" || normalized === "explored" || normalized === "explore") return "explored"; + if (normalized === "edit" || normalized === "edited") return "edited"; + if (normalized === "run" || normalized === "ran" || normalized === "command") return "ran"; + if (normalized === "error" || normalized === "failed") return "error"; + if (normalized === "message" || normalized === "assistant" || normalized === "user") return "message"; + return "system"; +} + +function oaTraceStepToTranscriptLine(step: OaTraceStepSummary): TranscriptLine { + const kind = oaTraceStepKind(step.kind); + const summaryText = step.summaryLines.join("\n").trimEnd(); + const commandText = summaryText.startsWith("item/") ? summaryText : ""; + return { + seq: step.seq, + at: step.at, + kind, + title: step.title || (kind === "explored" ? "Read" : kind === "edited" ? "Edit" : kind === "ran" ? "Run" : "Trace step"), + status: step.status || undefined, + commandPreview: commandText.length > 0 ? commandText : undefined, + bodyPreview: commandText.length > 0 ? undefined : summaryText || undefined, + rawSeqs: step.rawSeqs, + }; +} + +async function oaTraceTranscriptForTask(taskId: string, attemptIndex: number | null): Promise { + const steps = await readOaTraceStepsForTask(taskId, attemptIndex); + return steps.map(oaTraceStepToTranscriptLine).filter(traceLineVisibleInTraceView); +} + function mergeTraceWindowLines(left: TranscriptLine[], right: TranscriptLine[]): TranscriptLine[] { const seen = new Set(); const merged: TranscriptLine[] = []; @@ -2114,14 +2146,11 @@ function taskPromptDetailResponse(task: QueueTask, url: URL): Response { }); } -function taskTraceStepsResponse(task: QueueTask, url: URL): Response { +async function taskTraceStepsResponse(task: QueueTask, url: URL): Promise { const limit = ctx().parseLimit(url); const attemptIndex = ctx().parseSeqParam(url, "attempt", null); const agentPort = codeAgentPortForModel(task.model); - const previewTranscript = cachedPreviewTranscript(task); - const attemptWindow = attemptIndex === null ? null : traceAttemptWindows(task, previewTranscript).find((window) => window.index === attemptIndex) ?? null; - const sourceTranscript = attemptWindow === null ? previewTranscript : executionLinesForAttempt(attemptWindow.lines); - const transcript = sourceTranscript.filter(traceLineVisibleInTraceView); + const transcript = await oaTraceTranscriptForTask(task.id, attemptIndex); const page = ctx().pageBySeq(transcript, url, limit); return ctx().jsonResponse({ ok: true, @@ -2132,6 +2161,7 @@ function taskTraceStepsResponse(task: QueueTask, url: URL): Response { agentPort, agentPortInfo: codeAgentPortInfo(agentPort), attempt: attemptIndex, + source: "oa-event-flow", steps: page.chunk.map((line) => ({ seq: line.seq, at: line.at, @@ -2155,11 +2185,11 @@ function taskTraceStepsResponse(task: QueueTask, url: URL): Response { }); } -function taskTraceStepDetailResponse(task: QueueTask, url: URL): Response { +async function taskTraceStepDetailResponse(task: QueueTask, url: URL): Promise { const seq = Number(url.searchParams.get("seq")); if (!Number.isFinite(seq)) return ctx().jsonResponse({ ok: false, error: "seq is required" }, 400); const agentPort = codeAgentPortForModel(task.model); - const transcript = fullTranscript(task).filter(traceLineVisibleInTraceView); + const transcript = await oaTraceTranscriptForTask(task.id, null); const line = transcript.find((item) => Number(item.seq) === seq || item.rawSeqs.includes(seq)); if (line === undefined) return ctx().jsonResponse({ ok: false, error: "trace step not found", seq }, 404); return ctx().jsonResponse({ diff --git a/src/components/microservices/code-queue/src/types.ts b/src/components/microservices/code-queue/src/types.ts index a5cc6faa..bc65a4b5 100644 --- a/src/components/microservices/code-queue/src/types.ts +++ b/src/components/microservices/code-queue/src/types.ts @@ -446,6 +446,7 @@ export interface DevContainerPlan { serverIp: string; clientIp: string; natChain: string; + egressFirewallChain: string; keyDir: string; masterKeyPath: string; }