From 860b2a76b115fe35de7f60c411d700d1badf18b7 Mon Sep 17 00:00:00 2001 From: Codex Date: Sat, 16 May 2026 04:18:25 +0000 Subject: [PATCH] fix: verify code queue deploy commit --- AGENTS.md | 2 +- TEST.md | 2 +- docs/reference/cli.md | 4 +- docs/reference/codex-deploy.md | 7 +- scripts/src/codex-deploy.ts | 90 +++++++++++++++++-- .../microservices/code-queue/src/index.ts | 14 +++ .../microservices/code-queue/src/types.ts | 2 + 7 files changed, 108 insertions(+), 13 deletions(-) diff --git a/AGENTS.md b/AGENTS.md index 3fcf9ede..d93e3f9d 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -28,7 +28,7 @@ UniDesk 是一个以主 server 为统一入口的分布式工作平台;本文 - `bun scripts/cli.ts provider attach [--master-server URL] [--up] [--force]`:在新增计算节点上生成两项配置的 provider-gateway 挂载包;默认只需要主 server URL(默认 `http://74.48.78.17/`)和唯一 Provider ID,生成的 Compose 固定 Docker socket、`pid: "host"`、`restart: always`、只读 `/workspace`、SSH 维护私钥挂载和 loopback egress proxy 端口,规则见 `docs/reference/provider-gateway.md`。 - `bun scripts/cli.ts ssh [ssh-like args...]`:通过 provider-gateway 的 Host SSH / WSL SSH 维护桥打开近似原生 ssh 的交互会话或远端命令,并在远端 PATH 注入 `apply_patch`、`glob` 与 `skill-discover`;`apply-patch`、`py`、`skills`、结构化 `find`、`glob` 和 `argv` 子命令用于避免远端补丁、Python stdin、skill 发现与常用只读命令的嵌套转义问题,使用规则见 `docs/reference/cli.md` 和 `docs/reference/provider-gateway.md`。 - `bun scripts/cli.ts microservice list/status/health/proxy`:管理和验证挂载在主 server、计算节点 Docker 或 v3s 控制面上的用户服务,OA Event Flow/Todo Note/Baidu Netdisk on main-server、V3S Control/Code Queue/MDTODO/FindJob/Pipeline/MET Nonlinear on D601 的规则见 `docs/reference/microservices.md`。 -- `bun scripts/cli.ts codex deploy `:按已 push 到 remote 的 UniDesk commit 部署 D601 v3s/k8s Code Queue,自动 fetch/export、同步 `/home/ubuntu/cq-deploy`、构建镜像、导入 k3s、apply manifest、rollout 和健康验证;规则见 `docs/reference/codex-deploy.md`。 +- `bun scripts/cli.ts codex deploy `:按已 push 到 remote 的 UniDesk commit 部署 D601 v3s/k8s Code Queue,自动 fetch/export、同步 `/home/ubuntu/cq-deploy`、构建镜像、导入 k3s、apply manifest、写入部署 commit 戳记、rollout,并通过真实 `/health` 校验 `deploy.commit`,避免旧服务充数;规则见 `docs/reference/codex-deploy.md`。 - `bun scripts/cli.ts codex task `:按 Code Queue 任务 ID 查询初始 prompt、最后 assistant message、工具调用摘要、attempt/judge/error 和耗时,便于新任务引用历史 session。 - `bun scripts/cli.ts codex judge --attempt [--dry-run]`:按指定 task/attempt 用与队列 worker 相同的上下文构建和 MiniMax judge 调用路径单步复现完成判定;`--dry-run` 只输出 prompt/payload 诊断。 - `bun scripts/cli.ts server stop`:以异步 job 停止固定 Compose 项目中的全部 UniDesk 服务,停止后用 `server status` 复核。 diff --git a/TEST.md b/TEST.md index fedb6037..208b4b65 100644 --- a/TEST.md +++ b/TEST.md @@ -99,7 +99,7 @@ ## T23 D601 Code Queue User Service -阅读 `AGENTS.md`(本项目 `AGENTS.md` 同时承担 `SKILL.md` 对 `scripts/cli.ts` 的解释职责),然后用 cli 手动测试以下内容:运行 `bun scripts/cli.ts microservice list`,确认 `code-queue` 显示为 `providerId=D601`、`public=false`、`frontendOnly=true`、仓库 URL `https://github.com/pikasTech/unidesk`、v3s/k8s `v3s://unidesk/code-queue:4222` 逻辑服务映射、`deployment.mode=v3sctl-managed`、`runtime.orchestrator=v3sctl` 且无业务直连容器摘要;使用 `bun scripts/cli.ts codex deploy <已push的commitId>` 重建/启动 D601 Code Queue,确认命令立即返回异步 job id,`bun scripts/cli.ts job status --tail-bytes 30000` 能看到 fetch/export、rsync、Docker build、k3s image import、kubectl apply、rollout 和 health 验证进度,并确认主 server 根目录 `docker-compose.yml` 中不再存在 `code-queue` service。运行 `bun scripts/cli.ts microservice health code-queue`、`bun scripts/cli.ts microservice proxy code-queue /api/dev-ready --raw`、`bun scripts/cli.ts microservice proxy code-queue '/api/tasks/overview?limit=5&transcriptLimit=1&compact=1&afterSeq=0&preferId='` 和 `bun scripts/cli.ts codex task <已有taskId>`,确认链路通过 backend-core、v3sctl-adapter、Kubernetes API service proxy 和 D601 active Code Queue Service,且 task id 查询返回初始 prompt、最后 assistant message、工具调用摘要、attempt/judge/error 和耗时,`queue.storage.primary=postgres`、`queue.storage.postgresReady=true`、`queue.devReady.missingTools=[]`、`queue.devReady.docker.versionOk=true`、`queue.devReady.docker.composeOk=true`;`queue.devReady.ssh.ready` 只在需要跨 Provider SSH/Windows-native 任务时作为强制项。在 D601 `code-queue-backend` 容器内验证主 PostgreSQL 端口映射可执行 `select 1`,主 OA Event Flow 端口映射 `/health` 可访问,本机 ClaudeQQ `http://host.docker.internal:3290/health` 可访问;这些映射不得成为任意公网入口。 +阅读 `AGENTS.md`(本项目 `AGENTS.md` 同时承担 `SKILL.md` 对 `scripts/cli.ts` 的解释职责),然后用 cli 手动测试以下内容:运行 `bun scripts/cli.ts microservice list`,确认 `code-queue` 显示为 `providerId=D601`、`public=false`、`frontendOnly=true`、仓库 URL `https://github.com/pikasTech/unidesk`、v3s/k8s `v3s://unidesk/code-queue:4222` 逻辑服务映射、`deployment.mode=v3sctl-managed`、`runtime.orchestrator=v3sctl` 且无业务直连容器摘要;使用 `bun scripts/cli.ts codex deploy <已push的commitId>` 重建/启动 D601 Code Queue,确认命令立即返回异步 job id,`bun scripts/cli.ts job status --tail-bytes 30000` 能看到 fetch/export、rsync、Docker build、k3s image import、kubectl apply、部署 commit 戳记、rollout 和 health commit 验证进度,并确认 job 最终校验真实 Code Queue `/health` 返回的 `deploy.commit` 精确匹配本次 remote commit,不能由旧服务或旧 Pod 充数;同时确认主 server 根目录 `docker-compose.yml` 中不再存在 `code-queue` service。运行 `bun scripts/cli.ts microservice health code-queue`、`bun scripts/cli.ts microservice proxy code-queue /api/dev-ready --raw`、`bun scripts/cli.ts microservice proxy code-queue '/api/tasks/overview?limit=5&transcriptLimit=1&compact=1&afterSeq=0&preferId='` 和 `bun scripts/cli.ts codex task <已有taskId>`,确认链路通过 backend-core、v3sctl-adapter、Kubernetes API service proxy 和 D601 active Code Queue Service,且 task id 查询返回初始 prompt、最后 assistant message、工具调用摘要、attempt/judge/error 和耗时,`queue.storage.primary=postgres`、`queue.storage.postgresReady=true`、`queue.devReady.missingTools=[]`、`queue.devReady.docker.versionOk=true`、`queue.devReady.docker.composeOk=true`;`queue.devReady.ssh.ready` 只在需要跨 Provider SSH/Windows-native 任务时作为强制项。在 D601 `code-queue-backend` 容器内验证主 PostgreSQL 端口映射可执行 `select 1`,主 OA Event Flow 端口映射 `/health` 可访问,本机 ClaudeQQ `http://host.docker.internal:3290/health` 可访问;这些映射不得成为任意公网入口。 随后登录公网 frontend `http://74.48.78.17:18081/`,进入 `用户服务 / Code Queue`,确认页面显示默认模型 `gpt-5.5`、默认执行 Provider `D601`、默认工作目录 `/workspace`、模型下拉菜单包含 `gpt-5.4-mini`/`gpt-5.4`/`gpt-5.5`、入队份数、队列指标、任务 ID、复制任务 ID、引用按钮、任务耗时、引用任务 ID、清空输入、创建成功提示、任务提交表单、Trace 输出、attempt 表、MiniMax/fallback judge 状态、追加 prompt、打断和重试控件;通过页面提交一个小任务,确认任务进入 queued/running/succeeded 或可解释的 failed 状态,并且输出区能看到运行中的 Codex 消息。批量验收时设置 `入队份数=5` 或用 `---` 分隔 5 段 prompt,一次性入队 5 条任务,确认 5 条任务按顺序运行并全部进入 succeeded 或可解释的非成功终态,不能只运行第一条后停止;其中任一任务被 judge 判定 `fail` 时只能把当前任务标为 failed,后续 queued 任务仍必须继续推进。测试异常中断时可以提交长任务后点击 `打断`,确认任务变为 canceled 或被 judge 标记为非成功终态;自动重试只应在服务端/传输异常、任务正常结束但 execution record 显示未完成、或 judge 判定 retry 时发生;retry 必须复用已有 Codex thread 并 append 继续执行 prompt,只有当前任务 complete 后才推进队列中的下一个任务。MiniMax judge 必须能处理 Markdown fence/夹杂文本等 JSON 去噪;若去噪后仍失败,必须把解析错误和上一轮去噪前原始回答反馈给 MiniMax 修复后重试,日志中应出现 `judge_json_parse_retry`,且 repair 成功时仍以 `source=minimax` 返回。Codex provider key 只能通过 `OPENAI_API_KEY`、`CRS_OAI_KEY` 这类运行时环境透传,MiniMax API key 只能通过 D601 env-file 运行时环境传入,禁止写入 `config.json`、Dockerfile、源码或测试文档。 diff --git a/docs/reference/cli.md b/docs/reference/cli.md index d567b5d7..e0060334 100644 --- a/docs/reference/cli.md +++ b/docs/reference/cli.md @@ -19,7 +19,7 @@ UniDesk 的统一 CLI 入口是根目录 `scripts/cli.ts`,运行方式固定 - `ssh py [script-args...] < script.py` 把本地 stdin 落到远端临时 `.py` 文件后再以 `python3 -u` 执行并自动清理,避免再手写 `'python3 -'`、heredoc 或多层引号;`script-args` 会按 argv 安全透传给远端脚本。 - `ssh skills [--scope all|wsl|windows] [--limit N]` 发现目标节点上的 WSL/Linux skill 根目录;当 provider 是 WSL 时同一次调用还会扫描 Windows 用户目录下的 `.agents/skills` 与 `.codex/skills`。 - `microservice list/status/health/proxy` 通过 backend-core 内网 API 管理挂载在计算节点 Docker 中的用户服务(底层命令名仍为 microservice);`health` 和 `proxy` 会走真实 backend-core -> provider-gateway -> 节点本机后端链路,`proxy` 对超大 body 默认输出有界预览,规则见 `docs/reference/microservices.md`。 -- `codex deploy ` 创建异步 job,将已 push 到 remote 的 UniDesk commit 部署为 D601 v3s/k8s Code Queue:fetch/export tracked files、同步 `/home/ubuntu/cq-deploy`、构建 `unidesk-code-queue:d601`、导入 k3s containerd、apply manifest、rollout restart 和 live health 验证;详细规则见 `docs/reference/codex-deploy.md`。 +- `codex deploy ` 创建异步 job,将已 push 到 remote 的 UniDesk commit 部署为 D601 v3s/k8s Code Queue:fetch/export tracked files、同步 `/home/ubuntu/cq-deploy`、构建 `unidesk-code-queue:d601`、导入 k3s containerd、apply manifest、写入部署 commit 戳记、rollout restart,并通过真实 Code Queue `/health` 校验 `deploy.commit` 精确匹配本次 remote commit;详细规则见 `docs/reference/codex-deploy.md`。 - `codex task ` 通过 Code Queue 私有代理按任务 ID 查询结构化执行摘要;默认只返回有界 prompt/response 预览、执行 Provider、工作目录、最后 assistant message、最近工具调用摘要、attempt、judge、错误、耗时和 trace 翻页提示,适合在新队列任务中引用历史 session 且避免噪声爆炸。 - `codex task --trace --tail|--from-start|--after-seq N|--before-seq N --limit N` 按页拉取 Code Queue 的逻辑 trace;响应会返回 `nextAfterSeq`、`previousBeforeSeq`、`hasMore`、`hasBefore` 和下一页/上一页命令,默认 `--trace` 取最新一页,需要完整 prompt/最后 response 时加 `--full`。 - `codex output --tail|--from-start|--after-seq N|--before-seq N --limit N [--full-text]` 按原始 output seq 分页读取底层记录;当 trace 行提示 `commandOmittedLines`、`bodyOmittedLines` 或 `rawSeqs` 时,用该命令按 seq 补取完整信息,默认仍有单条文本预览上限,显式 `--full-text` 才返回该页全文。 @@ -33,7 +33,7 @@ UniDesk 的统一 CLI 入口是根目录 `scripts/cli.ts`,运行方式固定 长时操作采用 Fire-and-Forget 模式:CLI 创建 `.state/jobs/{jobId}.json`,后台进程执行真实命令,并将 stdout、stderr 分别写入 `.state/jobs/{jobId}.stdout.log` 与 `.state/jobs/{jobId}.stderr.log`。调用者通过 `bun scripts/cli.ts job status ` 查询进度和尾部输出。 -`server rebuild` 与 `server start`、`server stop` 一样必须通过返回的 job id 确认结果;不要把连续 `server rebuild` 命令理解成“前一个重建已完成”,因为两个命令只是在快速创建异步 job。重建 frontend 的标准流程是运行 `bun scripts/cli.ts server rebuild frontend`,随后轮询 `bun scripts/cli.ts job status ` 到 `succeeded`,再用 `server status` 或 `e2e run` 验证公网 frontend;重建 Todo Note 后端使用 `bun scripts/cli.ts server rebuild todo-note`,随后用 `microservice health todo-note` 和 `microservice proxy todo-note /api/instances` 验证;重建 Project Manager 后端使用 `bun scripts/cli.ts server rebuild project-manager`,随后用 `microservice health project-manager` 和 `microservice proxy project-manager /api/projects` 验证;重建 Baidu Netdisk 后端使用 `bun scripts/cli.ts server rebuild baidu-netdisk`,随后用 `microservice health baidu-netdisk` 和 `microservice proxy baidu-netdisk /api/transfers` 验证;重建 OA Event Flow 后端使用 `bun scripts/cli.ts server rebuild oa-event-flow`,随后用 `microservice health oa-event-flow` 和 `microservice proxy oa-event-flow /api/diagnostics` 验证。Code Queue 后端由 D601 v3s/k8s 控制面代管,必须使用 `bun scripts/cli.ts codex deploy ` 部署已 push 的 remote commit,再用 `microservice health code-queue` 和 `microservice proxy code-queue /api/tasks/overview` 验证。不得把 `docker rm` 手工兜底当成正式交付步骤。 +`server rebuild` 与 `server start`、`server stop` 一样必须通过返回的 job id 确认结果;不要把连续 `server rebuild` 命令理解成“前一个重建已完成”,因为两个命令只是在快速创建异步 job。重建 frontend 的标准流程是运行 `bun scripts/cli.ts server rebuild frontend`,随后轮询 `bun scripts/cli.ts job status ` 到 `succeeded`,再用 `server status` 或 `e2e run` 验证公网 frontend;重建 Todo Note 后端使用 `bun scripts/cli.ts server rebuild todo-note`,随后用 `microservice health todo-note` 和 `microservice proxy todo-note /api/instances` 验证;重建 Project Manager 后端使用 `bun scripts/cli.ts server rebuild project-manager`,随后用 `microservice health project-manager` 和 `microservice proxy project-manager /api/projects` 验证;重建 Baidu Netdisk 后端使用 `bun scripts/cli.ts server rebuild baidu-netdisk`,随后用 `microservice health baidu-netdisk` 和 `microservice proxy baidu-netdisk /api/transfers` 验证;重建 OA Event Flow 后端使用 `bun scripts/cli.ts server rebuild oa-event-flow`,随后用 `microservice health oa-event-flow` 和 `microservice proxy oa-event-flow /api/diagnostics` 验证。Code Queue 后端由 D601 v3s/k8s 控制面代管,必须使用 `bun scripts/cli.ts codex deploy ` 部署已 push 的 remote commit;部署 job 自身必须通过真实 `/health` 返回的 `deploy.commit` 证明不是旧服务在充数,之后再用 `microservice health code-queue` 和 `microservice proxy code-queue /api/tasks/overview` 做人工复核。不得把 `docker rm` 手工兜底当成正式交付步骤。 ## Output Contract diff --git a/docs/reference/codex-deploy.md b/docs/reference/codex-deploy.md index 3076216b..445dabd2 100644 --- a/docs/reference/codex-deploy.md +++ b/docs/reference/codex-deploy.md @@ -23,14 +23,15 @@ bun scripts/cli.ts job status --tail-bytes 30000 3. 在 `/home/ubuntu/cq-deploy` 构建 `unidesk-code-queue:d601`。 4. `docker save` 镜像并导入 k3s containerd:`docker exec -i unidesk-v8s-server ctr -n k8s.io images import -`。 5. `kubectl apply -f src/components/microservices/v3sctl-adapter/v3s/code-queue.k8s.yaml`,其中包含 Code Queue 和 `d601-tcp-egress-gateway`。 -6. `kubectl -n unidesk rollout restart deployment/d601-tcp-egress-gateway deployment/code-queue` 并等待 rollout 完成。 -7. 通过 `bun scripts/cli.ts microservice health code-queue` 等价的 backend-core 路径验证 live health。 +6. 将解析后的 40 位 remote commit 写入 `deployment/code-queue` 的 `CODE_QUEUE_DEPLOY_COMMIT` / `CODE_QUEUE_DEPLOY_REQUESTED_COMMIT`,并记录到 Deployment annotation。 +7. `kubectl -n unidesk rollout restart deployment/d601-tcp-egress-gateway deployment/code-queue` 并等待 rollout 完成。 +8. 通过 backend-core 的真实微服务代理读取 Code Queue `/health`,强制校验 `deploy.commit` 等于本次解析出的 remote commit;如果健康的是旧服务或旧 Pod,job 必须失败。 ## Observability `codex deploy` 本身不阻塞等待部署结束。返回 JSON 中的 `statusCommand` 和 `tailCommand` 是唯一状态入口。后台 job 的 stderr 是 JSONL progress,每个长步骤会记录远端 `/tmp/unidesk-codex-deploy-*.log` 和 sentinel 文件;失败时 `job status` 会显示最后日志尾部。 -`job status` 到 `succeeded` 后,还要用以下命令做 live 验证: +`job status` 到 `succeeded` 时,部署 job 已经完成 live commit 验证。需要人工复核时可用以下命令确认 `deploy.commit`: ```bash bun scripts/cli.ts microservice health code-queue diff --git a/scripts/src/codex-deploy.ts b/scripts/src/codex-deploy.ts index 43bc2761..3e716804 100644 --- a/scripts/src/codex-deploy.ts +++ b/scripts/src/codex-deploy.ts @@ -165,6 +165,17 @@ function coreBody(response: unknown): Record | null { return asRecord(record?.body); } +function dispatchStdout(raw: unknown): string { + const task = asRecord(raw); + const result = asRecord(task?.result); + return asString(result?.stdout); +} + +function parseFullCommit(value: string): string { + const match = value.match(/\b[0-9a-f]{40}\b/iu); + return match?.[0]?.toLowerCase() ?? ""; +} + async function dispatchSsh( config: UniDeskConfig, providerId: string, @@ -378,6 +389,15 @@ function prepareSourceScript(options: DeployCliOptions, exportDir: string): stri ].join("\n"); } +function resolveCommitScript(options: DeployCliOptions): string { + return [ + "set -euo pipefail", + `repo=${shellQuote(options.sourceRepoDir)}`, + `commit=${shellQuote(options.commitId)}`, + "git -C \"$repo\" rev-parse --verify \"$commit^{commit}\"", + ].join("\n"); +} + function syncDeployScript(options: DeployCliOptions, exportDir: string): string { return [ "set -euo pipefail", @@ -429,6 +449,23 @@ function applyManifestCommand(options: DeployCliOptions): string { return `KUBECONFIG=${shellQuote(options.kubeconfigPath)} kubectl apply -f ${shellQuote(manifest)}`; } +function stampDeployCommitScript(options: DeployCliOptions, resolvedCommit: string): string { + return [ + "set -euo pipefail", + `kubeconfig=${shellQuote(options.kubeconfigPath)}`, + `namespace=${shellQuote(k8sNamespace)}`, + `deployment=${shellQuote(k8sDeployment)}`, + `tcp_deployment=${shellQuote(tcpEgressDeployment)}`, + `resolved_commit=${shellQuote(resolvedCommit)}`, + `requested_commit=${shellQuote(options.commitId)}`, + "KUBECONFIG=\"$kubeconfig\" kubectl -n \"$namespace\" set env \"deployment/$tcp_deployment\" \"deployment/$deployment\" CODE_QUEUE_DEPLOY_COMMIT=\"$resolved_commit\" CODE_QUEUE_DEPLOY_REQUESTED_COMMIT=\"$requested_commit\"", + "KUBECONFIG=\"$kubeconfig\" kubectl -n \"$namespace\" annotate \"deployment/$tcp_deployment\" \"deployment/$deployment\" unidesk.ai/deploy-commit=\"$resolved_commit\" unidesk.ai/deploy-requested-commit=\"$requested_commit\" --overwrite", + "current=$(KUBECONFIG=\"$kubeconfig\" kubectl -n \"$namespace\" get deploy \"$deployment\" -o jsonpath='{.spec.template.spec.containers[0].env[?(@.name==\"CODE_QUEUE_DEPLOY_COMMIT\")].value}')", + "test \"$current\" = \"$resolved_commit\"", + "printf 'deployment_commit=%s\\nrequested_commit=%s\\n' \"$current\" \"$requested_commit\"", + ].join("\n"); +} + function rolloutRestartCommand(options: DeployCliOptions): string { return `KUBECONFIG=${shellQuote(options.kubeconfigPath)} kubectl -n ${shellQuote(k8sNamespace)} rollout restart deployment/${shellQuote(tcpEgressDeployment)} deployment/${shellQuote(k8sDeployment)}`; } @@ -447,23 +484,44 @@ function rolloutStatusScript(options: DeployCliOptions): string { ].join("\n"); } -async function microserviceHealthStep(config: UniDeskConfig, timeoutMs: number): Promise { +function healthDeployCommit(body: Record | null): string { + const deploy = asRecord(body?.deploy); + return asString(deploy?.commit).toLowerCase(); +} + +function healthProbeSummary(body: Record | null): Record | null { + if (body === null) return null; + return { + ok: body.ok, + service: body.service, + instanceId: body.instanceId, + deploy: body.deploy, + status: body.status, + databaseReady: body.databaseReady, + startedAt: body.startedAt, + }; +} + +async function microserviceHealthStep(config: UniDeskConfig, expectedCommit: string, timeoutMs: number): Promise { const step = "unidesk-health"; const startedAt = nowIso(); const startedMs = Date.now(); const deadline = Date.now() + timeoutMs; let latest: unknown = null; + let latestCommit = ""; while (Date.now() < deadline) { latest = coreInternalFetch("/api/microservices/code-queue/health"); const record = asRecord(latest); const body = asRecord(record?.body); - const ok = record?.ok === true && body?.ok !== false; - progressLine(step, "health probe", { ok, status: record?.status ?? null, body }); + latestCommit = healthDeployCommit(body); + const commitMatches = latestCommit === expectedCommit; + const ok = record?.ok === true && body?.ok !== false && commitMatches; + progressLine(step, "health probe", { ok, status: record?.status ?? null, expectedCommit, deployedCommit: latestCommit || null, commitMatches, body: healthProbeSummary(body) }); if (ok) { return { step, ok: true, - detail: `Code Queue health passed in ${elapsedMs(startedMs)}ms`, + detail: `Code Queue health passed with deployed commit ${latestCommit} in ${elapsedMs(startedMs)}ms`, startedAt, finishedAt: nowIso(), raw: latest, @@ -474,7 +532,7 @@ async function microserviceHealthStep(config: UniDeskConfig, timeoutMs: number): return { step, ok: false, - detail: `Code Queue health did not pass within ${timeoutMs}ms`, + detail: `Code Queue health did not expose expected commit ${expectedCommit} within ${timeoutMs}ms; latest deployed commit=${latestCommit || "none"}`, startedAt, finishedAt: nowIso(), raw: latest, @@ -511,6 +569,22 @@ export async function codexDeploy(config: UniDeskConfig, options: DeployCliOptio const prepare = await backgroundStep(config, options, "prepare-source", prepareSourceScript(options, exportDir), remainingTimeout(deadline, 180_000), null); if (!pushStep(prepare)) return { ok: false, startedAt, finishedAt: nowIso(), options, steps }; + const resolveCommit = await directStep(config, options, "resolve-commit", bashScriptCommand(resolveCommitScript(options)), null, 60_000, 45_000); + if (!pushStep(resolveCommit)) return { ok: false, startedAt, finishedAt: nowIso(), options, steps }; + const resolvedCommit = parseFullCommit(dispatchStdout(resolveCommit.raw) || resolveCommit.detail); + if (resolvedCommit.length !== 40) { + const parseFailure = { + step: "resolve-commit-parse", + ok: false, + detail: `remote commit did not resolve to a full 40-character SHA; output=${resolveCommit.detail}`, + startedAt: nowIso(), + finishedAt: nowIso(), + }; + pushStep(parseFailure); + return { ok: false, startedAt, finishedAt: nowIso(), options, steps }; + } + progressLine("resolve-commit", "resolved requested commit", { requestedCommit: options.commitId, resolvedCommit }); + const sync = await directStep(config, options, "sync-deploy-tree", bashScriptCommand(syncDeployScript(options, exportDir)), null, 60_000, 45_000); if (!pushStep(sync)) return { ok: false, startedAt, finishedAt: nowIso(), options, steps }; @@ -527,13 +601,16 @@ export async function codexDeploy(config: UniDeskConfig, options: DeployCliOptio const apply = await directStep(config, options, "kubectl-apply", applyManifestCommand(options), options.deployDir, 60_000, 45_000); if (!pushStep(apply)) return { ok: false, startedAt, finishedAt: nowIso(), options, steps }; + const stampCommit = await directStep(config, options, "stamp-deploy-commit", bashScriptCommand(stampDeployCommitScript(options, resolvedCommit)), options.deployDir, 60_000, 45_000); + if (!pushStep(stampCommit)) return { ok: false, startedAt, finishedAt: nowIso(), options, resolvedCommit, steps }; + const restart = await directStep(config, options, "rollout-restart", rolloutRestartCommand(options), options.deployDir, 60_000, 45_000); if (!pushStep(restart)) return { ok: false, startedAt, finishedAt: nowIso(), options, steps }; const rollout = await backgroundStep(config, options, "rollout-status", rolloutStatusScript(options), remainingTimeout(deadline, 240_000)); if (!pushStep(rollout)) return { ok: false, startedAt, finishedAt: nowIso(), options, steps }; - const health = await microserviceHealthStep(config, remainingTimeout(deadline, 90_000)); + const health = await microserviceHealthStep(config, resolvedCommit, remainingTimeout(deadline, 90_000)); pushStep(health); return { @@ -541,6 +618,7 @@ export async function codexDeploy(config: UniDeskConfig, options: DeployCliOptio startedAt, finishedAt: nowIso(), options, + resolvedCommit, steps, statusCommands: { health: "bun scripts/cli.ts microservice health code-queue", diff --git a/src/components/microservices/code-queue/src/index.ts b/src/components/microservices/code-queue/src/index.ts index e3be165b..451bce94 100644 --- a/src/components/microservices/code-queue/src/index.ts +++ b/src/components/microservices/code-queue/src/index.ts @@ -321,6 +321,8 @@ function readConfig(): RuntimeConfig { port: envNumber("PORT", 4222), dataDir, instanceId: envString("CODE_QUEUE_INSTANCE_ID", mainProviderId), + deployCommit: envString("CODE_QUEUE_DEPLOY_COMMIT", "unknown"), + deployRequestedCommit: envString("CODE_QUEUE_DEPLOY_REQUESTED_COMMIT", ""), schedulerEnabled: envBool("CODE_QUEUE_SCHEDULER_ENABLED", true), startupOaBackfillEnabled: envBool("CODE_QUEUE_STARTUP_OA_BACKFILL_ENABLED", false), outputArchiveDir: envString("CODE_QUEUE_OUTPUT_ARCHIVE_DIR", resolve(dataDir, "output-archive")), @@ -3818,6 +3820,10 @@ async function route(req: Request): Promise { ok: true, service: "code-queue", instanceId: config.instanceId, + deploy: { + commit: config.deployCommit, + requestedCommit: config.deployRequestedCommit, + }, databaseReady, serviceReady, startedAt: serviceStartedAt, @@ -3827,6 +3833,10 @@ async function route(req: Request): Promise { ok: false, service: "code-queue", instanceId: config.instanceId, + deploy: { + commit: config.deployCommit, + requestedCommit: config.deployRequestedCommit, + }, status: "starting", databaseReady, databaseLastError, @@ -3836,6 +3846,10 @@ async function route(req: Request): Promise { ok: true, service: "code-queue", instanceId: config.instanceId, + deploy: { + commit: config.deployCommit, + requestedCommit: config.deployRequestedCommit, + }, schedulerEnabled: config.schedulerEnabled, queue: queueSummary(false, state.tasks), egressProxy: await providerGatewayEgressProxyStatus(), diff --git a/src/components/microservices/code-queue/src/types.ts b/src/components/microservices/code-queue/src/types.ts index 794cb74c..922bbb2e 100644 --- a/src/components/microservices/code-queue/src/types.ts +++ b/src/components/microservices/code-queue/src/types.ts @@ -33,6 +33,8 @@ export interface RuntimeConfig { port: number; dataDir: string; instanceId: string; + deployCommit: string; + deployRequestedCommit: string; schedulerEnabled: boolean; startupOaBackfillEnabled: boolean; outputArchiveDir: string;