From 65ac76c2352e53659688af649adb5a8dfce0f8a9 Mon Sep 17 00:00:00 2001 From: Codex Date: Fri, 22 May 2026 23:00:38 +0000 Subject: [PATCH] fix: compact noisy cli health outputs --- AGENTS.md | 6 +- docs/reference/cli.md | 6 +- docs/reference/code-queue-supervision.md | 4 +- docs/reference/microservices.md | 2 +- .../code-queue-queues-shape-contract-test.ts | 20 ++- scripts/src/code-queue.ts | 132 +++++++++++++-- scripts/src/help.ts | 16 +- scripts/src/microservices.ts | 150 +++++++++++++++++- scripts/src/provider-triage.test.ts | 27 +++- scripts/src/provider-triage.ts | 136 +++++++++++++++- 10 files changed, 454 insertions(+), 45 deletions(-) diff --git a/AGENTS.md b/AGENTS.md index 13b84cda..70cf76e7 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -34,9 +34,9 @@ UniDesk 是一个以主 server 为统一入口的分布式工作平台;本文 - `bun scripts/cli.ts server logs [--tail-bytes N]`:分页返回文件日志与 Docker 日志尾部并带截断元数据,日志规则见 `docs/reference/observability.md`。 - `bun scripts/cli.ts server cleanup plan [--min-age-hours N] [--limit N]`:只读/干跑生成主 server Docker 镜像清理计划,默认只列出至少 24 小时前创建的非保护镜像,输出 active/protected images、stale candidates、预计释放空间、风险等级和必须人工确认的 `docker image rm` 命令;禁止默认删除、禁止 prune、禁止触碰 database volume、registry storage 或 Baidu Netdisk 状态。 - `bun scripts/cli.ts server rebuild `:以 build-first、Compose lock、no-deps force-recreate 和 post-up validation 的异步 job 重建主 server Compose 内单个服务;对 database、File Browser、Code Queue 执行面、k3sctl-adapter 或未知对象返回结构化 `unsupported-server-rebuild`,规则见 `docs/reference/deployment.md` 与 `docs/reference/cicd-standardization.md`。 -- `bun scripts/cli.ts provider attach [--master-server URL] [--up] [--force]` / `bun scripts/cli.ts provider triage [--observed-error text] [--observed-scope scope] [--microservice id ...]`:前者在新增计算节点上生成两项配置的 provider-gateway 挂载包;后者是只读多信号健康裁决入口,输出 `decision`、`healthyScopes`、`failedScopes` 和 `retryable`,用来把单路径 `provider is not online`、SSH 超时、registry 失败或 proxy 失败归类为 `retryable-transient`、`service-degraded` 或 `global-offline`,规则见 `docs/reference/provider-gateway.md` 和 `docs/reference/code-queue-supervision.md`。 +- `bun scripts/cli.ts provider attach [--master-server URL] [--up] [--force]` / `bun scripts/cli.ts provider triage [--observed-error text] [--observed-scope scope] [--microservice id ...] [--full|--raw]`:前者在新增计算节点上生成两项配置的 provider-gateway 挂载包;后者是只读多信号健康裁决入口,默认低噪声输出 `decision`、`healthyScopes`、`failedScopes`、`retryable` 和异常信号摘要,用来把单路径 `provider is not online`、SSH 超时、registry 失败或 proxy 失败归类为 `retryable-transient`、`service-degraded` 或 `global-offline`,完整 evidence 需显式 `--full|--raw`,规则见 `docs/reference/provider-gateway.md` 和 `docs/reference/code-queue-supervision.md`。 - `bun scripts/cli.ts ssh [ssh-like args...]`:通过 provider-gateway 的 Host SSH / WSL SSH 维护桥打开近似原生 ssh 的交互会话或远端命令,并在远端 PATH 注入 `apply_patch`、`glob` 与 `skill-discover`;`apply-patch`、`py`、`skills`、结构化 `find`、`glob` 和 `argv` 子命令用于避免远端补丁、Python stdin、skill 发现与常用只读命令的嵌套转义问题,使用规则见 `docs/reference/cli.md` 和 `docs/reference/provider-gateway.md`。 -- `bun scripts/cli.ts microservice list/status/health/diagnostics/tunnel-self-test/proxy`:管理和验证挂载在主 server、计算节点 Docker 或 k3s 控制面上的用户服务,`proxy` 支持受控 JSON body,OA Event Flow/Todo Note/Baidu Netdisk/Code Queue Manager on main-server、k3s Control/Code Queue 执行面/MDTODO/Decision Center/FindJob/Pipeline/MET Nonlinear on D601 的规则见 `docs/reference/microservices.md`。 +- `bun scripts/cli.ts microservice list/status/health/diagnostics/tunnel-self-test/proxy`:管理和验证挂载在主 server、计算节点 Docker 或 k3s 控制面上的用户服务,`status/health/diagnostics` 默认 compact summary 并用 `--full|--raw` 展开完整 body,`proxy` 支持受控 JSON body,OA Event Flow/Todo Note/Baidu Netdisk/Code Queue Manager on main-server、k3s Control/Code Queue 执行面/MDTODO/Decision Center/FindJob/Pipeline/MET Nonlinear on D601 的规则见 `docs/reference/microservices.md`。 - `bun scripts/cli.ts microservice health/diagnostics/proxy code-agent-sandbox`:验证独立 Code Agent Sandbox 的 health、只读 diagnostics、trace 和 adapter/mode/credential boundary 契约,规则见 `docs/reference/code-agent-sandbox.md`。 - `bun scripts/cli.ts decision upload/list/show/health`:通过 backend-core 用户服务代理上传会议记录/需求/决议 Markdown、列出记录和查看详情;Decision Center 运行在 D601 k3s,规则见 `docs/reference/microservices.md`。 - `bun scripts/cli.ts decision requirement list/create/show/update/upsert`:管理 Decision Center 产品化需求记录,类型覆盖外部目标、内部目标、决议、阻塞、债务和实验,规则见 `docs/reference/microservices.md`。 @@ -50,7 +50,7 @@ UniDesk 是一个以主 server 为统一入口的分布式工作平台;本文 - `bun scripts/cli.ts ci install/status/run/publish-backend-core/publish-user-service/run-dev-e2e/logs`:在 D601 原生 k3s 上安装和运行 Tekton CI,支持每 commit 检查、Code Queue 只读性能门禁、`CI.json` catalog 驱动的 backend-core 与 user-service commit-pinned 镜像发布和手动触发的 `origin/master:deploy.json#environments.dev` 临时 namespace e2e;catalog/producer/consumer 分工见 `docs/reference/cicd-standardization.md`,`run-dev-e2e` 的 Git 控制 runner、短 launcher 和 no-CD 边界见 `docs/reference/dev-ci-runner.md`,Tekton 规则见 `docs/reference/ci.md`。 - `bun scripts/cli.ts codex deploy `:旧 Code Queue 兼容部署入口已禁用,原因是它会绕过受控部署边界直连 D601 部署 Code Queue;规则见 `docs/reference/codex-deploy.md`。 - `bun scripts/cli.ts codex submit [prompt] [--prompt-file path|--prompt-stdin] [--queue ]` / `codex pr-preflight [--remote]`:前者通过 backend-core 私有代理提交 Code Queue 任务,`--dry-run` 会给出 MiniMax/GPT/人工路由建议但不改写 payload;后者只读检查 D601 scheduler/runner 的 GitHub token、egress 和 PR 能力,PR 型派单前必须使用,规则见 `docs/reference/cli.md` 和 `docs/reference/code-queue-supervision.md`。 -- `bun scripts/cli.ts codex task `:按 Code Queue 任务 ID 查询默认审阅摘要,只返回原始 prompt、最终 response、最后错误和渐进披露命令;需要工具调用、attempt/judge 和详细耗时时显式加 `--detail`。 +- `bun scripts/cli.ts codex task `:按 Code Queue 任务 ID 查询默认审阅摘要,只返回原始 prompt、最终 response、最后错误和渐进披露命令;需要工具调用、attempt/judge 和详细耗时时显式加 `--detail`;`codex queues [--full] [--limit N] [--page N|--offset N]` 默认分页低噪声输出队列摘要,完整 upstream 只通过 raw command 显式获取。 - `bun scripts/cli.ts codex judge --attempt [--dry-run]`:按指定 task/attempt 用与队列 worker 相同的上下文构建和 MiniMax judge 调用路径单步复现完成判定;`--dry-run` 只输出 prompt/payload 诊断。 - `bun scripts/cli.ts codex steer [prompt|--prompt-file path|--prompt-stdin] [--dry-run]`:通过 Code Queue 私有代理向运行中的 active turn 注入纠偏提示,正式替代底层 `microservice proxy ... /steer` 调用。 - `bun scripts/cli.ts codex interrupt|cancel `:通过 Code Queue 私有代理中断运行任务或取消 queued/retry_wait 任务,规则见 `docs/reference/cli.md`。 diff --git a/docs/reference/cli.md b/docs/reference/cli.md index 00c53009..3b7650aa 100644 --- a/docs/reference/cli.md +++ b/docs/reference/cli.md @@ -18,12 +18,12 @@ CLI 可以从 `master` 快速演进,但必须兼容 `deploy.json` 固定的 CI - `server logs` 返回 `logs/` 文件日志和 Docker 容器日志的尾部,默认限制输出大小,避免日志爆炸。实现必须只读取文件末尾字节,不得为了 tail 先把巨大日志完整读入 CLI 内存。 - `server cleanup plan [--min-age-hours N] [--limit N]` 只生成主 server Docker 镜像清理 dry-run 计划,不执行删除;默认 `--min-age-hours 24`,避免把刚发布或刚验证的镜像列为 stale。输出必须包含 `dryRun=true`、`mutation=false`、`policy.deletionExecuted=false`、active containers/images、受保护镜像、candidate stale images、估算释放空间、风险等级、`commandsToReview` 和人工审批清单。计划必须保守白名单:保留 running containers 使用的 image ID,保留 stopped containers 引用的 image ID 直到人工先复核容器,保留 `deploy.json`/`CI.json` 当前 commit-pinned artifact、Compose stable image、上游 digest pin 和 provider-gateway runner image;`protectedStorage` 必须显式列出 PostgreSQL named volume、Baidu Netdisk `.state`、D601 registry storage 和 Docker volumes/host data policy。该入口禁止生成或执行 `docker system prune`、`docker image prune`、`docker builder prune`、`docker volume rm`、`docker compose down -v`、数据库清理或 host data `rm` 命令;未来若增加真实删除,必须另设显式审批参数并先复核 dry-run 输出。 - `server rebuild ` 创建异步 job,先构建目标服务镜像,随后在 `.state/locks/server-compose.lock` 串行保护下用 `--no-deps --force-recreate` 替换目标 service 并等待容器 `healthy/running`;该命令用于替代手工删除容器的兜底流程,其中 `dev-frontend-proxy` 只更新主 server dev 入口薄代理,`todo-note`、`code-queue-mgr`、`project-manager`、`baidu-netdisk` 和 `oa-event-flow` 只重建主 server 承载的对应后端,不会重建或删除 database 命名卷。D601 Code Queue 执行面不由 `server rebuild` 管理,Rust backend-core 迭代不得用 `server rebuild backend-core` 在 master server 编译,规则见 `docs/reference/dev-environment.md`。 -- `provider attach [--master-server URL] [--up] [--force]` 在新计算节点生成两项配置的 provider-gateway 挂载包:`.state/provider-.env` 默认只包含 `UNIDESK_MASTER_SERVER` 与 `PROVIDER_ID`,`provider-.yml` 固定 Docker socket、`pid: "host"`、`restart: always`、只读 `/workspace` 和 SSH 维护私钥挂载;`--up` 会立即执行生成的 `docker compose up -d --build`。`provider triage [--observed-error text] [--observed-scope scope] [--microservice id ...]` 是只读多信号健康裁决入口,会把单路径 `provider is not online`、SSH 超时、registry 失败和 service proxy 失败归类成 `runner-local-observation-gap`、`service-degraded`、`provider-degraded` 或 `global-blocker`,且默认提供 `debug health`、`debug dispatch host.ssh --wait-ms 15000`、`ssh argv true`、`artifact-registry health --provider-id `、`microservice health k3sctl-adapter`、`microservice health code-queue` 和 `codex tasks --view supervisor --limit 20` 作为推荐交叉验证命令。 +- `provider attach [--master-server URL] [--up] [--force]` 在新计算节点生成两项配置的 provider-gateway 挂载包:`.state/provider-.env` 默认只包含 `UNIDESK_MASTER_SERVER` 与 `PROVIDER_ID`,`provider-.yml` 固定 Docker socket、`pid: "host"`、`restart: always`、只读 `/workspace` 和 SSH 维护私钥挂载;`--up` 会立即执行生成的 `docker compose up -d --build`。`provider triage [--observed-error text] [--observed-scope scope] [--microservice id ...] [--full|--raw]` 是只读多信号健康裁决入口,会把单路径 `provider is not online`、SSH 超时、registry 失败和 service proxy 失败归类成 `runner-local-observation-gap`、`service-degraded`、`provider-degraded` 或 `global-blocker`。默认输出只返回裁决、scope、失败/降级/未知信号和有界 evidence 摘要,完整 evidence 必须显式加 `--full` 或 `--raw`;推荐交叉验证命令仍包含 `debug health`、`debug dispatch host.ssh --wait-ms 15000`、`ssh argv true`、`artifact-registry health --provider-id `、`microservice health k3sctl-adapter`、`microservice health code-queue` 和 `codex tasks --view supervisor --limit 20`。 - `ssh [ssh-like args...]` 通过 backend-core 内网 WebSocket broker 和 provider-gateway 的 Host SSH / WSL SSH 维护桥连接目标节点;无后续参数时进入远端登录 shell,有后续参数时按 ssh 远端命令体验执行并返回远端 exit code。 - `ssh apply-patch [tool args...] < patch.diff` 直接调用远端注入的 `apply_patch` 工具,并把本地 stdin 中的标准 `*** Begin Patch` / `*** End Patch` patch 流透传给目标节点。 - `ssh py [script-args...] < script.py` 把本地 stdin 落到远端临时 `.py` 文件后再以 `python3 -u` 执行并自动清理,避免再手写 `'python3 -'`、heredoc 或多层引号;`script-args` 会按 argv 安全透传给远端脚本。 - `ssh skills [--scope all|wsl|windows] [--limit N]` 发现目标节点上的 WSL/Linux skill 根目录;当 provider 是 WSL 时同一次调用还会扫描 Windows 用户目录下的 `.agents/skills` 与 `.codex/skills`。 -- `microservice list/status/health/diagnostics/tunnel-self-test/proxy` 通过 backend-core 内网 API 管理挂载在计算节点 Docker 或 k3s 控制面中的用户服务(底层命令名仍为 microservice);`health`、`diagnostics`、`tunnel-self-test` 和 `proxy` 会走真实 backend-core -> provider-gateway 或 k3sctl-adapter -> 节点服务链路,`proxy` 支持受控 JSON 请求体并对超大响应 body 默认输出有界预览,规则见 `docs/reference/microservices.md`。 +- `microservice list/status/health/diagnostics/tunnel-self-test/proxy` 通过 backend-core 内网 API 管理挂载在计算节点 Docker 或 k3s 控制面中的用户服务(底层命令名仍为 microservice);`health`、`status` 和 `diagnostics` 默认返回 compact summary、body 字节数和 `--full|--raw` 展开命令,只有小 body 或无法抽取 summary 时才带有界 body preview,避免 Code Queue/k3s 诊断一次性输出爆炸;`tunnel-self-test` 和 `proxy` 会走真实 backend-core -> provider-gateway 或 k3sctl-adapter -> 节点服务链路,`proxy` 支持受控 JSON 请求体并对超大响应 body 默认输出有界预览,规则见 `docs/reference/microservices.md`。 - `decision upload/list/show/health` 通过 backend-core 用户服务代理访问 D601 k3s Decision Center,用于上传会议记录/决议 Markdown、列出权威记录、查看详情和健康检查;`decision list` 默认只返回摘要并省略完整 Markdown body,需要排查大正文时显式加 `--include-body`。正式文书字段通过 records 模型一等字段返回和查询:`--doc-no DC-...`、`--doc-type DCSN|GOAL|PLAN|RPRT|ACTN|ISSU|RETR|RQST|RESP|MINS`、`--doc-priority P0|P1|P2|P3`、`--year YYYY`、`--signer`、`--issued-at`、`--effective-scope`、`--supersedes`、`--superseded-by`;`show` 和 `requirement update` 可使用 `id` 或 `docNo`。`decision requirement list/create/upsert/update/show` 在同一 records 模型上管理 `goal|decision|blocker|debt|experiment` 需求记录,`docNo` 唯一,未传 `--doc-no` 但提供 `--doc-type/--doc-priority/--year` 时由服务分配下一个序号。它们不得直连 D601 Service、NodePort 或 provider-gateway 业务 HTTP。 - `decision diary import ` 将带 `# YYYY年M月D日`、`# YYYY-MM-DD` 或 `# YYYY/M/D` 标题的工作日志拆成每天一篇 Markdown 日记,按 `YYYY-MM/YYYY-MM-DD.md` 虚拟路径写入 Decision Center PostgreSQL;`decision diary list/history` 默认只返回摘要,需要完整 Markdown 时显式加 `--include-body`;`decision diary show [--source-file path]` 查看单日正文,`--source-file` 用于同一天存在多个导入来源时精确选择;`decision diary edit|upsert --body-file [--title text] [--source-file path] [--tag tag]` 通过 `PUT /api/diary/entries/:idOrDate` 创建当天或历史条目并编辑既有条目。 - `deploy check/plan/apply` 默认从根目录 `deploy.json` 读取服务 repo 与 commit 期望状态,join `config.json` 和现有 manifest 后使用 target-side build 或 reviewed artifact consumer 校验/更新已支持目标;`deploy plan --env dev|prod` 只从 `origin/master:deploy.json#environments.` 读取 manifest 并输出 dry-run 环境计划,不使用本地 dirty worktree;当前 `deploy apply --env dev` 支持 `backend-core`、`frontend`、`baidu-netdisk`、`decision-center`、`mdtodo`、`claudeqq` 和 dev-only `code-queue` artifact consumers,`findjob`/`pipeline`/`met-nonlinear` 为 D601 direct Compose artifact consumers,`k3sctl-adapter` 只提供 plan/dry-run;dev desired-state smoke 使用 `ci run-dev-e2e`;规则见 `docs/reference/deploy.md`、`docs/reference/dev-environment.md` 和 `docs/reference/dev-ci-runner.md`。`deploy apply --env prod` 同时覆盖 `findjob` 和 `pipeline` 的 pull-only Compose CD,但 `met-nonlinear` 仍然只允许 dry-run/plan,`k3sctl-adapter` 只允许 plan/dry-run。 @@ -56,7 +56,7 @@ CLI 可以从 `master` 快速演进,但必须兼容 `deploy.json` 固定的 CI - `codex steer [prompt|--prompt-file path|--prompt-stdin] [--dry-run]` 通过 Code Queue 私有代理向正在运行的 task 注入纠偏提示,正式替代底层 `microservice proxy code-queue /api/tasks//steer` 调用。prompt 必须且只能来自位置参数、文件或 stdin 之一;`--dry-run` 只输出 `method`、`path`、`stableProxyPath`、prompt 字符数、截断预览和 raw proxy 等价命令,不触碰运行中 session,也不得泄露超长 prompt 全文。真实执行复用与 `codex task/tasks/read` 相同的 backend-core stable proxy helper,路径固定为 `/api/microservices/code-queue/proxy/api/tasks//steer`,只能作用于 D601 scheduler 上存在 active steerable turn 的 running task。 - `codex steer` 非 dry-run 失败仍输出 JSON 且退出非零;`.data.diagnostics.reason` 用于 runner 分流,当前包括 `backend-core-unreachable`、`code-queue-microservice-unregistered`、`proxy-unauthorized`、`proxy-404`、`steer-endpoint-404`、`upstream-runtime-rejected`、`stable-proxy-failed` 和 `invalid-proxy-response`。`scope` 区分 `backend-core`、`stable-proxy`、`code-queue-runtime` 或 `unknown`,并带 `status`、`exitCode`、`retryable`、有界 `upstreamBodyPreview` 和推荐交叉验证命令;若任务不在 running/active-turn 状态,通常归类为 `upstream-runtime-rejected`,不得静默成功。 - `codex interrupt|cancel ` 通过 Code Queue 私有代理请求中断;running/judging 任务会请求 D601 当前 agent run 停止,queued/retry_wait 任务的取消也必须保持与 WebUI 相同代理路径,返回有界 task 摘要和后续查询命令。任何需要接触 active run 的动作仍属于 D601 执行面。 -- Code Queue 多队列 lane 由 `codex` 命令命名空间管理:`queues [--full|--all]` 列表、`queue create ` 创建、`queue merge --into ` 合并、`move --queue ` 迁移;这些队列管理入口默认由主 server `code-queue-mgr` 直管 PostgreSQL,仍通过稳定 `code-queue` 用户服务代理路径访问。`codex queues` 默认只返回 active/nonempty/unread/runnable queue 摘要、全局 counts 和 execution diagnostics;完整队列数组必须显式 `--full` 或 `--all`。summary 和 full 的稳定机读路径都是 `.data.queues.items[]`,全局元数据固定在 `.data.queues.counts`、`.data.queues.executionDiagnostics`、`.data.queues.activeTaskIds` 和 `.data.queues.queuedTaskIds`;旧 full 顶层数组语义只允许作为 deprecated 兼容字段 `.data.queues.deprecatedFullArray[]` 出现,不再作为 `.data.queues` 主形态。同一个 queue 内部串行执行,不同 queue 之间并行执行。迁移只允许尚未被 scheduler claim 的 `queued`/`retry_wait` 任务,必须满足 `startedAt=null`、`currentAttempt=0` 且没有 active thread/turn;已进入 `running`/`judging` 或已有 claim 标记的任务返回 409,不得被 move/merge 回写成 queued。合并会移动可迁移任务归属并自动删除源 queue 记录,只保留合并后的目标 queue;若 source 或 target queue 存在 active/claimed 任务,合并整体返回 409。合并后的目标 queue 按任务原 `queueEnteredAt`/`createdAt` 时间顺序串行,成功迁移 queued/retry_wait 任务后由 D601 scheduler 轮询推进。 +- Code Queue 多队列 lane 由 `codex` 命令命名空间管理:`queues [--full|--all] [--limit N] [--page N|--offset N]` 列表、`queue create ` 创建、`queue merge --into ` 合并、`move --queue ` 迁移;这些队列管理入口默认由主 server `code-queue-mgr` 直管 PostgreSQL,仍通过稳定 `code-queue` 用户服务代理路径访问。`codex queues` 默认只返回 active/nonempty/unread/runnable queue 摘要、全局 counts 和 execution diagnostics;`--full` 或 `--all` 只切换为完整队列行视图的一页,仍受 `--limit`/`--page`/`--offset` 分页约束,不再默认携带 deprecated full array。summary 和 full 的稳定机读路径都是 `.data.queues.items[]`,全局元数据固定在 `.data.queues.counts`、`.data.queues.executionDiagnostics`、`.data.queues.activeTaskIds` 和 `.data.queues.queuedTaskIds`;需要完整 upstream 时使用输出中的 raw command。旧 full 顶层数组语义已作为 deprecated 兼容信息记录,不再作为 `.data.queues` 主形态。同一个 queue 内部串行执行,不同 queue 之间并行执行。迁移只允许尚未被 scheduler claim 的 `queued`/`retry_wait` 任务,必须满足 `startedAt=null`、`currentAttempt=0` 且没有 active thread/turn;已进入 `running`/`judging` 或已有 claim 标记的任务返回 409,不得被 move/merge 回写成 queued。合并会移动可迁移任务归属并自动删除源 queue 记录,只保留合并后的目标 queue;若 source 或 target queue 存在 active/claimed 任务,合并整体返回 409。合并后的目标 queue 按任务原 `queueEnteredAt`/`createdAt` 时间顺序串行,成功迁移 queued/retry_wait 任务后由 D601 scheduler 轮询推进。 - 所有 `codex` 查询和管理命令必须走与 WebUI 相同的 backend-core 私有代理路径 `/api/microservices/code-queue/proxy/...`;CLI 不得为了提交、移动、中断、取消或队列管理直接调用 D601 内部 Service、数据库、pod curl 或 k3sctl scheduler 子服务。若该路径失败,应先修复 CLI/backend/provider tunnel 链路,而不是绕过控制面。 - `job list [--limit N] [--include-command]` 与 `job status [--tail-bytes N]` 查询 `.state/jobs/` 文件系统状态,是异步命令的可观测入口。`job list` 默认只返回最新 50 条摘要;`job status` 默认只返回 stdout/stderr 末尾 12000 字节,并带 `tailPolicy` 与完整日志路径。 - `debug health`、`debug dispatch` 与 `debug task` 走真实内部 core、WebSocket、数据库、provider、系统指标、Docker 状态和 Host SSH 维护桥流程,只用于开发调试,不写入 `TEST.md` 的正式验收步骤。 diff --git a/docs/reference/code-queue-supervision.md b/docs/reference/code-queue-supervision.md index d0f6652e..8ab70005 100644 --- a/docs/reference/code-queue-supervision.md +++ b/docs/reference/code-queue-supervision.md @@ -202,7 +202,7 @@ bun scripts/cli.ts codex pr-preflight --remote --issue 常用入口: - `bun scripts/cli.ts codex tasks --view supervisor --limit N`:查看默认低噪声监督视图,包括 running、完成未读、最多 5 条最近完成、queued/runnable、execution diagnostics、任务分类和下一步 drill-down 命令。默认行只保留 task id、队列、短 prompt/body 预览和原始字符数;`show/detail/trace/output/full/read` 放在 section template 中,避免每条任务重复刷屏,需要更多内容再按 taskId 展开。 -- `bun scripts/cli.ts codex queues`:查看低噪声队列计数、active task id、完成未读队列、runnable 队列和控制面诊断;只有需要完整队列行时才加 `--full`。summary 和 full 都使用稳定 JSON path `.data.queues.items[]` 读取队列行,并从 `.data.queues.counts` 与 `.data.queues.executionDiagnostics` 读取全局计数和执行诊断。 +- `bun scripts/cli.ts codex queues`:查看低噪声队列计数、active task id、完成未读队列、runnable 队列和控制面诊断;需要完整队列行视图时加 `--full`,但 `--full` 仍默认分页,继续用 `--limit N`、`--page N` 或 `--offset N` 渐进展开。summary 和 full 都使用稳定 JSON path `.data.queues.items[]` 读取队列行,并从 `.data.queues.counts` 与 `.data.queues.executionDiagnostics` 读取全局计数和执行诊断;完整 upstream 只通过输出中的 raw command 显式获取。 - `bun scripts/cli.ts codex tasks --unread --limit N`:查看完成未读审阅积压;`--unread` 与 `--unread-only` 等价,不能被静默忽略。 - `bun scripts/cli.ts codex tasks --status succeeded --unread --limit N`:按具体终态过滤监督结果;不支持的 status filter 必须显式失败,不能扩大为未过滤结果。 - `bun scripts/cli.ts codex task `:默认只查看原始 prompt、最终 response、最后错误和 drill-down 命令,这是完成未读任务审阅的第一步。 @@ -235,7 +235,7 @@ host Codex 指挥官正规化后仍受同一条高风险边界约束。`docs/ref 当多信号裁决显示 provider 服务器、D601 执行面或关键维护桥疑似需要人工检查时,指挥官可以在更新 #24/#40 等记录之外,通过 ClaudeQQ 额外提醒用户检查 provider 服务器状态。提醒只在首次确认、状态恶化、恢复或需要用户介入时发送,不能在每轮轮询中重复轰炸。ClaudeQQ 提醒是 best-effort:若 ClaudeQQ 本身依赖同一条故障 provider/k3sctl 链路而不可达,指挥官应把通知失败的原因写入 #24 或对应 blocker issue,并继续按轮询和恢复规则推进。 -在 UniDesk CLI 中,`bun scripts/cli.ts provider triage ` 是只读多信号裁决入口,适合作为 worker 和指挥官的统一健康判断前置。它必须至少保留这些合同:`provider is not online` 这类单路径失败只应落到 `decision=retryable-transient` / `blockingDisposition=runner-local-observation-gap`,不得直接输出 `global-offline`;只有 provider-gateway/SSH/k3s/scheduler 等多个独立关键路径同时失败且缺少健康交叉证据,才允许输出 `decision=global-offline`;registry 或单个 service proxy 失败但 heartbeat、SSH 或节点视图仍健康时,应输出 `decision=service-degraded`。`recommendedCrossChecks` 必须包含 `debug health`、`debug dispatch host.ssh --wait-ms 15000`、`ssh argv true`、`artifact-registry health --provider-id `、`microservice health k3sctl-adapter`、`microservice health code-queue` 与 `codex tasks --view supervisor --limit 20`。 +在 UniDesk CLI 中,`bun scripts/cli.ts provider triage ` 是只读多信号裁决入口,适合作为 worker 和指挥官的统一健康判断前置。它必须至少保留这些合同:默认输出只展示裁决、scope、失败/降级/未知信号和有界 evidence 摘要,完整 evidence 必须显式加 `--full` 或 `--raw`;`provider is not online` 这类单路径失败只应落到 `decision=retryable-transient` / `blockingDisposition=runner-local-observation-gap`,不得直接输出 `global-offline`;只有 provider-gateway/SSH/k3s/scheduler 等多个独立关键路径同时失败且缺少健康交叉证据,才允许输出 `decision=global-offline`;registry 或单个 service proxy 失败但 heartbeat、SSH 或节点视图仍健康时,应输出 `decision=service-degraded`。`recommendedCrossChecks` 必须包含 `debug health`、`debug dispatch host.ssh --wait-ms 15000`、`ssh argv true`、`artifact-registry health --provider-id `、`microservice health k3sctl-adapter`、`microservice health code-queue` 与 `codex tasks --view supervisor --limit 20`。 D601 artifact registry 的 systemd unit inactive 不等于 D601 全局离线。如果 `artifact-registry health` 或 `provider triage D601` 同时看到 registry container running、loopback listener healthy、`/v2/` 返回 200,且 provider heartbeat、Host SSH、k3sctl-adapter、Code Queue scheduler 或业务 API 有健康信号,这只能判为 `service-degraded`,不得写成 provider offline、D601 offline 或 CI/CD 全局不可推进。只有这些健康面也同时失败,才进入 `global-offline` 判断。 diff --git a/docs/reference/microservices.md b/docs/reference/microservices.md index 707178fb..3c7d2dcb 100644 --- a/docs/reference/microservices.md +++ b/docs/reference/microservices.md @@ -396,7 +396,7 @@ ClaudeQQ 的业务源码和持久化数据仍在 D601,但正式运行由 k3s 用户服务前端必须整合到 `src/components/frontend/src/` 下的 TypeScript + React 模块中。`app.tsx` 只做 shell/router 和导入分发,业务页面必须拆成独立 TSX,例如 `todo-note.tsx`、`findjob.tsx`、`pipeline.tsx`、`met-nonlinear.tsx`、`code-queue.tsx`、`k3sctl.tsx`。默认展示必须是业务控件:指标卡、状态徽标、表格、草稿卡片、运行卡片、树形任务、表单控件、结构化材料索引、链接和字段摘要;只有操作员点击 `查看原始JSON` 时才允许打开原始 JSON 弹窗。日志、JSONL 和大块 JSON 不得在主界面按行展示,避免把裸数据伪装成 UI。 -对于超大业务 JSON,backend-core 可把 `__unideskArrayLimit=:` 作为 frontend-only 代理参数传给 provider-gateway,由 provider-gateway 在返回前裁剪指定 JSON 数组并写入 `_unidesk.arrayLimits` 元数据。该参数只用于控制 UniDesk 展示预览,不能替代业务后端自身分页 API 的长期设计。CLI 的 `microservice proxy` 还会对超过默认阈值的 body 做二次有界预览,防止人工验证时输出爆炸;只有显式 `--raw` 才允许倾倒完整 body。 +对于超大业务 JSON,backend-core 可把 `__unideskArrayLimit=:` 作为 frontend-only 代理参数传给 provider-gateway,由 provider-gateway 在返回前裁剪指定 JSON 数组并写入 `_unidesk.arrayLimits` 元数据。该参数只用于控制 UniDesk 展示预览,不能替代业务后端自身分页 API 的长期设计。CLI 的 `microservice status/health/diagnostics` 默认只返回 compact summary、body 字节数和 `--full|--raw` 展开命令;只有小 body 或无法抽取 summary 时才带有界 preview,防止 Code Queue/k3s health 这类大 JSON 刷屏。`microservice proxy` 还会对超过默认阈值的 body 做二次有界预览,防止人工验证时输出爆炸。只有显式 `--full` 或 `--raw` 才允许倾倒完整 body。 ## Verification diff --git a/scripts/code-queue-queues-shape-contract-test.ts b/scripts/code-queue-queues-shape-contract-test.ts index b8f062e5..a414c971 100644 --- a/scripts/code-queue-queues-shape-contract-test.ts +++ b/scripts/code-queue-queues-shape-contract-test.ts @@ -119,17 +119,26 @@ export function runCodeQueueQueuesShapeContract(): JsonRecord { assertQueuesShape("full", full, "full"); const fullQueues = asRecord(asRecord(full).queues); assertCondition(!Array.isArray(fullQueues), "full queues payload must be an object, not the deprecated array shape", fullQueues); - assertCondition(fullQueues.bounded === false, "full without --limit should preserve complete queue listing semantics", fullQueues); - const deprecatedFullArray = asArray(fullQueues.deprecatedFullArray); - assertCondition(deprecatedFullArray.length === 3, "full should expose deprecated array only under a compatibility field", fullQueues); + assertCondition(fullQueues.bounded === true, "full queues output should now be paged by default", fullQueues); + assertCondition(fullQueues.deprecatedFullArray === undefined, "full should not expose deprecated unbounded array by default", fullQueues); const compatibility = asRecord(fullQueues.compatibility); assertCondition(compatibility.stablePath === "data.queues.items[]", "compatibility metadata should document stable path", compatibility); assertCondition(compatibility.deprecated === true, "compatibility metadata should mark old array path deprecated", compatibility); + assertCondition(compatibility.deprecatedFullArrayOmitted === true, "compatibility metadata should explain deprecated array omission", compatibility); const limitedFull = codexQueuesQueryForTest(["--full", "--limit", "2"], fetcher); const limitedFullQueues = asRecord(asRecord(limitedFull).queues); assertCondition(limitedFullQueues.bounded === true, "full with explicit --limit should be bounded", limitedFullQueues); assertCondition(asArray(limitedFullQueues.items).length === 2, "full with explicit --limit should limit data.queues.items[]", limitedFullQueues); + assertCondition(limitedFullQueues.hasMore === true, "limited full should expose next page", limitedFullQueues); + const limitedCommands = asRecord(limitedFullQueues.commands); + assertCondition(String(limitedCommands.next ?? "").includes("--offset 2"), "limited full should expose offset pagination command", limitedCommands); + + const offsetFull = codexQueuesQueryForTest(["--full", "--limit", "2", "--offset", "2"], fetcher); + const offsetFullQueues = asRecord(asRecord(offsetFull).queues); + assertCondition(offsetFullQueues.offset === 2, "offset full should preserve offset", offsetFullQueues); + assertCondition(offsetFullQueues.hasPrevious === true, "offset full should expose previous page", offsetFullQueues); + assertCondition(asRecord(asArray(offsetFullQueues.items)[0]).id === "gamma", "offset full should return second page rows", offsetFullQueues); return { ok: true, @@ -138,8 +147,9 @@ export function runCodeQueueQueuesShapeContract(): JsonRecord { "summary queue metadata", "full data.queues.items[] shape", "full queue metadata", - "deprecated full array compatibility field", - "full explicit limit remains bounded", + "deprecated full array omitted from default output", + "full explicit limit remains bounded and paged", + "offset pagination", ], }; } diff --git a/scripts/src/code-queue.ts b/scripts/src/code-queue.ts index 62b8dfc6..3f1be55d 100644 --- a/scripts/src/code-queue.ts +++ b/scripts/src/code-queue.ts @@ -11,6 +11,7 @@ const maxTraceLimit = 500; const defaultOutputLimit = 20; const defaultTextPreviewChars = 12_000; const defaultTasksLimit = 20; +const defaultQueuesLimit = 8; const maxTasksLimit = 100; const supervisorSectionReturnedLimit = 5; const supervisorRecentCompletedLimit = 5; @@ -255,7 +256,8 @@ interface CodexTasksDegraded { interface CodexQueuesOptions { full: boolean; limit: number; - limitExplicit: boolean; + offset: number; + page: number; } interface CodexPrPreflightOptions { @@ -593,6 +595,18 @@ function nonNegativeNumberOption(args: string[], names: string[], defaultValue: return defaultValue; } +function nonNegativeIntegerOption(args: string[], names: string[], defaultValue: number, maxValue = Number.MAX_SAFE_INTEGER): number { + for (const name of names) { + const index = args.indexOf(name); + if (index === -1) continue; + const raw = args[index + 1]; + const value = Number(raw); + if (!Number.isInteger(value) || value < 0) throw new Error(`${name} must be a non-negative integer`); + return Math.min(value, maxValue); + } + return defaultValue; +} + function nullablePositiveNumberOption(args: string[], names: string[]): number | null { for (const name of names) { const index = args.indexOf(name); @@ -1176,6 +1190,42 @@ function compactExecutionDiagnostics(value: unknown): Record | }; } +function compactQueueExecutionDiagnostics(value: unknown): Record | null { + const diagnostics = compactExecutionDiagnostics(value); + if (diagnostics === null) return null; + const listBudget = asRecord(diagnostics.listBudget) ?? {}; + const omittedCounts = asRecord(listBudget.omittedCounts) ?? {}; + return { + state: diagnostics.state ?? null, + degraded: diagnostics.degraded ?? null, + splitBrain: diagnostics.splitBrain ?? null, + splitBrainLive: diagnostics.splitBrainLive ?? null, + effectiveLiveness: diagnostics.effectiveLiveness ?? null, + recommendedAction: diagnostics.recommendedAction ?? null, + liveness: diagnostics.liveness ?? null, + executionStateSource: diagnostics.executionStateSource ?? null, + controlPlane: diagnostics.controlPlane ?? null, + databaseActiveTaskCount: diagnostics.databaseActiveTaskCount ?? null, + schedulerActiveRunSlotCount: diagnostics.schedulerActiveRunSlotCount ?? null, + activeHeartbeatCount: diagnostics.activeHeartbeatCount ?? null, + lastSchedulerHeartbeatAt: diagnostics.lastSchedulerHeartbeatAt ?? null, + lastObservedAgentEventAt: diagnostics.lastObservedAgentEventAt ?? null, + lastPersistedTraceAt: diagnostics.lastPersistedTraceAt ?? null, + reasons: diagnostics.reasons ?? [], + listBudget: { + truncated: listBudget.truncated ?? false, + omittedCounts: { + databaseActiveTaskIds: omittedCounts.databaseActiveTaskIds ?? 0, + activeHeartbeatTaskIds: omittedCounts.activeHeartbeatTaskIds ?? 0, + heartbeatFreshTaskIds: omittedCounts.heartbeatFreshTaskIds ?? 0, + heartbeatRiskTaskIds: omittedCounts.heartbeatRiskTaskIds ?? 0, + reasons: omittedCounts.reasons ?? 0, + }, + rawCommand: listBudget.rawCommand ?? "bun scripts/cli.ts microservice proxy code-queue /api/tasks/overview?limit=30 --raw --full", + }, + }; +} + function supervisorExecutionDiagnostics(value: unknown): Record | null { const diagnostics = compactExecutionDiagnostics(value); if (diagnostics === null) return null; @@ -1577,12 +1627,17 @@ function parseTasksOptions(args: string[]): CodexTasksOptions { function parseQueuesOptions(args: string[]): CodexQueuesOptions { assertKnownOptions(args, { flags: ["--full", "--all"], - valueOptions: ["--limit"], + valueOptions: ["--limit", "--offset", "--page"], }, "codex queues"); + const limit = positiveIntegerOption(args, ["--limit"], defaultQueuesLimit, maxTasksLimit); + const page = positiveIntegerOption(args, ["--page"], 1); + const offsetExplicit = args.includes("--offset"); + const offset = offsetExplicit ? nonNegativeIntegerOption(args, ["--offset"], 0) : (page - 1) * limit; return { full: hasFlag(args, "--full") || hasFlag(args, "--all"), - limit: positiveIntegerOption(args, ["--limit"], defaultTasksLimit, maxTasksLimit), - limitExplicit: args.includes("--limit"), + limit, + offset, + page: Math.floor(offset / limit) + 1, }; } @@ -2430,13 +2485,23 @@ function requireMergeTargetQueueId(args: string[], command: string): string { return raw.trim(); } +function compactCounts(value: unknown): Record { + const counts = asRecord(value) ?? {}; + const compact: Record = {}; + for (const [key, count] of Object.entries(counts)) { + if (typeof count === "number" && count === 0) continue; + compact[key] = count; + } + return compact; +} + function compactQueueRow(value: unknown): Record { const record = asRecord(value) ?? {}; return { id: record.id ?? null, name: record.name ?? null, total: record.total ?? null, - counts: record.counts ?? {}, + counts: compactCounts(record.counts), unreadTerminal: record.unreadTerminal ?? 0, activeTaskId: record.activeTaskId ?? null, runnableTaskId: record.runnableTaskId ?? null, @@ -2449,6 +2514,18 @@ function compactQueueRow(value: unknown): Record { }; } +function queueListCommand(options: Partial = {}): string { + const full = options.full === true; + const limit = options.limit ?? defaultQueuesLimit; + const offset = options.offset ?? 0; + return [ + "bun scripts/cli.ts codex queues", + full ? "--full" : "", + limit === defaultQueuesLimit ? "" : `--limit ${limit}`, + offset > 0 ? `--offset ${offset}` : "", + ].filter(Boolean).join(" "); +} + function compactQueuesResponse(body: Record, options: CodexQueuesOptions, upstream: { ok: unknown; status: unknown }): Record { const queue = asRecord(body.queue) ?? asRecord(body.summary) ?? {}; const queues = asArray(body.queues).map(compactQueueRow); @@ -2458,17 +2535,31 @@ function compactQueuesResponse(body: Record, options: CodexQueu const runnableQueues = queues.filter((row) => row.runnableTaskId !== null && row.runnableTaskId !== undefined); const activeQueues = queues.filter((row) => typeof row.id === "string" && activeIds.includes(row.id)); const selected = options.full ? queues : Array.from(new Map([...activeQueues, ...unreadQueues, ...runnableQueues, ...nonemptyQueues].map((row) => [String(row.id), row])).values()); - const limitApplied = !options.full || options.limitExplicit; - const visible = limitApplied ? selected.slice(0, options.limit) : selected; - const diagnostics = compactExecutionDiagnostics(queue.executionDiagnostics); + const visible = selected.slice(options.offset, options.offset + options.limit); + const diagnostics = compactQueueExecutionDiagnostics(queue.executionDiagnostics); + const activeTaskIds = boundedUniqueStringList(queue.activeTaskIds, Math.min(options.limit, maxTasksLimit)); + const queuedTaskIds = boundedUniqueStringList(queue.queuedTaskIds, Math.min(options.limit, maxTasksLimit)); + const nextOffset = options.offset + visible.length; + const previousOffset = Math.max(0, options.offset - options.limit); + const hasMore = nextOffset < selected.length; + const hasPrevious = options.offset > 0; return { upstream, queues: { view: options.full ? "full" : "summary", - bounded: limitApplied, + bounded: true, + outputPolicy: { + default: "paged-low-noise", + stableItemsPath: "data.queues.items[]", + rawFullCommand: "bun scripts/cli.ts microservice proxy code-queue /api/queues --raw --full", + }, count: selected.length, returned: visible.length, - hasMore: selected.length > visible.length, + limit: options.limit, + offset: options.offset, + page: options.page, + hasMore, + hasPrevious, totals: { totalTasks: queue.total ?? null, queueCount: queue.queueCount ?? queues.length, @@ -2478,29 +2569,36 @@ function compactQueuesResponse(body: Record, options: CodexQueu runnableQueueCount: runnableQueues.length, }, activeQueueIds: queue.activeQueueIds ?? [], - activeTaskIds: queue.activeTaskIds ?? [], - queuedTaskIds: queue.queuedTaskIds ?? [], + activeTaskIds: activeTaskIds.items, + activeTaskIdsCount: activeTaskIds.count, + activeTaskIdsTruncated: activeTaskIds.truncated, + queuedTaskIds: queuedTaskIds.items, + queuedTaskIdsCount: queuedTaskIds.count, + queuedTaskIdsTruncated: queuedTaskIds.truncated, counts: queue.counts ?? {}, unreadTerminal: queue.unreadTerminal ?? 0, executionDiagnostics: diagnostics, items: visible, ...(options.full ? { - deprecatedFullArray: asArray(body.queues), compatibility: { deprecated: true, deprecatedPath: "data.queues.deprecatedFullArray[]", stablePath: "data.queues.items[]", - message: "Use data.queues.items[] for both codex queues and codex queues --full.", + deprecatedFullArrayOmitted: true, + message: "Use data.queues.items[] for both codex queues and codex queues --full; raw full upstream is available only through the explicit raw command.", }, } : {}), commands: { - refresh: `bun scripts/cli.ts codex queues${options.limit === defaultTasksLimit ? "" : ` --limit ${options.limit}`}`, - full: `bun scripts/cli.ts codex queues --full${options.limit === defaultTasksLimit ? "" : ` --limit ${options.limit}`}`, + refresh: queueListCommand({ full: options.full, limit: options.limit, offset: options.offset }), + next: hasMore ? queueListCommand({ full: options.full, limit: options.limit, offset: nextOffset }) : null, + previous: hasPrevious ? queueListCommand({ full: options.full, limit: options.limit, offset: previousOffset }) : null, + first: queueListCommand({ full: options.full, limit: options.limit, offset: 0 }), + full: queueListCommand({ full: true, limit: options.limit, offset: 0 }), tasks: `bun scripts/cli.ts codex tasks --view supervisor --limit ${Math.min(options.limit, defaultTasksLimit)}`, unread: `bun scripts/cli.ts codex tasks --unread --limit ${Math.min(options.limit, defaultTasksLimit)}`, - raw: "bun scripts/cli.ts microservice proxy code-queue /api/queues --raw", + raw: "bun scripts/cli.ts microservice proxy code-queue /api/queues --raw --full", }, }, }; diff --git a/scripts/src/help.ts b/scripts/src/help.ts index cff45612..2e97ad24 100644 --- a/scripts/src/help.ts +++ b/scripts/src/help.ts @@ -17,7 +17,7 @@ export function rootHelp(): unknown { { command: "server logs [--tail-bytes N]", description: "Return bounded tails from file logs and docker logs." }, { command: "server cleanup plan [--min-age-hours N] [--limit N]", description: "Dry-run Docker image cleanup plan only: list active/protected images, stale candidates older than the default 24h threshold, risk, estimated reclaim, and manual review commands without deleting anything." }, { command: "server rebuild ", description: "Maintenance-only local Compose rebuild for reviewed main-server services; frontend standard release must use CI artifact plus deploy apply dev/prod artifact consumers." }, - { command: "provider attach [--master-server URL] [--up] [--force] | provider triage [--observed-error text] [--observed-scope scope] [--microservice id ...]", description: "Generate the minimal external provider-gateway env/compose bundle or run the read-only provider health triage contract." }, + { command: "provider attach [--master-server URL] [--up] [--force] | provider triage [--observed-error text] [--observed-scope scope] [--microservice id ...] [--full|--raw]", description: "Generate the minimal external provider-gateway env/compose bundle or run the low-noise read-only provider health triage contract." }, { command: "ssh [ssh-like args...]", description: "Open a Host SSH / WSL SSH maintenance session through the provider-gateway bridge with built-in remote helper tools in PATH." }, { command: "ssh apply-patch [tool args...] < patch.diff", description: "Invoke the injected remote apply_patch helper directly over SSH passthrough and stream the patch from local stdin." }, { command: "ssh py [script-args...] < script.py", description: "Run remote Python from local stdin through SSH passthrough without nested shell quoting; extra args become script argv." }, @@ -27,9 +27,9 @@ export function rootHelp(): unknown { { command: "ssh argv [args...]", description: "Run a remote command with each argv token shell-quoted by UniDesk before SSH passthrough." }, { command: "microservice list", description: "List UniDesk-managed user services and their provider/runtime mapping." }, { command: "microservice status ", description: "Show one user service config, repository reference, backend mapping, and runtime status." }, - { command: "microservice health ", description: "Probe one user service through backend-core -> provider-gateway HTTP proxy." }, + { command: "microservice health [--full|--raw]", description: "Probe one user service through backend-core -> provider-gateway HTTP proxy; default output is a compact health summary." }, { command: "microservice proxy [--method GET|POST|PUT|PATCH|DELETE] [--body-json JSON|--body-file path|--body-stdin] [--raw] [--max-body-bytes N]", description: "Access a private user-service backend path through the same frontend-only proxy used by WebUI; JSON request bodies are supported for controlled write/debug endpoints." }, - { command: "microservice diagnostics ", description: "Split k3sctl-managed proxy health into provider-gateway, HTTP tunnel, adapter, Kubernetes API service proxy, and target Service checks." }, + { command: "microservice diagnostics [--full|--raw]", description: "Split k3sctl-managed proxy health into a compact summary by default; use --full/--raw for complete evidence." }, { command: "microservice tunnel-self-test ", description: "Trigger an expected provider HTTP tunnel failure and verify requestId/stage diagnostics are returned." }, { command: "decision upload [--title text] [--type meeting|decision|goal|external_goal|internal_goal|blocker|debt|experiment] [--level|--priority G0|G1|G2|G3|P0|P1|P2|P3|none] [--doc-no DC-...] [--doc-type DCSN|GOAL|PLAN|RPRT|ACTN|ISSU|RETR|RQST|RESP|MINS] [--doc-priority P0|P1|P2|P3] [--signer text] [--issued-at ISO]", description: "Upload a meeting note or decision/requirement record through backend-core -> decision-center user-service proxy." }, { command: "decision diary import [--source-file path] [--tag tag] [--include-entries]", description: "Import a dated work log Markdown into PostgreSQL diary entries split as YYYY-MM/YYYY-MM-DD.md." }, @@ -165,9 +165,9 @@ function microserviceHelp(): unknown { output: "json", usage: [ "bun scripts/cli.ts microservice list", - "bun scripts/cli.ts microservice status ", - "bun scripts/cli.ts microservice health ", - "bun scripts/cli.ts microservice diagnostics ", + "bun scripts/cli.ts microservice status [--full|--raw]", + "bun scripts/cli.ts microservice health [--full|--raw]", + "bun scripts/cli.ts microservice diagnostics [--full|--raw]", "bun scripts/cli.ts microservice tunnel-self-test ", "bun scripts/cli.ts microservice proxy [--method GET|POST|PUT|PATCH|DELETE] [--body-json JSON|--body-file path|--body-stdin] [--raw] [--full] [--max-body-bytes N]", ], @@ -197,7 +197,7 @@ function providerHelp(): unknown { output: "json", usage: [ "bun scripts/cli.ts provider attach [--master-server URL] [--up] [--force]", - "bun scripts/cli.ts provider triage [--observed-error text] [--observed-scope scope] [--microservice id ...]", + "bun scripts/cli.ts provider triage [--observed-error text] [--observed-scope scope] [--microservice id ...] [--full|--raw]", ], description: "Generate the minimal provider-gateway attach env/compose bundle or run the read-only provider health triage contract.", }; @@ -260,7 +260,7 @@ function codexHelp(): unknown { "bun scripts/cli.ts codex judge --attempt N [--dry-run] [--include-prompt]", "bun scripts/cli.ts codex steer [prompt|--prompt-file path|--prompt-stdin] [--dry-run]", "bun scripts/cli.ts codex interrupt|cancel ", - "bun scripts/cli.ts codex queues [--full|--all] | queue create | queue merge --into | move --queue ", + "bun scripts/cli.ts codex queues [--full|--all] [--limit N] [--page N|--offset N] | queue create | queue merge --into | move --queue ", ], promptInput: { recommended: ["--prompt-stdin", "--prompt-file"], diff --git a/scripts/src/microservices.ts b/scripts/src/microservices.ts index e91fb253..33f7e3e3 100644 --- a/scripts/src/microservices.ts +++ b/scripts/src/microservices.ts @@ -240,6 +240,14 @@ function hasFlag(args: string[], name: string): boolean { return args.includes(name); } +function asRecord(value: unknown): Record | null { + return typeof value === "object" && value !== null && !Array.isArray(value) ? value as Record : null; +} + +function asArray(value: unknown): unknown[] { + return Array.isArray(value) ? value : []; +} + function parseJsonOption(raw: string, name: string): unknown { try { return JSON.parse(raw) as unknown; @@ -264,6 +272,14 @@ function requestBodyOption(args: string[]): unknown | undefined { return undefined; } +function assertKnownObservationOptions(args: string[], command: string): void { + const flags = new Set(["--full", "--raw"]); + for (const arg of args) { + if (!arg.startsWith("--")) continue; + if (!flags.has(arg)) throw new Error(`unsupported ${command} option: ${arg}`); + } +} + function methodOption(args: string[], hasBody = false): string { const method = (stringOption(args, "--method") ?? (hasBody ? "POST" : "GET")).toUpperCase(); if (!["GET", "HEAD", "POST", "DELETE", "PUT", "PATCH"].includes(method)) throw new Error(`unsupported --method ${method}`); @@ -314,20 +330,148 @@ export function summarizeMicroserviceProxyResponse(response: unknown, args: stri }; } +function compactStringList(value: unknown, limit = 8): Record { + const all = Array.from(new Set(asArray(value).map((item) => String(item ?? "")).filter(Boolean))); + return { + items: all.slice(0, limit), + count: all.length, + truncated: all.length > limit, + omitted: Math.max(0, all.length - limit), + }; +} + +function compactRecordFields(record: Record | null, keys: string[]): Record | null { + if (record === null) return null; + const selected: Record = {}; + for (const key of keys) { + if (record[key] !== undefined) selected[key] = record[key]; + } + return Object.keys(selected).length > 0 ? selected : null; +} + +function compactQueueHealth(value: unknown): Record | null { + const queue = asRecord(value); + if (queue === null) return null; + return { + controlPlane: queue.controlPlane ?? null, + defaultProviderId: queue.defaultProviderId ?? null, + defaultModel: queue.defaultModel ?? null, + counts: queue.counts ?? {}, + total: queue.total ?? null, + queueCount: queue.queueCount ?? null, + activeQueueIds: compactStringList(queue.activeQueueIds, 8), + activeTaskIds: compactStringList(queue.activeTaskIds ?? queue.databaseActiveTaskIds, 8), + queuedTaskIds: compactStringList(queue.queuedTaskIds, 8), + databaseActiveTaskCount: queue.databaseActiveTaskCount ?? null, + schedulerActiveRunSlotCount: queue.schedulerActiveRunSlotCount ?? queue.activeRunSlotCount ?? null, + }; +} + +function compactMicroserviceBody(body: unknown, serviceId: string): Record | null { + const record = asRecord(body); + if (record === null) return null; + const diagnostics = asRecord(record.executionDiagnostics) ?? asRecord(record.diagnostics); + const devReady = asRecord(record.devReady); + return { + ok: record.ok ?? null, + serviceId: record.serviceId ?? record.service ?? serviceId, + service: record.service ?? null, + role: record.role ?? null, + status: record.status ?? null, + healthy: record.healthy ?? null, + mode: record.mode ?? null, + environment: record.environment ?? null, + version: record.version ?? record.gatewayVersion ?? null, + target: record.target ?? record.k3sServiceId ?? null, + updatedAt: record.updatedAt ?? record.observedAt ?? record.generatedAt ?? null, + startedAt: record.startedAt ?? null, + taskCount: record.taskCount ?? null, + schemaReady: record.schemaReady ?? null, + queue: compactQueueHealth(record.queue), + resourceBudget: compactRecordFields(asRecord(record.resourceBudget), [ + "targetMemoryMb", + "mgrPoolMax", + "tracePoolMax", + "noRunnerDependencies", + "noDockerSocket", + "noPlaywright", + "rustRewrite", + ]), + topLevelKeys: compactStringList(Object.keys(record), 16), + devReady: devReady === null ? null : { + ok: devReady.ok ?? null, + missingTools: compactStringList(devReady.missingTools, 8), + skills: devReady.skills ?? null, + }, + executionDiagnostics: diagnostics === null ? null : { + state: diagnostics.state ?? null, + effectiveLiveness: diagnostics.effectiveLiveness ?? null, + recommendedAction: diagnostics.recommendedAction ?? null, + splitBrainLive: diagnostics.splitBrainLive ?? null, + schedulerActiveRunSlotCount: diagnostics.schedulerActiveRunSlotCount ?? null, + databaseActiveTaskCount: diagnostics.databaseActiveTaskCount ?? null, + heartbeatFreshTaskIds: compactStringList(diagnostics.heartbeatFreshTaskIds, 8), + heartbeatRiskTaskIds: compactStringList(diagnostics.heartbeatRiskTaskIds, 8), + }, + }; +} + +function summarizeMicroserviceObservation(action: string, serviceId: string, response: unknown, args: string[]): unknown { + if (hasFlag(args, "--full") || hasFlag(args, "--raw")) return response; + const record = asRecord(response); + if (record === null) return response; + const body = "body" in record ? record.body : response; + const bodyBytes = jsonByteLength(body); + const summary = compactMicroserviceBody(body, serviceId); + const includePreview = summary === null || bodyBytes <= 20_000; + return { + upstream: { + ok: record.ok ?? null, + status: record.status ?? null, + exitCode: record.exitCode ?? null, + }, + microservice: { + action, + id: serviceId, + summary, + bodyBytes, + bodyOmitted: true, + outputPolicy: { + default: "compact-health-summary", + full: `bun scripts/cli.ts microservice ${action} ${serviceId} --full`, + raw: `bun scripts/cli.ts microservice ${action} ${serviceId} --raw`, + proxyRaw: serviceId === "code-queue" && action === "health" ? "bun scripts/cli.ts microservice proxy code-queue /health --raw --full" : null, + }, + ...(includePreview + ? { bodyPreview: previewJson(body, { maxDepth: 2, maxArrayItems: 2, maxObjectKeys: 8, maxStringLength: 160 }) } + : { + bodyPreviewOmitted: true, + bodyPreviewHint: "Large body preview omitted because compact summary is available; use --full or --raw for complete evidence.", + }), + }, + }; +} + export async function runMicroserviceCommand(_config: UniDeskConfig, args: string[]): Promise { const [action = "list", idArg, pathArg] = args; if (action === "list") return coreInternalFetch("/api/microservices"); if (action === "status") { const id = requireId(idArg, "microservice status"); - return coreInternalFetch(`/api/microservices/${encodeId(id)}/status`); + const optionArgs = args.slice(2); + assertKnownObservationOptions(optionArgs, "microservice status"); + return summarizeMicroserviceObservation(action, id, coreInternalFetch(`/api/microservices/${encodeId(id)}/status`), optionArgs); } if (action === "health") { const id = requireId(idArg, "microservice health"); - return coreInternalFetch(`/api/microservices/${encodeId(id)}/health`); + const optionArgs = args.slice(2); + assertKnownObservationOptions(optionArgs, "microservice health"); + return summarizeMicroserviceObservation(action, id, coreInternalFetch(`/api/microservices/${encodeId(id)}/health`), optionArgs); } if (action === "diagnostics") { const id = requireId(idArg, "microservice diagnostics"); - return coreInternalFetch(`/api/microservices/${encodeId(id)}/diagnostics`); + const optionArgs = args.slice(2); + assertKnownObservationOptions(optionArgs, "microservice diagnostics"); + return summarizeMicroserviceObservation(action, id, coreInternalFetch(`/api/microservices/${encodeId(id)}/diagnostics`), optionArgs); } if (action === "tunnel-self-test") { const id = requireId(idArg, "microservice tunnel-self-test"); diff --git a/scripts/src/provider-triage.test.ts b/scripts/src/provider-triage.test.ts index 7f195e58..526db079 100644 --- a/scripts/src/provider-triage.test.ts +++ b/scripts/src/provider-triage.test.ts @@ -1,5 +1,5 @@ import { describe, expect, test } from "bun:test"; -import { buildProviderTriageResult, type ProviderTriageSignal } from "./provider-triage"; +import { buildProviderTriageResult, compactProviderTriageResult, type ProviderTriageSignal } from "./provider-triage"; function signal( id: string, @@ -77,4 +77,29 @@ describe("provider triage contract", () => { expect(result.degradedScopes).toContain("registry"); expect(result.healthyScopes).toEqual(expect.arrayContaining(["k3s", "provider-gateway", "ssh"])); }); + + test("compact output is bounded and preserves drill-down command args", () => { + const result = buildProviderTriageResult("D601", [ + signal("backend-core-node", "provider-gateway", "ok"), + signal("host-ssh-probe", "ssh", "ok"), + signal("artifact-registry-health", "registry", "degraded"), + signal("k3sctl-adapter-health", "k3s", "unknown"), + signal("code-queue-health", "scheduler", "unknown"), + signal("service-a", "microservice", "failed"), + signal("service-b", "microservice", "failed"), + signal("service-c", "microservice", "degraded"), + signal("service-d", "microservice", "unknown"), + signal("service-e", "microservice", "unknown"), + signal("service-f", "microservice", "unknown"), + ], "2026-05-20T00:00:00.000Z"); + + const compact = compactProviderTriageResult(result, ["--microservice", "code-queue", "--microservice", "k3sctl-adapter"]); + const signalCounts = compact.signalCounts as Record; + const outputPolicy = compact.outputPolicy as Record; + + expect((compact.signals as unknown[]).length).toBe(8); + expect(signalCounts.omittedIssueSignals).toBe(1); + expect(outputPolicy.full).toBe("bun scripts/cli.ts provider triage D601 --microservice 'code-queue' --microservice 'k3sctl-adapter' --full"); + expect(outputPolicy.raw).toBe("bun scripts/cli.ts provider triage D601 --microservice 'code-queue' --microservice 'k3sctl-adapter' --raw"); + }); }); diff --git a/scripts/src/provider-triage.ts b/scripts/src/provider-triage.ts index c1c12486..8c188b98 100644 --- a/scripts/src/provider-triage.ts +++ b/scripts/src/provider-triage.ts @@ -87,6 +87,14 @@ function bool(value: unknown): boolean { return value === true; } +function hasFlag(args: string[], name: string): boolean { + return args.includes(name); +} + +function shellQuote(value: string): string { + return `'${value.replace(/'/g, `'\\''`)}'`; +} + function lower(value: unknown): string { return String(value ?? "").toLowerCase(); } @@ -292,6 +300,127 @@ function observedErrorSignal(message: string, scope: ProviderSignalScope): Provi return signal("observed-error", scope, "failed", message, { message, runnerErrorClassification: classifyRunnerError(message) }, scope !== "runner-local"); } +function compactStringList(value: unknown, limit = 6): Record { + const all = Array.from(new Set(asArray(value).map((item) => String(item ?? "")).filter(Boolean))); + return { + items: all.slice(0, limit), + count: all.length, + truncated: all.length > limit, + omitted: Math.max(0, all.length - limit), + }; +} + +function compactEvidence(value: unknown): unknown { + const record = asRecord(value); + if (record === null) return value; + const body = bodyOf(value); + const devReady = asRecord(record.devReady) ?? asRecord(body?.devReady); + const diagnostics = asRecord(record.executionDiagnostics) ?? asRecord(body?.executionDiagnostics) ?? asRecord(asRecord(value)?.diagnostics); + return { + upstream: record.upstream ?? (body === null ? null : { ok: asRecord(value)?.ok ?? null, status: asRecord(value)?.status ?? null }), + status: record.status ?? body?.status ?? null, + ok: record.ok ?? body?.ok ?? null, + serviceId: record.serviceId ?? body?.serviceId ?? null, + providerGatewayVersion: record.providerGatewayVersion ?? null, + hostSshConfigured: record.hostSshConfigured ?? null, + taskId: record.taskId ?? null, + taskStatus: record.taskStatus ?? null, + exitCode: record.exitCode ?? null, + devReady: devReady === null ? null : { + ok: devReady.ok ?? null, + missingTools: compactStringList(devReady.missingTools), + }, + executionDiagnostics: diagnostics === null ? null : { + state: diagnostics.state ?? null, + effectiveLiveness: diagnostics.effectiveLiveness ?? null, + recommendedAction: diagnostics.recommendedAction ?? null, + splitBrainLive: diagnostics.splitBrainLive ?? null, + heartbeatFreshTaskIds: compactStringList(diagnostics.heartbeatFreshTaskIds), + heartbeatRiskTaskIds: compactStringList(diagnostics.heartbeatRiskTaskIds), + }, + fallback: record.fallback === undefined ? null : record.fallback, + error: record.error ?? null, + }; +} + +function providerTriageCommand(providerId: string, args: string[], mode: "--full" | "--raw"): string { + const kept: string[] = []; + const valueOptions = new Set(["--observed-error", "--observed-scope", "--microservice", "--service", "--microservices"]); + for (let index = 0; index < args.length; index += 1) { + const arg = args[index] ?? ""; + if (arg === "--full" || arg === "--raw") continue; + if (valueOptions.has(arg)) { + const value = args[index + 1]; + if (value !== undefined) { + kept.push(arg, shellQuote(value)); + index += 1; + } + continue; + } + kept.push(arg); + } + return [`${commandPrefix} provider triage ${providerId}`, ...kept, mode].filter(Boolean).join(" "); +} + +export function compactProviderTriageResult(result: ProviderTriageResult, args: string[] = []): Record { + const issueSignals = result.signals + .filter((item) => item.status === "failed" || item.status === "degraded" || item.status === "unknown") + .sort((left, right) => { + const rank: Record = { failed: 0, degraded: 1, unknown: 2, ok: 3 }; + return rank[left.status] - rank[right.status]; + }); + const sourceSignals = issueSignals.length > 0 ? issueSignals : []; + const signalLimit = issueSignals.length > 0 ? 8 : 0; + const visibleSignals = sourceSignals.slice(0, signalLimit); + const okSignalCount = result.signals.filter((item) => item.status === "ok").length; + const issueSignalCount = issueSignals.length; + return { + ok: result.ok, + providerId: result.providerId, + decision: result.decision, + scope: result.scope, + retryable: result.retryable, + blockingDisposition: result.blockingDisposition, + observedAt: result.observedAt, + failedScopes: result.failedScopes, + degradedScopes: result.degradedScopes, + healthyScopes: result.healthyScopes, + failedIndependentScopes: result.failedIndependentScopes, + healthyIndependentScopes: result.healthyIndependentScopes, + rationale: result.rationale, + signalCounts: { + total: result.signals.length, + returned: visibleSignals.length, + limit: signalLimit, + ok: okSignalCount, + degraded: result.signals.filter((item) => item.status === "degraded").length, + failed: result.signals.filter((item) => item.status === "failed").length, + unknown: result.signals.filter((item) => item.status === "unknown").length, + omittedOkSignals: Math.max(0, result.signals.filter((item) => item.status === "ok").length - visibleSignals.filter((item) => item.status === "ok").length), + omittedIssueSignals: Math.max(0, issueSignalCount - visibleSignals.filter((item) => item.status === "failed" || item.status === "degraded" || item.status === "unknown").length), + omittedSignals: Math.max(0, sourceSignals.length - visibleSignals.length), + }, + signals: visibleSignals.map((item) => ({ + id: item.id, + scope: item.scope, + status: item.status, + independentPath: item.independentPath, + observedAt: item.observedAt, + summary: item.summary, + evidenceSummary: compactEvidence(item.evidence), + })), + recommendedCrossChecks: result.recommendedCrossChecks.slice(0, 8), + outputPolicy: { + default: "compact-triage-summary", + signalLimit, + full: providerTriageCommand(result.providerId, args, "--full"), + raw: providerTriageCommand(result.providerId, args, "--raw"), + note: "Default output returns prioritized failed/degraded/unknown signals plus bounded evidence. Use --full or --raw only when complete evidence is required.", + }, + contract: result.contract, + }; +} + export function providerTriageRecommendedCrossChecks(providerId: string): string[] { return [ `${commandPrefix} provider triage ${providerId}`, @@ -429,10 +558,12 @@ function optionValue(args: string[], name: string): string | undefined { } function assertKnownOptions(args: string[]): void { + const flags = new Set(["--full", "--raw"]); const valueOptions = new Set(["--observed-error", "--observed-scope", "--microservice", "--service", "--microservices"]); for (let index = 0; index < args.length; index += 1) { const arg = args[index] ?? ""; if (!arg.startsWith("--")) continue; + if (flags.has(arg)) continue; if (!valueOptions.has(arg)) throw new Error(`unsupported provider triage option: ${arg}`); const value = args[index + 1]; if (value === undefined || value.startsWith("--")) throw new Error(`${arg} requires a value`); @@ -440,7 +571,7 @@ function assertKnownOptions(args: string[]): void { } } -export async function runProviderTriage(config: UniDeskConfig, providerId: string, args: string[] = []): Promise { +export async function runProviderTriage(config: UniDeskConfig, providerId: string, args: string[] = []): Promise { if (!/^[A-Za-z0-9_.-]{1,64}$/u.test(providerId)) throw new Error("provider triage requires a safe provider id such as D601"); assertKnownOptions(args); const observedAt = isoNow(); @@ -497,5 +628,6 @@ export async function runProviderTriage(config: UniDeskConfig, providerId: strin } } - return buildProviderTriageResult(providerId, signals, observedAt); + const result = buildProviderTriageResult(providerId, signals, observedAt); + return hasFlag(args, "--full") || hasFlag(args, "--raw") ? result : compactProviderTriageResult(result, args); }