fix: observe code queue execution plane via tran

2026-05-26 00:35:54 +00:00
parent 8af5aafb9e
commit 273fad7c24
4 changed files with 32 additions and 10 deletions
@@ -107,7 +107,7 @@ UniDesk 是一个以主 server 为统一入口的分布式工作平台；本文
 - `bun scripts/cli.ts hwlab cd audit --env dev` / `status|preflight|apply --dry-run`：旧 D601 HWLAB DEV CD 指挥侧 wrapper，仅用于显式 legacy 诊断和迁移对照；当前 HWLAB DEV/PROD source/runtime truth 已迁到 G14 `/root/hwlab` 与 G14 k3s/GitOps，规则见 `docs/reference/hwlab.md`。
 - `bun scripts/cli.ts ci install/status/run/publish-backend-core/publish-user-service/run-dev-e2e/logs`：在 D601 原生 k3s 上安装和运行 Tekton CI，支持每 commit 检查、Code Queue 只读性能门禁、`CI.json` catalog 驱动的 backend-core 与 user-service commit-pinned 镜像发布和手动触发的 `origin/master:deploy.json#environments.dev` 临时 namespace e2e；catalog/producer/consumer 分工见 `docs/reference/cicd-standardization.md`，`run-dev-e2e` 的 Git 控制 runner、短 launcher 和 no-CD 边界见 `docs/reference/dev-ci-runner.md`，Tekton 规则见 `docs/reference/ci.md`。
 - `bun scripts/cli.ts codex deploy <commitId>`：旧 Code Queue 兼容部署入口已禁用，原因是它会绕过受控部署边界直连 D601 部署 Code Queue；规则见 `docs/reference/codex-deploy.md`。
-- `bun scripts/cli.ts codex prompt-lint [prompt|--prompt-file path|--prompt-stdin]` / `codex submit [prompt] [--prompt-file path|--prompt-stdin] [--queue <id>]` / `codex execution-plane [--full|--raw]` / `codex pr-preflight [--remote]`：`prompt-lint` 在派发/steer 前 dry-run 检查 runner prompt 的 DEV 测试授权分级（`read-only`/`live-read`/`live-mutating`）且不回显 prompt；`submit --dry-run` 同时给出 MiniMax/GPT/人工路由建议、该 lint 结果和 requested/effective execution mode；真实提交成功只返回写入确认、task id、服务级 runnerPermissions 和后续查看命令，不回显 prompt；`execution-plane` 只读比较 D601 原生 k3s 正式 Code Queue 执行面、旧 Compose 残留、commit/digest/worktree/probe drift；`pr-preflight` 只读检查 D601 scheduler/runner 的 GitHub token、egress 和 PR 能力，PR 型派单前必须使用，规则见 `docs/reference/cli.md` 和 `docs/reference/code-queue-supervision.md`。
+- `bun scripts/cli.ts codex prompt-lint [prompt|--prompt-file path|--prompt-stdin]` / `codex submit [prompt] [--prompt-file path|--prompt-stdin] [--queue <id>]` / `codex execution-plane [--full|--raw]` / `codex pr-preflight [--remote]`：`prompt-lint` 在派发/steer 前 dry-run 检查 runner prompt 的 DEV 测试授权分级（`read-only`/`live-read`/`live-mutating`）且不回显 prompt；`submit --dry-run` 同时给出 MiniMax/GPT/人工路由建议、该 lint 结果和 requested/effective execution mode；真实提交成功只返回写入确认、task id、服务级 runnerPermissions 和后续查看命令，不回显 prompt；`execution-plane` 通过 `tran D601:k3s` 只读比较 D601 原生 k3s 正式 Code Queue 执行面、旧 Compose 残留、commit/digest/worktree/probe drift；`pr-preflight` 只读检查 D601 scheduler/runner 的 GitHub token、egress 和 PR 能力，PR 型派单前必须使用，规则见 `docs/reference/cli.md` 和 `docs/reference/code-queue-supervision.md`。
 - `bun scripts/cli.ts codex task <taskId>`：按 Code Queue 任务 ID 查询默认审阅摘要，只返回原始 prompt、最终 response、最后错误和渐进披露命令；`codex tasks --view commander` 是 host commander 推荐轮询入口，默认有界显示 active runner 精确计数、queued/retry_wait、terminal-unread、active 风险、分类和 drill-down 命令；`--view supervisor|full`、`codex output` 和大 `--limit` 仍默认有界，完整内容需显式 `--full`/`--full-text`/分页展开；`codex queues [--full] [--limit N] [--page N|--offset N]` 默认分页低噪声输出队列摘要，完整 upstream 只通过 raw command 显式获取。
 - `bun scripts/cli.ts codex unread [--repo owner/name] [--issue N] [--limit N]`：只读汇总完成未读积压并给出 repo/issue/status/queue 计数和 drill-down/read 命令；批量已读必须显式 `codex unread mark-read ... --confirm`，规则见 `docs/reference/cli.md`。
 - `bun scripts/cli.ts codex judge <taskId> --attempt <n> [--dry-run]`：按指定 task/attempt 用与队列 worker 相同的上下文构建和 MiniMax judge 调用路径单步复现完成判定；`--dry-run` 只输出 prompt/payload 诊断。
@@ -280,7 +280,7 @@ replacement runner 只用于方向明显错误、质量不可接受、原 task
 - `bun scripts/cli.ts codex tasks --view commander --limit N`：host commander 轮询的推荐入口。输出是有界 action map，必须直接显示 `activeRunners.count`、计数来源、split-brain/heartbeat 处置、queued/retry_wait 精确计数、terminal-unread 总数和已省略行数、active 风险数、stale/heartbeat/trace gap、`finalResponse` 已出现但仍非终态的 awaiting terminal/judge、blocker-like final response、HWLAB#7/#99/#116/#164/#317 与 UniDesk#20/#118 命中、任务分类和下一步 drill-down 命令。默认不得输出完整 prompt、完整 final response、raw output、完整 trace 或 raw overview；需要详情只能按 task id 使用 `codex task`、`codex task --trace`、`codex output`、`codex read` 或 `rawOverview` 命令渐进展开。
 - `bun scripts/cli.ts codex tasks --view supervisor --limit N`：查看默认低噪声监督视图，包括 `activeRunning`、running、完成未读、少量最近完成、queued/runnable、activity、commanderConcurrency、execution diagnostics、任务分类和下一步 drill-down 命令。默认行只保留 task id、队列、短 prompt/body 预览和原始字符数；`--limit` 是扫描/分页预算，不是返回几十条肥行的开关，CLI effective limit 安全上限为 100，输出必须用 `filters.requestedLimit`、`filters.effectiveLimit`、`filters.limitCapped`、`source.requestedLimit` 和 `source.effectiveLimit` 区分用户请求、CLI cap 和 overview 源拉取预算；例如 `--limit 260` 应明确显示 requested=260、effective=100、source=200，`running.returned` 只是低噪声返回行数。`show/detail/trace/output/full/read` 放在 section template 中，避免每条任务重复刷屏，需要更多内容再按 taskId 展开。刚执行 `codex submit` 后也可以先读 submit 返回的 `submitted.taskStates[]`、`queue.countContext`、`queue.activity.effectiveActiveTaskCount` 和 `queue.stateDisclosure`；若某个 id preview 有 `idsUnavailable=true`，不要把它当成空队列，按 `queue.listPreviewPolicy.rawCommand` 或本 supervisor 命令继续查。
 - `bun scripts/cli.ts codex queues`：默认是 commander-first 队列态势摘要，`--commander` 是显式同义开关。输出前部固定使用 `.data.queues.commander`，先给出 `activeRunnerCount`、`source`、`target=15`、`slotDeficit`、`queuedCount`、`runningTasks`、`heartbeat.fresh`、`heartbeat.risk`、`heartbeat.staleRecoveryCandidates`、active/runnable queue 小页和 drill-down 命令；历史 queue item 列表保留在 `.data.queues.items[]`，但只是分页的次要行。需要完整队列行视图时加 `--full`，但 `--full` 仍默认分页，继续用 `--limit N`、`--page N` 或 `--offset N` 渐进展开。summary 和 full 都使用稳定 JSON path `.data.queues.items[]` 读取队列行，并从 `.data.queues.commander`、`.data.queues.commanderConcurrency`、`.data.queues.activity`、`.data.queues.counts` 与 `.data.queues.executionDiagnostics` 读取全局活跃计数和执行诊断；完整 upstream 只通过输出中的 raw command 显式获取。若 `/api/queues` 没有返回 task row，`runningTasks.items[].name` 会是 `null` 且 `nameSource=not-returned-by-api-queues`，此时按返回的 `codex task <taskId>` 或 supervisor 命令展开，不要假设任务没有名称。
-- `bun scripts/cli.ts codex execution-plane [--full|--raw]`：只读巡检 D601 原生 k3s `unidesk` namespace 下的正式执行面。该命令强制使用 `KUBECONFIG=/etc/rancher/k3s/k3s.yaml` 并确认 node `d601`，默认低噪声返回 `summary.formalExecutionPlane`、`summary.deploymentDrift`、`summary.deprecatedComposeResidual`、`executionPlane.deployments[]`、`drift.status`、`residual.status` 和 `judgeProbe.behaviorVersion`。它比较三类 Deployment：`code-queue` 必须是 scheduler，`code-queue-read` 必须是 read，`code-queue-write` 必须是 write；同时比较 deployment env/annotation commit、Pod `imageID` digest、宿主 `/home/ubuntu/cq-deploy` HEAD、以及 `/api/judge/probe` 的 `behaviorVersion=code-queue-judge-probe:v1`。任何 commit/digest/worktree/probe 不一致或缺少可比 marker 都必须输出 `deployment-drift`，不能写成 healthy。检测到旧 Docker Compose `code-queue-backend` 或旧 `127.0.0.1:4222` 监听时必须输出 `deprecated-compose-residual`。默认不打印完整 Kubernetes Deployment JSON、环境变量全集、SecretRef 值、judge probe 原始结果或命令 stdout；需要逐项展开时使用 `--full`，需要安全裁剪后的原始观察对象时使用 `--raw`。
+- `bun scripts/cli.ts codex execution-plane [--full|--raw]`：只读巡检 D601 原生 k3s `unidesk` namespace 下的正式执行面。该命令的 live collector 必须通过 UniDesk `tran`/`ssh` 维护桥访问 `D601:k3s` 和 `D601:/home/ubuntu/cq-deploy`，不得在 master server 本地调用 `kubectl`、读取本地 worktree 或把 master server 的工具缺失误报成 D601 阻塞。该命令强制使用 `KUBECONFIG=/etc/rancher/k3s/k3s.yaml` 并确认 node `d601`，默认低噪声返回 `summary.formalExecutionPlane`、`summary.deploymentDrift`、`summary.deprecatedComposeResidual`、`executionPlane.deployments[]`、`drift.status`、`residual.status` 和 `judgeProbe.behaviorVersion`。它比较三类 Deployment：`code-queue` 必须是 scheduler，`code-queue-read` 必须是 read，`code-queue-write` 必须是 write；同时比较 deployment env/annotation commit、Pod `imageID` digest、宿主 `/home/ubuntu/cq-deploy` HEAD、以及 `/api/judge/probe` 的 `behaviorVersion=code-queue-judge-probe:v1`。任何 commit/digest/worktree/probe 不一致或缺少可比 marker 都必须输出 `deployment-drift`，不能写成 healthy。检测到 D601 上旧 Docker Compose `code-queue-backend` 或旧 `127.0.0.1:4222` 监听时必须输出 `deprecated-compose-residual`。默认不打印完整 Kubernetes Deployment JSON、环境变量全集、SecretRef 值、judge probe 原始结果或命令 stdout；需要逐项展开时使用 `--full`，需要安全裁剪后的原始观察对象时使用 `--raw`。
 - `bun scripts/cli.ts codex unread --limit N`：查看完成未读审阅积压的默认 triage，按 repo、issue、status 和 queue 汇总，并给出有界最新任务紧凑行；默认行只包含 task id、状态、queue、issues、updatedAt/finishedAt 和一条 `nextStep`，不重复每任务 `show/detail/trace/output/read` 命令，也不输出 raw prompt、final response、trace 或 output。完整 per-task 命令必须显式使用 `codex unread --full`、`codex unread --view full`、`codex unread list` 或单任务 `codex task <taskId>`/`codex read <taskId>` 展开；默认输出必须保留一次性的模板命令和分页命令。
 - `bun scripts/cli.ts codex unread mark-read --repo owner/name --issue N --limit N --confirm`：批量已读入口，必须显式 `mark-read` 和 `--confirm`，否则结构化失败且不 POST `/read`。
 - `bun scripts/cli.ts codex tasks --unread --limit N`：兼容查看完成未读审阅积压；`--unread` 与 `--unread-only` 等价，不能被静默忽略。
@@ -3,6 +3,7 @@ import {
  runCodeQueueExecutionPlaneForTest,
  type CodeQueueExecutionPlaneObservation,
 } from "./src/code-queue-execution-plane";
+import { readFileSync } from "node:fs";

 type JsonRecord = Record<string, unknown>;

@@ -175,12 +176,21 @@ async function checkProgressiveDisclosure(): Promise<void> {
  assertCondition("details" in raw && "rawObservation" in raw, "--raw should include details and raw observation", raw);
 }

+async function checkLiveCollectorUsesD601TranTransport(): Promise<void> {
+  const source = readFileSync(new URL("./src/code-queue-execution-plane.ts", import.meta.url), "utf8");
+  assertCondition(source.includes('["D601:k3s", "kubectl", ...args]'), "live collector should observe k3s through D601 tran route, not local kubectl");
+  assertCondition(source.includes('`D601:${options.worktreePath}`'), "worktree observation should run on D601 workspace route");
+  assertCondition(!source.includes('runCommand(["kubectl", ...args]'), "live collector must not call local kubectl directly");
+  assertCondition(!source.includes('runCommand(["git", "-C", options.worktreePath'), "worktree observation must not read local filesystem");
+}
+
 async function main(): Promise<void> {
  const checks = [
    ["code-queue:execution-plane-healthy-no-drift", checkHealthyNoDrift],
    ["code-queue:execution-plane-deployment-drift", checkDeploymentDrift],
    ["code-queue:execution-plane-deprecated-compose-residual", checkDeprecatedComposeResidual],
    ["code-queue:execution-plane-progressive-disclosure", checkProgressiveDisclosure],
+    ["code-queue:execution-plane-d601-tran-transport", checkLiveCollectorUsesD601TranTransport],
  ] as const;
  const results = [];
  for (const [name, check] of checks) {
@@ -193,4 +203,3 @@ async function main(): Promise<void> {
 if (import.meta.main) {
  await main();
 }
-
@@ -248,6 +248,13 @@ function commandProbe(result: CommandResult): ProbeResult {
  };
 }

+function runTran(args: string[], options: ExecutionPlaneOptions, timeoutMs = options.timeoutMs): ProbeResult {
+  return commandProbe(runCommand(["./scripts/tran", ...args], repoRoot, {
+    timeoutMs,
+    env: process.env,
+  }));
+}
+
 function safeError(probe: ProbeResult): string | null {
  if (probe.ok) return null;
  const text = firstLine(probe.stderr) ?? firstLine(probe.stdout);
@@ -255,10 +262,7 @@ function safeError(probe: ProbeResult): string | null {
 }

 function runKubectl(args: string[], options: ExecutionPlaneOptions): ProbeResult {
-  return commandProbe(runCommand(["kubectl", ...args], repoRoot, {
-    timeoutMs: options.timeoutMs,
-    env: { ...process.env, KUBECONFIG: options.kubeconfig },
-  }));
+  return runTran(["D601:k3s", "kubectl", ...args], options);
 }

 function collectGuard(options: ExecutionPlaneOptions): { guard: D601K3sGuardClassification; diagnostics: Record<string, unknown> } {
@@ -436,7 +440,7 @@ function collectServices(options: ExecutionPlaneOptions): ServiceObservation[] {
 }

 function collectWorktree(options: ExecutionPlaneOptions): WorktreeObservation {
-  const probe = commandProbe(runCommand(["git", "-C", options.worktreePath, "rev-parse", "HEAD"], repoRoot, { timeoutMs: 5_000 }));
+  const probe = runTran([`D601:${options.worktreePath}`, "argv", "git", "rev-parse", "HEAD"], options, 10_000);
  return {
    path: options.worktreePath,
    ok: probe.ok,
@@ -446,7 +450,16 @@ function collectWorktree(options: ExecutionPlaneOptions): WorktreeObservation {
 }

 function collectResidual(): ResidualObservation {
-  const docker = commandProbe(runCommand(["docker", "ps", "-a", "--filter", "name=code-queue-backend", "--format", "{{.Names}}\t{{.Status}}\t{{.Image}}"], repoRoot, { timeoutMs: 8_000 }));
+  const remoteOptions: ExecutionPlaneOptions = {
+    namespace: expectedNamespace,
+    kubeconfig: d601NativeKubeconfig,
+    worktreePath: expectedWorktreePath,
+    full: false,
+    raw: false,
+    skipProbe: true,
+    timeoutMs: 15_000,
+  };
+  const docker = runTran(["D601", "argv", "docker", "ps", "-a", "--filter", "name=code-queue-backend", "--format", "{{.Names}}\t{{.Status}}\t{{.Image}}"], remoteOptions, 15_000);
  const containers = docker.ok
    ? lines(docker.stdout).map((line) => {
        const fields = line.split("\t");
@@ -454,7 +467,7 @@ function collectResidual(): ResidualObservation {
      }).filter((item) => item.name === "code-queue-backend" || item.name.includes("code-queue-backend"))
    : [];

-  const ss = commandProbe(runCommand(["ss", "-H", "-ltnp"], repoRoot, { timeoutMs: 8_000 }));
+  const ss = runTran(["D601", "argv", "ss", "-H", "-ltnp"], remoteOptions, 15_000);
  const listeners = ss.ok
    ? (lines(ss.stdout)
        .filter((line) => line.includes(":4222"))