diff --git a/docs/reference/code-queue-supervision.md b/docs/reference/code-queue-supervision.md index 3f2bca45..a6489511 100644 --- a/docs/reference/code-queue-supervision.md +++ b/docs/reference/code-queue-supervision.md @@ -102,6 +102,10 @@ Code Queue 派单模型按成本、可信度和 blast radius 分层:GPT-5.5/Co `codex prompt-lint [prompt|--prompt-file path|--prompt-stdin]` 是同一套派单前 guardrail 的本地 dry-run 入口,用于检查 runner prompt 是否声明了 `DEV test class`、是否列出允许的 live mutation、禁止动作和 closeout 字段。它只返回分类、缺失或矛盾项和有界 evidence,不提交任务、不连接 live service、不打印完整 prompt。`codex submit --dry-run` 和 `codex steer --dry-run` 会嵌入同一 `promptLint` 结果;`dispatchDisposition=needs-authorization` 时,指挥官必须补齐授权或把 prompt 降到 `read-only` 范围后再派发/steer。 +Device Pod 类 DS 派单必须把工具可用性设计进 prompt,而不是靠事后强制纠偏。prompt 应明确唯一 pod、workspace selector、目标工程/target、允许的 live mutation、禁止的 pod/BOOT/生产/密钥/数据库范围和 closeout 字段;文本源码修改默认要求 `hwpod ... workspace apply-patch`,新文件使用 `apply-patch --add-file`,整文件替换使用 `apply-patch --replace-file`,不要优先 `workspace put`。命令入口默认写短别名 `hwpod`,不要写长路径 `node /app/skills/device-pod-cli/scripts/device-pod-cli.mjs`;添加 Keil 源文件、build clean、download、UART/JSON-RPC smoke 也应走 `hwpod`。prompt 中只允许把 `/app/tools/tran.mjs`、`/app/tools/hwlab-gateway-tran.mjs`、临时 Python/PowerShell/JS 上传脚本列为禁止绕行;如果 DS 仍然需要这些绕行,指挥官应先把缺失能力补进 `device-pod-cli`/`hwpod`,再重置 workspace 让 DS 复测。 + +Device Pod 类 DS 验收不能只看最终回复。指挥官必须用 `codex task --trace` / `codex output <taskId>` 审计实际命令面:确认是否使用 `hwpod`,是否出现长 CLI 路径、`tran.mjs`、`hwlab-gateway-tran.mjs`、临时脚本上传、`workspace put` 或构建产物 patch/put/delete;同时核对 build job、download job、UART/JSON-RPC 或屏幕/串口等硬件证据。若任务因为模型上游 429/503、transport 断连或 Code Queue continuation 被错误降级而没有进入工具调用,不应把它记作 device-pod-cli 失败样本,应先处理调度/运行面摩擦,再重新派发干净任务。 + 并发治理按模型和风险一起决定。GPT-5.5 常规并发目标是 5 条 lane;当写入范围互不重叠、heartbeat/trace 健康、完成质量稳定时可以短时提高到 10。MiniMax 只承接简单任务时可以提高到 10,但必须保留指挥官审阅和证据核验。DeepSeek 用于中等复杂度任务,默认按约 5 条 lane 观察质量,再根据成功率和 reviewer 负载逐步调整。并发扩张的前提永远是任务质量和可观测性,而不是模型价格。 模型选择矩阵: diff --git a/src/components/microservices/code-queue/src/judge.ts 
b/src/components/microservices/code-queue/src/judge.ts index 04d13739..13811d92 100644 --- a/src/components/microservices/code-queue/src/judge.ts +++ b/src/components/microservices/code-queue/src/judge.ts @@ -842,6 +842,10 @@ function applyJudgeSafetyOverrides(task: QueueTask, result: CodexRunResult, judg return judge; } +export function applyJudgeSafetyOverridesForTest(task: QueueTask, result: CodexRunResult, judge: JudgeResult): JudgeResult { + return applyJudgeSafetyOverrides(task, result, judge); +} + export async function judgeTask(task: QueueTask, result: CodexRunResult): Promise { if (config().minimaxApiKey.length === 0) { const judge = applyJudgeSafetyOverrides(task, result, fallbackJudge(result)); diff --git a/src/components/microservices/code-queue/src/self-tests.ts b/src/components/microservices/code-queue/src/self-tests.ts index 72dd5f61..327963d5 100644 --- a/src/components/microservices/code-queue/src/self-tests.ts +++ b/src/components/microservices/code-queue/src/self-tests.ts @@ -2,7 +2,7 @@ import { minimaxM27Model } from "./code-agent/common"; import { openCodeTransportClosedBeforeTerminal, remoteOpenCodeRunCommandForTest } from "./code-agent/opencode"; -import { continuePromptSourceBudgetChars, miniMaxJudgeMessages, parsedContinuePromptForJudge, parseJudgeJson, queueRecoveryRetryPrompt, retryPrompt } from "./judge"; +import { applyJudgeSafetyOverridesForTest, continuePromptSourceBudgetChars, miniMaxJudgeMessages, parsedContinuePromptForJudge, parseJudgeJson, queueRecoveryRetryPrompt, retryPrompt } from "./judge"; import { codeQueueEnvironmentHintTitle, injectCodeQueueEnvironmentHint, promptWithCodeQueueEnvironmentHint, userPromptForDisplay } from "./prompts"; import { buildTaskTranscript, safePreview, taskTraceSummaryFixtureResponse, transcriptLineSummaryLines } from "./task-view"; import type { ActiveRunSlotWaiter } from "./code-agent/common"; @@ -265,6 +265,30 @@ async function runReferenceInjectionSelfTest(): Promise { ...Array.from({ length: 
80 }, (_, index) => `验收点 ${index + 1}: 基于当前 thread 上文补齐缺失证据,并在最终 response 中写出真实命令/API/UI 结果。`), ].join("\n"); const explicitRetryPrompt = retryPrompt(retryTask, { decision: "retry", confidence: 1, reason: "Long MiniMax feedback fixture", continuePrompt: explicitLongContinuePrompt, source: "minimax" }); + const liveMutatingDevicePodPrompt = [ + "DEV test class: live-mutating hardware smoke on G14 DEV device-pod only.", + "", + "Allowed live mutation:", + "- 仅允许通过 `hwpod` 修改 `device-pod-71-freq` 的 `workspace:/projects/01_baseline` 下 firmware 源码和 Keil project membership。", + "", + "Forbidden actions:", + "- 禁止使用 `/app/tools/tran.mjs`,禁止上传临时脚本。", + "", + "Closeout fields:", + "- `DEV test class`、`Allowed mutation actually used`、`Forbidden actions avoided`。", + ].join("\n"); + const liveMutatingDevicePodTask = testTask("codex_live_mutating_device_pod_retry", liveMutatingDevicePodPrompt, "", [], "2026-05-08T00:32:00.000Z"); + const liveMutatingDevicePodRetry = applyJudgeSafetyOverridesForTest(liveMutatingDevicePodTask, { + threadId: "thread_live_mutating_device_pod_retry", + turnId: "turn_live_mutating_device_pod_retry", + finalResponse: "", + terminalStatus: "failed", + terminalError: "unexpected status 503 Service Unavailable", + transportClosedBeforeTerminal: false, + appServerExit: { code: 0, signal: null, stderrTail: "" }, + events: [], + }, { decision: "retry", confidence: 0.7, reason: "Codex turn 失败。", continuePrompt: "继续完成原始任务。", source: "fallback" }); + const liveMutatingContinuePrompt = liveMutatingDevicePodRetry.continuePrompt ?? 
""; let longMiniMaxPromptRejectedAtSource = false; try { parsedContinuePromptForJudge({ continuePrompt: `${"x".repeat(continuePromptSourceBudgetChars + 1)}` }, "retry"); @@ -277,6 +301,10 @@ async function runReferenceInjectionSelfTest(): Promise { assertReferenceTest(!recoveryAfterDeepReference.includes("Reference Round"), "queue recovery prompt should not re-inject reference rounds"); assertReferenceTest(explicitRetryPrompt === explicitLongContinuePrompt, "explicit continuePrompt should not be tail-truncated"); assertReferenceTest(!explicitRetryPrompt.includes("已截断"), "explicit continuePrompt should not include truncation marker"); + assertReferenceTest(liveMutatingContinuePrompt.includes("live-mutating hardware smoke"), "live-mutating retry should preserve the original authorization summary"); + assertReferenceTest(liveMutatingContinuePrompt.includes("hwpod"), "live-mutating retry should preserve tool authorization details"); + assertReferenceTest(!liveMutatingContinuePrompt.includes("只读 continuation"), "live-mutating retry must not be downgraded to read-only continuation"); + assertReferenceTest(!liveMutatingContinuePrompt.includes("不改变运行态、生产态或持久化状态"), "live-mutating retry must not add generic read-only mutation bans"); assertReferenceTest(longMiniMaxPromptRejectedAtSource, "over-budget MiniMax continuePrompt should be rejected for source repair"); return { ok: true, @@ -291,6 +319,7 @@ async function runReferenceInjectionSelfTest(): Promise { { name: "retry_prompt_does_not_reinject_reference_graph", ok: true, chars: retryAfterDeepReference.length }, { name: "queue_recovery_prompt_is_compact", ok: true, chars: recoveryAfterDeepReference.length }, { name: "explicit_continue_prompt_not_tail_truncated", ok: true, chars: explicitRetryPrompt.length }, + { name: "live_mutating_retry_not_downgraded_to_readonly", ok: true, chars: liveMutatingContinuePrompt.length }, { name: "over_budget_minimax_continue_prompt_requires_source_repair", ok: true, budgetChars: 
continuePromptSourceBudgetChars }, ], promptPreview: safePreview(promptC, 1200),