fix: 归类 v0.1 provider 可用性失败

This commit is contained in:
Codex
2026-05-29 14:05:15 +08:00
parent 8412073944
commit 7534b87153
9 changed files with 39 additions and 5 deletions
+1 -1
View File
@@ -81,7 +81,7 @@ POST /api/v1/commands/:commandId/ack
- events append-only,单 run 内 `seq` 单调递增。
- 每个 run 必须最终出现 `terminal_status`,或保持明确 non-terminal status 并可查询 lease/heartbeat。
- failureKind 至少能区分 `schema-invalid`、`tenant-policy-denied`、`secret-unavailable`、`runner-lease-conflict`、`backend-failed`、`provider-auth-failed`、`infra-failed`、`cancelled`。
- failureKind 至少能区分 `schema-invalid`、`tenant-policy-denied`、`secret-unavailable`、`runner-lease-conflict`、`backend-failed`、`provider-auth-failed`、`provider-unavailable`、`infra-failed`、`cancelled`。
- health/readiness 必须返回 Postgres reachable、schema migration ready、SecretRef redacted 状态和 build/source metadata。
- 日志、event、trace、health 和 diagnostics 不得输出 provider credential、Codex auth/config 内容、DSN password、token 或 URL credential。
@@ -71,6 +71,7 @@ Runner 必须把以下失败归类为结构化 failureKind
- `secret-unavailable`:SecretRef 缺失、RBAC 拒绝或 Secret projection 不完整。
- `provider-auth-failed`:上游 provider 鉴权失败。
- `provider-unavailable`:上游 provider 返回 HTTP 5xx/503、`Service Unavailable`、携带 5xx 的 `responseStreamDisconnected` 或明确 temporary/provider unavailable 文案;这是外部 provider availability blocker,不得归为本地 `backend-failed`
- `backend-failed`:backend 进程退出、协议错误或返回 terminal error。
- `runner-lease-conflict`:claim/lease 被其他 runner 持有。
- `infra-failed`:Job 启动、网络、manager API 或文件系统基础设施失败。
+2 -1
View File
@@ -51,6 +51,7 @@ Adapter 必须把 backend 错误映射为稳定 failureKind
| `secret-unavailable` | Secret projection 缺失、文件不存在、权限不可读。 |
| `provider-auth-failed` | provider credential 或 auth file 无效、上游返回 401/403。 |
| `provider-rate-limited` | 上游限流或 quota 错误。 |
| `provider-unavailable` | 上游 provider availability/transient 失败,包括 HTTP 5xx/503、`Service Unavailable`、`responseStreamDisconnected` 携带 5xx 状态码、明确 `provider unavailable`、`temporary unavailable` 文案。 |
| `backend-protocol-error` | backend 输出无法解析、协议字段缺失。 |
| `backend-json-parse-error` | backend stdout 不是合法 JSON-RPC 行。 |
| `backend-response-invalid` | backend JSON-RPC response/terminal notification 缺少必需字段。 |
@@ -74,7 +75,7 @@ Adapter 必须把 backend 错误映射为稳定 failureKind
### T2 Failure mapping 自测试
阅读本文,然后用 mock 错误覆盖 missing secret、provider auth failure、rate limit、protocol error、timeout 和 cancel。确认每类错误都映射到稳定 failureKind,且输出为 JSON 或结构化 event。
阅读本文,然后用 mock 错误覆盖 missing secret、provider auth failure、rate limit、provider availability/transient failure、protocol error、timeout 和 cancel。provider availability/transient 样例必须至少包含 HTTP 503 `Service Unavailable` 或携带 5xx 的 `responseStreamDisconnected`,并确认不会被归类为 `backend-failed`。确认每类错误都映射到稳定 failureKind,且输出为 JSON 或结构化 event。
### T3 真实 backend 联调
+5 -1
View File
@@ -90,13 +90,17 @@ Run 的 `executionPolicy.secretScope` 应引用 `agentrun-v01-provider-codex`
阅读本文,然后使用无效的 Codex Secret 创建 run。确认 backend 返回 `provider-auth-failed` 或等价 failureKind,记录上游状态分类和 trace correlation,但不打印 Authorization header、token 或 auth/config 文件内容。
### T5 Provider availability failure
阅读本文,然后用 mock/fake Codex app-server 自测试 HTTP 503 `Service Unavailable`、携带 5xx 的 `responseStreamDisconnected` 或明确 temporary/provider unavailable 文案。确认 Codex adapter 返回 `provider-unavailable`,不会落到 `backend-failed`;综合联调若真实 provider 返回同类错误,应记录为外部 provider blocker,而不是本地 runner/backend 执行面 blocker。
## 规格的实现情况
| 规格项 | 状态 | 说明 |
| --- | --- | --- |
| Codex backend 规格 | 已定义 | 本文为 v0.1 第一真实 backend 权威。 |
| Codex Secret projection | 未实现 | 需要后续 Kubernetes Secret 和 runner/backend manifest。 |
| Codex adapter | 已部分实现 | 当前代码已实现受控 `codex app-server --listen stdio://`、`initialize`/`thread/start`/`thread/resume`/`turn/start` response 校验、stderr 有界诊断、spawn/JSON parse/response invalid/timeout failureKind 和 fake app-server 自测试。 |
| Codex adapter | 已部分实现 | 当前代码已实现受控 `codex app-server --listen stdio://`、`initialize`/`thread/start`/`thread/resume`/`turn/start` response 校验、stderr 有界诊断、spawn/JSON parse/response invalid/timeout/provider 5xx availability failureKind 和 fake app-server 自测试。 |
| 错误可观测与脱敏 | 已部分实现 | child env、cwd、workspace 和 Codex home 只输出摘要;stderr tail 有界且标记截断;事件和 failure 统一走 redaction。 |
| 真实 provider turn | 未实现 | 综合联调必须真实完成后才能发布通过。 |
| hostPath `~/.codex` | 不采用 | 只能通过 Kubernetes Secret projection 注入。 |
+2
View File
@@ -83,6 +83,8 @@ RESTful API 交互联调必须满足:
- `GET /api/v1/runs/:runId/commands/:commandId`、`GET /api/v1/runs/:runId/events?afterSeq=N&limit=M` 能轮询到 terminal_status,event `seq` 单调递增,分页重复读取不丢失也不重复。
- 所有成功和失败响应都必须是 JSON;失败响应必须包含可判定的 failureKind、message 和 trace correlation,且不得泄露 Secret value。
真实 provider 返回 HTTP 5xx/503、`Service Unavailable`、携带 5xx 的 `responseStreamDisconnected` 或明确 temporary/provider unavailable 文案时,综合联调结论应归类为外部 provider availability blocker,failureKind 使用 `provider-unavailable`。这类结果不能证明真实 turn 成功,但也不得被记录为本地 runner/backend 执行面 `backend-failed` blocker。
CLI 与 RESTful API 可以复用同一个真实 run 做联调。若两者观察到的 run id、command id、state、terminal_status、failureKind、event seq 或 redaction 结果不一致,综合联调不通过。
## 发布判定
+12 -1
View File
@@ -435,7 +435,10 @@ function normalizeCodexNotification(message: JsonRecord): { events: BackendEvent
const status = terminalStatusFromValue(turn.status);
const error = asRecordAt(turn, "error");
const messageText = typeof error.message === "string" ? redactText(error.message) : null;
return { events: [{ type: "backend_status", payload: { phase: method, terminalStatus: status } }], terminal: { status, failureKind: status === "completed" ? null : classifyMessageFailureKind(messageText ?? turn.status, "backend-failed"), message: messageText } };
const failureKind = status === "completed" ? null : classifyMessageFailureKind(messageText ?? turn.status, "backend-failed");
const events: BackendEvent[] = [{ type: "backend_status", payload: { phase: method, terminalStatus: status } }];
if (failureKind) events.push({ type: "error", payload: { failureKind, error: redactJson(error), phase: method } });
return { events, terminal: { status, failureKind, message: messageText } };
}
return { events: [{ type: "backend_status", payload: { phase: method } }] };
}
@@ -570,11 +573,19 @@ function classifyMessageFailureKind(message: string, fallback: FailureKind): Fai
const text = String(message || "").toLowerCase();
if (/rate.?limit|too many requests|\b429\b/u.test(text)) return "provider-rate-limited";
if (/\b401\b|\b403\b|unauthori[sz]ed|forbidden|invalid api key|authentication|auth failed|oauth|access token/u.test(text)) return "provider-auth-failed";
if (isProviderUnavailableMessage(text)) return "provider-unavailable";
if (/timed out|timeout|idle timeout/u.test(text)) return "backend-timeout";
if (/invalid json|json parse/u.test(text)) return "backend-json-parse-error";
return fallback;
}
/**
 * Reports whether an error message describes an upstream provider
 * availability failure (as opposed to a local backend failure).
 *
 * A message matches when any of the following holds:
 * 1. it carries an explicitly labelled 5xx status, e.g. `http 503`,
 *    `status code: 502`, `code=500`;
 * 2. it contains a bare 5xx number together with an availability keyword
 *    (`service unavailable`, `bad gateway`, `gateway timeout`,
 *    `internal server error`, `provider`, `upstream`, or a
 *    response-stream-disconnected marker);
 * 3. it contains an explicit unavailability phrase such as
 *    `service unavailable`, `temporarily unavailable` or
 *    `provider is unavailable`, even without a status code.
 *
 * @param text Error message to inspect; any letter case is accepted.
 * @returns `true` when the message looks like a provider availability failure.
 */
function isProviderUnavailableMessage(text: string): boolean {
  // All patterns below are written in lowercase. Normalize here so the result
  // does not silently depend on the caller having lowercased the message
  // first; lowercasing an already-lowercase string is a no-op.
  const normalized = text.toLowerCase();

  // Rule 1: a 5xx number that is explicitly labelled as an HTTP/status code.
  if (/\b(?:http(?:\s+status)?|status(?:\s+code)?|code)\s*[:=]?\s*5\d\d\b/u.test(normalized)) return true;

  // Rule 2: an unlabelled 5xx number only counts when an availability keyword
  // appears somewhere in the same message.
  if (/\b5\d\d\b/u.test(normalized) && /service unavailable|bad gateway|gateway timeout|internal server error|provider|upstream|response\s*stream\s*disconnected|responsestreamdisconnected/u.test(normalized)) return true;

  // Rule 3: explicit unavailability wording, no status code required.
  if (/service unavailable|temporar(?:y|ily) unavailable|provider (?:is )?unavailable|provider availability|upstream (?:is )?unavailable/u.test(normalized)) return true;

  return false;
}
/**
 * Normalizes a timeout value to a whole number that is at least 1.
 *
 * @param value Requested timeout (presumably milliseconds, judging by the
 *   `requestTimeoutCapMs` fallback name — confirm against callers).
 * @returns `requestTimeoutCapMs` when `value` is not a finite number greater
 *   than zero; otherwise `value` rounded down, but never below 1.
 */
function positiveTimeout(value: number): number {
  const usable = Number.isFinite(value) && value > 0;
  if (!usable) return requestTimeoutCapMs;

  // Fractions in (0, 1) floor to 0, so clamp the result up to 1.
  const whole = Math.floor(value);
  return whole < 1 ? 1 : whole;
}
+1
View File
@@ -15,6 +15,7 @@ export type FailureKind =
| "backend-timeout"
| "provider-auth-failed"
| "provider-rate-limited"
| "provider-unavailable"
| "infra-failed"
| "cancelled";
+3 -1
View File
@@ -34,11 +34,13 @@ const selfTest: SelfTestCase = async (context) => {
await access(path.join(projectedHome, "config.toml"));
await runFailureCase({ client, managerUrl: server.baseUrl, context, mode: "missing-turn-result", expectedStatus: "failed", expectedFailureKind: "backend-response-invalid" });
await runFailureCase({ client, managerUrl: server.baseUrl, context, mode: "provider-503-rpc-error", expectedStatus: "failed", expectedFailureKind: "provider-unavailable" });
await runFailureCase({ client, managerUrl: server.baseUrl, context, mode: "provider-503-terminal", expectedStatus: "failed", expectedFailureKind: "provider-unavailable" });
await runFailureCase({ client, managerUrl: server.baseUrl, context, mode: "invalid-json", expectedStatus: "failed", expectedFailureKind: "backend-json-parse-error" });
await runFailureCase({ client, managerUrl: server.baseUrl, context, mode: "missing-terminal", expectedStatus: "failed", expectedFailureKind: "backend-timeout", timeoutMs: 500 });
await runSpawnFailureCase({ client, managerUrl: server.baseUrl, context });
return { name: "codex-stdio", tests: ["runner-lease-heartbeat", "codex-stdio-fake-turn", "codex-stdio-projected-writable-home", "codex-stdio-missing-turn-result", "codex-stdio-invalid-json", "codex-stdio-timeout", "codex-stdio-spawn-failure"] };
return { name: "codex-stdio", tests: ["runner-lease-heartbeat", "codex-stdio-fake-turn", "codex-stdio-projected-writable-home", "codex-stdio-missing-turn-result", "codex-stdio-provider-503-rpc-error", "codex-stdio-provider-503-terminal", "codex-stdio-invalid-json", "codex-stdio-timeout", "codex-stdio-spawn-failure"] };
} finally {
await new Promise<void>((resolve) => server.server.close(() => resolve()));
}
+12
View File
@@ -35,6 +35,10 @@ for await (const line of rl) {
respond(message.id, {});
continue;
}
if (mode === "provider-503-rpc-error") {
respond(message.id, null, { code: -32000, message: "responseStreamDisconnected: HTTP 503 Service Unavailable from provider" });
continue;
}
if (mode === "missing-terminal") {
turnCounter += 1;
const turn = { id: `turn_selftest_${turnCounter}`, status: "running" };
@@ -42,6 +46,14 @@ for await (const line of rl) {
respond(message.id, { turn });
continue;
}
if (mode === "provider-503-terminal") {
turnCounter += 1;
const turn = { id: `turn_selftest_${turnCounter}`, status: "failed", error: { message: "HTTP 503 Service Unavailable" } };
notify("turn/started", { turn: { id: turn.id, status: "running" } });
notify("turn/completed", { turn });
respond(message.id, { turn });
continue;
}
turnCounter += 1;
const turn = { id: `turn_selftest_${turnCounter}`, status: "completed" };
notify("turn/started", { turn });