From 7534b87153e637fdb66fd47a9375a8d032752076 Mon Sep 17 00:00:00 2001 From: Codex Date: Fri, 29 May 2026 14:05:15 +0800 Subject: [PATCH 1/2] =?UTF-8?q?fix:=20=E5=BD=92=E7=B1=BB=20v0.1=20provider?= =?UTF-8?q?=20=E5=8F=AF=E7=94=A8=E6=80=A7=E5=A4=B1=E8=B4=A5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/reference/spec-v01-agentrun-mgr.md | 2 +- docs/reference/spec-v01-agentrun-runner.md | 1 + docs/reference/spec-v01-backend-adapter.md | 3 ++- docs/reference/spec-v01-backend-codex.md | 6 +++++- docs/reference/spec-v01-validation.md | 2 ++ src/backend/codex-stdio.ts | 13 ++++++++++++- src/common/types.ts | 1 + src/selftest/cases/30-codex-stdio.ts | 4 +++- src/selftest/fake-codex-app-server.ts | 12 ++++++++++++ 9 files changed, 39 insertions(+), 5 deletions(-) diff --git a/docs/reference/spec-v01-agentrun-mgr.md b/docs/reference/spec-v01-agentrun-mgr.md index 58fc148..0dfcfa0 100644 --- a/docs/reference/spec-v01-agentrun-mgr.md +++ b/docs/reference/spec-v01-agentrun-mgr.md @@ -81,7 +81,7 @@ POST /api/v1/commands/:commandId/ack - events append-only,单 run 内 `seq` 单调递增。 - 每个 run 必须最终出现 `terminal_status`,或保持明确 non-terminal status 并可查询 lease/heartbeat。 -- failureKind 至少能区分 `schema-invalid`、`tenant-policy-denied`、`secret-unavailable`、`runner-lease-conflict`、`backend-failed`、`provider-auth-failed`、`infra-failed`、`cancelled`。 +- failureKind 至少能区分 `schema-invalid`、`tenant-policy-denied`、`secret-unavailable`、`runner-lease-conflict`、`backend-failed`、`provider-auth-failed`、`provider-unavailable`、`infra-failed`、`cancelled`。 - health/readiness 必须返回 Postgres reachable、schema migration ready、SecretRef redacted 状态和 build/source metadata。 - 日志、event、trace、health 和 diagnostics 不得输出 provider credential、Codex auth/config 内容、DSN password、token 或 URL credential。 diff --git a/docs/reference/spec-v01-agentrun-runner.md b/docs/reference/spec-v01-agentrun-runner.md index 0001737..df826c9 100644 --- a/docs/reference/spec-v01-agentrun-runner.md +++ b/docs/reference/spec-v01-agentrun-runner.md @@ -71,6 +71,7 @@ Runner 必须把以下失败归类为结构化 failureKind: - `secret-unavailable`:SecretRef 缺失、RBAC 拒绝或 Secret projection 不完整。 - `provider-auth-failed`:上游 provider 鉴权失败。 +- `provider-unavailable`:上游 provider 返回 HTTP 5xx/503、`Service Unavailable`、携带 5xx 的 `responseStreamDisconnected` 或明确 temporary/provider unavailable 文案;这是外部 provider availability blocker,不得归为本地 `backend-failed`。 - `backend-failed`:backend 进程退出、协议错误或返回 terminal error。 - `runner-lease-conflict`:claim/lease 被其他 runner 持有。 - `infra-failed`:Job 启动、网络、manager API 或文件系统基础设施失败。 diff --git a/docs/reference/spec-v01-backend-adapter.md b/docs/reference/spec-v01-backend-adapter.md index 564bfdb..670cf58 100644 --- a/docs/reference/spec-v01-backend-adapter.md +++ b/docs/reference/spec-v01-backend-adapter.md @@ -51,6 +51,7 @@ Adapter 必须把 backend 错误映射为稳定 failureKind: | `secret-unavailable` | Secret projection 缺失、文件不存在、权限不可读。 | | `provider-auth-failed` | provider credential 或 auth file 无效、上游返回 401/403。 | | `provider-rate-limited` | 上游限流或 quota 错误。 | +| `provider-unavailable` | 上游 provider availability/transient 失败,包括 HTTP 5xx/503、`Service Unavailable`、`responseStreamDisconnected` 携带 5xx 状态码、明确 `provider unavailable` 或 `temporary unavailable` 文案。 | | `backend-protocol-error` | backend 输出无法解析、协议字段缺失。 | | `backend-json-parse-error` | backend stdout 不是合法 JSON-RPC 行。 | | `backend-response-invalid` | backend JSON-RPC response/terminal notification 缺少必需字段。 | @@ -74,7 +75,7 @@ Adapter 必须把 backend 错误映射为稳定 failureKind: ### T2 Failure mapping 自测试 -阅读本文,然后用 mock 错误覆盖 missing secret、provider auth failure、rate limit、protocol error、timeout 和 cancel。确认每类错误都映射到稳定 failureKind,且输出为 JSON 或结构化 event。 +阅读本文,然后用 mock 错误覆盖 missing secret、provider auth failure、rate limit、provider availability/transient failure、protocol error、timeout 和 cancel。provider availability/transient 样例必须至少包含 HTTP 503 `Service Unavailable` 或携带 5xx 的 `responseStreamDisconnected`,并确认不会被归类为 `backend-failed`。确认每类错误都映射到稳定 failureKind,且输出为 JSON 或结构化 event。 ### T3 真实 backend 联调 diff --git a/docs/reference/spec-v01-backend-codex.md b/docs/reference/spec-v01-backend-codex.md index ee34ba1..5a4b292 100644 --- a/docs/reference/spec-v01-backend-codex.md +++ b/docs/reference/spec-v01-backend-codex.md @@ -90,13 +90,17 @@ Run 的 `executionPolicy.secretScope` 应引用 `agentrun-v01-provider-codex` 阅读本文,然后使用无效的 Codex Secret 创建 run。确认 backend 返回 `provider-auth-failed` 或等价 failureKind,记录上游状态分类和 trace correlation,但不打印 Authorization header、token 或 auth/config 文件内容。 +### T5 Provider availability failure + +阅读本文,然后用 mock/fake Codex app-server 自测试 HTTP 503 `Service Unavailable`、携带 5xx 的 `responseStreamDisconnected` 或明确 temporary/provider unavailable 文案。确认 Codex adapter 返回 `provider-unavailable`,不会落到 `backend-failed`;综合联调若真实 provider 返回同类错误,应记录为外部 provider blocker,而不是本地 runner/backend 执行面 blocker。 + ## 规格的实现情况 | 规格项 | 状态 | 说明 | | --- | --- | --- | | Codex backend 规格 | 已定义 | 本文为 v0.1 第一真实 backend 权威。 | | Codex Secret projection | 未实现 | 需要后续 Kubernetes Secret 和 runner/backend manifest。 | -| Codex adapter | 已部分实现 | 当前代码已实现受控 `codex app-server --listen stdio://`、`initialize`/`thread/start`/`thread/resume`/`turn/start` response 校验、stderr 有界诊断、spawn/JSON parse/response invalid/timeout failureKind 和 fake app-server 自测试。 | +| Codex adapter | 已部分实现 | 当前代码已实现受控 `codex app-server --listen stdio://`、`initialize`/`thread/start`/`thread/resume`/`turn/start` response 校验、stderr 有界诊断、spawn/JSON parse/response invalid/timeout/provider 5xx availability failureKind 和 fake app-server 自测试。 | | 错误可观测与脱敏 | 已部分实现 | child env、cwd、workspace 和 Codex home 只输出摘要;stderr tail 有界且标记截断;事件和 failure 统一走 redaction。 | | 真实 provider turn | 未实现 | 综合联调必须真实完成后才能发布通过。 | | hostPath `~/.codex` | 不采用 | 只能通过 Kubernetes Secret projection 注入。 | diff --git a/docs/reference/spec-v01-validation.md b/docs/reference/spec-v01-validation.md index 69de878..ef21399 100644 --- a/docs/reference/spec-v01-validation.md +++ b/docs/reference/spec-v01-validation.md @@ -83,6 +83,8 @@ RESTful API 交互联调必须满足: - `GET /api/v1/runs/:runId/commands/:commandId` 和 `GET /api/v1/runs/:runId/events?afterSeq=N&limit=M` 能轮询到 terminal_status,event `seq` 单调递增,分页重复读取不丢失也不重复。 - 所有成功和失败响应都必须是 JSON;失败响应必须包含可判定的 failureKind、message 和 trace correlation,且不得泄露 Secret value。 +真实 provider 返回 HTTP 5xx/503、`Service Unavailable`、携带 5xx 的 `responseStreamDisconnected` 或明确 temporary/provider unavailable 文案时,综合联调结论应归类为外部 provider availability blocker,failureKind 使用 `provider-unavailable`。这类结果不能证明真实 turn 成功,但也不得被记录为本地 runner/backend 执行面 `backend-failed` blocker。 + CLI 与 RESTful API 可以复用同一个真实 run 做联调。若两者观察到的 run id、command id、state、terminal_status、failureKind、event seq 或 redaction 结果不一致,综合联调不通过。 ## 发布判定 diff --git a/src/backend/codex-stdio.ts b/src/backend/codex-stdio.ts index 0bfb862..df5fdbc 100644 --- a/src/backend/codex-stdio.ts +++ b/src/backend/codex-stdio.ts @@ -435,7 +435,10 @@ function normalizeCodexNotification(message: JsonRecord): { events: BackendEvent const status = terminalStatusFromValue(turn.status); const error = asRecordAt(turn, "error"); const messageText = typeof error.message === "string" ? redactText(error.message) : null; - return { events: [{ type: "backend_status", payload: { phase: method, terminalStatus: status } }], terminal: { status, failureKind: status === "completed" ? null : classifyMessageFailureKind(messageText ?? turn.status, "backend-failed"), message: messageText } }; + const failureKind = status === "completed" ? null : classifyMessageFailureKind(messageText ?? turn.status, "backend-failed"); + const events: BackendEvent[] = [{ type: "backend_status", payload: { phase: method, terminalStatus: status } }]; + if (failureKind) events.push({ type: "error", payload: { failureKind, error: redactJson(error), phase: method } }); + return { events, terminal: { status, failureKind, message: messageText } }; } return { events: [{ type: "backend_status", payload: { phase: method } }] }; } @@ -570,11 +573,19 @@ function classifyMessageFailureKind(message: string, fallback: FailureKind): Fai const text = String(message || "").toLowerCase(); if (/rate.?limit|too many requests|\b429\b/u.test(text)) return "provider-rate-limited"; if (/\b401\b|\b403\b|unauthori[sz]ed|forbidden|invalid api key|authentication|auth failed|oauth|access token/u.test(text)) return "provider-auth-failed"; + if (isProviderUnavailableMessage(text)) return "provider-unavailable"; if (/timed out|timeout|idle timeout/u.test(text)) return "backend-timeout"; if (/invalid json|json parse/u.test(text)) return "backend-json-parse-error"; return fallback; } +function isProviderUnavailableMessage(text: string): boolean { + if (/\b(?:http(?:\s+status)?|status(?:\s+code)?|code)\s*[:=]?\s*5\d\d\b/u.test(text)) return true; + if (/\b5\d\d\b/u.test(text) && /service unavailable|bad gateway|gateway timeout|internal server error|provider|upstream|response\s*stream\s*disconnected|responsestreamdisconnected/u.test(text)) return true; + if (/service unavailable|temporar(?:y|ily) unavailable|provider (?:is )?unavailable|provider availability|upstream (?:is )?unavailable/u.test(text)) return true; + return false; +} + function positiveTimeout(value: number): number { return Number.isFinite(value) && value > 0 ? Math.max(1, Math.floor(value)) : requestTimeoutCapMs; } diff --git a/src/common/types.ts b/src/common/types.ts index 195f4c6..a516722 100644 --- a/src/common/types.ts +++ b/src/common/types.ts @@ -15,6 +15,7 @@ export type FailureKind = | "backend-timeout" | "provider-auth-failed" | "provider-rate-limited" + | "provider-unavailable" | "infra-failed" | "cancelled"; diff --git a/src/selftest/cases/30-codex-stdio.ts b/src/selftest/cases/30-codex-stdio.ts index da37d68..59ec1a8 100644 --- a/src/selftest/cases/30-codex-stdio.ts +++ b/src/selftest/cases/30-codex-stdio.ts @@ -34,11 +34,13 @@ const selfTest: SelfTestCase = async (context) => { await access(path.join(projectedHome, "config.toml")); await runFailureCase({ client, managerUrl: server.baseUrl, context, mode: "missing-turn-result", expectedStatus: "failed", expectedFailureKind: "backend-response-invalid" }); + await runFailureCase({ client, managerUrl: server.baseUrl, context, mode: "provider-503-rpc-error", expectedStatus: "failed", expectedFailureKind: "provider-unavailable" }); + await runFailureCase({ client, managerUrl: server.baseUrl, context, mode: "provider-503-terminal", expectedStatus: "failed", expectedFailureKind: "provider-unavailable" }); await runFailureCase({ client, managerUrl: server.baseUrl, context, mode: "invalid-json", expectedStatus: "failed", expectedFailureKind: "backend-json-parse-error" }); await runFailureCase({ client, managerUrl: server.baseUrl, context, mode: "missing-terminal", expectedStatus: "failed", expectedFailureKind: "backend-timeout", timeoutMs: 500 }); await runSpawnFailureCase({ client, managerUrl: server.baseUrl, context }); - return { name: "codex-stdio", tests: ["runner-lease-heartbeat", "codex-stdio-fake-turn", "codex-stdio-projected-writable-home", "codex-stdio-missing-turn-result", "codex-stdio-invalid-json", "codex-stdio-timeout", "codex-stdio-spawn-failure"] }; + return { name: "codex-stdio", tests: ["runner-lease-heartbeat", "codex-stdio-fake-turn", "codex-stdio-projected-writable-home", "codex-stdio-missing-turn-result", "codex-stdio-provider-503-rpc-error", "codex-stdio-provider-503-terminal", "codex-stdio-invalid-json", "codex-stdio-timeout", "codex-stdio-spawn-failure"] }; } finally { await new Promise((resolve) => server.server.close(() => resolve())); } diff --git a/src/selftest/fake-codex-app-server.ts b/src/selftest/fake-codex-app-server.ts index 2f14559..65ecff3 100644 --- a/src/selftest/fake-codex-app-server.ts +++ b/src/selftest/fake-codex-app-server.ts @@ -35,6 +35,10 @@ for await (const line of rl) { respond(message.id, {}); continue; } + if (mode === "provider-503-rpc-error") { + respond(message.id, null, { code: -32000, message: "responseStreamDisconnected: HTTP 503 Service Unavailable from provider" }); + continue; + } if (mode === "missing-terminal") { turnCounter += 1; const turn = { id: `turn_selftest_${turnCounter}`, status: "running" }; @@ -42,6 +46,14 @@ for await (const line of rl) { respond(message.id, { turn }); continue; } + if (mode === "provider-503-terminal") { + turnCounter += 1; + const turn = { id: `turn_selftest_${turnCounter}`, status: "failed", error: { message: "HTTP 503 Service Unavailable" } }; + notify("turn/started", { turn: { id: turn.id, status: "running" } }); + notify("turn/completed", { turn }); + respond(message.id, { turn }); + continue; + } turnCounter += 1; const turn = { id: `turn_selftest_${turnCounter}`, status: "completed" }; notify("turn/started", { turn }); From 81642fdd54276488d3a945fb36cfdcb7236659fd Mon Sep 17 00:00:00 2001 From: Codex Date: Fri, 29 May 2026 14:17:12 +0800 Subject: [PATCH 2/2] =?UTF-8?q?fix:=20=E8=A6=86=E7=9B=96=20Codex=20retry?= =?UTF-8?q?=20503=20=E5=88=86=E7=B1=BB?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/reference/spec-v01-backend-codex.md | 4 ++-- src/backend/codex-stdio.ts | 13 +++++++++++-- src/selftest/cases/30-codex-stdio.ts | 16 ++++++++++++++-- src/selftest/fake-codex-app-server.ts | 23 +++++++++++++++++++++++ 4 files changed, 50 insertions(+), 6 deletions(-) diff --git a/docs/reference/spec-v01-backend-codex.md b/docs/reference/spec-v01-backend-codex.md index 5a4b292..962c215 100644 --- a/docs/reference/spec-v01-backend-codex.md +++ b/docs/reference/spec-v01-backend-codex.md @@ -92,7 +92,7 @@ Run 的 `executionPolicy.secretScope` 应引用 `agentrun-v01-provider-codex` ### T5 Provider availability failure -阅读本文,然后用 mock/fake Codex app-server 自测试 HTTP 503 `Service Unavailable`、携带 5xx 的 `responseStreamDisconnected` 或明确 temporary/provider unavailable 文案。确认 Codex adapter 返回 `provider-unavailable`,不会落到 `backend-failed`;综合联调若真实 provider 返回同类错误,应记录为外部 provider blocker,而不是本地 runner/backend 执行面 blocker。 +阅读本文,然后用 mock/fake Codex app-server 自测试 HTTP 503 `Service Unavailable`、携带 5xx 的 `responseStreamDisconnected`、`method=error` retry notification 中 `willRetry=true` 且嵌套 `codexErrorInfo.responseStreamDisconnected.httpStatusCode=503` 的结构,或明确 temporary/provider unavailable 文案。确认 Codex adapter 返回 `provider-unavailable`,不会落到 `backend-failed`;综合联调若真实 provider 返回同类错误,应记录为外部 provider blocker,而不是本地 runner/backend 执行面 blocker。 ## 规格的实现情况 @@ -100,7 +100,7 @@ Run 的 `executionPolicy.secretScope` 应引用 `agentrun-v01-provider-codex` | --- | --- | --- | | Codex backend 规格 | 已定义 | 本文为 v0.1 第一真实 backend 权威。 | | Codex Secret projection | 未实现 | 需要后续 Kubernetes Secret 和 runner/backend manifest。 | -| Codex adapter | 已部分实现 | 当前代码已实现受控 `codex app-server --listen stdio://`、`initialize`/`thread/start`/`thread/resume`/`turn/start` response 校验、stderr 有界诊断、spawn/JSON parse/response invalid/timeout/provider 5xx availability failureKind 和 fake app-server 自测试。 | +| Codex adapter | 已部分实现 | 当前代码已实现受控 `codex app-server --listen stdio://`、`initialize`/`thread/start`/`thread/resume`/`turn/start` response 校验、stderr 有界诊断、spawn/JSON parse/response invalid/timeout/provider 5xx availability failureKind,以及包含 retry error notification 的 fake app-server 自测试。 | | 错误可观测与脱敏 | 已部分实现 | child env、cwd、workspace 和 Codex home 只输出摘要;stderr tail 有界且标记截断;事件和 failure 统一走 redaction。 | | 真实 provider turn | 未实现 | 综合联调必须真实完成后才能发布通过。 | | hostPath `~/.codex` | 不采用 | 只能通过 Kubernetes Secret projection 注入。 | diff --git a/src/backend/codex-stdio.ts b/src/backend/codex-stdio.ts index df5fdbc..5a81331 100644 --- a/src/backend/codex-stdio.ts +++ b/src/backend/codex-stdio.ts @@ -423,7 +423,7 @@ function normalizeCodexNotification(message: JsonRecord): { events: BackendEvent if (method === "error") { const error = asRecordAt(params, "error"); const messageText = typeof error.message === "string" ? error.message : "Codex app-server error"; - const failureKind = classifyMessageFailureKind(messageText, "backend-failed"); + const failureKind = classifyCodexErrorRecord(error, "backend-failed"); const terminal = params.willRetry === true ? undefined : { status: "failed" as const, failureKind, message: redactText(messageText) }; return { events: [{ type: "error", payload: { failureKind, error: redactJson(error), willRetry: params.willRetry === true } }], ...(terminal ? { terminal } : {}) }; } @@ -435,7 +435,7 @@ function normalizeCodexNotification(message: JsonRecord): { events: BackendEvent const status = terminalStatusFromValue(turn.status); const error = asRecordAt(turn, "error"); const messageText = typeof error.message === "string" ? redactText(error.message) : null; - const failureKind = status === "completed" ? null : classifyMessageFailureKind(messageText ?? turn.status, "backend-failed"); + const failureKind = status === "completed" ? null : classifyCodexErrorRecord(Object.keys(error).length > 0 ? error : { message: turn.status }, "backend-failed"); const events: BackendEvent[] = [{ type: "backend_status", payload: { phase: method, terminalStatus: status } }]; if (failureKind) events.push({ type: "error", payload: { failureKind, error: redactJson(error), phase: method } }); return { events, terminal: { status, failureKind, message: messageText } }; @@ -569,6 +569,15 @@ function normalizeFailure(error: unknown): CodexStdioFailure { return new CodexStdioFailure(classifyMessageFailureKind(message, "backend-protocol-error"), message, "codex-stdio"); } +function classifyCodexErrorRecord(error: JsonRecord, fallback: FailureKind): FailureKind { + const parts: string[] = []; + if (typeof error.message === "string") parts.push(error.message); + if (typeof error.additionalDetails === "string") parts.push(error.additionalDetails); + const redactedJson = JSON.stringify(redactJson(error as JsonValue)); + if (redactedJson && redactedJson !== "{}") parts.push(redactedJson); + return classifyMessageFailureKind(parts.join("\n"), fallback); +} + function classifyMessageFailureKind(message: string, fallback: FailureKind): FailureKind { const text = String(message || "").toLowerCase(); if (/rate.?limit|too many requests|\b429\b/u.test(text)) return "provider-rate-limited"; diff --git a/src/selftest/cases/30-codex-stdio.ts b/src/selftest/cases/30-codex-stdio.ts index 59ec1a8..9e678dd 100644 --- a/src/selftest/cases/30-codex-stdio.ts +++ b/src/selftest/cases/30-codex-stdio.ts @@ -36,17 +36,18 @@ const selfTest: SelfTestCase = async (context) => { await runFailureCase({ client, managerUrl: server.baseUrl, context, mode: "missing-turn-result", expectedStatus: "failed", expectedFailureKind: "backend-response-invalid" }); await runFailureCase({ client, managerUrl: server.baseUrl, context, mode: "provider-503-rpc-error", expectedStatus: "failed", expectedFailureKind: "provider-unavailable" }); await runFailureCase({ client, managerUrl: server.baseUrl, context, mode: "provider-503-terminal", expectedStatus: "failed", expectedFailureKind: "provider-unavailable" }); + await runFailureCase({ client, managerUrl: server.baseUrl, context, mode: "provider-503-retry-event", expectedStatus: "failed", expectedFailureKind: "provider-unavailable", expectRetryError: true }); await runFailureCase({ client, managerUrl: server.baseUrl, context, mode: "invalid-json", expectedStatus: "failed", expectedFailureKind: "backend-json-parse-error" }); await runFailureCase({ client, managerUrl: server.baseUrl, context, mode: "missing-terminal", expectedStatus: "failed", expectedFailureKind: "backend-timeout", timeoutMs: 500 }); await runSpawnFailureCase({ client, managerUrl: server.baseUrl, context }); - return { name: "codex-stdio", tests: ["runner-lease-heartbeat", "codex-stdio-fake-turn", "codex-stdio-projected-writable-home", "codex-stdio-missing-turn-result", "codex-stdio-provider-503-rpc-error", "codex-stdio-provider-503-terminal", "codex-stdio-invalid-json", "codex-stdio-timeout", "codex-stdio-spawn-failure"] }; + return { name: "codex-stdio", tests: ["runner-lease-heartbeat", "codex-stdio-fake-turn", "codex-stdio-projected-writable-home", "codex-stdio-missing-turn-result", "codex-stdio-provider-503-rpc-error", "codex-stdio-provider-503-terminal", "codex-stdio-provider-503-retry-event", "codex-stdio-invalid-json", "codex-stdio-timeout", "codex-stdio-spawn-failure"] }; } finally { await new Promise((resolve) => server.server.close(() => resolve())); } }; -async function runFailureCase(options: { client: ManagerClient; managerUrl: string; context: SelfTestContext; mode: string; expectedStatus: TerminalStatus; expectedFailureKind: FailureKind; timeoutMs?: number }): Promise { +async function runFailureCase(options: { client: ManagerClient; managerUrl: string; context: SelfTestContext; mode: string; expectedStatus: TerminalStatus; expectedFailureKind: FailureKind; timeoutMs?: number; expectRetryError?: boolean }): Promise { const item = await createRunWithCommand(options.client, options.context, `failure ${options.mode}`, `selftest-${options.mode}`, options.timeoutMs ?? 3_000); const result = await runOnce({ managerUrl: options.managerUrl, @@ -60,11 +61,22 @@ async function runFailureCase(options: { client: ManagerClient; managerUrl: stri assert.equal(result.failureKind, options.expectedFailureKind, options.mode); const events = await options.client.get(`/api/v1/runs/${item.runId}/events?afterSeq=0&limit=100`) as { items?: Array<{ type: string; payload: unknown }> }; assert.ok(events.items?.some((event) => event.type === "error"), options.mode); + assert.ok(events.items?.some((event) => event.type === "error" && eventPayload(event).failureKind === options.expectedFailureKind), `${options.mode} expected error event failureKind ${options.expectedFailureKind}`); + if (options.expectRetryError) { + assert.ok(events.items?.some((event) => { + const payload = eventPayload(event); + return event.type === "error" && payload.willRetry === true && payload.failureKind === options.expectedFailureKind; + }), `${options.mode} expected retry error event failureKind ${options.expectedFailureKind}`); + } const command = await options.client.get(`/api/v1/runs/${item.runId}/commands/${item.commandId}`) as { state?: string }; assert.equal(command.state, "failed", options.mode); assertNoSecretLeak(events); } +function eventPayload(event: { payload: unknown }): JsonRecord { + return typeof event.payload === "object" && event.payload !== null && !Array.isArray(event.payload) ? event.payload as JsonRecord : {}; +} + async function runSpawnFailureCase(options: { client: ManagerClient; managerUrl: string; context: SelfTestContext }): Promise { const item = await createRunWithCommand(options.client, options.context, "failure spawn", "selftest-spawn-failure", 3_000); const result = await runOnce({ diff --git a/src/selftest/fake-codex-app-server.ts b/src/selftest/fake-codex-app-server.ts index 65ecff3..1156ae1 100644 --- a/src/selftest/fake-codex-app-server.ts +++ b/src/selftest/fake-codex-app-server.ts @@ -54,6 +54,29 @@ for await (const line of rl) { respond(message.id, { turn }); continue; } + if (mode === "provider-503-retry-event") { + turnCounter += 1; + const turn = { + id: `turn_selftest_${turnCounter}`, + status: "failed", + error: { + message: "unexpected status 503 Service Unavailable: Service temporarily unavailable", + codexErrorInfo: { responseStreamDisconnected: { httpStatusCode: 503 } }, + }, + }; + notify("turn/started", { turn: { id: turn.id, status: "running" } }); + notify("error", { + willRetry: true, + error: { + message: "Reconnecting... 1/5", + codexErrorInfo: { responseStreamDisconnected: { httpStatusCode: 503 } }, + additionalDetails: "unexpected status 503 Service Unavailable: Service temporarily unavailable, url: https://hyueapi.com/responses", + }, + }); + notify("turn/completed", { turn }); + respond(message.id, { turn }); + continue; + } turnCounter += 1; const turn = { id: `turn_selftest_${turnCounter}`, status: "completed" }; notify("turn/started", { turn });