fix: 覆盖 Codex retry 503 分类
This commit is contained in:
@@ -92,7 +92,7 @@ Run 的 `executionPolicy.secretScope` 应引用 `agentrun-v01-provider-codex`
|
||||
|
||||
### T5 Provider availability failure
|
||||
|
||||
阅读本文,然后用 mock/fake Codex app-server 自测试 HTTP 503 `Service Unavailable`、携带 5xx 的 `responseStreamDisconnected` 或明确 temporary/provider unavailable 文案。确认 Codex adapter 返回 `provider-unavailable`,不会落到 `backend-failed`;综合联调若真实 provider 返回同类错误,应记录为外部 provider blocker,而不是本地 runner/backend 执行面 blocker。
|
||||
阅读本文,然后用 mock/fake Codex app-server 自测试 HTTP 503 `Service Unavailable`、携带 5xx 的 `responseStreamDisconnected`、`method=error` retry notification 中 `willRetry=true` 且嵌套 `codexErrorInfo.responseStreamDisconnected.httpStatusCode=503` 的结构,或明确 temporary/provider unavailable 文案。确认 Codex adapter 返回 `provider-unavailable`,不会落到 `backend-failed`;综合联调若真实 provider 返回同类错误,应记录为外部 provider blocker,而不是本地 runner/backend 执行面 blocker。
|
||||
|
||||
## 规格的实现情况
|
||||
|
||||
@@ -100,7 +100,7 @@ Run 的 `executionPolicy.secretScope` 应引用 `agentrun-v01-provider-codex`
|
||||
| --- | --- | --- |
|
||||
| Codex backend 规格 | 已定义 | 本文为 v0.1 第一真实 backend 权威。 |
|
||||
| Codex Secret projection | 未实现 | 需要后续 Kubernetes Secret 和 runner/backend manifest。 |
|
||||
| Codex adapter | 已部分实现 | 当前代码已实现受控 `codex app-server --listen stdio://`、`initialize`/`thread/start`/`thread/resume`/`turn/start` response 校验、stderr 有界诊断、spawn/JSON parse/response invalid/timeout/provider 5xx availability failureKind 和 fake app-server 自测试。 |
|
||||
| Codex adapter | 已部分实现 | 当前代码已实现受控 `codex app-server --listen stdio://`、`initialize`/`thread/start`/`thread/resume`/`turn/start` response 校验、stderr 有界诊断、spawn/JSON parse/response invalid/timeout/provider 5xx availability failureKind,以及包含 retry error notification 的 fake app-server 自测试。 |
|
||||
| 错误可观测与脱敏 | 已部分实现 | child env、cwd、workspace 和 Codex home 只输出摘要;stderr tail 有界且标记截断;事件和 failure 统一走 redaction。 |
|
||||
| 真实 provider turn | 未实现 | 综合联调必须真实完成后才能发布通过。 |
|
||||
| hostPath `~/.codex` | 不采用 | 只能通过 Kubernetes Secret projection 注入。 |
|
||||
|
||||
@@ -423,7 +423,7 @@ function normalizeCodexNotification(message: JsonRecord): { events: BackendEvent
|
||||
if (method === "error") {
|
||||
const error = asRecordAt(params, "error");
|
||||
const messageText = typeof error.message === "string" ? error.message : "Codex app-server error";
|
||||
const failureKind = classifyMessageFailureKind(messageText, "backend-failed");
|
||||
const failureKind = classifyCodexErrorRecord(error, "backend-failed");
|
||||
const terminal = params.willRetry === true ? undefined : { status: "failed" as const, failureKind, message: redactText(messageText) };
|
||||
return { events: [{ type: "error", payload: { failureKind, error: redactJson(error), willRetry: params.willRetry === true } }], ...(terminal ? { terminal } : {}) };
|
||||
}
|
||||
@@ -435,7 +435,7 @@ function normalizeCodexNotification(message: JsonRecord): { events: BackendEvent
|
||||
const status = terminalStatusFromValue(turn.status);
|
||||
const error = asRecordAt(turn, "error");
|
||||
const messageText = typeof error.message === "string" ? redactText(error.message) : null;
|
||||
const failureKind = status === "completed" ? null : classifyMessageFailureKind(messageText ?? turn.status, "backend-failed");
|
||||
const failureKind = status === "completed" ? null : classifyCodexErrorRecord(Object.keys(error).length > 0 ? error : { message: turn.status }, "backend-failed");
|
||||
const events: BackendEvent[] = [{ type: "backend_status", payload: { phase: method, terminalStatus: status } }];
|
||||
if (failureKind) events.push({ type: "error", payload: { failureKind, error: redactJson(error), phase: method } });
|
||||
return { events, terminal: { status, failureKind, message: messageText } };
|
||||
@@ -569,6 +569,15 @@ function normalizeFailure(error: unknown): CodexStdioFailure {
|
||||
return new CodexStdioFailure(classifyMessageFailureKind(message, "backend-protocol-error"), message, "codex-stdio");
|
||||
}
|
||||
|
||||
/**
 * Derives a failure kind from a structured Codex error record.
 *
 * The text handed to `classifyMessageFailureKind` is built, in order, from:
 *   1. `error.message`, when it is a string;
 *   2. `error.additionalDetails`, when it is a string;
 *   3. the JSON serialization of `redactJson(error)`, unless it is empty or `"{}"`.
 * The pieces are joined with a newline.
 *
 * Including the serialized record makes nested fields (for example an HTTP status
 * code under `codexErrorInfo`) visible to the text-based classifier even when the
 * top-level message is uninformative.
 *
 * @param error - Error record taken from a Codex notification or turn.
 * @param fallback - Failure kind forwarded to `classifyMessageFailureKind` as its fallback.
 * @returns The failure kind chosen by `classifyMessageFailureKind`.
 */
function classifyCodexErrorRecord(error: JsonRecord, fallback: FailureKind): FailureKind {
  const serialized = JSON.stringify(redactJson(error as JsonValue));
  const candidates: unknown[] = [
    error.message,
    error.additionalDetails,
    // An empty serialization or a bare "{}" carries no signal for the classifier.
    serialized && serialized !== "{}" ? serialized : undefined,
  ];
  // Keep only string pieces; empty strings are kept on purpose so the joined text is unchanged.
  const haystack = candidates.filter((part): part is string => typeof part === "string").join("\n");
  return classifyMessageFailureKind(haystack, fallback);
}
|
||||
|
||||
function classifyMessageFailureKind(message: string, fallback: FailureKind): FailureKind {
|
||||
const text = String(message || "").toLowerCase();
|
||||
if (/rate.?limit|too many requests|\b429\b/u.test(text)) return "provider-rate-limited";
|
||||
|
||||
@@ -36,17 +36,18 @@ const selfTest: SelfTestCase = async (context) => {
|
||||
await runFailureCase({ client, managerUrl: server.baseUrl, context, mode: "missing-turn-result", expectedStatus: "failed", expectedFailureKind: "backend-response-invalid" });
|
||||
await runFailureCase({ client, managerUrl: server.baseUrl, context, mode: "provider-503-rpc-error", expectedStatus: "failed", expectedFailureKind: "provider-unavailable" });
|
||||
await runFailureCase({ client, managerUrl: server.baseUrl, context, mode: "provider-503-terminal", expectedStatus: "failed", expectedFailureKind: "provider-unavailable" });
|
||||
await runFailureCase({ client, managerUrl: server.baseUrl, context, mode: "provider-503-retry-event", expectedStatus: "failed", expectedFailureKind: "provider-unavailable", expectRetryError: true });
|
||||
await runFailureCase({ client, managerUrl: server.baseUrl, context, mode: "invalid-json", expectedStatus: "failed", expectedFailureKind: "backend-json-parse-error" });
|
||||
await runFailureCase({ client, managerUrl: server.baseUrl, context, mode: "missing-terminal", expectedStatus: "failed", expectedFailureKind: "backend-timeout", timeoutMs: 500 });
|
||||
await runSpawnFailureCase({ client, managerUrl: server.baseUrl, context });
|
||||
|
||||
return { name: "codex-stdio", tests: ["runner-lease-heartbeat", "codex-stdio-fake-turn", "codex-stdio-projected-writable-home", "codex-stdio-missing-turn-result", "codex-stdio-provider-503-rpc-error", "codex-stdio-provider-503-terminal", "codex-stdio-invalid-json", "codex-stdio-timeout", "codex-stdio-spawn-failure"] };
|
||||
return { name: "codex-stdio", tests: ["runner-lease-heartbeat", "codex-stdio-fake-turn", "codex-stdio-projected-writable-home", "codex-stdio-missing-turn-result", "codex-stdio-provider-503-rpc-error", "codex-stdio-provider-503-terminal", "codex-stdio-provider-503-retry-event", "codex-stdio-invalid-json", "codex-stdio-timeout", "codex-stdio-spawn-failure"] };
|
||||
} finally {
|
||||
await new Promise<void>((resolve) => server.server.close(() => resolve()));
|
||||
}
|
||||
};
|
||||
|
||||
async function runFailureCase(options: { client: ManagerClient; managerUrl: string; context: SelfTestContext; mode: string; expectedStatus: TerminalStatus; expectedFailureKind: FailureKind; timeoutMs?: number }): Promise<void> {
|
||||
async function runFailureCase(options: { client: ManagerClient; managerUrl: string; context: SelfTestContext; mode: string; expectedStatus: TerminalStatus; expectedFailureKind: FailureKind; timeoutMs?: number; expectRetryError?: boolean }): Promise<void> {
|
||||
const item = await createRunWithCommand(options.client, options.context, `failure ${options.mode}`, `selftest-${options.mode}`, options.timeoutMs ?? 3_000);
|
||||
const result = await runOnce({
|
||||
managerUrl: options.managerUrl,
|
||||
@@ -60,11 +61,22 @@ async function runFailureCase(options: { client: ManagerClient; managerUrl: stri
|
||||
assert.equal(result.failureKind, options.expectedFailureKind, options.mode);
|
||||
const events = await options.client.get(`/api/v1/runs/${item.runId}/events?afterSeq=0&limit=100`) as { items?: Array<{ type: string; payload: unknown }> };
|
||||
assert.ok(events.items?.some((event) => event.type === "error"), options.mode);
|
||||
assert.ok(events.items?.some((event) => event.type === "error" && eventPayload(event).failureKind === options.expectedFailureKind), `${options.mode} expected error event failureKind ${options.expectedFailureKind}`);
|
||||
if (options.expectRetryError) {
|
||||
assert.ok(events.items?.some((event) => {
|
||||
const payload = eventPayload(event);
|
||||
return event.type === "error" && payload.willRetry === true && payload.failureKind === options.expectedFailureKind;
|
||||
}), `${options.mode} expected retry error event failureKind ${options.expectedFailureKind}`);
|
||||
}
|
||||
const command = await options.client.get(`/api/v1/runs/${item.runId}/commands/${item.commandId}`) as { state?: string };
|
||||
assert.equal(command.state, "failed", options.mode);
|
||||
assertNoSecretLeak(events);
|
||||
}
|
||||
|
||||
/**
 * Returns the event's payload as a record so its fields can be read directly.
 *
 * @param event - Event whose `payload` has an unknown shape.
 * @returns The payload itself when it is a non-null, non-array object; otherwise a new empty record.
 */
function eventPayload(event: { payload: unknown }): JsonRecord {
  const { payload } = event;
  // Anything that is not a plain object-like value (null, primitives, arrays) yields an empty record.
  if (payload === null || typeof payload !== "object" || Array.isArray(payload)) {
    return {};
  }
  return payload as JsonRecord;
}
|
||||
|
||||
async function runSpawnFailureCase(options: { client: ManagerClient; managerUrl: string; context: SelfTestContext }): Promise<void> {
|
||||
const item = await createRunWithCommand(options.client, options.context, "failure spawn", "selftest-spawn-failure", 3_000);
|
||||
const result = await runOnce({
|
||||
|
||||
@@ -54,6 +54,29 @@ for await (const line of rl) {
|
||||
respond(message.id, { turn });
|
||||
continue;
|
||||
}
|
||||
if (mode === "provider-503-retry-event") {
|
||||
turnCounter += 1;
|
||||
const turn = {
|
||||
id: `turn_selftest_${turnCounter}`,
|
||||
status: "failed",
|
||||
error: {
|
||||
message: "unexpected status 503 Service Unavailable: Service temporarily unavailable",
|
||||
codexErrorInfo: { responseStreamDisconnected: { httpStatusCode: 503 } },
|
||||
},
|
||||
};
|
||||
notify("turn/started", { turn: { id: turn.id, status: "running" } });
|
||||
notify("error", {
|
||||
willRetry: true,
|
||||
error: {
|
||||
message: "Reconnecting... 1/5",
|
||||
codexErrorInfo: { responseStreamDisconnected: { httpStatusCode: 503 } },
|
||||
additionalDetails: "unexpected status 503 Service Unavailable: Service temporarily unavailable, url: https://hyueapi.com/responses",
|
||||
},
|
||||
});
|
||||
notify("turn/completed", { turn });
|
||||
respond(message.id, { turn });
|
||||
continue;
|
||||
}
|
||||
turnCounter += 1;
|
||||
const turn = { id: `turn_selftest_${turnCounter}`, status: "completed" };
|
||||
notify("turn/started", { turn });
|
||||
|
||||
Reference in New Issue
Block a user