fix: cool down Sub2API large context upstream failures

This commit is contained in:
Codex
2026-06-10 02:46:11 +00:00
parent 7b869f95a0
commit 8a5a97e07d
5 changed files with 23 additions and 1 deletions
+1 -1
View File
@@ -147,7 +147,7 @@ bun scripts/cli.ts platform-infra sub2api codex-pool configure-local --confirm
- 上游报 capacity/rate-limit/overload/Bad Gateway/Gateway Timeout 后没有切号或频繁先失败再恢复:先确认 `codex-pool validate` 的 `tempUnschedulable.ok=true` 且目标 account `runtimeEnabled=true`、规则数符合 YAML;再看 `validation.gatewayResponses.evidence.failovers` 的 account/upstream status。若 mismatch,跑 `codex-pool sync --confirm`,不要手工 patch Sub2API credentials。
- Codex 报 weekly-limit、`less than 10% of your weekly limit left`、`Run /status for a breakdown` 等账号状态/软配额提示并要求切号:把稳定 body 关键词放进 `pool.defaultTempUnschedulable` 的 403 和 429 规则,跑 `codex-pool sync --confirm`,再用 `codex-pool validate` 确认每个 managed account 的 runtime 403/429 rules 都包含这些关键词。Sub2API 临时下线规则按 HTTP status + body keyword 匹配;如果该文案是 HTTP 200 成功内容,需要另提响应分类能力 issue,不能只靠 YAML 冷却规则声明解决。
- 上游 503 响应体出现 `model_not_found`、`No available channel for model ...` 或同类稳定模型路由失败文案:把稳定 body 关键词放进 `pool.defaultTempUnschedulable` 的 503 规则,跑 `codex-pool sync --confirm`,再用 `codex-pool validate` 确认目标 account 的 runtime 503 rule 包含这些关键词;不要用 account membership、priority、capacity、loadFactor、WebSocket mode 或 User-Agent 改动掩盖该错误族。
- 上游错误反复触发:默认错误冷却按严重程度分层;临时问题可从 10 分钟起步,网关/服务不可用/过载/模型路由类应更长,认证/权限/配额/账号状态类使用最长冷却。`Recovered upstream error ...`、`Bad Gateway`、`Gateway Timeout`、Codex-facing `Upstream request failed`、`Unknown error`、`context deadline exceeded`、`context canceled`、`model_not_found`、`No available channel for model` 这类稳定包装文案都应留在 YAML 冷却政策里。具体数值只以 YAML 为准,修改后必须先 `codex-pool sync --confirm` 再 `codex-pool validate`。长期判定见 `docs/reference/platform-infra.md`。
- 上游错误反复触发:默认错误冷却按严重程度分层;临时问题可从 10 分钟起步,网关/服务不可用/过载/模型路由类应更长,认证/权限/配额/账号状态类使用最长冷却。`Recovered upstream error ...`、`Bad Gateway`、`Gateway Timeout`、Codex-facing `Upstream request failed`、`Unknown error`、`context deadline exceeded`、`context canceled`、`model_not_found`、`No available channel for model`、大上下文 `413`、`openai_error` 这类稳定包装文案都应留在 YAML 冷却政策里。具体数值只以 YAML 为准,修改后必须先 `codex-pool sync --confirm` 再 `codex-pool validate`。长期判定见 `docs/reference/platform-infra.md`。
- Codex auto compact 后丢上下文:先确认本机 `~/.codex/config.toml` 是否有 `supports_websockets = true` 和 `responses_websockets_v2 = true`,再看 `codex-pool validate` 的 WSv2 candidate 和 Sub2API 日志里的 `transport=responses_websockets_v2`。
- Codex smoke 有 reconnect/1013:这是上游并发/可用性问题,和 HTTP-only compact context-loss 分开处理;记录 session/log 证据并关联专项 issue,不要用运行时手补覆盖 YAML 容量。
@@ -30,6 +30,10 @@ pool:
keywords: [capacity, overloaded, temporarily unavailable, temporary, upstream, bad gateway, upstream request failed, websocket dial, handshake response, recovered upstream error]
durationMinutes: 30
description: Gateway upstream failures, including recovered upstream error wrappers, should cool down longer.
# Temporary-unschedulable rule for upstream HTTP 413 responses. Per the
# runbook text in this change, rules match on HTTP status plus a body keyword;
# a match cools the selected account down for `durationMinutes`.
- statusCode: 413
keywords: [openai_error, payload too large, request too large, context length, context window, maximum context]
durationMinutes: 30
description: Large-context upstream failures should cool down the selected account so a larger-context channel can handle the request.
- statusCode: 503
keywords: [capacity, overloaded, temporarily unavailable, temporary, upstream, recovered upstream error, model_not_found, no available channel for model]
durationMinutes: 30
@@ -56,6 +60,7 @@ profiles:
accountName: unidesk-codex-gptclub
configFile: config.toml.gptclub
authFile: auth.json.gptclub
capacity: 10
priority: 100
- profile: only
accountName: unidesk-codex-only
@@ -48,6 +48,11 @@ if (parsed.pool?.defaultTempUnschedulable?.enabled === true) {
// 502 rule: look up the policy entry by status code and lower-case its
// keywords so the membership check does not depend on how the policy spells
// them. A missing rule (or missing keywords) yields an empty Set.
const gateway502Rule = rules.find((rule) => rule.statusCode === 502);
const gateway502Keywords = new Set((gateway502Rule?.keywords ?? []).map((keyword) => keyword.toLowerCase()));
// NOTE(review): assertCondition is defined outside this hunk; presumably it
// fails when its first argument is falsy and reports the message plus the
// third argument as context — confirm against its definition.
assertCondition(gateway502Keywords.has("recovered upstream error"), "502 temporary-unschedulable rule must catch recovered upstream error wrappers", gateway502Rule);
// 413 rule: same lookup and lower-casing for the large-context policy entry.
const largeContext413Rule = rules.find((rule) => rule.statusCode === 413);
const largeContext413Keywords = new Set((largeContext413Rule?.keywords ?? []).map((keyword) => keyword.toLowerCase()));
// Each of these keywords is checked individually, so the context object
// identifies exactly which keyword is absent from the 413 rule.
for (const keyword of ["openai_error", "context length", "maximum context"]) {
assertCondition(largeContext413Keywords.has(keyword), "413 temporary-unschedulable rule must catch large-context upstream failures", { keyword, largeContext413Rule });
}
const gateway504Rule = rules.find((rule) => rule.statusCode === 504);
const gateway504Keywords = new Set((gateway504Rule?.keywords ?? []).map((keyword) => keyword.toLowerCase()));
for (const keyword of ["gateway timeout", "unknown error", "context deadline exceeded"]) {
@@ -78,6 +83,7 @@ console.log(JSON.stringify({
"optional WebSocket mode overrides use supported values",
"temporary unschedulable rules are structurally valid when enabled",
"generic recovered upstream error wrappers are caught by cooldown rules",
"large-context upstream failures are caught by the 413 cooldown rule",
"gateway timeout wrappers are caught by the 504 cooldown rule",
"Codex weekly-limit prompts are caught by account-state and quota cooldown rules",
"upstream model-routing failures are caught by the 503 cooldown rule",
@@ -31,6 +31,7 @@ const accountState403Rule = rules.find((rule) => rule.error_code === 403);
// Look up rendered rules by their `error_code` field (the rendered shape uses
// `error_code`, unlike the `statusCode` key used in the policy input).
// Each lookup is `undefined` when no rule with that code was rendered.
const quota429Rule = rules.find((rule) => rule.error_code === 429);
const serviceUnavailable503Rule = rules.find((rule) => rule.error_code === 503);
const gatewayTimeout504Rule = rules.find((rule) => rule.error_code === 504);
const largeContext413Rule = rules.find((rule) => rule.error_code === 413);
for (const keyword of ["weekly limit", "less than 10% of your weekly limit left", "run /status for a breakdown"]) {
assertCondition(accountState403Rule?.keywords?.includes(keyword), "403 rendered rule must preserve Codex weekly-limit account-state keyword", { keyword, accountState403Rule });
assertCondition(quota429Rule?.keywords?.includes(keyword), "429 rendered rule must preserve Codex weekly-limit quota keyword", { keyword, quota429Rule });
@@ -38,6 +39,9 @@ for (const keyword of ["weekly limit", "less than 10% of your weekly limit left"
// Each loop checks that the rendered rule for one status code still carries
// every required keyword. The optional chaining makes a missing rule (or a
// rule without `keywords`) evaluate to `undefined`, which is passed to
// assertCondition as the condition.
// NOTE(review): `includes` is an exact, case-sensitive match here (no
// lower-casing) — presumably rendered keywords are already lower-case; confirm.

// 503: upstream model-routing failures.
for (const keyword of ["model_not_found", "no available channel for model"]) {
assertCondition(serviceUnavailable503Rule?.keywords?.includes(keyword), "503 rendered rule must catch upstream model-routing failures", { keyword, serviceUnavailable503Rule });
}
// 413: large-context upstream failures.
for (const keyword of ["openai_error", "context length", "maximum context"]) {
assertCondition(largeContext413Rule?.keywords?.includes(keyword), "413 rendered rule must catch large-context upstream failures", { keyword, largeContext413Rule });
}
// 504: gateway-timeout wrappers.
for (const keyword of ["gateway timeout", "unknown error", "context deadline exceeded"]) {
assertCondition(gatewayTimeout504Rule?.keywords?.includes(keyword), "504 rendered rule must preserve gateway-timeout cooldown keyword", { keyword, gatewayTimeout504Rule });
}
@@ -56,6 +60,7 @@ console.log(JSON.stringify({
"temporary unschedulable policy renders to Sub2API credential field names",
"temporary unschedulable rendering follows the input policy without hard-coded policy gates",
"Codex weekly-limit prompt keywords render into 403 and 429 cooldown rules",
"large-context upstream failures render into the 413 cooldown rule",
"upstream model-routing failures render into the 503 cooldown rule",
"gateway timeout wrappers render into the 504 cooldown rule",
"disabled policies clear runtime rules",
@@ -682,6 +682,12 @@ export function defaultCodexTempUnschedulablePolicy(): CodexTempUnschedulablePol
durationMinutes: 30,
description: "Gateway upstream failures, including recovered upstream error wrappers, should cool down longer.",
},
// Default cooldown rule for upstream HTTP 413 responses whose body carries
// one of the large-context keywords below; mirrors the 413 entry in the
// YAML policy shown in this change.
{
statusCode: 413,
keywords: ["openai_error", "payload too large", "request too large", "context length", "context window", "maximum context"],
durationMinutes: 30,
description: "Large-context upstream failures should cool down the selected account so a larger-context channel can handle the request.",
},
{
statusCode: 503,
keywords: ["capacity", "overloaded", "temporarily unavailable", "temporary", "upstream", "recovered upstream error", "model_not_found", "no available channel for model"],