fix: cool down Sub2API large context upstream failures

2026-06-10 02:46:11 +00:00
parent 7b869f95a0
commit 8a5a97e07d
5 changed files with 23 additions and 1 deletions
@@ -147,7 +147,7 @@ bun scripts/cli.ts platform-infra sub2api codex-pool configure-local --confirm
 - 上游报 capacity/rate-limit/overload/Bad Gateway/Gateway Timeout 后没有切号或频繁先失败再恢复：先确认 `codex-pool validate` 里 `tempUnschedulable.ok=true` 且目标 account `runtimeEnabled=true`、规则数符合 YAML；再看 `validation.gatewayResponses.evidence.failovers` 的 account/upstream status。若 mismatch，跑 `codex-pool sync --confirm`，不要手工 patch Sub2API credentials。
 - Codex 报 weekly-limit、`less than 10% of your weekly limit left`、`Run /status for a breakdown` 等账号状态/软配额提示并要求切号：把稳定 body 关键词放进 `pool.defaultTempUnschedulable` 的 403 和 429 规则，跑 `codex-pool sync --confirm`，再用 `codex-pool validate` 确认每个 managed account 的 runtime 403/429 rules 都包含这些关键词。Sub2API 临时下线规则按 HTTP status + body keyword 匹配；如果该文案是 HTTP 200 成功内容，需要另提响应分类能力 issue，不能只靠 YAML 冷却规则声明解决。
 - 上游 503 响应体出现 `model_not_found`、`No available channel for model ...` 或同类稳定模型路由失败文案：把稳定 body 关键词放进 `pool.defaultTempUnschedulable` 的 503 规则，跑 `codex-pool sync --confirm`，再用 `codex-pool validate` 确认目标 account 的 runtime 503 rule 包含这些关键词；不要用 account membership、priority、capacity、loadFactor、WebSocket mode 或 User-Agent 改动掩盖该错误族。
- 上游错误反复触发：默认错误冷却按严重程度分层；临时问题可从 10 分钟起步，网关/服务不可用/过载/模型路由类应更长，认证/权限/配额/账号状态类使用最长冷却。`Recovered upstream error ...`、`Bad Gateway`、`Gateway Timeout`、Codex-facing `Upstream request failed`、`Unknown error`、`context deadline exceeded`、`context canceled`、`model_not_found` 和 `No available channel for model` 这类稳定包装文案都应留在 YAML 冷却政策里。具体数值只以 YAML 为准，修改后必须 `codex-pool sync --confirm` 和 `codex-pool validate`。长期判定见 `docs/reference/platform-infra.md`。
+- 上游错误反复触发：默认错误冷却按严重程度分层；临时问题可从 10 分钟起步，网关/服务不可用/过载/模型路由类应更长，认证/权限/配额/账号状态类使用最长冷却。`Recovered upstream error ...`、`Bad Gateway`、`Gateway Timeout`、Codex-facing `Upstream request failed`、`Unknown error`、`context deadline exceeded`、`context canceled`、`model_not_found`、`No available channel for model`，以及大上下文 HTTP 413 响应体里的 `openai_error`、`context length`、`maximum context` 这类稳定包装文案都应留在 YAML 冷却政策里。具体数值只以 YAML 为准，修改后必须 `codex-pool sync --confirm` 和 `codex-pool validate`。长期判定见 `docs/reference/platform-infra.md`。
 - Codex auto compact 后丢上下文：先确认本机 `~/.codex/config.toml` 是否有 `supports_websockets = true` 和 `responses_websockets_v2 = true`，再看 `codex-pool validate` 的 WSv2 candidate 和 Sub2API 日志里的 `transport=responses_websockets_v2`。
 - Codex smoke 有 reconnect/1013：这是上游并发/可用性问题，和 HTTP-only compact context-loss 分开处理；记录 session/log 证据并关联专项 issue，不要用运行时手补覆盖 YAML 容量。

@@ -30,6 +30,10 @@ pool:
        keywords: [capacity, overloaded, temporarily unavailable, temporary, upstream, bad gateway, upstream request failed, websocket dial, handshake response, recovered upstream error]
        durationMinutes: 30
        description: Gateway upstream failures, including recovered upstream error wrappers, should cool down longer.
+      - statusCode: 413
+        keywords: [openai_error, payload too large, request too large, context length, context window, maximum context]
+        durationMinutes: 30
+        description: Large-context upstream failures should cool down the selected account so a larger-context channel can handle the request.
      - statusCode: 503
        keywords: [capacity, overloaded, temporarily unavailable, temporary, upstream, recovered upstream error, model_not_found, no available channel for model]
        durationMinutes: 30
@@ -56,6 +60,7 @@ profiles:
      accountName: unidesk-codex-gptclub
      configFile: config.toml.gptclub
      authFile: auth.json.gptclub
+      capacity: 10
      priority: 100
    - profile: only
      accountName: unidesk-codex-only
@@ -48,6 +48,11 @@ if (parsed.pool?.defaultTempUnschedulable?.enabled === true) {
  const gateway502Rule = rules.find((rule) => rule.statusCode === 502);
  const gateway502Keywords = new Set((gateway502Rule?.keywords ?? []).map((keyword) => keyword.toLowerCase()));
  assertCondition(gateway502Keywords.has("recovered upstream error"), "502 temporary-unschedulable rule must catch recovered upstream error wrappers", gateway502Rule);
+  const largeContext413Rule = rules.find((rule) => rule.statusCode === 413);
+  const largeContext413Keywords = new Set((largeContext413Rule?.keywords ?? []).map((keyword) => keyword.toLowerCase()));
+  for (const keyword of ["openai_error", "context length", "maximum context"]) {
+    assertCondition(largeContext413Keywords.has(keyword), "413 temporary-unschedulable rule must catch large-context upstream failures", { keyword, largeContext413Rule });
+  }
  const gateway504Rule = rules.find((rule) => rule.statusCode === 504);
  const gateway504Keywords = new Set((gateway504Rule?.keywords ?? []).map((keyword) => keyword.toLowerCase()));
  for (const keyword of ["gateway timeout", "unknown error", "context deadline exceeded"]) {
@@ -78,6 +83,7 @@ console.log(JSON.stringify({
    "optional WebSocket mode overrides use supported values",
    "temporary unschedulable rules are structurally valid when enabled",
    "generic recovered upstream error wrappers are caught by cooldown rules",
+    "large-context upstream failures are caught by the 413 cooldown rule",
    "gateway timeout wrappers are caught by the 504 cooldown rule",
    "Codex weekly-limit prompts are caught by account-state and quota cooldown rules",
    "upstream model-routing failures are caught by the 503 cooldown rule",
@@ -31,6 +31,7 @@ const accountState403Rule = rules.find((rule) => rule.error_code === 403);
 const quota429Rule = rules.find((rule) => rule.error_code === 429);
 const serviceUnavailable503Rule = rules.find((rule) => rule.error_code === 503);
 const gatewayTimeout504Rule = rules.find((rule) => rule.error_code === 504);
+const largeContext413Rule = rules.find((rule) => rule.error_code === 413);
 for (const keyword of ["weekly limit", "less than 10% of your weekly limit left", "run /status for a breakdown"]) {
  assertCondition(accountState403Rule?.keywords?.includes(keyword), "403 rendered rule must preserve Codex weekly-limit account-state keyword", { keyword, accountState403Rule });
  assertCondition(quota429Rule?.keywords?.includes(keyword), "429 rendered rule must preserve Codex weekly-limit quota keyword", { keyword, quota429Rule });
@@ -38,6 +39,9 @@ for (const keyword of ["weekly limit", "less than 10% of your weekly limit left"
 for (const keyword of ["model_not_found", "no available channel for model"]) {
  assertCondition(serviceUnavailable503Rule?.keywords?.includes(keyword), "503 rendered rule must catch upstream model-routing failures", { keyword, serviceUnavailable503Rule });
 }
+for (const keyword of ["openai_error", "context length", "maximum context"]) {
+  assertCondition(largeContext413Rule?.keywords?.includes(keyword), "413 rendered rule must catch large-context upstream failures", { keyword, largeContext413Rule });
+}
 for (const keyword of ["gateway timeout", "unknown error", "context deadline exceeded"]) {
  assertCondition(gatewayTimeout504Rule?.keywords?.includes(keyword), "504 rendered rule must preserve gateway-timeout cooldown keyword", { keyword, gatewayTimeout504Rule });
 }
@@ -56,6 +60,7 @@ console.log(JSON.stringify({
    "temporary unschedulable policy renders to Sub2API credential field names",
    "temporary unschedulable rendering follows the input policy without hard-coded policy gates",
    "Codex weekly-limit prompt keywords render into 403 and 429 cooldown rules",
+    "large-context upstream failures render into the 413 cooldown rule",
    "upstream model-routing failures render into the 503 cooldown rule",
    "gateway timeout wrappers render into the 504 cooldown rule",
    "disabled policies clear runtime rules",
@@ -682,6 +682,12 @@ export function defaultCodexTempUnschedulablePolicy(): CodexTempUnschedulablePol
        durationMinutes: 30,
        description: "Gateway upstream failures, including recovered upstream error wrappers, should cool down longer.",
      },
+      {
+        statusCode: 413,
+        keywords: ["openai_error", "payload too large", "request too large", "context length", "context window", "maximum context"],
+        durationMinutes: 30,
+        description: "Large-context upstream failures should cool down the selected account so a larger-context channel can handle the request.",
+      },
      {
        statusCode: 503,
        keywords: ["capacity", "overloaded", "temporarily unavailable", "temporary", "upstream", "recovered upstream error", "model_not_found", "no available channel for model"],