diff --git a/config/platform-infra/sub2api-codex-pool.yaml b/config/platform-infra/sub2api-codex-pool.yaml index 229a1964..5b7cd66c 100644 --- a/config/platform-infra/sub2api-codex-pool.yaml +++ b/config/platform-infra/sub2api-codex-pool.yaml @@ -34,6 +34,10 @@ pool: keywords: [capacity, overloaded, temporarily unavailable, temporary, upstream, recovered upstream error, model_not_found, no available channel for model] durationMinutes: 30 description: Service unavailable and upstream model-routing failures should cool down longer than one-off transient failures. + - statusCode: 504 + keywords: [gateway timeout, timeout, upstream, upstream request failed, unknown error, context deadline exceeded, context canceled, recovered upstream error] + durationMinutes: 30 + description: Gateway timeout responses should cool down the selected account so another account can handle the next request. - statusCode: 529 keywords: [capacity, overloaded, temporarily unavailable, temporary, recovered upstream error] durationMinutes: 30 diff --git a/scripts/platform-infra-sub2api-codex-routing-contract-test.ts b/scripts/platform-infra-sub2api-codex-routing-contract-test.ts index 34bbc691..6cdd1807 100644 --- a/scripts/platform-infra-sub2api-codex-routing-contract-test.ts +++ b/scripts/platform-infra-sub2api-codex-routing-contract-test.ts @@ -48,6 +48,11 @@ if (parsed.pool?.defaultTempUnschedulable?.enabled === true) { const gateway502Rule = rules.find((rule) => rule.statusCode === 502); const gateway502Keywords = new Set((gateway502Rule?.keywords ?? []).map((keyword) => keyword.toLowerCase())); assertCondition(gateway502Keywords.has("recovered upstream error"), "502 temporary-unschedulable rule must catch recovered upstream error wrappers", gateway502Rule); + const gateway504Rule = rules.find((rule) => rule.statusCode === 504); + const gateway504Keywords = new Set((gateway504Rule?.keywords ?? []).map((keyword) => keyword.toLowerCase())); + for (const keyword of ["gateway timeout", "unknown error", "context deadline exceeded"]) { + assertCondition(gateway504Keywords.has(keyword), "504 temporary-unschedulable rule must catch gateway timeout wrappers", { keyword, gateway504Rule }); + } const accountState403Rule = rules.find((rule) => rule.statusCode === 403); const quota429Rule = rules.find((rule) => rule.statusCode === 429); const serviceUnavailable503Rule = rules.find((rule) => rule.statusCode === 503); @@ -73,6 +78,7 @@ console.log(JSON.stringify({ "optional WebSocket mode overrides use supported values", "temporary unschedulable rules are structurally valid when enabled", "generic recovered upstream error wrappers are caught by cooldown rules", + "gateway timeout wrappers are caught by the 504 cooldown rule", "Codex weekly-limit prompts are caught by account-state and quota cooldown rules", "upstream model-routing failures are caught by the 503 cooldown rule", "Responses smoke model is YAML-declared", diff --git a/scripts/platform-infra-sub2api-codex-temp-unsched-contract-test.ts b/scripts/platform-infra-sub2api-codex-temp-unsched-contract-test.ts index a1e2da29..99e6ec38 100644 --- a/scripts/platform-infra-sub2api-codex-temp-unsched-contract-test.ts +++ b/scripts/platform-infra-sub2api-codex-temp-unsched-contract-test.ts @@ -30,6 +30,7 @@ assertCondition(!("api_key" in credentials) && !("base_url" in credentials), "te const accountState403Rule = rules.find((rule) => rule.error_code === 403); const quota429Rule = rules.find((rule) => rule.error_code === 429); const serviceUnavailable503Rule = rules.find((rule) => rule.error_code === 503); +const gatewayTimeout504Rule = rules.find((rule) => rule.error_code === 504); for (const keyword of ["weekly limit", "less than 10% of your weekly limit left", "run /status for a breakdown"]) { assertCondition(accountState403Rule?.keywords?.includes(keyword), "403 rendered rule must preserve Codex weekly-limit account-state keyword", { keyword, accountState403Rule }); assertCondition(quota429Rule?.keywords?.includes(keyword), "429 rendered rule must preserve Codex weekly-limit quota keyword", { keyword, quota429Rule }); @@ -37,6 +38,9 @@ for (const keyword of ["weekly limit", "less than 10% of your weekly limit left" for (const keyword of ["model_not_found", "no available channel for model"]) { assertCondition(serviceUnavailable503Rule?.keywords?.includes(keyword), "503 rendered rule must catch upstream model-routing failures", { keyword, serviceUnavailable503Rule }); } +for (const keyword of ["gateway timeout", "unknown error", "context deadline exceeded"]) { + assertCondition(gatewayTimeout504Rule?.keywords?.includes(keyword), "504 rendered rule must preserve gateway-timeout cooldown keyword", { keyword, gatewayTimeout504Rule }); +} const disabled = renderSub2ApiTempUnschedulableCredentials({ enabled: false, rules: policy.rules }) as { temp_unschedulable_enabled?: boolean; @@ -53,6 +57,7 @@ console.log(JSON.stringify({ "temporary unschedulable rendering follows the input policy without hard-coded policy gates", "Codex weekly-limit prompt keywords render into 403 and 429 cooldown rules", "upstream model-routing failures render into the 503 cooldown rule", + "gateway timeout wrappers render into the 504 cooldown rule", "disabled policies clear runtime rules", ], })); diff --git a/scripts/src/platform-infra-sub2api-codex.ts b/scripts/src/platform-infra-sub2api-codex.ts index 1d91cb0e..a641ef25 100644 --- a/scripts/src/platform-infra-sub2api-codex.ts +++ b/scripts/src/platform-infra-sub2api-codex.ts @@ -688,6 +688,12 @@ export function defaultCodexTempUnschedulablePolicy(): CodexTempUnschedulablePol durationMinutes: 30, description: "Service unavailable and upstream model-routing failures should cool down longer than one-off transient failures.", }, + { + statusCode: 504, + keywords: ["gateway timeout", "timeout", "upstream", "upstream request failed", "unknown error", "context deadline exceeded", "context canceled", "recovered upstream error"], + durationMinutes: 30, + description: "Gateway timeout responses should cool down the selected account so another account can handle the next request.", + }, { statusCode: 529, keywords: ["capacity", "overloaded", "temporarily unavailable", "temporary", "recovered upstream error"],