From 877d227c83bbc7516f2bc9af332a1d50c0ddd3df Mon Sep 17 00:00:00 2001 From: Codex Date: Thu, 11 Jun 2026 17:17:33 +0000 Subject: [PATCH] fix: use backoff for codex gateway stream freezes --- config/platform-infra/sub2api-codex-pool.yaml | 4 ++- .../platform-infra-sub2api-codex-sentinel.ts | 33 ++++++++++++++++--- 2 files changed, 31 insertions(+), 6 deletions(-) diff --git a/config/platform-infra/sub2api-codex-pool.yaml b/config/platform-infra/sub2api-codex-pool.yaml index 44198419..308b5d85 100644 --- a/config/platform-infra/sub2api-codex-pool.yaml +++ b/config/platform-infra/sub2api-codex-pool.yaml @@ -183,7 +183,9 @@ sentinel: enabled: true lookbackSeconds: 900 tailLines: 4000 - freezeTtlMinutes: 10 + initialTtlMinutes: 5 + maxTtlMinutes: 30 + backoffMultiplier: 2 paths: - /responses - /v1/responses diff --git a/scripts/src/platform-infra-sub2api-codex-sentinel.ts b/scripts/src/platform-infra-sub2api-codex-sentinel.ts index 19be7ae1..59a1be1c 100644 --- a/scripts/src/platform-infra-sub2api-codex-sentinel.ts +++ b/scripts/src/platform-infra-sub2api-codex-sentinel.ts @@ -30,7 +30,9 @@ export interface CodexPoolSentinelConfig { enabled: boolean; lookbackSeconds: number; tailLines: number; - freezeTtlMinutes: number; + initialTtlMinutes: number; + maxTtlMinutes: number; + backoffMultiplier: number; paths: string[]; }; sdk: { @@ -111,7 +113,9 @@ export function defaultCodexPoolSentinelConfig(): CodexPoolSentinelConfig { enabled: false, lookbackSeconds: 900, tailLines: 4000, - freezeTtlMinutes: 10, + initialTtlMinutes: 5, + maxTtlMinutes: 30, + backoffMultiplier: 2, paths: ["/responses", "/v1/responses"], }, sdk: { @@ -198,7 +202,9 @@ export function readCodexPoolSentinelConfig(value: unknown, defaults: CodexPoolS enabled: readBoolean(valueAt(gatewayFailureMonitor, "enabled"), `${sourcePath}.sentinel.gatewayFailureMonitor.enabled`, defaults.gatewayFailureMonitor.enabled), lookbackSeconds: readInt(valueAt(gatewayFailureMonitor, "lookbackSeconds"), `${sourcePath}.sentinel.gatewayFailureMonitor.lookbackSeconds`, defaults.gatewayFailureMonitor.lookbackSeconds, 60, 7200), tailLines: readInt(valueAt(gatewayFailureMonitor, "tailLines"), `${sourcePath}.sentinel.gatewayFailureMonitor.tailLines`, defaults.gatewayFailureMonitor.tailLines, 100, 50000), - freezeTtlMinutes: readInt(valueAt(gatewayFailureMonitor, "freezeTtlMinutes"), `${sourcePath}.sentinel.gatewayFailureMonitor.freezeTtlMinutes`, defaults.gatewayFailureMonitor.freezeTtlMinutes, 1, 1440), + initialTtlMinutes: readInt(valueAt(gatewayFailureMonitor, "initialTtlMinutes"), `${sourcePath}.sentinel.gatewayFailureMonitor.initialTtlMinutes`, defaults.gatewayFailureMonitor.initialTtlMinutes, 1, 1440), + maxTtlMinutes: readInt(valueAt(gatewayFailureMonitor, "maxTtlMinutes"), `${sourcePath}.sentinel.gatewayFailureMonitor.maxTtlMinutes`, defaults.gatewayFailureMonitor.maxTtlMinutes, 1, 1440), + backoffMultiplier: readInt(valueAt(gatewayFailureMonitor, "backoffMultiplier"), `${sourcePath}.sentinel.gatewayFailureMonitor.backoffMultiplier`, defaults.gatewayFailureMonitor.backoffMultiplier, 1, 10), paths: readPathList(valueAt(gatewayFailureMonitor, "paths"), `${sourcePath}.sentinel.gatewayFailureMonitor.paths`, defaults.gatewayFailureMonitor.paths), }, sdk: { @@ -239,6 +245,9 @@ export function readCodexPoolSentinelConfig(value: unknown, defaults: CodexPoolS if (config.freeze.maxTtlMinutes < config.freeze.initialTtlMinutes) { throw new Error(`${sourcePath}.sentinel.freeze.maxTtlMinutes must be >= initialTtlMinutes`); } + if (config.gatewayFailureMonitor.maxTtlMinutes < config.gatewayFailureMonitor.initialTtlMinutes) { + throw new Error(`${sourcePath}.sentinel.gatewayFailureMonitor.maxTtlMinutes must be >= initialTtlMinutes`); + } if (!/^[-0-9A-Za-z_/*,\s]+$/u.test(config.schedule)) { throw new Error(`${sourcePath}.sentinel.schedule has an unsupported cron format`); } @@ -1383,11 +1392,23 @@ def gateway_failure_account_map(admin): by_id[account_id] = name return by_id +def next_gateway_freeze_interval(account_state, config): + monitor_cfg = config.get("gatewayFailureMonitor") if isinstance(config.get("gatewayFailureMonitor"), dict) else {} + initial = int(monitor_cfg.get("initialTtlMinutes") or 5) + maximum = int(monitor_cfg.get("maxTtlMinutes") or 30) + multiplier = int(monitor_cfg.get("backoffMultiplier") or 2) + previous = int(account_state.get("gatewayFailureBackoffIntervalMinutes") or 0) + quarantine = account_state.get("quarantine") if isinstance(account_state.get("quarantine"), dict) else {} + if quarantine.get("reason") == "gateway-forward-failure": + previous = max(previous, int(quarantine.get("intervalMinutes") or 0)) + if previous <= 0: + return initial + return min(maximum, max(initial, previous * multiplier)) + def apply_gateway_failure(account_name, failures, state, config, now, admin, profile): latest = failures[-1] account_state = state.setdefault("accounts", {}).setdefault(account_name, {}) - monitor_cfg = config.get("gatewayFailureMonitor") if isinstance(config.get("gatewayFailureMonitor"), dict) else {} - interval = int(monitor_cfg.get("freezeTtlMinutes") or config["freeze"]["maxTtlMinutes"]) + interval = next_gateway_freeze_interval(account_state, config) until = add_minutes(now, interval, int(config["freeze"]["jitterPercent"])) actions_enabled = bool(config["actions"]["enabled"]) applied = False @@ -1426,6 +1447,7 @@ def apply_gateway_failure(account_name, failures, state, config, now, admin, pro account_state["lastStatus"] = "quarantined" account_state["lastFailureAt"] = iso(now) account_state["lastGatewayFailureAt"] = iso(now) + account_state["gatewayFailureBackoffIntervalMinutes"] = interval account_state["lastGatewayFailure"] = { "accountName": account_name, "accountId": latest.get("accountId"), @@ -1440,6 +1462,7 @@ def apply_gateway_failure(account_name, failures, state, config, now, admin, pro "countInRun": len(failures), "firstAt": failures[0].get("at"), "lastAt": latest.get("at"), + "intervalMinutes": interval, "freezeUntil": iso(until), "action": action, }