fix: use backoff for codex gateway stream freezes

This commit is contained in:
Codex
2026-06-11 17:17:33 +00:00
parent 79b4c668c0
commit 877d227c83
2 changed files with 31 additions and 6 deletions
@@ -183,7 +183,9 @@ sentinel:
enabled: true
lookbackSeconds: 900
tailLines: 4000
freezeTtlMinutes: 10
initialTtlMinutes: 5
maxTtlMinutes: 30
backoffMultiplier: 2
paths:
- /responses
- /v1/responses
@@ -30,7 +30,9 @@ export interface CodexPoolSentinelConfig {
enabled: boolean;
lookbackSeconds: number;
tailLines: number;
freezeTtlMinutes: number;
initialTtlMinutes: number;
maxTtlMinutes: number;
backoffMultiplier: number;
paths: string[];
};
sdk: {
@@ -111,7 +113,9 @@ export function defaultCodexPoolSentinelConfig(): CodexPoolSentinelConfig {
enabled: false,
lookbackSeconds: 900,
tailLines: 4000,
freezeTtlMinutes: 10,
initialTtlMinutes: 5,
maxTtlMinutes: 30,
backoffMultiplier: 2,
paths: ["/responses", "/v1/responses"],
},
sdk: {
@@ -198,7 +202,9 @@ export function readCodexPoolSentinelConfig(value: unknown, defaults: CodexPoolS
enabled: readBoolean(valueAt(gatewayFailureMonitor, "enabled"), `${sourcePath}.sentinel.gatewayFailureMonitor.enabled`, defaults.gatewayFailureMonitor.enabled),
lookbackSeconds: readInt(valueAt(gatewayFailureMonitor, "lookbackSeconds"), `${sourcePath}.sentinel.gatewayFailureMonitor.lookbackSeconds`, defaults.gatewayFailureMonitor.lookbackSeconds, 60, 7200),
tailLines: readInt(valueAt(gatewayFailureMonitor, "tailLines"), `${sourcePath}.sentinel.gatewayFailureMonitor.tailLines`, defaults.gatewayFailureMonitor.tailLines, 100, 50000),
freezeTtlMinutes: readInt(valueAt(gatewayFailureMonitor, "freezeTtlMinutes"), `${sourcePath}.sentinel.gatewayFailureMonitor.freezeTtlMinutes`, defaults.gatewayFailureMonitor.freezeTtlMinutes, 1, 1440),
initialTtlMinutes: readInt(valueAt(gatewayFailureMonitor, "initialTtlMinutes"), `${sourcePath}.sentinel.gatewayFailureMonitor.initialTtlMinutes`, defaults.gatewayFailureMonitor.initialTtlMinutes, 1, 1440),
maxTtlMinutes: readInt(valueAt(gatewayFailureMonitor, "maxTtlMinutes"), `${sourcePath}.sentinel.gatewayFailureMonitor.maxTtlMinutes`, defaults.gatewayFailureMonitor.maxTtlMinutes, 1, 1440),
backoffMultiplier: readInt(valueAt(gatewayFailureMonitor, "backoffMultiplier"), `${sourcePath}.sentinel.gatewayFailureMonitor.backoffMultiplier`, defaults.gatewayFailureMonitor.backoffMultiplier, 1, 10),
paths: readPathList(valueAt(gatewayFailureMonitor, "paths"), `${sourcePath}.sentinel.gatewayFailureMonitor.paths`, defaults.gatewayFailureMonitor.paths),
},
sdk: {
@@ -239,6 +245,9 @@ export function readCodexPoolSentinelConfig(value: unknown, defaults: CodexPoolS
if (config.freeze.maxTtlMinutes < config.freeze.initialTtlMinutes) {
throw new Error(`${sourcePath}.sentinel.freeze.maxTtlMinutes must be >= initialTtlMinutes`);
}
if (config.gatewayFailureMonitor.maxTtlMinutes < config.gatewayFailureMonitor.initialTtlMinutes) {
throw new Error(`${sourcePath}.sentinel.gatewayFailureMonitor.maxTtlMinutes must be >= initialTtlMinutes`);
}
if (!/^[-0-9A-Za-z_/*,\s]+$/u.test(config.schedule)) {
throw new Error(`${sourcePath}.sentinel.schedule has an unsupported cron format`);
}
@@ -1383,11 +1392,23 @@ def gateway_failure_account_map(admin):
by_id[account_id] = name
return by_id
def next_gateway_freeze_interval(account_state, config):
    """Return the next gateway-failure freeze interval, in minutes.

    The interval starts at ``gatewayFailureMonitor.initialTtlMinutes`` and is
    multiplied by ``backoffMultiplier`` relative to the previously recorded
    interval, never exceeding ``maxTtlMinutes``. Missing or falsy settings
    fall back to 5, 30 and 2 respectively.

    Args:
        account_state: Per-account state dict. The previous interval is read
            from ``gatewayFailureBackoffIntervalMinutes``; if the account has
            a ``quarantine`` dict whose ``reason`` is
            ``"gateway-forward-failure"``, its ``intervalMinutes`` is used
            when it is larger.
        config: Sentinel config dict; only ``gatewayFailureMonitor`` is read.

    Returns:
        The interval in minutes as an int, always <= the configured maximum.
    """
    monitor_cfg = config.get("gatewayFailureMonitor")
    if not isinstance(monitor_cfg, dict):
        monitor_cfg = {}

    initial = int(monitor_cfg.get("initialTtlMinutes") or 5)
    maximum = int(monitor_cfg.get("maxTtlMinutes") or 30)
    multiplier = int(monitor_cfg.get("backoffMultiplier") or 2)

    previous = int(account_state.get("gatewayFailureBackoffIntervalMinutes") or 0)
    quarantine = account_state.get("quarantine")
    if not isinstance(quarantine, dict):
        quarantine = {}
    # Only a quarantine that was caused by a gateway failure contributes to
    # the backoff; quarantines with other reasons are ignored here.
    if quarantine.get("reason") == "gateway-forward-failure":
        previous = max(previous, int(quarantine.get("intervalMinutes") or 0))

    if previous <= 0:
        # First gateway freeze for this account. Apply the cap here too so the
        # result never exceeds maxTtlMinutes even if initial > maximum.
        return min(maximum, initial)
    return min(maximum, max(initial, previous * multiplier))
def apply_gateway_failure(account_name, failures, state, config, now, admin, profile):
latest = failures[-1]
account_state = state.setdefault("accounts", {}).setdefault(account_name, {})
monitor_cfg = config.get("gatewayFailureMonitor") if isinstance(config.get("gatewayFailureMonitor"), dict) else {}
interval = int(monitor_cfg.get("freezeTtlMinutes") or config["freeze"]["maxTtlMinutes"])
interval = next_gateway_freeze_interval(account_state, config)
until = add_minutes(now, interval, int(config["freeze"]["jitterPercent"]))
actions_enabled = bool(config["actions"]["enabled"])
applied = False
@@ -1426,6 +1447,7 @@ def apply_gateway_failure(account_name, failures, state, config, now, admin, pro
account_state["lastStatus"] = "quarantined"
account_state["lastFailureAt"] = iso(now)
account_state["lastGatewayFailureAt"] = iso(now)
account_state["gatewayFailureBackoffIntervalMinutes"] = interval
account_state["lastGatewayFailure"] = {
"accountName": account_name,
"accountId": latest.get("accountId"),
@@ -1440,6 +1462,7 @@ def apply_gateway_failure(account_name, failures, state, config, now, admin, pro
"countInRun": len(failures),
"firstAt": failures[0].get("at"),
"lastAt": latest.get("at"),
"intervalMinutes": interval,
"freezeUntil": iso(until),
"action": action,
}