fix: use backoff for codex gateway stream freezes
This commit is contained in:
@@ -183,7 +183,9 @@ sentinel:
|
||||
enabled: true
|
||||
lookbackSeconds: 900
|
||||
tailLines: 4000
|
||||
freezeTtlMinutes: 10
|
||||
initialTtlMinutes: 5
|
||||
maxTtlMinutes: 30
|
||||
backoffMultiplier: 2
|
||||
paths:
|
||||
- /responses
|
||||
- /v1/responses
|
||||
|
||||
@@ -30,7 +30,9 @@ export interface CodexPoolSentinelConfig {
|
||||
enabled: boolean;
|
||||
lookbackSeconds: number;
|
||||
tailLines: number;
|
||||
freezeTtlMinutes: number;
|
||||
initialTtlMinutes: number;
|
||||
maxTtlMinutes: number;
|
||||
backoffMultiplier: number;
|
||||
paths: string[];
|
||||
};
|
||||
sdk: {
|
||||
@@ -111,7 +113,9 @@ export function defaultCodexPoolSentinelConfig(): CodexPoolSentinelConfig {
|
||||
enabled: false,
|
||||
lookbackSeconds: 900,
|
||||
tailLines: 4000,
|
||||
freezeTtlMinutes: 10,
|
||||
initialTtlMinutes: 5,
|
||||
maxTtlMinutes: 30,
|
||||
backoffMultiplier: 2,
|
||||
paths: ["/responses", "/v1/responses"],
|
||||
},
|
||||
sdk: {
|
||||
@@ -198,7 +202,9 @@ export function readCodexPoolSentinelConfig(value: unknown, defaults: CodexPoolS
|
||||
enabled: readBoolean(valueAt(gatewayFailureMonitor, "enabled"), `${sourcePath}.sentinel.gatewayFailureMonitor.enabled`, defaults.gatewayFailureMonitor.enabled),
|
||||
lookbackSeconds: readInt(valueAt(gatewayFailureMonitor, "lookbackSeconds"), `${sourcePath}.sentinel.gatewayFailureMonitor.lookbackSeconds`, defaults.gatewayFailureMonitor.lookbackSeconds, 60, 7200),
|
||||
tailLines: readInt(valueAt(gatewayFailureMonitor, "tailLines"), `${sourcePath}.sentinel.gatewayFailureMonitor.tailLines`, defaults.gatewayFailureMonitor.tailLines, 100, 50000),
|
||||
freezeTtlMinutes: readInt(valueAt(gatewayFailureMonitor, "freezeTtlMinutes"), `${sourcePath}.sentinel.gatewayFailureMonitor.freezeTtlMinutes`, defaults.gatewayFailureMonitor.freezeTtlMinutes, 1, 1440),
|
||||
initialTtlMinutes: readInt(valueAt(gatewayFailureMonitor, "initialTtlMinutes"), `${sourcePath}.sentinel.gatewayFailureMonitor.initialTtlMinutes`, defaults.gatewayFailureMonitor.initialTtlMinutes, 1, 1440),
|
||||
maxTtlMinutes: readInt(valueAt(gatewayFailureMonitor, "maxTtlMinutes"), `${sourcePath}.sentinel.gatewayFailureMonitor.maxTtlMinutes`, defaults.gatewayFailureMonitor.maxTtlMinutes, 1, 1440),
|
||||
backoffMultiplier: readInt(valueAt(gatewayFailureMonitor, "backoffMultiplier"), `${sourcePath}.sentinel.gatewayFailureMonitor.backoffMultiplier`, defaults.gatewayFailureMonitor.backoffMultiplier, 1, 10),
|
||||
paths: readPathList(valueAt(gatewayFailureMonitor, "paths"), `${sourcePath}.sentinel.gatewayFailureMonitor.paths`, defaults.gatewayFailureMonitor.paths),
|
||||
},
|
||||
sdk: {
|
||||
@@ -239,6 +245,9 @@ export function readCodexPoolSentinelConfig(value: unknown, defaults: CodexPoolS
|
||||
if (config.freeze.maxTtlMinutes < config.freeze.initialTtlMinutes) {
|
||||
throw new Error(`${sourcePath}.sentinel.freeze.maxTtlMinutes must be >= initialTtlMinutes`);
|
||||
}
|
||||
if (config.gatewayFailureMonitor.maxTtlMinutes < config.gatewayFailureMonitor.initialTtlMinutes) {
|
||||
throw new Error(`${sourcePath}.sentinel.gatewayFailureMonitor.maxTtlMinutes must be >= initialTtlMinutes`);
|
||||
}
|
||||
if (!/^[-0-9A-Za-z_/*,\s]+$/u.test(config.schedule)) {
|
||||
throw new Error(`${sourcePath}.sentinel.schedule has an unsupported cron format`);
|
||||
}
|
||||
@@ -1383,11 +1392,23 @@ def gateway_failure_account_map(admin):
|
||||
by_id[account_id] = name
|
||||
return by_id
|
||||
|
||||
def next_gateway_freeze_interval(account_state, config):
    """Return the next gateway-failure freeze interval, in minutes.

    The interval grows multiplicatively from the previously applied one:
    the first failure yields ``initialTtlMinutes`` and each later failure
    yields ``previous * backoffMultiplier``, never below the initial value
    and never above ``maxTtlMinutes``.

    Args:
        account_state: Per-account state mapping. Reads
            ``gatewayFailureBackoffIntervalMinutes`` and, when the
            ``quarantine`` entry is a dict whose ``reason`` is
            ``"gateway-forward-failure"``, its ``intervalMinutes``.
        config: Sentinel configuration mapping. Reads the
            ``gatewayFailureMonitor`` section; a missing or non-dict
            section falls back to the built-in defaults (5 / 30 / 2).

    Returns:
        The interval in minutes as an ``int``.
    """
    monitor_cfg = config.get("gatewayFailureMonitor")
    if not isinstance(monitor_cfg, dict):
        monitor_cfg = {}

    # ``or`` makes missing, None and 0 values all fall back to the default.
    initial = int(monitor_cfg.get("initialTtlMinutes") or 5)
    maximum = int(monitor_cfg.get("maxTtlMinutes") or 30)
    multiplier = int(monitor_cfg.get("backoffMultiplier") or 2)

    previous = int(account_state.get("gatewayFailureBackoffIntervalMinutes") or 0)

    # A quarantine recorded for the same failure reason also counts as a
    # previously applied interval; use whichever value is larger.
    quarantine = account_state.get("quarantine")
    if isinstance(quarantine, dict) and quarantine.get("reason") == "gateway-forward-failure":
        previous = max(previous, int(quarantine.get("intervalMinutes") or 0))

    if previous <= 0:
        candidate = initial
    else:
        candidate = max(initial, previous * multiplier)

    # Cap every path, including the first failure, so the result never
    # exceeds the configured maximum even if the config has max < initial.
    return min(maximum, candidate)
|
||||
|
||||
def apply_gateway_failure(account_name, failures, state, config, now, admin, profile):
|
||||
latest = failures[-1]
|
||||
account_state = state.setdefault("accounts", {}).setdefault(account_name, {})
|
||||
monitor_cfg = config.get("gatewayFailureMonitor") if isinstance(config.get("gatewayFailureMonitor"), dict) else {}
|
||||
interval = int(monitor_cfg.get("freezeTtlMinutes") or config["freeze"]["maxTtlMinutes"])
|
||||
interval = next_gateway_freeze_interval(account_state, config)
|
||||
until = add_minutes(now, interval, int(config["freeze"]["jitterPercent"]))
|
||||
actions_enabled = bool(config["actions"]["enabled"])
|
||||
applied = False
|
||||
@@ -1426,6 +1447,7 @@ def apply_gateway_failure(account_name, failures, state, config, now, admin, pro
|
||||
account_state["lastStatus"] = "quarantined"
|
||||
account_state["lastFailureAt"] = iso(now)
|
||||
account_state["lastGatewayFailureAt"] = iso(now)
|
||||
account_state["gatewayFailureBackoffIntervalMinutes"] = interval
|
||||
account_state["lastGatewayFailure"] = {
|
||||
"accountName": account_name,
|
||||
"accountId": latest.get("accountId"),
|
||||
@@ -1440,6 +1462,7 @@ def apply_gateway_failure(account_name, failures, state, config, now, admin, pro
|
||||
"countInRun": len(failures),
|
||||
"firstAt": failures[0].get("at"),
|
||||
"lastAt": latest.get("at"),
|
||||
"intervalMinutes": interval,
|
||||
"freezeUntil": iso(until),
|
||||
"action": action,
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user