fix: monitor codex compact gateway failures

This commit is contained in:
Codex
2026-06-11 18:24:04 +00:00
parent a8d49845ca
commit f5cf701969
2 changed files with 53 additions and 11 deletions
@@ -189,6 +189,8 @@ sentinel:
paths:
- /responses
- /v1/responses
- /responses/compact
- /v1/responses/compact
cadence:
successInitialIntervalMinutes: 1
successMaxIntervalMinutes: 20
@@ -116,7 +116,7 @@ export function defaultCodexPoolSentinelConfig(): CodexPoolSentinelConfig {
initialTtlMinutes: 5,
maxTtlMinutes: 30,
backoffMultiplier: 2,
paths: ["/responses", "/v1/responses"],
paths: ["/responses", "/v1/responses", "/responses/compact", "/v1/responses/compact"],
},
sdk: {
openaiPythonVersion: "2.41.1",
@@ -1327,16 +1327,16 @@ def gateway_monitor_paths(config):
paths = cfg.get("paths")
if isinstance(paths, list) and paths:
return set(str(item) for item in paths if isinstance(item, str) and item)
return {"/responses", "/v1/responses"}
return {"/responses", "/v1/responses", "/responses/compact", "/v1/responses/compact"}
def is_gateway_stream_failure(message, payload, config):
def gateway_failure_kind(message, payload, config):
if "openai.forward_failed" not in message or not isinstance(payload, dict):
return False
return None
path = payload.get("path")
if path not in gateway_monitor_paths(config):
return False
return None
if payload.get("account_id") is None:
return False
return None
error_text = str(payload.get("error") or "").lower()
fallback_written = payload.get("fallback_error_response_written") is True
upstream_already_written = payload.get("upstream_error_response_already_written") is True
@@ -1346,9 +1346,44 @@ def is_gateway_stream_failure(message, payload, config):
"stream read error",
"stream data interval timeout",
))
return fallback_written or upstream_already_written or stream_failure
if fallback_written or upstream_already_written or stream_failure:
return "gateway-stream-forward-failure"
final_compatibility_failure = any(token in error_text for token in (
"encrypted content could not be decrypted",
"could not be verified",
"invalid_encrypted_content",
"bad_response_status_code",
"model_not_found",
"no available channel for model",
"unsupported model",
"not support",
"not supported",
"payload too large",
"request too large",
"context length",
"context window",
"maximum context",
))
if final_compatibility_failure:
return "gateway-final-compatibility-failure"
final_5xx_failure = any(token in error_text for token in (
"upstream error: 500",
"upstream error: 502",
"upstream error: 503",
"upstream error: 504",
"upstream error: 524",
"gateway timeout",
"bad gateway",
"upstream request failed",
"unknown error",
"context deadline exceeded",
"context canceled",
))
if final_5xx_failure:
return "gateway-final-transient-failure"
return None
def gateway_failure_item(ts, pod_name, payload):
def gateway_failure_item(ts, pod_name, payload, failure_kind):
request_id = payload.get("request_id") or sha(json.dumps(payload, sort_keys=True, ensure_ascii=False))
try:
account_id = int(payload.get("account_id"))
@@ -1360,6 +1395,7 @@ def gateway_failure_item(ts, pod_name, payload):
"requestId": request_id,
"clientRequestId": payload.get("client_request_id"),
"accountId": account_id,
"failureKind": failure_kind,
"path": payload.get("path"),
"errorPreview": preview(payload.get("error"), 240),
"fallbackErrorResponseWritten": payload.get("fallback_error_response_written") is True,
@@ -1425,11 +1461,12 @@ def apply_gateway_failure(account_name, failures, state, config, now, admin, pro
"until": iso(until),
"intervalMinutes": interval,
"reason": "gateway-forward-failure",
"failureKind": "gateway-stream-forward-failure",
"failureKind": latest.get("failureKind") or "gateway-forward-failure",
"errorDetails": {
"kind": "Sub2APIGatewayForwardFailure",
"requestId": latest.get("requestId"),
"clientRequestId": latest.get("clientRequestId"),
"failureKind": latest.get("failureKind"),
"path": latest.get("path"),
"errorPreview": latest.get("errorPreview"),
"fallbackErrorResponseWritten": latest.get("fallbackErrorResponseWritten"),
@@ -1453,6 +1490,7 @@ def apply_gateway_failure(account_name, failures, state, config, now, admin, pro
"accountId": latest.get("accountId"),
"requestId": latest.get("requestId"),
"clientRequestId": latest.get("clientRequestId"),
"failureKind": latest.get("failureKind"),
"path": latest.get("path"),
"errorPreview": latest.get("errorPreview"),
"fallbackErrorResponseWritten": latest.get("fallbackErrorResponseWritten"),
@@ -1497,8 +1535,9 @@ def run_gateway_failure_monitor(state, config, now, kube, admin, profiles):
continue
for line in str(logs).splitlines():
ts, message, payload = log_line_payload(line)
if is_gateway_stream_failure(message, payload, config):
candidates.append(gateway_failure_item(ts, pod_name, payload))
failure_kind = gateway_failure_kind(message, payload, config)
if failure_kind is not None:
candidates.append(gateway_failure_item(ts, pod_name, payload, failure_kind))
by_id = gateway_failure_account_map(admin) if candidates else {}
profile_by_name = {item.get("accountName"): item for item in profiles if isinstance(item, dict) and isinstance(item.get("accountName"), str)}
by_account = {}
@@ -1532,6 +1571,7 @@ def run_gateway_failure_monitor(state, config, now, kube, admin, profiles):
"accountId": failures[-1].get("accountId"),
"failureCount": len(failures),
"requestId": failures[-1].get("requestId"),
"failureKind": failures[-1].get("failureKind"),
"path": failures[-1].get("path"),
"errorPreview": failures[-1].get("errorPreview"),
"taken": action.get("taken"),