From f5cf7019696017919f436c4dfe2687e801ddbcce Mon Sep 17 00:00:00 2001 From: Codex Date: Thu, 11 Jun 2026 18:24:04 +0000 Subject: [PATCH] fix: monitor codex compact gateway failures --- config/platform-infra/sub2api-codex-pool.yaml | 2 + .../platform-infra-sub2api-codex-sentinel.ts | 62 +++++++++++++++---- 2 files changed, 53 insertions(+), 11 deletions(-) diff --git a/config/platform-infra/sub2api-codex-pool.yaml b/config/platform-infra/sub2api-codex-pool.yaml index 308b5d85..549d7bd0 100644 --- a/config/platform-infra/sub2api-codex-pool.yaml +++ b/config/platform-infra/sub2api-codex-pool.yaml @@ -189,6 +189,8 @@ sentinel: paths: - /responses - /v1/responses + - /responses/compact + - /v1/responses/compact cadence: successInitialIntervalMinutes: 1 successMaxIntervalMinutes: 20 diff --git a/scripts/src/platform-infra-sub2api-codex-sentinel.ts b/scripts/src/platform-infra-sub2api-codex-sentinel.ts index 59a1be1c..c91b7d25 100644 --- a/scripts/src/platform-infra-sub2api-codex-sentinel.ts +++ b/scripts/src/platform-infra-sub2api-codex-sentinel.ts @@ -116,7 +116,7 @@ export function defaultCodexPoolSentinelConfig(): CodexPoolSentinelConfig { initialTtlMinutes: 5, maxTtlMinutes: 30, backoffMultiplier: 2, - paths: ["/responses", "/v1/responses"], + paths: ["/responses", "/v1/responses", "/responses/compact", "/v1/responses/compact"], }, sdk: { openaiPythonVersion: "2.41.1", @@ -1327,16 +1327,16 @@ def gateway_monitor_paths(config): paths = cfg.get("paths") if isinstance(paths, list) and paths: return set(str(item) for item in paths if isinstance(item, str) and item) - return {"/responses", "/v1/responses"} + return {"/responses", "/v1/responses", "/responses/compact", "/v1/responses/compact"} -def is_gateway_stream_failure(message, payload, config): +def gateway_failure_kind(message, payload, config): if "openai.forward_failed" not in message or not isinstance(payload, dict): - return False + return None path = payload.get("path") if path not in gateway_monitor_paths(config): - return False + return None if payload.get("account_id") is None: - return False + return None error_text = str(payload.get("error") or "").lower() fallback_written = payload.get("fallback_error_response_written") is True upstream_already_written = payload.get("upstream_error_response_already_written") is True @@ -1346,9 +1346,44 @@ def is_gateway_stream_failure(message, payload, config): "stream read error", "stream data interval timeout", )) - return fallback_written or upstream_already_written or stream_failure + if fallback_written or upstream_already_written or stream_failure: + return "gateway-stream-forward-failure" + final_compatibility_failure = any(token in error_text for token in ( + "encrypted content could not be decrypted", + "could not be verified", + "invalid_encrypted_content", + "bad_response_status_code", + "model_not_found", + "no available channel for model", + "unsupported model", + "not support", + "not supported", + "payload too large", + "request too large", + "context length", + "context window", + "maximum context", + )) + if final_compatibility_failure: + return "gateway-final-compatibility-failure" + final_5xx_failure = any(token in error_text for token in ( + "upstream error: 500", + "upstream error: 502", + "upstream error: 503", + "upstream error: 504", + "upstream error: 524", + "gateway timeout", + "bad gateway", + "upstream request failed", + "unknown error", + "context deadline exceeded", + "context canceled", + )) + if final_5xx_failure: + return "gateway-final-transient-failure" + return None -def gateway_failure_item(ts, pod_name, payload): +def gateway_failure_item(ts, pod_name, payload, failure_kind): request_id = payload.get("request_id") or sha(json.dumps(payload, sort_keys=True, ensure_ascii=False)) try: account_id = int(payload.get("account_id")) @@ -1360,6 +1395,7 @@ def gateway_failure_item(ts, pod_name, payload): "requestId": request_id, "clientRequestId": payload.get("client_request_id"), "accountId": account_id, + "failureKind": failure_kind, "path": payload.get("path"), "errorPreview": preview(payload.get("error"), 240), "fallbackErrorResponseWritten": payload.get("fallback_error_response_written") is True, @@ -1425,11 +1461,12 @@ def apply_gateway_failure(account_name, failures, state, config, now, admin, pro "until": iso(until), "intervalMinutes": interval, "reason": "gateway-forward-failure", - "failureKind": "gateway-stream-forward-failure", + "failureKind": latest.get("failureKind") or "gateway-forward-failure", "errorDetails": { "kind": "Sub2APIGatewayForwardFailure", "requestId": latest.get("requestId"), "clientRequestId": latest.get("clientRequestId"), + "failureKind": latest.get("failureKind"), "path": latest.get("path"), "errorPreview": latest.get("errorPreview"), "fallbackErrorResponseWritten": latest.get("fallbackErrorResponseWritten"), @@ -1453,6 +1490,7 @@ def apply_gateway_failure(account_name, failures, state, config, now, admin, pro "accountId": latest.get("accountId"), "requestId": latest.get("requestId"), "clientRequestId": latest.get("clientRequestId"), + "failureKind": latest.get("failureKind"), "path": latest.get("path"), "errorPreview": latest.get("errorPreview"), "fallbackErrorResponseWritten": latest.get("fallbackErrorResponseWritten"), @@ -1497,8 +1535,9 @@ def run_gateway_failure_monitor(state, config, now, kube, admin, profiles): continue for line in str(logs).splitlines(): ts, message, payload = log_line_payload(line) - if is_gateway_stream_failure(message, payload, config): - candidates.append(gateway_failure_item(ts, pod_name, payload)) + failure_kind = gateway_failure_kind(message, payload, config) + if failure_kind is not None: + candidates.append(gateway_failure_item(ts, pod_name, payload, failure_kind)) by_id = gateway_failure_account_map(admin) if candidates else {} profile_by_name = {item.get("accountName"): item for item in profiles if isinstance(item, dict) and isinstance(item.get("accountName"), str)} by_account = {} @@ -1532,6 +1571,7 @@ def run_gateway_failure_monitor(state, config, now, kube, admin, profiles): "accountId": failures[-1].get("accountId"), "failureCount": len(failures), "requestId": failures[-1].get("requestId"), + "failureKind": failures[-1].get("failureKind"), "path": failures[-1].get("path"), "errorPreview": failures[-1].get("errorPreview"), "taken": action.get("taken"),