fix: monitor codex compact gateway failures
This commit is contained in:
@@ -189,6 +189,8 @@ sentinel:
|
||||
paths:
|
||||
- /responses
|
||||
- /v1/responses
|
||||
- /responses/compact
|
||||
- /v1/responses/compact
|
||||
cadence:
|
||||
successInitialIntervalMinutes: 1
|
||||
successMaxIntervalMinutes: 20
|
||||
|
||||
@@ -116,7 +116,7 @@ export function defaultCodexPoolSentinelConfig(): CodexPoolSentinelConfig {
|
||||
initialTtlMinutes: 5,
|
||||
maxTtlMinutes: 30,
|
||||
backoffMultiplier: 2,
|
||||
paths: ["/responses", "/v1/responses"],
|
||||
paths: ["/responses", "/v1/responses", "/responses/compact", "/v1/responses/compact"],
|
||||
},
|
||||
sdk: {
|
||||
openaiPythonVersion: "2.41.1",
|
||||
@@ -1327,16 +1327,16 @@ def gateway_monitor_paths(config):
|
||||
paths = cfg.get("paths")
|
||||
if isinstance(paths, list) and paths:
|
||||
return set(str(item) for item in paths if isinstance(item, str) and item)
|
||||
return {"/responses", "/v1/responses"}
|
||||
return {"/responses", "/v1/responses", "/responses/compact", "/v1/responses/compact"}
|
||||
|
||||
def is_gateway_stream_failure(message, payload, config):
|
||||
def gateway_failure_kind(message, payload, config):
|
||||
if "openai.forward_failed" not in message or not isinstance(payload, dict):
|
||||
return False
|
||||
return None
|
||||
path = payload.get("path")
|
||||
if path not in gateway_monitor_paths(config):
|
||||
return False
|
||||
return None
|
||||
if payload.get("account_id") is None:
|
||||
return False
|
||||
return None
|
||||
error_text = str(payload.get("error") or "").lower()
|
||||
fallback_written = payload.get("fallback_error_response_written") is True
|
||||
upstream_already_written = payload.get("upstream_error_response_already_written") is True
|
||||
@@ -1346,9 +1346,44 @@ def is_gateway_stream_failure(message, payload, config):
|
||||
"stream read error",
|
||||
"stream data interval timeout",
|
||||
))
|
||||
return fallback_written or upstream_already_written or stream_failure
|
||||
if fallback_written or upstream_already_written or stream_failure:
|
||||
return "gateway-stream-forward-failure"
|
||||
final_compatibility_failure = any(token in error_text for token in (
|
||||
"encrypted content could not be decrypted",
|
||||
"could not be verified",
|
||||
"invalid_encrypted_content",
|
||||
"bad_response_status_code",
|
||||
"model_not_found",
|
||||
"no available channel for model",
|
||||
"unsupported model",
|
||||
"not support",
|
||||
"not supported",
|
||||
"payload too large",
|
||||
"request too large",
|
||||
"context length",
|
||||
"context window",
|
||||
"maximum context",
|
||||
))
|
||||
if final_compatibility_failure:
|
||||
return "gateway-final-compatibility-failure"
|
||||
final_5xx_failure = any(token in error_text for token in (
|
||||
"upstream error: 500",
|
||||
"upstream error: 502",
|
||||
"upstream error: 503",
|
||||
"upstream error: 504",
|
||||
"upstream error: 524",
|
||||
"gateway timeout",
|
||||
"bad gateway",
|
||||
"upstream request failed",
|
||||
"unknown error",
|
||||
"context deadline exceeded",
|
||||
"context canceled",
|
||||
))
|
||||
if final_5xx_failure:
|
||||
return "gateway-final-transient-failure"
|
||||
return None
|
||||
|
||||
def gateway_failure_item(ts, pod_name, payload):
|
||||
def gateway_failure_item(ts, pod_name, payload, failure_kind):
|
||||
request_id = payload.get("request_id") or sha(json.dumps(payload, sort_keys=True, ensure_ascii=False))
|
||||
try:
|
||||
account_id = int(payload.get("account_id"))
|
||||
@@ -1360,6 +1395,7 @@ def gateway_failure_item(ts, pod_name, payload):
|
||||
"requestId": request_id,
|
||||
"clientRequestId": payload.get("client_request_id"),
|
||||
"accountId": account_id,
|
||||
"failureKind": failure_kind,
|
||||
"path": payload.get("path"),
|
||||
"errorPreview": preview(payload.get("error"), 240),
|
||||
"fallbackErrorResponseWritten": payload.get("fallback_error_response_written") is True,
|
||||
@@ -1425,11 +1461,12 @@ def apply_gateway_failure(account_name, failures, state, config, now, admin, pro
|
||||
"until": iso(until),
|
||||
"intervalMinutes": interval,
|
||||
"reason": "gateway-forward-failure",
|
||||
"failureKind": "gateway-stream-forward-failure",
|
||||
"failureKind": latest.get("failureKind") or "gateway-forward-failure",
|
||||
"errorDetails": {
|
||||
"kind": "Sub2APIGatewayForwardFailure",
|
||||
"requestId": latest.get("requestId"),
|
||||
"clientRequestId": latest.get("clientRequestId"),
|
||||
"failureKind": latest.get("failureKind"),
|
||||
"path": latest.get("path"),
|
||||
"errorPreview": latest.get("errorPreview"),
|
||||
"fallbackErrorResponseWritten": latest.get("fallbackErrorResponseWritten"),
|
||||
@@ -1453,6 +1490,7 @@ def apply_gateway_failure(account_name, failures, state, config, now, admin, pro
|
||||
"accountId": latest.get("accountId"),
|
||||
"requestId": latest.get("requestId"),
|
||||
"clientRequestId": latest.get("clientRequestId"),
|
||||
"failureKind": latest.get("failureKind"),
|
||||
"path": latest.get("path"),
|
||||
"errorPreview": latest.get("errorPreview"),
|
||||
"fallbackErrorResponseWritten": latest.get("fallbackErrorResponseWritten"),
|
||||
@@ -1497,8 +1535,9 @@ def run_gateway_failure_monitor(state, config, now, kube, admin, profiles):
|
||||
continue
|
||||
for line in str(logs).splitlines():
|
||||
ts, message, payload = log_line_payload(line)
|
||||
if is_gateway_stream_failure(message, payload, config):
|
||||
candidates.append(gateway_failure_item(ts, pod_name, payload))
|
||||
failure_kind = gateway_failure_kind(message, payload, config)
|
||||
if failure_kind is not None:
|
||||
candidates.append(gateway_failure_item(ts, pod_name, payload, failure_kind))
|
||||
by_id = gateway_failure_account_map(admin) if candidates else {}
|
||||
profile_by_name = {item.get("accountName"): item for item in profiles if isinstance(item, dict) and isinstance(item.get("accountName"), str)}
|
||||
by_account = {}
|
||||
@@ -1532,6 +1571,7 @@ def run_gateway_failure_monitor(state, config, now, kube, admin, profiles):
|
||||
"accountId": failures[-1].get("accountId"),
|
||||
"failureCount": len(failures),
|
||||
"requestId": failures[-1].get("requestId"),
|
||||
"failureKind": failures[-1].get("failureKind"),
|
||||
"path": failures[-1].get("path"),
|
||||
"errorPreview": failures[-1].get("errorPreview"),
|
||||
"taken": action.get("taken"),
|
||||
|
||||
Reference in New Issue
Block a user