fix: add compact timeout cooldown evidence

2026-06-10 07:42:21 +00:00
parent 9710a1a6f6
commit 8735b4103c
6 changed files with 85 additions and 14 deletions
@@ -131,7 +131,7 @@ bun scripts/cli.ts platform-infra sub2api codex-pool configure-local --confirm

 - `sub2api status`：Deployment/StatefulSet/Service/Secret 可见，运行镜像与 YAML 一致。
 - `sub2api validate`：app、PostgreSQL、Redis 和 service proxy 基础检查通过。
- `codex-pool validate`：统一 key 的 `GET /v1/models` 成功，并用 `localCodex.responsesSmokeModel` 跑一次小的 `POST /v1/responses` smoke；owner balance / owner concurrency 已满足 YAML 最小值，capacity、WebSocket v2 和 temporary-unschedulable 运行时状态与 YAML 对齐。若 Responses smoke `outcome=succeeded-with-failover`，说明请求已恢复但仍有账号级上游 5xx 需要按 evidence 继续降频或冷却。
+- `codex-pool validate`：统一 key 的 `GET /v1/models` 成功，并用 `localCodex.responsesSmokeModel` 跑一次小的 `POST /v1/responses` smoke；owner balance / owner concurrency 已满足 YAML 最小值，capacity、WebSocket v2 和 temporary-unschedulable 运行时状态与 YAML 对齐；`validation.gatewayCompactRecent` 会汇总最近 6 小时（最多取最后 2500 行日志）`/responses/compact` 成功、失败、failover、最终 4xx/5xx 和 `context canceled` 证据。若 Responses smoke `outcome=succeeded-with-failover`，说明请求已恢复但仍有账号级上游 5xx；若 `gatewayCompactRecent.degraded=true`，说明窗口内出现过 compact 失败、最终 4xx/5xx 或 `context canceled`（请求不一定已恢复）；两种情况都需要按 evidence 针对账号级上游 5xx/compact timeout 继续降频或冷却。
 - 若 `publicExposure.enabled=true`，确认 FRP path 可用；`expose --confirm` 会用未带 key 的 public `/v1/models` 401 作为网关可达性探针。

 如果要证明真实模型请求可用，使用最小 `/v1/responses` 或等价 Codex smoke。不要把 group-level `/v1/models` 成功解释成每个上游 account 都健康。
@@ -148,7 +148,7 @@ bun scripts/cli.ts platform-infra sub2api codex-pool configure-local --confirm
 - 上游报 capacity/rate-limit/overload/Bad Gateway/Gateway Timeout 后没有切号或频繁先失败再恢复：先确认 `codex-pool validate` 里 `tempUnschedulable.ok=true` 且目标 account `runtimeEnabled=true`、规则数符合 YAML；再看 `validation.gatewayResponses.evidence.failovers` 的 account/upstream status。若 mismatch，跑 `codex-pool sync --confirm`，不要手工 patch Sub2API credentials。
 - Codex 报 weekly-limit、`less than 10% of your weekly limit left`、`Run /status for a breakdown` 等账号状态/软配额提示并要求切号：把稳定 body 关键词放进 `pool.defaultTempUnschedulable` 的 403 和 429 规则，跑 `codex-pool sync --confirm`，再用 `codex-pool validate` 确认每个 managed account 的 runtime 403/429 rules 都包含这些关键词。Sub2API 临时下线规则按 HTTP status + body keyword 匹配；如果该文案是 HTTP 200 成功内容，需要另提响应分类能力 issue，不能只靠 YAML 冷却规则声明解决。
 - 上游 503 响应体出现 `model_not_found`、`No available channel for model ...` 或同类稳定模型路由失败文案：把稳定 body 关键词放进 `pool.defaultTempUnschedulable` 的 503 规则，跑 `codex-pool sync --confirm`，再用 `codex-pool validate` 确认目标 account 的 runtime 503 rule 包含这些关键词；不要用 account membership、priority、capacity、loadFactor、WebSocket mode 或 User-Agent 改动掩盖该错误族。
- 上游错误反复触发：默认错误冷却按严重程度分层；临时问题可从 10 分钟起步，网关/服务不可用/过载/模型路由类应更长，认证/权限/配额/账号状态类使用最长冷却。`Recovered upstream error ...`、`Bad Gateway`、`Gateway Timeout`、Cloudflare `524`、Codex-facing `Upstream request failed`、`Unknown error`、`context deadline exceeded`、`context canceled`、`model_not_found`、`No available channel for model`、大上下文 `413` 和 `openai_error` 这类稳定包装文案都应留在 YAML 冷却政策里。具体数值只以 YAML 为准，修改后必须 `codex-pool sync --confirm` 和 `codex-pool validate`。长期判定见 `docs/reference/platform-infra.md`。
+- 上游错误反复触发：默认错误冷却按严重程度分层；临时问题可从 10 分钟起步，网关/服务不可用/过载/模型路由类应更长，认证/权限/配额/账号状态类使用最长冷却。`Recovered upstream error ...`、`Bad Gateway`、`Gateway Timeout`、Cloudflare `524`、Codex-facing `Upstream request failed`、`Unknown error`、`context deadline exceeded`、`context canceled`、`model_not_found`、`No available channel for model`、大上下文 `413` 和 `openai_error` 这类稳定包装文案都应留在对应 5xx/413 YAML 冷却政策里，特别是 compact 链路里上游 524 可能最终表现为客户端 502/504 + `Unknown error`。具体数值只以 YAML 为准，修改后必须 `codex-pool sync --confirm` 和 `codex-pool validate`。长期判定见 `docs/reference/platform-infra.md`。
 - Codex auto compact 后丢上下文：先确认 YAML `localCodex` 是否声明启用 WSv2；若启用，再确认本机 `~/.codex/config.toml` 是否有 `supports_websockets = true` 和 `responses_websockets_v2 = true`，并看 `codex-pool validate` 的 WSv2 candidate 和 Sub2API 日志里的 `transport=responses_websockets_v2`。若 YAML 当前禁用 WSv2，则按 HTTP Responses 稳定性排查，不把旧 WS 口径当成验收要求。
 - Codex smoke 有 reconnect/1013：这是上游并发/可用性问题，和 HTTP-only compact context-loss 分开处理；记录 session/log 证据并关联专项 issue，不要用运行时手补覆盖 YAML 容量。

@@ -28,7 +28,7 @@ pool:
        durationMinutes: 10
        description: Transient upstream server failures should start with a ten-minute cooldown and prefer another account.
      - statusCode: 502
-        keywords: [capacity, overloaded, temporarily unavailable, temporary, upstream, bad gateway, upstream request failed, websocket dial, handshake response, recovered upstream error]
+        keywords: [capacity, overloaded, temporarily unavailable, temporary, upstream, bad gateway, upstream request failed, unknown error, context deadline exceeded, context canceled, websocket dial, handshake response, recovered upstream error]
        durationMinutes: 30
        description: Gateway upstream failures, including recovered upstream error wrappers, should cool down longer.
      - statusCode: 413
@@ -44,7 +44,7 @@ pool:
        durationMinutes: 30
        description: Gateway timeout responses should cool down the selected account so another account can handle the next request.
      - statusCode: 524
-        keywords: [timeout, a timeout occurred, cloudflare, gateway timeout, upstream, context deadline exceeded]
+        keywords: [timeout, a timeout occurred, cloudflare, gateway timeout, upstream, upstream request failed, unknown error, context deadline exceeded, context canceled, recovered upstream error]
        durationMinutes: 30
        description: Cloudflare 524 timeout responses should cool down the selected account so another account can handle the next request.
      - statusCode: 529
@@ -35,7 +35,7 @@
 - Do not change account membership, priority, capacity, load factor, WebSocket mode, or other routing policy from inference alone. Unless the user explicitly asks for a configuration change, first preserve the current YAML, collect provenance and runtime evidence, and write the finding to the relevant issue or runbook before proposing a change.
 - `profiles.entries[].tempUnschedulable` may override the pool default for one account. The CLI renders it into Sub2API credentials as `temp_unschedulable_enabled` and `temp_unschedulable_rules`; rules match HTTP status plus response-body keywords and place only that account into a temporary unschedulable cooldown.
 - Codex account-state or quota prompts that stop a task and ask the operator to switch accounts belong in `pool.defaultTempUnschedulable`, not in account membership, priority, capacity, load factor, WebSocket mode, or `pool_mode`. Keep stable body phrases such as weekly-limit and `/status` prompts in both the 403 account-state rule and the 429 quota/rate-limit rule, then run `codex-pool sync --confirm` and `codex-pool validate`. The validation evidence must include runtime temporary-unschedulable alignment for each managed account, not only successful group-level `/v1/models` or `/v1/responses` smoke output.
- Upstream model-routing failures that surface as 503 responses, such as `model_not_found` or "no available channel for model" wrappers, also belong in `pool.defaultTempUnschedulable`. Gateway timeout failures that surface as 504 responses, including `Gateway Timeout`, `Unknown error`, `Upstream request failed`, `context deadline exceeded`, `context canceled`, or recovered upstream-error wrappers, belong in the same YAML policy. They are not membership, priority, capacity, load factor, WebSocket mode, or User-Agent decisions by themselves. After adding stable body phrases, run `codex-pool sync --confirm` and `codex-pool validate`, and verify the affected account's runtime status-specific rule includes the new keywords.
+- Upstream model-routing failures that surface as 503 responses, such as `model_not_found` or "no available channel for model" wrappers, also belong in `pool.defaultTempUnschedulable`. Gateway and timeout failures that surface as 502, 504, or 524 responses, including `Gateway Timeout`, `Unknown error`, `Upstream request failed`, `context deadline exceeded`, `context canceled`, or recovered upstream-error wrappers, belong in the same YAML policy. This is especially important for compact requests, where an upstream Cloudflare 524 may eventually reach Codex as a 502/504 unknown-error wrapper after failover or client cancellation. They are not membership, priority, capacity, load factor, WebSocket mode, or User-Agent decisions by themselves. After adding stable body phrases, run `codex-pool sync --confirm` and `codex-pool validate`, and verify the affected account's runtime status-specific rule includes the new keywords.
 - `profiles.entries[].openaiResponsesWebSocketsV2Mode` is the account-level Responses WebSocket v2 switch for OpenAI-compatible upstreams that require WebSocket transport. Allowed values are `off`, `ctx_pool`, and `passthrough`; omit the field unless that upstream needs it.
 - `profiles.entries[].upstreamUserAgent` is an optional account-level upstream request User-Agent override. Use it only for upstreams that require a Codex CLI compatible User-Agent; keep the value YAML-controlled and newline-free.
 - `publicExposure` controls the optional FRP bridge from master server to the G14 ClusterIP service.
@@ -59,6 +59,9 @@ if (parsed.pool?.defaultTempUnschedulable?.enabled === true) {
  const gateway502Rule = rules.find((rule) => rule.statusCode === 502);
  const gateway502Keywords = new Set((gateway502Rule?.keywords ?? []).map((keyword) => keyword.toLowerCase()));
  assertCondition(gateway502Keywords.has("recovered upstream error"), "502 temporary-unschedulable rule must catch recovered upstream error wrappers", gateway502Rule);
+  for (const keyword of ["unknown error", "upstream request failed", "context canceled"]) {
+    assertCondition(gateway502Keywords.has(keyword), "502 temporary-unschedulable rule must catch compact gateway wrappers", { keyword, gateway502Rule });
+  }
  const largeContext413Rule = rules.find((rule) => rule.statusCode === 413);
  const largeContext413Keywords = new Set((largeContext413Rule?.keywords ?? []).map((keyword) => keyword.toLowerCase()));
  for (const keyword of ["openai_error", "context length", "maximum context"]) {
@@ -71,7 +74,7 @@ if (parsed.pool?.defaultTempUnschedulable?.enabled === true) {
  }
  const cloudflare524Rule = rules.find((rule) => rule.statusCode === 524);
  const cloudflare524Keywords = new Set((cloudflare524Rule?.keywords ?? []).map((keyword) => keyword.toLowerCase()));
-  for (const keyword of ["timeout", "a timeout occurred", "cloudflare"]) {
+  for (const keyword of ["timeout", "a timeout occurred", "cloudflare", "unknown error", "upstream request failed", "context canceled"]) {
    assertCondition(cloudflare524Keywords.has(keyword), "524 temporary-unschedulable rule must catch Cloudflare timeout wrappers", { keyword, cloudflare524Rule });
  }
  const accountState403Rule = rules.find((rule) => rule.statusCode === 403);
@@ -29,6 +29,7 @@ assertCondition(!("pool_mode" in credentials), "pool_mode must not be enabled be
 assertCondition(!("api_key" in credentials) && !("base_url" in credentials), "temporary-unschedulable rendering must not include secrets or endpoints", credentials);
 const accountState403Rule = rules.find((rule) => rule.error_code === 403);
 const quota429Rule = rules.find((rule) => rule.error_code === 429);
+const gateway502Rule = rules.find((rule) => rule.error_code === 502);
 const serviceUnavailable503Rule = rules.find((rule) => rule.error_code === 503);
 const gatewayTimeout504Rule = rules.find((rule) => rule.error_code === 504);
 const largeContext413Rule = rules.find((rule) => rule.error_code === 413);
@@ -43,10 +44,13 @@ for (const keyword of ["model_not_found", "no available channel for model"]) {
 for (const keyword of ["openai_error", "context length", "maximum context"]) {
  assertCondition(largeContext413Rule?.keywords?.includes(keyword), "413 rendered rule must catch large-context upstream failures", { keyword, largeContext413Rule });
 }
+for (const keyword of ["unknown error", "upstream request failed", "context canceled"]) {
+  assertCondition(gateway502Rule?.keywords?.includes(keyword), "502 rendered rule must catch compact gateway wrappers", { keyword, gateway502Rule });
+}
 for (const keyword of ["gateway timeout", "unknown error", "context deadline exceeded"]) {
  assertCondition(gatewayTimeout504Rule?.keywords?.includes(keyword), "504 rendered rule must preserve gateway-timeout cooldown keyword", { keyword, gatewayTimeout504Rule });
 }
-for (const keyword of ["timeout", "a timeout occurred", "cloudflare"]) {
+for (const keyword of ["timeout", "a timeout occurred", "cloudflare", "unknown error", "upstream request failed", "context canceled"]) {
  assertCondition(cloudflare524Rule?.keywords?.includes(keyword), "524 rendered rule must catch Cloudflare timeout wrappers", { keyword, cloudflare524Rule });
 }

@@ -694,7 +694,7 @@ export function defaultCodexTempUnschedulablePolicy(): CodexTempUnschedulablePol
      },
      {
        statusCode: 502,
-        keywords: ["capacity", "overloaded", "temporarily unavailable", "temporary", "upstream", "bad gateway", "upstream request failed", "websocket dial", "handshake response", "recovered upstream error"],
+        keywords: ["capacity", "overloaded", "temporarily unavailable", "temporary", "upstream", "bad gateway", "upstream request failed", "unknown error", "context deadline exceeded", "context canceled", "websocket dial", "handshake response", "recovered upstream error"],
        durationMinutes: 30,
        description: "Gateway upstream failures, including recovered upstream error wrappers, should cool down longer.",
      },
@@ -718,7 +718,7 @@ export function defaultCodexTempUnschedulablePolicy(): CodexTempUnschedulablePol
      },
      {
        statusCode: 524,
-        keywords: ["timeout", "a timeout occurred", "cloudflare", "gateway timeout", "upstream", "context deadline exceeded"],
+        keywords: ["timeout", "a timeout occurred", "cloudflare", "gateway timeout", "upstream", "upstream request failed", "unknown error", "context deadline exceeded", "context canceled", "recovered upstream error"],
        durationMinutes: 30,
        description: "Cloudflare 524 timeout responses should cool down the selected account so another account can handle the next request.",
      },
@@ -2326,6 +2326,68 @@ def request_log_evidence(request_id):
        "logsStderr": text(proc.stderr, 1000),
    }

+def recent_compact_gateway_evidence():
+    proc = kubectl(["-n", NAMESPACE, "logs", "deployment/sub2api", "--since=6h", "--tail=2500"])
+    stdout = proc.stdout.decode("utf-8", errors="replace")
+    failures = []
+    successes = []
+    failovers = []
+    final_errors = []
+    context_canceled = []
+    for line in stdout.splitlines():
+        if "/responses/compact" not in line and "remote_compact" not in line:
+            continue
+        json_start = line.find("{")
+        if json_start < 0:
+            continue
+        try:
+            item = json.loads(line[json_start:])
+        except Exception:
+            continue
+        path = item.get("path")
+        entry = {
+            "requestId": item.get("request_id"),
+            "clientRequestId": item.get("client_request_id"),
+            "accountId": item.get("account_id"),
+            "statusCode": item.get("status_code"),
+            "upstreamStatus": item.get("upstream_status"),
+            "latencyMs": item.get("latency_ms"),
+            "path": path,
+        }
+        if "codex.remote_compact.failed" in line:
+            failures.append(entry)
+        elif "codex.remote_compact.succeeded" in line:
+            successes.append(entry)
+        elif "upstream_failover_switching" in line and path == "/responses/compact":
+            failovers.append({
+                **entry,
+                "switchCount": item.get("switch_count"),
+                "maxSwitches": item.get("max_switches"),
+            })
+        elif "http request completed" in line and path == "/responses/compact" and isinstance(item.get("status_code"), int) and item.get("status_code") >= 400:
+            final_errors.append(entry)
+        if "context canceled" in line and path == "/responses/compact":
+            context_canceled.append(entry)
+    return {
+        "ok": True,
+        "degraded": len(failures) > 0 or len(final_errors) > 0 or len(context_canceled) > 0,
+        "window": "6h",
+        "tailLines": 2500,
+        "failureCount": len(failures),
+        "successCount": len(successes),
+        "failoverCount": len(failovers),
+        "finalErrorCount": len(final_errors),
+        "contextCanceledCount": len(context_canceled),
+        "recentFailures": failures[-5:],
+        "recentSuccesses": successes[-5:],
+        "recentFailovers": failovers[-8:],
+        "recentFinalErrors": final_errors[-5:],
+        "recentContextCanceled": context_canceled[-5:],
+        "logsExitCode": proc.returncode,
+        "logsStderr": text(proc.stderr, 1000),
+        "valuesPrinted": False,
+    }
+
 def validate_gateway_responses(api_key):
    request_id = "unidesk-codex-pool-validate-" + str(int(time.time() * 1000))
    payload = {
@@ -2427,7 +2489,7 @@ def summarize_temp_unschedulable_rules(rules):
        "errorCode": rule.get("error_code"),
        "durationMinutes": rule.get("duration_minutes"),
        "keywordCount": len(rule.get("keywords") or []),
-        "keywords": (rule.get("keywords") or [])[:8],
+        "keywords": rule.get("keywords") or [],
        "hasDescription": bool(rule.get("description")),
    } for rule in rules]

@@ -2677,9 +2739,10 @@ def run_sync():
    owner_concurrency = ensure_pool_owner_concurrency(token, api_key_result["userId"])
    gateway = validate_gateway(api_key)
    responses_smoke = validate_gateway_responses(api_key)
+    compact_evidence = recent_compact_gateway_evidence()
    return {
        "ok": gateway["ok"] is True and responses_smoke["ok"] is True and owner_concurrency["ok"] is True and capacity_status["ok"] is True and load_factor_status["ok"] is True and ws_v2_status["ok"] is True and temp_unschedulable_status["ok"] is True,
-        "degraded": bool(responses_smoke.get("degraded")),
+        "degraded": bool(responses_smoke.get("degraded")) or bool(compact_evidence.get("degraded")),
        "mode": "sync",
        "namespace": NAMESPACE,
        "serviceDns": SERVICE_DNS,
@@ -2713,7 +2776,7 @@ def run_sync():
        },
        "ownerBalance": owner_balance,
        "ownerConcurrency": owner_concurrency,
-        "validation": {"gatewayModels": gateway, "gatewayResponses": responses_smoke},
+        "validation": {"gatewayModels": gateway, "gatewayResponses": responses_smoke, "gatewayCompactRecent": compact_evidence},
    }

 def run_validate():
@@ -2733,9 +2796,10 @@ def run_validate():
    temp_unschedulable_status = account_temp_unschedulable_status(token)
    gateway = validate_gateway(api_key)
    responses_smoke = validate_gateway_responses(api_key)
+    compact_evidence = recent_compact_gateway_evidence()
    return {
        "ok": gateway["ok"] is True and responses_smoke["ok"] is True and (owner_concurrency is None or owner_concurrency["ok"] is True) and capacity_status["ok"] is True and load_factor_status["ok"] is True and ws_v2_status["ok"] is True and temp_unschedulable_status["ok"] is True,
-        "degraded": bool(responses_smoke.get("degraded")),
+        "degraded": bool(responses_smoke.get("degraded")) or bool(compact_evidence.get("degraded")),
        "mode": "validate",
        "namespace": NAMESPACE,
        "serviceDns": SERVICE_DNS,
@@ -2754,7 +2818,7 @@ def run_validate():
        "loadFactor": load_factor_status,
        "webSocketsV2": ws_v2_status,
        "tempUnschedulable": temp_unschedulable_status,
-        "validation": {"gatewayModels": gateway, "gatewayResponses": responses_smoke},
+        "validation": {"gatewayModels": gateway, "gatewayResponses": responses_smoke, "gatewayCompactRecent": compact_evidence},
    }

 try: