From 01b19d92382e8ed6402c4357a2ffeb6a6b9a652d Mon Sep 17 00:00:00 2001 From: Codex Date: Thu, 11 Jun 2026 15:44:53 +0000 Subject: [PATCH] fix: disable sub2api builtin temp unschedulable --- .agents/skills/unidesk-sub2api/SKILL.md | 25 ++-- config/platform-infra/sub2api-codex-pool.yaml | 2 +- docs/reference/platform-infra.md | 21 ++-- scripts/src/platform-infra-sub2api-codex.ts | 111 ++++-------------- 4 files changed, 46 insertions(+), 113 deletions(-) diff --git a/.agents/skills/unidesk-sub2api/SKILL.md b/.agents/skills/unidesk-sub2api/SKILL.md index 1d2f28f8..9fde20aa 100644 --- a/.agents/skills/unidesk-sub2api/SKILL.md +++ b/.agents/skills/unidesk-sub2api/SKILL.md @@ -74,16 +74,15 @@ bun scripts/cli.ts platform-infra sub2api codex-pool cleanup-probes --confirm - `pool.apiKeySecretName` / `pool.apiKeySecretKey`: 统一消费 API key 的 k3s Secret 位置,默认 `platform-infra/sub2api-codex-pool-api-key.API_KEY`。 - `pool.minOwnerBalanceUsd`: pool key owner 最低余额,sync/validate 会补齐。 - `pool.minOwnerConcurrency`: 可选统一消费 API key owner 最低并发;省略时 CLI 自动使用所有已解析账号 capacity 的总和,sync/validate 会补齐。显式 YAML 值只作为 override,仍必须不小于账号 capacity 总和;未显式写 `profiles.entries[].capacity` 的账号会使用 `pool.defaultAccountCapacity` 参与求和,不要用提高某个 provider capacity 来掩盖用户并发层 WS 1013。 -- `pool.defaultTempUnschedulable`: 默认账号级临时下线规则;只声明 Sub2API 已支持的错误路径能力,用于在上游返回容量、限流、overload、service unavailable、gateway timeout、稳定模型路由错误或认证状态异常时,让 Sub2API 冷却该账号并切换到同组其他账号。不要用 YAML、UniDesk CLI、k8s 热补或本地 fork 魔改 Sub2API 不支持的行为。 -- `pool.defaultTempUnschedulable` 的 `durationMinutes` 等业务数值只从 YAML 读取并同步到 Sub2API;不要在 TypeScript 默认值、schema、合同测试或文档 prose 中另写一份上限、下限或分层策略。 -- 自动冻结/切号失败时,必须修复 `temp_unschedulable` 与 failover 机制本身,并用运行时证据证明失败账号被临时冻结且请求切到其他可调度账号;禁止通过手动禁用账号、删除账号、移除 YAML entry、降低 membership 或临时改调度策略来替代自动恢复。只有明确的上游退役或所有权变更才走删除/禁用上游流程。 -- YAML 只选择和配置 Codex 上游,不声明 `schedulable` 长期字段;`schedulable=true` 只能作为 `codex-pool sync --confirm` 的过程控制基线恢复。自动冻结必须表现为 `temp_unschedulable_until` / `temp_unschedulable_reason`,避免把永久不可调度误当成自动冻结。 +- 
`pool.defaultTempUnschedulable`: Sub2API 内置临时不可调度开关和 YAML 规则列表。当前要求是 `enabled=false`,YAML 保留规则用于以后显式恢复;sync 按 WebUI 关闭开关语义删除运行时 `temp_unschedulable_enabled` / `temp_unschedulable_rules` credentials 字段,不让 Sub2API 内置规则参与调度。 +- `pool.defaultTempUnschedulable` 与外部 `sentinel.*` 分开配置、互不驱动。内置开关关闭不影响哨兵;哨兵配置变化也不能隐式打开内置规则。 +- YAML 只选择和配置 Codex 上游,不声明 `schedulable` 长期字段;`schedulable=true` 只能作为 `codex-pool sync --confirm` 对未处于哨兵隔离账号的过程控制基线恢复。 - `profiles.entries`: 从 master `~/.codex/` 选择上游 profile 并映射到 Sub2API account。 - `profiles.entries[].capacity`: 可选 per-account concurrency override;不写则使用 `pool.defaultAccountCapacity`。具体数值只以 `config/platform-infra/sub2api-codex-pool.yaml` 为准,skill 和长期参考只描述规则,不重复写当前值。 - `profiles.entries[].loadFactor`: 可选 per-account Sub2API `load_factor` override;不写则使用 `pool.defaultAccountLoadFactor`。具体数值只以 `config/platform-infra/sub2api-codex-pool.yaml` 为准,修改后必须 `codex-pool sync --confirm` 和 `codex-pool validate`。 - `profiles.entries[].trustUpstream`: 可选账号级哨兵信任标记;默认 `false`。可信账号使用 `sentinel.cadence.trustedSuccessMaxIntervalMinutes` 作为连续成功后的最大探测退避,不可信账号使用 `sentinel.cadence.untrustedSuccessMaxIntervalMinutes`。它只影响哨兵探测频率和状态可见性,不改变 Sub2API account priority/capacity/loadFactor。 - 除非用户明确要求修改配置,不要仅凭推断改账号 membership、priority、capacity、loadFactor、WebSocket mode 或其他调度策略;先保留 YAML,完成 provenance/runtime evidence 溯源,并把结论写回相关 issue 或 runbook 后再提出变更。 -- `profiles.entries[].tempUnschedulable`: 可选 per-account 临时下线规则覆盖;字段语义以 `docs/reference/platform-infra.md` 为权威。上游 Sub2API 不支持的成功体分类、调度策略或账号冷却行为不要在这里声明。 +- `profiles.entries[].tempUnschedulable`: 可选 per-account Sub2API 内置临时不可调度覆盖;当前同样应保持开关关闭,规则只保留在 YAML,不作为调度健康机制。 - `profiles.entries[].openaiResponsesWebSocketsV2Mode`: 需要 Responses WebSocket v2 的上游才设置,值为 `off`、`ctx_pool` 或 `passthrough`。 - `profiles.entries[].upstreamUserAgent`: 少数要求 Codex CLI User-Agent 的上游才设置,不能含换行。 - `sentinel.monitor.enabled`: 账号级 marker 哨兵监控开关;开启后 `codex-pool sync --confirm` 会在 `platform-infra` 创建/更新 k8s CronJob、ConfigMap、Secret、ServiceAccount、Role 和 
RoleBinding。CronJob 直打 YAML-managed 上游账号的 OpenAI Responses `gpt-5.5`,用确定 marker 作为唯一健康标准,并在独立 state ConfigMap 中记录 token/cost 账本。 @@ -115,7 +114,7 @@ Codex 启动时反复出现 WebSocket reconnect、HTTPS fallback、`websocket cl 1. 在 master `~/.codex/` 准备带后缀的上游 profile 文件,例如 `config.toml.<suffix>` 和 `auth.json.<suffix>`;禁止覆盖默认 `config.toml` / `auth.json`。 2. 在 `config/platform-infra/sub2api-codex-pool.yaml` 添加 `profiles.entries` 项,指定 `profile`、`accountName`、`configFile`、`authFile`。 -3. 如需要,给该项加 `priority`、`capacity`、`loadFactor`、`trustUpstream`、`tempUnschedulable`、`openaiResponsesWebSocketsV2Mode` 或 `upstreamUserAgent`;capacity/loadFactor/信任退避的具体数值只写在 YAML。 +3. 如需要,给该项加 `priority`、`capacity`、`loadFactor`、`trustUpstream`、`openaiResponsesWebSocketsV2Mode` 或 `upstreamUserAgent`;capacity/loadFactor/信任退避的具体数值只写在 YAML。只有显式恢复 Sub2API 内置临时不可调度时才添加 per-account `tempUnschedulable`。 4. 如果新增账号会提高声明 capacity 总和,默认让省略的 `pool.minOwnerConcurrency` 继续按 capacity 总和自动解析;只有 YAML 已经显式写了该 override 时,才同步提高到不低于总 capacity,或删除 override 回到自动解析。 5. 跑 `codex-pool plan`,确认 profile 可读、`base_url` 和 API key 来源有效,且 stdout 未泄露完整 key。 6. 跑 `codex-pool sync --confirm`。 @@ -125,7 +124,7 @@ Codex 启动时反复出现 WebSocket reconnect、HTTPS fallback、`websocket cl ## 删除上游 -删除上游只用于明确退役、凭据所有权变更或用户明确要求移除 provider;不能作为上游 5xx、compact 失败、限流、模型路由失败或自动冻结/切号缺陷的恢复手段。 +删除上游只用于明确退役、凭据所有权变更或用户明确要求移除 provider;不能作为上游 5xx、compact 失败、限流、模型路由失败或哨兵隔离/恢复问题的处理手段。 1. 从 `config/platform-infra/sub2api-codex-pool.yaml` 删除对应 `profiles.entries` 项。 2. 
跑 `codex-pool plan` 检查 desired 列表。 @@ -173,7 +172,7 @@ bun scripts/cli.ts platform-infra sub2api codex-pool configure-local --confirm - `sub2api status`:Deployment/StatefulSet/Service/Secret/NetworkPolicy 可见,运行镜像与 YAML 一致,`NetworkPolicy/allow-all` 符合 `podSelector: {}`、Ingress/Egress 全放行。 - `sub2api validate`:app、PostgreSQL、Redis、service proxy、`NetworkPolicy/allow-all` 和临时跨 Pod PostgreSQL/Redis 连通性检查通过。 -- `codex-pool validate`:统一 key 的 `GET /v1/models` 成功,并用 `localCodex.responsesSmokeModel` 跑一次小的 `POST /v1/responses` smoke;owner balance / owner concurrency 已满足 YAML 最小值,capacity、WebSocket v2 和 temporary-unschedulable 运行时状态与 YAML 对齐;`validation.gatewayResponsesRecent` 汇总最近 6 小时普通 `/responses` 和 `/v1/responses` 的 failover、forward failure、最终 4xx/5xx、慢 final error 与 `context canceled` 证据,`validation.gatewayCompactRecent` 单独汇总 `/responses/compact` 证据。若当前 Responses smoke `ok=true` 但 recent 字段 `degraded=true`,先区分是历史窗口残留还是新的 request id 正在失败;长期判定见 `docs/reference/platform-infra.md`。 +- `codex-pool validate`:统一 key 的 `GET /v1/models` 成功,并用 `localCodex.responsesSmokeModel` 跑一次小的 `POST /v1/responses` smoke;owner balance / owner concurrency 已满足 YAML 最小值,capacity、WebSocket v2、Sub2API 内置 temporary-unschedulable 开关/规则和 sentinel runtime 状态与 YAML 对齐;`validation.gatewayResponsesRecent` 汇总最近 6 小时普通 `/responses` 和 `/v1/responses` 的 failover、forward failure、最终 4xx/5xx、慢 final error 与 `context canceled` 证据,`validation.gatewayCompactRecent` 单独汇总 `/responses/compact` 证据。若当前 Responses smoke `ok=true` 但 recent 字段 `degraded=true`,先区分是历史窗口残留还是新的 request id 正在失败;长期判定见 `docs/reference/platform-infra.md`。 - 若 `publicExposure.enabled=true`,确认 FRP path 可用;`expose --confirm` 会用未带 key 的 public `/v1/models` 401 作为网关可达性探针。 如果要证明真实模型请求可用,使用最小 `/v1/responses` 或等价 Codex smoke。不要把 group-level `/v1/models` 成功解释成每个上游 account 都健康。 @@ -192,11 +191,11 @@ bun scripts/cli.ts platform-infra sub2api codex-pool configure-local --confirm - 上游需要 WebSocket v2:先做 direct Codex WSv2 probe;通过后才给该 profile 配 
`openaiResponsesWebSocketsV2Mode: ctx_pool|passthrough` 并跑 `sync --confirm`;把它当 capability candidate,容量仍以 YAML 中的 `capacity` 或默认值为准。 - Codex 启动 WebSocket 回退:用原入口 Codex smoke 复现,再用 bounded Sub2API 日志确认 account;对 WS handshake 4xx/5xx、`openai.websocket_account_select_failed` 或 close-before-`response.completed` 的账号关闭 YAML WSv2 能力后同步。若没有剩余 WSv2-capable account,把 `localCodex.supportsWebSockets` 和 `localCodex.responsesWebSocketsV2` 一起关掉,不把临时可用性推断写成调度配置。 - 上游要求 Codex User-Agent:只给该 profile 配 `upstreamUserAgent`,跑 `sync --confirm`。 -- 上游报 capacity/rate-limit/overload/Bad Gateway/Gateway Timeout 后没有切号或频繁先失败再恢复:先确认 `codex-pool validate` 里 `tempUnschedulable.ok=true` 且目标 account `runtimeEnabled=true`、规则数符合 YAML;再看 `validation.gatewayResponses.evidence.failovers` 的 account/upstream status。若 mismatch,跑 `codex-pool sync --confirm`;若 runtime 规则已对齐但仍不冻结或不切号,继续修 Sub2API 自动冻结/failover 能力并复测,不要手工 patch Sub2API credentials,也不要手动禁用、删除或从 YAML 移除问题账号来绕过机制缺陷。 +- 上游报 capacity/rate-limit/overload/Bad Gateway/Gateway Timeout 后没有隔离或频繁先失败再恢复:先看 `codex-pool sentinel-report` 的 marker、动作、冻结 TTL 和下一次 probe;必要时用 `codex-pool sentinel-probe --account <accountName> --confirm` 立即测量。不要通过开启 Sub2API 内置临时不可调度、手动禁用账号、删除账号或从 YAML 移除问题账号来替代哨兵隔离/恢复。 - `codex-pool sync --confirm` 或 `codex-pool validate` 超时:先区分 CLI 传输超时和 Sub2API 运行失败。受控 CLI 应返回远端作业进度和 stdout/stderr tail;如果只是低层 `trans` 60s 超时,不能据此判定 Sub2API failover 不工作。改用或修复 CLI 的远端 job/poll 路径后重跑,并以最终结构化结果作为证据。 -- Codex 报 weekly-limit、`less than 10% of your weekly limit left`、`Run /status for a breakdown` 等账号状态/软配额提示并要求切号:如果上游以 403/429 等错误状态返回,把稳定 body 关键词放进 `pool.defaultTempUnschedulable` 的对应规则,跑 `codex-pool sync --confirm`,再用 `codex-pool validate` 确认每个 managed account 的 runtime 规则包含这些关键词。若该文案是 HTTP 200 成功内容,不要写 Sub2API 原生 YAML 200 规则、不要热补 Sub2API、不要绕过 sync;启用账号级哨兵时由 marker-only 哨兵按非 marker 响应统一指数冻结。 -- 上游 400/503 响应体出现 `invalid_encrypted_content`、`bad_response_status_code`、`invalid_request_error` + 稳定 unsupported-model 文案、unsupported-model、`暂不支持` / 
`可用模型`、`model_not_found`、`No available channel for model ...` 或同类稳定模型路由 / Responses encrypted-content 兼容性失败:把稳定 body 关键词放进 `pool.defaultTempUnschedulable` 的对应 400/503 规则,跑 `codex-pool sync --confirm`,再用 `codex-pool validate` 确认目标 account 的 runtime rule 包含这些关键词;不要用 account membership、priority、capacity、loadFactor、WebSocket mode 或 User-Agent 改动掩盖该错误族。 -- 上游错误反复触发:`invalid_encrypted_content`、unsupported-model、`Recovered upstream error ...`、`Bad Gateway`、`Gateway Timeout`、Cloudflare `524`、Codex-facing `Upstream request failed`、`Unknown error`、`context deadline exceeded`、`context canceled`、`model_not_found`、`No available channel for model`、大上下文 `413` 和 `openai_error` 这类稳定包装文案都应留在对应 YAML 冷却政策里,特别是普通 `/responses` 与 compact 链路里上游兼容性错误或 524 可能最终表现为客户端 502/504 + `Unknown error`。冷却时长等具体数值只以 YAML 为准,修改后只需要 `codex-pool plan`、`codex-pool sync --confirm` 和 `codex-pool validate`;不要为数值调整新增合同测试、代码硬范围或长期参考数值口径。长期判定见 `docs/reference/platform-infra.md`。 +- Codex 报 weekly-limit、`less than 10% of your weekly limit left`、`Run /status for a breakdown` 等账号状态/软配额提示并要求切号:不要把新关键词写成 Sub2API 内置临时不可调度策略来恢复可用性;由 marker-only 哨兵按非 marker 响应统一冻结,并用 `sentinel-report` / `sentinel-probe` 验证。 +- 上游 400/503 响应体出现 `invalid_encrypted_content`、`bad_response_status_code`、`invalid_request_error` + 稳定 unsupported-model 文案、unsupported-model、`暂不支持` / `可用模型`、`model_not_found`、`No available channel for model ...` 或同类稳定模型路由 / Responses encrypted-content 兼容性失败:按哨兵 marker 失败处理,不用 account membership、priority、capacity、loadFactor、WebSocket mode、User-Agent 或 Sub2API 内置临时不可调度改动掩盖该错误族。 +- 上游错误反复触发:`invalid_encrypted_content`、unsupported-model、`Recovered upstream error ...`、`Bad Gateway`、`Gateway Timeout`、Cloudflare `524`、Codex-facing `Upstream request failed`、`Unknown error`、`context deadline exceeded`、`context canceled`、`model_not_found`、`No available channel for model`、大上下文 `413` 和 `openai_error` 这类稳定包装文案都由外部哨兵和运行日志证据处理;内置临时不可调度规则保留但默认关闭,不作为当前恢复路径。长期判定见 `docs/reference/platform-infra.md`。 - Codex auto compact 后丢上下文:先确认 
YAML `localCodex` 是否声明启用 WSv2;若启用,再确认本机 `~/.codex/config.toml` 是否有 `supports_websockets = true` 和 `responses_websockets_v2 = true`,并看 `codex-pool validate` 的 WSv2 candidate 和 Sub2API 日志里的 `transport=responses_websockets_v2`。若 YAML 当前禁用 WSv2,则按 HTTP Responses 稳定性排查,不把旧 WS 口径当成验收要求。 - Codex smoke 有 reconnect/1013:这是上游并发/可用性问题,和 HTTP-only compact context-loss 分开处理;记录 session/log 证据并关联专项 issue,不要用运行时手补覆盖 YAML 容量。 @@ -208,5 +207,5 @@ bun scripts/cli.ts platform-infra sub2api codex-pool configure-local --confirm - 不给 Sub2API manifest 添加 CPU/memory limits,除非有新的 YAML 化明确决策。 - 不打印完整 API key、admin password 或 Secret 明文。 - 不把普通上游增删做成代码变更、CI/CD、feature flag 或兼容双路径。 -- 不把手动禁用账号、删除账号、移除 YAML entry、降低 membership 或临时改 priority/capacity/loadFactor 当作自动冻结/切号失败的修复。 +- 不把手动禁用账号、删除账号、移除 YAML entry、降低 membership、临时改 priority/capacity/loadFactor 或打开 Sub2API 内置临时不可调度当作哨兵隔离/恢复问题的修复。 - 不魔改 Sub2API:Sub2API 本身不支持的能力就不做,不通过 UniDesk 脚本、k8s 原地热补、本地 fork、YAML 伪声明或隐藏 fallback 代替上游实现。 diff --git a/config/platform-infra/sub2api-codex-pool.yaml b/config/platform-infra/sub2api-codex-pool.yaml index dabbfe86..8dc79644 100644 --- a/config/platform-infra/sub2api-codex-pool.yaml +++ b/config/platform-infra/sub2api-codex-pool.yaml @@ -8,7 +8,7 @@ pool: defaultAccountCapacity: 10 defaultAccountLoadFactor: 10 defaultTempUnschedulable: - enabled: true + enabled: false rules: - statusCode: 400 keywords: [invalid_encrypted_content, encrypted content, could not be verified, could not be decrypted, bad_response_status_code, model_not_found, no available channel for model, unsupported, not supported, not support, 暂不支持, 可用模型] diff --git a/docs/reference/platform-infra.md b/docs/reference/platform-infra.md index a313abd4..1bff5891 100644 --- a/docs/reference/platform-infra.md +++ b/docs/reference/platform-infra.md @@ -26,9 +26,9 @@ - `pool.groupName` names the Sub2API group that represents the pool. - `pool.apiKeySecretName` and `pool.apiKeySecretKey` name the k3s Secret that stores the single consumer API key. 
- `pool.minOwnerConcurrency` is optional; when omitted, the CLI automatically uses the sum of all resolved account capacities as the minimum concurrency for the Sub2API user that owns the unified consumer API key. A YAML value is only an explicit override and must still be at least that capacity sum, so the shared key does not fail requests or WS sessions at the user-concurrency layer. "Resolved" means each account's explicit `profiles.entries[].capacity` or, when omitted, `pool.defaultAccountCapacity`. Do not compensate for owner-concurrency 1013 errors by pinning capacity to one provider. -- `pool.defaultTempUnschedulable` declares Sub2API account-level temporary unschedulable rules for capabilities that Sub2API itself already supports. Keep 429/overload/capacity, service-unavailable, gateway timeout, and stable model-routing failures in this YAML policy so the scheduler can cool down a failing account and choose another candidate instead of hard-pinning one provider. Do not declare unsupported Sub2API behavior in YAML as a promise that UniDesk code or runtime patches should emulate. -- When a managed upstream repeatedly causes `/v1/responses` or `/responses/compact` failures, the required fix path is to make automatic temporary-unschedulable and failover work, then verify it with runtime evidence. Do not restore availability by manually disabling an account, deleting a managed account, removing its YAML entry, lowering membership, or otherwise changing routing policy merely to avoid the failing upstream; those actions are allowed only for an explicit upstream retirement or ownership change. -- Codex accounts selected by YAML do not declare `schedulable` as durable configuration. `schedulable=true` is a `codex-pool sync --confirm` process-control baseline for UniDesk-managed accounts, not a YAML field. 
Account cooling must be represented by `temp_unschedulable_until` / `temp_unschedulable_reason`, so validation can distinguish real automatic cooldown from stale manual unschedulable state. +- `pool.defaultTempUnschedulable` is the Sub2API built-in temporary-unschedulable switch plus its YAML rule list. UniDesk keeps this built-in switch disabled by default while preserving the rule list in YAML for explicit future recovery; sync follows the WebUI close-switch behavior by omitting the runtime `temp_unschedulable_enabled` and `temp_unschedulable_rules` credential fields. The external account-level sentinel is the active account health and freeze/restore mechanism. +- The built-in temporary-unschedulable configuration and external `sentinel.*` configuration are separate control surfaces. Changing `pool.defaultTempUnschedulable.enabled` or `profiles.entries[].tempUnschedulable` must not change sentinel cadence, marker health semantics, or sentinel quarantine state; changing sentinel settings must not implicitly enable Sub2API built-in temporary-unschedulable rules. +- Codex accounts selected by YAML do not declare `schedulable` as durable configuration. `schedulable=true` is a `codex-pool sync --confirm` process-control baseline for UniDesk-managed accounts that are not under sentinel quarantine, not a YAML field. - `codex-pool sync --confirm` preserves UniDesk-managed accounts that are absent from YAML by default; explicit upstream retirement requires `codex-pool sync --confirm --prune-removed`. This keeps account deletion out of the normal availability-recovery path and prevents temporary YAML edits from becoming destructive runtime changes. - `profiles.entries` selects local Codex profile files from `~/.codex/` and maps them to Sub2API account names. - The unsuffixed master `~/.codex/config.toml` and `~/.codex/auth.json` are reserved for the unified Sub2API consumer. 
`config.toml` must keep `base_url = "https://sub2api.74-48-78-17.nip.io/"`, and `auth.json` must contain the unified pool API key from `pool.apiKeySecretName` / `pool.apiKeySecretKey`. Do not replace these two files with direct upstream account credentials. @@ -36,9 +36,8 @@ - `profiles.entries[].capacity` optionally overrides `pool.defaultAccountCapacity` for one account. Capacity is a YAML-controlled routing input; concrete current values belong only in `config/platform-infra/sub2api-codex-pool.yaml` and runtime validation output, not in long-term reference prose. Code constants, Secrets, ad-hoc runtime patches, or stale tests must not override YAML source of truth. - `profiles.entries[].loadFactor` optionally overrides `pool.defaultAccountLoadFactor` for one account and is rendered to Sub2API `load_factor`. Treat it as routing policy: values belong in YAML and `codex-pool validate` output, not code constants, Secrets, or ad-hoc runtime patches. - Do not change account membership, priority, capacity, load factor, WebSocket mode, or other routing policy from inference alone. Unless the user explicitly asks for a configuration change, first preserve the current YAML, collect provenance and runtime evidence, and write the finding to the relevant issue or runbook before proposing a change. -- `profiles.entries[].tempUnschedulable` may override the pool default for one account. The CLI renders it into Sub2API credentials as `temp_unschedulable_enabled` and `temp_unschedulable_rules`; rules match HTTP status plus response-body keywords and place only that account into a temporary unschedulable cooldown. -- Codex account-state or quota prompts that stop a task and ask the operator to switch accounts belong in `pool.defaultTempUnschedulable`, not in account membership, priority, capacity, load factor, WebSocket mode, or `pool_mode`. 
Keep stable body phrases such as weekly-limit and `/status` prompts in both the 403 account-state rule and the 429 quota/rate-limit rule, then run `codex-pool sync --confirm` and `codex-pool validate`. The validation evidence must include runtime temporary-unschedulable alignment for each managed account, not only successful group-level `/v1/models` or `/v1/responses` smoke output. -- Upstream model-routing and Responses compatibility failures that surface as 400 responses, such as `invalid_encrypted_content`, `bad_response_status_code`, `invalid_request_error` with a stable unsupported-model message, unsupported-model wrappers, or stable "available models" messages, belong in `pool.defaultTempUnschedulable` when another account can handle the same Codex request. Upstream model-routing failures that surface as 503 responses, such as `model_not_found` or "no available channel for model" wrappers, also belong there. Gateway and timeout failures that surface as 502, 504, or 524 responses, including `Gateway Timeout`, `Unknown error`, `Upstream request failed`, `context deadline exceeded`, `context canceled`, or recovered upstream-error wrappers, belong in the same YAML policy. This is especially important for compact and long `/responses` requests, where an upstream Cloudflare 524 or account-specific compatibility failure may eventually reach Codex as a 502/504 unknown-error wrapper after failover or client cancellation. They are not membership, priority, capacity, load factor, WebSocket mode, or User-Agent decisions by themselves. After adding stable body phrases, run `codex-pool sync --confirm` and `codex-pool validate`, and verify the affected account's runtime status-specific rule includes the new keywords. +- `profiles.entries[].tempUnschedulable` may override the pool default for one account. 
When enabled, the CLI renders it into Sub2API credentials as `temp_unschedulable_enabled` and `temp_unschedulable_rules`; when disabled, runtime credentials omit both fields and the YAML rule list remains only source-side configuration. +- Codex account-state, quota prompts, model-routing failures, gateway wrappers, and timeout-like upstream errors are handled by the external marker-only sentinel unless the Sub2API built-in temporary-unschedulable switch is explicitly re-enabled. Do not change membership, priority, capacity, load factor, WebSocket mode, or `pool_mode` merely to work around those errors. - `profiles.entries[].openaiResponsesWebSocketsV2Mode` is the account-level Responses WebSocket v2 switch for OpenAI-compatible upstreams that require WebSocket transport. Allowed values are `off`, `ctx_pool`, and `passthrough`; omit the field unless that upstream needs it. - `profiles.entries[].upstreamUserAgent` is an optional account-level upstream request User-Agent override. Use it only for upstreams that require a Codex CLI compatible User-Agent; keep the value YAML-controlled and newline-free. - `publicExposure` controls the optional FRP bridge from master server to the G14 ClusterIP service. @@ -52,11 +51,9 @@ When Codex startup repeatedly reports WebSocket reconnects or HTTPS fallback, pr Do not encode current availability assumptions in long-term reference prose. If an account needs a higher concurrency or load factor than the pool default, make that a deliberate YAML override and verify it with `codex-pool validate`; the reference document should describe the rule, not repeat the current numeric value. -Do not enable Sub2API `pool_mode` for UniDesk-managed Codex accounts. `pool_mode` retries the same selected account path, while UniDesk's desired failover behavior is to mark the failing account temporarily unschedulable and let Sub2API choose another account from the group. 
`codex-pool validate` reports each managed account's temporary-unschedulable runtime alignment and should be used after `codex-pool sync --confirm`. Generic 502/503/504 bodies such as `Recovered upstream error 502`, `Bad Gateway`, `Gateway Timeout`, Codex-facing `Upstream request failed`, `Unknown error`, context-deadline/canceled wrappers, stable 400 `invalid_encrypted_content` / unsupported-model wrappers, and stable `model_not_found` / "no available channel for model" wrappers must stay in the YAML cooldown policy so an intermittently bad account is cooled down instead of repeatedly adding latency at the next compact or Responses request. Exact current cooldown values and any business-policy grouping belong only in YAML and runtime validation output; do not repeat those values here, encode them as code/schema hard limits, or require contract tests for value changes. +Do not enable Sub2API `pool_mode` for UniDesk-managed Codex accounts. `pool_mode` retries the same selected account path and does not replace sentinel quarantine. The current failover and recovery model is: the external marker-only sentinel freezes or restores account schedulability, while Sub2API routes among currently schedulable accounts in the group. -Sub2API temporary-unschedulable rules require both an HTTP status match and a response-body keyword match in the upstream failure/error path. Do not treat them as a general successful-response content filter, and do not add a YAML 200 cooldown rule, patch Sub2API in place, fork Sub2API behavior in UniDesk, or bypass `codex-pool sync` to make the native pool pretend that HTTP 200 content cooling exists. HTTP 200 private content, maintenance text, quota prompts, ads, and similar semantic failures are handled by the external account-level sentinel when that sentinel is enabled, not by Sub2API native `temp_unschedulable_rules`. 
- -If automatic cooling or same-request failover does not happen for an error that the YAML policy declares, treat that as a Sub2API capability or integration defect. The closeout must show the failing account being marked temporarily unschedulable and the next request or same request selecting another schedulable account; a manually disabled, deleted, or pruned account is not valid evidence for this class of fix. +Sub2API temporary-unschedulable rules require both an HTTP status match and a response-body keyword match in the upstream failure/error path when the built-in switch is enabled. UniDesk currently keeps that switch disabled and does not use built-in rules as a successful-response content filter. HTTP 200 private content, maintenance text, quota prompts, ads, and similar semantic failures are handled by the external account-level sentinel. ## Sub2API Account Test Semantics @@ -76,7 +73,7 @@ The UniDesk account-level sentinel uses marker-only health semantics. A probe is The sentinel must not maintain separate classifiers for "private content", "maintenance", "quota", "ads", or provider-specific body phrases as health gates. The only recovery condition is a later recovery probe that matches the marker. Freeze TTL expiry only schedules the next recovery probe; it does not restore an account by itself. Repeated non-marker results use a short exponential freeze backoff because failed marker probes produce little or no useful output token usage; repeated marker-matching results use the configured success cadence backoff. This contract applies equally to OpenAI Responses `gpt-5.5` direct account probes and manual `codex-pool sentinel-probe --account ... --confirm` measurements. -`profiles.entries[].trustUpstream` is the durable account-level trust marker for sentinel success cadence, and the absence of the field means untrusted. 
Trusted and untrusted accounts use separate YAML cadence maximums after marker-matching probes; the values belong only in `config/platform-infra/sub2api-codex-pool.yaml`. This field must not change Sub2API scheduler priority, capacity, load factor, membership, native temporary-unschedulable rules, or the marker-only health contract. Its purpose is to keep intermittently unreliable 200-success providers under more frequent direct probes without adding provider-specific content classifiers. +`profiles.entries[].trustUpstream` is the durable account-level trust marker for sentinel success cadence, and the absence of the field means untrusted. Trusted and untrusted accounts use separate YAML cadence maximums after marker-matching probes; the values belong only in `config/platform-infra/sub2api-codex-pool.yaml`. This field must not change Sub2API scheduler priority, capacity, load factor, membership, built-in temporary-unschedulable settings, or the marker-only health contract. Its purpose is to keep intermittently unreliable 200-success providers under more frequent direct probes without adding provider-specific content classifiers. When `codex-pool sync --confirm` creates a YAML-managed account or changes direct-probe-relevant account inputs such as the profile mapping, upstream base URL, API key fingerprint, upstream User-Agent, Responses WebSocket mode, or `trustUpstream`, only that account must be default-frozen before it can enter the scheduler. Sync first records a pending sentinel quality gate from the pre-mutation runtime state, then updates the account, then schedules the account probe immediately. This ordering prevents a new or changed account from being written to Sub2API without a matching sentinel quarantine record if sync fails midway. Passing the marker clears the quality gate and restores schedulability; any non-marker result continues the failure freeze backoff. 
Unchanged accounts must not have their existing success or failure backoff reset by unrelated YAML syncs. @@ -106,7 +103,7 @@ When `publicExposure.enabled` is true, the same FRP TCP bridge exposes both Open The public management UI is an operations endpoint. Keep Sub2API itself in `platform-infra`, keep the Kubernetes Service as ClusterIP, and treat FRP as the only public bridge unless a later decision explicitly changes the exposure model. -The public bridge has two separate failure classes. Sub2API upstream/account failures are visible in Sub2API logs and should be handled by temporary-unschedulable rules, sentinel quarantine, or Sub2API failover. Edge failures between master Caddy and the FRP remotePort are not visible to Sub2API; symptoms include Caddy `connect: connection refused`, EOF, connection reset, or short 502 bursts while frps closes and reopens the configured remotePort. Those failures must be diagnosed from Caddy and frps/frpc evidence and mitigated through YAML-controlled Caddy edge retry or FRP stability fixes, not by disabling accounts or changing pool membership. +The public bridge has two separate failure classes. Sub2API upstream/account failures are visible in Sub2API logs and currently belong to sentinel quarantine plus normal Sub2API routing among schedulable accounts. Edge failures between master Caddy and the FRP remotePort are not visible to Sub2API; symptoms include Caddy `connect: connection refused`, EOF, connection reset, or short 502 bursts while frps closes and reopens the configured remotePort. Those failures must be diagnosed from Caddy and frps/frpc evidence and mitigated through YAML-controlled Caddy edge retry or FRP stability fixes, not by disabling accounts or changing pool membership. 
## Availability And Probes diff --git a/scripts/src/platform-infra-sub2api-codex.ts b/scripts/src/platform-infra-sub2api-codex.ts index 031d39ae..d0531433 100644 --- a/scripts/src/platform-infra-sub2api-codex.ts +++ b/scripts/src/platform-infra-sub2api-codex.ts @@ -1041,75 +1041,8 @@ function defaultCodexPoolConfig(): CodexPoolConfig { export function defaultCodexTempUnschedulablePolicy(): CodexTempUnschedulablePolicy { return { - enabled: true, - rules: [ - { - statusCode: 400, - keywords: ["invalid_encrypted_content", "encrypted content", "could not be verified", "could not be decrypted", "bad_response_status_code", "model_not_found", "no available channel for model", "unsupported", "not supported", "not support", "暂不支持", "可用模型"], - durationMinutes: 120, - description: "Stable upstream 400 model-routing or Responses encrypted-content compatibility failures should use another account.", - }, - { - statusCode: 401, - keywords: ["unauthorized", "invalid api key", "invalid_api_key", "authentication", "recovered upstream error"], - durationMinutes: 120, - description: "Credential/auth failures should use the longest cooldown.", - }, - { - statusCode: 403, - keywords: ["forbidden", "access denied", "quota", "billing", "capacity", "weekly limit", "less than 10% of your weekly limit left", "run /status for a breakdown", "recovered upstream error"], - durationMinutes: 120, - description: "Permission, quota, or account-state failures should use the longest cooldown.", - }, - { - statusCode: 429, - keywords: ["capacity", "rate limit", "rate_limit", "quota", "weekly limit", "less than 10% of your weekly limit left", "run /status for a breakdown", "too many requests", "overloaded", "resource_exhausted", "recovered upstream error"], - durationMinutes: 10, - description: "Capacity and rate-limit responses are often temporary; start with a ten-minute cooldown and use another account.", - }, - { - statusCode: 500, - keywords: ["capacity", "overloaded", "temporarily unavailable", 
"temporary", "upstream", "recovered upstream error"], - durationMinutes: 10, - description: "Transient upstream server failures should start with a ten-minute cooldown and prefer another account.", - }, - { - statusCode: 502, - keywords: ["capacity", "overloaded", "temporarily unavailable", "temporary", "upstream", "bad gateway", "upstream request failed", "unknown error", "context deadline exceeded", "context canceled", "websocket dial", "handshake response", "recovered upstream error"], - durationMinutes: 30, - description: "Gateway upstream failures, including recovered upstream error wrappers, should cool down longer.", - }, - { - statusCode: 413, - keywords: ["openai_error", "payload too large", "request too large", "context length", "context window", "maximum context"], - durationMinutes: 30, - description: "Large-context upstream failures should cool down the selected account so a larger-context channel can handle the request.", - }, - { - statusCode: 503, - keywords: ["capacity", "overloaded", "temporarily unavailable", "temporary", "upstream", "recovered upstream error", "model_not_found", "no available channel for model"], - durationMinutes: 30, - description: "Service unavailable and upstream model-routing failures should cool down longer than one-off transient failures.", - }, - { - statusCode: 504, - keywords: ["gateway timeout", "timeout", "upstream", "upstream request failed", "unknown error", "context deadline exceeded", "context canceled", "recovered upstream error"], - durationMinutes: 30, - description: "Gateway timeout responses should cool down the selected account so another account can handle the next request.", - }, - { - statusCode: 524, - keywords: ["timeout", "a timeout occurred", "cloudflare", "gateway timeout", "upstream", "upstream request failed", "unknown error", "context deadline exceeded", "context canceled", "recovered upstream error"], - durationMinutes: 30, - description: "Cloudflare 524 timeout responses should cool down the 
selected account so another account can handle the next request.", - }, - { - statusCode: 529, - keywords: ["capacity", "overloaded", "temporarily unavailable", "temporary", "recovered upstream error"], - durationMinutes: 30, - description: "Provider overloaded responses should cool down longer than generic transient failures and use another account.", - }, - ], + enabled: false, + rules: [], }; } @@ -1542,8 +1475,8 @@ function compactProfile(profile: CodexProfile): Record { trustUpstream: profile.trustUpstream, capacity: profile.capacity, loadFactor: profile.loadFactor, - tempUnschedulableEnabled: profile.tempUnschedulable.enabled && profile.tempUnschedulable.rules.length > 0, - tempUnschedulableRuleCount: profile.tempUnschedulable.enabled ? profile.tempUnschedulable.rules.length : 0, + tempUnschedulableEnabled: profile.tempUnschedulable.enabled, + tempUnschedulableRuleCount: profile.tempUnschedulable.rules.length, apiKeyPresent: profile.apiKey !== null && profile.apiKey.length > 0, ok: profile.ok, error: profile.error, @@ -1552,24 +1485,23 @@ function compactProfile(profile: CodexProfile): Record { } export function renderSub2ApiTempUnschedulableCredentials(policy: CodexTempUnschedulablePolicy): Record { + if (!policy.enabled) return {}; return { - temp_unschedulable_enabled: policy.enabled && policy.rules.length > 0, - temp_unschedulable_rules: policy.enabled - ? policy.rules.map((rule) => ({ - error_code: rule.statusCode, - keywords: [...rule.keywords], - duration_minutes: rule.durationMinutes, - description: rule.description ?? "", - })) - : [], + temp_unschedulable_enabled: policy.enabled, + temp_unschedulable_rules: policy.rules.map((rule) => ({ + error_code: rule.statusCode, + keywords: [...rule.keywords], + duration_minutes: rule.durationMinutes, + description: rule.description ?? 
"", + })), }; } function tempUnschedulableSummary(policy: CodexTempUnschedulablePolicy): Record { return { - enabled: policy.enabled && policy.rules.length > 0, - ruleCount: policy.enabled ? policy.rules.length : 0, - statusCodes: policy.enabled ? policy.rules.map((rule) => rule.statusCode) : [], + enabled: policy.enabled, + ruleCount: policy.rules.length, + statusCodes: policy.rules.map((rule) => rule.statusCode), }; } @@ -3711,8 +3643,9 @@ def account_payload(profile, group_id): if upstream_user_agent: credentials["user_agent"] = upstream_user_agent temp_unschedulable = temp_unschedulable_credentials(profile) - credentials["temp_unschedulable_enabled"] = temp_unschedulable["enabled"] - credentials["temp_unschedulable_rules"] = temp_unschedulable["rules"] + if temp_unschedulable["enabled"]: + credentials["temp_unschedulable_enabled"] = True + credentials["temp_unschedulable_rules"] = temp_unschedulable["rules"] return { "name": profile["accountName"], "notes": f"UniDesk-managed Codex profile {profile['profile']} from {profile['configFile']} and {profile['authFile']}; secret source={profile['apiKeySource']}; fingerprint={profile['apiKeyFingerprint']}.", @@ -4803,8 +4736,8 @@ def normalize_temp_unschedulable_credentials(credentials): "description": description, }) return { - "enabled": enabled and len(rules) > 0, - "rules": rules if enabled else [], + "enabled": enabled, + "rules": rules, } def summarize_temp_unschedulable_rules(rules): @@ -4819,6 +4752,8 @@ def summarize_temp_unschedulable_rules(rules): def success_body_reclassification_requirement(): for name in sorted(EXPECTED_ACCOUNT_TEMP_UNSCHEDULABLE): expected = normalize_temp_unschedulable_credentials(EXPECTED_ACCOUNT_TEMP_UNSCHEDULABLE[name]) + if expected["enabled"] is not True: + continue for rule in expected["rules"]: error_code = rule.get("error_code") keywords = rule.get("keywords") or [] @@ -4844,6 +4779,8 @@ def model_routing_400_failover_requirement(): preferred = ["暂不支持", "可用模型", "unsupported 
model", "model not supported", "does not support", "not supported", "model_not_found", "no available channel for model"] for name in sorted(EXPECTED_ACCOUNT_TEMP_UNSCHEDULABLE): expected = normalize_temp_unschedulable_credentials(EXPECTED_ACCOUNT_TEMP_UNSCHEDULABLE[name]) + if expected["enabled"] is not True: + continue for rule in expected["rules"]: error_code = rule.get("error_code") keywords = rule.get("keywords") or []