fix: extend Sub2API compact proxy timeout

This commit is contained in:
Codex
2026-06-10 07:48:31 +00:00
parent 8735b4103c
commit 38283fb413
7 changed files with 48 additions and 11 deletions
+3 -1
View File
@@ -105,6 +105,7 @@ bun scripts/cli.ts platform-infra sub2api codex-pool expose --confirm
- 由 `publicExposure` YAML 控制。默认公共端是 `publicBaseUrl`,master 本地消费端是 `masterBaseUrl`。
- `expose --confirm` 只为 YAML 指定的 `remotePort` 补 master `frps` allow port,并在 G14 创建/更新 `sub2api-frpc`
- master Caddy site 也由 `publicExposure.masterCaddy` 渲染;`responseHeaderTimeoutSeconds` 必须足够覆盖 Codex `/responses/compact` 长请求,避免 Caddy 先返回 504 而 Sub2API 后台实际稍后成功。
- 同一个 FRP TCP 入口同时暴露 OpenAI-compatible API 和 Sub2API 管理 UI `/login`。不要另开第二个管理端口,除非 YAML 明确声明新的暴露决策。
- Sub2API Kubernetes Service 继续保持 ClusterIP。
@@ -140,7 +141,8 @@ bun scripts/cli.ts platform-infra sub2api codex-pool configure-local --confirm
- profile invalid:先修 `~/.codex/config.toml.<profile>` 的 `base_url`、`wire_api`、`model`,以及 `auth.json.<profile>` 的 API key;不要在 YAML 中写密钥。
- pool key 401:跑 `codex-pool sync --confirm` 重建 Sub2API key 与 k3s Secret 绑定,再跑 `codex-pool validate`
- FRP 不通:先看 `codex-pool expose --confirm` 输出的 `masterFrps`、`sub2api-frpc` 和 public 401 probe;需要低层证据时只用 `trans G14:k3s` 做 bounded 查询。
- FRP 不通:先看 `codex-pool expose --confirm` 输出的 `masterFrps`、`masterCaddy`、`sub2api-frpc` 和 public 401 probe;需要低层证据时只用 `trans G14:k3s` 做 bounded 查询。
- `/responses/compact` 约 30 秒后返回 504 但 Sub2API 日志稍后记录 `codex.remote_compact.succeeded` 时,优先检查 master Caddy `response_header_timeout` 是否由 YAML `publicExposure.masterCaddy.responseHeaderTimeoutSeconds` 渲染,修正后跑 `codex-pool expose --confirm`;这类边缘代理超时不会触发 Sub2API 账号级临时下线。
- default profile 递归:检查 YAML default entry 是否使用 `*.pre-sub2api` 备份文件;必要时恢复备份后重新 `configure-local --confirm`
- 上游需要 WebSocket v2:先做 direct Codex WSv2 probe;通过后才给该 profile 配 `openaiResponsesWebSocketsV2Mode: ctx_pool|passthrough` 并跑 `sync --confirm`;把它当 capability candidate,容量仍以 YAML 中的 `capacity` 或默认值为准。
- Codex 启动 WebSocket 回退:用原入口 Codex smoke 复现,再用 bounded Sub2API 日志确认 account;对 WS handshake 4xx/5xx、`openai.websocket_account_select_failed` 或 close-before-`response.completed` 的账号关闭 YAML WSv2 能力后同步。若没有剩余 WSv2-capable account,把 `localCodex.supportsWebSockets` 和 `localCodex.responsesWebSocketsV2` 一起关掉,不把临时可用性推断写成调度配置。
@@ -116,6 +116,7 @@ publicExposure:
configPath: /etc/caddy/Caddyfile
serviceName: caddy
upstreamBaseUrl: http://127.0.0.1:21880
responseHeaderTimeoutSeconds: 180
localCodex:
backupSuffix: pre-sub2api
providerName: OpenAI
+1
View File
@@ -39,6 +39,7 @@
- `profiles.entries[].openaiResponsesWebSocketsV2Mode` is the account-level Responses WebSocket v2 switch for OpenAI-compatible upstreams that require WebSocket transport. Allowed values are `off`, `ctx_pool`, and `passthrough`; omit the field unless that upstream needs it.
- `profiles.entries[].upstreamUserAgent` is an optional account-level upstream request User-Agent override. Use it only for upstreams that require a Codex CLI compatible User-Agent; keep the value YAML-controlled and newline-free.
- `publicExposure` controls the optional FRP bridge from master server to the G14 ClusterIP service.
- `publicExposure.masterCaddy.responseHeaderTimeoutSeconds` controls the master Caddy `response_header_timeout` for the public Sub2API site. It must be long enough for Codex `/responses/compact` requests; otherwise Caddy can return a client-visible 504 before Sub2API finishes the upstream compact request, and that edge timeout is not an account-level upstream failure that Sub2API can use for temporary-unschedulable failover.
- `localCodex` controls how the master server's current `~/.codex` consumer files are backed up and rewritten. Keep `supportsWebSockets` and `responsesWebSocketsV2` in the same state, and enable them only when at least one YAML-managed account has a current direct Codex WSv2 smoke that passes. If no upstream profile can sustain Responses WSv2, the honest long-term state is `false/false` so Codex uses HTTP Responses directly instead of repeatedly reconnecting before `response.completed`. `localCodex.responsesSmokeModel` is the YAML-declared model used by `codex-pool validate` for the lightweight `POST /v1/responses` smoke.
Enable account-level WebSocket v2 only for upstream profiles that have passed a direct Codex WSv2 probe. Treat this as a YAML-declared capability set, not a hard scheduling pin to one profile; if `localCodex` enables WebSocket transport, `codex-pool validate` must show at least one current `webSocketsV2.schedulableEnabled` account, and runtime smoke remains the availability proof. The same validation reports each managed account's runtime WebSocket v2 mode and whether it matches YAML, so stale `ctx_pool` / `passthrough` settings cannot silently keep routing Codex WS sessions to an upstream that closes with `no available account`, WS handshake 5xx/4xx, or before `response.completed`.
@@ -1,4 +1,4 @@
import { renderCodexLocalConsumerToml } from "./src/platform-infra-sub2api-codex";
import { renderCaddySiteBlock, renderCodexLocalConsumerToml } from "./src/platform-infra-sub2api-codex";
function assertCondition(condition: unknown, message: string, detail: unknown = {}): void {
if (!condition) throw new Error(`${message}: ${JSON.stringify(detail)}`);
@@ -58,11 +58,19 @@ const disabled = renderCodexLocalConsumerToml(existing, {
assertCondition(disabled.includes("supports_websockets = false"), "disabled localCodex policy must render provider WebSocket transport off", disabled);
assertCondition(disabled.includes("responses_websockets_v2 = false"), "disabled localCodex policy must render Responses WebSocket v2 off", disabled);
// Render a Caddy site block for a sample domain and local upstream with an explicit
// 180-second response-header timeout, then check the rendered text: it must name the
// domain, proxy to host:port (without the URL scheme), carry the 180s timeout, and
// must not contain the previous hard-coded 30s value.
const caddyBlock = renderCaddySiteBlock("sub2api.example.test", "http://127.0.0.1:21880", 180);
assertCondition(caddyBlock.includes("sub2api.example.test {"), "Caddy site block must use the configured domain", caddyBlock);
assertCondition(caddyBlock.includes("reverse_proxy 127.0.0.1:21880"), "Caddy site block must use the configured local upstream", caddyBlock);
assertCondition(caddyBlock.includes("response_header_timeout 180s"), "Caddy response header timeout must allow long Codex compact requests", caddyBlock);
assertCondition(!caddyBlock.includes("response_header_timeout 30s"), "Caddy site block must not retain the old 30s compact timeout", caddyBlock);
console.log(JSON.stringify({
ok: true,
checks: [
"existing Codex TOML is upgraded to the Sub2API WSv2 consumer settings",
"fresh Codex TOML creates provider and feature sections with WSv2 enabled",
"disabled localCodex WebSocket policy renders both consumer flags off",
"Caddy site block uses the YAML-controlled long response-header timeout",
],
}));
@@ -18,6 +18,7 @@ const parsed = Bun.YAML.parse(readFileSync(configPath, "utf8")) as {
};
};
profiles?: { entries?: Array<{ profile?: string; accountName?: string; capacity?: number; loadFactor?: number; openaiResponsesWebSocketsV2Mode?: string | null }> };
publicExposure?: { masterCaddy?: { responseHeaderTimeoutSeconds?: number } };
localCodex?: { supportsWebSockets?: boolean; responsesWebSocketsV2?: boolean; responsesSmokeModel?: string };
};
@@ -39,6 +40,11 @@ assertCondition(entries.every((entry) => typeof entry.profile === "string" && en
assertCondition(entries.every((entry) => typeof entry.accountName === "string" && entry.accountName.length > 0), "profile entries must declare account names", entries);
assertCondition(entries.every((entry) => entry.capacity === undefined || (Number.isInteger(entry.capacity) && entry.capacity > 0)), "profile capacity overrides must be positive integers when declared", entries);
assertCondition(entries.every((entry) => entry.loadFactor === undefined || (Number.isInteger(entry.loadFactor) && entry.loadFactor > 0)), "profile load factor overrides must be positive integers when declared", entries);
// The checked-in YAML must declare publicExposure.masterCaddy.responseHeaderTimeoutSeconds
// explicitly as an integer of at least 180. A missing value fails here because
// Number.isInteger(undefined) is false; the `?? 0` only keeps the comparison well-typed.
assertCondition(
Number.isInteger(parsed.publicExposure?.masterCaddy?.responseHeaderTimeoutSeconds) && (parsed.publicExposure?.masterCaddy?.responseHeaderTimeoutSeconds ?? 0) >= 180,
"Sub2API public Caddy response-header timeout must allow long Codex compact requests",
parsed.publicExposure?.masterCaddy,
);
assertCondition(
entries.every((entry) => entry.openaiResponsesWebSocketsV2Mode === undefined || entry.openaiResponsesWebSocketsV2Mode === null || allowedWebSocketModes.has(entry.openaiResponsesWebSocketsV2Mode)),
"profile WebSocket mode overrides must use supported values when declared",
@@ -59,8 +65,8 @@ if (parsed.pool?.defaultTempUnschedulable?.enabled === true) {
const gateway502Rule = rules.find((rule) => rule.statusCode === 502);
const gateway502Keywords = new Set((gateway502Rule?.keywords ?? []).map((keyword) => keyword.toLowerCase()));
assertCondition(gateway502Keywords.has("recovered upstream error"), "502 temporary-unschedulable rule must catch recovered upstream error wrappers", gateway502Rule);
for (const keyword of ["unknown error", "upstream request failed", "context canceled"]) {
assertCondition(gateway502Keywords.has(keyword), "502 temporary-unschedulable rule must catch compact gateway wrappers", { keyword, gateway502Rule });
for (const keyword of ["unknown error", "upstream request failed", "context deadline exceeded", "context canceled"]) {
assertCondition(gateway502Keywords.has(keyword), "502 temporary-unschedulable rule must catch compact gateway timeout wrappers", { keyword, gateway502Rule });
}
const largeContext413Rule = rules.find((rule) => rule.statusCode === 413);
const largeContext413Keywords = new Set((largeContext413Rule?.keywords ?? []).map((keyword) => keyword.toLowerCase()));
@@ -74,7 +80,7 @@ if (parsed.pool?.defaultTempUnschedulable?.enabled === true) {
}
const cloudflare524Rule = rules.find((rule) => rule.statusCode === 524);
const cloudflare524Keywords = new Set((cloudflare524Rule?.keywords ?? []).map((keyword) => keyword.toLowerCase()));
for (const keyword of ["timeout", "a timeout occurred", "cloudflare", "unknown error", "upstream request failed", "context canceled"]) {
for (const keyword of ["timeout", "a timeout occurred", "cloudflare", "upstream request failed", "unknown error", "context canceled", "recovered upstream error"]) {
assertCondition(cloudflare524Keywords.has(keyword), "524 temporary-unschedulable rule must catch Cloudflare timeout wrappers", { keyword, cloudflare524Rule });
}
const accountState403Rule = rules.find((rule) => rule.statusCode === 403);
@@ -99,6 +105,7 @@ console.log(JSON.stringify({
"routing config is schema-valid without profile-specific test gates",
"pool owner concurrency covers the YAML account capacity set",
"profile load factor overrides are YAML-controlled positive integers",
"public Caddy response-header timeout is long enough for Codex compact",
"optional WebSocket mode overrides use supported values",
"local Codex WebSocket transport is consistent with YAML-declared WSv2-capable accounts",
"temporary unschedulable rules are structurally valid when enabled",
@@ -44,13 +44,13 @@ for (const keyword of ["model_not_found", "no available channel for model"]) {
for (const keyword of ["openai_error", "context length", "maximum context"]) {
assertCondition(largeContext413Rule?.keywords?.includes(keyword), "413 rendered rule must catch large-context upstream failures", { keyword, largeContext413Rule });
}
for (const keyword of ["unknown error", "upstream request failed", "context canceled"]) {
assertCondition(gateway502Rule?.keywords?.includes(keyword), "502 rendered rule must catch compact gateway wrappers", { keyword, gateway502Rule });
for (const keyword of ["unknown error", "upstream request failed", "context deadline exceeded", "context canceled"]) {
assertCondition(gateway502Rule?.keywords?.includes(keyword), "502 rendered rule must catch compact gateway timeout wrappers", { keyword, gateway502Rule });
}
for (const keyword of ["gateway timeout", "unknown error", "context deadline exceeded"]) {
assertCondition(gatewayTimeout504Rule?.keywords?.includes(keyword), "504 rendered rule must preserve gateway-timeout cooldown keyword", { keyword, gatewayTimeout504Rule });
}
for (const keyword of ["timeout", "a timeout occurred", "cloudflare", "unknown error", "upstream request failed", "context canceled"]) {
for (const keyword of ["timeout", "a timeout occurred", "cloudflare", "upstream request failed", "unknown error", "context canceled", "recovered upstream error"]) {
assertCondition(cloudflare524Rule?.keywords?.includes(keyword), "524 rendered rule must catch Cloudflare timeout wrappers", { keyword, cloudflare524Rule });
}
+21 -3
View File
@@ -124,6 +124,7 @@ interface CodexPoolPublicExposureConfig {
configPath: string;
serviceName: string;
upstreamBaseUrl: string;
responseHeaderTimeoutSeconds: number;
};
}
@@ -651,6 +652,7 @@ function defaultCodexPoolConfig(): CodexPoolConfig {
configPath: "/etc/caddy/Caddyfile",
serviceName: "caddy",
upstreamBaseUrl: "http://127.0.0.1:21880",
responseHeaderTimeoutSeconds: 180,
},
},
localCodex: {
@@ -822,6 +824,15 @@ function readAccountLoadFactor(value: unknown, key: string): number {
return loadFactor;
}
/**
 * Reads a Caddy timeout, expressed in whole seconds, from a raw config value.
 *
 * @param value - Raw value taken from the parsed YAML; may be absent.
 * @param key - Dotted config key, used only to build the error message.
 * @param fallback - Value returned as-is (not range-checked) when `value` is
 *   `undefined` or `null`.
 * @returns The validated timeout in seconds, or `fallback` when no value was given.
 * @throws Error when the value does not resolve to an integer in the inclusive
 *   range 30..900.
 */
function readCaddyTimeoutSeconds(value: unknown, key: string, fallback: number): number {
  // `== null` matches both undefined and null: an omitted key keeps the default.
  if (value == null) return fallback;

  // numberValue is defined elsewhere in this file; a null result is treated as "not a number".
  const candidate = numberValue(value);
  if (candidate !== null && Number.isInteger(candidate) && candidate >= 30 && candidate <= 900) {
    return candidate;
  }

  throw new Error(`${codexPoolConfigPath}.${key} must be an integer from 30 to 900`);
}
function readTempUnschedulablePolicy(value: unknown, key: string, fallback: CodexTempUnschedulablePolicy): CodexTempUnschedulablePolicy {
if (value === undefined || value === null) return cloneTempUnschedulablePolicy(fallback);
if (!isRecord(value)) throw new Error(`${codexPoolConfigPath}.${key} must be a YAML object`);
@@ -916,6 +927,11 @@ function readPublicExposureConfig(value: unknown, defaults: CodexPoolPublicExpos
configPath: stringValue(masterCaddyValue.configPath) ?? defaults.masterCaddy.configPath,
serviceName: stringValue(masterCaddyValue.serviceName) ?? defaults.masterCaddy.serviceName,
upstreamBaseUrl: normalizeBaseUrl(stringValue(masterCaddyValue.upstreamBaseUrl)) ?? defaults.masterCaddy.upstreamBaseUrl,
responseHeaderTimeoutSeconds: readCaddyTimeoutSeconds(
masterCaddyValue.responseHeaderTimeoutSeconds,
"publicExposure.masterCaddy.responseHeaderTimeoutSeconds",
defaults.masterCaddy.responseHeaderTimeoutSeconds,
),
},
};
validateKubernetesName(config.configMapName, "publicExposure.configMapName", true);
@@ -1092,6 +1108,7 @@ function publicExposureSummary(pool: CodexPoolConfig): Record<string, unknown> {
configPath: pool.publicExposure.masterCaddy.configPath,
serviceName: pool.publicExposure.masterCaddy.serviceName,
upstreamBaseUrl: pool.publicExposure.masterCaddy.upstreamBaseUrl,
responseHeaderTimeoutSeconds: pool.publicExposure.masterCaddy.responseHeaderTimeoutSeconds,
},
upstream: {
localIP: pool.publicExposure.localIP,
@@ -1156,7 +1173,7 @@ async function applyMasterCaddySite(pool: CodexPoolConfig): Promise<Record<strin
const path = caddy.configPath;
if (!existsSync(path)) return { ok: false, error: "master-caddy-config-missing", path, valuesPrinted: false };
const before = readFileSync(path, "utf8");
const desiredBlock = renderCaddySiteBlock(caddy.domain, caddy.upstreamBaseUrl);
const desiredBlock = renderCaddySiteBlock(caddy.domain, caddy.upstreamBaseUrl, caddy.responseHeaderTimeoutSeconds);
const existing = caddySiteBlock(before, caddy.domain);
const alreadyConfigured = existing === desiredBlock;
let backupPath: string | null = null;
@@ -1187,6 +1204,7 @@ async function applyMasterCaddySite(pool: CodexPoolConfig): Promise<Record<strin
domain: caddy.domain,
upstreamBaseUrl: caddy.upstreamBaseUrl,
serviceName: caddy.serviceName,
responseHeaderTimeoutSeconds: caddy.responseHeaderTimeoutSeconds,
validate: {
exitCode: validate.exitCode,
stdoutTail: Buffer.from(validate.stdout).toString("utf8").slice(-1000),
@@ -1206,7 +1224,7 @@ async function applyMasterCaddySite(pool: CodexPoolConfig): Promise<Record<strin
};
}
function renderCaddySiteBlock(domain: string, upstreamBaseUrl: string): string {
export function renderCaddySiteBlock(domain: string, upstreamBaseUrl: string, responseHeaderTimeoutSeconds = 180): string {
const upstream = new URL(upstreamBaseUrl);
const upstreamHost = `${upstream.hostname}${upstream.port ? `:${upstream.port}` : ""}`;
return `${domain} {
@@ -1216,7 +1234,7 @@ function renderCaddySiteBlock(domain: string, upstreamBaseUrl: string): string {
header_up X-Real-IP {remote_host}
transport http {
dial_timeout 5s
response_header_timeout 30s
response_header_timeout ${responseHeaderTimeoutSeconds}s
}
}
}`;