fix: add sub2api edge retry

This commit is contained in:
Codex
2026-06-11 12:01:21 +00:00
parent 5e44545f07
commit ea92eed148
4 changed files with 136 additions and 3 deletions
+3
View File
@@ -144,6 +144,8 @@ bun scripts/cli.ts platform-infra sub2api codex-pool expose --confirm
- 由 `publicExposure` YAML 控制。默认公共端是 `publicBaseUrl`,master 本地消费端是 `masterBaseUrl`。
- `expose --confirm` 只为 YAML 指定的 `remotePort` 补 master `frps` allow port,并在 G14 创建/更新 `sub2api-frpc`
- master Caddy site 也由 `publicExposure.masterCaddy` 渲染;`responseHeaderTimeoutSeconds` 必须足够覆盖 Codex `/responses/compact` 长请求,避免 Caddy 先返回 504 而 Sub2API 后台实际稍后成功。具体数值只改 `config/platform-infra/sub2api-codex-pool.yaml`,修改后跑 `codex-pool expose --confirm`,再核对 Caddyfile 中渲染出的 `response_header_timeout`
- master Caddy 的短窗口边缘重试由 `publicExposure.masterCaddy.edgeRetry` 渲染;用于吸收 FRP remotePort 短暂关闭、`connect: connection refused`、EOF 或 connection reset 这类请求尚未稳定到达 Sub2API 的 502。具体 retry 时长、间隔和 `retryMatch` 范围只写 YAML,修改后跑 `codex-pool expose --confirm`,再核对 Caddyfile 中渲染出的 `lb_try_duration`、`lb_try_interval`、`lb_retry_match`。不要手工 patch `/etc/caddy/Caddyfile`。
- 非幂等 POST 的 round-trip retry 必须收窄到 YAML `retryMatch` 声明的安全路径;普通 `/responses` 上游账号错误仍归 Sub2API failover / temp-unschedulable / sentinel 处理,不用 Caddy 重放整段推理请求来掩盖账号池问题。
- 同一个 FRP TCP 入口同时暴露 OpenAI-compatible API 和 Sub2API 管理 UI `/login`。不要另开第二个管理端口,除非 YAML 明确声明新的暴露决策。
- Sub2API Kubernetes Service 继续保持 ClusterIP。
@@ -184,6 +186,7 @@ bun scripts/cli.ts platform-infra sub2api codex-pool configure-local --confirm
- 运行中过去的验证探针残留:只用 `codex-pool cleanup-probes --confirm` 清理 `unidesk-probe-*` 临时资源;不要把真实 managed account 删除当作探针清理或可用性恢复。
- FRP 不通:先看 `codex-pool expose --confirm` 输出的 `masterFrps`、`masterCaddy`、`sub2api-frpc` 和 public 401 probe;需要低层证据时只用 `trans G14:k3s` 做 bounded 查询。
- `/responses/compact` 在接近 master Caddy `response_header_timeout` 的固定时长后返回 504,或 Sub2API 日志稍后记录 `codex.remote_compact.succeeded` 时,优先检查 master Caddy `response_header_timeout` 是否由 YAML `publicExposure.masterCaddy.responseHeaderTimeoutSeconds` 渲染,修正后跑 `codex-pool expose --confirm`;这类边缘代理超时不会触发 Sub2API 账号级临时下线。reload 前已经在途的 compact 请求仍可能按旧 timeout 结束,判断修复是否生效时只看 reload 之后新发起的请求。
- `/responses/compact` 或普通 public URL 在几秒窗口内出现 502,Caddy 日志显示 `dial tcp 127.0.0.1:<remotePort>: connect: connection refused`、`EOF` 或 `connection reset by peer`,同时 frps 日志出现 `platform-infra-sub2api proxy closing` / `listener is closed` / `new proxy ... success`,说明失败在 master Caddy 与 FRP remotePort 边缘层,Sub2API 和 sentinel 可能完全看不到。先确认 `publicExposure.masterCaddy.edgeRetry` 已按 YAML 渲染并 `codex-pool expose --confirm` 生效;若仍频繁发生,再继续查 G14 `sub2api-frpc` 到 master `frps` 的控制连接稳定性。不要把这类边缘 502 误判成账号池上游错误,也不要通过禁用账号恢复。
- default profile 递归:检查 YAML default entry 是否使用 `*.pre-sub2api` 备份文件;必要时恢复备份后重新 `configure-local --confirm`
- 上游需要 WebSocket v2:先做 direct Codex WSv2 probe;通过后才给该 profile 配 `openaiResponsesWebSocketsV2Mode: ctx_pool|passthrough` 并跑 `sync --confirm`;把它当 capability candidate,容量仍以 YAML 中的 `capacity` 或默认值为准。
- Codex 启动 WebSocket 回退:用原入口 Codex smoke 复现,再用 bounded Sub2API 日志确认 account;对 WS handshake 4xx/5xx、`openai.websocket_account_select_failed` 或 close-before-`response.completed` 的账号关闭 YAML WSv2 能力后同步。若没有剩余 WSv2-capable account,把 `localCodex.supportsWebSockets` 和 `localCodex.responsesWebSocketsV2` 一起关掉,不把临时可用性推断写成调度配置。
@@ -138,6 +138,13 @@ publicExposure:
serviceName: caddy
upstreamBaseUrl: http://127.0.0.1:21880
responseHeaderTimeoutSeconds: 600
edgeRetry:
enabled: true
tryDurationSeconds: 10
tryIntervalMilliseconds: 250
retryMatch:
methods: [POST]
paths: [/responses/compact]
localCodex:
backupSuffix: pre-sub2api
providerName: OpenAI
+4
View File
@@ -43,6 +43,7 @@
- `profiles.entries[].upstreamUserAgent` is an optional account-level upstream request User-Agent override. Use it only for upstreams that require a Codex CLI compatible User-Agent; keep the value YAML-controlled and newline-free.
- `publicExposure` controls the optional FRP bridge from master server to the G14 ClusterIP service.
- `publicExposure.masterCaddy.responseHeaderTimeoutSeconds` controls the master Caddy `response_header_timeout` for the public Sub2API site. It must be long enough for Codex `/responses/compact` requests; otherwise Caddy can return a client-visible 504 before Sub2API finishes the upstream compact request, and that edge timeout is not an account-level upstream failure that Sub2API can use for temporary-unschedulable failover. The numeric value belongs only in `config/platform-infra/sub2api-codex-pool.yaml`; after changing it, use `codex-pool expose --confirm` to reload Caddy and verify the rendered `response_header_timeout`. Requests that were already in flight before the reload may still finish with the previous timeout, so post-change evidence should check only requests that started after the reload.
- `publicExposure.masterCaddy.edgeRetry` controls the master Caddy reverse-proxy retry window for the public Sub2API site. This belongs at the edge because FRP remotePort listener loss, `connection refused`, EOF, or connection reset can happen before a request reaches Sub2API, so Sub2API account failover and sentinel logic cannot observe or recover that request. Keep retry scope narrow, especially for non-idempotent POST traffic: connection-attempt failures may be retried by the reverse proxy, while round-trip retry after an upstream connection was established should be limited by YAML `retryMatch` to paths that are safe to repeat, such as compact. Retry durations and intervals belong only in YAML; after changing them, run `codex-pool expose --confirm` and verify the rendered Caddyfile contains the expected `lb_try_duration`, `lb_try_interval`, and `lb_retry_match`.
- `localCodex` controls how the master server's current `~/.codex` consumer files are backed up and rewritten. Keep `supportsWebSockets` and `responsesWebSocketsV2` in the same state, and enable them only when at least one YAML-managed account has a current direct Codex WSv2 smoke that passes. If no upstream profile can sustain Responses WSv2, the honest long-term state is `false/false` so Codex uses HTTP Responses directly instead of repeatedly reconnecting before `response.completed`. `localCodex.responsesSmokeModel` is the YAML-declared model used by `codex-pool validate` for the lightweight `POST /v1/responses` smoke.
Enable account-level WebSocket v2 only for upstream profiles that have passed a direct Codex WSv2 probe. Treat this as a YAML-declared capability set, not a hard scheduling pin to one profile; if `localCodex` enables WebSocket transport, `codex-pool validate` must show at least one current `webSocketsV2.schedulableEnabled` account, and runtime smoke remains the availability proof. The same validation reports each managed account's runtime WebSocket v2 mode and whether it matches YAML, so stale `ctx_pool` / `passthrough` settings cannot silently keep routing Codex WS sessions to an upstream that closes with `no available account`, WS handshake 5xx/4xx, or before `response.completed`.
@@ -101,6 +102,8 @@ When `publicExposure.enabled` is true, the same FRP TCP bridge exposes both Open
The public management UI is an operations endpoint. Keep Sub2API itself in `platform-infra`, keep the Kubernetes Service as ClusterIP, and treat FRP as the only public bridge unless a later decision explicitly changes the exposure model.
The public bridge has two separate failure classes. Sub2API upstream/account failures are visible in Sub2API logs and should be handled by temporary-unschedulable rules, sentinel quarantine, or Sub2API failover. Edge failures between master Caddy and the FRP remotePort are not visible to Sub2API; symptoms include Caddy `connect: connection refused`, EOF, connection reset, or short 502 bursts while frps closes and reopens the configured remotePort. Those failures must be diagnosed from Caddy and frps/frpc evidence and mitigated through YAML-controlled Caddy edge retry or FRP stability fixes, not by disabling accounts or changing pool membership.
## Availability And Probes
Kubernetes readiness is not the same as pool availability:
@@ -109,6 +112,7 @@ Kubernetes readiness is not the same as pool availability:
- The FRP client deployment is currently a simple connector deployment and does not itself prove that master-local traffic reaches Sub2API.
- No scheduled `CronJob`, `ServiceMonitor`, or `PodMonitor` currently proves the full unified Codex API path.
- `platform-infra sub2api validate` and `platform-infra sub2api codex-pool validate` are on-demand checks. Operational usage is documented in `$unidesk-sub2api`; they are acceptable for deployment closeout, but they are not continuous monitoring. `codex-pool validate` must test both `GET /v1/models` and a small `POST /v1/responses` request, and the Responses smoke should report request id, selected/final account evidence, upstream failover count, and whether the validation succeeded only after failover. It should also summarize recent `/responses` and `/responses/compact` gateway failures separately so ordinary long streaming failures are not hidden behind compact-only evidence.
- Public exposure closeout must include the edge layer when the user-facing URL is involved. A Sub2API-side compact success summary does not rule out Caddy/FRP 502s that happened before Sub2API received the request; inspect the edge Caddy/frps/frpc evidence or use a CLI report that summarizes it before declaring public compact stable.
- Because `codex-pool validate` includes account alignment, recent-log inspection, and gateway smoke, timeout of the CLI transport is not valid negative evidence about Sub2API scheduling by itself. Closeout evidence must come from the final structured validation result or from an explicitly reported remote job failure with stdout/stderr tail, not from a single low-level `trans` timeout.
When an automatic availability probe is added, it should be YAML-controlled and cover these layers without printing secrets:
+122 -3
View File
@@ -156,6 +156,17 @@ interface CodexPoolPublicExposureConfig {
serviceName: string;
upstreamBaseUrl: string;
responseHeaderTimeoutSeconds: number;
edgeRetry: CodexPoolCaddyEdgeRetryConfig;
};
}
/**
 * Edge retry settings for the master Caddy reverse proxy, parsed from
 * `publicExposure.masterCaddy.edgeRetry` in the codex-pool YAML.
 */
interface CodexPoolCaddyEdgeRetryConfig {
/** When false, no retry directives are rendered into the Caddy site block. */
enabled: boolean;
/** Retry window in seconds; rendered as `lb_try_duration <n>s`. */
tryDurationSeconds: number;
/** Pause between attempts in milliseconds; rendered as `lb_try_interval <n>ms`. */
tryIntervalMilliseconds: number;
/** Request matchers rendered inside the `lb_retry_match` block. */
retryMatch: {
/** HTTP methods for the `method` matcher line; the line is omitted when empty. */
methods: string[];
/** Path matchers for the `path` matcher line; the line is omitted when empty. */
paths: string[];
};
}
@@ -999,6 +1010,15 @@ function defaultCodexPoolConfig(): CodexPoolConfig {
serviceName: "caddy",
upstreamBaseUrl: "http://127.0.0.1:21880",
responseHeaderTimeoutSeconds: 180,
edgeRetry: {
enabled: false,
tryDurationSeconds: 0,
tryIntervalMilliseconds: 250,
retryMatch: {
methods: [],
paths: [],
},
},
},
},
localCodex: {
@@ -1205,6 +1225,66 @@ function readCaddyTimeoutSeconds(value: unknown, key: string, fallback: number):
return seconds;
}
/**
 * Parses the `edgeRetry` YAML node into a validated edge retry config.
 *
 * - A missing node (`undefined` / `null`) yields a copy of `fallback`.
 * - When the resolved `enabled` flag is false, the remaining YAML fields are
 *   not read; a copy of `fallback` with `enabled: false` is returned.
 * - When enabled, both timing fields must be positive integers and
 *   `retryMatch` must be an object declaring at least one method or path.
 *
 * @param value Raw YAML value of the `edgeRetry` node.
 * @param fallback Config used when the node is absent or disabled.
 * @param key Dotted YAML key used as the prefix of error messages.
 * @returns A fresh config object that does not alias `fallback`.
 * @throws Error when the node or any enabled-only field is malformed.
 */
function readCaddyEdgeRetryConfig(value: unknown, fallback: CodexPoolCaddyEdgeRetryConfig, key: string): CodexPoolCaddyEdgeRetryConfig {
  if (value === undefined || value === null) {
    return cloneCaddyEdgeRetryConfig(fallback);
  }
  if (!isRecord(value)) {
    throw new Error(`${codexPoolConfigPath}.${key} must be a YAML object`);
  }

  const enabled = readBooleanConfig(value.enabled, `${key}.enabled`, fallback.enabled);
  if (!enabled) {
    // Disabled: keep the fallback's timing and matchers, only force the flag off.
    const disabled = cloneCaddyEdgeRetryConfig(fallback);
    disabled.enabled = false;
    return disabled;
  }

  const tryDurationSeconds = readPositiveInteger(value.tryDurationSeconds, `${key}.tryDurationSeconds`);
  const tryIntervalMilliseconds = readPositiveInteger(value.tryIntervalMilliseconds, `${key}.tryIntervalMilliseconds`);

  const rawRetryMatch = value.retryMatch;
  if (!isRecord(rawRetryMatch)) {
    throw new Error(`${codexPoolConfigPath}.${key}.retryMatch must be a YAML object when enabled=true`);
  }

  const methods = readCaddyRetryMethods(rawRetryMatch.methods, `${key}.retryMatch.methods`);
  const paths = readCaddyRetryPaths(rawRetryMatch.paths, `${key}.retryMatch.paths`);
  if (methods.length + paths.length === 0) {
    throw new Error(`${codexPoolConfigPath}.${key}.retryMatch must include at least one method or path matcher when enabled=true`);
  }

  return {
    enabled,
    tryDurationSeconds,
    tryIntervalMilliseconds,
    retryMatch: { methods, paths },
  };
}
/**
 * Reads a YAML value that must be an integer greater than or equal to 1.
 *
 * @param value Raw YAML value.
 * @param key Dotted YAML key used in the error message.
 * @returns The parsed integer.
 * @throws Error when the value is missing, non-numeric, fractional, or below 1.
 */
function readPositiveInteger(value: unknown, key: string): number {
  const candidate = numberValue(value);
  if (candidate !== null && Number.isInteger(candidate) && candidate >= 1) {
    return candidate;
  }
  throw new Error(`${codexPoolConfigPath}.${key} must be a positive integer`);
}
/**
 * Reads the optional `retryMatch.methods` YAML array.
 *
 * Entries are upper-cased, must look like HTTP method tokens, and are
 * de-duplicated while keeping first-seen order.
 *
 * @param value Raw YAML value; `undefined` / `null` yields an empty list.
 * @param key Dotted YAML key used in error messages.
 * @returns Normalized, unique method names.
 * @throws Error when the value is not an array or an entry is not a method token.
 */
function readCaddyRetryMethods(value: unknown, key: string): string[] {
  if (value === undefined || value === null) {
    return [];
  }
  if (!Array.isArray(value)) {
    throw new Error(`${codexPoolConfigPath}.${key} must be a YAML array`);
  }

  const methodToken = /^[A-Z][A-Z0-9_-]*$/u;
  // A Set keeps insertion order, so it handles both de-duplication and ordering.
  const unique = new Set<string>();
  for (const entry of value) {
    const raw = stringValue(entry);
    const normalized = raw == null ? null : raw.toUpperCase();
    if (normalized === null || !methodToken.test(normalized)) {
      throw new Error(`${codexPoolConfigPath}.${key} entries must be HTTP method tokens`);
    }
    unique.add(normalized);
  }
  return Array.from(unique);
}
/**
 * Reads the optional `retryMatch.paths` YAML array.
 *
 * Each entry must be a single Caddy path matcher token: it has to start with
 * `/` and must not contain any whitespace or control character. The entries
 * are later joined with spaces onto one Caddyfile `path` line, so an entry
 * with embedded whitespace would be split into several matchers and could
 * silently widen which requests are retried. Duplicates are dropped while
 * keeping first-seen order.
 *
 * @param value Raw YAML value; `undefined` / `null` yields an empty list.
 * @param key Dotted YAML key used in error messages.
 * @returns Unique path matchers in declaration order.
 * @throws Error when the value is not an array or an entry is not a single path token.
 */
function readCaddyRetryPaths(value: unknown, key: string): string[] {
  if (value === undefined || value === null) return [];
  if (!Array.isArray(value)) throw new Error(`${codexPoolConfigPath}.${key} must be a YAML array`);
  // Whitespace (including tab, NBSP, NEL) or control characters would break the one-token-per-entry contract.
  const unsafeCharacters = /[\s\u0000-\u001f\u007f\u0085]/u;
  const seen = new Set<string>();
  const paths: string[] = [];
  for (const item of value) {
    const path = stringValue(item);
    if (path === null || !path.startsWith("/") || unsafeCharacters.test(path)) {
      throw new Error(`${codexPoolConfigPath}.${key} entries must be Caddy path matchers starting with / and without whitespace or control characters`);
    }
    if (seen.has(path)) continue;
    seen.add(path);
    paths.push(path);
  }
  return paths;
}
function readTempUnschedulablePolicy(value: unknown, key: string, fallback: CodexTempUnschedulablePolicy): CodexTempUnschedulablePolicy {
if (value === undefined || value === null) return cloneTempUnschedulablePolicy(fallback);
if (!isRecord(value)) throw new Error(`${codexPoolConfigPath}.${key} must be a YAML object`);
@@ -1265,6 +1345,18 @@ function cloneTempUnschedulablePolicy(policy: CodexTempUnschedulablePolicy): Cod
};
}
/**
 * Returns a copy of an edge retry config whose matcher arrays are new
 * array instances, so mutating the copy never affects the source.
 *
 * @param config Config to copy.
 * @returns A structurally equal config sharing no arrays with `config`.
 */
function cloneCaddyEdgeRetryConfig(config: CodexPoolCaddyEdgeRetryConfig): CodexPoolCaddyEdgeRetryConfig {
  const methods = Array.from(config.retryMatch.methods);
  const paths = Array.from(config.retryMatch.paths);
  const copy: CodexPoolCaddyEdgeRetryConfig = {
    enabled: config.enabled,
    tryDurationSeconds: config.tryDurationSeconds,
    tryIntervalMilliseconds: config.tryIntervalMilliseconds,
    retryMatch: { methods, paths },
  };
  return copy;
}
function readBooleanConfig(value: unknown, key: string, fallback: boolean): boolean {
if (value === undefined || value === null) return fallback;
const parsed = booleanValue(value);
@@ -1304,6 +1396,11 @@ function readPublicExposureConfig(value: unknown, defaults: CodexPoolPublicExpos
"publicExposure.masterCaddy.responseHeaderTimeoutSeconds",
defaults.masterCaddy.responseHeaderTimeoutSeconds,
),
edgeRetry: readCaddyEdgeRetryConfig(
masterCaddyValue.edgeRetry,
defaults.masterCaddy.edgeRetry,
"publicExposure.masterCaddy.edgeRetry",
),
},
};
validateKubernetesName(config.configMapName, "publicExposure.configMapName", true);
@@ -2137,6 +2234,7 @@ function publicExposureSummary(pool: CodexPoolConfig): Record<string, unknown> {
serviceName: pool.publicExposure.masterCaddy.serviceName,
upstreamBaseUrl: pool.publicExposure.masterCaddy.upstreamBaseUrl,
responseHeaderTimeoutSeconds: pool.publicExposure.masterCaddy.responseHeaderTimeoutSeconds,
edgeRetry: pool.publicExposure.masterCaddy.edgeRetry,
},
upstream: {
localIP: pool.publicExposure.localIP,
@@ -2201,7 +2299,7 @@ async function applyMasterCaddySite(pool: CodexPoolConfig): Promise<Record<strin
const path = caddy.configPath;
if (!existsSync(path)) return { ok: false, error: "master-caddy-config-missing", path, valuesPrinted: false };
const before = readFileSync(path, "utf8");
const desiredBlock = renderCaddySiteBlock(caddy.domain, caddy.upstreamBaseUrl, caddy.responseHeaderTimeoutSeconds);
const desiredBlock = renderCaddySiteBlock(caddy.domain, caddy.upstreamBaseUrl, caddy.responseHeaderTimeoutSeconds, caddy.edgeRetry);
const existing = caddySiteBlock(before, caddy.domain);
const alreadyConfigured = existing === desiredBlock;
let backupPath: string | null = null;
@@ -2233,6 +2331,7 @@ async function applyMasterCaddySite(pool: CodexPoolConfig): Promise<Record<strin
upstreamBaseUrl: caddy.upstreamBaseUrl,
serviceName: caddy.serviceName,
responseHeaderTimeoutSeconds: caddy.responseHeaderTimeoutSeconds,
edgeRetry: caddy.edgeRetry,
validate: {
exitCode: validate.exitCode,
stdoutTail: Buffer.from(validate.stdout).toString("utf8").slice(-1000),
@@ -2252,13 +2351,19 @@ async function applyMasterCaddySite(pool: CodexPoolConfig): Promise<Record<strin
};
}
export function renderCaddySiteBlock(domain: string, upstreamBaseUrl: string, responseHeaderTimeoutSeconds = 180): string {
export function renderCaddySiteBlock(
domain: string,
upstreamBaseUrl: string,
responseHeaderTimeoutSeconds = 180,
edgeRetry: CodexPoolCaddyEdgeRetryConfig | null = null,
): string {
const upstream = new URL(upstreamBaseUrl);
const upstreamHost = `${upstream.hostname}${upstream.port ? `:${upstream.port}` : ""}`;
const retryBlock = renderCaddyEdgeRetryBlock(edgeRetry);
return `${domain} {
encode zstd gzip
reverse_proxy ${upstreamHost} {
header_up Host {host}
${retryBlock} header_up Host {host}
header_up X-Real-IP {remote_host}
transport http {
dial_timeout 5s
@@ -2268,6 +2373,20 @@ export function renderCaddySiteBlock(domain: string, upstreamBaseUrl: string, re
}`;
}
/**
 * Renders the retry directives that are spliced into the Caddy
 * `reverse_proxy` block.
 *
 * @param edgeRetry Edge retry config, or null when none is supplied.
 * @returns An empty string when `edgeRetry` is null or disabled; otherwise the
 *   `lb_try_duration`, `lb_try_interval`, and `lb_retry_match` directives,
 *   terminated by a newline.
 *
 * NOTE(review): when both matcher lists are empty the `lb_retry_match` block
 * is still rendered, with an empty body. The YAML reader rejects that
 * combination when enabled, but this function does not guard against it.
 */
function renderCaddyEdgeRetryBlock(edgeRetry: CodexPoolCaddyEdgeRetryConfig | null): string {
if (edgeRetry === null || !edgeRetry.enabled) return "";
// One matcher line per non-empty list; entries are joined with single spaces.
const matchLines = [
edgeRetry.retryMatch.methods.length > 0 ? ` method ${edgeRetry.retryMatch.methods.join(" ")}` : null,
edgeRetry.retryMatch.paths.length > 0 ? ` path ${edgeRetry.retryMatch.paths.join(" ")}` : null,
].filter((line): line is string => line !== null);
// Whitespace inside the template literal is part of the rendered Caddyfile text.
return ` lb_try_duration ${edgeRetry.tryDurationSeconds}s
lb_try_interval ${edgeRetry.tryIntervalMilliseconds}ms
lb_retry_match {
${matchLines.join("\n")}
}
`;
}
function caddySiteBlock(text: string, domain: string): string | null {
const startMatch = new RegExp(`(^|\\n)${escapeRegExp(domain)}\\s*\\{`, "u").exec(text);
if (startMatch === null) return null;