diff --git a/.agents/skills/unidesk-sub2api/SKILL.md b/.agents/skills/unidesk-sub2api/SKILL.md index 3dfd5cad..a73093ed 100644 --- a/.agents/skills/unidesk-sub2api/SKILL.md +++ b/.agents/skills/unidesk-sub2api/SKILL.md @@ -105,7 +105,8 @@ bun scripts/cli.ts platform-infra sub2api codex-pool cleanup-probes --target D60 - `profiles.entries[].capacity`: 可选 per-account concurrency override;不写则使用 `pool.defaultAccountCapacity`。具体数值只以 `config/platform-infra/sub2api-codex-pool.yaml` 为准,skill 和长期参考只描述规则,不重复写当前值。 - `profiles.entries[].loadFactor`: 可选 per-account Sub2API `load_factor` override;不写则使用 `pool.defaultAccountLoadFactor`。具体数值只以 `config/platform-infra/sub2api-codex-pool.yaml` 为准,修改后必须 `codex-pool sync --confirm` 和 `codex-pool validate`。 - `profiles.entries[].trustUpstream`: 可选账号级哨兵信任标记;默认 `false`。可信账号使用 `sentinel.cadence.trustedSuccessMaxIntervalMinutes` 作为连续成功后的最大探测退避,不可信账号使用 `sentinel.cadence.untrustedSuccessMaxIntervalMinutes`。它只影响哨兵探测频率和状态可见性,不改变 Sub2API account priority/capacity/loadFactor。 -- `profiles.entries[].sentinelProtect`: 可选账号级哨兵保护策略;默认关闭,只给需要避免偶发 marker 波动误屏蔽的高价值账号启用。启用时必须显式声明 `consecutiveFailures`、`initialRetryDelaySeconds`、`maxRetryDelaySeconds` 和 `backoffMultiplier`;marker probe 或 gateway failure 触发冻结前会先按该策略做连续 marker 确认,只有全部失败才进入冻结状态机。它只影响哨兵冻结判定和 `sentinel-report` 可见性,不改变 Sub2API account priority/capacity/loadFactor。 +- `pool.defaultSentinelProtect`: 账号级哨兵保护默认策略;是否启用、连续确认次数、初始延迟、最大延迟和退避倍率都只以 YAML 为准。marker probe 或 gateway failure 触发冻结前会先按该策略做连续 marker 确认,只有全部失败才进入冻结状态机。 +- `profiles.entries[].sentinelProtect`: 可选账号级哨兵保护覆盖;只用于明确偏离 pool 默认策略。它只影响哨兵冻结判定和 `sentinel-report` 可见性,不改变 Sub2API account priority/capacity/loadFactor。 - 除非用户明确要求修改配置,不要仅凭推断改账号 membership、priority、capacity、loadFactor、WebSocket mode 或其他调度策略;先保留 YAML,完成 provenance/runtime evidence 溯源,并把结论写回相关 issue 或 runbook 后再提出变更。 - Sub2API 是 UniDesk 可读源码和可观测运行面的受控组件;排查 Sub2API 调度、failover、错误传播、临时不可调度或 account selection 时,默认先读当前 Sub2API 源码实现,再用真实 request id、Sub2API 日志和原入口流量验证。不要用 mock upstream、临时 
probe account 或测试桩作为默认结论来源;这类探针最多是显式 debug 辅助,不能替代源码链路和真实运行证据。 - `profiles.entries[].tempUnschedulable`: 可选 per-account Sub2API 内置临时不可调度覆盖;只用于明确偏离 pool 默认规则,不用它给某个账号特殊优先级或临时绕过通用 failover。 diff --git a/config/platform-infra/sub2api-codex-pool.yaml b/config/platform-infra/sub2api-codex-pool.yaml index e5c6d4e5..de59c7d0 100644 --- a/config/platform-infra/sub2api-codex-pool.yaml +++ b/config/platform-infra/sub2api-codex-pool.yaml @@ -7,6 +7,12 @@ pool: defaultAccountPriority: 10 defaultAccountCapacity: 10 defaultAccountLoadFactor: 10 + defaultSentinelProtect: + enabled: true + consecutiveFailures: 3 + initialRetryDelaySeconds: 2 + maxRetryDelaySeconds: 60 + backoffMultiplier: 2 defaultTempUnschedulable: enabled: true rules: @@ -90,12 +96,6 @@ profiles: configFile: config.toml.only authFile: auth.json.only trustUpstream: true - sentinelProtect: - enabled: true - consecutiveFailures: 3 - initialRetryDelaySeconds: 2 - maxRetryDelaySeconds: 60 - backoffMultiplier: 2 loadFactor: 1 priority: 110 - profile: zakuzaku diff --git a/docs/reference/platform-infra.md b/docs/reference/platform-infra.md index 77deb2dd..9db3a599 100644 --- a/docs/reference/platform-infra.md +++ b/docs/reference/platform-infra.md @@ -134,9 +134,9 @@ The sentinel must not maintain separate classifiers for "private content", "main `profiles.entries[].trustUpstream` is the durable account-level trust marker for sentinel success cadence, and the absence of the field means untrusted. Trusted and untrusted accounts use separate YAML cadence maximums after marker-matching probes; the values belong only in `config/platform-infra/sub2api-codex-pool.yaml`. This field must not change Sub2API scheduler priority, capacity, load factor, membership, built-in temporary-unschedulable settings, or the marker-only health contract. Its purpose is to keep intermittently unreliable 200-success providers under more frequent direct probes without adding provider-specific content classifiers. 
-`profiles.entries[].sentinelProtect` is an optional account-level protection policy for sentinel freeze decisions, and the absence of the field means disabled. For protected accounts, the marker-only health contract still applies, but the sentinel must exhaust the configured consecutive marker confirmation attempts before treating the account as failed and entering the freeze state machine. The retry count, initial delay, maximum delay, and backoff multiplier are YAML values; long-term reference prose must not duplicate the current numbers. This policy exists only to absorb occasional marker/probe or gateway-failure confirmation jitter for selected accounts. It must not change Sub2API scheduler priority, capacity, load factor, membership, built-in temporary-unschedulable settings, or the recovery condition. +`pool.defaultSentinelProtect` is the default protection policy for sentinel freeze decisions, and `profiles.entries[].sentinelProtect` may override it for a specific account; an account entry that omits the field uses the pool default. For protected accounts, the marker-only health contract still applies, but the sentinel must exhaust the configured consecutive marker confirmation attempts before treating the account as failed and entering the freeze state machine. The retry count, initial delay, maximum delay, and backoff multiplier are YAML values; long-term reference prose must not duplicate the current numbers. This policy exists only to absorb occasional marker/probe or gateway-failure confirmation jitter. It must not change Sub2API scheduler priority, capacity, load factor, membership, built-in temporary-unschedulable settings, or the recovery condition. 
-When `codex-pool sync --confirm` creates a YAML-managed account or changes direct-probe-relevant account inputs such as the profile mapping, upstream base URL, API key fingerprint, upstream User-Agent, Responses WebSocket mode, `trustUpstream`, or `sentinelProtect`, sync records a pending sentinel probe from the pre-mutation runtime state, updates the account, restores `schedulable=true` unless an active sentinel quarantine already exists, and schedules the account probe immediately. New or changed accounts are not default-frozen; only an actual non-marker probe result or an existing active quarantine may remove an account from the scheduler. This avoids zero-available windows during sync while still ensuring that later marker failures enter the normal freeze/restore state machine. Unchanged accounts must not have their existing success or failure backoff reset by unrelated YAML syncs. +When `codex-pool sync --confirm` creates a YAML-managed account or changes direct-probe-relevant account inputs such as the profile mapping, upstream base URL, API key fingerprint, upstream User-Agent, Responses WebSocket mode, `trustUpstream`, or pool/profile `sentinelProtect`, sync records a pending sentinel probe from the pre-mutation runtime state, updates the account, restores `schedulable=true` unless an active sentinel quarantine already exists, and schedules the account probe immediately. New or changed accounts are not default-frozen; only an actual non-marker probe result or an existing active quarantine may remove an account from the scheduler. This avoids zero-available windows during sync while still ensuring that later marker failures enter the normal freeze/restore state machine. Unchanged accounts must not have their existing success or failure backoff reset by unrelated YAML syncs. 
If the YAML failure freeze maximum is lowered, `codex-pool sync --confirm` may migrate only currently active sentinel quarantines whose stored interval or next recovery time exceeds the current maximum. The migration keeps the account frozen, marks the next recovery probe due immediately, and lets the next marker result decide restore versus the new shorter failure backoff. It must not clear quarantine or restore schedulability merely because an older TTL has expired. diff --git a/scripts/src/platform-infra-sub2api-codex.ts b/scripts/src/platform-infra-sub2api-codex.ts index 3c4ca3c8..a3d18cb1 100644 --- a/scripts/src/platform-infra-sub2api-codex.ts +++ b/scripts/src/platform-infra-sub2api-codex.ts @@ -141,6 +141,7 @@ interface CodexPoolConfig { defaultAccountCapacity: number; defaultAccountLoadFactor: number; defaultTempUnschedulable: CodexTempUnschedulablePolicy; + defaultSentinelProtect: CodexSentinelProtectPolicy; profiles: CodexPoolProfileConfig[]; publicExposure: CodexPoolPublicExposureConfig; localCodex: CodexPoolLocalCodexConfig; @@ -703,6 +704,7 @@ async function codexPoolSync(config: UniDeskConfig, options: SyncOptions): Promi defaultAccountPriority: pool.defaultAccountPriority, defaultAccountCapacity: pool.defaultAccountCapacity, defaultAccountLoadFactor: pool.defaultAccountLoadFactor, + defaultSentinelProtect: pool.defaultSentinelProtect, }, profiles: profiles.map((profile) => ({ profile: profile.profile, @@ -1085,7 +1087,7 @@ function collectCodexProfiles(): CodexProfile[] { const seenAccountNames = new Set(); const configs = pool.profiles.length > 0 ? 
pool.profiles - : discoverCodexProfileConfigs(codexDir, pool.defaultTempUnschedulable, pool.defaultAccountPriority); + : discoverCodexProfileConfigs(codexDir, pool.defaultTempUnschedulable, pool.defaultSentinelProtect, pool.defaultAccountPriority); return configs.map((entry) => { const resolved = resolveProfileFiles(codexDir, entry); const profile = entry.profile; @@ -1162,6 +1164,7 @@ function collectCodexProfiles(): CodexProfile[] { function discoverCodexProfileConfigs( codexDir: string, defaultTempUnschedulable = defaultCodexTempUnschedulablePolicy(), + defaultSentinelProtect = defaultCodexSentinelProtectPolicy(), defaultPriority = defaultAccountPriority, ): CodexPoolProfileConfig[] { return readdirSync(codexDir) @@ -1184,7 +1187,7 @@ function discoverCodexProfileConfigs( openaiResponsesWebSocketsV2Mode: null, upstreamUserAgent: null, trustUpstream: false, - sentinelProtect: defaultCodexSentinelProtectPolicy(), + sentinelProtect: defaultSentinelProtect, priority: defaultPriority, capacity: null, loadFactor: null, @@ -1215,10 +1218,12 @@ function readCodexPoolConfig(): CodexPoolConfig { const defaultAccountPriorityValue = readAccountPriority(pool.defaultAccountPriority, "pool.defaultAccountPriority"); const defaultAccountCapacityValue = readAccountCapacity(pool.defaultAccountCapacity, "pool.defaultAccountCapacity"); const defaultAccountLoadFactorValue = readAccountLoadFactor(pool.defaultAccountLoadFactor, "pool.defaultAccountLoadFactor"); + const defaultSentinelProtect = readSentinelProtectPolicy(pool.defaultSentinelProtect, "pool.defaultSentinelProtect", defaults.defaultSentinelProtect); const profiles = readProfileConfig( parsed.profiles, defaults.profiles, defaultTempUnschedulable, + defaultSentinelProtect, defaultAccountPriorityValue, ); const declaredAccountCapacity = desiredProfileCapacityTotal(profiles, defaultAccountCapacityValue); @@ -1238,6 +1243,7 @@ function readCodexPoolConfig(): CodexPoolConfig { defaultAccountCapacity: defaultAccountCapacityValue, 
defaultAccountLoadFactor: defaultAccountLoadFactorValue, defaultTempUnschedulable, + defaultSentinelProtect, profiles, publicExposure: readPublicExposureConfig(parsed.publicExposure, defaults.publicExposure), localCodex: readLocalCodexConfig(parsed.localCodex, defaults.localCodex), @@ -1268,6 +1274,7 @@ function defaultCodexPoolConfig(): CodexPoolConfig { defaultAccountCapacity: 5, defaultAccountLoadFactor: 1, defaultTempUnschedulable: defaultCodexTempUnschedulablePolicy(), + defaultSentinelProtect: defaultCodexSentinelProtectPolicy(), profiles: [], publicExposure: { enabled: false, @@ -1329,6 +1336,7 @@ function readProfileConfig( value: unknown, defaults: CodexPoolProfileConfig[], defaultTempUnschedulable: CodexTempUnschedulablePolicy, + defaultSentinelProtect: CodexSentinelProtectPolicy, defaultPriority: number, ): CodexPoolProfileConfig[] { if (!isRecord(value)) return defaults; @@ -1354,7 +1362,7 @@ function readProfileConfig( const openaiResponsesWebSocketsV2Mode = readOpenAIResponsesWebSocketsV2Mode(entry.openaiResponsesWebSocketsV2Mode, `profiles.entries[${index}].openaiResponsesWebSocketsV2Mode`); const upstreamUserAgent = readUpstreamUserAgent(entry.upstreamUserAgent, `profiles.entries[${index}].upstreamUserAgent`); const trustUpstream = readTrustUpstream(entry.trustUpstream, `profiles.entries[${index}].trustUpstream`); - const sentinelProtect = readSentinelProtectPolicy(entry.sentinelProtect, `profiles.entries[${index}].sentinelProtect`); + const sentinelProtect = readSentinelProtectPolicy(entry.sentinelProtect, `profiles.entries[${index}].sentinelProtect`, defaultSentinelProtect); const priority = readAccountPriority(entry.priority, `profiles.entries[${index}].priority`, defaultPriority); const capacity = entry.capacity === undefined || entry.capacity === null ? null : readAccountCapacity(entry.capacity, `profiles.entries[${index}].capacity`); const loadFactor = entry.loadFactor === undefined || entry.loadFactor === null ? 
null : readAccountLoadFactor(entry.loadFactor, `profiles.entries[${index}].loadFactor`); @@ -1422,8 +1430,7 @@ export function defaultCodexSentinelProtectPolicy(): CodexSentinelProtectPolicy }; } -function readSentinelProtectPolicy(value: unknown, key: string): CodexSentinelProtectPolicy { - const fallback = defaultCodexSentinelProtectPolicy(); +function readSentinelProtectPolicy(value: unknown, key: string, fallback = defaultCodexSentinelProtectPolicy()): CodexSentinelProtectPolicy { if (value === undefined || value === null) return fallback; if (!isRecord(value)) throw new Error(`${codexPoolConfigPath}.${key} must be a YAML object`); const enabled = readBooleanConfig(value.enabled, `${key}.enabled`, fallback.enabled); @@ -1857,6 +1864,7 @@ function codexPoolConfigSummary(pool: CodexPoolConfig): Record defaultAccountCapacity: pool.defaultAccountCapacity, defaultAccountLoadFactor: pool.defaultAccountLoadFactor, defaultTempUnschedulable: tempUnschedulableSummary(pool.defaultTempUnschedulable), + defaultSentinelProtect: pool.defaultSentinelProtect, profileCount: pool.profiles.length, publicExposure: publicExposureSummary(pool), localCodex: pool.localCodex, @@ -2744,6 +2752,7 @@ function poolTarget(pool = readCodexPoolConfig(), target = codexPoolRuntimeTarge defaultAccountPriority: pool.defaultAccountPriority, defaultAccountCapacity: pool.defaultAccountCapacity, defaultAccountLoadFactor: pool.defaultAccountLoadFactor, + defaultSentinelProtect: pool.defaultSentinelProtect, sentinel: { monitorEnabled: pool.sentinel.monitor.enabled, actionsEnabled: pool.sentinel.actions.enabled, @@ -3875,7 +3884,7 @@ function desiredAccountCapacityMap(pool: CodexPoolConfig): Record(); const configs = pool.profiles.length > 0 ? 
pool.profiles - : discoverCodexProfileConfigs(codexDir, pool.defaultTempUnschedulable, pool.defaultAccountPriority); + : discoverCodexProfileConfigs(codexDir, pool.defaultTempUnschedulable, pool.defaultSentinelProtect, pool.defaultAccountPriority); const capacities: Record = {}; for (const entry of configs) { const accountName = entry.accountName ?? uniqueAccountName(entry.profile, seenAccountNames); @@ -3890,7 +3899,7 @@ function desiredAccountLoadFactorMap(pool: CodexPoolConfig): Record(); const configs = pool.profiles.length > 0 ? pool.profiles - : discoverCodexProfileConfigs(codexDir, pool.defaultTempUnschedulable, pool.defaultAccountPriority); + : discoverCodexProfileConfigs(codexDir, pool.defaultTempUnschedulable, pool.defaultSentinelProtect, pool.defaultAccountPriority); const loadFactors: Record = {}; for (const entry of configs) { const accountName = entry.accountName ?? uniqueAccountName(entry.profile, seenAccountNames); @@ -3905,7 +3914,7 @@ function desiredAccountWebSocketsV2ModeMap(pool: CodexPoolConfig): Record(); const configs = pool.profiles.length > 0 ? pool.profiles - : discoverCodexProfileConfigs(codexDir, pool.defaultTempUnschedulable, pool.defaultAccountPriority); + : discoverCodexProfileConfigs(codexDir, pool.defaultTempUnschedulable, pool.defaultSentinelProtect, pool.defaultAccountPriority); const modes: Record = {}; for (const entry of configs) { const accountName = entry.accountName ?? uniqueAccountName(entry.profile, seenAccountNames); @@ -3920,7 +3929,7 @@ function desiredAccountTempUnschedulableMap(pool: CodexPoolConfig): Record(); const configs = pool.profiles.length > 0 ? pool.profiles - : discoverCodexProfileConfigs(codexDir, pool.defaultTempUnschedulable, pool.defaultAccountPriority); + : discoverCodexProfileConfigs(codexDir, pool.defaultTempUnschedulable, pool.defaultSentinelProtect, pool.defaultAccountPriority); const policies: Record = {}; for (const entry of configs) { const accountName = entry.accountName ?? 
uniqueAccountName(entry.profile, seenAccountNames);