From 9f85274da00f04b4ccb89083d2bc98b98e964216 Mon Sep 17 00:00:00 2001 From: Codex Date: Thu, 11 Jun 2026 10:20:23 +0000 Subject: [PATCH] fix: gate changed sub2api accounts behind sentinel probes --- .agents/skills/unidesk-sub2api/SKILL.md | 6 +- config/platform-infra/sub2api-codex-pool.yaml | 4 +- docs/reference/platform-infra.md | 6 +- ...ra-sub2api-codex-sentinel-contract-test.ts | 80 ++++- .../platform-infra-sub2api-codex-sentinel.ts | 7 +- scripts/src/platform-infra-sub2api-codex.ts | 340 ++++++++++++++++-- 6 files changed, 408 insertions(+), 35 deletions(-) diff --git a/.agents/skills/unidesk-sub2api/SKILL.md b/.agents/skills/unidesk-sub2api/SKILL.md index 01f20830..416b338d 100644 --- a/.agents/skills/unidesk-sub2api/SKILL.md +++ b/.agents/skills/unidesk-sub2api/SKILL.md @@ -89,14 +89,14 @@ bun scripts/cli.ts platform-infra sub2api codex-pool cleanup-probes --confirm - `sentinel.probe.maxOutputTokens`: 哨兵本地流式 delta 收集上限,必须保持小值;它不作为上游 `max_output_tokens` 字段发送,以保持与 Sub2API WebUI 默认账号连接测试的 Responses SSE 请求形态一致。哨兵不限制并发和每轮账号数,所有到期账号会在同一轮并发探测。 - `sentinel.probe.userAgent`: 哨兵 direct upstream probe 的默认 User-Agent,通过 OpenAI SDK `extra_headers` 传递;默认贴近 Sub2API `net/http` 账号连接测试形态,个别账号仍可用 `profiles.entries[].upstreamUserAgent` 覆盖。 - `sentinel.cadence`: 成功信任指数退避配置。当前口径是从 1 分钟开始,连续成功后退避到最大 20 分钟;任意非 marker match 清零成功信任并进入冻结退避。 -- `sentinel.freeze`: 冻结 TTL 指数退避配置。当前口径是初始 2 分钟,失败后 `2m -> 4m -> 8m -> 16m -> 32m -> 64m -> 120m`,最大 2 小时;冻结到期后只做恢复 probe,通过才自动恢复,不能仅靠 TTL 到期解封。 +- `sentinel.freeze`: 失败冻结 TTL 指数退避配置。当前口径是初始 1 分钟,失败后 `1m -> 2m -> 4m -> 8m -> 10m`,最大 10 分钟;失败 probe 基本不消耗有效输出 token,因此冻结窗口保持短周期。冻结到期后只做恢复 probe,通过才自动恢复,不能仅靠 TTL 到期解封。 - `sentinel.pricing`: 直打上游时哨兵自己的 token/cost 估算价格。因为 direct upstream probe 不经过 Sub2API 普通用量账本,哨兵必须自己记录全局与 per-account token/cost;这些账本只用于观察,不作为跳过探测的预算门禁。 -`sync --confirm` 会登录 Sub2API admin、创建/更新 group、创建/更新 YAML 中的 `unidesk-codex-*` accounts、创建/复用统一 API key Secret,并把 managed account 的 `schedulable=true` 恢复为过程控制基线;它默认不删除 YAML 中缺席的 managed account。只有明确退役上游时才使用 `sync --confirm --prune-removed` 删除缺席且 `extra.unidesk_managed=true` 的 `unidesk-codex-*` account。 +`sync --confirm` 会登录 Sub2API admin、创建/更新 group、创建/更新 YAML 中的 `unidesk-codex-*` accounts、创建/复用统一 API key Secret,并把未触发质检的 managed account 的 `schedulable=true` 恢复为过程控制基线;它默认不删除 YAML 中缺席的 managed account。只有明确退役上游时才使用 `sync --confirm --prune-removed` 删除缺席且 `extra.unidesk_managed=true` 的 `unidesk-codex-*` account。 `sentinel-image status|build` 管理哨兵 Python 运行环境镜像。镜像由 YAML 的 `sentinel.image` 基础镜像和 `sentinel.sdk.openaiPythonVersion` 派生,发布到 G14 本地 registry `127.0.0.1:5000/platform-infra/sub2api-account-sentinel:`;`build --confirm` 会先检查 registry tag,存在则快速复用,不存在才在 G14 host 构建并 push。CronJob 启动时只校验 SDK 版本,不在运行时 `pip install`。 -`sync --confirm` 同时会按 YAML 渲染账号级哨兵资源,并在 monitor 开启时先确保可复用哨兵镜像存在。当前目标是 `sentinel.monitor.enabled=true` + `sentinel.actions.enabled=true` 的 marker-only 自动冻结/恢复;不要手工 patch CronJob、Secret 或 Sub2API account。若怀疑某个账号被误判,先用 `codex-pool sentinel-probe --account --confirm` 立即触发该账号测量;该命令从现有 CronJob 模板派生一次性 Job,复用同一份 Secret、ConfigMap、OpenAI SDK probe、token/cost 账本和冻结/恢复状态机。 +`sync --confirm` 同时会按 YAML 渲染账号级哨兵资源,并在 monitor 开启时先确保可复用哨兵镜像存在。当前目标是 `sentinel.monitor.enabled=true` + `sentinel.actions.enabled=true` 的 marker-only 自动冻结/恢复;不要手工 patch CronJob、Secret 或 Sub2API account。若 YAML 新增账号或修改 profile/base URL/API key fingerprint/upstream User-Agent/Responses WebSocket mode,sync 会先从变更前 runtime state 写入 pending quality gate,再更新账号并保持默认冻结,最后立即安排 sentinel probe;marker 通过后才自动恢复调度,避免坏号未经质检混入池子。无关账号的既有成功/失败退避不能被重置。若 YAML 下调失败冻结最大窗口,sync 会把仍 active 的旧冻结状态迁移到当前最大窗口内并立即安排 recovery probe,但不会直接解冻。若怀疑某个账号被误判,先用 `codex-pool sentinel-probe --account --confirm` 立即触发该账号测量;该命令从现有 CronJob 模板派生一次性 Job,复用同一份 Secret、ConfigMap、OpenAI SDK probe、token/cost 账本和冻结/恢复状态机。 `sentinel-report` 是只读低噪声报表,不触发 probe、不修改账号。默认输出类似 `ps` 的文本表,展示每个账号的探测次数、最近 marker/HTTP/动作、冻结 TTL、成功退避、下一次 probe 和最近 run 事件;需要机器处理时使用 `sentinel-report --raw`。 diff --git a/config/platform-infra/sub2api-codex-pool.yaml b/config/platform-infra/sub2api-codex-pool.yaml index 723485ed..5ff69ef8 100644 --- a/config/platform-infra/sub2api-codex-pool.yaml +++ b/config/platform-infra/sub2api-codex-pool.yaml @@ -174,8 +174,8 @@ sentinel: successBackoffMultiplier: 2 jitterPercent: 10 freeze: - initialTtlMinutes: 2 - maxTtlMinutes: 120 + initialTtlMinutes: 1 + maxTtlMinutes: 10 backoffMultiplier: 2 jitterPercent: 10 pricing: diff --git a/docs/reference/platform-infra.md b/docs/reference/platform-infra.md index 195d4bea..2f8c7264 100644 --- a/docs/reference/platform-infra.md +++ b/docs/reference/platform-infra.md @@ -73,7 +73,11 @@ An external account-level sentinel that wants parity with this WebUI path should The UniDesk account-level sentinel uses marker-only health semantics. A probe is healthy only when the upstream response satisfies the configured marker match. Every other result is unhealthy and must enter the same exponential freeze state machine, regardless of whether the immediate response is HTTP 200, 400, 403, 429, 500, 502, 503, 504, a streaming error event, malformed output, empty output, timeout, or any other transport/API failure. HTTP status, upstream error code, body hash, body preview, headers, and SDK exception class are diagnostics only; they must not become additional allow/deny criteria that bypass marker mismatch. -The sentinel must not maintain separate classifiers for "private content", "maintenance", "quota", "ads", or provider-specific body phrases as health gates. The only recovery condition is a later recovery probe that matches the marker. Freeze TTL expiry only schedules the next recovery probe; it does not restore an account by itself. Repeated non-marker results use exponential freeze backoff, and repeated marker-matching results use the configured success cadence backoff. This contract applies equally to OpenAI Responses `gpt-5.5` direct account probes and manual `codex-pool sentinel-probe --account ... --confirm` measurements. +The sentinel must not maintain separate classifiers for "private content", "maintenance", "quota", "ads", or provider-specific body phrases as health gates. The only recovery condition is a later recovery probe that matches the marker. Freeze TTL expiry only schedules the next recovery probe; it does not restore an account by itself. Repeated non-marker results use a short exponential freeze backoff because failed marker probes produce little or no useful output token usage; repeated marker-matching results use the configured success cadence backoff. This contract applies equally to OpenAI Responses `gpt-5.5` direct account probes and manual `codex-pool sentinel-probe --account ... --confirm` measurements. + +When `codex-pool sync --confirm` creates a YAML-managed account or changes direct-probe-relevant account inputs such as the profile mapping, upstream base URL, API key fingerprint, upstream User-Agent, or Responses WebSocket mode, only that account must be default-frozen before it can enter the scheduler. Sync first records a pending sentinel quality gate from the pre-mutation runtime state, then updates the account, then schedules the account probe immediately. This ordering prevents a new or changed account from being written to Sub2API without a matching sentinel quarantine record if sync fails midway. Passing the marker clears the quality gate and restores schedulability; any non-marker result continues the failure freeze backoff. Unchanged accounts must not have their existing success or failure backoff reset by unrelated YAML syncs. + +If the YAML failure freeze maximum is lowered, `codex-pool sync --confirm` may migrate only currently active sentinel quarantines whose stored interval or next recovery time exceeds the current maximum. The migration keeps the account frozen, marks the next recovery probe due immediately, and lets the next marker result decide restore versus the new shorter failure backoff. It must not clear quarantine or restore schedulability merely because an older TTL has expired. Operational observation for this sentinel should use the read-only `codex-pool sentinel-report` table or its `--raw` form. It is the canonical low-noise view for per-account probe count, marker result, HTTP/error diagnostics, freeze TTL, success cadence, next probe time, and recent CronJob runs; raw ConfigMap dumps and ad hoc log scraping are fallback diagnostics, not the primary state surface. diff --git a/scripts/platform-infra-sub2api-codex-sentinel-contract-test.ts b/scripts/platform-infra-sub2api-codex-sentinel-contract-test.ts index 7d116dd3..27e01ccd 100644 --- a/scripts/platform-infra-sub2api-codex-sentinel-contract-test.ts +++ b/scripts/platform-infra-sub2api-codex-sentinel-contract-test.ts @@ -3,7 +3,7 @@ import { rmSync, writeFileSync } from "node:fs"; import { tmpdir } from "node:os"; import { join } from "node:path"; import { rootPath } from "./src/config"; -import { codexPoolHelp, defaultCodexTempUnschedulablePolicy } from "./src/platform-infra-sub2api-codex"; +import { codexPoolHelp, codexPoolSentinelProbeConfigFingerprint, defaultCodexTempUnschedulablePolicy } from "./src/platform-infra-sub2api-codex"; import { codexPoolSentinelRuntimeImage, defaultCodexPoolSentinelConfig, @@ -18,6 +18,7 @@ function assertCondition(condition: unknown, message: string, detail: unknown = } const configPath = rootPath("config", "platform-infra", "sub2api-codex-pool.yaml"); +const codexPoolSourcePath = rootPath("scripts", "src", "platform-infra-sub2api-codex.ts"); const sentinelDockerfilePath = rootPath("src", "components", "platform-infra", "sub2api", "sentinel.Dockerfile"); const parsed = Bun.YAML.parse(readFileSync(configPath, "utf8")) as { sentinel?: unknown; @@ -25,6 +26,7 @@ const parsed = Bun.YAML.parse(readFileSync(configPath, "utf8")) as { }; const sentinel = readCodexPoolSentinelConfig(parsed.sentinel, defaultCodexPoolSentinelConfig(), configPath); const sentinelRuntimeImage = codexPoolSentinelRuntimeImage(sentinel); +const codexPoolSource = readFileSync(codexPoolSourcePath, "utf8"); const sentinelDockerfile = readFileSync(sentinelDockerfilePath, "utf8"); const yamlTempUnschedulableRules = parsed.pool?.defaultTempUnschedulable?.rules ?? []; @@ -44,10 +46,52 @@ assertCondition(!("concurrency" in sentinel.probe), "sentinel must not cap probe assertCondition(!("maxAccountsPerRun" in sentinel.probe), "sentinel must not cap accounts per run; all due accounts are eligible", sentinel.probe); assertCondition(sentinel.cadence.successInitialIntervalMinutes === 1, "success trust backoff must start at 1 minute", sentinel.cadence); assertCondition(sentinel.cadence.successMaxIntervalMinutes === 20, "success trust backoff must cap at 20 minutes", sentinel.cadence); -assertCondition(sentinel.freeze.initialTtlMinutes === 2, "freeze backoff must start at 2 minutes", sentinel.freeze); -assertCondition(sentinel.freeze.maxTtlMinutes === 120, "freeze backoff must cap at 2 hours", sentinel.freeze); +assertCondition(sentinel.freeze.initialTtlMinutes === 1, "failure freeze backoff must start at 1 minute", sentinel.freeze); +assertCondition(sentinel.freeze.maxTtlMinutes === 10, "failure freeze backoff must cap at 10 minutes", sentinel.freeze); assertCondition(!("budget" in sentinel), "sentinel must not use token budgets as a probe gate; usage is recorded only", sentinel); +const probeFingerprint = codexPoolSentinelProbeConfigFingerprint({ + accountName: "unidesk-codex-example", + profile: "example", + baseUrl: "https://example.invalid/v1/", + apiKeyFingerprint: "key-a", + upstreamUserAgent: null, + openaiResponsesWebSocketsV2Mode: null, +}); +assertCondition( + probeFingerprint === codexPoolSentinelProbeConfigFingerprint({ + accountName: "unidesk-codex-example", + profile: "example", + baseUrl: "https://example.invalid/v1", + apiKeyFingerprint: "key-a", + upstreamUserAgent: null, + openaiResponsesWebSocketsV2Mode: null, + }), + "sentinel probe fingerprint must normalize base URL trailing slash", +); +assertCondition( + probeFingerprint !== codexPoolSentinelProbeConfigFingerprint({ + accountName: "unidesk-codex-example", + profile: "example", + baseUrl: "https://example.invalid/v1", + apiKeyFingerprint: "key-b", + upstreamUserAgent: null, + openaiResponsesWebSocketsV2Mode: null, + }), + "sentinel probe fingerprint must change when API key fingerprint changes", +); +assertCondition( + probeFingerprint !== codexPoolSentinelProbeConfigFingerprint({ + accountName: "unidesk-codex-example", + profile: "example", + baseUrl: "https://example.invalid/v1", + apiKeyFingerprint: "key-a", + upstreamUserAgent: "CodexCLI/0.1", + openaiResponsesWebSocketsV2Mode: "ctx_pool", + }), + "sentinel probe fingerprint must change when direct probe request-shape inputs change", +); + const manifest = renderCodexPoolSentinelManifest(sentinel, [ { accountName: "unidesk-codex-example", @@ -102,6 +146,33 @@ assertCondition(runner.includes('"responseBodyPreview": item.get("responseBodyPr assertCondition(runner.includes("SENTINEL_ACCOUNT_NAMES"), "sentinel runner must support forced account probes for CLI manual measurement", runner); assertCondition(runner.includes('parsed.get("code") not in (None, 0)'), "sentinel admin client must treat Sub2API {code:0,message:success,data} envelopes as successful", runner); assertCondition(runner.includes("page_size=20&platform=openai&type=apikey&search="), "sentinel admin client must query one target account instead of fetching all accounts into the 64KiB admin response cap", runner); +assertCondition(runner.includes('account_state["qualityGate"] = {**quality_gate, "pending": False'), "sentinel runner must clear pending YAML quality gates after marker-matched recovery", runner); + +assertCondition(codexPoolSource.includes("sentinelProbeConfigFingerprint: codexPoolSentinelProbeConfigFingerprint({"), "sync payload must include per-account sentinel probe fingerprints", codexPoolSourcePath); +assertCondition(codexPoolSource.includes("quality_gate_required = sentinel_quality_gate_enabled() and len(change_reasons) > 0"), "sync must require quality gate only for new or direct-probe-changed accounts", codexPoolSourcePath); +assertCondition(codexPoolSource.includes('ensure_account_schedulable(token, data["id"], profile["accountName"], not quality_gate_required and not keep_frozen)'), "sync must default-freeze changed/new accounts before they enter scheduling", codexPoolSourcePath); +assertCondition(codexPoolSource.includes('"sentinelProbeRequired": quality_gate_required'), "sync result must report whether each account needs sentinel quality gate", codexPoolSourcePath); +assertCondition(codexPoolSource.includes('"reason": "yaml-account-change-pending-sentinel-probe"'), "sync state must record YAML-change quality-gate quarantine reason", codexPoolSourcePath); +assertCondition(codexPoolSource.includes('account_state["nextProbeAfter"] = now'), "sync state must schedule changed/new account probes immediately", codexPoolSourcePath); +assertCondition(codexPoolSource.includes("planned_account_results = planned_sentinel_account_results(profiles, existing_accounts)"), "sync must compute changed/new accounts from pre-mutation runtime state", codexPoolSourcePath); +assertCondition(codexPoolSource.includes("sentinel_quality_prepare = ensure_sentinel_state_for_sync(planned_account_results, True)"), "sync must prepare changed/new account quality gate state before mutating accounts", codexPoolSourcePath); +assertCondition(codexPoolSource.indexOf("sentinel_quality_prepare = ensure_sentinel_state_for_sync(planned_account_results, True)") < codexPoolSource.indexOf("account_results, pruned_account_results = ensure_accounts("), "sync must prepare quality gate before account reconcile", codexPoolSourcePath); +assertCondition(codexPoolSource.includes("sentinel_quality = ensure_sentinel_state_for_sync(account_results)"), "sync response must surface sentinel quality-gate reconciliation", codexPoolSourcePath); +const reassertFunction = codexPoolSource.slice( + codexPoolSource.indexOf("def reassert_sentinel_freezes_after_sync("), + codexPoolSource.indexOf("def list_user_keys("), +); +assertCondition(!reassertFunction.includes("until_epoch"), "sync freeze reassert must not restore active quarantines just because TTL has expired", codexPoolSourcePath); +assertCondition(codexPoolSource.includes("protected_frozen_names = active_sentinel_quarantine_names()"), "sync must read active sentinel quarantines before reconciling accounts", codexPoolSourcePath); +assertCondition(codexPoolSource.includes("sentinelFreezeProtected"), "sync output must reveal accounts kept frozen by pre-existing sentinel quarantine", codexPoolSourcePath); +assertCondition(codexPoolSource.includes("def clamp_sentinel_freezes_for_config("), "sync must migrate active freeze state when YAML freeze max is lowered", codexPoolSourcePath); +assertCondition(codexPoolSource.includes('reason = "freeze-backoff-clamped-to-current-config"'), "sync quality gate output must report freeze backoff clamping", codexPoolSourcePath); +const stateUpdateFunction = codexPoolSource.slice( + codexPoolSource.indexOf("def update_sentinel_state_configmap("), + codexPoolSource.indexOf("def ensure_sentinel_state_for_sync("), +); +assertCondition(!stateUpdateFunction.includes('"patch"'), "sentinel state updates must not pass large state.json through kubectl patch argv", codexPoolSourcePath); +assertCondition(stateUpdateFunction.includes('"-f", "-"'), "sentinel state updates must stream the ConfigMap manifest through stdin", codexPoolSourcePath); const disabledMonitor = { ...sentinel, @@ -149,7 +220,8 @@ console.log(JSON.stringify({ "marker match is the only health standard", "budget is record-only and does not gate probes", "success trust backoff is 1m to 20m", - "freeze backoff is 2m to 120m", + "failure freeze backoff is 1m to 10m", + "YAML changed/new accounts default-freeze until marker-matched sentinel recovery", "CronJob is k8s-native with Forbid concurrency and minimal RBAC", "monitor switch controls CronJob suspend state", "rendered Secret avoids plaintext upstream credentials", diff --git a/scripts/src/platform-infra-sub2api-codex-sentinel.ts b/scripts/src/platform-infra-sub2api-codex-sentinel.ts index 1bbaaf26..aca069e8 100644 --- a/scripts/src/platform-infra-sub2api-codex-sentinel.ts +++ b/scripts/src/platform-infra-sub2api-codex-sentinel.ts @@ -107,8 +107,8 @@ export function defaultCodexPoolSentinelConfig(): CodexPoolSentinelConfig { jitterPercent: 10, }, freeze: { - initialTtlMinutes: 2, - maxTtlMinutes: 120, + initialTtlMinutes: 1, + maxTtlMinutes: 10, backoffMultiplier: 2, jitterPercent: 10, }, @@ -1149,6 +1149,9 @@ def apply_result(result, state, config, now, admin): except Exception as exc: action = {"taken": False, "type": "restore-failed", "error": str(exc)} account_state["quarantine"] = {"active": False, "clearedAt": iso(now), "lastApplied": quarantine.get("applied") is True} + quality_gate = account_state.get("qualityGate") if isinstance(account_state.get("qualityGate"), dict) else None + if quality_gate and quality_gate.get("pending") is True: + account_state["qualityGate"] = {**quality_gate, "pending": False, "clearedAt": iso(now)} account_state["successStreak"] = 0 account_state["successIntervalMinutes"] = 0 interval = next_success_interval(account_state, config) diff --git a/scripts/src/platform-infra-sub2api-codex.ts b/scripts/src/platform-infra-sub2api-codex.ts index f12268de..c6d6865e 100644 --- a/scripts/src/platform-infra-sub2api-codex.ts +++ b/scripts/src/platform-infra-sub2api-codex.ts @@ -462,6 +462,14 @@ async function codexPoolSync(config: UniDeskConfig, options: SyncOptions): Promi apiKey: profile.apiKey, apiKeySource: profile.apiKeySource, apiKeyFingerprint: fingerprint(profile.apiKey ?? ""), + sentinelProbeConfigFingerprint: codexPoolSentinelProbeConfigFingerprint({ + accountName: profile.accountName, + profile: profile.profile, + baseUrl: profile.baseUrl, + apiKeyFingerprint: fingerprint(profile.apiKey ?? ""), + upstreamUserAgent: profile.upstreamUserAgent, + openaiResponsesWebSocketsV2Mode: profile.openaiResponsesWebSocketsV2Mode, + }), openaiResponsesWebSocketsV2Mode: profile.openaiResponsesWebSocketsV2Mode, upstreamUserAgent: profile.upstreamUserAgent, priority: profile.priority, @@ -1721,6 +1729,8 @@ function compactSentinelStatus(block: unknown): unknown { const configMap = isRecord(runtime.configMap) ? runtime.configMap : {}; const state = isRecord(runtime.state) ? runtime.state : {}; const freezeReassert = isRecord(block.freezeReassert) ? block.freezeReassert : {}; + const qualityGatePrepare = isRecord(block.qualityGatePrepare) ? block.qualityGatePrepare : {}; + const qualityGate = isRecord(block.qualityGate) ? block.qualityGate : {}; return { ok: block.ok, action: block.action, @@ -1769,6 +1779,25 @@ function compactSentinelStatus(block: unknown): unknown { itemCount: freezeReassert.itemCount, attentionItems: freezeReassert.attentionItems, } : undefined, + qualityGatePrepare: Object.keys(qualityGatePrepare).length > 0 ? { + ok: qualityGatePrepare.ok, + skipped: qualityGatePrepare.skipped, + reason: qualityGatePrepare.reason, + changedCount: qualityGatePrepare.changedCount, + fingerprintOnlyCount: qualityGatePrepare.fingerprintOnlyCount, + pendingOnly: qualityGatePrepare.pendingOnly, + items: qualityGatePrepare.items, + } : undefined, + qualityGate: Object.keys(qualityGate).length > 0 ? { + ok: qualityGate.ok, + skipped: qualityGate.skipped, + reason: qualityGate.reason, + changedCount: qualityGate.changedCount, + fingerprintOnlyCount: qualityGate.fingerprintOnlyCount, + clampedCount: qualityGate.clampedCount, + items: qualityGate.items, + clampedItems: qualityGate.clampedItems, + } : undefined, valuesPrinted: false, }; } @@ -2728,6 +2757,24 @@ function fingerprint(value: string): string { return createHash("sha256").update(value).digest("hex").slice(0, 12); } +export function codexPoolSentinelProbeConfigFingerprint(input: { + accountName: string; + profile: string; + baseUrl: string; + apiKeyFingerprint: string | null; + upstreamUserAgent: string | null; + openaiResponsesWebSocketsV2Mode: string | null; +}): string { + return fingerprint(JSON.stringify({ + accountName: input.accountName, + profile: input.profile, + baseUrl: normalizeBaseUrl(input.baseUrl) ?? input.baseUrl, + apiKeyFingerprint: input.apiKeyFingerprint, + upstreamUserAgent: input.upstreamUserAgent, + openaiResponsesWebSocketsV2Mode: input.openaiResponsesWebSocketsV2Mode, + })); +} + function syncScript(payload: unknown, pool: CodexPoolConfig): string { const encoded = Buffer.from(JSON.stringify(payload), "utf8").toString("base64"); return remotePythonScript("sync", encoded, pool); @@ -2750,7 +2797,7 @@ set -eu python3 - <<'PY' import json import subprocess -from datetime import datetime, timezone +from datetime import datetime, timezone, timedelta NAMESPACE = ${JSON.stringify(namespace)} STATE_NAME = ${JSON.stringify(stateName)} @@ -3122,12 +3169,13 @@ set -u python3 - <<'PY' import base64 import json +import re import secrets import string import subprocess import sys import time -from datetime import datetime +from datetime import datetime, timezone, timedelta from urllib.parse import quote NAMESPACE = "${namespace}" @@ -3237,6 +3285,60 @@ def parse_curl_output(proc): "transportExitCode": proc.returncode, } +def utc_iso(offset_seconds=0): + return (datetime.now(timezone.utc) + timedelta(seconds=offset_seconds)).strftime("%Y-%m-%dT%H:%M:%SZ") + +def normalize_runtime_base_url(value): + if not isinstance(value, str): + return None + value = value.strip().rstrip("/") + return value or None + +def empty_to_none(value): + return value if isinstance(value, str) and value else None + +def sentinel_quality_gate_enabled(): + return (SENTINEL_CONFIG.get("monitor") or {}).get("enabled") is True and (SENTINEL_CONFIG.get("actions") or {}).get("enabled") is True + +def account_notes_fingerprint(account): + notes = account.get("notes") if isinstance(account, dict) else None + if not isinstance(notes, str): + return None + match = re.search(r"fingerprint=([A-Za-z0-9_-]+)", notes) + return match.group(1) if match else None + +def runtime_account_credentials(account): + credentials = account.get("credentials") if isinstance(account, dict) and isinstance(account.get("credentials"), dict) else {} + return credentials + +def runtime_account_extra(account): + extra = account.get("extra") if isinstance(account, dict) and isinstance(account.get("extra"), dict) else {} + return extra + +def sentinel_probe_change_reasons(current, profile): + if not isinstance(current, dict) or current.get("id") is None: + return ["created"] + credentials = runtime_account_credentials(current) + extra = runtime_account_extra(current) + expected_base_url = normalize_runtime_base_url(profile.get("baseUrl")) + runtime_base_url = normalize_runtime_base_url(credentials.get("base_url")) + expected_user_agent = empty_to_none(profile.get("upstreamUserAgent")) + runtime_user_agent = empty_to_none(credentials.get("user_agent")) + expected_ws_mode = empty_to_none(profile.get("openaiResponsesWebSocketsV2Mode")) + runtime_ws_mode = empty_to_none(extra.get("openai_apikey_responses_websockets_v2_mode")) + reasons = [] + if empty_to_none(extra.get("unidesk_codex_profile")) != profile.get("profile"): + reasons.append("profile") + if runtime_base_url != expected_base_url: + reasons.append("base-url") + if account_notes_fingerprint(current) != profile.get("apiKeyFingerprint"): + reasons.append("api-key-fingerprint") + if runtime_user_agent != expected_user_agent: + reasons.append("upstream-user-agent") + if runtime_ws_mode != expected_ws_mode: + reasons.append("responses-websockets-v2-mode") + return reasons + def curl_api(method, path, bearer=None, payload=None): body = b"" if payload is None else json.dumps(payload, separators=(",", ":")).encode("utf-8") script = r''' @@ -3478,21 +3580,44 @@ def account_payload(profile, group_id): "confirm_mixed_channel_risk": True, } -def ensure_account_schedulable(token, account_id, account_name): +def ensure_account_schedulable(token, account_id, account_name, schedulable=True): data = ensure_success( - curl_api("POST", f"/api/v1/admin/accounts/{account_id}/schedulable", bearer=token, payload={"schedulable": True}), - f"set account {account_name} schedulable", + curl_api("POST", f"/api/v1/admin/accounts/{account_id}/schedulable", bearer=token, payload={"schedulable": bool(schedulable)}), + f"set account {account_name} schedulable={bool(schedulable)}", ) return data if isinstance(data, dict) else {} -def ensure_accounts(token, profiles, group_id, prune_removed=False): - existing_accounts = list_accounts(token) +def planned_sentinel_account_results(profiles, existing_accounts): + existing = {item.get("name"): item for item in existing_accounts if isinstance(item, dict)} + results = [] + for profile in profiles: + change_reasons = sentinel_probe_change_reasons(existing.get(profile["accountName"]), profile) + quality_gate_required = sentinel_quality_gate_enabled() and len(change_reasons) > 0 + results.append({ + "profile": profile["profile"], + "accountName": profile["accountName"], + "sentinelProbeConfigFingerprint": profile.get("sentinelProbeConfigFingerprint"), + "sentinelProbeRequired": quality_gate_required, + "sentinelChangeReasons": change_reasons if quality_gate_required else [], + "sentinelDefaultFrozen": quality_gate_required, + "valuesPrinted": False, + }) + return results + +def ensure_accounts(token, profiles, group_id, prune_removed=False, protected_frozen_names=None, existing_accounts=None): + if not isinstance(protected_frozen_names, set): + protected_frozen_names = set() + if not isinstance(existing_accounts, list): + existing_accounts = list_accounts(token) existing = {item.get("name"): item for item in existing_accounts} desired_names = {profile["accountName"] for profile in profiles} results = [] for profile in profiles: payload = account_payload(profile, group_id) current = existing.get(profile["accountName"]) + change_reasons = sentinel_probe_change_reasons(current, profile) + quality_gate_required = sentinel_quality_gate_enabled() and len(change_reasons) > 0 + keep_frozen = profile["accountName"] in protected_frozen_names if current and current.get("id") is not None: account_id = current["id"] update_payload = dict(payload) @@ -3504,7 +3629,7 @@ def ensure_accounts(token, profiles, group_id, prune_removed=False): data = ensure_success(curl_api("POST", "/api/v1/admin/accounts", bearer=token, payload=payload), f"create account {profile['accountName']}") action = "created" if isinstance(data, dict) and data.get("id") is not None: - schedulable_data = ensure_account_schedulable(token, data["id"], profile["accountName"]) + schedulable_data = ensure_account_schedulable(token, data["id"], profile["accountName"], not quality_gate_required and not keep_frozen) if schedulable_data: data = schedulable_data results.append({ @@ -3515,6 +3640,11 @@ def ensure_accounts(token, profiles, group_id, prune_removed=False): "baseUrl": profile["baseUrl"], "apiKeySource": profile["apiKeySource"], "apiKeyFingerprint": profile["apiKeyFingerprint"], + "sentinelProbeConfigFingerprint": profile.get("sentinelProbeConfigFingerprint"), + "sentinelProbeRequired": quality_gate_required, + "sentinelChangeReasons": change_reasons if quality_gate_required else [], + "sentinelDefaultFrozen": quality_gate_required, + "sentinelFreezeProtected": keep_frozen, "openaiResponsesWebSocketsV2Mode": profile.get("openaiResponsesWebSocketsV2Mode"), "priority": int(profile.get("priority", POOL_DEFAULT_ACCOUNT_PRIORITY) or POOL_DEFAULT_ACCOUNT_PRIORITY), "capacity": int(profile.get("capacity", 5) or 5), @@ -3745,20 +3875,181 @@ def parse_epoch_z(value): def sentinel_state_object(): state_name = SENTINEL_CONFIG.get("stateConfigMapName") if not state_name: - return None + return None, None obj, err = safe_kube_json(["-n", NAMESPACE, "get", "configmap", state_name], f"configmap/{state_name}") if not isinstance(obj, dict): - return None + return None, None raw_state = (obj.get("data") or {}).get("state.json") if not isinstance(raw_state, str) or not raw_state: - return None + return obj, None try: - return json.loads(raw_state) + return obj, json.loads(raw_state) except Exception: - return None + return obj, None + +def active_sentinel_quarantine_names(): + _, state = sentinel_state_object() + if not isinstance(state, dict): + return set() + accounts_state = state.get("accounts") if isinstance(state.get("accounts"), dict) else {} + names = set() + for name, account_state in accounts_state.items(): + if not isinstance(name, str) or not isinstance(account_state, dict): + continue + quarantine = account_state.get("quarantine") + if isinstance(quarantine, dict) and quarantine.get("active") is True and quarantine.get("applied") is True: + names.add(name) + return names + +def default_sentinel_state(): + return {"version": 1, "accounts": {}, "ledger": {}, "history": []} + +def clamp_sentinel_freezes_for_config(state, now): + freeze_config = SENTINEL_CONFIG.get("freeze") if isinstance(SENTINEL_CONFIG.get("freeze"), dict) else {} + try: + max_interval = int(freeze_config.get("maxTtlMinutes") or 10) + except Exception: + max_interval = 10 + accounts_state = state.get("accounts") if isinstance(state.get("accounts"), dict) else {} + now_epoch = time.time() + items = [] + for name, account_state in accounts_state.items(): + if not isinstance(name, str) or not isinstance(account_state, dict): + continue + quarantine = account_state.get("quarantine") + if not isinstance(quarantine, dict) or quarantine.get("active") is not True or quarantine.get("applied") is not True: + continue + try: + interval = int(quarantine.get("intervalMinutes") or 0) + except Exception: + interval = 0 + until_epoch = parse_epoch_z(quarantine.get("until")) + old_until = quarantine.get("until") + if interval <= max_interval and (until_epoch is None or until_epoch <= now_epoch + max_interval * 60): + continue + quarantine["previousIntervalMinutes"] = interval + quarantine["intervalMinutes"] = max_interval + quarantine["until"] = now + quarantine["clampedAt"] = now + quarantine["clampedBy"] = "sync-freeze-max-ttl" + account_state["nextProbeAfter"] = now + items.append({ + "accountName": name, + "previousIntervalMinutes": interval, + "maxIntervalMinutes": max_interval, + "previousUntil": old_until, + "nextProbeAfter": now, + }) + return items + +def update_sentinel_state_configmap(obj, state): + state_name = SENTINEL_CONFIG.get("stateConfigMapName") + if not state_name: + return {"ok": False, "reason": "state-configmap-missing"} + state_json = json.dumps(state, ensure_ascii=False, indent=2) + manifest = { + "apiVersion": "v1", + "kind": "ConfigMap", + "metadata": { + "name": state_name, + "namespace": NAMESPACE, + "labels": { + "app.kubernetes.io/name": SERVICE_NAME, + "app.kubernetes.io/component": "account-sentinel", + "app.kubernetes.io/managed-by": "unidesk-platform-infra", + }, + }, + "data": {"state.json": state_json}, + } + proc = kubectl(["apply", "--server-side", "--force-conflicts", f"--field-manager={FIELD_MANAGER}", "-f", "-"], json.dumps(manifest)) + action = "applied" if isinstance(obj, dict) else "created" + if proc.returncode != 0: + return {"ok": False, "reason": f"{action}-failed", "stderrTail": text(proc.stderr, 2000)} + return {"ok": True, "action": action} + +def ensure_sentinel_state_for_sync(account_results, pending_only=False): + if not sentinel_quality_gate_enabled(): + return {"ok": True, "skipped": True, "reason": "sentinel-quality-gate-disabled", "changedCount": 0, "items": [], "valuesPrinted": False} + state_obj, state = sentinel_state_object() + if not isinstance(state, dict): + state = default_sentinel_state() + state.setdefault("version", 1) + accounts_state = state.setdefault("accounts", {}) + if not isinstance(accounts_state, dict): + accounts_state = {} + state["accounts"] = accounts_state + now = utc_iso() + pending_until = utc_iso(3600) + items = [] + clamped_items = [] if pending_only else clamp_sentinel_freezes_for_config(state, now) + changed_count = 0 + fingerprint_only_count = 0 + for item in account_results: + name = item.get("accountName") + if not isinstance(name, str) or not name: + continue + account_state = accounts_state.setdefault(name, {}) + if not isinstance(account_state, dict): + account_state = {} + accounts_state[name] = account_state + fingerprint_value = item.get("sentinelProbeConfigFingerprint") + if isinstance(fingerprint_value, str) and fingerprint_value: + account_state["probeConfigFingerprint"] = fingerprint_value + if item.get("sentinelProbeRequired") is not True: + fingerprint_only_count += 1 + continue + changed_count += 1 + reasons = item.get("sentinelChangeReasons") if isinstance(item.get("sentinelChangeReasons"), list) else [] + account_state["quarantine"] = { + "active": True, + "applied": True, + "until": pending_until if pending_only else now, + "intervalMinutes": 0, + "reason": "yaml-account-change-pending-sentinel-probe", + "failureKind": "pending-quality-gate", + "changeReasons": reasons, + "startedAt": now, + "lastBadAt": now, + } + account_state["nextProbeAfter"] = pending_until if pending_only else now + account_state["successStreak"] = 0 + account_state["successIntervalMinutes"] = 0 + account_state["lastStatus"] = "pending-sentinel-quality-gate" + account_state["qualityGate"] = { + "pending": True, + "reason": "yaml-account-change", + "changeReasons": reasons, + "markedAt": now, + "pendingOnly": pending_only, + } + items.append({"accountName": name, "changeReasons": reasons, "nextProbeAfter": pending_until if pending_only else now, "defaultFrozen": True, "pendingOnly": pending_only}) + if changed_count <= 0 and len(clamped_items) <= 0: + return {"ok": True, "skipped": False, "reason": "no-new-or-changed-accounts", "changedCount": 0, "fingerprintOnlyCount": fingerprint_only_count, "clampedCount": 0, "items": [], "valuesPrinted": False} + update = update_sentinel_state_configmap(state_obj, state) + if pending_only and changed_count > 0: + reason = "new-or-changed-accounts-pending-quality-gate-prepared" + elif changed_count > 0 and len(clamped_items) > 0: + reason = "new-or-changed-accounts-default-frozen-and-freeze-backoff-clamped" + elif changed_count > 0: + reason = "new-or-changed-accounts-default-frozen" + else: + reason = "freeze-backoff-clamped-to-current-config" + return { + "ok": update.get("ok") is True, + "skipped": False, + "reason": reason, + "changedCount": changed_count, + "fingerprintOnlyCount": fingerprint_only_count, + "clampedCount": len(clamped_items), + "pendingOnly": pending_only, + "items": items, + "clampedItems": clamped_items, + "update": update, + "valuesPrinted": False, + } def sentinel_state_summary(): - state = sentinel_state_object() + _, state = sentinel_state_object() if not isinstance(state, dict): return {"exists": False, "valuesPrinted": False} accounts = state.get("accounts") if isinstance(state.get("accounts"), dict) else {} @@ -3813,13 +4104,12 @@ def sentinel_state_summary(): def reassert_sentinel_freezes_after_sync(token): if (SENTINEL_CONFIG.get("actions") or {}).get("enabled") is not True: return {"ok": True, "skipped": True, "reason": "actions-disabled", "items": [], "valuesPrinted": False} - state = sentinel_state_object() + _, state = sentinel_state_object() if not isinstance(state, dict): return {"ok": True, "skipped": True, "reason": "state-missing", "items": [], "valuesPrinted": False} accounts_state = state.get("accounts") if isinstance(state.get("accounts"), dict) else {} accounts = list_accounts(token) by_name = {item.get("name"): item for item in accounts if isinstance(item.get("name"), str)} - now_epoch = time.time() items = [] for name, account_state in accounts_state.items(): if not isinstance(account_state, dict): @@ -3827,9 +4117,6 @@ def reassert_sentinel_freezes_after_sync(token): quarantine = account_state.get("quarantine") if not isinstance(quarantine, dict) or quarantine.get("active") is not True or quarantine.get("applied") is not True: continue - until_epoch = parse_epoch_z(quarantine.get("until")) - if until_epoch is not None and until_epoch <= now_epoch: - continue account = by_name.get(name) if not account or account.get("id") is None: items.append({"accountName": name, "ok": False, "reason": "account-not-found"}) @@ -4949,7 +5236,13 @@ def run_sync(): group_id = group.get("id") if isinstance(group, dict) else None if group_id is None: raise RuntimeError("pool group id missing after ensure") - account_results, pruned_account_results = ensure_accounts(token, profiles, group_id, prune_removed) + existing_accounts = list_accounts(token) + planned_account_results = planned_sentinel_account_results(profiles, existing_accounts) + sentinel_quality_prepare = ensure_sentinel_state_for_sync(planned_account_results, True) + if sentinel_quality_prepare.get("ok") is not True: + raise RuntimeError("prepare sentinel quality gate failed: " + json.dumps(sentinel_quality_prepare, ensure_ascii=False)) + protected_frozen_names = active_sentinel_quarantine_names() + account_results, pruned_account_results = ensure_accounts(token, profiles, group_id, prune_removed, protected_frozen_names, existing_accounts) capacity_status = account_capacity_status(token) load_factor_status = account_load_factor_status(token) ws_v2_status = account_ws_v2_status(token) @@ -4964,9 +5257,10 @@ def run_sync(): responses_evidence = recent_responses_gateway_evidence() runtime_capabilities = validate_runtime_capabilities(token) sentinel = apply_sentinel_manifest(sentinel_payload.get("manifest")) + sentinel_quality = ensure_sentinel_state_for_sync(account_results) sentinel_reassert = reassert_sentinel_freezes_after_sync(token) return { - "ok": gateway["ok"] is True and responses_smoke["ok"] is True and owner_concurrency["ok"] is True and capacity_status["ok"] is True and load_factor_status["ok"] is True and ws_v2_status["ok"] is True and temp_unschedulable_status["ok"] is True and sentinel.get("ok") is True and sentinel_reassert.get("ok") is True, + "ok": gateway["ok"] is True and responses_smoke["ok"] is True and owner_concurrency["ok"] is True and capacity_status["ok"] is True and load_factor_status["ok"] is True and ws_v2_status["ok"] is True and temp_unschedulable_status["ok"] is True and sentinel.get("ok") is True and sentinel_quality_prepare.get("ok") is True and sentinel_quality.get("ok") is True and sentinel_reassert.get("ok") is True, "degraded": bool(responses_smoke.get("degraded")) or bool(compact_evidence.get("degraded")) or bool(responses_evidence.get("degraded")) or runtime_capabilities.get("ok") is not True, "mode": "sync", "namespace": NAMESPACE, @@ -4982,7 +5276,7 @@ def run_sync(): "pruneMode": "explicit" if prune_removed else "disabled-by-default", "items": account_results, "prunedItems": pruned_account_results, - "processControl": {"schedulableRestore": "POST /api/v1/admin/accounts/:id/schedulable", "durableConfig": False}, + "processControl": {"schedulableRestore": "POST /api/v1/admin/accounts/:id/schedulable for non-quarantined accounts only", "durableConfig": False}, "valuesPrinted": False, }, "capacity": capacity_status, @@ -5003,7 +5297,7 @@ def run_sync(): }, "ownerBalance": owner_balance, "ownerConcurrency": owner_concurrency, - "sentinel": {**sentinel, "freezeReassert": sentinel_reassert}, + "sentinel": {**sentinel, "qualityGatePrepare": sentinel_quality_prepare, "qualityGate": sentinel_quality, "freezeReassert": sentinel_reassert}, "runtimeCapabilities": runtime_capabilities, "validation": {"gatewayModels": gateway, "gatewayResponses": responses_smoke, "gatewayResponsesRecent": responses_evidence, "gatewayCompactRecent": compact_evidence}, }