feat: add sub2api account sentinel

2026-06-11 05:02:27 +00:00
parent b151fbbee8
commit 01f0120d25
6 changed files with 1515 additions and 2 deletions
@@ -69,9 +69,17 @@ bun scripts/cli.ts platform-infra sub2api codex-pool cleanup-probes --confirm
 - `profiles.entries[].tempUnschedulable`: 可选 per-account 临时下线规则覆盖；字段语义以 `docs/reference/platform-infra.md` 为权威。上游 Sub2API 不支持的成功体分类、调度策略或账号冷却行为不要在这里声明。
 - `profiles.entries[].openaiResponsesWebSocketsV2Mode`: 需要 Responses WebSocket v2 的上游才设置，值为 `off`、`ctx_pool` 或 `passthrough`。
 - `profiles.entries[].upstreamUserAgent`: 少数要求 Codex CLI User-Agent 的上游才设置，不能含换行。
+- `sentinel.monitor.enabled`: 账号级 HTTP 200 成功体哨兵监控开关；开启后 `codex-pool sync --confirm` 会在 `platform-infra` 创建/更新 k8s CronJob、ConfigMap、Secret、ServiceAccount、Role 和 RoleBinding。CronJob 直打 YAML-managed 上游账号的 OpenAI Responses `gpt-5.5`，用确定 marker 判定是否出现维护/故障/广告等非预期成功体，并在独立 state ConfigMap 中记录 token/cost 账本。
+- `sentinel.actions.enabled`: 账号级哨兵冻结/恢复动作开关；默认必须保持 `false`，先监控一段时间，确认 marker 判定正确率、token 消耗和误报率后再改为 `true`。动作关闭时只记录 `would-freeze`，不会调用 Sub2API admin API 改 `schedulable`。
+- `sentinel.probe.maxOutputTokens`: 哨兵 OpenAI Responses 请求的硬输出上限，必须保持小值；不要只靠 prompt 要求模型少输出。哨兵不限制并发和每轮账号数，所有到期账号会在同一轮并发探测。
+- `sentinel.cadence`: 成功信任指数退避配置。当前口径是从 1 分钟开始，连续成功后退避到最大 20 分钟；任意 marker mismatch 清零成功信任并进入冻结退避。
+- `sentinel.freeze`: 冻结 TTL 指数退避配置。当前口径是初始 10 分钟，失败后 `10m -> 20m -> 40m -> 80m -> 120m`，最大 2 小时；冻结到期后只做恢复 probe，通过才自动恢复，不能仅靠 TTL 到期解封。
+- `sentinel.pricing`: 直打上游时哨兵自己的 token/cost 估算价格。因为 direct upstream probe 不经过 Sub2API 普通用量账本，哨兵必须自己记录全局与 per-account token/cost；这些账本只用于观察，不作为跳过探测的预算门禁。

 `sync --confirm` 会登录 Sub2API admin、创建/更新 group、创建/更新 YAML 中的 `unidesk-codex-*` accounts、创建/复用统一 API key Secret，并把 managed account 的 `schedulable=true` 恢复为过程控制基线；它默认不删除 YAML 中缺席的 managed account。只有明确退役上游时才使用 `sync --confirm --prune-removed` 删除缺席且 `extra.unidesk_managed=true` 的 `unidesk-codex-*` account。

+`sync --confirm` 同时会按 YAML 渲染账号级哨兵资源。哨兵默认使用 `sentinel.monitor.enabled=true` + `sentinel.actions.enabled=false` 的观察模式；如果后续要开启自动冻结/恢复，只改 `sentinel.actions.enabled=true` 后重新 `codex-pool sync --confirm`，不要手工 patch CronJob、Secret 或 Sub2API account。打开动作前必须先用 `codex-pool validate --full` 或 k8s state 证据确认近期 `lastRun` 的 marker mismatch 与实际上游行为一致，且 token/cost 账本可解释、成本可接受。
+
 `sync --confirm` 和 `validate` 可能超过单次 SSH/runtime 短连接窗口。必须继续使用 `bun scripts/cli.ts platform-infra sub2api codex-pool ...`，由 CLI 在 G14 远端提交作业并短轮询状态；不要改用裸 `trans G14:k3s script` 等一个长连接等待完整结果。若看到 `UNIDESK_SSH_RUNTIME_TIMEOUT`，先按 `docs/reference/platform-infra.md` 的规则处理为控制面可见性问题，修 CLI/job/poll 或重跑受控命令，不要手工 patch Sub2API credentials 或源码。

 不要给 UniDesk-managed Codex accounts 开 Sub2API `pool_mode`。UniDesk 期望的 failover 是把失败账号临时标记为 unschedulable，让同组其他账号接手；`pool_mode` 会重试同一个 account path。
@@ -144,3 +144,41 @@ localCodex:
  supportsWebSockets: false
  responsesWebSocketsV2: false
  responsesSmokeModel: gpt-5.5
+sentinel:
+  monitor:
+    enabled: true
+  actions:
+    enabled: false
+    freezeOnMarkerMismatch: true
+    freezeOnTransportError: false
+  schedule: "*/1 * * * *"
+  image: python:3.12-alpine
+  serviceAccountName: sub2api-account-sentinel
+  configMapName: sub2api-account-sentinel-config
+  credentialsSecretName: sub2api-account-sentinel-profiles
+  stateConfigMapName: sub2api-account-sentinel-state
+  cronJobName: sub2api-account-sentinel
+  model: gpt-5.5
+  endpoint: responses
+  marker:
+    prefix: UDSG_OK
+    exact: true
+  probe:
+    timeoutSeconds: 30
+    maxResponseBytes: 8192
+    maxOutputTokens: 12
+    transportRetryMinutes: 5
+  cadence:
+    successInitialIntervalMinutes: 1
+    successMaxIntervalMinutes: 20
+    successBackoffMultiplier: 2
+    jitterPercent: 10
+  freeze:
+    initialTtlMinutes: 10
+    maxTtlMinutes: 120
+    backoffMultiplier: 2
+    jitterPercent: 10
+  pricing:
+    usdPer1MInputTokens: 1.25
+    usdPer1MOutputTokens: 10
+  historyLimit: 200
@@ -0,0 +1,107 @@
+import { readFileSync } from "node:fs";
+import { rmSync, writeFileSync } from "node:fs";
+import { tmpdir } from "node:os";
+import { join } from "node:path";
+import { rootPath } from "./src/config";
+import {
+  defaultCodexPoolSentinelConfig,
+  readCodexPoolSentinelConfig,
+  renderCodexPoolSentinelManifest,
+  sentinelRunnerPython,
+} from "./src/platform-infra-sub2api-codex-sentinel";
+
+function assertCondition(condition: unknown, message: string, detail: unknown = {}): void {
+  if (!condition) throw new Error(`${message}: ${JSON.stringify(detail)}`);
+}
+
+const configPath = rootPath("config", "platform-infra", "sub2api-codex-pool.yaml");
+const parsed = Bun.YAML.parse(readFileSync(configPath, "utf8")) as { sentinel?: unknown };
+const sentinel = readCodexPoolSentinelConfig(parsed.sentinel, defaultCodexPoolSentinelConfig(), configPath);
+
+assertCondition(sentinel.monitor.enabled === true, "sentinel monitor must be enabled for observation-first rollout", sentinel);
+assertCondition(sentinel.actions.enabled === false, "sentinel actions must default off until monitoring quality is reviewed", sentinel);
+assertCondition(sentinel.actions.freezeOnMarkerMismatch === true, "marker mismatch must be configured as freeze-worthy when actions are enabled", sentinel.actions);
+assertCondition(sentinel.actions.freezeOnTransportError === false, "transport errors must not freeze by default", sentinel.actions);
+assertCondition(sentinel.endpoint === "responses", "v1 sentinel must target OpenAI Responses only", sentinel);
+assertCondition(sentinel.model === "gpt-5.5", "v1 sentinel must use GPT-5.5", sentinel);
+assertCondition(sentinel.probe.maxOutputTokens > 0 && sentinel.probe.maxOutputTokens <= 16, "sentinel maxOutputTokens must be tightly capped", sentinel.probe);
+assertCondition(!("concurrency" in sentinel.probe), "sentinel must not cap probe concurrency; all due accounts are probed concurrently", sentinel.probe);
+assertCondition(!("maxAccountsPerRun" in sentinel.probe), "sentinel must not cap accounts per run; all due accounts are eligible", sentinel.probe);
+assertCondition(sentinel.cadence.successInitialIntervalMinutes === 1, "success trust backoff must start at 1 minute", sentinel.cadence);
+assertCondition(sentinel.cadence.successMaxIntervalMinutes === 20, "success trust backoff must cap at 20 minutes", sentinel.cadence);
+assertCondition(sentinel.freeze.initialTtlMinutes === 10, "freeze backoff must start at 10 minutes", sentinel.freeze);
+assertCondition(sentinel.freeze.maxTtlMinutes === 120, "freeze backoff must cap at 2 hours", sentinel.freeze);
+assertCondition(!("budget" in sentinel), "sentinel must not use token budgets as a probe gate; usage is recorded only", sentinel);
+
+const manifest = renderCodexPoolSentinelManifest(sentinel, [
+  {
+    accountName: "unidesk-codex-example",
+    profile: "example",
+    baseUrl: "https://example.invalid/v1",
+    apiKey: "sk-test-secret",
+    upstreamUserAgent: null,
+  },
+], {
+  namespace: "platform-infra",
+  serviceName: "sub2api",
+  serviceDns: "sub2api.platform-infra.svc.cluster.local:8080",
+  appSecretName: "sub2api-secrets",
+});
+
+assertCondition(manifest.includes("kind: CronJob"), "sentinel manifest must render a CronJob", manifest.slice(0, 1000));
+assertCondition(manifest.includes("concurrencyPolicy: Forbid"), "sentinel CronJob must forbid overlapping runs", manifest);
+assertCondition(manifest.includes("suspend: false"), "monitor.enabled=true must unsuspend the CronJob", manifest);
+assertCondition(manifest.includes("kind: ServiceAccount") && manifest.includes("kind: Role") && manifest.includes("kind: RoleBinding"), "sentinel manifest must include minimal RBAC", manifest);
+assertCondition(manifest.includes("sub2api-account-sentinel-state"), "sentinel manifest must reference the state ConfigMap", manifest);
+assertCondition(manifest.includes("\"enabled\": false"), "sentinel manifest must preserve actions.enabled=false in config.json", manifest);
+assertCondition(!manifest.includes("sk-test-secret"), "sentinel manifest must not expose upstream credentials as plaintext", manifest);
+assertCondition(manifest.includes("profiles.json:"), "sentinel credentials Secret must include the profiles payload as Secret data", manifest);
+assertCondition(manifest.includes("\"budgetMode\": \"record-only\""), "sentinel runner must expose record-only budget/accounting mode", manifest);
+assertCondition(manifest.includes("max_workers=max(1, len(due))"), "sentinel runner must probe all due accounts concurrently", manifest);
+
+const disabledMonitor = {
+  ...sentinel,
+  monitor: { enabled: false },
+  actions: { ...sentinel.actions, enabled: false },
+};
+const suspendedManifest = renderCodexPoolSentinelManifest(disabledMonitor, [], {
+  namespace: "platform-infra",
+  serviceName: "sub2api",
+  serviceDns: "sub2api.platform-infra.svc.cluster.local:8080",
+  appSecretName: "sub2api-secrets",
+});
+assertCondition(suspendedManifest.includes("suspend: true"), "monitor.enabled=false must suspend the CronJob", suspendedManifest);
+
+const pythonPath = join(tmpdir(), `sub2api-account-sentinel-${process.pid}.py`);
+writeFileSync(pythonPath, sentinelRunnerPython(), "utf8");
+try {
+  const pyCompile = Bun.spawnSync(["python3", "-m", "py_compile", pythonPath], {
+    stdout: "pipe",
+    stderr: "pipe",
+  });
+  assertCondition(pyCompile.exitCode === 0, "sentinel runner python must compile", {
+    exitCode: pyCompile.exitCode,
+    stdout: pyCompile.stdout.toString(),
+    stderr: pyCompile.stderr.toString(),
+  });
+} finally {
+  rmSync(pythonPath, { force: true });
+}
+
+console.log(JSON.stringify({
+  ok: true,
+  checks: [
+    "sentinel has independent monitor/actions YAML switches",
+    "observation-first rollout keeps actions disabled",
+    "v1 scope is OpenAI Responses + GPT-5.5",
+    "probe max_output_tokens is tightly capped",
+    "probe concurrency is not artificially capped",
+    "budget is record-only and does not gate probes",
+    "success trust backoff is 1m to 20m",
+    "freeze backoff is 10m to 120m",
+    "CronJob is k8s-native with Forbid concurrency and minimal RBAC",
+    "monitor switch controls CronJob suspend state",
+    "rendered Secret avoids plaintext upstream credentials",
+    "embedded Python runner compiles",
+  ],
+}));
@@ -537,6 +537,7 @@ export async function runChecks(config: UniDeskConfig, options: CheckOptions = d
    items.push(await commandItem("playwright:cli-wrapper-contract", ["bun", "scripts/playwright-cli-contract-test.ts"], 30_000, process.env, options.checkHeartbeatMs));
    items.push(await commandItem("platform-infra:sub2api-codex-local-config-contract", ["bun", "scripts/platform-infra-sub2api-codex-local-config-contract-test.ts"], 30_000, process.env, options.checkHeartbeatMs));
    items.push(await commandItem("platform-infra:sub2api-codex-routing-contract", ["bun", "scripts/platform-infra-sub2api-codex-routing-contract-test.ts"], 30_000, process.env, options.checkHeartbeatMs));
+    items.push(await commandItem("platform-infra:sub2api-codex-sentinel-contract", ["bun", "scripts/platform-infra-sub2api-codex-sentinel-contract-test.ts"], 30_000, process.env, options.checkHeartbeatMs));
    items.push(await commandItem("platform-infra:sub2api-codex-temp-unsched-contract", ["bun", "scripts/platform-infra-sub2api-codex-temp-unsched-contract-test.ts"], 30_000, process.env, options.checkHeartbeatMs));
    items.push(await commandItem("platform-infra:sub2api-http-upstream-contract", ["bun", "scripts/platform-infra-sub2api-http-upstream-contract-test.ts"], 30_000, process.env, options.checkHeartbeatMs));
    items.push(await commandItem("auth-broker:p0-contract", ["bun", "scripts/auth-broker-contract-test.ts"], 30_000, process.env, options.checkHeartbeatMs));
@@ -579,6 +580,7 @@ export async function runChecks(config: UniDeskConfig, options: CheckOptions = d
    items.push(skippedItem("server:cleanup-plan-contract", "Server cleanup dry-run contract is opt-in with script checks", "--scripts-typecheck or --full"));
    items.push(skippedItem("check:gh-contract-scope-contract", "Check option scope contract is opt-in with script checks", "--scripts-typecheck or --full"));
    items.push(skippedItem("playwright:cli-wrapper-contract", "Playwright wrapper/headless/session contract is opt-in with script checks", "--scripts-typecheck or --full"));
+    items.push(skippedItem("platform-infra:sub2api-codex-sentinel-contract", "Sub2API Codex account sentinel contract is opt-in with script checks", "--scripts-typecheck or --full"));
    items.push(skippedItem("auth-broker:p0-contract", "Auth Broker P0 skeleton and CLI adapter contract is opt-in with script checks", "--scripts-typecheck or --full"));
    items.push(skippedItem("d601:recovery-guardrails-contract", "D601 recovery guardrails fixture contract is opt-in with script checks", "--scripts-typecheck or --full"));
    items.push(skippedItem("hwlab:cd-wrapper-contract", "HWLAB DEV CD wrapper contract is opt-in with script checks", "--scripts-typecheck or --full"));
@@ -4,6 +4,14 @@ import { homedir } from "node:os";
 import { join } from "node:path";
 import type { UniDeskConfig } from "./config";
 import { rootPath } from "./config";
+import {
+  codexPoolSentinelSummary,
+  defaultCodexPoolSentinelConfig,
+  readCodexPoolSentinelConfig,
+  renderCodexPoolSentinelManifest,
+  type CodexPoolSentinelConfig,
+  type CodexPoolSentinelProfileSecret,
+} from "./platform-infra-sub2api-codex-sentinel";
 import { runSshCommandCapture, type SshCaptureResult } from "./ssh";

 const g14K3sRoute = "G14:k3s";
@@ -89,6 +97,7 @@ interface CodexPoolConfig {
  profiles: CodexPoolProfileConfig[];
  publicExposure: CodexPoolPublicExposureConfig;
  localCodex: CodexPoolLocalCodexConfig;
+  sentinel: CodexPoolSentinelConfig;
 }

 interface CodexPoolProfileConfig {
@@ -174,6 +183,8 @@ export function codexPoolHelp(): unknown {
      poolApiKeySecretKey: pool.apiKeySecretKey,
      publicBaseUrl: pool.publicExposure.enabled ? pool.publicExposure.publicBaseUrl : null,
      masterBaseUrl: pool.publicExposure.enabled ? pool.publicExposure.masterBaseUrl : null,
+      sentinelMonitorEnabled: pool.sentinel.monitor.enabled,
+      sentinelActionsEnabled: pool.sentinel.actions.enabled,
      secretValuesPrinted: false,
    },
  };
@@ -243,6 +254,9 @@ function codexPoolPlan(options: DisclosureOptions = { full: false, raw: false })
      accountType: "openai/apikey",
      grouping: `All discovered Codex profiles are bound to one Sub2API group named ${pool.groupName}.`,
      unifiedApiKey: `The client-facing API_KEY is controlled by k3s Secret ${namespace}/${pool.apiKeySecretName}.${pool.apiKeySecretKey}.`,
+      sentinel: pool.sentinel.monitor.enabled
+        ? `Account sentinel is enabled as k8s CronJob ${namespace}/${pool.sentinel.cronJobName}; actions.enabled=${pool.sentinel.actions.enabled}.`
+        : "Account sentinel monitoring is disabled by YAML.",
      publicExposure: pool.publicExposure.enabled
        ? `Default Codex consumers use ${codexConsumerBaseUrl(pool)}; bounded master-local probes may use ${pool.publicExposure.masterBaseUrl}. FRP proxy ${pool.publicExposure.proxyName} maps public ${pool.publicExposure.publicBaseUrl} to ${pool.publicExposure.localIP}:${pool.publicExposure.localPort}.`
        : "Public FRP exposure is disabled by YAML.",
@@ -272,6 +286,15 @@ async function codexPoolSync(config: UniDeskConfig, options: SyncOptions): Promi

  const payload = {
    pruneRemoved: options.pruneRemoved,
+    sentinel: {
+      manifest: renderCodexPoolSentinelManifest(pool.sentinel, sentinelProfileSecrets(profiles), {
+        namespace,
+        serviceName,
+        serviceDns,
+        appSecretName,
+      }),
+      summary: codexPoolSentinelSummary(pool.sentinel),
+    },
    pool: {
      groupName: pool.groupName,
      apiKeyName: pool.apiKeyName,
@@ -656,6 +679,7 @@ function readCodexPoolConfig(): CodexPoolConfig {
    profiles,
    publicExposure: readPublicExposureConfig(parsed.publicExposure, defaults.publicExposure),
    localCodex: readLocalCodexConfig(parsed.localCodex, defaults.localCodex),
+    sentinel: readCodexPoolSentinelConfig(parsed.sentinel, defaults.sentinel, codexPoolConfigPath),
  };
  validateKubernetesName(config.groupName, "pool.groupName", false);
  validateKubernetesName(config.apiKeySecretName, "pool.apiKeySecretName", true);
@@ -717,6 +741,7 @@ function defaultCodexPoolConfig(): CodexPoolConfig {
      responsesWebSocketsV2: true,
      responsesSmokeModel: "gpt-5.5",
    },
+    sentinel: defaultCodexPoolSentinelConfig(),
  };
 }

@@ -1190,6 +1215,7 @@ function codexPoolConfigSummary(pool: CodexPoolConfig): Record<string, unknown>
    profileCount: pool.profiles.length,
    publicExposure: publicExposureSummary(pool),
    localCodex: pool.localCodex,
+    sentinel: codexPoolSentinelSummary(pool.sentinel),
    disclosure: {
      full: "bun scripts/cli.ts platform-infra sub2api codex-pool plan --full",
    },
@@ -1432,6 +1458,67 @@ function compactGatewayCompactRecent(block: unknown): unknown {
  };
 }

+function compactSentinelStatus(block: unknown): unknown {
+  if (!isRecord(block)) return block;
+  const runtime = isRecord(block.runtime) ? block.runtime : block;
+  const desired = isRecord(runtime.desired) ? runtime.desired : {};
+  const cronJob = isRecord(runtime.cronJob) ? runtime.cronJob : {};
+  const secret = isRecord(runtime.secret) ? runtime.secret : {};
+  const configMap = isRecord(runtime.configMap) ? runtime.configMap : {};
+  const state = isRecord(runtime.state) ? runtime.state : {};
+  const freezeReassert = isRecord(block.freezeReassert) ? block.freezeReassert : {};
+  return {
+    ok: block.ok,
+    action: block.action,
+    desired: {
+      monitorEnabled: desired.monitorEnabled,
+      actionsEnabled: desired.actionsEnabled,
+      schedule: desired.schedule,
+      cronJobName: desired.cronJobName,
+      configMapName: desired.configMapName,
+      credentialsSecretName: desired.credentialsSecretName,
+      stateConfigMapName: desired.stateConfigMapName,
+    },
+    cronJob: {
+      exists: cronJob.exists,
+      schedule: cronJob.schedule,
+      suspend: cronJob.suspend,
+      lastScheduleTime: cronJob.lastScheduleTime,
+      active: cronJob.active,
+      error: cronJob.error,
+    },
+    secret: {
+      exists: secret.exists,
+      profileSecretPresent: secret.profileSecretPresent,
+      valuesPrinted: false,
+      error: secret.error,
+    },
+    configMap: {
+      exists: configMap.exists,
+      configPresent: configMap.configPresent,
+      runnerPresent: configMap.runnerPresent,
+      error: configMap.error,
+    },
+    state: {
+      exists: state.exists,
+      accountCount: state.accountCount,
+      quarantinedCount: state.quarantinedCount,
+      quarantined: state.quarantined,
+      recentAccounts: state.recentAccounts,
+      lastRun: state.lastRun,
+      error: state.error,
+    },
+    freezeReassert: Object.keys(freezeReassert).length > 0 ? {
+      ok: freezeReassert.ok,
+      skipped: freezeReassert.skipped,
+      reason: freezeReassert.reason,
+      itemCount: freezeReassert.itemCount,
+      attentionItems: freezeReassert.attentionItems,
+    } : undefined,
+    valuesPrinted: false,
+  };
+}
+
 function codexPoolValidationSummary(parsed: Record<string, unknown> | null): Record<string, unknown> | null {
  if (parsed === null) return null;
  const validation = isRecord(parsed.validation) ? parsed.validation : {};
@@ -1451,6 +1538,7 @@ function codexPoolValidationSummary(parsed: Record<string, unknown> | null): Rec
    loadFactor: compactStatusBlock(parsed.loadFactor, ["accountName", "accountId", "expectedLoadFactor", "runtimeLoadFactor", "priority", "status", "schedulable", "ok"]),
    webSocketsV2: compactStatusBlock(parsed.webSocketsV2, ["accountName", "accountId", "expectedMode", "runtimeMode", "runtimeEnabled", "status", "schedulable", "ok"]),
    tempUnschedulable: compactTempUnschedulableStatus(parsed.tempUnschedulable),
+    sentinel: compactSentinelStatus(parsed.sentinel),
    runtimeCapabilities: {
      ok: runtimeCapabilities.ok,
      runtimeImage: runtimeCapabilities.runtimeImage,
@@ -1528,10 +1616,28 @@ function poolTarget(pool = readCodexPoolConfig()): Record<string, unknown> {
    defaultAccountPriority: pool.defaultAccountPriority,
    defaultAccountCapacity: pool.defaultAccountCapacity,
    defaultAccountLoadFactor: pool.defaultAccountLoadFactor,
+    sentinel: {
+      monitorEnabled: pool.sentinel.monitor.enabled,
+      actionsEnabled: pool.sentinel.actions.enabled,
+      cronJobName: pool.sentinel.cronJobName,
+      stateConfigMapName: pool.sentinel.stateConfigMapName,
+    },
    valuesPrinted: false,
  };
 }

+function sentinelProfileSecrets(profiles: CodexProfile[]): CodexPoolSentinelProfileSecret[] {
+  return profiles
+    .filter((profile) => profile.ok && profile.apiKey !== null && profile.apiKey.length > 0)
+    .map((profile) => ({
+      accountName: profile.accountName,
+      profile: profile.profile,
+      baseUrl: profile.baseUrl,
+      apiKey: profile.apiKey ?? "",
+      upstreamUserAgent: profile.upstreamUserAgent,
+    }));
+}
+
 function publicExposureSummary(pool: CodexPoolConfig): Record<string, unknown> {
  return {
    enabled: pool.publicExposure.enabled,
@@ -2262,6 +2368,7 @@ import string
 import subprocess
 import sys
 import time
+from datetime import datetime
 from urllib.parse import quote

 NAMESPACE = "${namespace}"
@@ -2284,6 +2391,7 @@ EXPECTED_ACCOUNT_CAPACITIES = ${JSON.stringify(desiredAccountCapacityMap(pool))}
 EXPECTED_ACCOUNT_LOAD_FACTORS = ${JSON.stringify(desiredAccountLoadFactorMap(pool))}
 EXPECTED_ACCOUNT_WS_MODES = json.loads(${JSON.stringify(JSON.stringify(desiredAccountWebSocketsV2ModeMap(pool)))})
 EXPECTED_ACCOUNT_TEMP_UNSCHEDULABLE = json.loads(${JSON.stringify(JSON.stringify(desiredAccountTempUnschedulableMap(pool)))})
+SENTINEL_CONFIG = json.loads(${JSON.stringify(JSON.stringify(pool.sentinel))})
 MODE = "${mode}"
 PAYLOAD_B64 = "${encodedPayload}"

@@ -2726,6 +2834,201 @@ def ensure_api_key_secret(group_id):
        raise RuntimeError(f"apply API key secret failed: {text(proc.stderr, 1000)}")
    return api_key, secret_action, text(proc.stdout, 1000)

+def apply_sentinel_manifest(manifest):
+    if not isinstance(manifest, str) or not manifest.strip():
+        return {
+            "ok": False,
+            "action": "missing-manifest",
+            "valuesPrinted": False,
+        }
+    proc = kubectl(["apply", "--server-side", "--force-conflicts", f"--field-manager={FIELD_MANAGER}", "-f", "-"], manifest)
+    if proc.returncode != 0:
+        return {
+            "ok": False,
+            "action": "apply-failed",
+            "stdoutTail": text(proc.stdout, 2000),
+            "stderrTail": text(proc.stderr, 4000),
+            "valuesPrinted": False,
+        }
+    status = sentinel_runtime_status()
+    return {
+        "ok": status.get("ok") is True,
+        "action": "applied",
+        "stdoutTail": text(proc.stdout, 2000),
+        "runtime": status,
+        "valuesPrinted": False,
+    }
+
+def safe_kube_json(args, label):
+    proc = kubectl([*args, "-o", "json"])
+    if proc.returncode != 0:
+        return None, {"label": label, "exitCode": proc.returncode, "stderrTail": text(proc.stderr, 1000)}
+    try:
+        return json.loads(proc.stdout.decode("utf-8")), None
+    except Exception as exc:
+        return None, {"label": label, "exitCode": proc.returncode, "error": str(exc), "stdoutTail": text(proc.stdout, 1000)}
+
+def sentinel_runtime_status():
+    cfg = SENTINEL_CONFIG
+    cronjob_name = cfg.get("cronJobName")
+    secret_name = cfg.get("credentialsSecretName")
+    configmap_name = cfg.get("configMapName")
+    state_name = cfg.get("stateConfigMapName")
+    cronjob, cronjob_error = safe_kube_json(["-n", NAMESPACE, "get", "cronjob", cronjob_name], f"cronjob/{cronjob_name}")
+    secret, secret_error = safe_kube_json(["-n", NAMESPACE, "get", "secret", secret_name], f"secret/{secret_name}")
+    configmap, configmap_error = safe_kube_json(["-n", NAMESPACE, "get", "configmap", configmap_name], f"configmap/{configmap_name}")
+    state_cm, state_error = safe_kube_json(["-n", NAMESPACE, "get", "configmap", state_name], f"configmap/{state_name}")
+    state = None
+    if isinstance(state_cm, dict):
+        raw_state = (state_cm.get("data") or {}).get("state.json")
+        if isinstance(raw_state, str) and raw_state:
+            try:
+                state = json.loads(raw_state)
+            except Exception as exc:
+                state = {"parseError": str(exc)}
+    accounts = (state.get("accounts") or {}) if isinstance(state, dict) else {}
+    quarantined = []
+    recent_accounts = []
+    for name, account_state in accounts.items():
+        if not isinstance(account_state, dict):
+            continue
+        quarantine = account_state.get("quarantine")
+        if isinstance(quarantine, dict) and quarantine.get("active") is True:
+            quarantined.append({
+                "accountName": name,
+                "until": quarantine.get("until"),
+                "applied": quarantine.get("applied"),
+                "reason": quarantine.get("reason"),
+                "intervalMinutes": quarantine.get("intervalMinutes"),
+            })
+        last_probe = account_state.get("lastProbe")
+        if isinstance(last_probe, dict):
+            recent_accounts.append({
+                "accountName": name,
+                "lastProbeAt": account_state.get("lastProbeAt"),
+                "lastStatus": account_state.get("lastStatus"),
+                "nextProbeAfter": account_state.get("nextProbeAfter"),
+                "ok": last_probe.get("ok"),
+                "purpose": last_probe.get("purpose"),
+                "httpStatus": last_probe.get("httpStatus"),
+                "durationMs": last_probe.get("durationMs"),
+                "markerMatched": last_probe.get("markerMatched"),
+                "outputHash": last_probe.get("outputHash"),
+                "outputPreview": last_probe.get("outputPreview"),
+                "usage": last_probe.get("usage"),
+                "action": last_probe.get("action"),
+            })
+    recent_accounts.sort(key=lambda item: item.get("lastProbeAt") or "")
+    last_run = state.get("lastRun") if isinstance(state, dict) else None
+    cronjob_spec = cronjob.get("spec") if isinstance(cronjob, dict) else {}
+    secret_data = secret.get("data") if isinstance(secret, dict) else {}
+    configmap_data = configmap.get("data") if isinstance(configmap, dict) else {}
+    ok = cronjob is not None and secret is not None and configmap is not None
+    return {
+        "ok": ok,
+        "desired": {
+            "monitorEnabled": cfg.get("monitor", {}).get("enabled"),
+            "actionsEnabled": cfg.get("actions", {}).get("enabled"),
+            "schedule": cfg.get("schedule"),
+            "cronJobName": cronjob_name,
+            "configMapName": configmap_name,
+            "credentialsSecretName": secret_name,
+            "stateConfigMapName": state_name,
+        },
+        "cronJob": {
+            "exists": cronjob is not None,
+            "schedule": cronjob_spec.get("schedule") if isinstance(cronjob_spec, dict) else None,
+            "suspend": cronjob_spec.get("suspend") if isinstance(cronjob_spec, dict) else None,
+            "lastScheduleTime": (cronjob.get("status") or {}).get("lastScheduleTime") if isinstance(cronjob, dict) else None,
+            "active": len((cronjob.get("status") or {}).get("active") or []) if isinstance(cronjob, dict) else None,
+            "error": cronjob_error,
+        },
+        "secret": {
+            "exists": secret is not None,
+            "profileSecretPresent": isinstance(secret_data, dict) and "profiles.json" in secret_data,
+            "valuesPrinted": False,
+            "error": secret_error,
+        },
+        "configMap": {
+            "exists": configmap is not None,
+            "configPresent": isinstance(configmap_data, dict) and "config.json" in configmap_data,
+            "runnerPresent": isinstance(configmap_data, dict) and "sentinel.py" in configmap_data,
+            "error": configmap_error,
+        },
+        "state": {
+            "exists": state_cm is not None,
+            "accountCount": len(accounts),
+            "quarantinedCount": len(quarantined),
+            "quarantined": quarantined[-10:],
+            "recentAccounts": recent_accounts[-12:],
+            "lastRun": last_run,
+            "error": state_error,
+        },
+        "valuesPrinted": False,
+    }
+
+def parse_epoch_z(value):
+    if not isinstance(value, str) or not value:
+        return None
+    try:
+        return datetime.fromisoformat(value.replace("Z", "+00:00")).timestamp()
+    except Exception:
+        return None
+
+def sentinel_state_object():
+    state_name = SENTINEL_CONFIG.get("stateConfigMapName")
+    if not state_name:
+        return None
+    obj, err = safe_kube_json(["-n", NAMESPACE, "get", "configmap", state_name], f"configmap/{state_name}")
+    if not isinstance(obj, dict):
+        return None
+    raw_state = (obj.get("data") or {}).get("state.json")
+    if not isinstance(raw_state, str) or not raw_state:
+        return None
+    try:
+        return json.loads(raw_state)
+    except Exception:
+        return None
+
+def reassert_sentinel_freezes_after_sync(token):
+    if (SENTINEL_CONFIG.get("actions") or {}).get("enabled") is not True:
+        return {"ok": True, "skipped": True, "reason": "actions-disabled", "items": [], "valuesPrinted": False}
+    state = sentinel_state_object()
+    if not isinstance(state, dict):
+        return {"ok": True, "skipped": True, "reason": "state-missing", "items": [], "valuesPrinted": False}
+    accounts_state = state.get("accounts") if isinstance(state.get("accounts"), dict) else {}
+    accounts = list_accounts(token)
+    by_name = {item.get("name"): item for item in accounts if isinstance(item.get("name"), str)}
+    now_epoch = time.time()
+    items = []
+    for name, account_state in accounts_state.items():
+        if not isinstance(account_state, dict):
+            continue
+        quarantine = account_state.get("quarantine")
+        if not isinstance(quarantine, dict) or quarantine.get("active") is not True or quarantine.get("applied") is not True:
+            continue
+        until_epoch = parse_epoch_z(quarantine.get("until"))
+        if until_epoch is not None and until_epoch <= now_epoch:
+            continue
+        account = by_name.get(name)
+        if not account or account.get("id") is None:
+            items.append({"accountName": name, "ok": False, "reason": "account-not-found"})
+            continue
+        try:
+            ensure_success(
+                curl_api("POST", f"/api/v1/admin/accounts/{account['id']}/schedulable", bearer=token, payload={"schedulable": False}),
+                f"reassert sentinel freeze for {name}",
+            )
+            items.append({"accountName": name, "accountId": account.get("id"), "ok": True, "until": quarantine.get("until")})
+        except Exception as exc:
+            items.append({"accountName": name, "accountId": account.get("id"), "ok": False, "error": str(exc)})
+    return {
+        "ok": all(item.get("ok") is True for item in items),
+        "skipped": False,
+        "items": items,
+        "valuesPrinted": False,
+    }
+
 def list_user_keys(token):
    data = ensure_success(curl_api("GET", "/api/v1/keys?page=1&page_size=200", bearer=token), "list user keys")
    return extract_items(data)
@@ -3818,6 +4121,7 @@ def run_sync():
    payload = json.loads(base64.b64decode(PAYLOAD_B64).decode("utf-8"))
    profiles = payload.get("profiles") or []
    prune_removed = bool(payload.get("pruneRemoved"))
+    sentinel_payload = payload.get("sentinel") if isinstance(payload.get("sentinel"), dict) else {}
    if not profiles:
        raise RuntimeError("sync payload has no profiles")
    admin_email, token, admin_compliance = login()
@@ -3839,8 +4143,10 @@ def run_sync():
    compact_evidence = recent_compact_gateway_evidence()
    responses_evidence = recent_responses_gateway_evidence()
    runtime_capabilities = validate_runtime_capabilities(token)
+    sentinel = apply_sentinel_manifest(sentinel_payload.get("manifest"))
+    sentinel_reassert = reassert_sentinel_freezes_after_sync(token)
    return {
-        "ok": gateway["ok"] is True and responses_smoke["ok"] is True and owner_concurrency["ok"] is True and capacity_status["ok"] is True and load_factor_status["ok"] is True and ws_v2_status["ok"] is True and temp_unschedulable_status["ok"] is True,
+        "ok": gateway["ok"] is True and responses_smoke["ok"] is True and owner_concurrency["ok"] is True and capacity_status["ok"] is True and load_factor_status["ok"] is True and ws_v2_status["ok"] is True and temp_unschedulable_status["ok"] is True and sentinel.get("ok") is True and sentinel_reassert.get("ok") is True,
        "degraded": bool(responses_smoke.get("degraded")) or bool(compact_evidence.get("degraded")) or bool(responses_evidence.get("degraded")) or runtime_capabilities.get("ok") is not True,
        "mode": "sync",
        "namespace": NAMESPACE,
@@ -3877,6 +4183,7 @@ def run_sync():
        },
        "ownerBalance": owner_balance,
        "ownerConcurrency": owner_concurrency,
+        "sentinel": {**sentinel, "freezeReassert": sentinel_reassert},
        "runtimeCapabilities": runtime_capabilities,
        "validation": {"gatewayModels": gateway, "gatewayResponses": responses_smoke, "gatewayResponsesRecent": responses_evidence, "gatewayCompactRecent": compact_evidence},
    }
@@ -3901,8 +4208,9 @@ def run_validate():
    compact_evidence = recent_compact_gateway_evidence()
    responses_evidence = recent_responses_gateway_evidence()
    runtime_capabilities = validate_runtime_capabilities(token)
+    sentinel = sentinel_runtime_status()
    return {
-        "ok": gateway["ok"] is True and responses_smoke["ok"] is True and (owner_concurrency is None or owner_concurrency["ok"] is True) and capacity_status["ok"] is True and load_factor_status["ok"] is True and ws_v2_status["ok"] is True and temp_unschedulable_status["ok"] is True,
+        "ok": gateway["ok"] is True and responses_smoke["ok"] is True and (owner_concurrency is None or owner_concurrency["ok"] is True) and capacity_status["ok"] is True and load_factor_status["ok"] is True and ws_v2_status["ok"] is True and temp_unschedulable_status["ok"] is True and sentinel.get("ok") is True,
        "degraded": bool(responses_smoke.get("degraded")) or bool(compact_evidence.get("degraded")) or bool(responses_evidence.get("degraded")) or runtime_capabilities.get("ok") is not True,
        "mode": "validate",
        "namespace": NAMESPACE,
@@ -3922,6 +4230,7 @@ def run_validate():
        "loadFactor": load_factor_status,
        "webSocketsV2": ws_v2_status,
        "tempUnschedulable": temp_unschedulable_status,
+        "sentinel": sentinel,
        "runtimeCapabilities": runtime_capabilities,
        "validation": {"gatewayModels": gateway, "gatewayResponses": responses_smoke, "gatewayResponsesRecent": responses_evidence, "gatewayCompactRecent": compact_evidence},
    }