feat: add sub2api account sentinel

This commit is contained in:
Codex
2026-06-11 05:02:27 +00:00
parent b151fbbee8
commit 01f0120d25
6 changed files with 1515 additions and 2 deletions
+8
View File
@@ -69,9 +69,17 @@ bun scripts/cli.ts platform-infra sub2api codex-pool cleanup-probes --confirm
- `profiles.entries[].tempUnschedulable`: 可选 per-account 临时下线规则覆盖;字段语义以 `docs/reference/platform-infra.md` 为权威。上游 Sub2API 不支持的成功体分类、调度策略或账号冷却行为不要在这里声明。
- `profiles.entries[].openaiResponsesWebSocketsV2Mode`: 需要 Responses WebSocket v2 的上游才设置,值为 `off``ctx_pool``passthrough`
- `profiles.entries[].upstreamUserAgent`: 少数要求 Codex CLI User-Agent 的上游才设置,不能含换行。
- `sentinel.monitor.enabled`: 账号级 HTTP 200 成功体哨兵监控开关;开启后 `codex-pool sync --confirm` 会在 `platform-infra` 创建/更新 k8s CronJob、ConfigMap、Secret、ServiceAccount、Role 和 RoleBinding。CronJob 直打 YAML-managed 上游账号的 OpenAI Responses `gpt-5.5`,用确定 marker 判定是否出现维护/故障/广告等非预期成功体,并在独立 state ConfigMap 中记录 token/cost 账本。
- `sentinel.actions.enabled`: 账号级哨兵冻结/恢复动作开关;默认必须保持 `false`,先监控一段时间,确认 marker 判定正确率、token 消耗和误报率后再改为 `true`。动作关闭时只记录 `would-freeze`,不会调用 Sub2API admin API 改 `schedulable`
- `sentinel.probe.maxOutputTokens`: 哨兵 OpenAI Responses 请求的硬输出上限,必须保持小值;不要只靠 prompt 要求模型少输出。哨兵不限制并发和每轮账号数,所有到期账号会在同一轮并发探测。
- `sentinel.cadence`: 成功信任指数退避配置。当前口径是从 1 分钟开始,连续成功后退避到最大 20 分钟;任意 marker mismatch 清零成功信任并进入冻结退避。
- `sentinel.freeze`: 冻结 TTL 指数退避配置。当前口径是初始 10 分钟,失败后 `10m -> 20m -> 40m -> 80m -> 120m`,最大 2 小时;冻结到期后只做恢复 probe,通过才自动恢复,不能仅靠 TTL 到期解封。
- `sentinel.pricing`: 直打上游时哨兵自己的 token/cost 估算价格。因为 direct upstream probe 不经过 Sub2API 普通用量账本,哨兵必须自己记录全局与 per-account token/cost;这些账本只用于观察,不作为跳过探测的预算门禁。
`sync --confirm` 会登录 Sub2API admin、创建/更新 group、创建/更新 YAML 中的 `unidesk-codex-*` accounts、创建/复用统一 API key Secret,并把 managed account 的 `schedulable=true` 恢复为过程控制基线;它默认不删除 YAML 中缺席的 managed account。只有明确退役上游时才使用 `sync --confirm --prune-removed` 删除缺席且 `extra.unidesk_managed=true``unidesk-codex-*` account。
`sync --confirm` 同时会按 YAML 渲染账号级哨兵资源。哨兵默认使用 `sentinel.monitor.enabled=true` + `sentinel.actions.enabled=false` 的观察模式;如果后续要开启自动冻结/恢复,只改 `sentinel.actions.enabled=true` 后重新 `codex-pool sync --confirm`,不要手工 patch CronJob、Secret 或 Sub2API account。打开动作前必须先用 `codex-pool validate --full` 或 k8s state 证据确认近期 `lastRun` 的 marker mismatch 与实际上游行为一致,且 token/cost 账本可解释、成本可接受。
`sync --confirm``validate` 可能超过单次 SSH/runtime 短连接窗口。必须继续使用 `bun scripts/cli.ts platform-infra sub2api codex-pool ...`,由 CLI 在 G14 远端提交作业并短轮询状态;不要改用裸 `trans G14:k3s script` 等一个长连接等待完整结果。若看到 `UNIDESK_SSH_RUNTIME_TIMEOUT`,先按 `docs/reference/platform-infra.md` 的规则处理为控制面可见性问题,修 CLI/job/poll 或重跑受控命令,不要手工 patch Sub2API credentials 或源码。
不要给 UniDesk-managed Codex accounts 开 Sub2API `pool_mode`。UniDesk 期望的 failover 是把失败账号临时标记为 unschedulable,让同组其他账号接手;`pool_mode` 会重试同一个 account path。
@@ -144,3 +144,41 @@ localCodex:
supportsWebSockets: false
responsesWebSocketsV2: false
responsesSmokeModel: gpt-5.5
sentinel:
monitor:
enabled: true
actions:
enabled: false
freezeOnMarkerMismatch: true
freezeOnTransportError: false
schedule: "*/1 * * * *"
image: python:3.12-alpine
serviceAccountName: sub2api-account-sentinel
configMapName: sub2api-account-sentinel-config
credentialsSecretName: sub2api-account-sentinel-profiles
stateConfigMapName: sub2api-account-sentinel-state
cronJobName: sub2api-account-sentinel
model: gpt-5.5
endpoint: responses
marker:
prefix: UDSG_OK
exact: true
probe:
timeoutSeconds: 30
maxResponseBytes: 8192
maxOutputTokens: 12
transportRetryMinutes: 5
cadence:
successInitialIntervalMinutes: 1
successMaxIntervalMinutes: 20
successBackoffMultiplier: 2
jitterPercent: 10
freeze:
initialTtlMinutes: 10
maxTtlMinutes: 120
backoffMultiplier: 2
jitterPercent: 10
pricing:
usdPer1MInputTokens: 1.25
usdPer1MOutputTokens: 10
historyLimit: 200
@@ -0,0 +1,107 @@
import { readFileSync } from "node:fs";
import { rmSync, writeFileSync } from "node:fs";
import { tmpdir } from "node:os";
import { join } from "node:path";
import { rootPath } from "./src/config";
import {
defaultCodexPoolSentinelConfig,
readCodexPoolSentinelConfig,
renderCodexPoolSentinelManifest,
sentinelRunnerPython,
} from "./src/platform-infra-sub2api-codex-sentinel";
function assertCondition(condition: unknown, message: string, detail: unknown = {}): void {
if (!condition) throw new Error(`${message}: ${JSON.stringify(detail)}`);
}
const configPath = rootPath("config", "platform-infra", "sub2api-codex-pool.yaml");
const parsed = Bun.YAML.parse(readFileSync(configPath, "utf8")) as { sentinel?: unknown };
const sentinel = readCodexPoolSentinelConfig(parsed.sentinel, defaultCodexPoolSentinelConfig(), configPath);
assertCondition(sentinel.monitor.enabled === true, "sentinel monitor must be enabled for observation-first rollout", sentinel);
assertCondition(sentinel.actions.enabled === false, "sentinel actions must default off until monitoring quality is reviewed", sentinel);
assertCondition(sentinel.actions.freezeOnMarkerMismatch === true, "marker mismatch must be configured as freeze-worthy when actions are enabled", sentinel.actions);
assertCondition(sentinel.actions.freezeOnTransportError === false, "transport errors must not freeze by default", sentinel.actions);
assertCondition(sentinel.endpoint === "responses", "v1 sentinel must target OpenAI Responses only", sentinel);
assertCondition(sentinel.model === "gpt-5.5", "v1 sentinel must use GPT-5.5", sentinel);
assertCondition(sentinel.probe.maxOutputTokens > 0 && sentinel.probe.maxOutputTokens <= 16, "sentinel maxOutputTokens must be tightly capped", sentinel.probe);
assertCondition(!("concurrency" in sentinel.probe), "sentinel must not cap probe concurrency; all due accounts are probed concurrently", sentinel.probe);
assertCondition(!("maxAccountsPerRun" in sentinel.probe), "sentinel must not cap accounts per run; all due accounts are eligible", sentinel.probe);
assertCondition(sentinel.cadence.successInitialIntervalMinutes === 1, "success trust backoff must start at 1 minute", sentinel.cadence);
assertCondition(sentinel.cadence.successMaxIntervalMinutes === 20, "success trust backoff must cap at 20 minutes", sentinel.cadence);
assertCondition(sentinel.freeze.initialTtlMinutes === 10, "freeze backoff must start at 10 minutes", sentinel.freeze);
assertCondition(sentinel.freeze.maxTtlMinutes === 120, "freeze backoff must cap at 2 hours", sentinel.freeze);
assertCondition(!("budget" in sentinel), "sentinel must not use token budgets as a probe gate; usage is recorded only", sentinel);
const manifest = renderCodexPoolSentinelManifest(sentinel, [
{
accountName: "unidesk-codex-example",
profile: "example",
baseUrl: "https://example.invalid/v1",
apiKey: "sk-test-secret",
upstreamUserAgent: null,
},
], {
namespace: "platform-infra",
serviceName: "sub2api",
serviceDns: "sub2api.platform-infra.svc.cluster.local:8080",
appSecretName: "sub2api-secrets",
});
assertCondition(manifest.includes("kind: CronJob"), "sentinel manifest must render a CronJob", manifest.slice(0, 1000));
assertCondition(manifest.includes("concurrencyPolicy: Forbid"), "sentinel CronJob must forbid overlapping runs", manifest);
assertCondition(manifest.includes("suspend: false"), "monitor.enabled=true must unsuspend the CronJob", manifest);
assertCondition(manifest.includes("kind: ServiceAccount") && manifest.includes("kind: Role") && manifest.includes("kind: RoleBinding"), "sentinel manifest must include minimal RBAC", manifest);
assertCondition(manifest.includes("sub2api-account-sentinel-state"), "sentinel manifest must reference the state ConfigMap", manifest);
assertCondition(manifest.includes("\"enabled\": false"), "sentinel manifest must preserve actions.enabled=false in config.json", manifest);
assertCondition(!manifest.includes("sk-test-secret"), "sentinel manifest must not expose upstream credentials as plaintext", manifest);
assertCondition(manifest.includes("profiles.json:"), "sentinel credentials Secret must include the profiles payload as Secret data", manifest);
assertCondition(manifest.includes("\"budgetMode\": \"record-only\""), "sentinel runner must expose record-only budget/accounting mode", manifest);
assertCondition(manifest.includes("max_workers=max(1, len(due))"), "sentinel runner must probe all due accounts concurrently", manifest);
const disabledMonitor = {
...sentinel,
monitor: { enabled: false },
actions: { ...sentinel.actions, enabled: false },
};
const suspendedManifest = renderCodexPoolSentinelManifest(disabledMonitor, [], {
namespace: "platform-infra",
serviceName: "sub2api",
serviceDns: "sub2api.platform-infra.svc.cluster.local:8080",
appSecretName: "sub2api-secrets",
});
assertCondition(suspendedManifest.includes("suspend: true"), "monitor.enabled=false must suspend the CronJob", suspendedManifest);
const pythonPath = join(tmpdir(), `sub2api-account-sentinel-${process.pid}.py`);
writeFileSync(pythonPath, sentinelRunnerPython(), "utf8");
try {
const pyCompile = Bun.spawnSync(["python3", "-m", "py_compile", pythonPath], {
stdout: "pipe",
stderr: "pipe",
});
assertCondition(pyCompile.exitCode === 0, "sentinel runner python must compile", {
exitCode: pyCompile.exitCode,
stdout: pyCompile.stdout.toString(),
stderr: pyCompile.stderr.toString(),
});
} finally {
rmSync(pythonPath, { force: true });
}
console.log(JSON.stringify({
ok: true,
checks: [
"sentinel has independent monitor/actions YAML switches",
"observation-first rollout keeps actions disabled",
"v1 scope is OpenAI Responses + GPT-5.5",
"probe max_output_tokens is tightly capped",
"probe concurrency is not artificially capped",
"budget is record-only and does not gate probes",
"success trust backoff is 1m to 20m",
"freeze backoff is 10m to 120m",
"CronJob is k8s-native with Forbid concurrency and minimal RBAC",
"monitor switch controls CronJob suspend state",
"rendered Secret avoids plaintext upstream credentials",
"embedded Python runner compiles",
],
}));
+2
View File
@@ -537,6 +537,7 @@ export async function runChecks(config: UniDeskConfig, options: CheckOptions = d
items.push(await commandItem("playwright:cli-wrapper-contract", ["bun", "scripts/playwright-cli-contract-test.ts"], 30_000, process.env, options.checkHeartbeatMs));
items.push(await commandItem("platform-infra:sub2api-codex-local-config-contract", ["bun", "scripts/platform-infra-sub2api-codex-local-config-contract-test.ts"], 30_000, process.env, options.checkHeartbeatMs));
items.push(await commandItem("platform-infra:sub2api-codex-routing-contract", ["bun", "scripts/platform-infra-sub2api-codex-routing-contract-test.ts"], 30_000, process.env, options.checkHeartbeatMs));
items.push(await commandItem("platform-infra:sub2api-codex-sentinel-contract", ["bun", "scripts/platform-infra-sub2api-codex-sentinel-contract-test.ts"], 30_000, process.env, options.checkHeartbeatMs));
items.push(await commandItem("platform-infra:sub2api-codex-temp-unsched-contract", ["bun", "scripts/platform-infra-sub2api-codex-temp-unsched-contract-test.ts"], 30_000, process.env, options.checkHeartbeatMs));
items.push(await commandItem("platform-infra:sub2api-http-upstream-contract", ["bun", "scripts/platform-infra-sub2api-http-upstream-contract-test.ts"], 30_000, process.env, options.checkHeartbeatMs));
items.push(await commandItem("auth-broker:p0-contract", ["bun", "scripts/auth-broker-contract-test.ts"], 30_000, process.env, options.checkHeartbeatMs));
@@ -579,6 +580,7 @@ export async function runChecks(config: UniDeskConfig, options: CheckOptions = d
items.push(skippedItem("server:cleanup-plan-contract", "Server cleanup dry-run contract is opt-in with script checks", "--scripts-typecheck or --full"));
items.push(skippedItem("check:gh-contract-scope-contract", "Check option scope contract is opt-in with script checks", "--scripts-typecheck or --full"));
items.push(skippedItem("playwright:cli-wrapper-contract", "Playwright wrapper/headless/session contract is opt-in with script checks", "--scripts-typecheck or --full"));
items.push(skippedItem("platform-infra:sub2api-codex-sentinel-contract", "Sub2API Codex account sentinel contract is opt-in with script checks", "--scripts-typecheck or --full"));
items.push(skippedItem("auth-broker:p0-contract", "Auth Broker P0 skeleton and CLI adapter contract is opt-in with script checks", "--scripts-typecheck or --full"));
items.push(skippedItem("d601:recovery-guardrails-contract", "D601 recovery guardrails fixture contract is opt-in with script checks", "--scripts-typecheck or --full"));
items.push(skippedItem("hwlab:cd-wrapper-contract", "HWLAB DEV CD wrapper contract is opt-in with script checks", "--scripts-typecheck or --full"));
File diff suppressed because it is too large Load Diff
+311 -2
View File
@@ -4,6 +4,14 @@ import { homedir } from "node:os";
import { join } from "node:path";
import type { UniDeskConfig } from "./config";
import { rootPath } from "./config";
import {
codexPoolSentinelSummary,
defaultCodexPoolSentinelConfig,
readCodexPoolSentinelConfig,
renderCodexPoolSentinelManifest,
type CodexPoolSentinelConfig,
type CodexPoolSentinelProfileSecret,
} from "./platform-infra-sub2api-codex-sentinel";
import { runSshCommandCapture, type SshCaptureResult } from "./ssh";
const g14K3sRoute = "G14:k3s";
@@ -89,6 +97,7 @@ interface CodexPoolConfig {
profiles: CodexPoolProfileConfig[];
publicExposure: CodexPoolPublicExposureConfig;
localCodex: CodexPoolLocalCodexConfig;
sentinel: CodexPoolSentinelConfig;
}
interface CodexPoolProfileConfig {
@@ -174,6 +183,8 @@ export function codexPoolHelp(): unknown {
poolApiKeySecretKey: pool.apiKeySecretKey,
publicBaseUrl: pool.publicExposure.enabled ? pool.publicExposure.publicBaseUrl : null,
masterBaseUrl: pool.publicExposure.enabled ? pool.publicExposure.masterBaseUrl : null,
sentinelMonitorEnabled: pool.sentinel.monitor.enabled,
sentinelActionsEnabled: pool.sentinel.actions.enabled,
secretValuesPrinted: false,
},
};
@@ -243,6 +254,9 @@ function codexPoolPlan(options: DisclosureOptions = { full: false, raw: false })
accountType: "openai/apikey",
grouping: `All discovered Codex profiles are bound to one Sub2API group named ${pool.groupName}.`,
unifiedApiKey: `The client-facing API_KEY is controlled by k3s Secret ${namespace}/${pool.apiKeySecretName}.${pool.apiKeySecretKey}.`,
sentinel: pool.sentinel.monitor.enabled
? `Account sentinel is enabled as k8s CronJob ${namespace}/${pool.sentinel.cronJobName}; actions.enabled=${pool.sentinel.actions.enabled}.`
: "Account sentinel monitoring is disabled by YAML.",
publicExposure: pool.publicExposure.enabled
? `Default Codex consumers use ${codexConsumerBaseUrl(pool)}; bounded master-local probes may use ${pool.publicExposure.masterBaseUrl}. FRP proxy ${pool.publicExposure.proxyName} maps public ${pool.publicExposure.publicBaseUrl} to ${pool.publicExposure.localIP}:${pool.publicExposure.localPort}.`
: "Public FRP exposure is disabled by YAML.",
@@ -272,6 +286,15 @@ async function codexPoolSync(config: UniDeskConfig, options: SyncOptions): Promi
const payload = {
pruneRemoved: options.pruneRemoved,
sentinel: {
manifest: renderCodexPoolSentinelManifest(pool.sentinel, sentinelProfileSecrets(profiles), {
namespace,
serviceName,
serviceDns,
appSecretName,
}),
summary: codexPoolSentinelSummary(pool.sentinel),
},
pool: {
groupName: pool.groupName,
apiKeyName: pool.apiKeyName,
@@ -656,6 +679,7 @@ function readCodexPoolConfig(): CodexPoolConfig {
profiles,
publicExposure: readPublicExposureConfig(parsed.publicExposure, defaults.publicExposure),
localCodex: readLocalCodexConfig(parsed.localCodex, defaults.localCodex),
sentinel: readCodexPoolSentinelConfig(parsed.sentinel, defaults.sentinel, codexPoolConfigPath),
};
validateKubernetesName(config.groupName, "pool.groupName", false);
validateKubernetesName(config.apiKeySecretName, "pool.apiKeySecretName", true);
@@ -717,6 +741,7 @@ function defaultCodexPoolConfig(): CodexPoolConfig {
responsesWebSocketsV2: true,
responsesSmokeModel: "gpt-5.5",
},
sentinel: defaultCodexPoolSentinelConfig(),
};
}
@@ -1190,6 +1215,7 @@ function codexPoolConfigSummary(pool: CodexPoolConfig): Record<string, unknown>
profileCount: pool.profiles.length,
publicExposure: publicExposureSummary(pool),
localCodex: pool.localCodex,
sentinel: codexPoolSentinelSummary(pool.sentinel),
disclosure: {
full: "bun scripts/cli.ts platform-infra sub2api codex-pool plan --full",
},
@@ -1432,6 +1458,67 @@ function compactGatewayCompactRecent(block: unknown): unknown {
};
}
function compactSentinelStatus(block: unknown): unknown {
if (!isRecord(block)) return block;
const runtime = isRecord(block.runtime) ? block.runtime : block;
const desired = isRecord(runtime.desired) ? runtime.desired : {};
const cronJob = isRecord(runtime.cronJob) ? runtime.cronJob : {};
const secret = isRecord(runtime.secret) ? runtime.secret : {};
const configMap = isRecord(runtime.configMap) ? runtime.configMap : {};
const state = isRecord(runtime.state) ? runtime.state : {};
const freezeReassert = isRecord(block.freezeReassert) ? block.freezeReassert : {};
return {
ok: block.ok,
action: block.action,
desired: {
monitorEnabled: desired.monitorEnabled,
actionsEnabled: desired.actionsEnabled,
schedule: desired.schedule,
cronJobName: desired.cronJobName,
configMapName: desired.configMapName,
credentialsSecretName: desired.credentialsSecretName,
stateConfigMapName: desired.stateConfigMapName,
},
cronJob: {
exists: cronJob.exists,
schedule: cronJob.schedule,
suspend: cronJob.suspend,
lastScheduleTime: cronJob.lastScheduleTime,
active: cronJob.active,
error: cronJob.error,
},
secret: {
exists: secret.exists,
profileSecretPresent: secret.profileSecretPresent,
valuesPrinted: false,
error: secret.error,
},
configMap: {
exists: configMap.exists,
configPresent: configMap.configPresent,
runnerPresent: configMap.runnerPresent,
error: configMap.error,
},
state: {
exists: state.exists,
accountCount: state.accountCount,
quarantinedCount: state.quarantinedCount,
quarantined: state.quarantined,
recentAccounts: state.recentAccounts,
lastRun: state.lastRun,
error: state.error,
},
freezeReassert: Object.keys(freezeReassert).length > 0 ? {
ok: freezeReassert.ok,
skipped: freezeReassert.skipped,
reason: freezeReassert.reason,
itemCount: freezeReassert.itemCount,
attentionItems: freezeReassert.attentionItems,
} : undefined,
valuesPrinted: false,
};
}
function codexPoolValidationSummary(parsed: Record<string, unknown> | null): Record<string, unknown> | null {
if (parsed === null) return null;
const validation = isRecord(parsed.validation) ? parsed.validation : {};
@@ -1451,6 +1538,7 @@ function codexPoolValidationSummary(parsed: Record<string, unknown> | null): Rec
loadFactor: compactStatusBlock(parsed.loadFactor, ["accountName", "accountId", "expectedLoadFactor", "runtimeLoadFactor", "priority", "status", "schedulable", "ok"]),
webSocketsV2: compactStatusBlock(parsed.webSocketsV2, ["accountName", "accountId", "expectedMode", "runtimeMode", "runtimeEnabled", "status", "schedulable", "ok"]),
tempUnschedulable: compactTempUnschedulableStatus(parsed.tempUnschedulable),
sentinel: compactSentinelStatus(parsed.sentinel),
runtimeCapabilities: {
ok: runtimeCapabilities.ok,
runtimeImage: runtimeCapabilities.runtimeImage,
@@ -1528,10 +1616,28 @@ function poolTarget(pool = readCodexPoolConfig()): Record<string, unknown> {
defaultAccountPriority: pool.defaultAccountPriority,
defaultAccountCapacity: pool.defaultAccountCapacity,
defaultAccountLoadFactor: pool.defaultAccountLoadFactor,
sentinel: {
monitorEnabled: pool.sentinel.monitor.enabled,
actionsEnabled: pool.sentinel.actions.enabled,
cronJobName: pool.sentinel.cronJobName,
stateConfigMapName: pool.sentinel.stateConfigMapName,
},
valuesPrinted: false,
};
}
function sentinelProfileSecrets(profiles: CodexProfile[]): CodexPoolSentinelProfileSecret[] {
return profiles
.filter((profile) => profile.ok && profile.apiKey !== null && profile.apiKey.length > 0)
.map((profile) => ({
accountName: profile.accountName,
profile: profile.profile,
baseUrl: profile.baseUrl,
apiKey: profile.apiKey ?? "",
upstreamUserAgent: profile.upstreamUserAgent,
}));
}
function publicExposureSummary(pool: CodexPoolConfig): Record<string, unknown> {
return {
enabled: pool.publicExposure.enabled,
@@ -2262,6 +2368,7 @@ import string
import subprocess
import sys
import time
from datetime import datetime
from urllib.parse import quote
NAMESPACE = "${namespace}"
@@ -2284,6 +2391,7 @@ EXPECTED_ACCOUNT_CAPACITIES = ${JSON.stringify(desiredAccountCapacityMap(pool))}
EXPECTED_ACCOUNT_LOAD_FACTORS = ${JSON.stringify(desiredAccountLoadFactorMap(pool))}
EXPECTED_ACCOUNT_WS_MODES = json.loads(${JSON.stringify(JSON.stringify(desiredAccountWebSocketsV2ModeMap(pool)))})
EXPECTED_ACCOUNT_TEMP_UNSCHEDULABLE = json.loads(${JSON.stringify(JSON.stringify(desiredAccountTempUnschedulableMap(pool)))})
SENTINEL_CONFIG = json.loads(${JSON.stringify(JSON.stringify(pool.sentinel))})
MODE = "${mode}"
PAYLOAD_B64 = "${encodedPayload}"
@@ -2726,6 +2834,201 @@ def ensure_api_key_secret(group_id):
raise RuntimeError(f"apply API key secret failed: {text(proc.stderr, 1000)}")
return api_key, secret_action, text(proc.stdout, 1000)
def apply_sentinel_manifest(manifest):
if not isinstance(manifest, str) or not manifest.strip():
return {
"ok": False,
"action": "missing-manifest",
"valuesPrinted": False,
}
proc = kubectl(["apply", "--server-side", "--force-conflicts", f"--field-manager={FIELD_MANAGER}", "-f", "-"], manifest)
if proc.returncode != 0:
return {
"ok": False,
"action": "apply-failed",
"stdoutTail": text(proc.stdout, 2000),
"stderrTail": text(proc.stderr, 4000),
"valuesPrinted": False,
}
status = sentinel_runtime_status()
return {
"ok": status.get("ok") is True,
"action": "applied",
"stdoutTail": text(proc.stdout, 2000),
"runtime": status,
"valuesPrinted": False,
}
def safe_kube_json(args, label):
proc = kubectl([*args, "-o", "json"])
if proc.returncode != 0:
return None, {"label": label, "exitCode": proc.returncode, "stderrTail": text(proc.stderr, 1000)}
try:
return json.loads(proc.stdout.decode("utf-8")), None
except Exception as exc:
return None, {"label": label, "exitCode": proc.returncode, "error": str(exc), "stdoutTail": text(proc.stdout, 1000)}
def sentinel_runtime_status():
cfg = SENTINEL_CONFIG
cronjob_name = cfg.get("cronJobName")
secret_name = cfg.get("credentialsSecretName")
configmap_name = cfg.get("configMapName")
state_name = cfg.get("stateConfigMapName")
cronjob, cronjob_error = safe_kube_json(["-n", NAMESPACE, "get", "cronjob", cronjob_name], f"cronjob/{cronjob_name}")
secret, secret_error = safe_kube_json(["-n", NAMESPACE, "get", "secret", secret_name], f"secret/{secret_name}")
configmap, configmap_error = safe_kube_json(["-n", NAMESPACE, "get", "configmap", configmap_name], f"configmap/{configmap_name}")
state_cm, state_error = safe_kube_json(["-n", NAMESPACE, "get", "configmap", state_name], f"configmap/{state_name}")
state = None
if isinstance(state_cm, dict):
raw_state = (state_cm.get("data") or {}).get("state.json")
if isinstance(raw_state, str) and raw_state:
try:
state = json.loads(raw_state)
except Exception as exc:
state = {"parseError": str(exc)}
accounts = (state.get("accounts") or {}) if isinstance(state, dict) else {}
quarantined = []
recent_accounts = []
for name, account_state in accounts.items():
if not isinstance(account_state, dict):
continue
quarantine = account_state.get("quarantine")
if isinstance(quarantine, dict) and quarantine.get("active") is True:
quarantined.append({
"accountName": name,
"until": quarantine.get("until"),
"applied": quarantine.get("applied"),
"reason": quarantine.get("reason"),
"intervalMinutes": quarantine.get("intervalMinutes"),
})
last_probe = account_state.get("lastProbe")
if isinstance(last_probe, dict):
recent_accounts.append({
"accountName": name,
"lastProbeAt": account_state.get("lastProbeAt"),
"lastStatus": account_state.get("lastStatus"),
"nextProbeAfter": account_state.get("nextProbeAfter"),
"ok": last_probe.get("ok"),
"purpose": last_probe.get("purpose"),
"httpStatus": last_probe.get("httpStatus"),
"durationMs": last_probe.get("durationMs"),
"markerMatched": last_probe.get("markerMatched"),
"outputHash": last_probe.get("outputHash"),
"outputPreview": last_probe.get("outputPreview"),
"usage": last_probe.get("usage"),
"action": last_probe.get("action"),
})
recent_accounts.sort(key=lambda item: item.get("lastProbeAt") or "")
last_run = state.get("lastRun") if isinstance(state, dict) else None
cronjob_spec = cronjob.get("spec") if isinstance(cronjob, dict) else {}
secret_data = secret.get("data") if isinstance(secret, dict) else {}
configmap_data = configmap.get("data") if isinstance(configmap, dict) else {}
ok = cronjob is not None and secret is not None and configmap is not None
return {
"ok": ok,
"desired": {
"monitorEnabled": cfg.get("monitor", {}).get("enabled"),
"actionsEnabled": cfg.get("actions", {}).get("enabled"),
"schedule": cfg.get("schedule"),
"cronJobName": cronjob_name,
"configMapName": configmap_name,
"credentialsSecretName": secret_name,
"stateConfigMapName": state_name,
},
"cronJob": {
"exists": cronjob is not None,
"schedule": cronjob_spec.get("schedule") if isinstance(cronjob_spec, dict) else None,
"suspend": cronjob_spec.get("suspend") if isinstance(cronjob_spec, dict) else None,
"lastScheduleTime": (cronjob.get("status") or {}).get("lastScheduleTime") if isinstance(cronjob, dict) else None,
"active": len((cronjob.get("status") or {}).get("active") or []) if isinstance(cronjob, dict) else None,
"error": cronjob_error,
},
"secret": {
"exists": secret is not None,
"profileSecretPresent": isinstance(secret_data, dict) and "profiles.json" in secret_data,
"valuesPrinted": False,
"error": secret_error,
},
"configMap": {
"exists": configmap is not None,
"configPresent": isinstance(configmap_data, dict) and "config.json" in configmap_data,
"runnerPresent": isinstance(configmap_data, dict) and "sentinel.py" in configmap_data,
"error": configmap_error,
},
"state": {
"exists": state_cm is not None,
"accountCount": len(accounts),
"quarantinedCount": len(quarantined),
"quarantined": quarantined[-10:],
"recentAccounts": recent_accounts[-12:],
"lastRun": last_run,
"error": state_error,
},
"valuesPrinted": False,
}
def parse_epoch_z(value):
if not isinstance(value, str) or not value:
return None
try:
return datetime.fromisoformat(value.replace("Z", "+00:00")).timestamp()
except Exception:
return None
def sentinel_state_object():
state_name = SENTINEL_CONFIG.get("stateConfigMapName")
if not state_name:
return None
obj, err = safe_kube_json(["-n", NAMESPACE, "get", "configmap", state_name], f"configmap/{state_name}")
if not isinstance(obj, dict):
return None
raw_state = (obj.get("data") or {}).get("state.json")
if not isinstance(raw_state, str) or not raw_state:
return None
try:
return json.loads(raw_state)
except Exception:
return None
def reassert_sentinel_freezes_after_sync(token):
if (SENTINEL_CONFIG.get("actions") or {}).get("enabled") is not True:
return {"ok": True, "skipped": True, "reason": "actions-disabled", "items": [], "valuesPrinted": False}
state = sentinel_state_object()
if not isinstance(state, dict):
return {"ok": True, "skipped": True, "reason": "state-missing", "items": [], "valuesPrinted": False}
accounts_state = state.get("accounts") if isinstance(state.get("accounts"), dict) else {}
accounts = list_accounts(token)
by_name = {item.get("name"): item for item in accounts if isinstance(item.get("name"), str)}
now_epoch = time.time()
items = []
for name, account_state in accounts_state.items():
if not isinstance(account_state, dict):
continue
quarantine = account_state.get("quarantine")
if not isinstance(quarantine, dict) or quarantine.get("active") is not True or quarantine.get("applied") is not True:
continue
until_epoch = parse_epoch_z(quarantine.get("until"))
if until_epoch is not None and until_epoch <= now_epoch:
continue
account = by_name.get(name)
if not account or account.get("id") is None:
items.append({"accountName": name, "ok": False, "reason": "account-not-found"})
continue
try:
ensure_success(
curl_api("POST", f"/api/v1/admin/accounts/{account['id']}/schedulable", bearer=token, payload={"schedulable": False}),
f"reassert sentinel freeze for {name}",
)
items.append({"accountName": name, "accountId": account.get("id"), "ok": True, "until": quarantine.get("until")})
except Exception as exc:
items.append({"accountName": name, "accountId": account.get("id"), "ok": False, "error": str(exc)})
return {
"ok": all(item.get("ok") is True for item in items),
"skipped": False,
"items": items,
"valuesPrinted": False,
}
def list_user_keys(token):
data = ensure_success(curl_api("GET", "/api/v1/keys?page=1&page_size=200", bearer=token), "list user keys")
return extract_items(data)
@@ -3818,6 +4121,7 @@ def run_sync():
payload = json.loads(base64.b64decode(PAYLOAD_B64).decode("utf-8"))
profiles = payload.get("profiles") or []
prune_removed = bool(payload.get("pruneRemoved"))
sentinel_payload = payload.get("sentinel") if isinstance(payload.get("sentinel"), dict) else {}
if not profiles:
raise RuntimeError("sync payload has no profiles")
admin_email, token, admin_compliance = login()
@@ -3839,8 +4143,10 @@ def run_sync():
compact_evidence = recent_compact_gateway_evidence()
responses_evidence = recent_responses_gateway_evidence()
runtime_capabilities = validate_runtime_capabilities(token)
sentinel = apply_sentinel_manifest(sentinel_payload.get("manifest"))
sentinel_reassert = reassert_sentinel_freezes_after_sync(token)
return {
"ok": gateway["ok"] is True and responses_smoke["ok"] is True and owner_concurrency["ok"] is True and capacity_status["ok"] is True and load_factor_status["ok"] is True and ws_v2_status["ok"] is True and temp_unschedulable_status["ok"] is True,
"ok": gateway["ok"] is True and responses_smoke["ok"] is True and owner_concurrency["ok"] is True and capacity_status["ok"] is True and load_factor_status["ok"] is True and ws_v2_status["ok"] is True and temp_unschedulable_status["ok"] is True and sentinel.get("ok") is True and sentinel_reassert.get("ok") is True,
"degraded": bool(responses_smoke.get("degraded")) or bool(compact_evidence.get("degraded")) or bool(responses_evidence.get("degraded")) or runtime_capabilities.get("ok") is not True,
"mode": "sync",
"namespace": NAMESPACE,
@@ -3877,6 +4183,7 @@ def run_sync():
},
"ownerBalance": owner_balance,
"ownerConcurrency": owner_concurrency,
"sentinel": {**sentinel, "freezeReassert": sentinel_reassert},
"runtimeCapabilities": runtime_capabilities,
"validation": {"gatewayModels": gateway, "gatewayResponses": responses_smoke, "gatewayResponsesRecent": responses_evidence, "gatewayCompactRecent": compact_evidence},
}
@@ -3901,8 +4208,9 @@ def run_validate():
compact_evidence = recent_compact_gateway_evidence()
responses_evidence = recent_responses_gateway_evidence()
runtime_capabilities = validate_runtime_capabilities(token)
sentinel = sentinel_runtime_status()
return {
"ok": gateway["ok"] is True and responses_smoke["ok"] is True and (owner_concurrency is None or owner_concurrency["ok"] is True) and capacity_status["ok"] is True and load_factor_status["ok"] is True and ws_v2_status["ok"] is True and temp_unschedulable_status["ok"] is True,
"ok": gateway["ok"] is True and responses_smoke["ok"] is True and (owner_concurrency is None or owner_concurrency["ok"] is True) and capacity_status["ok"] is True and load_factor_status["ok"] is True and ws_v2_status["ok"] is True and temp_unschedulable_status["ok"] is True and sentinel.get("ok") is True,
"degraded": bool(responses_smoke.get("degraded")) or bool(compact_evidence.get("degraded")) or bool(responses_evidence.get("degraded")) or runtime_capabilities.get("ok") is not True,
"mode": "validate",
"namespace": NAMESPACE,
@@ -3922,6 +4230,7 @@ def run_validate():
"loadFactor": load_factor_status,
"webSocketsV2": ws_v2_status,
"tempUnschedulable": temp_unschedulable_status,
"sentinel": sentinel,
"runtimeCapabilities": runtime_capabilities,
"validation": {"gatewayModels": gateway, "gatewayResponses": responses_smoke, "gatewayResponsesRecent": responses_evidence, "gatewayCompactRecent": compact_evidence},
}