feat: add sub2api account sentinel
This commit is contained in:
@@ -69,9 +69,17 @@ bun scripts/cli.ts platform-infra sub2api codex-pool cleanup-probes --confirm
|
||||
- `profiles.entries[].tempUnschedulable`: 可选 per-account 临时下线规则覆盖;字段语义以 `docs/reference/platform-infra.md` 为权威。上游 Sub2API 不支持的成功体分类、调度策略或账号冷却行为不要在这里声明。
|
||||
- `profiles.entries[].openaiResponsesWebSocketsV2Mode`: 需要 Responses WebSocket v2 的上游才设置,值为 `off`、`ctx_pool` 或 `passthrough`。
|
||||
- `profiles.entries[].upstreamUserAgent`: 少数要求 Codex CLI User-Agent 的上游才设置,不能含换行。
|
||||
- `sentinel.monitor.enabled`: 账号级 HTTP 200 成功体哨兵监控开关;开启后 `codex-pool sync --confirm` 会在 `platform-infra` 创建/更新 k8s CronJob、ConfigMap、Secret、ServiceAccount、Role 和 RoleBinding。CronJob 直打 YAML-managed 上游账号的 OpenAI Responses `gpt-5.5`,用确定 marker 判定是否出现维护/故障/广告等非预期成功体,并在独立 state ConfigMap 中记录 token/cost 账本。
|
||||
- `sentinel.actions.enabled`: 账号级哨兵冻结/恢复动作开关;默认必须保持 `false`,先监控一段时间,确认 marker 判定正确率、token 消耗和误报率后再改为 `true`。动作关闭时只记录 `would-freeze`,不会调用 Sub2API admin API 改 `schedulable`。
|
||||
- `sentinel.probe.maxOutputTokens`: 哨兵 OpenAI Responses 请求的硬输出上限,必须保持小值;不要只靠 prompt 要求模型少输出。哨兵不限制并发和每轮账号数,所有到期账号会在同一轮并发探测。
|
||||
- `sentinel.cadence`: 成功信任指数退避配置。当前口径是从 1 分钟开始,连续成功后退避到最大 20 分钟;任意 marker mismatch 清零成功信任并进入冻结退避。
|
||||
- `sentinel.freeze`: 冻结 TTL 指数退避配置。当前口径是初始 10 分钟,失败后 `10m -> 20m -> 40m -> 80m -> 120m`,最大 2 小时;冻结到期后只做恢复 probe,通过才自动恢复,不能仅靠 TTL 到期解封。
|
||||
- `sentinel.pricing`: 直打上游时哨兵自己的 token/cost 估算价格。因为 direct upstream probe 不经过 Sub2API 普通用量账本,哨兵必须自己记录全局与 per-account token/cost;这些账本只用于观察,不作为跳过探测的预算门禁。
|
||||
|
||||
`sync --confirm` 会登录 Sub2API admin、创建/更新 group、创建/更新 YAML 中的 `unidesk-codex-*` accounts、创建/复用统一 API key Secret,并把 managed account 的 `schedulable=true` 恢复为过程控制基线;它默认不删除 YAML 中缺席的 managed account。只有明确退役上游时才使用 `sync --confirm --prune-removed` 删除缺席且 `extra.unidesk_managed=true` 的 `unidesk-codex-*` account。
|
||||
|
||||
`sync --confirm` 同时会按 YAML 渲染账号级哨兵资源。哨兵默认使用 `sentinel.monitor.enabled=true` + `sentinel.actions.enabled=false` 的观察模式;如果后续要开启自动冻结/恢复,只改 `sentinel.actions.enabled=true` 后重新 `codex-pool sync --confirm`,不要手工 patch CronJob、Secret 或 Sub2API account。打开动作前必须先用 `codex-pool validate --full` 或 k8s state 证据确认近期 `lastRun` 的 marker mismatch 与实际上游行为一致,且 token/cost 账本可解释、成本可接受。
|
||||
|
||||
`sync --confirm` 和 `validate` 可能超过单次 SSH/runtime 短连接窗口。必须继续使用 `bun scripts/cli.ts platform-infra sub2api codex-pool ...`,由 CLI 在 G14 远端提交作业并短轮询状态;不要改用裸 `trans G14:k3s script` 等一个长连接等待完整结果。若看到 `UNIDESK_SSH_RUNTIME_TIMEOUT`,先按 `docs/reference/platform-infra.md` 的规则处理为控制面可见性问题,修 CLI/job/poll 或重跑受控命令,不要手工 patch Sub2API credentials 或源码。
|
||||
|
||||
不要给 UniDesk-managed Codex accounts 开 Sub2API `pool_mode`。UniDesk 期望的 failover 是把失败账号临时标记为 unschedulable,让同组其他账号接手;`pool_mode` 会重试同一个 account path。
|
||||
|
||||
@@ -144,3 +144,41 @@ localCodex:
|
||||
supportsWebSockets: false
|
||||
responsesWebSocketsV2: false
|
||||
responsesSmokeModel: gpt-5.5
|
||||
sentinel:
|
||||
monitor:
|
||||
enabled: true
|
||||
actions:
|
||||
enabled: false
|
||||
freezeOnMarkerMismatch: true
|
||||
freezeOnTransportError: false
|
||||
schedule: "*/1 * * * *"
|
||||
image: python:3.12-alpine
|
||||
serviceAccountName: sub2api-account-sentinel
|
||||
configMapName: sub2api-account-sentinel-config
|
||||
credentialsSecretName: sub2api-account-sentinel-profiles
|
||||
stateConfigMapName: sub2api-account-sentinel-state
|
||||
cronJobName: sub2api-account-sentinel
|
||||
model: gpt-5.5
|
||||
endpoint: responses
|
||||
marker:
|
||||
prefix: UDSG_OK
|
||||
exact: true
|
||||
probe:
|
||||
timeoutSeconds: 30
|
||||
maxResponseBytes: 8192
|
||||
maxOutputTokens: 12
|
||||
transportRetryMinutes: 5
|
||||
cadence:
|
||||
successInitialIntervalMinutes: 1
|
||||
successMaxIntervalMinutes: 20
|
||||
successBackoffMultiplier: 2
|
||||
jitterPercent: 10
|
||||
freeze:
|
||||
initialTtlMinutes: 10
|
||||
maxTtlMinutes: 120
|
||||
backoffMultiplier: 2
|
||||
jitterPercent: 10
|
||||
pricing:
|
||||
usdPer1MInputTokens: 1.25
|
||||
usdPer1MOutputTokens: 10
|
||||
historyLimit: 200
|
||||
|
||||
@@ -0,0 +1,107 @@
|
||||
import { readFileSync } from "node:fs";
|
||||
import { rmSync, writeFileSync } from "node:fs";
|
||||
import { tmpdir } from "node:os";
|
||||
import { join } from "node:path";
|
||||
import { rootPath } from "./src/config";
|
||||
import {
|
||||
defaultCodexPoolSentinelConfig,
|
||||
readCodexPoolSentinelConfig,
|
||||
renderCodexPoolSentinelManifest,
|
||||
sentinelRunnerPython,
|
||||
} from "./src/platform-infra-sub2api-codex-sentinel";
|
||||
|
||||
function assertCondition(condition: unknown, message: string, detail: unknown = {}): void {
|
||||
if (!condition) throw new Error(`${message}: ${JSON.stringify(detail)}`);
|
||||
}
|
||||
|
||||
const configPath = rootPath("config", "platform-infra", "sub2api-codex-pool.yaml");
|
||||
const parsed = Bun.YAML.parse(readFileSync(configPath, "utf8")) as { sentinel?: unknown };
|
||||
const sentinel = readCodexPoolSentinelConfig(parsed.sentinel, defaultCodexPoolSentinelConfig(), configPath);
|
||||
|
||||
assertCondition(sentinel.monitor.enabled === true, "sentinel monitor must be enabled for observation-first rollout", sentinel);
|
||||
assertCondition(sentinel.actions.enabled === false, "sentinel actions must default off until monitoring quality is reviewed", sentinel);
|
||||
assertCondition(sentinel.actions.freezeOnMarkerMismatch === true, "marker mismatch must be configured as freeze-worthy when actions are enabled", sentinel.actions);
|
||||
assertCondition(sentinel.actions.freezeOnTransportError === false, "transport errors must not freeze by default", sentinel.actions);
|
||||
assertCondition(sentinel.endpoint === "responses", "v1 sentinel must target OpenAI Responses only", sentinel);
|
||||
assertCondition(sentinel.model === "gpt-5.5", "v1 sentinel must use GPT-5.5", sentinel);
|
||||
assertCondition(sentinel.probe.maxOutputTokens > 0 && sentinel.probe.maxOutputTokens <= 16, "sentinel maxOutputTokens must be tightly capped", sentinel.probe);
|
||||
assertCondition(!("concurrency" in sentinel.probe), "sentinel must not cap probe concurrency; all due accounts are probed concurrently", sentinel.probe);
|
||||
assertCondition(!("maxAccountsPerRun" in sentinel.probe), "sentinel must not cap accounts per run; all due accounts are eligible", sentinel.probe);
|
||||
assertCondition(sentinel.cadence.successInitialIntervalMinutes === 1, "success trust backoff must start at 1 minute", sentinel.cadence);
|
||||
assertCondition(sentinel.cadence.successMaxIntervalMinutes === 20, "success trust backoff must cap at 20 minutes", sentinel.cadence);
|
||||
assertCondition(sentinel.freeze.initialTtlMinutes === 10, "freeze backoff must start at 10 minutes", sentinel.freeze);
|
||||
assertCondition(sentinel.freeze.maxTtlMinutes === 120, "freeze backoff must cap at 2 hours", sentinel.freeze);
|
||||
assertCondition(!("budget" in sentinel), "sentinel must not use token budgets as a probe gate; usage is recorded only", sentinel);
|
||||
|
||||
const manifest = renderCodexPoolSentinelManifest(sentinel, [
|
||||
{
|
||||
accountName: "unidesk-codex-example",
|
||||
profile: "example",
|
||||
baseUrl: "https://example.invalid/v1",
|
||||
apiKey: "sk-test-secret",
|
||||
upstreamUserAgent: null,
|
||||
},
|
||||
], {
|
||||
namespace: "platform-infra",
|
||||
serviceName: "sub2api",
|
||||
serviceDns: "sub2api.platform-infra.svc.cluster.local:8080",
|
||||
appSecretName: "sub2api-secrets",
|
||||
});
|
||||
|
||||
assertCondition(manifest.includes("kind: CronJob"), "sentinel manifest must render a CronJob", manifest.slice(0, 1000));
|
||||
assertCondition(manifest.includes("concurrencyPolicy: Forbid"), "sentinel CronJob must forbid overlapping runs", manifest);
|
||||
assertCondition(manifest.includes("suspend: false"), "monitor.enabled=true must unsuspend the CronJob", manifest);
|
||||
assertCondition(manifest.includes("kind: ServiceAccount") && manifest.includes("kind: Role") && manifest.includes("kind: RoleBinding"), "sentinel manifest must include minimal RBAC", manifest);
|
||||
assertCondition(manifest.includes("sub2api-account-sentinel-state"), "sentinel manifest must reference the state ConfigMap", manifest);
|
||||
assertCondition(manifest.includes("\"enabled\": false"), "sentinel manifest must preserve actions.enabled=false in config.json", manifest);
|
||||
assertCondition(!manifest.includes("sk-test-secret"), "sentinel manifest must not expose upstream credentials as plaintext", manifest);
|
||||
assertCondition(manifest.includes("profiles.json:"), "sentinel credentials Secret must include the profiles payload as Secret data", manifest);
|
||||
assertCondition(manifest.includes("\"budgetMode\": \"record-only\""), "sentinel runner must expose record-only budget/accounting mode", manifest);
|
||||
assertCondition(manifest.includes("max_workers=max(1, len(due))"), "sentinel runner must probe all due accounts concurrently", manifest);
|
||||
|
||||
const disabledMonitor = {
|
||||
...sentinel,
|
||||
monitor: { enabled: false },
|
||||
actions: { ...sentinel.actions, enabled: false },
|
||||
};
|
||||
const suspendedManifest = renderCodexPoolSentinelManifest(disabledMonitor, [], {
|
||||
namespace: "platform-infra",
|
||||
serviceName: "sub2api",
|
||||
serviceDns: "sub2api.platform-infra.svc.cluster.local:8080",
|
||||
appSecretName: "sub2api-secrets",
|
||||
});
|
||||
assertCondition(suspendedManifest.includes("suspend: true"), "monitor.enabled=false must suspend the CronJob", suspendedManifest);
|
||||
|
||||
const pythonPath = join(tmpdir(), `sub2api-account-sentinel-${process.pid}.py`);
|
||||
writeFileSync(pythonPath, sentinelRunnerPython(), "utf8");
|
||||
try {
|
||||
const pyCompile = Bun.spawnSync(["python3", "-m", "py_compile", pythonPath], {
|
||||
stdout: "pipe",
|
||||
stderr: "pipe",
|
||||
});
|
||||
assertCondition(pyCompile.exitCode === 0, "sentinel runner python must compile", {
|
||||
exitCode: pyCompile.exitCode,
|
||||
stdout: pyCompile.stdout.toString(),
|
||||
stderr: pyCompile.stderr.toString(),
|
||||
});
|
||||
} finally {
|
||||
rmSync(pythonPath, { force: true });
|
||||
}
|
||||
|
||||
console.log(JSON.stringify({
|
||||
ok: true,
|
||||
checks: [
|
||||
"sentinel has independent monitor/actions YAML switches",
|
||||
"observation-first rollout keeps actions disabled",
|
||||
"v1 scope is OpenAI Responses + GPT-5.5",
|
||||
"probe max_output_tokens is tightly capped",
|
||||
"probe concurrency is not artificially capped",
|
||||
"budget is record-only and does not gate probes",
|
||||
"success trust backoff is 1m to 20m",
|
||||
"freeze backoff is 10m to 120m",
|
||||
"CronJob is k8s-native with Forbid concurrency and minimal RBAC",
|
||||
"monitor switch controls CronJob suspend state",
|
||||
"rendered Secret avoids plaintext upstream credentials",
|
||||
"embedded Python runner compiles",
|
||||
],
|
||||
}));
|
||||
@@ -537,6 +537,7 @@ export async function runChecks(config: UniDeskConfig, options: CheckOptions = d
|
||||
items.push(await commandItem("playwright:cli-wrapper-contract", ["bun", "scripts/playwright-cli-contract-test.ts"], 30_000, process.env, options.checkHeartbeatMs));
|
||||
items.push(await commandItem("platform-infra:sub2api-codex-local-config-contract", ["bun", "scripts/platform-infra-sub2api-codex-local-config-contract-test.ts"], 30_000, process.env, options.checkHeartbeatMs));
|
||||
items.push(await commandItem("platform-infra:sub2api-codex-routing-contract", ["bun", "scripts/platform-infra-sub2api-codex-routing-contract-test.ts"], 30_000, process.env, options.checkHeartbeatMs));
|
||||
items.push(await commandItem("platform-infra:sub2api-codex-sentinel-contract", ["bun", "scripts/platform-infra-sub2api-codex-sentinel-contract-test.ts"], 30_000, process.env, options.checkHeartbeatMs));
|
||||
items.push(await commandItem("platform-infra:sub2api-codex-temp-unsched-contract", ["bun", "scripts/platform-infra-sub2api-codex-temp-unsched-contract-test.ts"], 30_000, process.env, options.checkHeartbeatMs));
|
||||
items.push(await commandItem("platform-infra:sub2api-http-upstream-contract", ["bun", "scripts/platform-infra-sub2api-http-upstream-contract-test.ts"], 30_000, process.env, options.checkHeartbeatMs));
|
||||
items.push(await commandItem("auth-broker:p0-contract", ["bun", "scripts/auth-broker-contract-test.ts"], 30_000, process.env, options.checkHeartbeatMs));
|
||||
@@ -579,6 +580,7 @@ export async function runChecks(config: UniDeskConfig, options: CheckOptions = d
|
||||
items.push(skippedItem("server:cleanup-plan-contract", "Server cleanup dry-run contract is opt-in with script checks", "--scripts-typecheck or --full"));
|
||||
items.push(skippedItem("check:gh-contract-scope-contract", "Check option scope contract is opt-in with script checks", "--scripts-typecheck or --full"));
|
||||
items.push(skippedItem("playwright:cli-wrapper-contract", "Playwright wrapper/headless/session contract is opt-in with script checks", "--scripts-typecheck or --full"));
|
||||
items.push(skippedItem("platform-infra:sub2api-codex-sentinel-contract", "Sub2API Codex account sentinel contract is opt-in with script checks", "--scripts-typecheck or --full"));
|
||||
items.push(skippedItem("auth-broker:p0-contract", "Auth Broker P0 skeleton and CLI adapter contract is opt-in with script checks", "--scripts-typecheck or --full"));
|
||||
items.push(skippedItem("d601:recovery-guardrails-contract", "D601 recovery guardrails fixture contract is opt-in with script checks", "--scripts-typecheck or --full"));
|
||||
items.push(skippedItem("hwlab:cd-wrapper-contract", "HWLAB DEV CD wrapper contract is opt-in with script checks", "--scripts-typecheck or --full"));
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -4,6 +4,14 @@ import { homedir } from "node:os";
|
||||
import { join } from "node:path";
|
||||
import type { UniDeskConfig } from "./config";
|
||||
import { rootPath } from "./config";
|
||||
import {
|
||||
codexPoolSentinelSummary,
|
||||
defaultCodexPoolSentinelConfig,
|
||||
readCodexPoolSentinelConfig,
|
||||
renderCodexPoolSentinelManifest,
|
||||
type CodexPoolSentinelConfig,
|
||||
type CodexPoolSentinelProfileSecret,
|
||||
} from "./platform-infra-sub2api-codex-sentinel";
|
||||
import { runSshCommandCapture, type SshCaptureResult } from "./ssh";
|
||||
|
||||
const g14K3sRoute = "G14:k3s";
|
||||
@@ -89,6 +97,7 @@ interface CodexPoolConfig {
|
||||
profiles: CodexPoolProfileConfig[];
|
||||
publicExposure: CodexPoolPublicExposureConfig;
|
||||
localCodex: CodexPoolLocalCodexConfig;
|
||||
sentinel: CodexPoolSentinelConfig;
|
||||
}
|
||||
|
||||
interface CodexPoolProfileConfig {
|
||||
@@ -174,6 +183,8 @@ export function codexPoolHelp(): unknown {
|
||||
poolApiKeySecretKey: pool.apiKeySecretKey,
|
||||
publicBaseUrl: pool.publicExposure.enabled ? pool.publicExposure.publicBaseUrl : null,
|
||||
masterBaseUrl: pool.publicExposure.enabled ? pool.publicExposure.masterBaseUrl : null,
|
||||
sentinelMonitorEnabled: pool.sentinel.monitor.enabled,
|
||||
sentinelActionsEnabled: pool.sentinel.actions.enabled,
|
||||
secretValuesPrinted: false,
|
||||
},
|
||||
};
|
||||
@@ -243,6 +254,9 @@ function codexPoolPlan(options: DisclosureOptions = { full: false, raw: false })
|
||||
accountType: "openai/apikey",
|
||||
grouping: `All discovered Codex profiles are bound to one Sub2API group named ${pool.groupName}.`,
|
||||
unifiedApiKey: `The client-facing API_KEY is controlled by k3s Secret ${namespace}/${pool.apiKeySecretName}.${pool.apiKeySecretKey}.`,
|
||||
sentinel: pool.sentinel.monitor.enabled
|
||||
? `Account sentinel is enabled as k8s CronJob ${namespace}/${pool.sentinel.cronJobName}; actions.enabled=${pool.sentinel.actions.enabled}.`
|
||||
: "Account sentinel monitoring is disabled by YAML.",
|
||||
publicExposure: pool.publicExposure.enabled
|
||||
? `Default Codex consumers use ${codexConsumerBaseUrl(pool)}; bounded master-local probes may use ${pool.publicExposure.masterBaseUrl}. FRP proxy ${pool.publicExposure.proxyName} maps public ${pool.publicExposure.publicBaseUrl} to ${pool.publicExposure.localIP}:${pool.publicExposure.localPort}.`
|
||||
: "Public FRP exposure is disabled by YAML.",
|
||||
@@ -272,6 +286,15 @@ async function codexPoolSync(config: UniDeskConfig, options: SyncOptions): Promi
|
||||
|
||||
const payload = {
|
||||
pruneRemoved: options.pruneRemoved,
|
||||
sentinel: {
|
||||
manifest: renderCodexPoolSentinelManifest(pool.sentinel, sentinelProfileSecrets(profiles), {
|
||||
namespace,
|
||||
serviceName,
|
||||
serviceDns,
|
||||
appSecretName,
|
||||
}),
|
||||
summary: codexPoolSentinelSummary(pool.sentinel),
|
||||
},
|
||||
pool: {
|
||||
groupName: pool.groupName,
|
||||
apiKeyName: pool.apiKeyName,
|
||||
@@ -656,6 +679,7 @@ function readCodexPoolConfig(): CodexPoolConfig {
|
||||
profiles,
|
||||
publicExposure: readPublicExposureConfig(parsed.publicExposure, defaults.publicExposure),
|
||||
localCodex: readLocalCodexConfig(parsed.localCodex, defaults.localCodex),
|
||||
sentinel: readCodexPoolSentinelConfig(parsed.sentinel, defaults.sentinel, codexPoolConfigPath),
|
||||
};
|
||||
validateKubernetesName(config.groupName, "pool.groupName", false);
|
||||
validateKubernetesName(config.apiKeySecretName, "pool.apiKeySecretName", true);
|
||||
@@ -717,6 +741,7 @@ function defaultCodexPoolConfig(): CodexPoolConfig {
|
||||
responsesWebSocketsV2: true,
|
||||
responsesSmokeModel: "gpt-5.5",
|
||||
},
|
||||
sentinel: defaultCodexPoolSentinelConfig(),
|
||||
};
|
||||
}
|
||||
|
||||
@@ -1190,6 +1215,7 @@ function codexPoolConfigSummary(pool: CodexPoolConfig): Record<string, unknown>
|
||||
profileCount: pool.profiles.length,
|
||||
publicExposure: publicExposureSummary(pool),
|
||||
localCodex: pool.localCodex,
|
||||
sentinel: codexPoolSentinelSummary(pool.sentinel),
|
||||
disclosure: {
|
||||
full: "bun scripts/cli.ts platform-infra sub2api codex-pool plan --full",
|
||||
},
|
||||
@@ -1432,6 +1458,67 @@ function compactGatewayCompactRecent(block: unknown): unknown {
|
||||
};
|
||||
}
|
||||
|
||||
function compactSentinelStatus(block: unknown): unknown {
|
||||
if (!isRecord(block)) return block;
|
||||
const runtime = isRecord(block.runtime) ? block.runtime : block;
|
||||
const desired = isRecord(runtime.desired) ? runtime.desired : {};
|
||||
const cronJob = isRecord(runtime.cronJob) ? runtime.cronJob : {};
|
||||
const secret = isRecord(runtime.secret) ? runtime.secret : {};
|
||||
const configMap = isRecord(runtime.configMap) ? runtime.configMap : {};
|
||||
const state = isRecord(runtime.state) ? runtime.state : {};
|
||||
const freezeReassert = isRecord(block.freezeReassert) ? block.freezeReassert : {};
|
||||
return {
|
||||
ok: block.ok,
|
||||
action: block.action,
|
||||
desired: {
|
||||
monitorEnabled: desired.monitorEnabled,
|
||||
actionsEnabled: desired.actionsEnabled,
|
||||
schedule: desired.schedule,
|
||||
cronJobName: desired.cronJobName,
|
||||
configMapName: desired.configMapName,
|
||||
credentialsSecretName: desired.credentialsSecretName,
|
||||
stateConfigMapName: desired.stateConfigMapName,
|
||||
},
|
||||
cronJob: {
|
||||
exists: cronJob.exists,
|
||||
schedule: cronJob.schedule,
|
||||
suspend: cronJob.suspend,
|
||||
lastScheduleTime: cronJob.lastScheduleTime,
|
||||
active: cronJob.active,
|
||||
error: cronJob.error,
|
||||
},
|
||||
secret: {
|
||||
exists: secret.exists,
|
||||
profileSecretPresent: secret.profileSecretPresent,
|
||||
valuesPrinted: false,
|
||||
error: secret.error,
|
||||
},
|
||||
configMap: {
|
||||
exists: configMap.exists,
|
||||
configPresent: configMap.configPresent,
|
||||
runnerPresent: configMap.runnerPresent,
|
||||
error: configMap.error,
|
||||
},
|
||||
state: {
|
||||
exists: state.exists,
|
||||
accountCount: state.accountCount,
|
||||
quarantinedCount: state.quarantinedCount,
|
||||
quarantined: state.quarantined,
|
||||
recentAccounts: state.recentAccounts,
|
||||
lastRun: state.lastRun,
|
||||
error: state.error,
|
||||
},
|
||||
freezeReassert: Object.keys(freezeReassert).length > 0 ? {
|
||||
ok: freezeReassert.ok,
|
||||
skipped: freezeReassert.skipped,
|
||||
reason: freezeReassert.reason,
|
||||
itemCount: freezeReassert.itemCount,
|
||||
attentionItems: freezeReassert.attentionItems,
|
||||
} : undefined,
|
||||
valuesPrinted: false,
|
||||
};
|
||||
}
|
||||
|
||||
function codexPoolValidationSummary(parsed: Record<string, unknown> | null): Record<string, unknown> | null {
|
||||
if (parsed === null) return null;
|
||||
const validation = isRecord(parsed.validation) ? parsed.validation : {};
|
||||
@@ -1451,6 +1538,7 @@ function codexPoolValidationSummary(parsed: Record<string, unknown> | null): Rec
|
||||
loadFactor: compactStatusBlock(parsed.loadFactor, ["accountName", "accountId", "expectedLoadFactor", "runtimeLoadFactor", "priority", "status", "schedulable", "ok"]),
|
||||
webSocketsV2: compactStatusBlock(parsed.webSocketsV2, ["accountName", "accountId", "expectedMode", "runtimeMode", "runtimeEnabled", "status", "schedulable", "ok"]),
|
||||
tempUnschedulable: compactTempUnschedulableStatus(parsed.tempUnschedulable),
|
||||
sentinel: compactSentinelStatus(parsed.sentinel),
|
||||
runtimeCapabilities: {
|
||||
ok: runtimeCapabilities.ok,
|
||||
runtimeImage: runtimeCapabilities.runtimeImage,
|
||||
@@ -1528,10 +1616,28 @@ function poolTarget(pool = readCodexPoolConfig()): Record<string, unknown> {
|
||||
defaultAccountPriority: pool.defaultAccountPriority,
|
||||
defaultAccountCapacity: pool.defaultAccountCapacity,
|
||||
defaultAccountLoadFactor: pool.defaultAccountLoadFactor,
|
||||
sentinel: {
|
||||
monitorEnabled: pool.sentinel.monitor.enabled,
|
||||
actionsEnabled: pool.sentinel.actions.enabled,
|
||||
cronJobName: pool.sentinel.cronJobName,
|
||||
stateConfigMapName: pool.sentinel.stateConfigMapName,
|
||||
},
|
||||
valuesPrinted: false,
|
||||
};
|
||||
}
|
||||
|
||||
function sentinelProfileSecrets(profiles: CodexProfile[]): CodexPoolSentinelProfileSecret[] {
|
||||
return profiles
|
||||
.filter((profile) => profile.ok && profile.apiKey !== null && profile.apiKey.length > 0)
|
||||
.map((profile) => ({
|
||||
accountName: profile.accountName,
|
||||
profile: profile.profile,
|
||||
baseUrl: profile.baseUrl,
|
||||
apiKey: profile.apiKey ?? "",
|
||||
upstreamUserAgent: profile.upstreamUserAgent,
|
||||
}));
|
||||
}
|
||||
|
||||
function publicExposureSummary(pool: CodexPoolConfig): Record<string, unknown> {
|
||||
return {
|
||||
enabled: pool.publicExposure.enabled,
|
||||
@@ -2262,6 +2368,7 @@ import string
|
||||
import subprocess
|
||||
import sys
|
||||
import time
|
||||
from datetime import datetime
|
||||
from urllib.parse import quote
|
||||
|
||||
NAMESPACE = "${namespace}"
|
||||
@@ -2284,6 +2391,7 @@ EXPECTED_ACCOUNT_CAPACITIES = ${JSON.stringify(desiredAccountCapacityMap(pool))}
|
||||
EXPECTED_ACCOUNT_LOAD_FACTORS = ${JSON.stringify(desiredAccountLoadFactorMap(pool))}
|
||||
EXPECTED_ACCOUNT_WS_MODES = json.loads(${JSON.stringify(JSON.stringify(desiredAccountWebSocketsV2ModeMap(pool)))})
|
||||
EXPECTED_ACCOUNT_TEMP_UNSCHEDULABLE = json.loads(${JSON.stringify(JSON.stringify(desiredAccountTempUnschedulableMap(pool)))})
|
||||
SENTINEL_CONFIG = json.loads(${JSON.stringify(JSON.stringify(pool.sentinel))})
|
||||
MODE = "${mode}"
|
||||
PAYLOAD_B64 = "${encodedPayload}"
|
||||
|
||||
@@ -2726,6 +2834,201 @@ def ensure_api_key_secret(group_id):
|
||||
raise RuntimeError(f"apply API key secret failed: {text(proc.stderr, 1000)}")
|
||||
return api_key, secret_action, text(proc.stdout, 1000)
|
||||
|
||||
def apply_sentinel_manifest(manifest):
|
||||
if not isinstance(manifest, str) or not manifest.strip():
|
||||
return {
|
||||
"ok": False,
|
||||
"action": "missing-manifest",
|
||||
"valuesPrinted": False,
|
||||
}
|
||||
proc = kubectl(["apply", "--server-side", "--force-conflicts", f"--field-manager={FIELD_MANAGER}", "-f", "-"], manifest)
|
||||
if proc.returncode != 0:
|
||||
return {
|
||||
"ok": False,
|
||||
"action": "apply-failed",
|
||||
"stdoutTail": text(proc.stdout, 2000),
|
||||
"stderrTail": text(proc.stderr, 4000),
|
||||
"valuesPrinted": False,
|
||||
}
|
||||
status = sentinel_runtime_status()
|
||||
return {
|
||||
"ok": status.get("ok") is True,
|
||||
"action": "applied",
|
||||
"stdoutTail": text(proc.stdout, 2000),
|
||||
"runtime": status,
|
||||
"valuesPrinted": False,
|
||||
}
|
||||
|
||||
def safe_kube_json(args, label):
|
||||
proc = kubectl([*args, "-o", "json"])
|
||||
if proc.returncode != 0:
|
||||
return None, {"label": label, "exitCode": proc.returncode, "stderrTail": text(proc.stderr, 1000)}
|
||||
try:
|
||||
return json.loads(proc.stdout.decode("utf-8")), None
|
||||
except Exception as exc:
|
||||
return None, {"label": label, "exitCode": proc.returncode, "error": str(exc), "stdoutTail": text(proc.stdout, 1000)}
|
||||
|
||||
def sentinel_runtime_status():
|
||||
cfg = SENTINEL_CONFIG
|
||||
cronjob_name = cfg.get("cronJobName")
|
||||
secret_name = cfg.get("credentialsSecretName")
|
||||
configmap_name = cfg.get("configMapName")
|
||||
state_name = cfg.get("stateConfigMapName")
|
||||
cronjob, cronjob_error = safe_kube_json(["-n", NAMESPACE, "get", "cronjob", cronjob_name], f"cronjob/{cronjob_name}")
|
||||
secret, secret_error = safe_kube_json(["-n", NAMESPACE, "get", "secret", secret_name], f"secret/{secret_name}")
|
||||
configmap, configmap_error = safe_kube_json(["-n", NAMESPACE, "get", "configmap", configmap_name], f"configmap/{configmap_name}")
|
||||
state_cm, state_error = safe_kube_json(["-n", NAMESPACE, "get", "configmap", state_name], f"configmap/{state_name}")
|
||||
state = None
|
||||
if isinstance(state_cm, dict):
|
||||
raw_state = (state_cm.get("data") or {}).get("state.json")
|
||||
if isinstance(raw_state, str) and raw_state:
|
||||
try:
|
||||
state = json.loads(raw_state)
|
||||
except Exception as exc:
|
||||
state = {"parseError": str(exc)}
|
||||
accounts = (state.get("accounts") or {}) if isinstance(state, dict) else {}
|
||||
quarantined = []
|
||||
recent_accounts = []
|
||||
for name, account_state in accounts.items():
|
||||
if not isinstance(account_state, dict):
|
||||
continue
|
||||
quarantine = account_state.get("quarantine")
|
||||
if isinstance(quarantine, dict) and quarantine.get("active") is True:
|
||||
quarantined.append({
|
||||
"accountName": name,
|
||||
"until": quarantine.get("until"),
|
||||
"applied": quarantine.get("applied"),
|
||||
"reason": quarantine.get("reason"),
|
||||
"intervalMinutes": quarantine.get("intervalMinutes"),
|
||||
})
|
||||
last_probe = account_state.get("lastProbe")
|
||||
if isinstance(last_probe, dict):
|
||||
recent_accounts.append({
|
||||
"accountName": name,
|
||||
"lastProbeAt": account_state.get("lastProbeAt"),
|
||||
"lastStatus": account_state.get("lastStatus"),
|
||||
"nextProbeAfter": account_state.get("nextProbeAfter"),
|
||||
"ok": last_probe.get("ok"),
|
||||
"purpose": last_probe.get("purpose"),
|
||||
"httpStatus": last_probe.get("httpStatus"),
|
||||
"durationMs": last_probe.get("durationMs"),
|
||||
"markerMatched": last_probe.get("markerMatched"),
|
||||
"outputHash": last_probe.get("outputHash"),
|
||||
"outputPreview": last_probe.get("outputPreview"),
|
||||
"usage": last_probe.get("usage"),
|
||||
"action": last_probe.get("action"),
|
||||
})
|
||||
recent_accounts.sort(key=lambda item: item.get("lastProbeAt") or "")
|
||||
last_run = state.get("lastRun") if isinstance(state, dict) else None
|
||||
cronjob_spec = cronjob.get("spec") if isinstance(cronjob, dict) else {}
|
||||
secret_data = secret.get("data") if isinstance(secret, dict) else {}
|
||||
configmap_data = configmap.get("data") if isinstance(configmap, dict) else {}
|
||||
ok = cronjob is not None and secret is not None and configmap is not None
|
||||
return {
|
||||
"ok": ok,
|
||||
"desired": {
|
||||
"monitorEnabled": cfg.get("monitor", {}).get("enabled"),
|
||||
"actionsEnabled": cfg.get("actions", {}).get("enabled"),
|
||||
"schedule": cfg.get("schedule"),
|
||||
"cronJobName": cronjob_name,
|
||||
"configMapName": configmap_name,
|
||||
"credentialsSecretName": secret_name,
|
||||
"stateConfigMapName": state_name,
|
||||
},
|
||||
"cronJob": {
|
||||
"exists": cronjob is not None,
|
||||
"schedule": cronjob_spec.get("schedule") if isinstance(cronjob_spec, dict) else None,
|
||||
"suspend": cronjob_spec.get("suspend") if isinstance(cronjob_spec, dict) else None,
|
||||
"lastScheduleTime": (cronjob.get("status") or {}).get("lastScheduleTime") if isinstance(cronjob, dict) else None,
|
||||
"active": len((cronjob.get("status") or {}).get("active") or []) if isinstance(cronjob, dict) else None,
|
||||
"error": cronjob_error,
|
||||
},
|
||||
"secret": {
|
||||
"exists": secret is not None,
|
||||
"profileSecretPresent": isinstance(secret_data, dict) and "profiles.json" in secret_data,
|
||||
"valuesPrinted": False,
|
||||
"error": secret_error,
|
||||
},
|
||||
"configMap": {
|
||||
"exists": configmap is not None,
|
||||
"configPresent": isinstance(configmap_data, dict) and "config.json" in configmap_data,
|
||||
"runnerPresent": isinstance(configmap_data, dict) and "sentinel.py" in configmap_data,
|
||||
"error": configmap_error,
|
||||
},
|
||||
"state": {
|
||||
"exists": state_cm is not None,
|
||||
"accountCount": len(accounts),
|
||||
"quarantinedCount": len(quarantined),
|
||||
"quarantined": quarantined[-10:],
|
||||
"recentAccounts": recent_accounts[-12:],
|
||||
"lastRun": last_run,
|
||||
"error": state_error,
|
||||
},
|
||||
"valuesPrinted": False,
|
||||
}
|
||||
|
||||
def parse_epoch_z(value):
|
||||
if not isinstance(value, str) or not value:
|
||||
return None
|
||||
try:
|
||||
return datetime.fromisoformat(value.replace("Z", "+00:00")).timestamp()
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
def sentinel_state_object():
|
||||
state_name = SENTINEL_CONFIG.get("stateConfigMapName")
|
||||
if not state_name:
|
||||
return None
|
||||
obj, err = safe_kube_json(["-n", NAMESPACE, "get", "configmap", state_name], f"configmap/{state_name}")
|
||||
if not isinstance(obj, dict):
|
||||
return None
|
||||
raw_state = (obj.get("data") or {}).get("state.json")
|
||||
if not isinstance(raw_state, str) or not raw_state:
|
||||
return None
|
||||
try:
|
||||
return json.loads(raw_state)
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
def reassert_sentinel_freezes_after_sync(token):
|
||||
if (SENTINEL_CONFIG.get("actions") or {}).get("enabled") is not True:
|
||||
return {"ok": True, "skipped": True, "reason": "actions-disabled", "items": [], "valuesPrinted": False}
|
||||
state = sentinel_state_object()
|
||||
if not isinstance(state, dict):
|
||||
return {"ok": True, "skipped": True, "reason": "state-missing", "items": [], "valuesPrinted": False}
|
||||
accounts_state = state.get("accounts") if isinstance(state.get("accounts"), dict) else {}
|
||||
accounts = list_accounts(token)
|
||||
by_name = {item.get("name"): item for item in accounts if isinstance(item.get("name"), str)}
|
||||
now_epoch = time.time()
|
||||
items = []
|
||||
for name, account_state in accounts_state.items():
|
||||
if not isinstance(account_state, dict):
|
||||
continue
|
||||
quarantine = account_state.get("quarantine")
|
||||
if not isinstance(quarantine, dict) or quarantine.get("active") is not True or quarantine.get("applied") is not True:
|
||||
continue
|
||||
until_epoch = parse_epoch_z(quarantine.get("until"))
|
||||
if until_epoch is not None and until_epoch <= now_epoch:
|
||||
continue
|
||||
account = by_name.get(name)
|
||||
if not account or account.get("id") is None:
|
||||
items.append({"accountName": name, "ok": False, "reason": "account-not-found"})
|
||||
continue
|
||||
try:
|
||||
ensure_success(
|
||||
curl_api("POST", f"/api/v1/admin/accounts/{account['id']}/schedulable", bearer=token, payload={"schedulable": False}),
|
||||
f"reassert sentinel freeze for {name}",
|
||||
)
|
||||
items.append({"accountName": name, "accountId": account.get("id"), "ok": True, "until": quarantine.get("until")})
|
||||
except Exception as exc:
|
||||
items.append({"accountName": name, "accountId": account.get("id"), "ok": False, "error": str(exc)})
|
||||
return {
|
||||
"ok": all(item.get("ok") is True for item in items),
|
||||
"skipped": False,
|
||||
"items": items,
|
||||
"valuesPrinted": False,
|
||||
}
|
||||
|
||||
def list_user_keys(token):
|
||||
data = ensure_success(curl_api("GET", "/api/v1/keys?page=1&page_size=200", bearer=token), "list user keys")
|
||||
return extract_items(data)
|
||||
@@ -3818,6 +4121,7 @@ def run_sync():
|
||||
payload = json.loads(base64.b64decode(PAYLOAD_B64).decode("utf-8"))
|
||||
profiles = payload.get("profiles") or []
|
||||
prune_removed = bool(payload.get("pruneRemoved"))
|
||||
sentinel_payload = payload.get("sentinel") if isinstance(payload.get("sentinel"), dict) else {}
|
||||
if not profiles:
|
||||
raise RuntimeError("sync payload has no profiles")
|
||||
admin_email, token, admin_compliance = login()
|
||||
@@ -3839,8 +4143,10 @@ def run_sync():
|
||||
compact_evidence = recent_compact_gateway_evidence()
|
||||
responses_evidence = recent_responses_gateway_evidence()
|
||||
runtime_capabilities = validate_runtime_capabilities(token)
|
||||
sentinel = apply_sentinel_manifest(sentinel_payload.get("manifest"))
|
||||
sentinel_reassert = reassert_sentinel_freezes_after_sync(token)
|
||||
return {
|
||||
"ok": gateway["ok"] is True and responses_smoke["ok"] is True and owner_concurrency["ok"] is True and capacity_status["ok"] is True and load_factor_status["ok"] is True and ws_v2_status["ok"] is True and temp_unschedulable_status["ok"] is True,
|
||||
"ok": gateway["ok"] is True and responses_smoke["ok"] is True and owner_concurrency["ok"] is True and capacity_status["ok"] is True and load_factor_status["ok"] is True and ws_v2_status["ok"] is True and temp_unschedulable_status["ok"] is True and sentinel.get("ok") is True and sentinel_reassert.get("ok") is True,
|
||||
"degraded": bool(responses_smoke.get("degraded")) or bool(compact_evidence.get("degraded")) or bool(responses_evidence.get("degraded")) or runtime_capabilities.get("ok") is not True,
|
||||
"mode": "sync",
|
||||
"namespace": NAMESPACE,
|
||||
@@ -3877,6 +4183,7 @@ def run_sync():
|
||||
},
|
||||
"ownerBalance": owner_balance,
|
||||
"ownerConcurrency": owner_concurrency,
|
||||
"sentinel": {**sentinel, "freezeReassert": sentinel_reassert},
|
||||
"runtimeCapabilities": runtime_capabilities,
|
||||
"validation": {"gatewayModels": gateway, "gatewayResponses": responses_smoke, "gatewayResponsesRecent": responses_evidence, "gatewayCompactRecent": compact_evidence},
|
||||
}
|
||||
@@ -3901,8 +4208,9 @@ def run_validate():
|
||||
compact_evidence = recent_compact_gateway_evidence()
|
||||
responses_evidence = recent_responses_gateway_evidence()
|
||||
runtime_capabilities = validate_runtime_capabilities(token)
|
||||
sentinel = sentinel_runtime_status()
|
||||
return {
|
||||
"ok": gateway["ok"] is True and responses_smoke["ok"] is True and (owner_concurrency is None or owner_concurrency["ok"] is True) and capacity_status["ok"] is True and load_factor_status["ok"] is True and ws_v2_status["ok"] is True and temp_unschedulable_status["ok"] is True,
|
||||
"ok": gateway["ok"] is True and responses_smoke["ok"] is True and (owner_concurrency is None or owner_concurrency["ok"] is True) and capacity_status["ok"] is True and load_factor_status["ok"] is True and ws_v2_status["ok"] is True and temp_unschedulable_status["ok"] is True and sentinel.get("ok") is True,
|
||||
"degraded": bool(responses_smoke.get("degraded")) or bool(compact_evidence.get("degraded")) or bool(responses_evidence.get("degraded")) or runtime_capabilities.get("ok") is not True,
|
||||
"mode": "validate",
|
||||
"namespace": NAMESPACE,
|
||||
@@ -3922,6 +4230,7 @@ def run_validate():
|
||||
"loadFactor": load_factor_status,
|
||||
"webSocketsV2": ws_v2_status,
|
||||
"tempUnschedulable": temp_unschedulable_status,
|
||||
"sentinel": sentinel,
|
||||
"runtimeCapabilities": runtime_capabilities,
|
||||
"validation": {"gatewayModels": gateway, "gatewayResponses": responses_smoke, "gatewayResponsesRecent": responses_evidence, "gatewayCompactRecent": compact_evidence},
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user