From 3a8681f458c7fc1dc8c080a1e4709fa5dc032cd9 Mon Sep 17 00:00:00 2001 From: Codex Date: Thu, 2 Jul 2026 02:43:01 +0000 Subject: [PATCH] fix: support PK01 Codex pool sync --- .../unidesk-sub2api/references/codex-pool.md | 8 +- .../references/local-codex-consumer.md | 2 +- .../unidesk-sub2api/references/operations.md | 2 +- docs/reference/platform-infra.md | 4 +- .../platform-infra-sub2api-codex/actions.ts | 30 +-- .../local-codex.ts | 53 +++++ .../platform-infra-sub2api-codex/options.ts | 1 + .../public-exposure.ts | 4 +- .../remote-python-sync.ts | 189 ++++++++++++++++-- .../runtime-target.ts | 10 + .../src/platform-infra-sub2api-codex/types.ts | 3 + 11 files changed, 258 insertions(+), 48 deletions(-) diff --git a/.agents/skills/unidesk-sub2api/references/codex-pool.md b/.agents/skills/unidesk-sub2api/references/codex-pool.md index cdd9bbfd..daa9b8a6 100644 --- a/.agents/skills/unidesk-sub2api/references/codex-pool.md +++ b/.agents/skills/unidesk-sub2api/references/codex-pool.md @@ -1,6 +1,8 @@ ## Codex Pool -当前 codex-pool sync/validate/report/trace 适配器主要覆盖 k3s target。若 YAML 默认 target 是 PK01 host-Docker,不要直接把无 `--target` 的 codex-pool 命令当成验收入口;先使用 `sub2api status --target PK01`、`sub2api validate --target PK01` 和最小 public `/v1/responses` smoke。host-Docker codex-pool adapter 补齐前,k3s 账号池操作必须显式选择 k3s target: +`codex-pool plan|sync|validate` 同时覆盖 k3s target 和 PK01 host-Docker target。PK01 host-Docker 的 `sync --confirm` 通过 Sub2API admin API 对齐 group、YAML-managed accounts、统一消费 API key、capacity/loadFactor、WebSocket v2 标记和内置 `temp_unschedulable` 规则;统一消费 key 写入 YAML 声明的 `targets[PK01].hostDocker.envPath`,不创建 k8s Secret,不部署 sentinel 资源,也不触发 `sub2api apply`、Docker compose、Caddy reload 或容器重启。`sentinel-report`、`sentinel-probe`、`sentinel-image` 和部分 `trace` 能力仍以 k3s target 为主;需要这些能力时显式选择对应 k3s target。 + +k3s 账号池操作示例: ```bash bun scripts/cli.ts platform-infra sub2api codex-pool plan --target D601 @@ -17,7 +19,7 @@ bun scripts/cli.ts platform-infra sub2api codex-pool cleanup-probes --target D60 `config/platform-infra/sub2api-codex-pool.yaml` 控制: - `pool.groupName`: Sub2API group 名称。 -- `pool.apiKeySecretName` / `pool.apiKeySecretKey`: 统一消费 API key 的 k3s Secret 位置,默认 `platform-infra/sub2api-codex-pool-api-key.API_KEY`。 +- `pool.apiKeySecretName` / `pool.apiKeySecretKey`: 统一消费 API key 的 key 名。k3s target 写入对应 namespace 下的 k3s Secret;PK01 host-Docker target 写入 `config/platform-infra/sub2api.yaml` 中 `targets[PK01].hostDocker.envPath` 声明的 env 文件。 - `pool.minOwnerBalanceUsd`: pool key owner 最低余额,sync/validate 会补齐。 - `pool.minOwnerConcurrency`: 可选统一消费 API key owner 最低并发;省略时 CLI 自动使用所有已解析账号 capacity 的总和,sync/validate 会补齐。显式 YAML 值只作为 override,仍必须不小于账号 capacity 总和;未显式写 `profiles.entries[].capacity` 的账号会使用 `pool.defaultAccountCapacity` 参与求和,不要用提高某个 provider capacity 来掩盖用户并发层 WS 1013。 - `pool.defaultTempUnschedulable`: Sub2API 内置请求路径临时不可调度开关和 YAML 规则列表。当前要求是按 YAML 开启通用规则;sync 把 `temp_unschedulable_enabled` / `temp_unschedulable_rules` 渲染到 managed accounts,让匹配的 400/5xx/超时/模型路由/加密内容错误短暂冷却当前账号并触发同组 failover。 @@ -38,7 +40,7 @@ bun scripts/cli.ts platform-infra sub2api codex-pool cleanup-probes --target D60 - `manualAccounts.protected`: 已在 Sub2API 手动创建/维护、且必须排除在 UniDesk-managed Codex pool credentials 和 sentinel 控制之外的账号。默认不得改 credentials/status/schedulable/priority/capacity/loadFactor;只有显式声明 `proxyBinding` 时,`sync --confirm` 才允许把该账号的 `proxy_id` 对齐到 YAML 目标的 egress proxy;只有显式声明 `groupBinding.source: pool-group` 时,才允许把该账号加入统一消费 API key 使用的 pool group。`targetIds` 可选;省略表示所有 target 都保护该账号,设置后只在匹配 target 上纳入 proxy/group 窄同步和 sentinel-probe 拒绝列表,避免 PK01-only 手动账号漂移卡住 JD01 pool。 - Sentinel 配置、marker-only 判定、镜像、report/probe 和远端 job/poll 边界见 [sentinel.md](sentinel.md)。 -对已支持的 k3s target,`sync --confirm` 会登录 Sub2API admin、创建/更新 group、创建/更新 YAML 中的 `unidesk-codex-*` accounts、创建/复用统一 API key Secret,并部署/更新哨兵资源;它不把既有 managed account 直接恢复为 `schedulable=true`。恢复只由哨兵在读取 Sub2API runtime `schedulable=false` 后触发 recovery probe,并在 marker 命中时执行。`sync` 默认不删除 YAML 中缺席的 managed account。只有明确退役上游时才使用 `sync --confirm --prune-removed` 删除缺席且 `extra.unidesk_managed=true` 的 `unidesk-codex-*` account。对 `manualAccounts.protected`,`sync` 只执行 YAML 显式允许的窄同步;当前允许项是从目标 `egressProxy` 创建/更新 Sub2API internal proxy 记录并绑定 `proxy_id`,以及把受保护手动账号加入当前 `pool.groupName`。它仍不接管该账号凭据、status、schedulable、priority/capacity/loadFactor 或哨兵状态。PK01 host-Docker target 在 codex-pool adapter 补齐前不具备这条完整 sync 路径。 +`sync --confirm` 会登录 Sub2API admin、创建/更新 group、创建/更新 YAML 中的 `unidesk-codex-*` accounts,并创建/复用统一 API key。k3s target 还会写入统一 API key Secret 并部署/更新哨兵资源;PK01 host-Docker target 只写 Sub2API runtime 和 host-Docker env 文件。`sync` 不把既有 managed account 直接恢复为 `schedulable=true`。恢复只由哨兵在读取 Sub2API runtime `schedulable=false` 后触发 recovery probe,并在 marker 命中时执行。`sync` 默认不删除 YAML 中缺席的 managed account。只有明确退役上游时才使用 `sync --confirm --prune-removed` 删除缺席且 `extra.unidesk_managed=true` 的 `unidesk-codex-*` account。对 `manualAccounts.protected`,`sync` 只执行 YAML 显式允许的窄同步;当前允许项是从目标 `egressProxy` 创建/更新 Sub2API internal proxy 记录并绑定 `proxy_id`,以及把受保护手动账号加入当前 `pool.groupName`。它仍不接管该账号凭据、status、schedulable、priority/capacity/loadFactor 或哨兵状态。若受保护手动账号在运行面缺失,`sync`/`validate` 会报告 manual account drift;不要自动创建、删除、接管或从 YAML 移除该账号。 `trace --request-id ` 是只读 request 追溯报表,不触发 probe、不修改账号。默认输出请求开始/最终状态、failover、`account_select_failed`、窗口内 `account_temp_unschedulable`、admin schedulable 写入计数和当前账号快照;`reason=failover-attempted-no-candidate` 表示 Sub2API 已进入自动切号,但排除当前失败账号后没有可用候选。需要机器处理时使用 `--raw`,需要原始匹配行时加 `--show-lines`。 diff --git a/.agents/skills/unidesk-sub2api/references/local-codex-consumer.md b/.agents/skills/unidesk-sub2api/references/local-codex-consumer.md index 30975dc1..af8ed5a7 100644 --- a/.agents/skills/unidesk-sub2api/references/local-codex-consumer.md +++ b/.agents/skills/unidesk-sub2api/references/local-codex-consumer.md @@ -7,7 +7,7 @@ bun scripts/cli.ts platform-infra sub2api codex-pool configure-local --confirm `configure-local --confirm` 会: -- 从 `platform-infra/.` 读取统一 API key。 +- 从 active target 的统一 API key 位置读取 key:k3s target 读取 `platform-infra/.`,PK01 host-Docker target 读取 YAML `hostDocker.envPath` 中的 ``。 - 把当前 `~/.codex/config.toml` 和 `~/.codex/auth.json` 备份为 `.`,默认 `.pre-sub2api`。 - 重写默认 `~/.codex` 消费端,固定指向 `https://sub2api.74-48-78-17.nip.io/`,provider 名称和 wire API 来自 `localCodex`。 - 按 `localCodex.modelContextWindow` / `localCodex.modelAutoCompactTokenLimit` 写入 `model_context_window` / `model_auto_compact_token_limit`,用于统一控制 Codex auto compact 触发窗口,避免 GPT-5.5 消费端生成过大的 `/responses/compact` 长请求。 diff --git a/.agents/skills/unidesk-sub2api/references/operations.md b/.agents/skills/unidesk-sub2api/references/operations.md index 89103a43..0e703d22 100644 --- a/.agents/skills/unidesk-sub2api/references/operations.md +++ b/.agents/skills/unidesk-sub2api/references/operations.md @@ -63,7 +63,7 @@ bun scripts/cli.ts platform-infra sub2api status --target PK01 bun scripts/cli.ts platform-infra sub2api validate --target PK01 ``` -PK01 没有 k3s control plane。当前 `codex-pool sync`、`codex-pool validate`、`sentinel-report` 和 `trace` 的部分实现仍依赖 k8s/kubectl 远端脚本;在 PK01 host-Docker target 上看到 `kubectl` 缺失时,应归类为 CLI host-Docker adapter 缺口,不要误判为 Sub2API app、Caddy、上游或账号池故障。正式修复应补 host-Docker 版 codex-pool sync/validate/report/trace;临时排障只能做只读 admin API、DB join 表和最小公网 `/v1/responses` smoke,并且不得打印 admin password、API key 或账号凭据。 +PK01 没有 k3s control plane。`codex-pool sync --target PK01 --confirm` 和 `codex-pool validate --target PK01` 走 host-Docker adapter:通过本机 Sub2API admin API 和 YAML `hostDocker.envPath` 对齐账号池,不使用 k8s Secret/CronJob,也不重启容器。`sentinel-report`、`sentinel-probe`、`sentinel-image` 和部分 `trace` 能力仍可能依赖 k8s/kubectl;在这些命令上看到 `kubectl` 缺失时,应归类为 CLI host-Docker adapter 缺口,不要误判为 Sub2API app、Caddy、上游或账号池故障。临时排障只能做只读 admin API、DB join 表和最小公网 `/v1/responses` smoke,并且不得打印 admin password、API key 或账号凭据。 PK01 host-Docker apply 仍必须由 `platform-infra sub2api apply --target PK01 --confirm` 受控执行。若 dry-run 或 apply 输出显示 `docker compose is absent; apply will use raw docker run fallback`,这表示 CLI 选择了 host-Docker fallback,不是裸手工 Docker 操作;只要 YAML image、env、ports、Caddy managed block 和 `status/validate` 最终对齐,可作为受控滚动升级证据。不要改用手工 `docker run`、手工 compose 文件或直接编辑 PK01 Caddyfile。 diff --git a/docs/reference/platform-infra.md b/docs/reference/platform-infra.md index dd894f51..787249e7 100644 --- a/docs/reference/platform-infra.md +++ b/docs/reference/platform-infra.md @@ -91,7 +91,7 @@ `config/platform-infra/sub2api-codex-pool.yaml` controls the Codex-facing OpenAI-compatible pool: - `pool.groupName` names the Sub2API group that represents the pool. -- `pool.apiKeySecretName` and `pool.apiKeySecretKey` name the k3s Secret that stores the single consumer API key. +- `pool.apiKeySecretName` and `pool.apiKeySecretKey` name the single consumer API key. k3s targets store it in a k3s Secret; PK01 host-Docker stores the same key in the YAML-declared `hostDocker.envPath`. - `pool.minOwnerConcurrency` is optional; when omitted, the CLI automatically uses the sum of all resolved account capacities as the minimum concurrency for the Sub2API user that owns the unified consumer API key. A YAML value is only an explicit override and must still be at least that capacity sum, so the shared key does not fail requests or WS sessions at the user-concurrency layer. "Resolved" means each account's explicit `profiles.entries[].capacity` or, when omitted, `pool.defaultAccountCapacity`. Do not compensate for owner-concurrency 1013 errors by pinning capacity to one provider. - `pool.defaultTempUnschedulable` is the Sub2API built-in request-path temporary-unschedulable switch plus its YAML rule list. When enabled, `codex-pool sync --confirm` renders `temp_unschedulable_enabled` and `temp_unschedulable_rules` into every managed account unless an account-level override says otherwise. This is the generic same-request recovery path for selected-account upstream failures: a matching upstream error briefly cools the selected account so Sub2API's existing failover loop can select another account in the same group. - The built-in temporary-unschedulable configuration and external `sentinel.*` configuration are separate control surfaces. `pool.defaultTempUnschedulable` handles near-real-time request-path cooling and failover; `sentinel.*` handles account-level marker health, quarantine, restore, and probe cadence. Changing one surface must not silently rewrite the other surface's cadence, marker semantics, quarantine state, or rule list. @@ -99,7 +99,7 @@ - Codex accounts selected by YAML do not declare `schedulable` as durable configuration. `codex-pool sync --confirm` must not restore existing account schedulability merely because YAML selects the account or sentinel state lacks an active quarantine. Existing `schedulable=false` is runtime state: the sentinel first reads Sub2API's actual account state, schedules a recovery probe for unschedulable managed accounts, and restores `schedulable=true` only after the marker probe matches. - `codex-pool sync --confirm` preserves UniDesk-managed accounts that are absent from YAML by default; explicit upstream retirement requires `codex-pool sync --confirm --prune-removed`. This keeps account deletion out of the normal availability-recovery path and prevents temporary YAML edits from becoming destructive runtime changes. - `profiles.entries` selects local Codex profile files from `~/.codex/` and maps them to Sub2API account names. -- The unsuffixed master `~/.codex/config.toml` and `~/.codex/auth.json` are reserved for the unified Sub2API consumer. `config.toml` must keep the YAML-selected consumer base URL written by `codex-pool configure-local --target --confirm`, and `auth.json` must contain the unified pool API key from `pool.apiKeySecretName` / `pool.apiKeySecretKey` on that active target. Do not replace these two files with direct upstream account credentials. +- The unsuffixed master `~/.codex/config.toml` and `~/.codex/auth.json` are reserved for the unified Sub2API consumer. `config.toml` must keep the YAML-selected consumer base URL written by `codex-pool configure-local --target --confirm`, and `auth.json` must contain the unified pool API key from the active target's `pool.apiKeySecretName` / `pool.apiKeySecretKey` location. Do not replace these two files with direct upstream account credentials. - Additional upstream accounts must use suffixed local profile files such as `config.toml.` and `auth.json.`, then be declared through `profiles.entries` in `config/platform-infra/sub2api-codex-pool.yaml`. - `profiles.entries[].capacity` optionally overrides `pool.defaultAccountCapacity` for one account. Capacity is a YAML-controlled routing input; concrete current values belong only in `config/platform-infra/sub2api-codex-pool.yaml` and runtime validation output, not in long-term reference prose. Code constants, Secrets, ad-hoc runtime patches, or stale tests must not override YAML source of truth. - `profiles.entries[].loadFactor` optionally overrides `pool.defaultAccountLoadFactor` for one account and is rendered to Sub2API `load_factor`. Treat it as routing policy: values belong in YAML and `codex-pool validate` output, not code constants, Secrets, or ad-hoc runtime patches. diff --git a/scripts/src/platform-infra-sub2api-codex/actions.ts b/scripts/src/platform-infra-sub2api-codex/actions.ts index f6e2d3b2..4410683d 100644 --- a/scripts/src/platform-infra-sub2api-codex/actions.ts +++ b/scripts/src/platform-infra-sub2api-codex/actions.ts @@ -59,7 +59,9 @@ export function codexPoolPlan(options?: DisclosureOptions): Record 0 && profiles.every((profile) => profile.ok); - if (runtimeTarget.runtimeMode === "host-docker" && options.confirm) { - return { - ok: false, - action: "platform-infra-sub2api-codex-pool-sync", - mode: "blocked-host-docker-sync-unsupported", - target: poolTarget(pool, runtimeTarget), - reason: "PK01 host-Docker target does not run the k3s codex-pool sync path; Sub2API runtime is controlled by platform-infra sub2api apply/validate.", - local: { - profileCount: profiles.length, - invalidProfiles: profiles.filter((profile) => !profile.ok).map(compactProfile), - valuesPrinted: false, - }, - next: { - expose: `bun scripts/cli.ts platform-infra sub2api codex-pool expose${targetFlag(runtimeTarget)} --confirm`, - validate: `bun scripts/cli.ts platform-infra sub2api validate${targetFlag(runtimeTarget)}`, - }, - }; - } if (!options.confirm || !planOk) { const plan = { ...codexPoolPlan(options), diff --git a/scripts/src/platform-infra-sub2api-codex/local-codex.ts b/scripts/src/platform-infra-sub2api-codex/local-codex.ts index f1c6b24d..c068d11e 100644 --- a/scripts/src/platform-infra-sub2api-codex/local-codex.ts +++ b/scripts/src/platform-infra-sub2api-codex/local-codex.ts @@ -28,6 +28,59 @@ import { codexPoolRuntimeTarget } from "./runtime-target"; import { sub2apiConfigPath } from "./types"; export async function fetchPoolApiKey(config: UniDeskConfig, pool: CodexPoolConfig, target = codexPoolRuntimeTarget()): Promise<{ apiKey: string | null; error: string | null }> { + if (target.runtimeMode === "host-docker") { + const envPath = target.hostDockerEnvPath; + if (envPath === null) return { apiKey: null, error: "host-docker envPath missing" }; + const result = await capture(config, target.route, ["sh"], ` +set -u +python3 - <<'PY' +import base64 +import json +import subprocess +path = ${JSON.stringify(envPath)} +key = ${JSON.stringify(pool.apiKeySecretKey)} +values = {} +try: + with open(path, "r", encoding="utf-8") as handle: + lines = handle.read().splitlines() +except FileNotFoundError: + print(json.dumps({"ok": False, "error": "env-source-missing", "path": path, "valuesPrinted": False})) + raise SystemExit(1) +except PermissionError: + proc = subprocess.run(["sudo", "-n", "cat", path], stdout=subprocess.PIPE, stderr=subprocess.PIPE) + if proc.returncode != 0: + print(json.dumps({"ok": False, "error": "env-source-unreadable", "path": path, "stderrTail": proc.stderr.decode("utf-8", errors="replace")[-500:], "valuesPrinted": False})) + raise SystemExit(1) + lines = proc.stdout.decode("utf-8", errors="replace").splitlines() +for line in lines: + stripped = line.strip() + if not stripped or stripped.startswith("#") or "=" not in stripped: + continue + current_key, value = stripped.split("=", 1) + current_key = current_key.strip() + value = value.strip() + if len(value) >= 2 and value[0] == value[-1] and value[0] in ("'", '"'): + value = value[1:-1] + values[current_key] = value +value = values.get(key) +if not value: + print(json.dumps({"ok": False, "error": "api-key-missing", "path": path, "key": key, "valuesPrinted": False})) + raise SystemExit(1) +print(json.dumps({"ok": True, "apiKeyB64": base64.b64encode(value.encode()).decode(), "path": path, "key": key, "valuesPrinted": False})) +PY +`); + if (result.exitCode !== 0) return { apiKey: null, error: `read host pool API key source failed: ${result.stderr.slice(-1000) || result.stdout.slice(-1000)}` }; + const parsed = parseJsonOutput(result.stdout); + if (!isRecord(parsed) || parsed.ok !== true || typeof parsed.apiKeyB64 !== "string") { + return { apiKey: null, error: `${envPath}.${pool.apiKeySecretKey} missing` }; + } + try { + const apiKey = Buffer.from(parsed.apiKeyB64, "base64").toString("utf8"); + return apiKey.length > 0 ? { apiKey, error: null } : { apiKey: null, error: "decoded API key is empty" }; + } catch (error) { + return { apiKey: null, error: error instanceof Error ? error.message : String(error) }; + } + } const result = await capture(config, target.route, ["sh"], ` set -u kubectl -n ${target.namespace} get secret ${pool.apiKeySecretName} -o json diff --git a/scripts/src/platform-infra-sub2api-codex/options.ts b/scripts/src/platform-infra-sub2api-codex/options.ts index 151cc28b..d2120e16 100644 --- a/scripts/src/platform-infra-sub2api-codex/options.ts +++ b/scripts/src/platform-infra-sub2api-codex/options.ts @@ -329,6 +329,7 @@ export interface Sub2ApiRuntimeConfig { defaultTargetId: string; appSecretName: string; secretsRoot: string; + appSourceRef: string; sentinelEnabledOnTargets: string[]; targets: Record[]; } diff --git a/scripts/src/platform-infra-sub2api-codex/public-exposure.ts b/scripts/src/platform-infra-sub2api-codex/public-exposure.ts index 54c44b2b..377021a1 100644 --- a/scripts/src/platform-infra-sub2api-codex/public-exposure.ts +++ b/scripts/src/platform-infra-sub2api-codex/public-exposure.ts @@ -41,7 +41,9 @@ export function poolTarget(pool = readCodexPoolConfig(), target = codexPoolRunti configPath: codexPoolConfigPath, groupName: pool.groupName, apiKeyName: pool.apiKeyName, - apiKeySecret: `${target.namespace}/${pool.apiKeySecretName}.${pool.apiKeySecretKey}`, + apiKeySecret: target.runtimeMode === "host-docker" + ? `${target.hostDockerEnvPath}.${pool.apiKeySecretKey}` + : `${target.namespace}/${pool.apiKeySecretName}.${pool.apiKeySecretKey}`, publicExposure: targetPublicExposureSummary(target), sentinelImageBuild: { source: `${sub2apiConfigPath}.targets[${target.id}].codexPool.sentinelImageBuild`, diff --git a/scripts/src/platform-infra-sub2api-codex/remote-python-sync.ts b/scripts/src/platform-infra-sub2api-codex/remote-python-sync.ts index 0a2db745..8dc21ada 100644 --- a/scripts/src/platform-infra-sub2api-codex/remote-python-sync.ts +++ b/scripts/src/platform-infra-sub2api-codex/remote-python-sync.ts @@ -27,12 +27,14 @@ import { resolvedManualAccountProtections } from "./public-exposure"; import { fieldManager } from "./types"; export function remotePythonScript(mode: "sync" | "validate" | "trace" | "cleanup-probes" | "sentinel-probe", encodedPayload: string, pool: CodexPoolConfig, target: CodexPoolRuntimeTarget): string { + const hostDockerEnvPath = target.runtimeMode === "host-docker" ? target.hostDockerEnvPath : null; return ` set -u python3 - <<'PY' import base64 import hashlib import json +import os import re import secrets import string @@ -43,9 +45,13 @@ from datetime import datetime, timezone, timedelta from urllib.parse import quote TARGET_ID = ${JSON.stringify(target.id)} +RUNTIME_MODE = ${JSON.stringify(target.runtimeMode)} NAMESPACE = ${JSON.stringify(target.namespace)} SERVICE_NAME = ${JSON.stringify(target.serviceName)} SERVICE_DNS = ${JSON.stringify(target.serviceDns)} +HOST_DOCKER_APP_PORT = ${JSON.stringify(target.hostDockerAppPort)} +HOST_DOCKER_ENV_PATH = ${JSON.stringify(hostDockerEnvPath)} +HOST_DOCKER_APP_CONTAINER = "sub2api-app" FIELD_MANAGER = "${fieldManager}" APP_SECRET_NAME = ${JSON.stringify(target.appSecretName)} POOL_GROUP_NAME = "${pool.groupName}" @@ -80,6 +86,107 @@ def text(data, limit=4000): data = data.decode("utf-8", errors="replace") return data[-limit:] +def read_host_env(): + if RUNTIME_MODE != "host-docker": + return {} + if not isinstance(HOST_DOCKER_ENV_PATH, str) or not HOST_DOCKER_ENV_PATH: + raise RuntimeError("host-docker env source path missing") + values = {} + lines = read_host_env_lines() + for line in lines: + stripped = line.strip() + if not stripped or stripped.startswith("#") or "=" not in stripped: + continue + key, value = stripped.split("=", 1) + key = key.strip() + value = value.strip() + if len(value) >= 2 and value[0] == value[-1] and value[0] in ("'", '"'): + value = value[1:-1] + if key: + values[key] = value + return values + +def read_host_env_lines(): + try: + with open(HOST_DOCKER_ENV_PATH, "r", encoding="utf-8") as handle: + return handle.read().splitlines() + except FileNotFoundError: + raise RuntimeError(f"host-docker env source missing: {HOST_DOCKER_ENV_PATH}") + except PermissionError: + proc = run(["sudo", "-n", "cat", HOST_DOCKER_ENV_PATH]) + if proc.returncode != 0: + raise RuntimeError("read host-docker env source failed: " + text(proc.stderr, 1000)) + return proc.stdout.decode("utf-8", errors="replace").splitlines() + +def write_host_env_value(key, value): + if RUNTIME_MODE != "host-docker": + raise RuntimeError("write_host_env_value is only valid for host-docker") + if not isinstance(HOST_DOCKER_ENV_PATH, str) or not HOST_DOCKER_ENV_PATH: + raise RuntimeError("host-docker env source path missing") + if not re.match(r"^[A-Za-z_][A-Za-z0-9_]*$", key): + raise RuntimeError(f"unsupported env key: {key}") + os.makedirs(os.path.dirname(HOST_DOCKER_ENV_PATH), exist_ok=True) + try: + lines = read_host_env_lines() + except RuntimeError as exc: + if "missing" not in str(exc): + raise + lines = [] + next_lines = [] + replaced = False + for line in lines: + stripped = line.strip() + if stripped.startswith("#") or "=" not in stripped: + next_lines.append(line) + continue + current_key = stripped.split("=", 1)[0].strip() + if current_key == key: + next_lines.append(f"{key}={value}") + replaced = True + else: + next_lines.append(line) + if not replaced: + next_lines.append(f"{key}={value}") + content = "\\n".join(next_lines).rstrip() + "\\n" + tmp_path = HOST_DOCKER_ENV_PATH + ".tmp" + try: + with open(tmp_path, "w", encoding="utf-8") as handle: + handle.write(content) + os.chmod(tmp_path, 0o600) + os.replace(tmp_path, HOST_DOCKER_ENV_PATH) + except PermissionError: + try: + os.unlink(tmp_path) + except Exception: + pass + script = r''' +set -eu +path="$1" +dir="$(dirname "$path")" +mkdir -p "$dir" +tmp="$path.tmp.$$" +umask 077 +cat > "$tmp" +mv "$tmp" "$path" +chmod 600 "$path" +''' + proc = run(["sudo", "-n", "sh", "-c", script, "sh", HOST_DOCKER_ENV_PATH], content.encode("utf-8")) + if proc.returncode != 0: + raise RuntimeError("write host-docker env source failed: " + text(proc.stderr, 1000)) + return "updated" if replaced else "created" + +def docker(args): + proc = run(["docker", *args]) + if proc.returncode == 0: + return proc + sudo_proc = run(["sudo", "-n", "docker", *args]) + return sudo_proc if sudo_proc.returncode == 0 else proc + +def runtime_logs(since, tail): + if RUNTIME_MODE == "host-docker": + return docker(["logs", f"--since={since}", f"--tail={tail}", HOST_DOCKER_APP_CONTAINER]) + return kubectl(["-n", NAMESPACE, "logs", "deployment/sub2api", f"--since={since}", f"--tail={tail}"]) + def kubectl(args, input_obj=None): if isinstance(input_obj, str): input_bytes = input_obj.encode("utf-8") @@ -98,17 +205,23 @@ def kube_json(args, label): return json.loads(raw.decode("utf-8")) def decode_secret_value(name, key): + if RUNTIME_MODE == "host-docker": + return read_host_env().get(key) data = kube_json(["-n", NAMESPACE, "get", "secret", name], f"secret/{name}").get("data") or {} if key not in data: return None return base64.b64decode(data[key]).decode("utf-8") def get_config_value(name, key): + if RUNTIME_MODE == "host-docker": + return read_host_env().get(key) data = kube_json(["-n", NAMESPACE, "get", "configmap", name], f"configmap/{name}").get("data") or {} value = data.get(key) return value if isinstance(value, str) and value else None def select_app_pod(): + if RUNTIME_MODE == "host-docker": + return HOST_DOCKER_APP_CONTAINER pods = kube_json(["-n", NAMESPACE, "get", "pods", "-l", "app.kubernetes.io/name=sub2api"], "sub2api pods").get("items") or [] for pod in pods: status = pod.get("status") or {} @@ -241,10 +354,15 @@ else fi fi ''' - proc = run([ - "kubectl", "-n", NAMESPACE, "exec", "-i", APP_POD, - "--", "sh", "-c", script, "sh", method, f"http://127.0.0.1:8080{path}", bearer or "", - ], body) + if RUNTIME_MODE == "host-docker": + if not isinstance(HOST_DOCKER_APP_PORT, int): + raise RuntimeError("host-docker app port missing") + proc = run(["sh", "-c", script, "sh", method, f"http://127.0.0.1:{HOST_DOCKER_APP_PORT}{path}", bearer or ""], body) + else: + proc = run([ + "kubectl", "-n", NAMESPACE, "exec", "-i", APP_POD, + "--", "sh", "-c", script, "sh", method, f"http://127.0.0.1:8080{path}", bearer or "", + ], body) return parse_curl_output(proc) def envelope_data(parsed): @@ -996,6 +1114,9 @@ def ensure_api_key_secret(group_id, token): secret_action = "reused-existing-sub2api-key" else: secret_action = "created" + if RUNTIME_MODE == "host-docker": + env_action = "kept-existing" if existing else write_host_env_value(POOL_API_KEY_SECRET_KEY, api_key) + return api_key, secret_action, f"host-docker-env:{env_action};source={HOST_DOCKER_ENV_PATH};key={POOL_API_KEY_SECRET_KEY};valuesPrinted=false" manifest = { "apiVersion": "v1", "kind": "Secret", @@ -1022,6 +1143,11 @@ def ensure_api_key_secret(group_id, token): raise RuntimeError(f"apply API key secret failed: {text(proc.stderr, 1000)}") return api_key, secret_action, text(proc.stdout, 1000) +def pool_api_key_secret_location(): + if RUNTIME_MODE == "host-docker": + return f"{HOST_DOCKER_ENV_PATH}.{POOL_API_KEY_SECRET_KEY}" + return f"{NAMESPACE}/{POOL_API_KEY_SECRET_NAME}.{POOL_API_KEY_SECRET_KEY}" + def apply_sentinel_manifest(manifest): if not TARGET_SENTINEL_ENABLED: return { @@ -1190,6 +1316,8 @@ def parse_epoch_z(value): return None def sentinel_state_object(): + if not TARGET_SENTINEL_ENABLED: + return None, None state_name = SENTINEL_CONFIG.get("stateConfigMapName") if not state_name: return None, None @@ -1205,6 +1333,8 @@ def sentinel_state_object(): return obj, None def active_sentinel_quarantine_names(): + if not TARGET_SENTINEL_ENABLED: + return set() _, state = sentinel_state_object() if not isinstance(state, dict): return set() @@ -1668,7 +1798,7 @@ def response_output_preview(parsed): return "\\n".join(parts)[:240] def request_log_evidence(request_id): - proc = kubectl(["-n", NAMESPACE, "logs", "deployment/sub2api", "--since=5m", "--tail=800"]) + proc = runtime_logs("5m", 800) stdout = proc.stdout.decode("utf-8", errors="replace") lines = [line for line in stdout.splitlines() if request_id in line] failovers = [] @@ -1705,7 +1835,7 @@ def request_log_evidence(request_id): } def recent_compact_gateway_evidence(): - proc = kubectl(["-n", NAMESPACE, "logs", "deployment/sub2api", "--since=6h", "--tail=2500"]) + proc = runtime_logs("6h", 2500) stdout = proc.stdout.decode("utf-8", errors="replace") failures = [] successes = [] @@ -1830,7 +1960,7 @@ def failover_budget_exhausted_evidence(failovers, final_errors): return exhausted def recent_responses_gateway_evidence(): - proc = kubectl(["-n", NAMESPACE, "logs", "deployment/sub2api", "--since=6h", "--tail=2500"]) + proc = runtime_logs("6h", 2500) stdout = proc.stdout.decode("utf-8", errors="replace") failovers = [] forward_failures = [] @@ -1936,6 +2066,7 @@ def validate_gateway_responses(api_key): set -eu token="$1" request_id="$2" +url="$3" tmp="$(mktemp)" trap 'rm -f "$tmp"' EXIT cat > "$tmp" @@ -1945,13 +2076,18 @@ curl -sS -w '\\n__HTTP_CODE__:%{http_code}' -X POST \ -H "X-Request-ID: $request_id" \ -H "OpenAI-Client-Request-ID: $request_id" \ --data-binary @"$tmp" \ - http://127.0.0.1:8080/v1/responses + "$url" ''' started = time.time() - proc = run([ - "kubectl", "-n", NAMESPACE, "exec", "-i", APP_POD, - "--", "sh", "-c", script, "sh", api_key, request_id, - ], body) + if RUNTIME_MODE == "host-docker": + if not isinstance(HOST_DOCKER_APP_PORT, int): + raise RuntimeError("host-docker app port missing") + proc = run(["sh", "-c", script, "sh", api_key, request_id, f"http://127.0.0.1:{HOST_DOCKER_APP_PORT}/v1/responses"], body) + else: + proc = run([ + "kubectl", "-n", NAMESPACE, "exec", "-i", APP_POD, + "--", "sh", "-c", script, "sh", api_key, request_id, "http://127.0.0.1:8080/v1/responses", + ], body) resp = parse_curl_output(proc) evidence = request_log_evidence(request_id) parsed = resp.get("json") @@ -2123,6 +2259,31 @@ def validate_runtime_capabilities(token): } def app_pod_runtime_image(): + if RUNTIME_MODE == "host-docker": + proc = docker(["inspect", HOST_DOCKER_APP_CONTAINER]) + if proc.returncode != 0: + return { + "container": HOST_DOCKER_APP_CONTAINER, + "error": text(proc.stderr, 1000) or text(proc.stdout, 1000), + } + try: + data = json.loads(proc.stdout.decode("utf-8")) + item = data[0] if isinstance(data, list) and data else {} + except Exception as exc: + return {"container": HOST_DOCKER_APP_CONTAINER, "error": str(exc)} + state = item.get("State") if isinstance(item, dict) and isinstance(item.get("State"), dict) else {} + health = state.get("Health") if isinstance(state.get("Health"), dict) else {} + config = item.get("Config") if isinstance(item, dict) and isinstance(item.get("Config"), dict) else {} + return { + "container": HOST_DOCKER_APP_CONTAINER, + "id": (item.get("Id") or "")[:12] if isinstance(item.get("Id"), str) else None, + "image": config.get("Image"), + "imageID": item.get("Image"), + "ready": state.get("Running") is True and (not health or health.get("Status") in (None, "healthy")), + "restartCount": item.get("RestartCount"), + "startedAt": state.get("StartedAt"), + "health": health.get("Status"), + } try: pod = kube_json(["-n", NAMESPACE, "get", "pod", APP_POD], f"pod/{APP_POD}") spec_containers = ((pod.get("spec") or {}).get("containers") or []) if isinstance(pod, dict) else [] @@ -2474,7 +2635,7 @@ def run_sync(): "tempUnschedulable": temp_unschedulable_status, "apiKey": { "name": POOL_API_KEY_NAME, - "secret": f"{NAMESPACE}/{POOL_API_KEY_SECRET_NAME}.{POOL_API_KEY_SECRET_KEY}", + "secret": pool_api_key_secret_location(), "secretAction": secret_action, "secretApply": secret_apply_stdout, "sub2apiAction": api_key_result["action"], @@ -2523,7 +2684,7 @@ def run_validate(): "appPod": APP_POD, "admin": {"email": admin_email, "tokenPrinted": False, "compliance": admin_compliance}, "apiKey": { - "secret": f"{NAMESPACE}/{POOL_API_KEY_SECRET_NAME}.{POOL_API_KEY_SECRET_KEY}", + "secret": pool_api_key_secret_location(), "sub2apiId": key_item.get("id") if isinstance(key_item, dict) else None, "userId": key_item.get("user_id") if isinstance(key_item, dict) else None, "groupId": key_item.get("group_id") if isinstance(key_item, dict) else None, diff --git a/scripts/src/platform-infra-sub2api-codex/runtime-target.ts b/scripts/src/platform-infra-sub2api-codex/runtime-target.ts index 405046e6..ad0f5540 100644 --- a/scripts/src/platform-infra-sub2api-codex/runtime-target.ts +++ b/scripts/src/platform-infra-sub2api-codex/runtime-target.ts @@ -41,6 +41,8 @@ export function readSub2ApiRuntimeConfig(): Sub2ApiRuntimeConfig { const secrets = runtime !== null && isRecord(runtime.secrets) ? runtime.secrets : null; const secretsRoot = secrets === null ? null : stringValue(secrets.root); if (secretsRoot === null || !secretsRoot.startsWith("/")) throw new Error(`${sub2apiConfigPath}.runtime.secrets.root must be an absolute path`); + const appSourceRef = secrets === null ? null : stringValue(secrets.appSourceRef); + if (appSourceRef === null || !/^[A-Za-z0-9_./-]+$/u.test(appSourceRef)) throw new Error(`${sub2apiConfigPath}.runtime.secrets.appSourceRef has an unsupported format`); const sentinel = runtime !== null && isRecord(runtime.sentinel) ? runtime.sentinel : null; const enabledOnTargets = Array.isArray(sentinel?.enabledOnTargets) ? sentinel.enabledOnTargets.map((entry) => stringValue(entry)).filter((entry): entry is string => entry !== null && entry.length > 0) @@ -50,6 +52,7 @@ export function readSub2ApiRuntimeConfig(): Sub2ApiRuntimeConfig { defaultTargetId, appSecretName, secretsRoot, + appSourceRef, sentinelEnabledOnTargets: enabledOnTargets, targets: parsed.targets, }; @@ -99,9 +102,13 @@ export function codexPoolRuntimeTarget(targetId?: string): CodexPoolRuntimeTarge if (publicExposure !== null && publicExposure.enabled) publicBaseUrl = publicExposure.publicBaseUrl; const hostDocker = runtimeMode === "host-docker" && isRecord(raw.hostDocker) ? raw.hostDocker : null; const hostDockerAppPort = hostDocker === null ? null : numberValue(hostDocker.appPort); + const hostDockerEnvPath = hostDocker === null ? null : stringValue(hostDocker.envPath); if (runtimeMode === "host-docker" && (hostDockerAppPort === null || !Number.isInteger(hostDockerAppPort) || hostDockerAppPort < 1 || hostDockerAppPort > 65535)) { throw new Error(`${sub2apiConfigPath}.targets[${id}].hostDocker.appPort must be an integer TCP port when runtimeMode=host-docker`); } + if (runtimeMode === "host-docker" && (hostDockerEnvPath === null || !hostDockerEnvPath.startsWith("/"))) { + throw new Error(`${sub2apiConfigPath}.targets[${id}].hostDocker.envPath must be an absolute path when runtimeMode=host-docker`); + } return { id, @@ -114,6 +121,9 @@ export function codexPoolRuntimeTarget(targetId?: string): CodexPoolRuntimeTarge publicExposure, appSecretName: runtimeConfig.appSecretName, secretsRoot: runtimeConfig.secretsRoot, + appSourceRef: runtimeConfig.appSourceRef, + hostDockerAppPort, + hostDockerEnvPath, sentinelEnabled, sentinelImageBuild, egressProxy, diff --git a/scripts/src/platform-infra-sub2api-codex/types.ts b/scripts/src/platform-infra-sub2api-codex/types.ts index f5e1ec2f..f62e868e 100644 --- a/scripts/src/platform-infra-sub2api-codex/types.ts +++ b/scripts/src/platform-infra-sub2api-codex/types.ts @@ -88,6 +88,9 @@ export interface CodexPoolRuntimeTarget { publicExposure: CodexPoolRuntimePublicExposure | null; appSecretName: string; secretsRoot: string; + appSourceRef: string; + hostDockerAppPort: number | null; + hostDockerEnvPath: string | null; sentinelEnabled: boolean; sentinelImageBuild: { baseImageCachePolicy: "pull" | "local-if-present";