Merge pull request #279 from pikasTech/fix/sub2api-trace-report

feat: add Sub2API request trace report
2026-06-12 08:01:48 +08:00
parent 3a67bc3432 15f4c5870a
commit 54ce2e8bf0
4 changed files with 597 additions and 3 deletions
@@ -19,6 +19,14 @@ bun scripts/cli.ts platform-infra sub2api codex-pool sentinel-report

 需要机器处理或完整字段时再加 `--raw`；需要更多最近运行记录时加 `--events N`。

+追溯某个 Codex/Sub2API request id 的中断、上游账号、切号、临时不可调度、账号选择失败和同窗口账号池信号时，优先使用低噪声 trace 报表，不要先手写 `kubectl logs | grep`：
+
+```bash
+bun scripts/cli.ts platform-infra sub2api codex-pool trace --request-id <requestId>
+```
+
+默认输出类似 k8s/ps 的短表；机器处理用 `--raw` 读取 `.data.trace.*`；需要审计原始匹配日志时加 `--show-lines`；默认搜索范围是 `--since 24h --tail 20000`，需要扩大搜索范围时调大这两个值，例如 `--since 48h --tail 50000`。该命令只读：读取 Sub2API 日志、账号快照和 admin API 元数据，不改 `schedulable`、不清 runtime backoff、不中断请求。
+
 ## 先读边界

 - 仓库长期开发边界见 `docs/reference/platform-infra.md`，本 skill 承担日常操作手册。
@@ -61,6 +69,7 @@ bun scripts/cli.ts platform-infra sub2api validate
 bun scripts/cli.ts platform-infra sub2api codex-pool plan
 bun scripts/cli.ts platform-infra sub2api codex-pool sync --confirm
 bun scripts/cli.ts platform-infra sub2api codex-pool validate
+bun scripts/cli.ts platform-infra sub2api codex-pool trace --request-id <requestId>
 bun scripts/cli.ts platform-infra sub2api codex-pool sentinel-image status
 bun scripts/cli.ts platform-infra sub2api codex-pool sentinel-image build --confirm
 bun scripts/cli.ts platform-infra sub2api codex-pool sentinel-probe --account unidesk-codex-hy --confirm
@@ -102,6 +111,8 @@ bun scripts/cli.ts platform-infra sub2api codex-pool cleanup-probes --confirm

 `sync --confirm` 同时会按 YAML 渲染账号级哨兵资源，并在 monitor 开启时先确保可复用哨兵镜像存在。当前目标是 `sentinel.monitor.enabled=true` + `sentinel.actions.enabled=true` 的 marker-only 自动冻结/恢复；不要手工 patch CronJob、Secret 或 Sub2API account。若 YAML 新增账号或修改 profile/base URL/API key fingerprint/upstream User-Agent/Responses WebSocket mode，sync 会从变更前 runtime state 写入 pending probe 记录并立即安排 sentinel probe，但默认仍保持该 account 可调度；只有实际 marker probe 非命中或已有 active quarantine 才会冻结账号。sentinel 冻结/恢复只改 `schedulable=false|true`，不得顺手调用 Sub2API `recover-state` 清除请求路径临时不可调度或其他 runtime backoff。无关账号的既有成功/失败退避不能被重置。若 YAML 下调失败冻结最大窗口，sync 会把仍 active 的旧冻结状态迁移到当前最大窗口内并立即安排 recovery probe，但不会直接解冻。若怀疑某个账号被误判，先用 `codex-pool sentinel-probe --account <accountName> --confirm` 立即触发该账号测量；该命令从现有 CronJob 模板派生一次性 Job，复用同一份 Secret、ConfigMap、OpenAI SDK probe、token/cost 账本和冻结/恢复状态机。

+`trace --request-id <requestId>` 是只读 request 追溯报表，不触发 probe、不修改账号。默认输出请求开始/最终状态、failover、`account_select_failed`、窗口内 `account_temp_unschedulable`、admin schedulable 写入计数和当前账号快照；`reason=failover-attempted-no-candidate` 表示 Sub2API 已进入自动切号，但排除当前失败账号后没有可用候选。需要机器处理时使用 `--raw`，需要原始匹配行时加 `--show-lines`。
+
 `sentinel-report` 是只读低噪声报表，不触发 probe、不修改账号。默认输出类似 `ps` 的文本表，展示每个账号的探测次数、最近 marker/HTTP/动作、冻结 TTL、成功退避、下一次 probe 和最近 run 事件；需要机器处理时使用 `sentinel-report --raw`。

 `sync --confirm` 和 `validate` 可能超过单次 SSH/runtime 短连接窗口。必须继续使用 `bun scripts/cli.ts platform-infra sub2api codex-pool ...`，由 CLI 在 G14 远端提交作业并短轮询状态；不要改用裸 `trans G14:k3s script` 等一个长连接等待完整结果。若看到 `UNIDESK_SSH_RUNTIME_TIMEOUT`，先按 `docs/reference/platform-infra.md` 的规则处理为控制面可见性问题，修 CLI/job/poll 或重跑受控命令，不要手工 patch Sub2API credentials 或源码。
@@ -182,6 +193,7 @@ bun scripts/cli.ts platform-infra sub2api codex-pool configure-local --confirm
 ## 排障

 - Codex pool 哨兵、账号冻结/恢复、marker-only 判断或 probe 周期看不清：第一步跑 `bun scripts/cli.ts platform-infra sub2api codex-pool sentinel-report`。这个报表是主观察面；只有报表缺字段或需要底层证据时，才继续看 `--raw`、CronJob log、state ConfigMap 或 Sub2API 管理 UI。若看到“临时不可调度状态”且包含规则序号/匹配关键词，检查 Sub2API `account_temp_unschedulable` 日志和账号 `temp_unschedulable_*` 字段；sentinel 只解释 `schedulable=false` 的 active quarantine，不解释这类内置临时冷却。
+- 单个 request id 报 502/503/中断/没有自动切号：第一步跑 `bun scripts/cli.ts platform-infra sub2api codex-pool trace --request-id <requestId>`。先看 `outcome`、`reason`、`FAILOVER`、`SELECT-FAILED`、`ACCOUNT SIGNALS` 和 `WINDOW STATS`；只有 trace 报表缺字段或需要审计原始日志时，才加 `--show-lines` 或 `--raw`。若 `reason=failover-attempted-no-candidate`，说明切号动作已发生，但 scheduler 在排除失败账号后没有可用候选；继续用 `sentinel-report` 和 `validate --full` 区分 sentinel quarantine、request-path temp-unschedulable、账号 status 或容量耗尽。
 - profile invalid：先修 `~/.codex/config.toml.<profile>` 的 `base_url`、`wire_api`、`model` 或 `auth.json.<profile>` 的 API key；不要在 YAML 中写密钥。
 - Sub2API 卡在 `wait-postgres` / `wait-redis` 或服务内大量 `context deadline exceeded`：先跑 `sub2api status` 看 `networkPolicy.ok`，再跑 `sub2api validate` 看 `postgresCrossPodPgIsReady` / `redisCrossPodPing`；缺失或异常时用 `sub2api apply --confirm` 恢复受控 `NetworkPolicy/allow-all`，不要保留手工 iptables bypass 作为长期修复。
 - pool key 401：跑 `codex-pool sync --confirm` 重建 Sub2API key 与 k3s Secret 绑定，再跑 `codex-pool validate`。
@@ -606,6 +606,7 @@ function platformInfraHelpSummary(): unknown {
      "bun scripts/cli.ts platform-infra sub2api plan",
      "bun scripts/cli.ts platform-infra sub2api status [--full|--raw]",
      "bun scripts/cli.ts platform-infra sub2api codex-pool validate",
+      "bun scripts/cli.ts platform-infra sub2api codex-pool trace --request-id <requestId>",
      "bun scripts/cli.ts platform-infra sub2api codex-pool sentinel-image status",
      "bun scripts/cli.ts platform-infra sub2api codex-pool sentinel-probe --account unidesk-codex-hy --confirm",
      "bun scripts/cli.ts platform-infra sub2api codex-pool sentinel-report",
@@ -56,6 +56,14 @@ interface SentinelReportOptions extends DisclosureOptions {
  events: number;
 }

+/** Options accepted by the `codex-pool trace` sub-command, as produced by parseTraceOptions. */
+interface TraceOptions extends DisclosureOptions {
+  /** Request id to trace; parseTraceOptions rejects a missing or malformed id before returning. */
+  requestId: string | null;
+  /** Log look-back forwarded as the kubectl `--since` value (parser default "24h"). */
+  since: string;
+  /** Log line cap forwarded as the kubectl `--tail` value (parser default 20000, accepted range 100-200000). */
+  tail: number;
+  /** Seconds of context kept around the matched lines (parser default 300, accepted range 0-3600). */
+  contextSeconds: number;
+  /** When true the report also carries a bounded list of the raw matched log lines. */
+  showLines: boolean;
+}
+
 interface SentinelImageOptions extends DisclosureOptions {
  action: "status" | "build";
  confirm: boolean;
@@ -192,12 +200,13 @@ interface CodexLocalConsumerTomlOptions {
 export function codexPoolHelp(): unknown {
  const pool = readCodexPoolConfig();
  return {
-    command: "platform-infra sub2api codex-pool plan|sync|validate|sentinel-image|sentinel-probe|sentinel-report|cleanup-probes|expose|configure-local",
-    output: "json, except sentinel-report defaults to a ps-like text table",
+    command: "platform-infra sub2api codex-pool plan|sync|validate|trace|sentinel-image|sentinel-probe|sentinel-report|cleanup-probes|expose|configure-local",
+    output: "json, except trace and sentinel-report default to low-noise text tables",
    usage: [
      "bun scripts/cli.ts platform-infra sub2api codex-pool plan",
      "bun scripts/cli.ts platform-infra sub2api codex-pool sync --confirm [--prune-removed]",
      "bun scripts/cli.ts platform-infra sub2api codex-pool validate [--full|--raw]",
+      "bun scripts/cli.ts platform-infra sub2api codex-pool trace --request-id <requestId> [--since 24h|--tail 20000|--context-seconds 300|--show-lines|--raw]",
      "bun scripts/cli.ts platform-infra sub2api codex-pool sentinel-image status",
      "bun scripts/cli.ts platform-infra sub2api codex-pool sentinel-image build --confirm",
      "bun scripts/cli.ts platform-infra sub2api codex-pool sentinel-probe --account unidesk-codex-hy --confirm",
@@ -229,6 +238,7 @@ export async function runCodexPoolCommand(config: UniDeskConfig, args: string[])
  if (action === "plan") return codexPoolPlan(parseDisclosureOptions(args.slice(1)));
  if (action === "sync") return await codexPoolSync(config, parseSyncOptions(args.slice(1)));
  if (action === "validate") return await codexPoolValidate(config, parseDisclosureOptions(args.slice(1)));
+  if (action === "trace") return await codexPoolTrace(config, parseTraceOptions(args.slice(1)));
  if (action === "sentinel-image") return await codexPoolSentinelImage(config, parseSentinelImageOptions(args.slice(1)));
  if (action === "sentinel-probe") return await codexPoolSentinelProbe(config, parseSentinelProbeOptions(args.slice(1)));
  if (action === "sentinel-report") return await codexPoolSentinelReport(config, parseSentinelReportOptions(args.slice(1)));
@@ -350,6 +360,91 @@ function parseSentinelReportOptions(args: string[]): SentinelReportOptions {
  return { ...disclosure, events };
 }

+/**
+ * Parses the arguments of `codex-pool trace` into a TraceOptions record.
+ *
+ * Value options (`--request-id`/`--id`, `--since`, `--tail`, `--context-seconds`)
+ * are accepted both as `--flag value` and `--flag=value`. `--full` and `--raw`
+ * are forwarded to parseDisclosureOptions; `--show-lines` is a boolean switch.
+ *
+ * @param args Arguments following the `trace` action.
+ * @returns The parsed options with defaults since="24h", tail=20000, contextSeconds=300.
+ * @throws Error on an unknown option, a missing value, or a missing/malformed request id.
+ */
+function parseTraceOptions(args: string[]): TraceOptions {
+  let requestId: string | null = null;
+  let since = "24h";
+  let tail = 20_000;
+  let contextSeconds = 300;
+  let showLines = false;
+  const disclosureArgs: string[] = [];
+  let cursor = 0;
+
+  // Resolves the value of `flag` when `arg` is exactly `flag` (the value is the
+  // next argument, which is consumed) or `flag=value`; yields undefined otherwise.
+  const valueFor = (arg: string, flag: string, missingMessage: string): string | undefined => {
+    if (arg === flag) {
+      const next = args[cursor + 1];
+      if (next === undefined || next.startsWith("--")) throw new Error(missingMessage);
+      cursor += 1;
+      return next;
+    }
+    const inlinePrefix = `${flag}=`;
+    return arg.startsWith(inlinePrefix) ? arg.slice(inlinePrefix.length) : undefined;
+  };
+
+  for (; cursor < args.length; cursor += 1) {
+    const arg = args[cursor]!;
+    if (arg === "--full" || arg === "--raw") {
+      disclosureArgs.push(arg);
+      continue;
+    }
+    if (arg === "--show-lines") {
+      showLines = true;
+      continue;
+    }
+    const idValue = valueFor(arg, "--request-id", `${arg} requires a request id`)
+      ?? valueFor(arg, "--id", `${arg} requires a request id`);
+    if (idValue !== undefined) {
+      requestId = idValue.trim();
+      continue;
+    }
+    const sinceValue = valueFor(arg, "--since", "--since requires a kubectl duration such as 24h or 90m");
+    if (sinceValue !== undefined) {
+      since = parseKubectlDuration(sinceValue);
+      continue;
+    }
+    const tailValue = valueFor(arg, "--tail", "--tail requires an integer");
+    if (tailValue !== undefined) {
+      tail = parseTraceLimit(tailValue, "--tail", 100, 200_000);
+      continue;
+    }
+    const contextValue = valueFor(arg, "--context-seconds", "--context-seconds requires an integer");
+    if (contextValue !== undefined) {
+      contextSeconds = parseTraceLimit(contextValue, "--context-seconds", 0, 3600);
+      continue;
+    }
+    throw new Error(`unsupported option: ${arg}`);
+  }
+
+  if (requestId === null || requestId.length === 0) throw new Error("trace requires --request-id <requestId>");
+  if (!/^[A-Za-z0-9_.:-]{8,128}$/u.test(requestId)) throw new Error("--request-id has an unsupported format");
+  const disclosure = parseDisclosureOptions(disclosureArgs);
+  return { ...disclosure, requestId, since, tail, contextSeconds, showLines };
+}
+
+/**
+ * Validates a `--since` value before it is forwarded to kubectl.
+ *
+ * Only a single positive integer followed by `s`, `m` or `h` is accepted
+ * (surrounding whitespace is ignored).
+ *
+ * @param raw The user-supplied duration.
+ * @returns The trimmed duration.
+ * @throws Error when the value does not match that shape.
+ */
+function parseKubectlDuration(raw: string): string {
+  const candidate = raw.trim();
+  const isSimpleDuration = /^[1-9][0-9]*(?:s|m|h)$/u.test(candidate);
+  if (isSimpleDuration) return candidate;
+  throw new Error("--since must be a kubectl duration such as 24h, 90m, or 300s");
+}
+
+/**
+ * Parses a bounded integer option value for the trace command.
+ *
+ * The value must be a plain decimal integer literal (optional sign, digits
+ * only). `Number()` alone is too lenient here: it maps an empty or blank
+ * string to 0, so `--context-seconds=` would silently be accepted as 0, and it
+ * also accepts forms such as `1e3` or `0x10`.
+ *
+ * @param raw The user-supplied value.
+ * @param option Option name used in the error message.
+ * @param min Smallest accepted value (inclusive).
+ * @param max Largest accepted value (inclusive).
+ * @returns The parsed integer.
+ * @throws Error when the value is not a decimal integer within [min, max].
+ */
+function parseTraceLimit(raw: string, option: string, min: number, max: number): number {
+  const trimmed = raw.trim();
+  const value = /^[+-]?[0-9]+$/u.test(trimmed) ? Number(trimmed) : Number.NaN;
+  if (!Number.isInteger(value) || value < min || value > max) throw new Error(`${option} must be an integer from ${min} to ${max}`);
+  return value;
+}
+
 function readReportEventLimit(raw: string, option: string): number {
  const value = Number(raw);
  if (!Number.isInteger(value) || value < 1 || value > 200) throw new Error(`${option} must be an integer from 1 to 200`);
@@ -588,6 +683,28 @@ async function codexPoolValidate(config: UniDeskConfig, options: DisclosureOptio
  };
 }

+/**
+ * Runs the remote trace script on the G14 k3s route and returns its result.
+ *
+ * With `options.raw` the parsed remote JSON is returned as a structured record;
+ * otherwise it is rendered into the text report. `ok` requires both a zero
+ * remote exit code and `ok: true` in the parsed output.
+ *
+ * @param config CLI configuration passed through to `capture`.
+ * @param options Parsed trace options.
+ */
+async function codexPoolTrace(config: UniDeskConfig, options: TraceOptions): Promise<Record<string, unknown> | RenderedCliResult> {
+  const script = traceScript(readCodexPoolConfig(), options);
+  const result = await capture(config, g14K3sRoute, ["script"], script);
+  const parsed = parseJsonOutput(result.stdout);
+  const remoteSucceeded = result.exitCode === 0;
+  const ok = remoteSucceeded && boolField(parsed, "ok", false);
+
+  if (!options.raw) {
+    // Keep the full remote capture only when there is a failure to diagnose.
+    const remote = compactCapture(result, { full: !remoteSucceeded || parsed === null });
+    const text = renderTraceReport(parsed, {
+      requestId: options.requestId ?? "",
+      showLines: options.showLines,
+      remote,
+    });
+    return renderedCliResult(ok, "platform-infra sub2api codex-pool trace", text);
+  }
+
+  return {
+    ok,
+    action: "platform-infra-sub2api-codex-pool-trace",
+    remote: compactCapture(result, { full: true }),
+    trace: parsed,
+    valuesPrinted: false,
+  };
+}
+
 async function codexPoolSentinelReport(config: UniDeskConfig, options: SentinelReportOptions): Promise<Record<string, unknown> | RenderedCliResult> {
  const pool = readCodexPoolConfig();
  const result = await capture(config, g14K3sRoute, ["script"], sentinelReportScript(pool, options.events));
@@ -2077,6 +2194,159 @@ function renderSentinelReport(
  return lines.join("\n");
 }

+/**
+ * Renders the parsed remote trace result as a low-noise text report.
+ *
+ * Sections: a header line, REQUEST, FINAL, optional WARNING lines, then the
+ * FAILOVER, SELECT-FAILED and ACCOUNT SIGNALS tables when they have rows,
+ * WINDOW STATS (always), CURRENT ACCOUNTS (first 20 rows) and, when requested,
+ * RAW LINES. When `parsed` is null only a short "unavailable" summary of the
+ * remote capture is produced.
+ *
+ * WARNING lines are emitted when the remote log fetch reported a non-zero exit
+ * code or the account snapshot could not be read; without them those failures
+ * would look like a plain "not-found" trace or a silently missing accounts table.
+ *
+ * @param parsed Parsed JSON emitted by the remote trace script, or null if it could not be parsed.
+ * @param context Request id fallback, the show-lines switch and the compacted remote capture.
+ * @returns The report text, lines joined with "\n".
+ */
+function renderTraceReport(
+  parsed: Record<string, unknown> | null,
+  context: { requestId: string; showLines: boolean; remote: Record<string, unknown> },
+): string {
+  if (parsed === null) {
+    return [
+      `SUB2API TRACE ${context.requestId} unavailable`,
+      `remote_exit=${context.remote.exitCode ?? "?"} stdout_bytes=${context.remote.stdoutBytes ?? "?"} stderr_bytes=${context.remote.stderrBytes ?? "?"}`,
+      stringValue(context.remote.stderrTail) ?? stringValue(context.remote.stdoutTail) ?? "",
+    ].filter(Boolean).join("\n");
+  }
+  const summary = isRecord(parsed.summary) ? parsed.summary : {};
+  const request = isRecord(parsed.request) ? parsed.request : {};
+  const final = isRecord(parsed.final) ? parsed.final : {};
+  const window = isRecord(parsed.window) ? parsed.window : {};
+  const logs = isRecord(parsed.logs) ? parsed.logs : {};
+  const snapshotError = isRecord(parsed.accountSnapshotError) ? parsed.accountSnapshotError : null;
+  const failovers = recordArray(parsed.failovers);
+  const selectFailures = recordArray(parsed.selectFailures);
+  const upstreamErrors = recordArray(parsed.upstreamErrors);
+  const tempUnschedulable = recordArray(parsed.tempUnschedulable);
+  const windowStats = isRecord(parsed.windowStats) ? parsed.windowStats : {};
+  const accountSnapshot = recordArray(parsed.accountSnapshot);
+  const lines: string[] = [];
+  lines.push([
+    "SUB2API TRACE",
+    stringValue(parsed.requestId) ?? context.requestId,
+    `ok=${parsed.ok === true ? "true" : "false"}`,
+    `outcome=${stringValue(summary.outcome) ?? "-"}`,
+    `reason=${stringValue(summary.reason) ?? "-"}`,
+  ].join(" "));
+  lines.push([
+    "REQUEST",
+    `path=${request.path ?? final.path ?? "-"}`,
+    `model=${request.model ?? final.model ?? "-"}`,
+    `stream=${textValue(request.stream)}`,
+    `body=${textValue(request.bodyBytes)}`,
+    `first=${shortIso(summary.firstAt)}`,
+    `last=${shortIso(summary.lastAt)}`,
+  ].join(" "));
+  lines.push([
+    "FINAL",
+    `status=${textValue(final.statusCode)}`,
+    `account=${formatAccountRef(final)}`,
+    `latency_ms=${textValue(final.latencyMs)}`,
+    `events=${textValue(summary.eventCount)}`,
+    `window=${window.beforeSeconds ?? "?"}s/${window.afterSeconds ?? "?"}s`,
+  ].join(" "));
+  // A failed log fetch yields zero matches; say so instead of letting it read as "not-found".
+  if (typeof logs.exitCode === "number" && logs.exitCode !== 0) {
+    lines.push(`WARNING log fetch failed exit=${logs.exitCode} ${shorten(stringValue(logs.stderrTail) ?? "-", 200)}`);
+  }
+  // The CURRENT ACCOUNTS table is omitted when the snapshot is empty; explain why when it failed.
+  if (snapshotError !== null) {
+    lines.push(`WARNING account snapshot unavailable ${shorten(stringValue(snapshotError.error) ?? "-", 200)}`);
+  }
+  if (failovers.length > 0) {
+    lines.push("");
+    lines.push("FAILOVER");
+    lines.push(renderTable([
+      ["AT", "ACCOUNT", "UPSTREAM", "SWITCH", "MAX"],
+      ...failovers.map((item) => [
+        shortIso(item.at),
+        formatAccountRef(item),
+        textValue(item.upstreamStatus),
+        textValue(item.switchCount),
+        textValue(item.maxSwitches),
+      ]),
+    ]));
+  }
+  if (selectFailures.length > 0) {
+    lines.push("");
+    lines.push("SELECT-FAILED");
+    lines.push(renderTable([
+      ["AT", "ERROR", "EXCLUDED"],
+      ...selectFailures.map((item) => [
+        shortIso(item.at),
+        shorten(stringValue(item.error) ?? "-", 56),
+        textValue(item.excludedAccountCount),
+      ]),
+    ]));
+  }
+  if (upstreamErrors.length > 0 || tempUnschedulable.length > 0) {
+    lines.push("");
+    lines.push("ACCOUNT SIGNALS");
+    lines.push(renderTable([
+      ["TYPE", "PHASE", "AT", "ACCOUNT", "STATUS", "RULE", "UNTIL", "DETAIL"],
+      ...upstreamErrors.map((item) => [
+        "upstream-error",
+        textValue(item.phase),
+        shortIso(item.at),
+        formatAccountRef(item),
+        textValue(item.statusCode),
+        "-",
+        "-",
+        shorten(stringValue(item.error) ?? "-", 36),
+      ]),
+      ...tempUnschedulable.map((item) => [
+        "temp-unsched",
+        textValue(item.phase),
+        shortIso(item.at),
+        formatAccountRef(item),
+        textValue(item.statusCode),
+        textValue(item.ruleIndex),
+        shortIso(item.until),
+        shorten(stringValue(item.reason) ?? stringValue(item.matchedKeyword) ?? "-", 36),
+      ]),
+    ]));
+  }
+  lines.push("");
+  lines.push("WINDOW STATS");
+  lines.push(renderTable([
+    ["MATCH", "EVENTS", "FINAL_4XX_5XX", "FAILOVER", "SELECT_FAIL", "TEMP_UNSCHED", "ADMIN_SCHED"],
+    [
+      textValue(windowStats.matchedLines),
+      textValue(windowStats.eventCount),
+      textValue(windowStats.finalErrorCount),
+      textValue(windowStats.failoverCount),
+      textValue(windowStats.selectFailedCount),
+      textValue(windowStats.tempUnschedulableCount),
+      textValue(windowStats.adminSchedulableCount),
+    ],
+  ]));
+  if (accountSnapshot.length > 0) {
+    lines.push("");
+    lines.push("CURRENT ACCOUNTS");
+    lines.push(renderTable([
+      ["ID", "ACCOUNT", "SCHED", "STATUS", "CONC", "TEMP_UNTIL"],
+      // Cap the table so a large pool does not drown the trace itself.
+      ...accountSnapshot.slice(0, 20).map((item) => [
+        textValue(item.accountId),
+        shorten(stringValue(item.accountName) ?? "-", 32),
+        textValue(item.schedulable),
+        textValue(item.status),
+        textValue(item.concurrency),
+        shortIso(item.tempUnschedulableUntil),
+      ]),
+    ]));
+  }
+  if (context.showLines || parsed.showLines === true) {
+    const rawLines = recordArray(parsed.rawLines);
+    if (rawLines.length > 0) {
+      lines.push("");
+      lines.push("RAW LINES");
+      for (const item of rawLines) {
+        lines.push(shorten(stringValue(item.line) ?? "", 1000));
+      }
+    }
+  }
+  lines.push("");
+  lines.push("Raw: bun scripts/cli.ts platform-infra sub2api codex-pool trace --request-id <id> --raw");
+  lines.push("Lines: add --show-lines for bounded matched log lines.");
+  return lines.join("\n");
+}
+
+/**
+ * Formats the account of a trace event as `name#id`, falling back to whichever
+ * of the two is available (the id text is "-" when there is no id).
+ *
+ * @param item Event or account record carrying `accountName` / `accountId`.
+ */
+function formatAccountRef(item: Record<string, unknown>): string {
+  const name = stringValue(item.accountName);
+  const id = textValue(item.accountId);
+  if (name === null) return id;
+  return id === "-" ? name : `${name}#${id}`;
+}
+
 function renderTable(rows: string[][]): string {
  if (rows.length === 0) return "";
  const widths: number[] = [];
@@ -2947,6 +3217,17 @@ function validateScript(pool: CodexPoolConfig): string {
  return remotePythonScript("validate", "", pool);
 }

+/**
+ * Builds the remote script for trace mode, passing the trace options to it as
+ * a base64-encoded JSON payload.
+ *
+ * @param pool Codex pool configuration forwarded to remotePythonScript.
+ * @param options Parsed trace options; only the trace-specific fields are sent.
+ */
+function traceScript(pool: CodexPoolConfig, options: TraceOptions): string {
+  const { requestId, since, tail, contextSeconds, showLines } = options;
+  const payloadJson = JSON.stringify({ requestId, since, tail, contextSeconds, showLines });
+  const encoded = Buffer.from(payloadJson, "utf8").toString("base64");
+  return remotePythonScript("trace", encoded, pool);
+}
+
 function sentinelProbeScript(payload: unknown, pool: CodexPoolConfig): string {
  const encoded = Buffer.from(JSON.stringify(payload), "utf8").toString("base64");
  return remotePythonScript("sentinel-probe", encoded, pool);
@@ -3339,7 +3620,7 @@ function desiredAccountTempUnschedulableMap(pool: CodexPoolConfig): Record<strin
  return policies;
 }

-function remotePythonScript(mode: "sync" | "validate" | "cleanup-probes" | "sentinel-probe", encodedPayload: string, pool: CodexPoolConfig): string {
+function remotePythonScript(mode: "sync" | "validate" | "trace" | "cleanup-probes" | "sentinel-probe", encodedPayload: string, pool: CodexPoolConfig): string {
  return `
 set -u
 python3 - <<'PY'
@@ -5347,6 +5628,302 @@ def run_validate():
        "validation": {"gatewayModels": gateway, "gatewayResponses": responses_smoke, "gatewayResponsesRecent": responses_evidence, "gatewayCompactRecent": compact_evidence},
    }

+def parse_log_line(line):
+    json_start = line.find("{")
+    if json_start < 0:
+        return None
+    prefix = line[:json_start].rstrip()
+    try:
+        item = json.loads(line[json_start:])
+    except Exception:
+        return None
+    if not isinstance(item, dict):
+        return None
+    at = None
+    parts = prefix.split()
+    if parts:
+        at = parts[0]
+    message = ""
+    if len(parts) >= 4:
+        message = " ".join(parts[3:])
+    elif len(parts) >= 3:
+        message = parts[2]
+    elif len(parts) >= 1:
+        message = parts[-1]
+    item["_at"] = at
+    item["_message"] = message
+    item["_line"] = line
+    return item
+
+def log_time_epoch(item):
+    at = item.get("_at") if isinstance(item, dict) else None
+    if not isinstance(at, str) or not at:
+        return None
+    try:
+        return datetime.strptime(at, "%Y-%m-%dT%H:%M:%S.%f%z").timestamp()
+    except Exception:
+        try:
+            return datetime.fromisoformat(at.replace("Z", "+00:00")).timestamp()
+        except Exception:
+            return None
+
+def event_base(item, account_names_by_id):
+    account_id = item.get("account_id")
+    if isinstance(account_id, str) and account_id.isdigit():
+        account_id = int(account_id)
+    account_name = account_names_by_id.get(account_id)
+    return {
+        "at": item.get("_at"),
+        "message": item.get("_message"),
+        "requestId": item.get("request_id"),
+        "clientRequestId": item.get("client_request_id"),
+        "path": item.get("path"),
+        "method": item.get("method"),
+        "model": item.get("model"),
+        "accountId": account_id,
+        "accountName": account_name,
+        "statusCode": item.get("status_code"),
+        "upstreamStatus": item.get("upstream_status"),
+        "latencyMs": item.get("latency_ms"),
+    }
+
+def classify_trace_event(item, account_names_by_id):
+    message = str(item.get("_message") or "")
+    event = event_base(item, account_names_by_id)
+    if "content_moderation.gateway_check_start" in message:
+        event.update({
+            "type": "request-start",
+            "stream": item.get("stream"),
+            "bodyBytes": item.get("body_bytes"),
+            "groupId": item.get("group_id"),
+            "groupName": item.get("group_name"),
+            "apiKeyName": item.get("api_key_name"),
+        })
+    elif "content_moderation.gateway_check_done" in message:
+        event.update({
+            "type": "gateway-check",
+            "allowed": item.get("allowed"),
+            "blocked": item.get("blocked"),
+            "action": item.get("action"),
+        })
+    elif "openai.upstream_failover_switching" in message:
+        event.update({
+            "type": "failover",
+            "switchCount": item.get("switch_count"),
+            "maxSwitches": item.get("max_switches"),
+        })
+    elif "openai.account_select_failed" in message:
+        event.update({
+            "type": "select-failed",
+            "error": item.get("error"),
+            "excludedAccountCount": item.get("excluded_account_count"),
+        })
+    elif "account_upstream_error" in message:
+        event.update({
+            "type": "upstream-error",
+            "error": item.get("error"),
+        })
+    elif "account_temp_unschedulable" in message:
+        event.update({
+            "type": "temp-unschedulable",
+            "until": item.get("until") or item.get("temp_unschedulable_until"),
+            "ruleIndex": item.get("rule_index"),
+            "matchedKeyword": item.get("matched_keyword"),
+            "reason": item.get("reason") or item.get("error"),
+        })
+    elif "http request completed" in message:
+        event.update({
+            "type": "final",
+            "clientIp": item.get("client_ip"),
+            "protocol": item.get("protocol"),
+            "platform": item.get("platform"),
+            "completedAt": item.get("completed_at"),
+        })
+    elif "admin account schedulable updated" in message or "account schedulable updated" in message or "/schedulable" in str(item.get("path") or ""):
+        event.update({
+            "type": "admin-schedulable",
+            "schedulable": item.get("schedulable"),
+        })
+    else:
+        event.update({"type": "other"})
+    return event
+
+def with_trace_phase(event, first_epoch, last_epoch):
+    epoch = None
+    at = event.get("at") if isinstance(event, dict) else None
+    if isinstance(at, str) and at:
+        epoch = log_time_epoch({"_at": at})
+    if epoch is None or first_epoch is None:
+        phase = "unknown"
+    elif epoch < first_epoch:
+        phase = "before-request"
+    elif last_epoch is not None and epoch > last_epoch:
+        phase = "after-request"
+    else:
+        phase = "during-request"
+    event["phase"] = phase
+    return event
+
+def account_snapshot_from_runtime(token):
+    try:
+        accounts = list_accounts(token)
+    except Exception as exc:
+        return [], {"error": str(exc)}
+    rows = []
+    for item in accounts:
+        if not isinstance(item, dict):
+            continue
+        rows.append({
+            "accountId": item.get("id"),
+            "accountName": item.get("name"),
+            "schedulable": item.get("schedulable"),
+            "status": item.get("status"),
+            "concurrency": item.get("concurrency"),
+            "priority": item.get("priority"),
+            "tempUnschedulableUntil": item.get("temp_unschedulable_until") or item.get("tempUnschedulableUntil"),
+            "tempUnschedulableSet": (item.get("temp_unschedulable_until") or item.get("tempUnschedulableUntil")) is not None or bool(item.get("temp_unschedulable_reason") or item.get("tempUnschedulableReason")),
+        })
+    rows.sort(key=lambda row: (str(row.get("accountName") or ""), int(row.get("accountId") or 0)))
+    return rows, None
+
+def trace_reason(events, final_event):
+    failovers = [item for item in events if item.get("type") == "failover"]
+    select_failures = [item for item in events if item.get("type") == "select-failed"]
+    upstream_errors = [item for item in events if item.get("type") == "upstream-error"]
+    if failovers and select_failures:
+        return "failover-attempted-no-candidate"
+    if failovers:
+        return "failover-attempted"
+    if select_failures:
+        return "account-select-failed"
+    if upstream_errors:
+        return "upstream-error"
+    if isinstance(final_event, dict) and isinstance(final_event.get("statusCode"), int) and final_event.get("statusCode") >= 400:
+        return "final-http-error"
+    if isinstance(final_event, dict) and isinstance(final_event.get("statusCode"), int):
+        return "completed"
+    return "unknown"
+
+def run_trace():
+    payload = json.loads(base64.b64decode(PAYLOAD_B64).decode("utf-8")) if PAYLOAD_B64 else {}
+    request_id = payload.get("requestId")
+    since = payload.get("since") or "24h"
+    tail = int(payload.get("tail") or 20000)
+    context_seconds = int(payload.get("contextSeconds") or 300)
+    show_lines = bool(payload.get("showLines"))
+    if not isinstance(request_id, str) or not request_id:
+        raise RuntimeError("trace payload missing requestId")
+    admin_email, token, admin_compliance = login()
+    account_snapshot, account_snapshot_error = account_snapshot_from_runtime(token)
+    account_names_by_id = {}
+    for row in account_snapshot:
+        account_id = row.get("accountId")
+        if isinstance(account_id, str) and account_id.isdigit():
+            account_id = int(account_id)
+        if isinstance(account_id, int) and isinstance(row.get("accountName"), str):
+            account_names_by_id[account_id] = row.get("accountName")
+    proc = kubectl(["-n", NAMESPACE, "logs", "deployment/sub2api", f"--since={since}", f"--tail={tail}"])
+    stdout = proc.stdout.decode("utf-8", errors="replace")
+    parsed_lines = []
+    matched = []
+    for line in stdout.splitlines():
+        parsed = parse_log_line(line)
+        if parsed is None:
+            continue
+        parsed_lines.append(parsed)
+        if request_id in line:
+            matched.append(parsed)
+    first_epoch = None
+    last_epoch = None
+    for item in matched:
+        epoch = log_time_epoch(item)
+        if epoch is None:
+            continue
+        first_epoch = epoch if first_epoch is None else min(first_epoch, epoch)
+        last_epoch = epoch if last_epoch is None else max(last_epoch, epoch)
+    window_lines = []
+    if first_epoch is not None:
+        start_epoch = first_epoch - context_seconds
+        end_epoch = (last_epoch if last_epoch is not None else first_epoch) + context_seconds
+        for item in parsed_lines:
+            epoch = log_time_epoch(item)
+            if epoch is not None and start_epoch <= epoch <= end_epoch:
+                window_lines.append(item)
+    else:
+        window_lines = matched
+    events = [classify_trace_event(item, account_names_by_id) for item in matched]
+    request_start = next((item for item in events if item.get("type") == "request-start"), None)
+    final_event = next((item for item in reversed(events) if item.get("type") == "final"), None)
+    failovers = [item for item in events if item.get("type") == "failover"]
+    select_failures = [item for item in events if item.get("type") == "select-failed"]
+    upstream_errors = [item for item in events if item.get("type") == "upstream-error"]
+    temp_unsched = [with_trace_phase(classify_trace_event(item, account_names_by_id), first_epoch, last_epoch) for item in window_lines if "account_temp_unschedulable" in str(item.get("_message") or "")]
+    admin_sched = [with_trace_phase(classify_trace_event(item, account_names_by_id), first_epoch, last_epoch) for item in window_lines if ("schedulable" in str(item.get("_message") or "") or "/schedulable" in str(item.get("path") or ""))]
+    window_events = [classify_trace_event(item, account_names_by_id) for item in window_lines]
+    final_errors = [item for item in window_events if item.get("type") == "final" and isinstance(item.get("statusCode"), int) and item.get("statusCode") >= 400]
+    window_failovers = [item for item in window_events if item.get("type") == "failover"]
+    window_select_failures = [item for item in window_events if item.get("type") == "select-failed"]
+    reason = trace_reason(events, final_event)
+    if not matched:
+        outcome = "not-found"
+    elif isinstance(final_event, dict) and isinstance(final_event.get("statusCode"), int) and final_event.get("statusCode") < 400:
+        outcome = "succeeded"
+    elif isinstance(final_event, dict):
+        outcome = "failed"
+    else:
+        outcome = "incomplete"
+    return {
+        "ok": proc.returncode == 0 and len(matched) > 0,
+        "mode": "trace",
+        "namespace": NAMESPACE,
+        "serviceDns": SERVICE_DNS,
+        "appPod": APP_POD,
+        "admin": {"email": admin_email, "tokenPrinted": False, "compliance": admin_compliance},
+        "requestId": request_id,
+        "summary": {
+            "outcome": outcome,
+            "reason": reason,
+            "eventCount": len(events),
+            "matchedLineCount": len(matched),
+            "firstAt": events[0].get("at") if events else None,
+            "lastAt": events[-1].get("at") if events else None,
+        },
+        "window": {
+            "since": since,
+            "tail": tail,
+            "beforeSeconds": context_seconds,
+            "afterSeconds": context_seconds,
+            "lineCount": len(window_lines),
+        },
+        "request": request_start or {},
+        "final": final_event or {},
+        "events": events,
+        "failovers": failovers,
+        "selectFailures": select_failures,
+        "upstreamErrors": upstream_errors,
+        "tempUnschedulable": temp_unsched,
+        "adminSchedulable": admin_sched[-20:],
+        "windowStats": {
+            "matchedLines": len(matched),
+            "eventCount": len(window_events),
+            "finalErrorCount": len(final_errors),
+            "failoverCount": len(window_failovers),
+            "selectFailedCount": len(window_select_failures),
+            "tempUnschedulableCount": len(temp_unsched),
+            "adminSchedulableCount": len(admin_sched),
+        },
+        "accountSnapshot": account_snapshot,
+        "accountSnapshotError": account_snapshot_error,
+        "rawLines": [{"line": item.get("_line")} for item in matched[-30:]] if show_lines else [],
+        "showLines": show_lines,
+        "logs": {
+            "exitCode": proc.returncode,
+            "stderrTail": text(proc.stderr, 1000),
+            "stdoutLineCount": len(stdout.splitlines()),
+        },
+        "valuesPrinted": False,
+    }
+
 def parse_embedded_json(stdout):
    if not isinstance(stdout, str) or not stdout.strip():
        return None
@@ -5491,6 +6068,8 @@ def run_cleanup_probes():
 try:
    if MODE == "sync":
        result = run_sync()
+    elif MODE == "trace":
+        result = run_trace()
    elif MODE == "cleanup-probes":
        result = run_cleanup_probes()
    elif MODE == "sentinel-probe":
@@ -44,6 +44,7 @@ export function platformInfraHelp(): unknown {
      "bun scripts/cli.ts platform-infra sub2api codex-pool plan",
      "bun scripts/cli.ts platform-infra sub2api codex-pool sync --confirm",
      "bun scripts/cli.ts platform-infra sub2api codex-pool validate",
+      "bun scripts/cli.ts platform-infra sub2api codex-pool trace --request-id <requestId>",
      "bun scripts/cli.ts platform-infra sub2api codex-pool sentinel-image status",
      "bun scripts/cli.ts platform-infra sub2api codex-pool sentinel-probe --account unidesk-codex-hy --confirm",
    ],
@@ -62,6 +63,7 @@ export function platformInfraHelp(): unknown {
        "bun scripts/cli.ts platform-infra sub2api codex-pool plan",
        "bun scripts/cli.ts platform-infra sub2api codex-pool sync --confirm",
        "bun scripts/cli.ts platform-infra sub2api codex-pool validate",
+        "bun scripts/cli.ts platform-infra sub2api codex-pool trace --request-id <requestId>",
        "bun scripts/cli.ts platform-infra sub2api codex-pool sentinel-image status",
      ],
      module: "scripts/src/platform-infra-sub2api-codex.ts",