From 15f4c5870aa65d98d627bec08c2c2f9622496053 Mon Sep 17 00:00:00 2001 From: Codex Date: Thu, 11 Jun 2026 23:59:22 +0000 Subject: [PATCH] feat: add sub2api request trace report --- .agents/skills/unidesk-sub2api/SKILL.md | 12 + scripts/src/help.ts | 1 + scripts/src/platform-infra-sub2api-codex.ts | 585 +++++++++++++++++++- scripts/src/platform-infra.ts | 2 + 4 files changed, 597 insertions(+), 3 deletions(-) diff --git a/.agents/skills/unidesk-sub2api/SKILL.md b/.agents/skills/unidesk-sub2api/SKILL.md index b4adb2fd..e2291039 100644 --- a/.agents/skills/unidesk-sub2api/SKILL.md +++ b/.agents/skills/unidesk-sub2api/SKILL.md @@ -19,6 +19,14 @@ bun scripts/cli.ts platform-infra sub2api codex-pool sentinel-report 需要机器处理或完整字段时再加 `--raw`;需要更多最近运行记录时加 `--events N`。 +追溯某个 Codex/Sub2API request id 的中断、上游账号、切号、临时不可调度、账号选择失败和同窗口账号池信号时,优先使用低噪声 trace 报表,不要先手写 `kubectl logs | grep`: + +```bash +bun scripts/cli.ts platform-infra sub2api codex-pool trace --request-id +``` + +默认输出类似 k8s/ps 的短表;机器处理用 `--raw` 读取 `.data.trace.*`;需要审计原始匹配日志时加 `--show-lines`;需要扩大搜索范围时使用 `--since 24h --tail 50000`。该命令只读:读取 Sub2API 日志、账号快照和 admin API 元数据,不改 `schedulable`、不清 runtime backoff、不中断请求。 + ## 先读边界 - 仓库长期开发边界见 `docs/reference/platform-infra.md`,本 skill 承担日常操作手册。 @@ -61,6 +69,7 @@ bun scripts/cli.ts platform-infra sub2api validate bun scripts/cli.ts platform-infra sub2api codex-pool plan bun scripts/cli.ts platform-infra sub2api codex-pool sync --confirm bun scripts/cli.ts platform-infra sub2api codex-pool validate +bun scripts/cli.ts platform-infra sub2api codex-pool trace --request-id bun scripts/cli.ts platform-infra sub2api codex-pool sentinel-image status bun scripts/cli.ts platform-infra sub2api codex-pool sentinel-image build --confirm bun scripts/cli.ts platform-infra sub2api codex-pool sentinel-probe --account unidesk-codex-hy --confirm @@ -102,6 +111,8 @@ bun scripts/cli.ts platform-infra sub2api codex-pool cleanup-probes --confirm `sync --confirm` 同时会按 YAML 渲染账号级哨兵资源,并在 monitor 开启时先确保可复用哨兵镜像存在。当前目标是 `sentinel.monitor.enabled=true` + `sentinel.actions.enabled=true` 的 marker-only 自动冻结/恢复;不要手工 patch CronJob、Secret 或 Sub2API account。若 YAML 新增账号或修改 profile/base URL/API key fingerprint/upstream User-Agent/Responses WebSocket mode,sync 会从变更前 runtime state 写入 pending probe 记录并立即安排 sentinel probe,但默认仍保持该 account 可调度;只有实际 marker probe 非命中或已有 active quarantine 才会冻结账号。sentinel 冻结/恢复只改 `schedulable=false|true`,不得顺手调用 Sub2API `recover-state` 清除请求路径临时不可调度或其他 runtime backoff。无关账号的既有成功/失败退避不能被重置。若 YAML 下调失败冻结最大窗口,sync 会把仍 active 的旧冻结状态迁移到当前最大窗口内并立即安排 recovery probe,但不会直接解冻。若怀疑某个账号被误判,先用 `codex-pool sentinel-probe --account --confirm` 立即触发该账号测量;该命令从现有 CronJob 模板派生一次性 Job,复用同一份 Secret、ConfigMap、OpenAI SDK probe、token/cost 账本和冻结/恢复状态机。 +`trace --request-id ` 是只读 request 追溯报表,不触发 probe、不修改账号。默认输出请求开始/最终状态、failover、`account_select_failed`、窗口内 `account_temp_unschedulable`、admin schedulable 写入计数和当前账号快照;`reason=failover-attempted-no-candidate` 表示 Sub2API 已进入自动切号,但排除当前失败账号后没有可用候选。需要机器处理时使用 `--raw`,需要原始匹配行时加 `--show-lines`。 + `sentinel-report` 是只读低噪声报表,不触发 probe、不修改账号。默认输出类似 `ps` 的文本表,展示每个账号的探测次数、最近 marker/HTTP/动作、冻结 TTL、成功退避、下一次 probe 和最近 run 事件;需要机器处理时使用 `sentinel-report --raw`。 `sync --confirm` 和 `validate` 可能超过单次 SSH/runtime 短连接窗口。必须继续使用 `bun scripts/cli.ts platform-infra sub2api codex-pool ...`,由 CLI 在 G14 远端提交作业并短轮询状态;不要改用裸 `trans G14:k3s script` 等一个长连接等待完整结果。若看到 `UNIDESK_SSH_RUNTIME_TIMEOUT`,先按 `docs/reference/platform-infra.md` 的规则处理为控制面可见性问题,修 CLI/job/poll 或重跑受控命令,不要手工 patch Sub2API credentials 或源码。 @@ -182,6 +193,7 @@ bun scripts/cli.ts platform-infra sub2api codex-pool configure-local --confirm ## 排障 - Codex pool 哨兵、账号冻结/恢复、marker-only 判断或 probe 周期看不清:第一步跑 `bun scripts/cli.ts platform-infra sub2api codex-pool sentinel-report`。这个报表是主观察面;只有报表缺字段或需要底层证据时,才继续看 `--raw`、CronJob log、state ConfigMap 或 Sub2API 管理 UI。若看到“临时不可调度状态”且包含规则序号/匹配关键词,检查 Sub2API `account_temp_unschedulable` 日志和账号 `temp_unschedulable_*` 字段;sentinel 只解释 `schedulable=false` 的 active quarantine,不解释这类内置临时冷却。 +- 单个 request id 报 502/503/中断/没有自动切号:第一步跑 `bun scripts/cli.ts platform-infra sub2api codex-pool trace --request-id `。先看 `outcome`、`reason`、`FAILOVER`、`SELECT-FAILED`、`ACCOUNT SIGNALS` 和 `WINDOW STATS`;只有 trace 报表缺字段或需要审计原始日志时,才加 `--show-lines` 或 `--raw`。若 `reason=failover-attempted-no-candidate`,说明切号动作已发生,但 scheduler 在排除失败账号后没有可用候选;继续用 `sentinel-report` 和 `validate --full` 区分 sentinel quarantine、request-path temp-unschedulable、账号 status 或容量耗尽。 - profile invalid:先修 `~/.codex/config.toml.` 的 `base_url`、`wire_api`、`model` 或 `auth.json.` 的 API key;不要在 YAML 中写密钥。 - Sub2API 卡在 `wait-postgres` / `wait-redis` 或服务内大量 `context deadline exceeded`:先跑 `sub2api status` 看 `networkPolicy.ok`,再跑 `sub2api validate` 看 `postgresCrossPodPgIsReady` / `redisCrossPodPing`;缺失或异常时用 `sub2api apply --confirm` 恢复受控 `NetworkPolicy/allow-all`,不要保留手工 iptables bypass 作为长期修复。 - pool key 401:跑 `codex-pool sync --confirm` 重建 Sub2API key 与 k3s Secret 绑定,再跑 `codex-pool validate`。 diff --git a/scripts/src/help.ts b/scripts/src/help.ts index 3908c016..66019730 100644 --- a/scripts/src/help.ts +++ b/scripts/src/help.ts @@ -606,6 +606,7 @@ function platformInfraHelpSummary(): unknown { "bun scripts/cli.ts platform-infra sub2api plan", "bun scripts/cli.ts platform-infra sub2api status [--full|--raw]", "bun scripts/cli.ts platform-infra sub2api codex-pool validate", + "bun scripts/cli.ts platform-infra sub2api codex-pool trace --request-id ", "bun scripts/cli.ts platform-infra sub2api codex-pool sentinel-image status", "bun scripts/cli.ts platform-infra sub2api codex-pool sentinel-probe --account unidesk-codex-hy --confirm", "bun scripts/cli.ts platform-infra sub2api codex-pool sentinel-report", diff --git a/scripts/src/platform-infra-sub2api-codex.ts b/scripts/src/platform-infra-sub2api-codex.ts index 24c57499..22d163bb 100644 --- a/scripts/src/platform-infra-sub2api-codex.ts +++ b/scripts/src/platform-infra-sub2api-codex.ts @@ -56,6 +56,14 @@ interface SentinelReportOptions extends DisclosureOptions { events: number; } +interface TraceOptions extends DisclosureOptions { + requestId: string | null; + since: string; + tail: number; + contextSeconds: number; + showLines: boolean; +} + interface SentinelImageOptions extends DisclosureOptions { action: "status" | "build"; confirm: boolean; @@ -192,12 +200,13 @@ interface CodexLocalConsumerTomlOptions { export function codexPoolHelp(): unknown { const pool = readCodexPoolConfig(); return { - command: "platform-infra sub2api codex-pool plan|sync|validate|sentinel-image|sentinel-probe|sentinel-report|cleanup-probes|expose|configure-local", - output: "json, except sentinel-report defaults to a ps-like text table", + command: "platform-infra sub2api codex-pool plan|sync|validate|trace|sentinel-image|sentinel-probe|sentinel-report|cleanup-probes|expose|configure-local", + output: "json, except trace and sentinel-report default to low-noise text tables", usage: [ "bun scripts/cli.ts platform-infra sub2api codex-pool plan", "bun scripts/cli.ts platform-infra sub2api codex-pool sync --confirm [--prune-removed]", "bun scripts/cli.ts platform-infra sub2api codex-pool validate [--full|--raw]", + "bun scripts/cli.ts platform-infra sub2api codex-pool trace --request-id [--since 24h|--tail 20000|--context-seconds 300|--show-lines|--raw]", "bun scripts/cli.ts platform-infra sub2api codex-pool sentinel-image status", "bun scripts/cli.ts platform-infra sub2api codex-pool sentinel-image build --confirm", "bun scripts/cli.ts platform-infra sub2api codex-pool sentinel-probe --account unidesk-codex-hy --confirm", @@ -229,6 +238,7 @@ export async function runCodexPoolCommand(config: UniDeskConfig, args: string[]) if (action === "plan") return codexPoolPlan(parseDisclosureOptions(args.slice(1))); if (action === "sync") return await codexPoolSync(config, parseSyncOptions(args.slice(1))); if (action === "validate") return await codexPoolValidate(config, parseDisclosureOptions(args.slice(1))); + if (action === "trace") return await codexPoolTrace(config, parseTraceOptions(args.slice(1))); if (action === "sentinel-image") return await codexPoolSentinelImage(config, parseSentinelImageOptions(args.slice(1))); if (action === "sentinel-probe") return await codexPoolSentinelProbe(config, parseSentinelProbeOptions(args.slice(1))); if (action === "sentinel-report") return await codexPoolSentinelReport(config, parseSentinelReportOptions(args.slice(1))); @@ -350,6 +360,91 @@ function parseSentinelReportOptions(args: string[]): SentinelReportOptions { return { ...disclosure, events }; } +function parseTraceOptions(args: string[]): TraceOptions { + let requestId: string | null = null; + let since = "24h"; + let tail = 20_000; + let contextSeconds = 300; + let showLines = false; + const disclosureArgs: string[] = []; + for (let index = 0; index < args.length; index += 1) { + const arg = args[index]!; + if (arg === "--full" || arg === "--raw") { + disclosureArgs.push(arg); + continue; + } + if (arg === "--show-lines") { + showLines = true; + continue; + } + if (arg === "--request-id" || arg === "--id") { + const value = args[index + 1]; + if (value === undefined || value.startsWith("--")) throw new Error(`${arg} requires a request id`); + requestId = value.trim(); + index += 1; + continue; + } + if (arg.startsWith("--request-id=")) { + requestId = arg.slice("--request-id=".length).trim(); + continue; + } + if (arg.startsWith("--id=")) { + requestId = arg.slice("--id=".length).trim(); + continue; + } + if (arg === "--since") { + const value = args[index + 1]; + if (value === undefined || value.startsWith("--")) throw new Error("--since requires a kubectl duration such as 24h or 90m"); + since = parseKubectlDuration(value); + index += 1; + continue; + } + if (arg.startsWith("--since=")) { + since = parseKubectlDuration(arg.slice("--since=".length)); + continue; + } + if (arg === "--tail") { + const value = args[index + 1]; + if (value === undefined || value.startsWith("--")) throw new Error("--tail requires an integer"); + tail = parseTraceLimit(value, "--tail", 100, 200_000); + index += 1; + continue; + } + if (arg.startsWith("--tail=")) { + tail = parseTraceLimit(arg.slice("--tail=".length), "--tail", 100, 200_000); + continue; + } + if (arg === "--context-seconds") { + const value = args[index + 1]; + if (value === undefined || value.startsWith("--")) throw new Error("--context-seconds requires an integer"); + contextSeconds = parseTraceLimit(value, "--context-seconds", 0, 3600); + index += 1; + continue; + } + if (arg.startsWith("--context-seconds=")) { + contextSeconds = parseTraceLimit(arg.slice("--context-seconds=".length), "--context-seconds", 0, 3600); + continue; + } + throw new Error(`unsupported option: ${arg}`); + } + if (requestId === null || requestId.length === 0) throw new Error("trace requires --request-id "); + if (!/^[A-Za-z0-9_.:-]{8,128}$/u.test(requestId)) throw new Error("--request-id has an unsupported format"); + const disclosure = parseDisclosureOptions(disclosureArgs); + return { ...disclosure, requestId, since, tail, contextSeconds, showLines }; +} + +function parseKubectlDuration(raw: string): string { + const value = raw.trim(); + if (!/^[1-9][0-9]*(?:s|m|h)$/u.test(value)) throw new Error("--since must be a kubectl duration such as 24h, 90m, or 300s"); + return value; +} + +function parseTraceLimit(raw: string, option: string, min: number, max: number): number { + const value = Number(raw); + if (!Number.isInteger(value) || value < min || value > max) throw new Error(`${option} must be an integer from ${min} to ${max}`); + return value; +} + function readReportEventLimit(raw: string, option: string): number { const value = Number(raw); if (!Number.isInteger(value) || value < 1 || value > 200) throw new Error(`${option} must be an integer from 1 to 200`); @@ -588,6 +683,28 @@ async function codexPoolValidate(config: UniDeskConfig, options: DisclosureOptio }; } +async function codexPoolTrace(config: UniDeskConfig, options: TraceOptions): Promise | RenderedCliResult> { + const pool = readCodexPoolConfig(); + const result = await capture(config, g14K3sRoute, ["script"], traceScript(pool, options)); + const parsed = parseJsonOutput(result.stdout); + const ok = result.exitCode === 0 && boolField(parsed, "ok", false); + if (options.raw) { + return { + ok, + action: "platform-infra-sub2api-codex-pool-trace", + remote: compactCapture(result, { full: true }), + trace: parsed, + valuesPrinted: false, + }; + } + const text = renderTraceReport(parsed, { + requestId: options.requestId ?? "", + showLines: options.showLines, + remote: compactCapture(result, { full: result.exitCode !== 0 || parsed === null }), + }); + return renderedCliResult(ok, "platform-infra sub2api codex-pool trace", text); +} + async function codexPoolSentinelReport(config: UniDeskConfig, options: SentinelReportOptions): Promise | RenderedCliResult> { const pool = readCodexPoolConfig(); const result = await capture(config, g14K3sRoute, ["script"], sentinelReportScript(pool, options.events)); @@ -2077,6 +2194,159 @@ function renderSentinelReport( return lines.join("\n"); } +function renderTraceReport( + parsed: Record | null, + context: { requestId: string; showLines: boolean; remote: Record }, +): string { + if (parsed === null) { + return [ + `SUB2API TRACE ${context.requestId} unavailable`, + `remote_exit=${context.remote.exitCode ?? "?"} stdout_bytes=${context.remote.stdoutBytes ?? "?"} stderr_bytes=${context.remote.stderrBytes ?? "?"}`, + stringValue(context.remote.stderrTail) ?? stringValue(context.remote.stdoutTail) ?? "", + ].filter(Boolean).join("\n"); + } + const summary = isRecord(parsed.summary) ? parsed.summary : {}; + const request = isRecord(parsed.request) ? parsed.request : {}; + const final = isRecord(parsed.final) ? parsed.final : {}; + const window = isRecord(parsed.window) ? parsed.window : {}; + const events = recordArray(parsed.events); + const failovers = recordArray(parsed.failovers); + const selectFailures = recordArray(parsed.selectFailures); + const upstreamErrors = recordArray(parsed.upstreamErrors); + const tempUnschedulable = recordArray(parsed.tempUnschedulable); + const windowStats = isRecord(parsed.windowStats) ? parsed.windowStats : {}; + const accountSnapshot = recordArray(parsed.accountSnapshot); + const lines: string[] = []; + lines.push([ + "SUB2API TRACE", + stringValue(parsed.requestId) ?? context.requestId, + `ok=${parsed.ok === true ? "true" : "false"}`, + `outcome=${stringValue(summary.outcome) ?? "-"}`, + `reason=${stringValue(summary.reason) ?? "-"}`, + ].join(" ")); + lines.push([ + "REQUEST", + `path=${request.path ?? final.path ?? "-"}`, + `model=${request.model ?? final.model ?? "-"}`, + `stream=${textValue(request.stream)}`, + `body=${textValue(request.bodyBytes)}`, + `first=${shortIso(summary.firstAt)}`, + `last=${shortIso(summary.lastAt)}`, + ].join(" ")); + lines.push([ + "FINAL", + `status=${textValue(final.statusCode)}`, + `account=${formatAccountRef(final)}`, + `latency_ms=${textValue(final.latencyMs)}`, + `events=${textValue(summary.eventCount)}`, + `window=${window.beforeSeconds ?? "?"}s/${window.afterSeconds ?? "?"}s`, + ].join(" ")); + if (failovers.length > 0) { + lines.push(""); + lines.push("FAILOVER"); + lines.push(renderTable([ + ["AT", "ACCOUNT", "UPSTREAM", "SWITCH", "MAX"], + ...failovers.map((item) => [ + shortIso(item.at), + formatAccountRef(item), + textValue(item.upstreamStatus), + textValue(item.switchCount), + textValue(item.maxSwitches), + ]), + ])); + } + if (selectFailures.length > 0) { + lines.push(""); + lines.push("SELECT-FAILED"); + lines.push(renderTable([ + ["AT", "ERROR", "EXCLUDED"], + ...selectFailures.map((item) => [ + shortIso(item.at), + shorten(stringValue(item.error) ?? "-", 56), + textValue(item.excludedAccountCount), + ]), + ])); + } + if (upstreamErrors.length > 0 || tempUnschedulable.length > 0) { + lines.push(""); + lines.push("ACCOUNT SIGNALS"); + lines.push(renderTable([ + ["TYPE", "PHASE", "AT", "ACCOUNT", "STATUS", "RULE", "UNTIL", "DETAIL"], + ...upstreamErrors.map((item) => [ + "upstream-error", + textValue(item.phase), + shortIso(item.at), + formatAccountRef(item), + textValue(item.statusCode), + "-", + "-", + shorten(stringValue(item.error) ?? "-", 36), + ]), + ...tempUnschedulable.map((item) => [ + "temp-unsched", + textValue(item.phase), + shortIso(item.at), + formatAccountRef(item), + textValue(item.statusCode), + textValue(item.ruleIndex), + shortIso(item.until), + shorten(stringValue(item.reason) ?? stringValue(item.matchedKeyword) ?? "-", 36), + ]), + ])); + } + lines.push(""); + lines.push("WINDOW STATS"); + lines.push(renderTable([ + ["MATCH", "EVENTS", "FINAL_4XX_5XX", "FAILOVER", "SELECT_FAIL", "TEMP_UNSCHED", "ADMIN_SCHED"], + [ + textValue(windowStats.matchedLines), + textValue(windowStats.eventCount), + textValue(windowStats.finalErrorCount), + textValue(windowStats.failoverCount), + textValue(windowStats.selectFailedCount), + textValue(windowStats.tempUnschedulableCount), + textValue(windowStats.adminSchedulableCount), + ], + ])); + if (accountSnapshot.length > 0) { + lines.push(""); + lines.push("CURRENT ACCOUNTS"); + lines.push(renderTable([ + ["ID", "ACCOUNT", "SCHED", "STATUS", "CONC", "TEMP_UNTIL"], + ...accountSnapshot.slice(0, 20).map((item) => [ + textValue(item.accountId), + shorten(stringValue(item.accountName) ?? "-", 32), + textValue(item.schedulable), + textValue(item.status), + textValue(item.concurrency), + shortIso(item.tempUnschedulableUntil), + ]), + ])); + } + if (context.showLines || parsed.showLines === true) { + const rawLines = recordArray(parsed.rawLines); + if (rawLines.length > 0) { + lines.push(""); + lines.push("RAW LINES"); + for (const item of rawLines) { + lines.push(shorten(stringValue(item.line) ?? "", 1000)); + } + } + } + lines.push(""); + lines.push("Raw: bun scripts/cli.ts platform-infra sub2api codex-pool trace --request-id --raw"); + lines.push("Lines: add --show-lines for bounded matched log lines."); + return lines.join("\n"); +} + +function formatAccountRef(item: Record): string { + const name = stringValue(item.accountName); + const id = textValue(item.accountId); + if (name !== null && id !== "-") return `${name}#${id}`; + if (name !== null) return name; + return id; +} + function renderTable(rows: string[][]): string { if (rows.length === 0) return ""; const widths: number[] = []; @@ -2947,6 +3217,17 @@ function validateScript(pool: CodexPoolConfig): string { return remotePythonScript("validate", "", pool); } +function traceScript(pool: CodexPoolConfig, options: TraceOptions): string { + const encoded = Buffer.from(JSON.stringify({ + requestId: options.requestId, + since: options.since, + tail: options.tail, + contextSeconds: options.contextSeconds, + showLines: options.showLines, + }), "utf8").toString("base64"); + return remotePythonScript("trace", encoded, pool); +} + function sentinelProbeScript(payload: unknown, pool: CodexPoolConfig): string { const encoded = Buffer.from(JSON.stringify(payload), "utf8").toString("base64"); return remotePythonScript("sentinel-probe", encoded, pool); @@ -3339,7 +3620,7 @@ function desiredAccountTempUnschedulableMap(pool: CodexPoolConfig): Record= 4: + message = " ".join(parts[3:]) + elif len(parts) >= 3: + message = parts[2] + elif len(parts) >= 1: + message = parts[-1] + item["_at"] = at + item["_message"] = message + item["_line"] = line + return item + +def log_time_epoch(item): + at = item.get("_at") if isinstance(item, dict) else None + if not isinstance(at, str) or not at: + return None + try: + return datetime.strptime(at, "%Y-%m-%dT%H:%M:%S.%f%z").timestamp() + except Exception: + try: + return datetime.fromisoformat(at.replace("Z", "+00:00")).timestamp() + except Exception: + return None + +def event_base(item, account_names_by_id): + account_id = item.get("account_id") + if isinstance(account_id, str) and account_id.isdigit(): + account_id = int(account_id) + account_name = account_names_by_id.get(account_id) + return { + "at": item.get("_at"), + "message": item.get("_message"), + "requestId": item.get("request_id"), + "clientRequestId": item.get("client_request_id"), + "path": item.get("path"), + "method": item.get("method"), + "model": item.get("model"), + "accountId": account_id, + "accountName": account_name, + "statusCode": item.get("status_code"), + "upstreamStatus": item.get("upstream_status"), + "latencyMs": item.get("latency_ms"), + } + +def classify_trace_event(item, account_names_by_id): + message = str(item.get("_message") or "") + event = event_base(item, account_names_by_id) + if "content_moderation.gateway_check_start" in message: + event.update({ + "type": "request-start", + "stream": item.get("stream"), + "bodyBytes": item.get("body_bytes"), + "groupId": item.get("group_id"), + "groupName": item.get("group_name"), + "apiKeyName": item.get("api_key_name"), + }) + elif "content_moderation.gateway_check_done" in message: + event.update({ + "type": "gateway-check", + "allowed": item.get("allowed"), + "blocked": item.get("blocked"), + "action": item.get("action"), + }) + elif "openai.upstream_failover_switching" in message: + event.update({ + "type": "failover", + "switchCount": item.get("switch_count"), + "maxSwitches": item.get("max_switches"), + }) + elif "openai.account_select_failed" in message: + event.update({ + "type": "select-failed", + "error": item.get("error"), + "excludedAccountCount": item.get("excluded_account_count"), + }) + elif "account_upstream_error" in message: + event.update({ + "type": "upstream-error", + "error": item.get("error"), + }) + elif "account_temp_unschedulable" in message: + event.update({ + "type": "temp-unschedulable", + "until": item.get("until") or item.get("temp_unschedulable_until"), + "ruleIndex": item.get("rule_index"), + "matchedKeyword": item.get("matched_keyword"), + "reason": item.get("reason") or item.get("error"), + }) + elif "http request completed" in message: + event.update({ + "type": "final", + "clientIp": item.get("client_ip"), + "protocol": item.get("protocol"), + "platform": item.get("platform"), + "completedAt": item.get("completed_at"), + }) + elif "admin account schedulable updated" in message or "account schedulable updated" in message or "/schedulable" in str(item.get("path") or ""): + event.update({ + "type": "admin-schedulable", + "schedulable": item.get("schedulable"), + }) + else: + event.update({"type": "other"}) + return event + +def with_trace_phase(event, first_epoch, last_epoch): + epoch = None + at = event.get("at") if isinstance(event, dict) else None + if isinstance(at, str) and at: + epoch = log_time_epoch({"_at": at}) + if epoch is None or first_epoch is None: + phase = "unknown" + elif epoch < first_epoch: + phase = "before-request" + elif last_epoch is not None and epoch > last_epoch: + phase = "after-request" + else: + phase = "during-request" + event["phase"] = phase + return event + +def account_snapshot_from_runtime(token): + try: + accounts = list_accounts(token) + except Exception as exc: + return [], {"error": str(exc)} + rows = [] + for item in accounts: + if not isinstance(item, dict): + continue + rows.append({ + "accountId": item.get("id"), + "accountName": item.get("name"), + "schedulable": item.get("schedulable"), + "status": item.get("status"), + "concurrency": item.get("concurrency"), + "priority": item.get("priority"), + "tempUnschedulableUntil": item.get("temp_unschedulable_until") or item.get("tempUnschedulableUntil"), + "tempUnschedulableSet": (item.get("temp_unschedulable_until") or item.get("tempUnschedulableUntil")) is not None or bool(item.get("temp_unschedulable_reason") or item.get("tempUnschedulableReason")), + }) + rows.sort(key=lambda row: (str(row.get("accountName") or ""), int(row.get("accountId") or 0))) + return rows, None + +def trace_reason(events, final_event): + failovers = [item for item in events if item.get("type") == "failover"] + select_failures = [item for item in events if item.get("type") == "select-failed"] + upstream_errors = [item for item in events if item.get("type") == "upstream-error"] + if failovers and select_failures: + return "failover-attempted-no-candidate" + if failovers: + return "failover-attempted" + if select_failures: + return "account-select-failed" + if upstream_errors: + return "upstream-error" + if isinstance(final_event, dict) and isinstance(final_event.get("statusCode"), int) and final_event.get("statusCode") >= 400: + return "final-http-error" + if isinstance(final_event, dict) and isinstance(final_event.get("statusCode"), int): + return "completed" + return "unknown" + +def run_trace(): + payload = json.loads(base64.b64decode(PAYLOAD_B64).decode("utf-8")) if PAYLOAD_B64 else {} + request_id = payload.get("requestId") + since = payload.get("since") or "24h" + tail = int(payload.get("tail") or 20000) + context_seconds = int(payload.get("contextSeconds") or 300) + show_lines = bool(payload.get("showLines")) + if not isinstance(request_id, str) or not request_id: + raise RuntimeError("trace payload missing requestId") + admin_email, token, admin_compliance = login() + account_snapshot, account_snapshot_error = account_snapshot_from_runtime(token) + account_names_by_id = {} + for row in account_snapshot: + account_id = row.get("accountId") + if isinstance(account_id, str) and account_id.isdigit(): + account_id = int(account_id) + if isinstance(account_id, int) and isinstance(row.get("accountName"), str): + account_names_by_id[account_id] = row.get("accountName") + proc = kubectl(["-n", NAMESPACE, "logs", "deployment/sub2api", f"--since={since}", f"--tail={tail}"]) + stdout = proc.stdout.decode("utf-8", errors="replace") + parsed_lines = [] + matched = [] + for line in stdout.splitlines(): + parsed = parse_log_line(line) + if parsed is None: + continue + parsed_lines.append(parsed) + if request_id in line: + matched.append(parsed) + first_epoch = None + last_epoch = None + for item in matched: + epoch = log_time_epoch(item) + if epoch is None: + continue + first_epoch = epoch if first_epoch is None else min(first_epoch, epoch) + last_epoch = epoch if last_epoch is None else max(last_epoch, epoch) + window_lines = [] + if first_epoch is not None: + start_epoch = first_epoch - context_seconds + end_epoch = (last_epoch if last_epoch is not None else first_epoch) + context_seconds + for item in parsed_lines: + epoch = log_time_epoch(item) + if epoch is not None and start_epoch <= epoch <= end_epoch: + window_lines.append(item) + else: + window_lines = matched + events = [classify_trace_event(item, account_names_by_id) for item in matched] + request_start = next((item for item in events if item.get("type") == "request-start"), None) + final_event = next((item for item in reversed(events) if item.get("type") == "final"), None) + failovers = [item for item in events if item.get("type") == "failover"] + select_failures = [item for item in events if item.get("type") == "select-failed"] + upstream_errors = [item for item in events if item.get("type") == "upstream-error"] + temp_unsched = [with_trace_phase(classify_trace_event(item, account_names_by_id), first_epoch, last_epoch) for item in window_lines if "account_temp_unschedulable" in str(item.get("_message") or "")] + admin_sched = [with_trace_phase(classify_trace_event(item, account_names_by_id), first_epoch, last_epoch) for item in window_lines if ("schedulable" in str(item.get("_message") or "") or "/schedulable" in str(item.get("path") or ""))] + window_events = [classify_trace_event(item, account_names_by_id) for item in window_lines] + final_errors = [item for item in window_events if item.get("type") == "final" and isinstance(item.get("statusCode"), int) and item.get("statusCode") >= 400] + window_failovers = [item for item in window_events if item.get("type") == "failover"] + window_select_failures = [item for item in window_events if item.get("type") == "select-failed"] + reason = trace_reason(events, final_event) + if not matched: + outcome = "not-found" + elif isinstance(final_event, dict) and isinstance(final_event.get("statusCode"), int) and final_event.get("statusCode") < 400: + outcome = "succeeded" + elif isinstance(final_event, dict): + outcome = "failed" + else: + outcome = "incomplete" + return { + "ok": proc.returncode == 0 and len(matched) > 0, + "mode": "trace", + "namespace": NAMESPACE, + "serviceDns": SERVICE_DNS, + "appPod": APP_POD, + "admin": {"email": admin_email, "tokenPrinted": False, "compliance": admin_compliance}, + "requestId": request_id, + "summary": { + "outcome": outcome, + "reason": reason, + "eventCount": len(events), + "matchedLineCount": len(matched), + "firstAt": events[0].get("at") if events else None, + "lastAt": events[-1].get("at") if events else None, + }, + "window": { + "since": since, + "tail": tail, + "beforeSeconds": context_seconds, + "afterSeconds": context_seconds, + "lineCount": len(window_lines), + }, + "request": request_start or {}, + "final": final_event or {}, + "events": events, + "failovers": failovers, + "selectFailures": select_failures, + "upstreamErrors": upstream_errors, + "tempUnschedulable": temp_unsched, + "adminSchedulable": admin_sched[-20:], + "windowStats": { + "matchedLines": len(matched), + "eventCount": len(window_events), + "finalErrorCount": len(final_errors), + "failoverCount": len(window_failovers), + "selectFailedCount": len(window_select_failures), + "tempUnschedulableCount": len(temp_unsched), + "adminSchedulableCount": len(admin_sched), + }, + "accountSnapshot": account_snapshot, + "accountSnapshotError": account_snapshot_error, + "rawLines": [{"line": item.get("_line")} for item in matched[-30:]] if show_lines else [], + "showLines": show_lines, + "logs": { + "exitCode": proc.returncode, + "stderrTail": text(proc.stderr, 1000), + "stdoutLineCount": len(stdout.splitlines()), + }, + "valuesPrinted": False, + } + def parse_embedded_json(stdout): if not isinstance(stdout, str) or not stdout.strip(): return None @@ -5491,6 +6068,8 @@ def run_cleanup_probes(): try: if MODE == "sync": result = run_sync() + elif MODE == "trace": + result = run_trace() elif MODE == "cleanup-probes": result = run_cleanup_probes() elif MODE == "sentinel-probe": diff --git a/scripts/src/platform-infra.ts b/scripts/src/platform-infra.ts index b39d3df5..4f34e0a3 100644 --- a/scripts/src/platform-infra.ts +++ b/scripts/src/platform-infra.ts @@ -44,6 +44,7 @@ export function platformInfraHelp(): unknown { "bun scripts/cli.ts platform-infra sub2api codex-pool plan", "bun scripts/cli.ts platform-infra sub2api codex-pool sync --confirm", "bun scripts/cli.ts platform-infra sub2api codex-pool validate", + "bun scripts/cli.ts platform-infra sub2api codex-pool trace --request-id ", "bun scripts/cli.ts platform-infra sub2api codex-pool sentinel-image status", "bun scripts/cli.ts platform-infra sub2api codex-pool sentinel-probe --account unidesk-codex-hy --confirm", ], @@ -62,6 +63,7 @@ export function platformInfraHelp(): unknown { "bun scripts/cli.ts platform-infra sub2api codex-pool plan", "bun scripts/cli.ts platform-infra sub2api codex-pool sync --confirm", "bun scripts/cli.ts platform-infra sub2api codex-pool validate", + "bun scripts/cli.ts platform-infra sub2api codex-pool trace --request-id ", "bun scripts/cli.ts platform-infra sub2api codex-pool sentinel-image status", ], module: "scripts/src/platform-infra-sub2api-codex.ts",