|
|
|
@@ -56,6 +56,14 @@ interface SentinelReportOptions extends DisclosureOptions {
|
|
|
|
|
events: number;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
interface TraceOptions extends DisclosureOptions {
|
|
|
|
|
requestId: string | null;
|
|
|
|
|
since: string;
|
|
|
|
|
tail: number;
|
|
|
|
|
contextSeconds: number;
|
|
|
|
|
showLines: boolean;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
interface SentinelImageOptions extends DisclosureOptions {
|
|
|
|
|
action: "status" | "build";
|
|
|
|
|
confirm: boolean;
|
|
|
|
@@ -192,12 +200,13 @@ interface CodexLocalConsumerTomlOptions {
|
|
|
|
|
export function codexPoolHelp(): unknown {
|
|
|
|
|
const pool = readCodexPoolConfig();
|
|
|
|
|
return {
|
|
|
|
|
command: "platform-infra sub2api codex-pool plan|sync|validate|sentinel-image|sentinel-probe|sentinel-report|cleanup-probes|expose|configure-local",
|
|
|
|
|
output: "json, except sentinel-report defaults to a ps-like text table",
|
|
|
|
|
command: "platform-infra sub2api codex-pool plan|sync|validate|trace|sentinel-image|sentinel-probe|sentinel-report|cleanup-probes|expose|configure-local",
|
|
|
|
|
output: "json, except trace and sentinel-report default to low-noise text tables",
|
|
|
|
|
usage: [
|
|
|
|
|
"bun scripts/cli.ts platform-infra sub2api codex-pool plan",
|
|
|
|
|
"bun scripts/cli.ts platform-infra sub2api codex-pool sync --confirm [--prune-removed]",
|
|
|
|
|
"bun scripts/cli.ts platform-infra sub2api codex-pool validate [--full|--raw]",
|
|
|
|
|
"bun scripts/cli.ts platform-infra sub2api codex-pool trace --request-id <requestId> [--since 24h|--tail 20000|--context-seconds 300|--show-lines|--raw]",
|
|
|
|
|
"bun scripts/cli.ts platform-infra sub2api codex-pool sentinel-image status",
|
|
|
|
|
"bun scripts/cli.ts platform-infra sub2api codex-pool sentinel-image build --confirm",
|
|
|
|
|
"bun scripts/cli.ts platform-infra sub2api codex-pool sentinel-probe --account unidesk-codex-hy --confirm",
|
|
|
|
@@ -229,6 +238,7 @@ export async function runCodexPoolCommand(config: UniDeskConfig, args: string[])
|
|
|
|
|
if (action === "plan") return codexPoolPlan(parseDisclosureOptions(args.slice(1)));
|
|
|
|
|
if (action === "sync") return await codexPoolSync(config, parseSyncOptions(args.slice(1)));
|
|
|
|
|
if (action === "validate") return await codexPoolValidate(config, parseDisclosureOptions(args.slice(1)));
|
|
|
|
|
if (action === "trace") return await codexPoolTrace(config, parseTraceOptions(args.slice(1)));
|
|
|
|
|
if (action === "sentinel-image") return await codexPoolSentinelImage(config, parseSentinelImageOptions(args.slice(1)));
|
|
|
|
|
if (action === "sentinel-probe") return await codexPoolSentinelProbe(config, parseSentinelProbeOptions(args.slice(1)));
|
|
|
|
|
if (action === "sentinel-report") return await codexPoolSentinelReport(config, parseSentinelReportOptions(args.slice(1)));
|
|
|
|
@@ -350,6 +360,91 @@ function parseSentinelReportOptions(args: string[]): SentinelReportOptions {
|
|
|
|
|
return { ...disclosure, events };
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
function parseTraceOptions(args: string[]): TraceOptions {
|
|
|
|
|
let requestId: string | null = null;
|
|
|
|
|
let since = "24h";
|
|
|
|
|
let tail = 20_000;
|
|
|
|
|
let contextSeconds = 300;
|
|
|
|
|
let showLines = false;
|
|
|
|
|
const disclosureArgs: string[] = [];
|
|
|
|
|
for (let index = 0; index < args.length; index += 1) {
|
|
|
|
|
const arg = args[index]!;
|
|
|
|
|
if (arg === "--full" || arg === "--raw") {
|
|
|
|
|
disclosureArgs.push(arg);
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
if (arg === "--show-lines") {
|
|
|
|
|
showLines = true;
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
if (arg === "--request-id" || arg === "--id") {
|
|
|
|
|
const value = args[index + 1];
|
|
|
|
|
if (value === undefined || value.startsWith("--")) throw new Error(`${arg} requires a request id`);
|
|
|
|
|
requestId = value.trim();
|
|
|
|
|
index += 1;
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
if (arg.startsWith("--request-id=")) {
|
|
|
|
|
requestId = arg.slice("--request-id=".length).trim();
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
if (arg.startsWith("--id=")) {
|
|
|
|
|
requestId = arg.slice("--id=".length).trim();
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
if (arg === "--since") {
|
|
|
|
|
const value = args[index + 1];
|
|
|
|
|
if (value === undefined || value.startsWith("--")) throw new Error("--since requires a kubectl duration such as 24h or 90m");
|
|
|
|
|
since = parseKubectlDuration(value);
|
|
|
|
|
index += 1;
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
if (arg.startsWith("--since=")) {
|
|
|
|
|
since = parseKubectlDuration(arg.slice("--since=".length));
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
if (arg === "--tail") {
|
|
|
|
|
const value = args[index + 1];
|
|
|
|
|
if (value === undefined || value.startsWith("--")) throw new Error("--tail requires an integer");
|
|
|
|
|
tail = parseTraceLimit(value, "--tail", 100, 200_000);
|
|
|
|
|
index += 1;
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
if (arg.startsWith("--tail=")) {
|
|
|
|
|
tail = parseTraceLimit(arg.slice("--tail=".length), "--tail", 100, 200_000);
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
if (arg === "--context-seconds") {
|
|
|
|
|
const value = args[index + 1];
|
|
|
|
|
if (value === undefined || value.startsWith("--")) throw new Error("--context-seconds requires an integer");
|
|
|
|
|
contextSeconds = parseTraceLimit(value, "--context-seconds", 0, 3600);
|
|
|
|
|
index += 1;
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
if (arg.startsWith("--context-seconds=")) {
|
|
|
|
|
contextSeconds = parseTraceLimit(arg.slice("--context-seconds=".length), "--context-seconds", 0, 3600);
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
throw new Error(`unsupported option: ${arg}`);
|
|
|
|
|
}
|
|
|
|
|
if (requestId === null || requestId.length === 0) throw new Error("trace requires --request-id <requestId>");
|
|
|
|
|
if (!/^[A-Za-z0-9_.:-]{8,128}$/u.test(requestId)) throw new Error("--request-id has an unsupported format");
|
|
|
|
|
const disclosure = parseDisclosureOptions(disclosureArgs);
|
|
|
|
|
return { ...disclosure, requestId, since, tail, contextSeconds, showLines };
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
function parseKubectlDuration(raw: string): string {
|
|
|
|
|
const value = raw.trim();
|
|
|
|
|
if (!/^[1-9][0-9]*(?:s|m|h)$/u.test(value)) throw new Error("--since must be a kubectl duration such as 24h, 90m, or 300s");
|
|
|
|
|
return value;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
function parseTraceLimit(raw: string, option: string, min: number, max: number): number {
|
|
|
|
|
const value = Number(raw);
|
|
|
|
|
if (!Number.isInteger(value) || value < min || value > max) throw new Error(`${option} must be an integer from ${min} to ${max}`);
|
|
|
|
|
return value;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
function readReportEventLimit(raw: string, option: string): number {
|
|
|
|
|
const value = Number(raw);
|
|
|
|
|
if (!Number.isInteger(value) || value < 1 || value > 200) throw new Error(`${option} must be an integer from 1 to 200`);
|
|
|
|
@@ -588,6 +683,28 @@ async function codexPoolValidate(config: UniDeskConfig, options: DisclosureOptio
|
|
|
|
|
};
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
async function codexPoolTrace(config: UniDeskConfig, options: TraceOptions): Promise<Record<string, unknown> | RenderedCliResult> {
|
|
|
|
|
const pool = readCodexPoolConfig();
|
|
|
|
|
const result = await capture(config, g14K3sRoute, ["script"], traceScript(pool, options));
|
|
|
|
|
const parsed = parseJsonOutput(result.stdout);
|
|
|
|
|
const ok = result.exitCode === 0 && boolField(parsed, "ok", false);
|
|
|
|
|
if (options.raw) {
|
|
|
|
|
return {
|
|
|
|
|
ok,
|
|
|
|
|
action: "platform-infra-sub2api-codex-pool-trace",
|
|
|
|
|
remote: compactCapture(result, { full: true }),
|
|
|
|
|
trace: parsed,
|
|
|
|
|
valuesPrinted: false,
|
|
|
|
|
};
|
|
|
|
|
}
|
|
|
|
|
const text = renderTraceReport(parsed, {
|
|
|
|
|
requestId: options.requestId ?? "",
|
|
|
|
|
showLines: options.showLines,
|
|
|
|
|
remote: compactCapture(result, { full: result.exitCode !== 0 || parsed === null }),
|
|
|
|
|
});
|
|
|
|
|
return renderedCliResult(ok, "platform-infra sub2api codex-pool trace", text);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
async function codexPoolSentinelReport(config: UniDeskConfig, options: SentinelReportOptions): Promise<Record<string, unknown> | RenderedCliResult> {
|
|
|
|
|
const pool = readCodexPoolConfig();
|
|
|
|
|
const result = await capture(config, g14K3sRoute, ["script"], sentinelReportScript(pool, options.events));
|
|
|
|
@@ -2077,6 +2194,159 @@ function renderSentinelReport(
|
|
|
|
|
return lines.join("\n");
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
function renderTraceReport(
|
|
|
|
|
parsed: Record<string, unknown> | null,
|
|
|
|
|
context: { requestId: string; showLines: boolean; remote: Record<string, unknown> },
|
|
|
|
|
): string {
|
|
|
|
|
if (parsed === null) {
|
|
|
|
|
return [
|
|
|
|
|
`SUB2API TRACE ${context.requestId} unavailable`,
|
|
|
|
|
`remote_exit=${context.remote.exitCode ?? "?"} stdout_bytes=${context.remote.stdoutBytes ?? "?"} stderr_bytes=${context.remote.stderrBytes ?? "?"}`,
|
|
|
|
|
stringValue(context.remote.stderrTail) ?? stringValue(context.remote.stdoutTail) ?? "",
|
|
|
|
|
].filter(Boolean).join("\n");
|
|
|
|
|
}
|
|
|
|
|
const summary = isRecord(parsed.summary) ? parsed.summary : {};
|
|
|
|
|
const request = isRecord(parsed.request) ? parsed.request : {};
|
|
|
|
|
const final = isRecord(parsed.final) ? parsed.final : {};
|
|
|
|
|
const window = isRecord(parsed.window) ? parsed.window : {};
|
|
|
|
|
const events = recordArray(parsed.events);
|
|
|
|
|
const failovers = recordArray(parsed.failovers);
|
|
|
|
|
const selectFailures = recordArray(parsed.selectFailures);
|
|
|
|
|
const upstreamErrors = recordArray(parsed.upstreamErrors);
|
|
|
|
|
const tempUnschedulable = recordArray(parsed.tempUnschedulable);
|
|
|
|
|
const windowStats = isRecord(parsed.windowStats) ? parsed.windowStats : {};
|
|
|
|
|
const accountSnapshot = recordArray(parsed.accountSnapshot);
|
|
|
|
|
const lines: string[] = [];
|
|
|
|
|
lines.push([
|
|
|
|
|
"SUB2API TRACE",
|
|
|
|
|
stringValue(parsed.requestId) ?? context.requestId,
|
|
|
|
|
`ok=${parsed.ok === true ? "true" : "false"}`,
|
|
|
|
|
`outcome=${stringValue(summary.outcome) ?? "-"}`,
|
|
|
|
|
`reason=${stringValue(summary.reason) ?? "-"}`,
|
|
|
|
|
].join(" "));
|
|
|
|
|
lines.push([
|
|
|
|
|
"REQUEST",
|
|
|
|
|
`path=${request.path ?? final.path ?? "-"}`,
|
|
|
|
|
`model=${request.model ?? final.model ?? "-"}`,
|
|
|
|
|
`stream=${textValue(request.stream)}`,
|
|
|
|
|
`body=${textValue(request.bodyBytes)}`,
|
|
|
|
|
`first=${shortIso(summary.firstAt)}`,
|
|
|
|
|
`last=${shortIso(summary.lastAt)}`,
|
|
|
|
|
].join(" "));
|
|
|
|
|
lines.push([
|
|
|
|
|
"FINAL",
|
|
|
|
|
`status=${textValue(final.statusCode)}`,
|
|
|
|
|
`account=${formatAccountRef(final)}`,
|
|
|
|
|
`latency_ms=${textValue(final.latencyMs)}`,
|
|
|
|
|
`events=${textValue(summary.eventCount)}`,
|
|
|
|
|
`window=${window.beforeSeconds ?? "?"}s/${window.afterSeconds ?? "?"}s`,
|
|
|
|
|
].join(" "));
|
|
|
|
|
if (failovers.length > 0) {
|
|
|
|
|
lines.push("");
|
|
|
|
|
lines.push("FAILOVER");
|
|
|
|
|
lines.push(renderTable([
|
|
|
|
|
["AT", "ACCOUNT", "UPSTREAM", "SWITCH", "MAX"],
|
|
|
|
|
...failovers.map((item) => [
|
|
|
|
|
shortIso(item.at),
|
|
|
|
|
formatAccountRef(item),
|
|
|
|
|
textValue(item.upstreamStatus),
|
|
|
|
|
textValue(item.switchCount),
|
|
|
|
|
textValue(item.maxSwitches),
|
|
|
|
|
]),
|
|
|
|
|
]));
|
|
|
|
|
}
|
|
|
|
|
if (selectFailures.length > 0) {
|
|
|
|
|
lines.push("");
|
|
|
|
|
lines.push("SELECT-FAILED");
|
|
|
|
|
lines.push(renderTable([
|
|
|
|
|
["AT", "ERROR", "EXCLUDED"],
|
|
|
|
|
...selectFailures.map((item) => [
|
|
|
|
|
shortIso(item.at),
|
|
|
|
|
shorten(stringValue(item.error) ?? "-", 56),
|
|
|
|
|
textValue(item.excludedAccountCount),
|
|
|
|
|
]),
|
|
|
|
|
]));
|
|
|
|
|
}
|
|
|
|
|
if (upstreamErrors.length > 0 || tempUnschedulable.length > 0) {
|
|
|
|
|
lines.push("");
|
|
|
|
|
lines.push("ACCOUNT SIGNALS");
|
|
|
|
|
lines.push(renderTable([
|
|
|
|
|
["TYPE", "PHASE", "AT", "ACCOUNT", "STATUS", "RULE", "UNTIL", "DETAIL"],
|
|
|
|
|
...upstreamErrors.map((item) => [
|
|
|
|
|
"upstream-error",
|
|
|
|
|
textValue(item.phase),
|
|
|
|
|
shortIso(item.at),
|
|
|
|
|
formatAccountRef(item),
|
|
|
|
|
textValue(item.statusCode),
|
|
|
|
|
"-",
|
|
|
|
|
"-",
|
|
|
|
|
shorten(stringValue(item.error) ?? "-", 36),
|
|
|
|
|
]),
|
|
|
|
|
...tempUnschedulable.map((item) => [
|
|
|
|
|
"temp-unsched",
|
|
|
|
|
textValue(item.phase),
|
|
|
|
|
shortIso(item.at),
|
|
|
|
|
formatAccountRef(item),
|
|
|
|
|
textValue(item.statusCode),
|
|
|
|
|
textValue(item.ruleIndex),
|
|
|
|
|
shortIso(item.until),
|
|
|
|
|
shorten(stringValue(item.reason) ?? stringValue(item.matchedKeyword) ?? "-", 36),
|
|
|
|
|
]),
|
|
|
|
|
]));
|
|
|
|
|
}
|
|
|
|
|
lines.push("");
|
|
|
|
|
lines.push("WINDOW STATS");
|
|
|
|
|
lines.push(renderTable([
|
|
|
|
|
["MATCH", "EVENTS", "FINAL_4XX_5XX", "FAILOVER", "SELECT_FAIL", "TEMP_UNSCHED", "ADMIN_SCHED"],
|
|
|
|
|
[
|
|
|
|
|
textValue(windowStats.matchedLines),
|
|
|
|
|
textValue(windowStats.eventCount),
|
|
|
|
|
textValue(windowStats.finalErrorCount),
|
|
|
|
|
textValue(windowStats.failoverCount),
|
|
|
|
|
textValue(windowStats.selectFailedCount),
|
|
|
|
|
textValue(windowStats.tempUnschedulableCount),
|
|
|
|
|
textValue(windowStats.adminSchedulableCount),
|
|
|
|
|
],
|
|
|
|
|
]));
|
|
|
|
|
if (accountSnapshot.length > 0) {
|
|
|
|
|
lines.push("");
|
|
|
|
|
lines.push("CURRENT ACCOUNTS");
|
|
|
|
|
lines.push(renderTable([
|
|
|
|
|
["ID", "ACCOUNT", "SCHED", "STATUS", "CONC", "TEMP_UNTIL"],
|
|
|
|
|
...accountSnapshot.slice(0, 20).map((item) => [
|
|
|
|
|
textValue(item.accountId),
|
|
|
|
|
shorten(stringValue(item.accountName) ?? "-", 32),
|
|
|
|
|
textValue(item.schedulable),
|
|
|
|
|
textValue(item.status),
|
|
|
|
|
textValue(item.concurrency),
|
|
|
|
|
shortIso(item.tempUnschedulableUntil),
|
|
|
|
|
]),
|
|
|
|
|
]));
|
|
|
|
|
}
|
|
|
|
|
if (context.showLines || parsed.showLines === true) {
|
|
|
|
|
const rawLines = recordArray(parsed.rawLines);
|
|
|
|
|
if (rawLines.length > 0) {
|
|
|
|
|
lines.push("");
|
|
|
|
|
lines.push("RAW LINES");
|
|
|
|
|
for (const item of rawLines) {
|
|
|
|
|
lines.push(shorten(stringValue(item.line) ?? "", 1000));
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
lines.push("");
|
|
|
|
|
lines.push("Raw: bun scripts/cli.ts platform-infra sub2api codex-pool trace --request-id <id> --raw");
|
|
|
|
|
lines.push("Lines: add --show-lines for bounded matched log lines.");
|
|
|
|
|
return lines.join("\n");
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
function formatAccountRef(item: Record<string, unknown>): string {
|
|
|
|
|
const name = stringValue(item.accountName);
|
|
|
|
|
const id = textValue(item.accountId);
|
|
|
|
|
if (name !== null && id !== "-") return `${name}#${id}`;
|
|
|
|
|
if (name !== null) return name;
|
|
|
|
|
return id;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
function renderTable(rows: string[][]): string {
|
|
|
|
|
if (rows.length === 0) return "";
|
|
|
|
|
const widths: number[] = [];
|
|
|
|
@@ -2947,6 +3217,17 @@ function validateScript(pool: CodexPoolConfig): string {
|
|
|
|
|
return remotePythonScript("validate", "", pool);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
function traceScript(pool: CodexPoolConfig, options: TraceOptions): string {
|
|
|
|
|
const encoded = Buffer.from(JSON.stringify({
|
|
|
|
|
requestId: options.requestId,
|
|
|
|
|
since: options.since,
|
|
|
|
|
tail: options.tail,
|
|
|
|
|
contextSeconds: options.contextSeconds,
|
|
|
|
|
showLines: options.showLines,
|
|
|
|
|
}), "utf8").toString("base64");
|
|
|
|
|
return remotePythonScript("trace", encoded, pool);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
function sentinelProbeScript(payload: unknown, pool: CodexPoolConfig): string {
|
|
|
|
|
const encoded = Buffer.from(JSON.stringify(payload), "utf8").toString("base64");
|
|
|
|
|
return remotePythonScript("sentinel-probe", encoded, pool);
|
|
|
|
@@ -3339,7 +3620,7 @@ function desiredAccountTempUnschedulableMap(pool: CodexPoolConfig): Record<strin
|
|
|
|
|
return policies;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
function remotePythonScript(mode: "sync" | "validate" | "cleanup-probes" | "sentinel-probe", encodedPayload: string, pool: CodexPoolConfig): string {
|
|
|
|
|
function remotePythonScript(mode: "sync" | "validate" | "trace" | "cleanup-probes" | "sentinel-probe", encodedPayload: string, pool: CodexPoolConfig): string {
|
|
|
|
|
return `
|
|
|
|
|
set -u
|
|
|
|
|
python3 - <<'PY'
|
|
|
|
@@ -5347,6 +5628,302 @@ def run_validate():
|
|
|
|
|
"validation": {"gatewayModels": gateway, "gatewayResponses": responses_smoke, "gatewayResponsesRecent": responses_evidence, "gatewayCompactRecent": compact_evidence},
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
def parse_log_line(line):
|
|
|
|
|
json_start = line.find("{")
|
|
|
|
|
if json_start < 0:
|
|
|
|
|
return None
|
|
|
|
|
prefix = line[:json_start].rstrip()
|
|
|
|
|
try:
|
|
|
|
|
item = json.loads(line[json_start:])
|
|
|
|
|
except Exception:
|
|
|
|
|
return None
|
|
|
|
|
if not isinstance(item, dict):
|
|
|
|
|
return None
|
|
|
|
|
at = None
|
|
|
|
|
parts = prefix.split()
|
|
|
|
|
if parts:
|
|
|
|
|
at = parts[0]
|
|
|
|
|
message = ""
|
|
|
|
|
if len(parts) >= 4:
|
|
|
|
|
message = " ".join(parts[3:])
|
|
|
|
|
elif len(parts) >= 3:
|
|
|
|
|
message = parts[2]
|
|
|
|
|
elif len(parts) >= 1:
|
|
|
|
|
message = parts[-1]
|
|
|
|
|
item["_at"] = at
|
|
|
|
|
item["_message"] = message
|
|
|
|
|
item["_line"] = line
|
|
|
|
|
return item
|
|
|
|
|
|
|
|
|
|
def log_time_epoch(item):
|
|
|
|
|
at = item.get("_at") if isinstance(item, dict) else None
|
|
|
|
|
if not isinstance(at, str) or not at:
|
|
|
|
|
return None
|
|
|
|
|
try:
|
|
|
|
|
return datetime.strptime(at, "%Y-%m-%dT%H:%M:%S.%f%z").timestamp()
|
|
|
|
|
except Exception:
|
|
|
|
|
try:
|
|
|
|
|
return datetime.fromisoformat(at.replace("Z", "+00:00")).timestamp()
|
|
|
|
|
except Exception:
|
|
|
|
|
return None
|
|
|
|
|
|
|
|
|
|
def event_base(item, account_names_by_id):
|
|
|
|
|
account_id = item.get("account_id")
|
|
|
|
|
if isinstance(account_id, str) and account_id.isdigit():
|
|
|
|
|
account_id = int(account_id)
|
|
|
|
|
account_name = account_names_by_id.get(account_id)
|
|
|
|
|
return {
|
|
|
|
|
"at": item.get("_at"),
|
|
|
|
|
"message": item.get("_message"),
|
|
|
|
|
"requestId": item.get("request_id"),
|
|
|
|
|
"clientRequestId": item.get("client_request_id"),
|
|
|
|
|
"path": item.get("path"),
|
|
|
|
|
"method": item.get("method"),
|
|
|
|
|
"model": item.get("model"),
|
|
|
|
|
"accountId": account_id,
|
|
|
|
|
"accountName": account_name,
|
|
|
|
|
"statusCode": item.get("status_code"),
|
|
|
|
|
"upstreamStatus": item.get("upstream_status"),
|
|
|
|
|
"latencyMs": item.get("latency_ms"),
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
def classify_trace_event(item, account_names_by_id):
|
|
|
|
|
message = str(item.get("_message") or "")
|
|
|
|
|
event = event_base(item, account_names_by_id)
|
|
|
|
|
if "content_moderation.gateway_check_start" in message:
|
|
|
|
|
event.update({
|
|
|
|
|
"type": "request-start",
|
|
|
|
|
"stream": item.get("stream"),
|
|
|
|
|
"bodyBytes": item.get("body_bytes"),
|
|
|
|
|
"groupId": item.get("group_id"),
|
|
|
|
|
"groupName": item.get("group_name"),
|
|
|
|
|
"apiKeyName": item.get("api_key_name"),
|
|
|
|
|
})
|
|
|
|
|
elif "content_moderation.gateway_check_done" in message:
|
|
|
|
|
event.update({
|
|
|
|
|
"type": "gateway-check",
|
|
|
|
|
"allowed": item.get("allowed"),
|
|
|
|
|
"blocked": item.get("blocked"),
|
|
|
|
|
"action": item.get("action"),
|
|
|
|
|
})
|
|
|
|
|
elif "openai.upstream_failover_switching" in message:
|
|
|
|
|
event.update({
|
|
|
|
|
"type": "failover",
|
|
|
|
|
"switchCount": item.get("switch_count"),
|
|
|
|
|
"maxSwitches": item.get("max_switches"),
|
|
|
|
|
})
|
|
|
|
|
elif "openai.account_select_failed" in message:
|
|
|
|
|
event.update({
|
|
|
|
|
"type": "select-failed",
|
|
|
|
|
"error": item.get("error"),
|
|
|
|
|
"excludedAccountCount": item.get("excluded_account_count"),
|
|
|
|
|
})
|
|
|
|
|
elif "account_upstream_error" in message:
|
|
|
|
|
event.update({
|
|
|
|
|
"type": "upstream-error",
|
|
|
|
|
"error": item.get("error"),
|
|
|
|
|
})
|
|
|
|
|
elif "account_temp_unschedulable" in message:
|
|
|
|
|
event.update({
|
|
|
|
|
"type": "temp-unschedulable",
|
|
|
|
|
"until": item.get("until") or item.get("temp_unschedulable_until"),
|
|
|
|
|
"ruleIndex": item.get("rule_index"),
|
|
|
|
|
"matchedKeyword": item.get("matched_keyword"),
|
|
|
|
|
"reason": item.get("reason") or item.get("error"),
|
|
|
|
|
})
|
|
|
|
|
elif "http request completed" in message:
|
|
|
|
|
event.update({
|
|
|
|
|
"type": "final",
|
|
|
|
|
"clientIp": item.get("client_ip"),
|
|
|
|
|
"protocol": item.get("protocol"),
|
|
|
|
|
"platform": item.get("platform"),
|
|
|
|
|
"completedAt": item.get("completed_at"),
|
|
|
|
|
})
|
|
|
|
|
elif "admin account schedulable updated" in message or "account schedulable updated" in message or "/schedulable" in str(item.get("path") or ""):
|
|
|
|
|
event.update({
|
|
|
|
|
"type": "admin-schedulable",
|
|
|
|
|
"schedulable": item.get("schedulable"),
|
|
|
|
|
})
|
|
|
|
|
else:
|
|
|
|
|
event.update({"type": "other"})
|
|
|
|
|
return event
|
|
|
|
|
|
|
|
|
|
def with_trace_phase(event, first_epoch, last_epoch):
|
|
|
|
|
epoch = None
|
|
|
|
|
at = event.get("at") if isinstance(event, dict) else None
|
|
|
|
|
if isinstance(at, str) and at:
|
|
|
|
|
epoch = log_time_epoch({"_at": at})
|
|
|
|
|
if epoch is None or first_epoch is None:
|
|
|
|
|
phase = "unknown"
|
|
|
|
|
elif epoch < first_epoch:
|
|
|
|
|
phase = "before-request"
|
|
|
|
|
elif last_epoch is not None and epoch > last_epoch:
|
|
|
|
|
phase = "after-request"
|
|
|
|
|
else:
|
|
|
|
|
phase = "during-request"
|
|
|
|
|
event["phase"] = phase
|
|
|
|
|
return event
|
|
|
|
|
|
|
|
|
|
def account_snapshot_from_runtime(token):
|
|
|
|
|
try:
|
|
|
|
|
accounts = list_accounts(token)
|
|
|
|
|
except Exception as exc:
|
|
|
|
|
return [], {"error": str(exc)}
|
|
|
|
|
rows = []
|
|
|
|
|
for item in accounts:
|
|
|
|
|
if not isinstance(item, dict):
|
|
|
|
|
continue
|
|
|
|
|
rows.append({
|
|
|
|
|
"accountId": item.get("id"),
|
|
|
|
|
"accountName": item.get("name"),
|
|
|
|
|
"schedulable": item.get("schedulable"),
|
|
|
|
|
"status": item.get("status"),
|
|
|
|
|
"concurrency": item.get("concurrency"),
|
|
|
|
|
"priority": item.get("priority"),
|
|
|
|
|
"tempUnschedulableUntil": item.get("temp_unschedulable_until") or item.get("tempUnschedulableUntil"),
|
|
|
|
|
"tempUnschedulableSet": (item.get("temp_unschedulable_until") or item.get("tempUnschedulableUntil")) is not None or bool(item.get("temp_unschedulable_reason") or item.get("tempUnschedulableReason")),
|
|
|
|
|
})
|
|
|
|
|
rows.sort(key=lambda row: (str(row.get("accountName") or ""), int(row.get("accountId") or 0)))
|
|
|
|
|
return rows, None
|
|
|
|
|
|
|
|
|
|
def trace_reason(events, final_event):
|
|
|
|
|
failovers = [item for item in events if item.get("type") == "failover"]
|
|
|
|
|
select_failures = [item for item in events if item.get("type") == "select-failed"]
|
|
|
|
|
upstream_errors = [item for item in events if item.get("type") == "upstream-error"]
|
|
|
|
|
if failovers and select_failures:
|
|
|
|
|
return "failover-attempted-no-candidate"
|
|
|
|
|
if failovers:
|
|
|
|
|
return "failover-attempted"
|
|
|
|
|
if select_failures:
|
|
|
|
|
return "account-select-failed"
|
|
|
|
|
if upstream_errors:
|
|
|
|
|
return "upstream-error"
|
|
|
|
|
if isinstance(final_event, dict) and isinstance(final_event.get("statusCode"), int) and final_event.get("statusCode") >= 400:
|
|
|
|
|
return "final-http-error"
|
|
|
|
|
if isinstance(final_event, dict) and isinstance(final_event.get("statusCode"), int):
|
|
|
|
|
return "completed"
|
|
|
|
|
return "unknown"
|
|
|
|
|
|
|
|
|
|
def run_trace():
|
|
|
|
|
payload = json.loads(base64.b64decode(PAYLOAD_B64).decode("utf-8")) if PAYLOAD_B64 else {}
|
|
|
|
|
request_id = payload.get("requestId")
|
|
|
|
|
since = payload.get("since") or "24h"
|
|
|
|
|
tail = int(payload.get("tail") or 20000)
|
|
|
|
|
context_seconds = int(payload.get("contextSeconds") or 300)
|
|
|
|
|
show_lines = bool(payload.get("showLines"))
|
|
|
|
|
if not isinstance(request_id, str) or not request_id:
|
|
|
|
|
raise RuntimeError("trace payload missing requestId")
|
|
|
|
|
admin_email, token, admin_compliance = login()
|
|
|
|
|
account_snapshot, account_snapshot_error = account_snapshot_from_runtime(token)
|
|
|
|
|
account_names_by_id = {}
|
|
|
|
|
for row in account_snapshot:
|
|
|
|
|
account_id = row.get("accountId")
|
|
|
|
|
if isinstance(account_id, str) and account_id.isdigit():
|
|
|
|
|
account_id = int(account_id)
|
|
|
|
|
if isinstance(account_id, int) and isinstance(row.get("accountName"), str):
|
|
|
|
|
account_names_by_id[account_id] = row.get("accountName")
|
|
|
|
|
proc = kubectl(["-n", NAMESPACE, "logs", "deployment/sub2api", f"--since={since}", f"--tail={tail}"])
|
|
|
|
|
stdout = proc.stdout.decode("utf-8", errors="replace")
|
|
|
|
|
parsed_lines = []
|
|
|
|
|
matched = []
|
|
|
|
|
for line in stdout.splitlines():
|
|
|
|
|
parsed = parse_log_line(line)
|
|
|
|
|
if parsed is None:
|
|
|
|
|
continue
|
|
|
|
|
parsed_lines.append(parsed)
|
|
|
|
|
if request_id in line:
|
|
|
|
|
matched.append(parsed)
|
|
|
|
|
first_epoch = None
|
|
|
|
|
last_epoch = None
|
|
|
|
|
for item in matched:
|
|
|
|
|
epoch = log_time_epoch(item)
|
|
|
|
|
if epoch is None:
|
|
|
|
|
continue
|
|
|
|
|
first_epoch = epoch if first_epoch is None else min(first_epoch, epoch)
|
|
|
|
|
last_epoch = epoch if last_epoch is None else max(last_epoch, epoch)
|
|
|
|
|
window_lines = []
|
|
|
|
|
if first_epoch is not None:
|
|
|
|
|
start_epoch = first_epoch - context_seconds
|
|
|
|
|
end_epoch = (last_epoch if last_epoch is not None else first_epoch) + context_seconds
|
|
|
|
|
for item in parsed_lines:
|
|
|
|
|
epoch = log_time_epoch(item)
|
|
|
|
|
if epoch is not None and start_epoch <= epoch <= end_epoch:
|
|
|
|
|
window_lines.append(item)
|
|
|
|
|
else:
|
|
|
|
|
window_lines = matched
|
|
|
|
|
events = [classify_trace_event(item, account_names_by_id) for item in matched]
|
|
|
|
|
request_start = next((item for item in events if item.get("type") == "request-start"), None)
|
|
|
|
|
final_event = next((item for item in reversed(events) if item.get("type") == "final"), None)
|
|
|
|
|
failovers = [item for item in events if item.get("type") == "failover"]
|
|
|
|
|
select_failures = [item for item in events if item.get("type") == "select-failed"]
|
|
|
|
|
upstream_errors = [item for item in events if item.get("type") == "upstream-error"]
|
|
|
|
|
temp_unsched = [with_trace_phase(classify_trace_event(item, account_names_by_id), first_epoch, last_epoch) for item in window_lines if "account_temp_unschedulable" in str(item.get("_message") or "")]
|
|
|
|
|
admin_sched = [with_trace_phase(classify_trace_event(item, account_names_by_id), first_epoch, last_epoch) for item in window_lines if ("schedulable" in str(item.get("_message") or "") or "/schedulable" in str(item.get("path") or ""))]
|
|
|
|
|
window_events = [classify_trace_event(item, account_names_by_id) for item in window_lines]
|
|
|
|
|
final_errors = [item for item in window_events if item.get("type") == "final" and isinstance(item.get("statusCode"), int) and item.get("statusCode") >= 400]
|
|
|
|
|
window_failovers = [item for item in window_events if item.get("type") == "failover"]
|
|
|
|
|
window_select_failures = [item for item in window_events if item.get("type") == "select-failed"]
|
|
|
|
|
reason = trace_reason(events, final_event)
|
|
|
|
|
if not matched:
|
|
|
|
|
outcome = "not-found"
|
|
|
|
|
elif isinstance(final_event, dict) and isinstance(final_event.get("statusCode"), int) and final_event.get("statusCode") < 400:
|
|
|
|
|
outcome = "succeeded"
|
|
|
|
|
elif isinstance(final_event, dict):
|
|
|
|
|
outcome = "failed"
|
|
|
|
|
else:
|
|
|
|
|
outcome = "incomplete"
|
|
|
|
|
return {
|
|
|
|
|
"ok": proc.returncode == 0 and len(matched) > 0,
|
|
|
|
|
"mode": "trace",
|
|
|
|
|
"namespace": NAMESPACE,
|
|
|
|
|
"serviceDns": SERVICE_DNS,
|
|
|
|
|
"appPod": APP_POD,
|
|
|
|
|
"admin": {"email": admin_email, "tokenPrinted": False, "compliance": admin_compliance},
|
|
|
|
|
"requestId": request_id,
|
|
|
|
|
"summary": {
|
|
|
|
|
"outcome": outcome,
|
|
|
|
|
"reason": reason,
|
|
|
|
|
"eventCount": len(events),
|
|
|
|
|
"matchedLineCount": len(matched),
|
|
|
|
|
"firstAt": events[0].get("at") if events else None,
|
|
|
|
|
"lastAt": events[-1].get("at") if events else None,
|
|
|
|
|
},
|
|
|
|
|
"window": {
|
|
|
|
|
"since": since,
|
|
|
|
|
"tail": tail,
|
|
|
|
|
"beforeSeconds": context_seconds,
|
|
|
|
|
"afterSeconds": context_seconds,
|
|
|
|
|
"lineCount": len(window_lines),
|
|
|
|
|
},
|
|
|
|
|
"request": request_start or {},
|
|
|
|
|
"final": final_event or {},
|
|
|
|
|
"events": events,
|
|
|
|
|
"failovers": failovers,
|
|
|
|
|
"selectFailures": select_failures,
|
|
|
|
|
"upstreamErrors": upstream_errors,
|
|
|
|
|
"tempUnschedulable": temp_unsched,
|
|
|
|
|
"adminSchedulable": admin_sched[-20:],
|
|
|
|
|
"windowStats": {
|
|
|
|
|
"matchedLines": len(matched),
|
|
|
|
|
"eventCount": len(window_events),
|
|
|
|
|
"finalErrorCount": len(final_errors),
|
|
|
|
|
"failoverCount": len(window_failovers),
|
|
|
|
|
"selectFailedCount": len(window_select_failures),
|
|
|
|
|
"tempUnschedulableCount": len(temp_unsched),
|
|
|
|
|
"adminSchedulableCount": len(admin_sched),
|
|
|
|
|
},
|
|
|
|
|
"accountSnapshot": account_snapshot,
|
|
|
|
|
"accountSnapshotError": account_snapshot_error,
|
|
|
|
|
"rawLines": [{"line": item.get("_line")} for item in matched[-30:]] if show_lines else [],
|
|
|
|
|
"showLines": show_lines,
|
|
|
|
|
"logs": {
|
|
|
|
|
"exitCode": proc.returncode,
|
|
|
|
|
"stderrTail": text(proc.stderr, 1000),
|
|
|
|
|
"stdoutLineCount": len(stdout.splitlines()),
|
|
|
|
|
},
|
|
|
|
|
"valuesPrinted": False,
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
def parse_embedded_json(stdout):
|
|
|
|
|
if not isinstance(stdout, str) or not stdout.strip():
|
|
|
|
|
return None
|
|
|
|
@@ -5491,6 +6068,8 @@ def run_cleanup_probes():
|
|
|
|
|
try:
|
|
|
|
|
if MODE == "sync":
|
|
|
|
|
result = run_sync()
|
|
|
|
|
elif MODE == "trace":
|
|
|
|
|
result = run_trace()
|
|
|
|
|
elif MODE == "cleanup-probes":
|
|
|
|
|
result = run_cleanup_probes()
|
|
|
|
|
elif MODE == "sentinel-probe":
|
|
|
|
|