fix: improve observability code agent diagnosis

This commit is contained in:
Codex
2026-06-30 04:06:39 +00:00
parent 8d1b1bf71f
commit 1eb9c2a89c
7 changed files with 108 additions and 9 deletions
@@ -25,6 +25,7 @@ import { readObservabilityConfig } from "./config";
import { applyScript, statusScript } from "./search-script";
import { compactStatus, configSummary, manifestObjectSummary, policyChecks, statusSummary, targetSummary } from "./summary";
import { renderManifest } from "./trace-script";
import { formatTable, shortenEnd, textValue } from "./manifest";
import { apiPathField, configLabel, kubernetesNameField, stringField } from "./types";
export function parseStatusEndpoint(record: Record<string, unknown>, index: number): StatusEndpoint {
@@ -109,12 +110,20 @@ export async function apply(config: UniDeskConfig, options: ApplyOptions): Promi
};
}
export async function status(config: UniDeskConfig, options: CommonOptions): Promise<Record<string, unknown>> {
export async function status(config: UniDeskConfig, options: CommonOptions): Promise<Record<string, unknown> | RenderedCliResult> {
const observability = readObservabilityConfig();
const target = resolveTarget(observability, options.targetId);
const result = await capture(config, target.route, ["sh"], statusScript(observability, target, options.full));
const parsed = parseJsonOutput(result.stdout);
const summary = parsed === null ? null : statusSummary(parsed);
if (!options.full && !options.raw) {
return renderStatusTable({
ok: result.exitCode === 0 && summary?.ready === true,
target,
summary,
remote: parsed === null ? compactCapture(result, { full: false }) : null,
});
}
return {
ok: result.exitCode === 0 && summary?.ready === true,
action: "platform-infra-observability-status",
@@ -131,6 +140,66 @@ export async function status(config: UniDeskConfig, options: CommonOptions): Pro
};
}
/**
 * Builds the default plain-text status view.
 *
 * The output is a header line, a target line, and three tables (Deployments,
 * Pods, Probes) whose rows come from `input.summary` via `recordList`. A
 * section with no rows is rendered with a single row of "-" placeholders, and
 * only the first 12 pod records are listed. When `input.remote` is non-null,
 * its JSON form is passed through `shortenEnd` with a limit of 240 and appended
 * under a "Remote:" heading. Follow-up command hints and a disclosure note
 * always close the output.
 *
 * @param input - overall ok flag, the resolved target, the status summary (or
 *   null), and optional remote capture details.
 * @returns a `text/plain` rendered result whose `ok` mirrors `input.ok`.
 */
function renderStatusTable(input: {
  ok: boolean;
  target: ObservabilityTarget;
  summary: Record<string, unknown> | null;
  remote: Record<string, unknown> | null;
}): RenderedCliResult {
  const { ok, target, summary, remote } = input;

  const deploymentRows = recordList(summary?.deployments).map((deployment) => {
    const name = shortenEnd(textValue(deployment.name), 34);
    const ready = textValue(deployment.ready);
    const available = textValue(deployment.availableReplicas);
    const desired = textValue(deployment.replicas);
    return [name, ready, `${available}/${desired}`];
  });

  // The pod list is capped before mapping so the table stays bounded.
  const podRows = recordList(summary?.pods)
    .slice(0, 12)
    .map((pod) => {
      const name = shortenEnd(textValue(pod.name), 44);
      const phase = textValue(pod.phase);
      const ready = textValue(pod.ready);
      const reason = shortenEnd(textValue(pod.reason), 28);
      return [name, phase, ready, reason];
    });

  const probeRows = recordList(summary?.probes).map((probe) => {
    const name = shortenEnd(textValue(probe.name), 24);
    const passed = textValue(probe.ok);
    const service = shortenEnd(textValue(probe.service), 26);
    const path = shortenEnd(textValue(probe.path), 38);
    return [name, passed, service, path];
  });

  const statusLabel = ok ? "ok" : "not-ok";
  const body = [
    `platform-infra observability status (${statusLabel})`,
    "",
    `target=${target.id} namespace=${target.namespace} ready=${textValue(summary?.ready)} route=${target.route}`,
    "",
    "Deployments:",
    formatTable(["NAME", "READY", "AVAILABLE"], deploymentRows.length === 0 ? [["-", "-", "-"]] : deploymentRows),
    "",
    "Pods:",
    formatTable(["NAME", "PHASE", "READY", "REASON"], podRows.length === 0 ? [["-", "-", "-", "-"]] : podRows),
    "",
    "Probes:",
    formatTable(["NAME", "OK", "SERVICE", "PATH"], probeRows.length === 0 ? [["-", "-", "-", "-"]] : probeRows),
  ];

  // The remote section is emitted only when remote details were supplied.
  const remoteSection = remote === null ? [] : ["", "Remote:", `  ${shortenEnd(JSON.stringify(remote), 240)}`];

  const footer = [
    "",
    "Next:",
    `  bun scripts/cli.ts platform-infra observability validate --target ${target.id}`,
    `  bun scripts/cli.ts platform-infra observability status --target ${target.id} --full`,
    "",
    "Disclosure:",
    "  default view is a bounded table; use --full for compact JSON or --raw for backend/parser debugging.",
  ];

  return {
    ok,
    command: "platform-infra observability status",
    contentType: "text/plain",
    renderedText: [...body, ...remoteSection, ...footer].join("\n"),
  };
}
/**
 * Extracts the plain-object entries from a value of unknown shape.
 *
 * @param value - anything; non-array input yields an empty list.
 * @returns the entries of `value` that are non-null objects and not arrays,
 *   in their original order (same object references).
 */
function recordList(value: unknown): Record<string, unknown>[] {
  if (!Array.isArray(value)) return [];
  const records: Record<string, unknown>[] = [];
  for (const entry of value) {
    // Keep only non-null, non-array objects; primitives and nested arrays are dropped.
    if (entry === null || typeof entry !== "object" || Array.isArray(entry)) continue;
    records.push(entry as Record<string, unknown>);
  }
  return records;
}
export async function validate(config: UniDeskConfig, options: CommonOptions): Promise<Record<string, unknown>> {
const observability = readObservabilityConfig();
const target = resolveTarget(observability, options.targetId);
@@ -63,6 +63,7 @@ export function compactDiagnoseCodeAgentResult(value: unknown): Record<string, u
businessTraceId: mapping.businessTraceId ?? null,
runId: mapping.runId ?? null,
commandId: mapping.commandId ?? null,
sessionId: mapping.sessionId ?? null,
runnerJobId: mapping.runnerJobId ?? null,
otelTraceId: mapping.otelTraceId ?? null,
searchQuery: mapping.searchQuery ?? null,
@@ -767,12 +767,13 @@ function diagnoseSearchFilters(options: DiagnoseCodeAgentOptions): string[] {
if (options.businessTraceId !== null) filters.push(`.traceId = ${JSON.stringify(options.businessTraceId)}`);
if (options.runId !== null) filters.push(`.runId = ${JSON.stringify(options.runId)}`);
if (options.commandId !== null) filters.push(`.commandId = ${JSON.stringify(options.commandId)}`);
if (options.sessionId !== null) filters.push(`.sessionId = ${JSON.stringify(options.sessionId)}`);
if (options.runnerJobId !== null) filters.push(`.runnerJobId = ${JSON.stringify(options.runnerJobId)}`);
return filters;
}
/**
 * Labels how the trace lookup for a diagnose-code-agent request is performed.
 *
 * - "business-trace-id": a business trace id was given and no run, command,
 *   session, or runner-job id narrows the lookup.
 * - "trace-id": otherwise, an OTel trace id was given.
 * - "trace-attribute-query": every other combination.
 *
 * @param options - parsed diagnose-code-agent options; absent ids are `null`.
 * @returns one of the three mode labels above.
 */
function diagnoseSearchMode(options: DiagnoseCodeAgentOptions): string {
  // sessionId must take part in this check: a guard that ignores it would report
  // "business-trace-id" for a request that also filters on the session.
  const hasAttributeId =
    options.runId !== null || options.commandId !== null || options.sessionId !== null || options.runnerJobId !== null;
  if (options.businessTraceId !== null && !hasAttributeId) return "business-trace-id";
  if (options.traceId !== null) return "trace-id";
  return "trace-attribute-query";
}
@@ -783,6 +784,7 @@ function buildDiagnoseCodeAgentCommand(target: ObservabilityTarget, options: Dia
if (options.traceId !== null) parts.push("--trace-id", options.traceId);
if (options.runId !== null) parts.push("--run-id", options.runId);
if (options.commandId !== null) parts.push("--command-id", options.commandId);
if (options.sessionId !== null) parts.push("--session-id", options.sessionId);
if (options.runnerJobId !== null) parts.push("--runner-job-id", options.runnerJobId);
parts.push("--lookback-minutes", String(options.lookbackMinutes), "--candidate-limit", String(options.candidateLimit), "--limit", String(options.limit));
if (full) parts.push("--full");
@@ -817,6 +819,7 @@ export function diagnoseCodeAgentScript(observability: ObservabilityConfig, targ
const businessTraceIdLiteral = options.businessTraceId === null ? "None" : JSON.stringify(options.businessTraceId);
const runIdLiteral = options.runId === null ? "None" : JSON.stringify(options.runId);
const commandIdLiteral = options.commandId === null ? "None" : JSON.stringify(options.commandId);
const sessionIdLiteral = options.sessionId === null ? "None" : JSON.stringify(options.sessionId);
const runnerJobIdLiteral = options.runnerJobId === null ? "None" : JSON.stringify(options.runnerJobId);
const searchPathLiteral = searchPath === null ? "None" : JSON.stringify(searchPath);
const searchProxyPathLiteral = searchProxyPath === null ? "None" : JSON.stringify(searchProxyPath);
@@ -836,6 +839,7 @@ BUSINESS_TRACE_ID = ${businessTraceIdLiteral}
TRACE_ID = ${traceIdLiteral}
RUN_ID = ${runIdLiteral}
COMMAND_ID = ${commandIdLiteral}
SESSION_ID = ${sessionIdLiteral}
RUNNER_JOB_ID = ${runnerJobIdLiteral}
TARGET_ID = ${JSON.stringify(target.id)}
SEARCH_MODE = ${JSON.stringify(searchMode)}
@@ -1781,7 +1785,7 @@ def candidate_score(trace_id, meta, trace_body, trace_rc, trace_err):
span_points = min(len(spans), 80)
if span_points:
add(span_points, "span count %s" % len(spans))
for key, points in (("runId", 50), ("commandId", 50), ("runnerJobId", 35), ("runnerId", 30), ("attemptId", 20), ("backendProfile", 15)):
for key, points in (("runId", 50), ("commandId", 50), ("sessionId", 35), ("runnerJobId", 35), ("runnerId", 30), ("attemptId", 20), ("backendProfile", 15)):
if identity.get(key) not in (None, ""):
add(points, "identity %s" % key)
if any("codex_stdio" in name for name in lowered_names):
@@ -1821,7 +1825,7 @@ def candidate_score(trace_id, meta, trace_body, trace_rc, trace_err):
"spanCount": len(spans),
"services": sorted(services),
"servicePath": service_path,
"identity": {key: identity.get(key) for key in ("runId", "commandId", "runnerJobId", "runnerId", "backendProfile") if identity.get(key) not in (None, "")},
"identity": {key: identity.get(key) for key in ("runId", "commandId", "sessionId", "runnerJobId", "runnerId", "backendProfile") if identity.get(key) not in (None, "")},
"terminalStatus": agentrun.get("terminalStatus"),
"errorSpanCount": len(error_spans),
"candidateQuality": candidate_quality,
@@ -1839,6 +1843,7 @@ def resolve_trace():
"businessTraceId": BUSINESS_TRACE_ID,
"runId": RUN_ID,
"commandId": COMMAND_ID,
"sessionId": SESSION_ID,
"runnerJobId": RUNNER_JOB_ID,
"otelTraceId": TRACE_ID,
"searchQuery": SEARCH_QUERY,
@@ -1898,6 +1903,7 @@ def resolve_trace():
"businessTraceId": BUSINESS_TRACE_ID,
"runId": RUN_ID,
"commandId": COMMAND_ID,
"sessionId": SESSION_ID,
"runnerJobId": RUNNER_JOB_ID,
"otelTraceId": selected_trace_id,
"searchQuery": SEARCH_QUERY,
@@ -209,6 +209,7 @@ export function buildDiagnoseCommand(target: ObservabilityTarget, options: Diagn
if (options.traceId !== null) parts.push("--trace-id", options.traceId);
if (options.runId !== null) parts.push("--run-id", options.runId);
if (options.commandId !== null) parts.push("--command-id", options.commandId);
if (options.sessionId !== null) parts.push("--session-id", options.sessionId);
if (options.runnerJobId !== null) parts.push("--runner-job-id", options.runnerJobId);
parts.push("--lookback-minutes", String(options.lookbackMinutes), "--candidate-limit", String(options.candidateLimit), "--limit", String(options.limit));
if (full) parts.push("--full");
@@ -41,7 +41,7 @@ export function observabilityHelp(): Record<string, unknown> {
"bun scripts/cli.ts platform-infra observability search --target D518 --grep 'no rollout found' [--lookback-minutes 360] [--candidate-limit 80] [--limit 20] [--full|--raw]",
"bun scripts/cli.ts platform-infra observability search --target D518 --path /v1/workbench/sessions --status 502 [--lookback-minutes 120] [--full|--raw]",
"bun scripts/cli.ts platform-infra observability diagnose-code-agent --target D518 --business-trace-id <trc_...> [--full|--raw]",
"bun scripts/cli.ts platform-infra observability diagnose-code-agent --target D518 --run-id <run_...> [--command-id <cmd_...>] [--runner-job-id <rjob_...>] [--full|--raw]",
"bun scripts/cli.ts platform-infra observability diagnose-code-agent --target D518 --run-id <run_...> [--command-id <cmd_...>] [--session-id <ses_...>] [--runner-job-id <rjob_...>] [--full|--raw]",
],
boundary: "Prometheus remains the metrics source; this command owns only platform-infra OTel Collector, trace backend readiness, and trace lookup.",
};
@@ -222,6 +222,7 @@ export function parseDiagnoseCodeAgentOptions(args: string[]): DiagnoseCodeAgent
let traceId: string | null = null;
let runId: string | null = null;
let commandId: string | null = null;
let sessionId: string | null = null;
let runnerJobId: string | null = null;
let limit = 40;
let candidateLimit = 300;
@@ -252,6 +253,12 @@ export function parseDiagnoseCodeAgentOptions(args: string[]): DiagnoseCodeAgent
if (!/^cmd_[A-Za-z0-9_-]+$/u.test(value)) throw new Error(`${arg} must look like cmd_<id>`);
commandId = value;
index += 1;
} else if (arg === "--session-id" || arg === "--session") {
const value = args[index + 1];
if (value === undefined || value.startsWith("--")) throw new Error(`${arg} requires a value`);
if (!/^ses_[A-Za-z0-9_-]+$/u.test(value)) throw new Error(`${arg} must look like ses_<id>`);
sessionId = value;
index += 1;
} else if (arg === "--runner-job-id" || arg === "--runner-job" || arg === "--runnerjob") {
const value = args[index + 1];
if (value === undefined || value.startsWith("--")) throw new Error(`${arg} requires a value`);
@@ -290,5 +297,5 @@ export function parseDiagnoseCodeAgentOptions(args: string[]): DiagnoseCodeAgent
if (businessTraceId === null && traceId === null && runId === null && commandId === null && runnerJobId === null) {
throw new Error("observability diagnose-code-agent requires --business-trace-id <trc_...>, --trace-id <otelTraceId>, --run-id <run_...>, --command-id <cmd_...>, or --runner-job-id <rjob_...>");
}
return { ...parseCommonOptions(commonArgs), businessTraceId, traceId, runId, commandId, runnerJobId, limit, candidateLimit, lookbackMinutes };
return { ...parseCommonOptions(commonArgs), businessTraceId, traceId, runId, commandId, sessionId, runnerJobId, limit, candidateLimit, lookbackMinutes };
}
@@ -209,12 +209,13 @@ function diagnoseTraceSearchFilters(options: DiagnoseCodeAgentOptions): string[]
if (options.businessTraceId !== null) filters.push(`.traceId = ${JSON.stringify(options.businessTraceId)}`);
if (options.runId !== null) filters.push(`.runId = ${JSON.stringify(options.runId)}`);
if (options.commandId !== null) filters.push(`.commandId = ${JSON.stringify(options.commandId)}`);
if (options.sessionId !== null) filters.push(`.sessionId = ${JSON.stringify(options.sessionId)}`);
if (options.runnerJobId !== null) filters.push(`.runnerJobId = ${JSON.stringify(options.runnerJobId)}`);
return filters;
}
/**
 * Labels how the trace search for a diagnose-code-agent request is performed.
 *
 * - "business-trace-id": a business trace id was given and no run, command,
 *   session, or runner-job id narrows the lookup.
 * - "trace-attribute-query": every other combination.
 *
 * @param options - parsed diagnose-code-agent options; absent ids are `null`.
 * @returns one of the two mode labels above.
 */
function diagnoseTraceSearchMode(options: DiagnoseCodeAgentOptions): string {
  // sessionId must take part in this check: a guard that ignores it would report
  // "business-trace-id" for a request that also filters on the session.
  const hasAttributeId =
    options.runId !== null || options.commandId !== null || options.sessionId !== null || options.runnerJobId !== null;
  if (options.businessTraceId !== null && !hasAttributeId) return "business-trace-id";
  return "trace-attribute-query";
}
@@ -245,9 +246,11 @@ export async function diagnoseCodeAgent(config: UniDeskConfig, options: Diagnose
traceId: options.traceId,
runId: options.runId,
commandId: options.commandId,
sessionId: options.sessionId,
runnerJobId: options.runnerJobId,
mode: diagnoseTraceSearchMode(options),
tempoQuery,
queryClauses: searchFilters,
path: searchPath,
lookbackMinutes: options.lookbackMinutes,
candidateLimit: options.candidateLimit,
@@ -630,6 +633,15 @@ export function renderDiagnoseCodeAgentTable(input: {
shortenEnd(textValue(candidate.summary ?? candidate.label), 80),
]);
const httpRows = httpTableRows(http);
const queryClauses = asArray(input.query.queryClauses).map((item) => textValue(item)).filter((item) => item !== "-");
const requestedRunId = textValue(input.query.runId);
const requestedCommandId = textValue(input.query.commandId);
const requestedSessionId = textValue(input.query.sessionId);
const requestedRunnerJobId = textValue(input.query.runnerJobId);
const observedRunId = textValue(identity?.runId);
const observedCommandId = textValue(identity?.commandId);
const observedSessionId = textValue(identity?.sessionId);
const observedRunnerJobId = textValue(identity?.runnerJobId);
const lines = [
`platform-infra observability diagnose-code-agent (${input.ok ? "ok" : "not-ok"})`,
"",
@@ -637,8 +649,10 @@ export function renderDiagnoseCodeAgentTable(input: {
"",
"Identity:",
` businessTraceId=${textValue(mapping?.businessTraceId ?? input.query.businessTraceId)} otelTraceId=${traceId}`,
` queryMode=${textValue(mapping?.mode ?? input.query.mode)} tempoQuery=${shortenMiddle(textValue(mapping?.searchQuery ?? input.query.tempoQuery), 80)}`,
` runId=${textValue(identity?.runId)} commandId=${textValue(identity?.commandId)} runnerJobId=${textValue(identity?.runnerJobId)} runnerId=${textValue(identity?.runnerId)}`,
` queryMode=${textValue(mapping?.mode ?? input.query.mode)} tempoQuery=${shortenEnd(textValue(mapping?.searchQuery ?? input.query.tempoQuery), 180)}`,
` queryClauses=${queryClauses.length > 0 ? queryClauses.join(" ; ") : "-"}`,
` requested runId=${requestedRunId} commandId=${requestedCommandId} sessionId=${requestedSessionId} runnerJobId=${requestedRunnerJobId}`,
` observed runId=${observedRunId} commandId=${observedCommandId} sessionId=${observedSessionId} runnerJobId=${observedRunnerJobId} runnerId=${textValue(identity?.runnerId)}`,
` backendProfile=${textValue(identity?.backendProfile)} sourceCommit=${shortenMiddle(textValue(identity?.sourceCommit), 20)}`,
"",
"Root causes:",
@@ -159,6 +159,7 @@ export interface DiagnoseCodeAgentOptions extends CommonOptions {
traceId: string | null;
runId: string | null;
commandId: string | null;
sessionId: string | null;
runnerJobId: string | null;
limit: number;
candidateLimit: number;