fix: improve observability code agent diagnosis
This commit is contained in:
@@ -25,6 +25,7 @@ import { readObservabilityConfig } from "./config";
|
||||
import { applyScript, statusScript } from "./search-script";
|
||||
import { compactStatus, configSummary, manifestObjectSummary, policyChecks, statusSummary, targetSummary } from "./summary";
|
||||
import { renderManifest } from "./trace-script";
|
||||
import { formatTable, shortenEnd, textValue } from "./manifest";
|
||||
import { apiPathField, configLabel, kubernetesNameField, stringField } from "./types";
|
||||
|
||||
export function parseStatusEndpoint(record: Record<string, unknown>, index: number): StatusEndpoint {
|
||||
@@ -109,12 +110,20 @@ export async function apply(config: UniDeskConfig, options: ApplyOptions): Promi
|
||||
};
|
||||
}
|
||||
|
||||
export async function status(config: UniDeskConfig, options: CommonOptions): Promise<Record<string, unknown>> {
|
||||
export async function status(config: UniDeskConfig, options: CommonOptions): Promise<Record<string, unknown> | RenderedCliResult> {
|
||||
const observability = readObservabilityConfig();
|
||||
const target = resolveTarget(observability, options.targetId);
|
||||
const result = await capture(config, target.route, ["sh"], statusScript(observability, target, options.full));
|
||||
const parsed = parseJsonOutput(result.stdout);
|
||||
const summary = parsed === null ? null : statusSummary(parsed);
|
||||
if (!options.full && !options.raw) {
|
||||
return renderStatusTable({
|
||||
ok: result.exitCode === 0 && summary?.ready === true,
|
||||
target,
|
||||
summary,
|
||||
remote: parsed === null ? compactCapture(result, { full: false }) : null,
|
||||
});
|
||||
}
|
||||
return {
|
||||
ok: result.exitCode === 0 && summary?.ready === true,
|
||||
action: "platform-infra-observability-status",
|
||||
@@ -131,6 +140,66 @@ export async function status(config: UniDeskConfig, options: CommonOptions): Pro
|
||||
};
|
||||
}
|
||||
|
||||
/**
 * Renders the default status view (used when neither `--full` nor `--raw` is
 * set) as bounded plain text.
 *
 * Sections, in order: a header with the ok/not-ok verdict, the target line,
 * the Deployments / Pods / Probes tables, an optional "Remote:" line, the
 * suggested next commands, and a disclosure note.
 *
 * @param input.ok      Overall verdict; echoed in the header and in the result.
 * @param input.target  Target whose `id`, `namespace` and `route` are printed.
 * @param input.summary Status summary; its `ready`, `deployments`, `pods` and
 *                      `probes` fields are read. When `null` (or when a list
 *                      is missing) the table shows a single placeholder row.
 * @param input.remote  When non-null, JSON-stringified, passed through
 *                      `shortenEnd(…, 240)` and printed under "Remote:".
 * @returns A `text/plain` rendered CLI result carrying `input.ok`.
 */
function renderStatusTable(input: {
  ok: boolean;
  target: ObservabilityTarget;
  summary: Record<string, unknown> | null;
  remote: Record<string, unknown> | null;
}): RenderedCliResult {
  // Upper bound on pod rows so the default view stays compact.
  const maxPodRows = 12;

  const deployments = recordList(input.summary?.deployments).map((item) => [
    shortenEnd(textValue(item.name), 34),
    textValue(item.ready),
    `${textValue(item.availableReplicas)}/${textValue(item.replicas)}`,
  ]);

  const allPods = recordList(input.summary?.pods);
  const pods = allPods.slice(0, maxPodRows).map((item) => [
    shortenEnd(textValue(item.name), 44),
    textValue(item.phase),
    textValue(item.ready),
    shortenEnd(textValue(item.reason), 28),
  ]);
  const hiddenPodCount = allPods.length - pods.length;

  const probes = recordList(input.summary?.probes).map((item) => [
    shortenEnd(textValue(item.name), 24),
    textValue(item.ok),
    shortenEnd(textValue(item.service), 26),
    shortenEnd(textValue(item.path), 38),
  ]);

  const lines = [
    `platform-infra observability status (${input.ok ? "ok" : "not-ok"})`,
    "",
    `target=${input.target.id} namespace=${input.target.namespace} ready=${textValue(input.summary?.ready)} route=${input.target.route}`,
    "",
    "Deployments:",
    formatTable(["NAME", "READY", "AVAILABLE"], deployments.length > 0 ? deployments : [["-", "-", "-"]]),
    "",
    "Pods:",
    formatTable(["NAME", "PHASE", "READY", "REASON"], pods.length > 0 ? pods : [["-", "-", "-", "-"]]),
  ];

  // Make the truncation visible: without this, a pod past the row limit
  // disappears from the table with no hint that rows were dropped.
  if (hiddenPodCount > 0) {
    lines.push(` (+${hiddenPodCount} more pods not shown)`);
  }

  lines.push(
    "",
    "Probes:",
    formatTable(["NAME", "OK", "SERVICE", "PATH"], probes.length > 0 ? probes : [["-", "-", "-", "-"]]),
  );

  if (input.remote !== null) {
    lines.push("", "Remote:");
    lines.push(` ${shortenEnd(JSON.stringify(input.remote), 240)}`);
  }

  lines.push("", "Next:");
  lines.push(` bun scripts/cli.ts platform-infra observability validate --target ${input.target.id}`);
  lines.push(` bun scripts/cli.ts platform-infra observability status --target ${input.target.id} --full`);
  lines.push("", "Disclosure:");
  lines.push(" default view is a bounded table; use --full for compact JSON or --raw for backend/parser debugging.");

  return {
    ok: input.ok,
    command: "platform-infra observability status",
    contentType: "text/plain",
    renderedText: lines.join("\n"),
  };
}
|
||||
|
||||
/**
 * Narrows an untyped value to a list of plain object records.
 *
 * Returns an empty array when `value` is not an array. Otherwise keeps only
 * the elements that are non-null objects and are not themselves arrays, so
 * callers can read fields off each item without further checks.
 */
function recordList(value: unknown): Record<string, unknown>[] {
  if (!Array.isArray(value)) return [];
  return value.filter(
    (item): item is Record<string, unknown> => typeof item === "object" && item !== null && !Array.isArray(item),
  );
}
|
||||
|
||||
export async function validate(config: UniDeskConfig, options: CommonOptions): Promise<Record<string, unknown>> {
|
||||
const observability = readObservabilityConfig();
|
||||
const target = resolveTarget(observability, options.targetId);
|
||||
|
||||
@@ -63,6 +63,7 @@ export function compactDiagnoseCodeAgentResult(value: unknown): Record<string, u
|
||||
businessTraceId: mapping.businessTraceId ?? null,
|
||||
runId: mapping.runId ?? null,
|
||||
commandId: mapping.commandId ?? null,
|
||||
sessionId: mapping.sessionId ?? null,
|
||||
runnerJobId: mapping.runnerJobId ?? null,
|
||||
otelTraceId: mapping.otelTraceId ?? null,
|
||||
searchQuery: mapping.searchQuery ?? null,
|
||||
|
||||
@@ -767,12 +767,13 @@ function diagnoseSearchFilters(options: DiagnoseCodeAgentOptions): string[] {
|
||||
if (options.businessTraceId !== null) filters.push(`.traceId = ${JSON.stringify(options.businessTraceId)}`);
|
||||
if (options.runId !== null) filters.push(`.runId = ${JSON.stringify(options.runId)}`);
|
||||
if (options.commandId !== null) filters.push(`.commandId = ${JSON.stringify(options.commandId)}`);
|
||||
if (options.sessionId !== null) filters.push(`.sessionId = ${JSON.stringify(options.sessionId)}`);
|
||||
if (options.runnerJobId !== null) filters.push(`.runnerJobId = ${JSON.stringify(options.runnerJobId)}`);
|
||||
return filters;
|
||||
}
|
||||
|
||||
/**
 * Classifies how a diagnose-code-agent lookup locates its trace.
 *
 * - `"business-trace-id"`: a business trace id is set and no run, command,
 *   session or runner-job id accompanies it.
 * - `"trace-id"`: otherwise, when a trace id is set.
 * - `"trace-attribute-query"`: every other combination.
 *
 * The business-trace-id check runs first, so it wins when both a business
 * trace id and a trace id are supplied without any other id.
 */
function diagnoseSearchMode(options: DiagnoseCodeAgentOptions): string {
  // sessionId counts as an attribute filter like the other ids; only the
  // sessionId-aware condition is kept, because the older, broader condition
  // returned first and made this check unreachable.
  const hasAttributeFilter =
    options.runId !== null ||
    options.commandId !== null ||
    options.sessionId !== null ||
    options.runnerJobId !== null;

  if (options.businessTraceId !== null && !hasAttributeFilter) return "business-trace-id";
  if (options.traceId !== null) return "trace-id";
  return "trace-attribute-query";
}
|
||||
@@ -783,6 +784,7 @@ function buildDiagnoseCodeAgentCommand(target: ObservabilityTarget, options: Dia
|
||||
if (options.traceId !== null) parts.push("--trace-id", options.traceId);
|
||||
if (options.runId !== null) parts.push("--run-id", options.runId);
|
||||
if (options.commandId !== null) parts.push("--command-id", options.commandId);
|
||||
if (options.sessionId !== null) parts.push("--session-id", options.sessionId);
|
||||
if (options.runnerJobId !== null) parts.push("--runner-job-id", options.runnerJobId);
|
||||
parts.push("--lookback-minutes", String(options.lookbackMinutes), "--candidate-limit", String(options.candidateLimit), "--limit", String(options.limit));
|
||||
if (full) parts.push("--full");
|
||||
@@ -817,6 +819,7 @@ export function diagnoseCodeAgentScript(observability: ObservabilityConfig, targ
|
||||
const businessTraceIdLiteral = options.businessTraceId === null ? "None" : JSON.stringify(options.businessTraceId);
|
||||
const runIdLiteral = options.runId === null ? "None" : JSON.stringify(options.runId);
|
||||
const commandIdLiteral = options.commandId === null ? "None" : JSON.stringify(options.commandId);
|
||||
const sessionIdLiteral = options.sessionId === null ? "None" : JSON.stringify(options.sessionId);
|
||||
const runnerJobIdLiteral = options.runnerJobId === null ? "None" : JSON.stringify(options.runnerJobId);
|
||||
const searchPathLiteral = searchPath === null ? "None" : JSON.stringify(searchPath);
|
||||
const searchProxyPathLiteral = searchProxyPath === null ? "None" : JSON.stringify(searchProxyPath);
|
||||
@@ -836,6 +839,7 @@ BUSINESS_TRACE_ID = ${businessTraceIdLiteral}
|
||||
TRACE_ID = ${traceIdLiteral}
|
||||
RUN_ID = ${runIdLiteral}
|
||||
COMMAND_ID = ${commandIdLiteral}
|
||||
SESSION_ID = ${sessionIdLiteral}
|
||||
RUNNER_JOB_ID = ${runnerJobIdLiteral}
|
||||
TARGET_ID = ${JSON.stringify(target.id)}
|
||||
SEARCH_MODE = ${JSON.stringify(searchMode)}
|
||||
@@ -1781,7 +1785,7 @@ def candidate_score(trace_id, meta, trace_body, trace_rc, trace_err):
|
||||
span_points = min(len(spans), 80)
|
||||
if span_points:
|
||||
add(span_points, "span count %s" % len(spans))
|
||||
for key, points in (("runId", 50), ("commandId", 50), ("runnerJobId", 35), ("runnerId", 30), ("attemptId", 20), ("backendProfile", 15)):
|
||||
for key, points in (("runId", 50), ("commandId", 50), ("sessionId", 35), ("runnerJobId", 35), ("runnerId", 30), ("attemptId", 20), ("backendProfile", 15)):
|
||||
if identity.get(key) not in (None, ""):
|
||||
add(points, "identity %s" % key)
|
||||
if any("codex_stdio" in name for name in lowered_names):
|
||||
@@ -1821,7 +1825,7 @@ def candidate_score(trace_id, meta, trace_body, trace_rc, trace_err):
|
||||
"spanCount": len(spans),
|
||||
"services": sorted(services),
|
||||
"servicePath": service_path,
|
||||
"identity": {key: identity.get(key) for key in ("runId", "commandId", "runnerJobId", "runnerId", "backendProfile") if identity.get(key) not in (None, "")},
|
||||
"identity": {key: identity.get(key) for key in ("runId", "commandId", "sessionId", "runnerJobId", "runnerId", "backendProfile") if identity.get(key) not in (None, "")},
|
||||
"terminalStatus": agentrun.get("terminalStatus"),
|
||||
"errorSpanCount": len(error_spans),
|
||||
"candidateQuality": candidate_quality,
|
||||
@@ -1839,6 +1843,7 @@ def resolve_trace():
|
||||
"businessTraceId": BUSINESS_TRACE_ID,
|
||||
"runId": RUN_ID,
|
||||
"commandId": COMMAND_ID,
|
||||
"sessionId": SESSION_ID,
|
||||
"runnerJobId": RUNNER_JOB_ID,
|
||||
"otelTraceId": TRACE_ID,
|
||||
"searchQuery": SEARCH_QUERY,
|
||||
@@ -1898,6 +1903,7 @@ def resolve_trace():
|
||||
"businessTraceId": BUSINESS_TRACE_ID,
|
||||
"runId": RUN_ID,
|
||||
"commandId": COMMAND_ID,
|
||||
"sessionId": SESSION_ID,
|
||||
"runnerJobId": RUNNER_JOB_ID,
|
||||
"otelTraceId": selected_trace_id,
|
||||
"searchQuery": SEARCH_QUERY,
|
||||
|
||||
@@ -209,6 +209,7 @@ export function buildDiagnoseCommand(target: ObservabilityTarget, options: Diagn
|
||||
if (options.traceId !== null) parts.push("--trace-id", options.traceId);
|
||||
if (options.runId !== null) parts.push("--run-id", options.runId);
|
||||
if (options.commandId !== null) parts.push("--command-id", options.commandId);
|
||||
if (options.sessionId !== null) parts.push("--session-id", options.sessionId);
|
||||
if (options.runnerJobId !== null) parts.push("--runner-job-id", options.runnerJobId);
|
||||
parts.push("--lookback-minutes", String(options.lookbackMinutes), "--candidate-limit", String(options.candidateLimit), "--limit", String(options.limit));
|
||||
if (full) parts.push("--full");
|
||||
|
||||
@@ -41,7 +41,7 @@ export function observabilityHelp(): Record<string, unknown> {
|
||||
"bun scripts/cli.ts platform-infra observability search --target D518 --grep 'no rollout found' [--lookback-minutes 360] [--candidate-limit 80] [--limit 20] [--full|--raw]",
|
||||
"bun scripts/cli.ts platform-infra observability search --target D518 --path /v1/workbench/sessions --status 502 [--lookback-minutes 120] [--full|--raw]",
|
||||
"bun scripts/cli.ts platform-infra observability diagnose-code-agent --target D518 --business-trace-id <trc_...> [--full|--raw]",
|
||||
"bun scripts/cli.ts platform-infra observability diagnose-code-agent --target D518 --run-id <run_...> [--command-id <cmd_...>] [--runner-job-id <rjob_...>] [--full|--raw]",
|
||||
"bun scripts/cli.ts platform-infra observability diagnose-code-agent --target D518 --run-id <run_...> [--command-id <cmd_...>] [--session-id <ses_...>] [--runner-job-id <rjob_...>] [--full|--raw]",
|
||||
],
|
||||
boundary: "Prometheus remains the metrics source; this command owns only platform-infra OTel Collector, trace backend readiness, and trace lookup.",
|
||||
};
|
||||
@@ -222,6 +222,7 @@ export function parseDiagnoseCodeAgentOptions(args: string[]): DiagnoseCodeAgent
|
||||
let traceId: string | null = null;
|
||||
let runId: string | null = null;
|
||||
let commandId: string | null = null;
|
||||
let sessionId: string | null = null;
|
||||
let runnerJobId: string | null = null;
|
||||
let limit = 40;
|
||||
let candidateLimit = 300;
|
||||
@@ -252,6 +253,12 @@ export function parseDiagnoseCodeAgentOptions(args: string[]): DiagnoseCodeAgent
|
||||
if (!/^cmd_[A-Za-z0-9_-]+$/u.test(value)) throw new Error(`${arg} must look like cmd_<id>`);
|
||||
commandId = value;
|
||||
index += 1;
|
||||
} else if (arg === "--session-id" || arg === "--session") {
|
||||
const value = args[index + 1];
|
||||
if (value === undefined || value.startsWith("--")) throw new Error(`${arg} requires a value`);
|
||||
if (!/^ses_[A-Za-z0-9_-]+$/u.test(value)) throw new Error(`${arg} must look like ses_<id>`);
|
||||
sessionId = value;
|
||||
index += 1;
|
||||
} else if (arg === "--runner-job-id" || arg === "--runner-job" || arg === "--runnerjob") {
|
||||
const value = args[index + 1];
|
||||
if (value === undefined || value.startsWith("--")) throw new Error(`${arg} requires a value`);
|
||||
@@ -290,5 +297,5 @@ export function parseDiagnoseCodeAgentOptions(args: string[]): DiagnoseCodeAgent
|
||||
if (businessTraceId === null && traceId === null && runId === null && commandId === null && runnerJobId === null) {
|
||||
throw new Error("observability diagnose-code-agent requires --business-trace-id <trc_...>, --trace-id <otelTraceId>, --run-id <run_...>, --command-id <cmd_...>, or --runner-job-id <rjob_...>");
|
||||
}
|
||||
return { ...parseCommonOptions(commonArgs), businessTraceId, traceId, runId, commandId, runnerJobId, limit, candidateLimit, lookbackMinutes };
|
||||
return { ...parseCommonOptions(commonArgs), businessTraceId, traceId, runId, commandId, sessionId, runnerJobId, limit, candidateLimit, lookbackMinutes };
|
||||
}
|
||||
|
||||
@@ -209,12 +209,13 @@ function diagnoseTraceSearchFilters(options: DiagnoseCodeAgentOptions): string[]
|
||||
if (options.businessTraceId !== null) filters.push(`.traceId = ${JSON.stringify(options.businessTraceId)}`);
|
||||
if (options.runId !== null) filters.push(`.runId = ${JSON.stringify(options.runId)}`);
|
||||
if (options.commandId !== null) filters.push(`.commandId = ${JSON.stringify(options.commandId)}`);
|
||||
if (options.sessionId !== null) filters.push(`.sessionId = ${JSON.stringify(options.sessionId)}`);
|
||||
if (options.runnerJobId !== null) filters.push(`.runnerJobId = ${JSON.stringify(options.runnerJobId)}`);
|
||||
return filters;
|
||||
}
|
||||
|
||||
/**
 * Classifies the trace search performed for a diagnose-code-agent request.
 *
 * Returns `"business-trace-id"` only when a business trace id is set and no
 * run, command, session or runner-job id accompanies it; every other
 * combination is a `"trace-attribute-query"`.
 */
function diagnoseTraceSearchMode(options: DiagnoseCodeAgentOptions): string {
  // sessionId is checked together with the other ids; the older condition
  // that ignored it returned first and made the sessionId-aware check dead code.
  const attributeIds = [options.runId, options.commandId, options.sessionId, options.runnerJobId];
  const onlyBusinessTraceId = options.businessTraceId !== null && attributeIds.every((id) => id === null);
  return onlyBusinessTraceId ? "business-trace-id" : "trace-attribute-query";
}
|
||||
|
||||
@@ -245,9 +246,11 @@ export async function diagnoseCodeAgent(config: UniDeskConfig, options: Diagnose
|
||||
traceId: options.traceId,
|
||||
runId: options.runId,
|
||||
commandId: options.commandId,
|
||||
sessionId: options.sessionId,
|
||||
runnerJobId: options.runnerJobId,
|
||||
mode: diagnoseTraceSearchMode(options),
|
||||
tempoQuery,
|
||||
queryClauses: searchFilters,
|
||||
path: searchPath,
|
||||
lookbackMinutes: options.lookbackMinutes,
|
||||
candidateLimit: options.candidateLimit,
|
||||
@@ -630,6 +633,15 @@ export function renderDiagnoseCodeAgentTable(input: {
|
||||
shortenEnd(textValue(candidate.summary ?? candidate.label), 80),
|
||||
]);
|
||||
const httpRows = httpTableRows(http);
|
||||
const queryClauses = asArray(input.query.queryClauses).map((item) => textValue(item)).filter((item) => item !== "-");
|
||||
const requestedRunId = textValue(input.query.runId);
|
||||
const requestedCommandId = textValue(input.query.commandId);
|
||||
const requestedSessionId = textValue(input.query.sessionId);
|
||||
const requestedRunnerJobId = textValue(input.query.runnerJobId);
|
||||
const observedRunId = textValue(identity?.runId);
|
||||
const observedCommandId = textValue(identity?.commandId);
|
||||
const observedSessionId = textValue(identity?.sessionId);
|
||||
const observedRunnerJobId = textValue(identity?.runnerJobId);
|
||||
const lines = [
|
||||
`platform-infra observability diagnose-code-agent (${input.ok ? "ok" : "not-ok"})`,
|
||||
"",
|
||||
@@ -637,8 +649,10 @@ export function renderDiagnoseCodeAgentTable(input: {
|
||||
"",
|
||||
"Identity:",
|
||||
` businessTraceId=${textValue(mapping?.businessTraceId ?? input.query.businessTraceId)} otelTraceId=${traceId}`,
|
||||
` queryMode=${textValue(mapping?.mode ?? input.query.mode)} tempoQuery=${shortenMiddle(textValue(mapping?.searchQuery ?? input.query.tempoQuery), 80)}`,
|
||||
` runId=${textValue(identity?.runId)} commandId=${textValue(identity?.commandId)} runnerJobId=${textValue(identity?.runnerJobId)} runnerId=${textValue(identity?.runnerId)}`,
|
||||
` queryMode=${textValue(mapping?.mode ?? input.query.mode)} tempoQuery=${shortenEnd(textValue(mapping?.searchQuery ?? input.query.tempoQuery), 180)}`,
|
||||
` queryClauses=${queryClauses.length > 0 ? queryClauses.join(" ; ") : "-"}`,
|
||||
` requested runId=${requestedRunId} commandId=${requestedCommandId} sessionId=${requestedSessionId} runnerJobId=${requestedRunnerJobId}`,
|
||||
` observed runId=${observedRunId} commandId=${observedCommandId} sessionId=${observedSessionId} runnerJobId=${observedRunnerJobId} runnerId=${textValue(identity?.runnerId)}`,
|
||||
` backendProfile=${textValue(identity?.backendProfile)} sourceCommit=${shortenMiddle(textValue(identity?.sourceCommit), 20)}`,
|
||||
"",
|
||||
"Root causes:",
|
||||
|
||||
@@ -159,6 +159,7 @@ export interface DiagnoseCodeAgentOptions extends CommonOptions {
|
||||
traceId: string | null;
|
||||
runId: string | null;
|
||||
commandId: string | null;
|
||||
sessionId: string | null;
|
||||
runnerJobId: string | null;
|
||||
limit: number;
|
||||
candidateLimit: number;
|
||||
|
||||
Reference in New Issue
Block a user