344 lines
18 KiB
TypeScript
344 lines
18 KiB
TypeScript
import type { CommandRecord, JsonRecord, JsonValue, RunEvent, RunRecord, RunnerJobRecord } from "../common/types.js";
|
|
import { boundedTextSummary } from "../common/output.js";
|
|
|
|
/**
 * Input bundle consumed by {@link runDiagnosis} to build the diagnosis record for one run.
 */
export interface RunDiagnosisInput {
  /** Run being diagnosed; its id, status, claimedBy, leaseExpiresAt and sessionRef are read. */
  run: RunRecord;

  /** Command reported alongside the run, or null when there is none. */
  command: CommandRecord | null;

  /** Runner job to summarise, or null when no job is available. */
  latestJob: RunnerJobRecord | null;

  /** Events handed to the runner-job observation logic. */
  events: RunEvent[];

  /**
   * Optional classification record. The keys `category`, `providerEvidence`,
   * `providerInterruption` and `providerInterruptionKnown` are read from it.
   */
  terminalClassification: JsonRecord | null;

  /** Optional liveness record. The keys `lease`, `timeoutBudget` and `lastSeq` are read from it. */
  liveness: JsonRecord | null;

  /** Terminal status of the command, or null when none has been reported. */
  terminalStatus: string | null;

  /** Failure kind echoed into the diagnosis and used as the recovery reason, when present. */
  failureKind: string | null;

  /** Failure message; summarised to a bounded length before being echoed. */
  failureMessage: string | null;
}
|
|
|
|
/**
 * Assembles the diagnosis record for a run.
 *
 * The record combines lease state, the latest runner-job observation, the
 * terminal classification and a list of suggested recovery actions.
 *
 * @param input - Run, command, job, events and classification data to diagnose.
 * @returns A JSON record with the diagnosis category, supporting flags, and
 *   compact references to the run, command, runner job and session.
 */
export function runDiagnosis(input: RunDiagnosisInput): JsonRecord {
  const { run, command, latestJob, liveness, terminalStatus, failureKind } = input;
  const classification = input.terminalClassification;
  const nowMs = Date.now();

  // Prefer the lease reported in the liveness record; otherwise derive it from the run itself.
  const lease = recordAt(liveness, "lease") ?? leaseFromRun(run, nowMs);
  const timeoutBudget = recordAt(liveness, "timeoutBudget");

  const claimed = run.status === "claimed";
  const staleClaimed = claimed && booleanValue(lease.leaseExpired) === true;
  const terminalCommandOpenRun = claimed && terminalStatus !== null;

  const runnerJob = latestJob ? runnerJobReference(latestJob, input.events) : null;
  // A stale claim counts as a lost runner when there is no job at all or its phase indicates loss.
  const runnerLost = staleClaimed && (runnerJob === null || runnerJobPhaseIndicatesLost(runnerJob.phase));
  const session = sessionReference(run);

  const terminalCategory = stringValue(classification?.category);
  const providerEvidence = stringValue(classification?.providerEvidence) ?? "not-applicable";
  const providerInterruption = stringValue(classification?.providerInterruption) ?? "not-established";

  const category = diagnosisCategory({
    staleClaimed,
    runnerLost,
    terminalCommandOpenRun,
    providerEvidence,
    terminalCategory,
  });
  const recoveryActions = recoveryActionsForDiagnosis({
    run,
    command,
    latestJob,
    session,
    runnerLost,
    staleClaimed,
    terminalCommandOpenRun,
    failureKind,
    lastSeq: numberValue(liveness?.lastSeq) ?? 0,
  });

  const failureMessage = input.failureMessage
    ? (boundedTextSummary(input.failureMessage, { limitChars: 240 }).text as string)
    : null;

  const runSummary: JsonRecord = {
    runId: run.id,
    status: run.status,
    claimedBy: run.claimedBy,
    leaseExpiresAt: run.leaseExpiresAt,
    leaseExpired: booleanOrNull(lease.leaseExpired),
    leaseRemainingMs: numberValue(lease.leaseRemainingMs),
    valuesPrinted: false,
  };

  const commandSummary: JsonRecord | null = command
    ? {
        commandId: command.id,
        state: command.state,
        terminalStatus,
        acknowledgedAt: command.acknowledgedAt ?? null,
        updatedAt: command.updatedAt,
        valuesPrinted: false,
      }
    : null;

  const timeoutBudgetSummary = timeoutBudget
    ? compactRecord(timeoutBudget, [
        "state",
        "timeoutKind",
        "timeoutMs",
        "elapsedMs",
        "idleElapsedMs",
        "remainingMs",
        "startedAt",
        "idleStartedAt",
        "lastActivityAt",
        "lastActivitySeq",
        "commandElapsedMs",
        "runElapsedMs",
        "source",
      ])
    : null;

  return {
    category,
    staleClaimed,
    runnerLost,
    terminalCommandOpenRun,
    evidenceLevel: evidenceLevel(category, providerEvidence, runnerLost, staleClaimed, terminalCommandOpenRun),
    providerEvidence,
    providerInterruption,
    providerInterruptionKnown: classification?.providerInterruptionKnown === true,
    terminalCategory,
    terminalStatus,
    failureKind,
    failureMessage,
    run: runSummary,
    command: commandSummary,
    runnerJob,
    session,
    timeoutBudget: timeoutBudgetSummary,
    recoveryActions,
    valuesPrinted: false,
  };
}
|
|
|
|
/**
 * Produces a diagnosis record for a single runner job.
 *
 * @param job - Runner job to diagnose.
 * @param events - Events used to observe the job; defaults to none.
 * @returns A JSON record with the observed phase, a lost-runner suspicion
 *   flag, job identifiers and three follow-up actions (inspect run, inspect
 *   command, poll events).
 */
export function runnerJobDiagnosis(job: RunnerJobRecord, events: RunEvent[] = []): JsonRecord {
  const observation = runnerJobObservation(job, events);
  const phase = stringValue(observation.phase) ?? "unknown";
  const notStarted = phase === "created" || phase === "recorded";

  // Fallbacks used only when the observation does not carry its own category / evidence level.
  let fallbackCategory: string;
  if (notStarted) {
    fallbackCategory = "runner-job-created";
  } else if (phase.startsWith("terminal:")) {
    fallbackCategory = "runner-job-terminal";
  } else {
    fallbackCategory = "runner-job-observed";
  }
  const fallbackEvidence = notStarted ? "medium" : "high";

  const { runId, commandId } = job;
  const nextActions: JsonRecord[] = [
    recoveryDescriptor({ action: "inspect-run", operation: "describe", resourceKind: "run", resourceName: runId, runId }),
    recoveryDescriptor({ action: "inspect-command", operation: "describe", resourceKind: "command", resourceName: commandId, runId, commandId }),
    recoveryDescriptor({ action: "poll-events", operation: "events", resourceKind: "run", resourceName: runId, runId, commandId, afterSeq: 0, limit: 100 }),
  ];

  return {
    category: stringValue(observation.category) ?? fallbackCategory,
    runnerLostSuspected: notStarted || runnerJobPhaseIndicatesLost(phase),
    phase,
    evidenceLevel: stringValue(observation.evidenceLevel) ?? fallbackEvidence,
    lastObservedSeq: numberValue(observation.lastObservedSeq),
    lastObservedAt: stringValue(observation.lastObservedAt),
    lastObservedKind: stringValue(observation.lastObservedKind),
    terminalReportState: stringValue(observation.terminalReportState),
    runReportState: stringValue(observation.runReportState),
    runId,
    commandId,
    runnerJobId: job.id,
    attemptId: job.attemptId,
    runnerId: job.runnerId,
    jobName: job.jobName,
    namespace: job.namespace,
    logPath: stringValue(recordAt(job.result, "runner")?.logPath),
    nextActions,
    valuesPrinted: false,
  };
}
|
|
|
|
/**
 * Derives the most specific observation available for a runner job.
 *
 * Sources are tried in this order, and the first that applies wins:
 * 1. A terminal status, taken from the latest terminal event for the job's
 *    command or else from `job.result.terminalStatus` — phase `terminal:<status>`.
 * 2. Any event that counts as runner activity for the job — phase `running`.
 * 3. `job.result.observation` when it carries a phase — that phase is reported.
 * 4. Otherwise `created` or `recorded`, depending on `job.result.kubernetes.created`.
 *
 * @param job - Runner job to observe.
 * @param events - Events to inspect; defaults to none.
 * @returns A JSON record describing phase, category, timestamps and evidence level.
 */
export function runnerJobObservation(job: RunnerJobRecord, events: RunEvent[] = []): JsonRecord {
  const terminalEvent = latestTerminalEvent(events, job.commandId);
  // `payload` is read with `?.` like everywhere else in this file, so a terminal
  // event without a payload falls back to the job result instead of throwing.
  const terminalStatus =
    stringValue(terminalEvent?.payload?.terminalStatus) ?? stringValue(latestTerminalStatusFromResult(job.result));
  if (terminalStatus) {
    return {
      phase: `terminal:${terminalStatus}`,
      category: "runner-job-terminal",
      terminalStatus,
      failureKind: stringValue(terminalEvent?.payload?.failureKind),
      startedAt: stringValue(firstObservedRunnerEvent(job, events)?.createdAt),
      finishedAt: stringValue(terminalEvent?.createdAt),
      lastObservedSeq: numberValue(terminalEvent?.seq),
      lastObservedAt: stringValue(terminalEvent?.createdAt),
      lastObservedKind: eventKind(terminalEvent),
      evidenceLevel: "high",
      valuesPrinted: false,
    };
  }

  const observed = relevantRunnerJobEvents(job, events);
  const lastObserved = observed.at(-1) ?? null;
  if (lastObserved) {
    return {
      phase: "running",
      category: "runner-job-running",
      terminalStatus: null,
      failureKind: null,
      startedAt: stringValue(observed[0]?.createdAt),
      finishedAt: null,
      lastObservedSeq: numberValue(lastObserved.seq),
      lastObservedAt: stringValue(lastObserved.createdAt),
      lastObservedKind: eventKind(lastObserved),
      evidenceLevel: "high",
      valuesPrinted: false,
    };
  }

  // No events: fall back to the observation stored on the job result, if it names a phase.
  const reconcilerObservation = recordAt(job.result, "observation");
  const reconcilerPhase =
    stringValue(reconcilerObservation?.observedRunnerPhase) ?? stringValue(reconcilerObservation?.phase);
  if (reconcilerObservation && reconcilerPhase) {
    const k8s = recordAt(reconcilerObservation, "k8s");
    return {
      phase: reconcilerPhase,
      category: stringValue(reconcilerObservation.category) ?? "runner-job-observed",
      terminalStatus: stringValue(reconcilerObservation.terminalStatus),
      failureKind: stringValue(reconcilerObservation.failureKind),
      startedAt: stringValue(k8s?.startTime),
      finishedAt: stringValue(k8s?.completionTime),
      lastObservedSeq: null,
      lastObservedAt:
        stringValue(reconcilerObservation.lastK8sObservedAt) ?? stringValue(reconcilerObservation.lastObservedAt),
      lastObservedKind:
        stringValue(reconcilerObservation.lastObservedKind) ?? `manager-reconciler:${reconcilerPhase}`,
      terminalReportState: stringValue(reconcilerObservation.terminalReportState),
      runReportState: stringValue(reconcilerObservation.runReportState),
      evidenceLevel: stringValue(reconcilerObservation.evidenceLevel) ?? "high",
      valuesPrinted: false,
    };
  }

  // Nothing observed at all: report whether the job result says the Kubernetes object was created.
  const created = recordAt(job.result, "kubernetes")?.created === true;
  return {
    phase: created ? "created" : "recorded",
    category: created ? "runner-job-created" : "runner-job-recorded",
    terminalStatus: null,
    failureKind: null,
    startedAt: null,
    finishedAt: null,
    lastObservedSeq: null,
    lastObservedAt: null,
    lastObservedKind: null,
    evidenceLevel: created ? "medium" : "low",
    valuesPrinted: false,
  };
}
|
|
|
|
/**
 * Picks the diagnosis category by precedence: runner lost, stale claim,
 * terminal command on an open run, provider evidence, then the terminal
 * category, and finally "unknown".
 */
function diagnosisCategory(input: { staleClaimed: boolean; runnerLost: boolean; terminalCommandOpenRun: boolean; providerEvidence: string; terminalCategory: string | null }): string {
  const { staleClaimed, runnerLost, terminalCommandOpenRun, providerEvidence, terminalCategory } = input;

  // Structural signals about the run outrank anything reported by the provider.
  if (runnerLost) {
    return "runner-lost";
  }
  if (staleClaimed) {
    return "stale-claimed";
  }
  if (terminalCommandOpenRun) {
    return "terminal-command-open-run";
  }

  switch (providerEvidence) {
    case "failure-kind":
      return "provider-interruption-known";
    case "observed-transport-disconnect":
      return "provider-interruption-unknown";
    default:
      break;
  }

  return terminalCategory ? terminalCategory : "unknown";
}
|
|
|
|
/**
 * Rates how strong the evidence behind a diagnosis is: "high", "medium" or "low".
 *
 * Any structural signal (lost runner, stale claim, terminal command on an open
 * run) is "high". Otherwise the provider evidence decides, then the category.
 */
function evidenceLevel(category: string, providerEvidence: string, runnerLost: boolean, staleClaimed: boolean, terminalCommandOpenRun: boolean): string {
  const hasStructuralSignal = [runnerLost, staleClaimed, terminalCommandOpenRun].some((flag) => flag);
  if (hasStructuralSignal) {
    return "high";
  }

  switch (providerEvidence) {
    case "failure-kind":
      return "high";
    case "observed-transport-disconnect":
      return "medium";
    default:
      break;
  }

  const cleanlyFinished = category === "completed" || category === "cancelled";
  return cleanlyFinished ? "high" : "low";
}
|
|
|
|
/**
 * Lists the follow-up actions suggested for a diagnosed run, in this order:
 * inspect the runner job (when there is one), inspect the command (when there
 * is one), poll events, then either continue the session or report that no
 * session is available, and finally an operator decision when the run looks
 * stale, lost, or terminal-but-open. At most six actions are returned.
 */
function recoveryActionsForDiagnosis(input: { run: RunRecord; command: CommandRecord | null; latestJob: RunnerJobRecord | null; session: JsonRecord; runnerLost: boolean; staleClaimed: boolean; terminalCommandOpenRun: boolean; failureKind: string | null; lastSeq: number }): JsonRecord[] {
  const { run, command, latestJob } = input;
  const runId = run.id;
  const commandId = command?.id ?? null;
  const sessionId = stringValue(input.session.sessionId);
  const needsOperatorDecision = input.runnerLost || input.staleClaimed || input.terminalCommandOpenRun;

  const actions: JsonRecord[] = [];

  if (latestJob) {
    actions.push(
      recoveryDescriptor({
        action: "inspect-runner-job",
        operation: "describe",
        resourceKind: "runnerjob",
        resourceName: latestJob.id,
        runId,
        // Without a command record, fall back to the command id stored on the job.
        commandId: command?.id ?? latestJob.commandId,
        runnerJobId: latestJob.id,
      }),
    );
  }

  if (command) {
    actions.push(
      recoveryDescriptor({
        action: "inspect-command",
        operation: "result",
        resourceKind: "command",
        resourceName: command.id,
        runId,
        commandId: command.id,
      }),
    );
  }

  actions.push(
    recoveryDescriptor({
      action: "poll-events",
      operation: "events",
      resourceKind: "run",
      resourceName: runId,
      runId,
      commandId,
      afterSeq: input.lastSeq,
      limit: 100,
    }),
  );

  if (sessionId) {
    actions.push(
      recoveryDescriptor({
        action: "continue-session",
        operation: "send",
        resourceKind: "session",
        resourceName: sessionId,
        runId,
        commandId,
        sessionId,
        inputKind: "prompt",
      }),
    );
  } else {
    // This entry is built by hand, so it carries no runnerJobId/sessionId keys.
    actions.push({
      action: "session-unavailable",
      operation: "operator-decision",
      resourceKind: "run",
      resourceName: runId,
      runId,
      commandId,
      reason: "sessionRef=null",
      reasonHint: "当前 run 没有 sessionRef,管理者只能从 run/events/command/runner-job 读取 trace 后重新提交;这表示该任务不可同 session 续跑。",
      valuesPrinted: false,
    });
  }

  if (needsOperatorDecision) {
    actions.push(
      recoveryDescriptor({
        action: "refresh-queue-or-resubmit",
        operation: "operator-decision",
        resourceKind: sessionId ? "session" : "run",
        resourceName: sessionId ?? runId,
        runId,
        commandId,
        sessionId,
        reason: input.failureKind ?? "stale-runner-state",
        reasonHint: "先用 queue refresh/show 对齐 attempt;有 sessionId 时继续同一 session,没有 sessionId 才重新派发。",
      }),
    );
  }

  return actions.slice(0, 6);
}
|
|
|
|
/**
 * Builds one recovery-action record.
 *
 * The four identifier fields are always present (null when not supplied).
 * `afterSeq` and `limit` are included whenever they are not `undefined`, so an
 * explicit null is kept. `reason`, `reasonHint` and `inputKind` are included
 * only when truthy.
 */
function recoveryDescriptor(input: { action: string; operation: string; resourceKind: string; resourceName: string; runId?: string | null; commandId?: string | null; runnerJobId?: string | null; sessionId?: string | null; afterSeq?: number | null; limit?: number | null; reason?: string | null; reasonHint?: string | null; inputKind?: string | null }): JsonRecord {
  const descriptor: JsonRecord = {
    action: input.action,
    operation: input.operation,
    resourceKind: input.resourceKind,
    resourceName: input.resourceName,
    runId: input.runId ?? null,
    commandId: input.commandId ?? null,
    runnerJobId: input.runnerJobId ?? null,
    sessionId: input.sessionId ?? null,
  };

  if (input.afterSeq !== undefined) {
    descriptor.afterSeq = input.afterSeq;
  }
  if (input.limit !== undefined) {
    descriptor.limit = input.limit;
  }
  if (input.reason) {
    descriptor.reason = input.reason;
  }
  if (input.reasonHint) {
    descriptor.reasonHint = input.reasonHint;
  }
  if (input.inputKind) {
    descriptor.inputKind = input.inputKind;
  }

  // Added last so the key order matches a single object literal ending in valuesPrinted.
  descriptor.valuesPrinted = false;
  return descriptor;
}
|
|
|
|
/**
 * Summarises a runner job as a compact reference: job identifiers plus the
 * phase, timestamps and report states taken from its current observation.
 */
function runnerJobReference(job: RunnerJobRecord, events: RunEvent[]): JsonRecord {
  const observation = runnerJobObservation(job, events);
  const runnerResult = recordAt(job.result, "runner");

  const phase = stringValue(observation.phase) ?? "unknown";
  const terminalStatus = stringValue(observation.terminalStatus);

  const reference: JsonRecord = {
    runnerJobId: job.id,
    attemptId: job.attemptId,
    runnerId: job.runnerId,
    namespace: job.namespace,
    jobName: job.jobName,
    phase,
    terminalStatus,
    startedAt: stringValue(observation.startedAt),
    finishedAt: stringValue(observation.finishedAt),
    lastObservedSeq: numberValue(observation.lastObservedSeq),
    lastObservedAt: stringValue(observation.lastObservedAt),
    lastObservedKind: stringValue(observation.lastObservedKind),
    terminalReportState: stringValue(observation.terminalReportState),
    runReportState: stringValue(observation.runReportState),
    logPath: stringValue(runnerResult?.logPath),
    valuesPrinted: false,
  };
  return reference;
}
|
|
|
|
/**
 * Reports whether a runner-job phase is one of the phases this module treats
 * as a sign that the runner is lost. Non-string and empty values are never a match.
 */
function runnerJobPhaseIndicatesLost(value: JsonValue | undefined): boolean {
  switch (stringValue(value)) {
    case "created":
    case "recorded":
    case "k8s:failed":
    case "k8s:missing":
    case "k8s:succeeded":
    case "k8s:observe-failed":
      return true;
    default:
      return false;
  }
}
|
|
|
|
/**
 * Describes the session attached to a run: its id and API path, or an
 * explicit "no session" record when the run has no sessionRef.
 */
function sessionReference(run: RunRecord): JsonRecord {
  const ref = run.sessionRef;
  if (ref) {
    const sessionId = ref.sessionId;
    return {
      sessionId,
      sessionRefNull: false,
      sessionPath: `/api/v1/sessions/${sessionId}`,
      valuesPrinted: false,
    };
  }

  return {
    sessionId: null,
    sessionRefNull: true,
    sessionPath: null,
    valuesPrinted: false,
  };
}
|
|
|
|
/**
 * Derives lease state from the run's own claim fields.
 *
 * - Unclaimed run: `leaseExpired` and `leaseRemainingMs` are both null.
 * - Claimed run without a parseable `leaseExpiresAt`: treated as expired,
 *   with no remaining time reported.
 * - Otherwise the expiry is compared against `nowMs`, and the remaining time
 *   is clamped at zero.
 */
function leaseFromRun(run: RunRecord, nowMs: number): JsonRecord {
  const expiresAtMs = run.leaseExpiresAt ? Date.parse(run.leaseExpiresAt) : Number.NaN;
  const claimed = Boolean(run.claimedBy);
  const hasLease = claimed && Number.isFinite(expiresAtMs);

  let leaseExpired: boolean | null;
  if (!claimed) {
    leaseExpired = null;
  } else if (!hasLease) {
    leaseExpired = true;
  } else {
    leaseExpired = expiresAtMs <= nowMs;
  }

  let leaseRemainingMs: number | null = null;
  if (hasLease) {
    leaseRemainingMs = Math.max(0, expiresAtMs - nowMs);
  }

  return { leaseExpired, leaseRemainingMs, valuesPrinted: false };
}
|
|
|
|
/**
 * Reads `terminalStatus` from a job result record, returning it only when it is a string.
 */
function latestTerminalStatusFromResult(result: JsonRecord): JsonValue | null {
  const { terminalStatus } = result;
  if (typeof terminalStatus !== "string") {
    return null;
  }
  return terminalStatus;
}
|
|
|
|
/**
 * Finds the latest event (searching from the end of `events`) that marks a
 * command as terminal.
 *
 * Events whose payload names a different command are ignored. A
 * `terminal_status` event matches even when its payload names no command. A
 * `backend_status` event matches only with phase `command-terminal` and this
 * exact command id.
 *
 * @returns The matching event, or null when there is none. `events` is not mutated.
 */
function latestTerminalEvent(events: RunEvent[], commandId: string): RunEvent | null {
  const newestFirst = events.slice().reverse();

  const match = newestFirst.find((event) => {
    const payload = event.payload;
    const eventCommandId = payload?.commandId;
    if (eventCommandId && eventCommandId !== commandId) {
      return false;
    }
    if (event.type === "terminal_status") {
      return true;
    }
    return event.type === "backend_status" && payload?.phase === "command-terminal" && eventCommandId === commandId;
  });

  return match ?? null;
}
|
|
|
|
/**
 * Returns, in their original order, the events that count as activity of the given runner job.
 */
function relevantRunnerJobEvents(job: RunnerJobRecord, events: RunEvent[]): RunEvent[] {
  const belongsToJob = (event: RunEvent): boolean => isRunnerJobActivity(job, event);
  return events.filter(belongsToJob);
}
|
|
|
|
/**
 * Returns the first event that counts as activity of the given runner job, or null when there is none.
 */
function firstObservedRunnerEvent(job: RunnerJobRecord, events: RunEvent[]): RunEvent | null {
  const [earliest] = relevantRunnerJobEvents(job, events);
  return earliest ?? null;
}
|
|
|
|
/**
 * Decides whether an event counts as activity of the given runner job.
 *
 * - Events with payload phase `runner-job-created` never count.
 * - Events whose payload `runnerId` or `attemptId` equals the job's count.
 * - Events for the job's command count when they are a `tool_call`,
 *   `assistant_message`, `command_output` or `error`, or a `backend_status`
 *   with a string phase other than `command-created`.
 *
 * @param job - Runner job whose runnerId, attemptId and commandId are compared.
 * @param event - Event to classify.
 * @returns True when the event is attributed to the job.
 */
function isRunnerJobActivity(job: RunnerJobRecord, event: RunEvent): boolean {
  const payload = event.payload;
  const phase = payload?.phase;
  if (phase === "runner-job-created") return false;

  // Only compare identifiers the job actually has. Otherwise a null/undefined
  // job id would "equal" the same missing field on an unrelated event.
  const matchesRunner = job.runnerId != null && payload?.runnerId === job.runnerId;
  const matchesAttempt = job.attemptId != null && payload?.attemptId === job.attemptId;
  if (matchesRunner || matchesAttempt) return true;

  if (payload?.commandId !== job.commandId) return false;

  switch (event.type) {
    case "tool_call":
    case "assistant_message":
    case "command_output":
    case "error":
      return true;
    case "backend_status":
      return typeof phase === "string" && phase !== "command-created";
    default:
      return false;
  }
}
|
|
|
|
/**
 * Builds a short label for an event: `<type>:<phase>` when the payload has a
 * phase, else `<type>:<toolName>` when it has a tool name, else just the type.
 * Returns null for a null event.
 */
function eventKind(event: RunEvent | null): string | null {
  if (event === null || event === undefined) {
    return null;
  }

  const qualifier = stringValue(event.payload?.phase) ?? stringValue(event.payload?.toolName);
  return qualifier === null ? event.type : `${event.type}:${qualifier}`;
}
|
|
|
|
/**
 * Copies the listed keys from `record`, keeping only scalar values (string,
 * number, boolean or null). Missing keys, objects and arrays are dropped.
 * The result always ends with `valuesPrinted: false`, overriding any copied value.
 */
function compactRecord(record: JsonRecord, keys: string[]): JsonRecord {
  const compact: JsonRecord = {};

  for (const key of keys) {
    const value = record[key];
    switch (typeof value) {
      case "string":
      case "number":
      case "boolean":
        compact[key] = value;
        break;
      case "object":
        // typeof null is "object"; null is the only object-typed value that is kept.
        if (value === null) compact[key] = null;
        break;
      default:
        break;
    }
  }

  compact.valuesPrinted = false;
  return compact;
}
|
|
|
|
/**
 * Returns the value at `key` when it is a plain (non-array, non-null) object; otherwise null.
 * A null or undefined `record` yields null.
 */
function recordAt(record: JsonRecord | null | undefined, key: string): JsonRecord | null {
  const candidate = record?.[key];
  if (candidate === null || typeof candidate !== "object" || Array.isArray(candidate)) {
    return null;
  }
  return candidate as JsonRecord;
}
|
|
|
|
/**
 * Returns `value` when it is a non-empty string; otherwise null.
 */
function stringValue(value: JsonValue | undefined): string | null {
  if (typeof value !== "string") {
    return null;
  }
  return value === "" ? null : value;
}
|
|
|
|
/**
 * Returns `value` when it is a finite number; otherwise null (NaN and ±Infinity are rejected).
 */
function numberValue(value: JsonValue | undefined): number | null {
  if (typeof value === "number" && Number.isFinite(value)) {
    return value;
  }
  return null;
}
|
|
|
|
/**
 * Returns `value` when it is a boolean; otherwise null.
 */
function booleanValue(value: JsonValue | undefined): boolean | null {
  if (value === true || value === false) {
    return value;
  }
  return null;
}
|
|
|
|
/**
 * Passes a boolean through unchanged and maps every other value to null.
 */
function booleanOrNull(value: JsonValue | undefined): boolean | null {
  switch (typeof value) {
    case "boolean":
      return value;
    default:
      return null;
  }
}
|