fix: show active retry interruptions in liveness
This commit is contained in:
+26
-5
@@ -29,10 +29,12 @@ interface AssistantReplySummary {
|
||||
/**
 * Evidence bundle consumed by `terminalClassificationFromEvidence`.
 *
 * The liveness snapshot builds one of these from the run, the scoped command
 * events and the timeout budget, then derives a classification record from it.
 */
interface TerminalClassificationInput {
  /** Terminal status of the command, or `null` when none has been established. */
  terminal: TerminalStatus | null;

  /** Label naming the caller that gathered the evidence (the liveness snapshot passes `"liveness"`). */
  terminalSource: string;

  /**
   * Whether the command is still considered active. Together with a non-null
   * `retryInterruption` this selects the `"active-retry-interruption"` category.
   */
  active: boolean;

  /** Failure kind resolved for the command, or `null` when there is none. */
  failureKind: FailureKind | null;

  /** Failure message accompanying `failureKind`, if any. */
  failureMessage: string | null;

  /** Timeout budget summary record. NOTE(review): exact schema is defined by `timeoutBudgetSummary`, not visible here. */
  timeoutBudget: JsonRecord;

  /** Most recent transport-disconnect event in scope, or `null` if none was observed. */
  transportDisconnect: RunEvent | null;

  /**
   * Most recent `error` event that carried `willRetry === true` and a valid
   * failure kind, or `null` if none was observed.
   */
  retryInterruption: RunEvent | null;

  /** Summary of the last liveness activity; its `activityKind` and `sourceSeq` fields are read by the classifier. */
  lastActivity: JsonRecord | null;

  /** Command being classified; only its `id` is read by the classifier output shown in this file. */
  command: CommandRecord | null;
}
|
||||
@@ -117,9 +119,10 @@ function livenessSnapshot(run: RunRecord, command: CommandRecord | null, events:
|
||||
const lastCommandActivity = lastBusinessActivity ?? latestLivenessActivity(scopedEvents);
|
||||
const lease = leaseSummary(run, nowMs);
|
||||
const transportDisconnect = latestTransportDisconnect(scopedEvents);
|
||||
const retryInterruption = latestRetryInterruption(scopedEvents);
|
||||
const lastActivity = livenessActivitySummary(lastCommandActivity, nowMs);
|
||||
const timeoutBudget = timeoutBudgetSummary(run, command, terminal, failureKind, nowMs, lastActivity);
|
||||
const terminalClassification = terminalClassificationFromEvidence({ terminal, terminalSource: "liveness", failureKind, failureMessage, timeoutBudget, transportDisconnect, lastActivity, command });
|
||||
const terminalClassification = terminalClassificationFromEvidence({ terminal, terminalSource: "liveness", active, failureKind, failureMessage, timeoutBudget, transportDisconnect, retryInterruption, lastActivity, command });
|
||||
const phase = livenessPhase({ active, command, lastVisibleActivity, leaseExpired: lease.leaseExpired, transportDisconnect, timeoutBudget, lastActivity });
|
||||
const afterSeq = lastEvent?.seq ?? 0;
|
||||
return {
|
||||
@@ -139,6 +142,7 @@ function livenessSnapshot(run: RunRecord, command: CommandRecord | null, events:
|
||||
terminalClassification,
|
||||
lease,
|
||||
transportDisconnect: transportDisconnect ? livenessActivitySummary(transportDisconnect, nowMs) : null,
|
||||
retryInterruption: retryInterruption ? livenessActivitySummary(retryInterruption, nowMs) : null,
|
||||
recoveryActions: recoveryActions({ run, command, afterSeq, active, terminal, failureKind, failureMessage }),
|
||||
valuesPrinted: false,
|
||||
};
|
||||
@@ -183,6 +187,7 @@ function terminalClassificationFromEvidence(input: TerminalClassificationInput):
|
||||
const idleTimeout = timeoutFailure && timeoutKind === "idle";
|
||||
const hardTimeout = timeoutFailure && timeoutKind === "hard";
|
||||
const providerKind = providerFailureCategory(input.failureKind);
|
||||
const retryInterruptionKind = failureKindValue(input.retryInterruption?.payload.failureKind);
|
||||
const cancelled = input.terminal === "cancelled" || input.failureKind === "cancelled";
|
||||
const taskFailure = input.terminal === "failed" && input.failureKind !== null && !timeoutFailure && !providerKind && !infrastructureFailureKind(input.failureKind);
|
||||
let category = "unknown";
|
||||
@@ -190,7 +195,12 @@ function terminalClassificationFromEvidence(input: TerminalClassificationInput):
|
||||
let providerEvidence = "not-applicable";
|
||||
let reason = "terminal state is not yet available";
|
||||
|
||||
if (input.terminal === "completed") {
|
||||
if (input.active && input.retryInterruption) {
|
||||
category = "active-retry-interruption";
|
||||
confidence = "high";
|
||||
providerEvidence = "retry-event";
|
||||
reason = `willRetry=true provider interruption${retryInterruptionKind ? ` (${retryInterruptionKind})` : ""} was observed, but the command remains active and this is not a terminal failure`;
|
||||
} else if (input.terminal === "completed") {
|
||||
category = "completed";
|
||||
confidence = "high";
|
||||
reason = "command completed successfully";
|
||||
@@ -232,13 +242,16 @@ function terminalClassificationFromEvidence(input: TerminalClassificationInput):
|
||||
return {
|
||||
category,
|
||||
confidence,
|
||||
active: input.active,
|
||||
providerEvidence,
|
||||
providerInterruption: providerEvidence === "failure-kind" || providerEvidence === "observed-transport-disconnect" ? providerEvidence : "not-established",
|
||||
providerInterruption: providerEvidence === "failure-kind" || providerEvidence === "observed-transport-disconnect" || providerEvidence === "retry-event" ? providerEvidence : "not-established",
|
||||
providerInterruptionKnown: providerEvidence === "failure-kind",
|
||||
providerInterruptionReason: providerEvidence === "failure-kind"
|
||||
? "provider-specific failureKind is authoritative"
|
||||
: providerEvidence === "observed-transport-disconnect"
|
||||
? "transport disconnect was observed, but current events cannot distinguish provider outage from runner/backend shutdown during timeout"
|
||||
: providerEvidence === "retry-event"
|
||||
? "provider interruption was observed with willRetry=true; current command liveness decides whether work is still active"
|
||||
: providerEvidence === "insufficient"
|
||||
? "no provider-specific error or disconnect evidence was recorded"
|
||||
: null,
|
||||
@@ -249,6 +262,9 @@ function terminalClassificationFromEvidence(input: TerminalClassificationInput):
|
||||
timeoutState,
|
||||
transportDisconnectObserved: Boolean(input.transportDisconnect),
|
||||
transportDisconnectSeq: input.transportDisconnect?.seq ?? null,
|
||||
retryInterruptionObserved: Boolean(input.retryInterruption),
|
||||
retryInterruptionSeq: input.retryInterruption?.seq ?? null,
|
||||
retryInterruptionKind,
|
||||
lastActivityKind: stringJsonValue(input.lastActivity?.activityKind),
|
||||
lastActivitySeq: numberJsonValue(input.lastActivity?.sourceSeq),
|
||||
commandId: input.command?.id ?? null,
|
||||
@@ -353,6 +369,10 @@ function latestTransportDisconnect(events: RunEvent[]): RunEvent | null {
|
||||
}) ?? null;
|
||||
}
|
||||
|
||||
/**
 * Finds the newest retryable provider interruption in `events`.
 *
 * An event qualifies when it is an `error` event whose payload has
 * `willRetry === true` and a `failureKind` accepted by `isFailureKind`.
 *
 * @param events - Events to search; the array itself is not mutated.
 * @returns The last qualifying event in array order, or `null` when none qualifies.
 */
function latestRetryInterruption(events: RunEvent[]): RunEvent | null {
  const isRetryInterruption = (candidate: RunEvent): boolean => {
    if (candidate.type !== "error") return false;
    if (candidate.payload.willRetry !== true) return false;
    return isFailureKind(candidate.payload.failureKind);
  };

  // Walk a reversed copy so the first hit is the most recent event.
  const newestFirst = [...events].reverse();
  for (const candidate of newestFirst) {
    if (isRetryInterruption(candidate)) {
      return candidate;
    }
  }
  return null;
}
|
||||
|
||||
function livenessActivitySummary(event: RunEvent | null, nowMs: number): JsonRecord | null {
|
||||
if (!event) return null;
|
||||
return {
|
||||
@@ -558,9 +578,10 @@ function eventsForCommand(events: RunEvent[], commandId: string): RunEvent[] {
|
||||
return scoped.length > 0 ? scoped : events;
|
||||
}
|
||||
|
||||
function failureKindFromEvents(events: RunEvent[]): FailureKind | null {
|
||||
function failureKindFromEvents(events: RunEvent[], options: { includeRetry: boolean }): FailureKind | null {
|
||||
for (const event of [...events].reverse()) {
|
||||
const value = event.payload.failureKind;
|
||||
if (event.type === "error" && event.payload.willRetry === true && !options.includeRetry) continue;
|
||||
if (isFailureKind(value)) return value;
|
||||
}
|
||||
return null;
|
||||
@@ -568,7 +589,7 @@ function failureKindFromEvents(events: RunEvent[]): FailureKind | null {
|
||||
|
||||
/**
 * Resolves the failure kind to report for a run/command result.
 *
 * Sources are consulted in priority order: the failure kind recorded on the
 * run, then the newest failure kind found in the events, then the runner jobs.
 *
 * @param run - Run record; its `failureKind` is the highest-priority source.
 * @param command - Command record; accepted for signature compatibility and not read here.
 * @param events - Events scanned for a failure kind.
 * @param jobs - Runner jobs used as the last fallback.
 * @param terminal - Terminal status, or `null` while the command is not terminal.
 * @returns The resolved failure kind, or `null` when the command completed or no source has one.
 */
function resultFailureKind(run: RunRecord, command: CommandRecord | null, events: RunEvent[], jobs: RunnerJobRecord[], terminal: TerminalStatus | null): FailureKind | null {
  // A completed command never carries a failure kind.
  if (terminal === "completed") return null;

  // While the command is not yet terminal, errors flagged willRetry=true are
  // interruptions rather than failures, so they are only included once a
  // terminal status exists.
  const includeRetry = terminal !== null;

  return failureKindValue(run.failureKind) ?? failureKindFromEvents(events, { includeRetry }) ?? failureKindFromRunnerJobs(jobs);
}
|
||||
|
||||
function failureKindFromRunnerJobs(jobs: RunnerJobRecord[]): FailureKind | null {
|
||||
|
||||
@@ -213,6 +213,9 @@ function compactTerminalClassification(record: JsonRecord): JsonRecord {
|
||||
providerInterruption: stringJsonValue(record.providerInterruption),
|
||||
providerInterruptionKnown: record.providerInterruptionKnown === true,
|
||||
providerInterruptionReason: boundedJsonString(record.providerInterruptionReason, 240),
|
||||
retryInterruptionObserved: record.retryInterruptionObserved === true,
|
||||
retryInterruptionSeq: numberJsonValue(record.retryInterruptionSeq),
|
||||
retryInterruptionKind: stringJsonValue(record.retryInterruptionKind),
|
||||
hardTimeout: record.hardTimeout === true,
|
||||
idleTimeout: record.idleTimeout === true,
|
||||
timeoutKind: stringJsonValue(record.timeoutKind),
|
||||
|
||||
Reference in New Issue
Block a user