diff --git a/src/mgr/result.ts b/src/mgr/result.ts index 7d8ac61..7320ef8 100644 --- a/src/mgr/result.ts +++ b/src/mgr/result.ts @@ -29,10 +29,12 @@ interface AssistantReplySummary { interface TerminalClassificationInput { terminal: TerminalStatus | null; terminalSource: string; + active: boolean; failureKind: FailureKind | null; failureMessage: string | null; timeoutBudget: JsonRecord; transportDisconnect: RunEvent | null; + retryInterruption: RunEvent | null; lastActivity: JsonRecord | null; command: CommandRecord | null; } @@ -117,9 +119,10 @@ function livenessSnapshot(run: RunRecord, command: CommandRecord | null, events: const lastCommandActivity = lastBusinessActivity ?? latestLivenessActivity(scopedEvents); const lease = leaseSummary(run, nowMs); const transportDisconnect = latestTransportDisconnect(scopedEvents); + const retryInterruption = latestRetryInterruption(scopedEvents); const lastActivity = livenessActivitySummary(lastCommandActivity, nowMs); const timeoutBudget = timeoutBudgetSummary(run, command, terminal, failureKind, nowMs, lastActivity); - const terminalClassification = terminalClassificationFromEvidence({ terminal, terminalSource: "liveness", failureKind, failureMessage, timeoutBudget, transportDisconnect, lastActivity, command }); + const terminalClassification = terminalClassificationFromEvidence({ terminal, terminalSource: "liveness", active, failureKind, failureMessage, timeoutBudget, transportDisconnect, retryInterruption, lastActivity, command }); const phase = livenessPhase({ active, command, lastVisibleActivity, leaseExpired: lease.leaseExpired, transportDisconnect, timeoutBudget, lastActivity }); const afterSeq = lastEvent?.seq ?? 0; return { @@ -139,6 +142,7 @@ function livenessSnapshot(run: RunRecord, command: CommandRecord | null, events: terminalClassification, lease, transportDisconnect: transportDisconnect ? livenessActivitySummary(transportDisconnect, nowMs) : null, + retryInterruption: retryInterruption ? livenessActivitySummary(retryInterruption, nowMs) : null, recoveryActions: recoveryActions({ run, command, afterSeq, active, terminal, failureKind, failureMessage }), valuesPrinted: false, }; @@ -183,6 +187,7 @@ function terminalClassificationFromEvidence(input: TerminalClassificationInput): const idleTimeout = timeoutFailure && timeoutKind === "idle"; const hardTimeout = timeoutFailure && timeoutKind === "hard"; const providerKind = providerFailureCategory(input.failureKind); + const retryInterruptionKind = failureKindValue(input.retryInterruption?.payload.failureKind); const cancelled = input.terminal === "cancelled" || input.failureKind === "cancelled"; const taskFailure = input.terminal === "failed" && input.failureKind !== null && !timeoutFailure && !providerKind && !infrastructureFailureKind(input.failureKind); let category = "unknown"; @@ -190,7 +195,12 @@ function terminalClassificationFromEvidence(input: TerminalClassificationInput): let providerEvidence = "not-applicable"; let reason = "terminal state is not yet available"; - if (input.terminal === "completed") { + if (input.active && input.retryInterruption) { + category = "active-retry-interruption"; + confidence = "high"; + providerEvidence = "retry-event"; + reason = `willRetry=true provider interruption${retryInterruptionKind ? ` (${retryInterruptionKind})` : ""} was observed, but the command remains active and this is not a terminal failure`; + } else if (input.terminal === "completed") { category = "completed"; confidence = "high"; reason = "command completed successfully"; @@ -232,13 +242,16 @@ function terminalClassificationFromEvidence(input: TerminalClassificationInput): return { category, confidence, + active: input.active, providerEvidence, - providerInterruption: providerEvidence === "failure-kind" || providerEvidence === "observed-transport-disconnect" ? providerEvidence : "not-established", + providerInterruption: providerEvidence === "failure-kind" || providerEvidence === "observed-transport-disconnect" || providerEvidence === "retry-event" ? providerEvidence : "not-established", providerInterruptionKnown: providerEvidence === "failure-kind", providerInterruptionReason: providerEvidence === "failure-kind" ? "provider-specific failureKind is authoritative" : providerEvidence === "observed-transport-disconnect" ? "transport disconnect was observed, but current events cannot distinguish provider outage from runner/backend shutdown during timeout" + : providerEvidence === "retry-event" + ? "provider interruption was observed with willRetry=true; current command liveness decides whether work is still active" : providerEvidence === "insufficient" ? "no provider-specific error or disconnect evidence was recorded" : null, @@ -249,6 +262,9 @@ function terminalClassificationFromEvidence(input: TerminalClassificationInput): timeoutState, transportDisconnectObserved: Boolean(input.transportDisconnect), transportDisconnectSeq: input.transportDisconnect?.seq ?? null, + retryInterruptionObserved: Boolean(input.retryInterruption), + retryInterruptionSeq: input.retryInterruption?.seq ?? null, + retryInterruptionKind, lastActivityKind: stringJsonValue(input.lastActivity?.activityKind), lastActivitySeq: numberJsonValue(input.lastActivity?.sourceSeq), commandId: input.command?.id ?? null, @@ -353,6 +369,10 @@ function latestTransportDisconnect(events: RunEvent[]): RunEvent | null { }) ?? null; } +function latestRetryInterruption(events: RunEvent[]): RunEvent | null { + return [...events].reverse().find((event) => event.type === "error" && event.payload.willRetry === true && isFailureKind(event.payload.failureKind)) ?? null; +} + function livenessActivitySummary(event: RunEvent | null, nowMs: number): JsonRecord | null { if (!event) return null; return { @@ -558,9 +578,10 @@ function eventsForCommand(events: RunEvent[], commandId: string): RunEvent[] { return scoped.length > 0 ? scoped : events; } -function failureKindFromEvents(events: RunEvent[]): FailureKind | null { +function failureKindFromEvents(events: RunEvent[], options: { includeRetry: boolean }): FailureKind | null { for (const event of [...events].reverse()) { const value = event.payload.failureKind; + if (event.type === "error" && event.payload.willRetry === true && !options.includeRetry) continue; if (isFailureKind(value)) return value; } return null; @@ -568,7 +589,7 @@ function failureKindFromEvents(events: RunEvent[]): FailureKind | null { function resultFailureKind(run: RunRecord, command: CommandRecord | null, events: RunEvent[], jobs: RunnerJobRecord[], terminal: TerminalStatus | null): FailureKind | null { if (terminal === "completed") return null; - return failureKindValue(run.failureKind) ?? failureKindFromEvents(events) ?? failureKindFromRunnerJobs(jobs); + return failureKindValue(run.failureKind) ?? failureKindFromEvents(events, { includeRetry: terminal !== null }) ?? failureKindFromRunnerJobs(jobs); } function failureKindFromRunnerJobs(jobs: RunnerJobRecord[]): FailureKind | null { diff --git a/src/mgr/server.ts b/src/mgr/server.ts index 5b00818..ad48694 100644 --- a/src/mgr/server.ts +++ b/src/mgr/server.ts @@ -213,6 +213,9 @@ function compactTerminalClassification(record: JsonRecord): JsonRecord { providerInterruption: stringJsonValue(record.providerInterruption), providerInterruptionKnown: record.providerInterruptionKnown === true, providerInterruptionReason: boundedJsonString(record.providerInterruptionReason, 240), + retryInterruptionObserved: record.retryInterruptionObserved === true, + retryInterruptionSeq: numberJsonValue(record.retryInterruptionSeq), + retryInterruptionKind: stringJsonValue(record.retryInterruptionKind), hardTimeout: record.hardTimeout === true, idleTimeout: record.idleTimeout === true, timeoutKind: stringJsonValue(record.timeoutKind),