From ec62cc70dd66b0f654ab9d507899f3b167174288 Mon Sep 17 00:00:00 2001 From: AgentRun Codex Date: Thu, 11 Jun 2026 09:19:43 +0800 Subject: [PATCH] fix: expose stale runner diagnostics --- docs/reference/spec-v01-cli.md | 1 + scripts/src/cli.ts | 31 +++ src/mgr/diagnosis.ts | 203 ++++++++++++++++++ src/mgr/result.ts | 3 + src/mgr/runner-job-status.ts | 2 + src/mgr/server.ts | 1 + src/selftest/cases/55-timeout-liveness.ts | 40 +++- .../cases/60-hwlab-baseline-contract.ts | 3 + 8 files changed, 279 insertions(+), 5 deletions(-) create mode 100644 src/mgr/diagnosis.ts diff --git a/docs/reference/spec-v01-cli.md b/docs/reference/spec-v01-cli.md index 0e18a42..7175be1 100644 --- a/docs/reference/spec-v01-cli.md +++ b/docs/reference/spec-v01-cli.md @@ -93,6 +93,7 @@ CLI 官方 TypeScript 入口固定为 `scripts/agentrun-cli.ts`。在 G14 非交 - 创建类命令返回 `runId`、`commandId`、status 和下一步 poll command。 - `runner start` 返回 attemptId、job/process identity、logPath 和后续 status/events 命令。 - `runner jobs` / `runner job-status` 返回 manager 持久化的 runner Job 最小状态摘要,包括 attemptId、runnerId、namespace、jobName、phase、terminalStatus、logPath、retention 和 redacted Kubernetes identity;业务方不需要直连 Kubernetes 才能定位当前 attempt。 +- `runs show/result`、`commands show/result`、`queue show/commander` 和 `runner jobs/job-status` 的低噪声视图必须暴露 compact `diagnosis`,把 stale claimed、runner lost、terminal command/open run、`sessionRef=null`、provider interruption known/unknown、runner job phase 和下一步 drill-down 命令放到同一层级,避免监督者人工拼接 run、command、runner job 和 events 才能判断恢复入口。 - `aipod-specs render` 与 `queue submit --aipod` 必须调用同一 manager `/api/v1/aipod-specs/:name/render` 路径,把 YAML 展开为标准 Queue task;CLI 不得在本地复制一套 render 逻辑。 - `queue submit --aipod ` 只接受本次任务输入(prompt、idempotencyKey、tenant/project/queue/lane/provider 覆盖等),模型、provider credential、tool credential、gitbundle 和 requiredSkills 由 [spec-v01-aipod-spec.md](spec-v01-aipod-spec.md) 定义。 - 查询类命令返回当前 state、terminal_status、failureKind、event cursor 或 logPath。 diff --git a/scripts/src/cli.ts b/scripts/src/cli.ts index 5ff09c3..ffce01a 100644 --- a/scripts/src/cli.ts +++ b/scripts/src/cli.ts @@ -183,6 +183,7 @@ function summarizeRunShowResult(run: JsonValue, result: JsonValue, runId: string action: "runs-show-summary", run: summarizeRunRecord(jsonRecordValue(run)), result: summarizeResultEnvelope(resultRecord), + diagnosis: summarizeDiagnosisRecord(jsonRecordValue(resultRecord?.diagnosis)), terminalClassification: summarizeTerminalClassificationIfPresent(resultRecord), liveness: summarizeLivenessRecord(jsonRecordValue(resultRecord?.liveness)), finalResponse: summarizeFinalResponseRecord(jsonRecordValue(resultRecord?.finalResponse)), @@ -200,6 +201,7 @@ function summarizeCommandShowResult(command: JsonValue, result: JsonValue, runId action: "commands-show-summary", command: summarizeCommandRecord(jsonRecordValue(command)), result: summarizeResultEnvelope(resultRecord), + diagnosis: summarizeDiagnosisRecord(jsonRecordValue(resultRecord?.diagnosis)), terminalClassification: summarizeTerminalClassificationIfPresent(resultRecord), liveness: summarizeLivenessRecord(jsonRecordValue(resultRecord?.liveness)), finalResponse: summarizeFinalResponseRecord(jsonRecordValue(resultRecord?.finalResponse)), @@ -238,6 +240,23 @@ function summarizeLivenessRecord(record: JsonRecord | null): JsonRecord | null { }; } +function summarizeDiagnosisRecord(record: JsonRecord | null): JsonRecord | null { + if (!record) return null; + const run = jsonRecordValue(record.run); + const command = jsonRecordValue(record.command); + const runnerJob = jsonRecordValue(record.runnerJob); + const session = jsonRecordValue(record.session); + return { + ...withoutFullRecordBytes(compactRecord(record, { keys: ["category", "staleClaimed", "runnerLost", "terminalCommandOpenRun", "evidenceLevel", "providerEvidence", "providerInterruption", "providerInterruptionKnown", "terminalCategory", "terminalStatus", "failureKind", "failureMessage"] })), + run: run ? withoutFullRecordBytes(compactRecord(run, { keys: ["runId", "status", "claimedBy", "leaseExpiresAt", "leaseExpired", "leaseRemainingMs"] })) : null, + command: command ? withoutFullRecordBytes(compactRecord(command, { keys: ["commandId", "state", "terminalStatus", "acknowledgedAt", "updatedAt"] })) : null, + runnerJob: runnerJob ? withoutFullRecordBytes(compactRecord(runnerJob, { keys: ["runnerJobId", "attemptId", "runnerId", "namespace", "jobName", "phase", "terminalStatus", "startedAt", "finishedAt", "logPath"] })) : null, + session: session ? withoutFullRecordBytes(compactRecord(session, { keys: ["sessionId", "sessionRefNull", "sessionPath"] })) : null, + recoveryActions: summarizeRecoveryActions(record.recoveryActions), + valuesPrinted: false, + }; +} + function summarizeFinalResponseRecord(record: JsonRecord | null): JsonRecord | null { if (!record) return null; return withoutFullRecordBytes(compactRecord(record, { keys: ["seq", "source", "final", "replyAuthority", "textTruncated", "outputTruncated", "text"] })); @@ -900,8 +919,10 @@ function summarizeSupervisorRecord(record: JsonRecord | null): JsonRecord | null const lastActivity = jsonRecordValue(record.lastActivity); const timeoutBudget = jsonRecordValue(record.timeoutBudget); const terminalClassification = jsonRecordValue(record.terminalClassification); + const diagnosis = jsonRecordValue(record.diagnosis); return { ...withoutFullRecordBytes(compactRecord(record, { keys: ["phase", "active", "status", "terminalStatus", "failureKind", "runId", "commandId", "lastSeq"] })), + diagnosis: diagnosis ? summarizeDiagnosisRecord(diagnosis) : null, terminalClassification: terminalClassification ? summarizeTerminalClassification(terminalClassification) : null, lastActivity: lastActivity ? withoutFullRecordBytes(compactRecord(lastActivity, { keys: ["sourceSeq", "eventId", "activityKind", "type", "status", "toolName", "itemId", "ageMs", "summary"] })) : null, timeoutBudget: timeoutBudget ? withoutFullRecordBytes(compactRecord(timeoutBudget, { keys: ["state", "timeoutMs", "elapsedMs", "remainingMs", "startedAt", "source"] })) : null, @@ -943,6 +964,7 @@ function summarizeRunnerJobRecord(record: JsonRecord | null): JsonRecord | null ...compactRecord(record, { keys: ["action", "mutation", "runId", "commandId", "attemptId", "runnerId", "namespace", "jobName"] }), logPath: stringValue(runner?.logPath), backendProfile: stringValue(runner?.backendProfile), + diagnosis: summarizeRunnerJobDiagnosis(jsonRecordValue(record.diagnosis)), jobUid: stringValue(jobIdentity?.uid), created: kubernetes?.created === true, warnings: Array.isArray(record.warnings) ? record.warnings.map((item) => boundedSummaryString(typeof item === "string" ? item : JSON.stringify(item), 240)).filter((item): item is string => Boolean(item)) : [], @@ -951,6 +973,15 @@ function summarizeRunnerJobRecord(record: JsonRecord | null): JsonRecord | null }; } +function summarizeRunnerJobDiagnosis(record: JsonRecord | null): JsonRecord | null { + if (!record) return null; + return withoutFullRecordBytes({ + ...compactRecord(record, { keys: ["category", "runnerLostSuspected", "phase", "evidenceLevel", "runId", "commandId", "runnerJobId", "attemptId", "runnerId", "jobName", "namespace", "logPath"] }), + nextActions: summarizeRecoveryActions(record.nextActions), + valuesPrinted: false, + }); +} + function summarizeGenericRecord(record: JsonRecord | null): JsonRecord | null { if (!record) return null; return compactRecord(record, { diff --git a/src/mgr/diagnosis.ts b/src/mgr/diagnosis.ts new file mode 100644 index 0000000..355a22c --- /dev/null +++ b/src/mgr/diagnosis.ts @@ -0,0 +1,203 @@ +import type { CommandRecord, JsonRecord, JsonValue, RunEvent, RunRecord, RunnerJobRecord } from "../common/types.js"; +import { boundedTextSummary } from "../common/output.js"; + +export interface RunDiagnosisInput { + run: RunRecord; + command: CommandRecord | null; + latestJob: RunnerJobRecord | null; + events: RunEvent[]; + terminalClassification: JsonRecord | null; + liveness: JsonRecord | null; + terminalStatus: string | null; + failureKind: string | null; + failureMessage: string | null; +} + +export function runDiagnosis(input: RunDiagnosisInput): JsonRecord { + const nowMs = Date.now(); + const lease = recordAt(input.liveness, "lease") ?? leaseFromRun(input.run, nowMs); + const timeoutBudget = recordAt(input.liveness, "timeoutBudget"); + const staleClaimed = input.run.status === "claimed" && booleanValue(lease.leaseExpired) === true; + const terminalCommandOpenRun = input.run.status === "claimed" && input.terminalStatus !== null; + const runnerJob = input.latestJob ? runnerJobReference(input.latestJob, input.events) : null; + const runnerLost = staleClaimed && (runnerJob === null || runnerJob.phase === "created" || runnerJob.phase === "recorded"); + const session = sessionReference(input.run); + const providerEvidence = stringValue(input.terminalClassification?.providerEvidence) ?? "not-applicable"; + const providerInterruption = stringValue(input.terminalClassification?.providerInterruption) ?? "not-established"; + const category = diagnosisCategory({ staleClaimed, runnerLost, terminalCommandOpenRun, providerEvidence, terminalCategory: stringValue(input.terminalClassification?.category) }); + const recoveryActions = recoveryActionsForDiagnosis({ run: input.run, command: input.command, latestJob: input.latestJob, session, runnerLost, staleClaimed, terminalCommandOpenRun, failureKind: input.failureKind, lastSeq: numberValue(input.liveness?.lastSeq) ?? 0 }); + + return { + category, + staleClaimed, + runnerLost, + terminalCommandOpenRun, + evidenceLevel: evidenceLevel(category, providerEvidence, runnerLost, staleClaimed, terminalCommandOpenRun), + providerEvidence, + providerInterruption, + providerInterruptionKnown: input.terminalClassification?.providerInterruptionKnown === true, + terminalCategory: stringValue(input.terminalClassification?.category), + terminalStatus: input.terminalStatus, + failureKind: input.failureKind, + failureMessage: input.failureMessage ? boundedTextSummary(input.failureMessage, { limitChars: 240 }).text as string : null, + run: { + runId: input.run.id, + status: input.run.status, + claimedBy: input.run.claimedBy, + leaseExpiresAt: input.run.leaseExpiresAt, + leaseExpired: booleanOrNull(lease.leaseExpired), + leaseRemainingMs: numberValue(lease.leaseRemainingMs), + valuesPrinted: false, + }, + command: input.command ? { + commandId: input.command.id, + state: input.command.state, + terminalStatus: input.terminalStatus, + acknowledgedAt: input.command.acknowledgedAt ?? null, + updatedAt: input.command.updatedAt, + valuesPrinted: false, + } : null, + runnerJob, + session, + timeoutBudget: timeoutBudget ? compactRecord(timeoutBudget, ["state", "timeoutMs", "elapsedMs", "remainingMs", "startedAt", "source"]) : null, + recoveryActions, + valuesPrinted: false, + }; +} + +export function runnerJobDiagnosis(job: RunnerJobRecord, events: RunEvent[] = []): JsonRecord { + const terminalEvent = latestTerminalEvent(events, job.commandId); + const phase = terminalEvent ? `terminal:${String(terminalEvent.payload?.terminalStatus ?? "unknown")}` : recordAt(job.result, "kubernetes")?.created === true ? "created" : "recorded"; + const notStarted = phase === "created" || phase === "recorded"; + return { + category: notStarted ? "runner-job-not-started" : phase.startsWith("terminal:") ? "runner-job-terminal" : "runner-job-observed", + runnerLostSuspected: notStarted, + phase, + evidenceLevel: notStarted ? "medium" : "high", + runId: job.runId, + commandId: job.commandId, + runnerJobId: job.id, + attemptId: job.attemptId, + runnerId: job.runnerId, + jobName: job.jobName, + namespace: job.namespace, + logPath: stringValue(recordAt(job.result, "runner")?.logPath), + nextActions: [ + { action: "inspect-run", command: `./scripts/agentrun runs show ${job.runId}`, valuesPrinted: false }, + { action: "inspect-command", command: `./scripts/agentrun commands show ${job.commandId} --run-id ${job.runId}`, valuesPrinted: false }, + { action: "poll-events", command: `./scripts/agentrun runs events ${job.runId} --after-seq 0 --limit 100 --tail-summary`, valuesPrinted: false }, + ], + valuesPrinted: false, + }; +} + +function diagnosisCategory(input: { staleClaimed: boolean; runnerLost: boolean; terminalCommandOpenRun: boolean; providerEvidence: string; terminalCategory: string | null }): string { + if (input.runnerLost) return "runner-lost"; + if (input.staleClaimed) return "stale-claimed"; + if (input.terminalCommandOpenRun) return "terminal-command-open-run"; + if (input.providerEvidence === "failure-kind") return "provider-interruption-known"; + if (input.providerEvidence === "observed-transport-disconnect") return "provider-interruption-unknown"; + if (input.terminalCategory) return input.terminalCategory; + return "unknown"; +} + +function evidenceLevel(category: string, providerEvidence: string, runnerLost: boolean, staleClaimed: boolean, terminalCommandOpenRun: boolean): string { + if (runnerLost || staleClaimed || terminalCommandOpenRun) return "high"; + if (providerEvidence === "failure-kind") return "high"; + if (providerEvidence === "observed-transport-disconnect") return "medium"; + if (category === "completed" || category === "cancelled") return "high"; + return "low"; +} + +function recoveryActionsForDiagnosis(input: { run: RunRecord; command: CommandRecord | null; latestJob: RunnerJobRecord | null; session: JsonRecord; runnerLost: boolean; staleClaimed: boolean; terminalCommandOpenRun: boolean; failureKind: string | null; lastSeq: number }): JsonRecord[] { + const actions: JsonRecord[] = []; + if (input.latestJob) actions.push({ action: "inspect-runner-job", runnerJobId: input.latestJob.id, command: `./scripts/agentrun runner job-status ${input.latestJob.id} --run-id ${input.run.id}`, valuesPrinted: false }); + if (input.command) actions.push({ action: "inspect-command", commandId: input.command.id, command: `./scripts/agentrun commands result ${input.command.id} --run-id ${input.run.id}`, valuesPrinted: false }); + actions.push({ action: "poll-events", runId: input.run.id, afterSeq: input.lastSeq, command: `./scripts/agentrun runs events ${input.run.id} --after-seq ${input.lastSeq} --limit 100 --tail-summary`, valuesPrinted: false }); + const sessionId = stringValue(input.session.sessionId); + if (sessionId) actions.push({ action: "resume-session", sessionId, command: `./scripts/agentrun sessions turn ${sessionId} --prompt-stdin`, valuesPrinted: false }); + else actions.push({ action: "session-unavailable", reason: "sessionRef=null", hint: "当前 run 没有 sessionRef,只能从 run/events/command/runner-job 继续 drill-down 或重新提交任务。", valuesPrinted: false }); + if (input.runnerLost || input.staleClaimed || input.terminalCommandOpenRun) actions.push({ action: "refresh-queue-or-resubmit", reason: input.failureKind ?? "stale-runner-state", hint: "先用 queue refresh/show 对齐 attempt,再按任务边界决定重新派发或拆分续跑。", valuesPrinted: false }); + return actions.slice(0, 6); +} + +function runnerJobReference(job: RunnerJobRecord, events: RunEvent[]): JsonRecord { + const kubernetes = recordAt(job.result, "kubernetes"); + const terminalStatus = stringValue(latestTerminalEvent(events, job.commandId)?.payload.terminalStatus) ?? stringValue(latestTerminalStatusFromResult(job.result)); + const phase = terminalStatus ? `terminal:${terminalStatus}` : kubernetes?.created === true ? "created" : "recorded"; + return { + runnerJobId: job.id, + attemptId: job.attemptId, + runnerId: job.runnerId, + namespace: job.namespace, + jobName: job.jobName, + phase, + terminalStatus, + startedAt: null, + finishedAt: null, + logPath: stringValue(recordAt(job.result, "runner")?.logPath), + valuesPrinted: false, + }; +} + +function sessionReference(run: RunRecord): JsonRecord { + if (!run.sessionRef) return { sessionId: null, sessionRefNull: true, sessionPath: null, valuesPrinted: false }; + return { sessionId: run.sessionRef.sessionId, sessionRefNull: false, sessionPath: `/api/v1/sessions/${run.sessionRef.sessionId}`, valuesPrinted: false }; +} + +function leaseFromRun(run: RunRecord, nowMs: number): JsonRecord { + const leaseExpiresMs = run.leaseExpiresAt ? Date.parse(run.leaseExpiresAt) : NaN; + const hasLease = Boolean(run.claimedBy && run.leaseExpiresAt && Number.isFinite(leaseExpiresMs)); + return { + leaseExpired: run.claimedBy ? (hasLease ? leaseExpiresMs <= nowMs : true) : null, + leaseRemainingMs: hasLease ? Math.max(0, leaseExpiresMs - nowMs) : null, + valuesPrinted: false, + }; +} + +function latestTerminalStatusFromResult(result: JsonRecord): JsonValue | null { + const value = result.terminalStatus; + return typeof value === "string" ? value : null; +} + +function latestTerminalEvent(events: RunEvent[], commandId: string): RunEvent | null { + for (const event of [...events].reverse()) { + const payload = event.payload; + if (payload?.commandId && payload.commandId !== commandId) continue; + if (event.type === "terminal_status") return event; + if (event.type === "backend_status" && payload?.phase === "command-terminal" && payload.commandId === commandId) return event; + } + return null; +} + +function compactRecord(record: JsonRecord, keys: string[]): JsonRecord { + const result: JsonRecord = {}; + for (const key of keys) { + const value = record[key]; + if (value === undefined) continue; + if (typeof value === "string" || typeof value === "number" || typeof value === "boolean" || value === null) result[key] = value; + } + result.valuesPrinted = false; + return result; +} + +function recordAt(record: JsonRecord | null | undefined, key: string): JsonRecord | null { + const value = record?.[key]; + return typeof value === "object" && value !== null && !Array.isArray(value) ? value as JsonRecord : null; +} + +function stringValue(value: JsonValue | undefined): string | null { + return typeof value === "string" && value.length > 0 ? value : null; +} + +function numberValue(value: JsonValue | undefined): number | null { + return typeof value === "number" && Number.isFinite(value) ? value : null; +} + +function booleanValue(value: JsonValue | undefined): boolean | null { + return typeof value === "boolean" ? value : null; +} + +function booleanOrNull(value: JsonValue | undefined): boolean | null { + return typeof value === "boolean" ? value : null; +} diff --git a/src/mgr/result.ts b/src/mgr/result.ts index 41b5543..1555e7c 100644 --- a/src/mgr/result.ts +++ b/src/mgr/result.ts @@ -1,6 +1,7 @@ import type { AgentRunStore } from "./store.js"; import type { CommandRecord, FailureKind, JsonRecord, JsonValue, RunEvent, RunRecord, RunnerJobRecord, TerminalStatus } from "../common/types.js"; import { boundedTextSummary, outputBytesFromPayload, outputTruncatedFromPayload } from "../common/output.js"; +import { runDiagnosis } from "./diagnosis.js"; const maxToolCallSummaryItems = 40; const toolCallCommandLimitChars = 600; @@ -56,6 +57,7 @@ export async function buildRunResult(store: AgentRunStore, runId: string, comman const blocker = terminal === "blocked" || terminal === "failed" ? { failureKind, message: failureMessage, details: failureDetails } : null; const liveness = livenessSnapshot(run, command, events, scopedEvents, terminal, failureKind, failureMessage); const terminalClassification = terminalClassificationSummary({ terminal, terminalSource, failureKind, failureMessage, liveness }); + const diagnosis = runDiagnosis({ run, command, latestJob, events, terminalClassification, liveness, terminalStatus: terminal, failureKind, failureMessage }); const steerDelivery = command?.type === "steer" ? steerDeliverySummary(events, command.id) : null; return { runId: run.id, @@ -88,6 +90,7 @@ export async function buildRunResult(store: AgentRunStore, runId: string, comman failureMessage, failureDetails, terminalClassification, + diagnosis, blocker, liveness, ...(steerDelivery ? { steerDelivery } : {}), diff --git a/src/mgr/runner-job-status.ts b/src/mgr/runner-job-status.ts index e828140..1755c62 100644 --- a/src/mgr/runner-job-status.ts +++ b/src/mgr/runner-job-status.ts @@ -1,4 +1,5 @@ import type { JsonRecord, RunEvent, RunnerJobRecord, TerminalStatus } from "../common/types.js"; +import { runnerJobDiagnosis } from "./diagnosis.js"; export function runnerJobStatusSummary(job: RunnerJobRecord, events: RunEvent[] = []): JsonRecord { const terminalEvent = latestTerminalEvent(events, job.commandId); @@ -32,6 +33,7 @@ export function runnerJobStatusSummary(job: RunnerJobRecord, events: RunEvent[] logPath: typeof runner.logPath === "string" ? runner.logPath : null, retention, kubernetes, + diagnosis: runnerJobDiagnosis(job, events), createdAt: job.createdAt, updatedAt: job.updatedAt, valuesPrinted: false, diff --git a/src/mgr/server.ts b/src/mgr/server.ts index 99b1c88..de8588e 100644 --- a/src/mgr/server.ts +++ b/src/mgr/server.ts @@ -151,6 +151,7 @@ async function queueTaskSupervisor(store: AgentRunStore, task: JsonRecord): Prom status: stringJsonValue(result.status), terminalStatus: stringJsonValue(result.terminalStatus), failureKind: stringJsonValue(result.failureKind), + diagnosis: asJsonRecord(result.diagnosis), terminalClassification: terminalClassification ? compactTerminalClassification(terminalClassification) : null, phase: stringJsonValue(liveness?.phase), active: liveness?.active === true, diff --git a/src/selftest/cases/55-timeout-liveness.ts b/src/selftest/cases/55-timeout-liveness.ts index 16eff73..9608b13 100644 --- a/src/selftest/cases/55-timeout-liveness.ts +++ b/src/selftest/cases/55-timeout-liveness.ts @@ -54,19 +54,48 @@ const selfTest: SelfTestCase = async (context: SelfTestContext) => { await client.post(`/api/v1/runs/${noSession.runId}/events`, { type: "backend_status", payload: { commandId: noSession.commandId, phase: "codex-app-server-closed", message: "stdio closed before terminal result" } }); await client.post(`/api/v1/runs/${noSession.runId}/events`, { type: "terminal_status", payload: { commandId: noSession.commandId, terminalStatus: "failed", failureKind: "backend-timeout", message: "codex stdio turn hard timed out after 50ms" } }); await client.patch(`/api/v1/commands/${noSession.commandId}/status`, { terminalStatus: "failed", failureKind: "backend-timeout", failureMessage: "codex stdio turn hard timed out after 50ms" }); - await client.patch(`/api/v1/runs/${noSession.runId}/status`, { terminalStatus: "failed", failureKind: "backend-timeout", failureMessage: "codex stdio turn hard timed out after 50ms" }); const noSessionResult = await commandResult(client, noSession); const noSessionLive = noSessionResult.liveness as JsonRecord; const noSessionClassification = noSessionResult.terminalClassification as JsonRecord; + const noSessionDiagnosis = noSessionResult.diagnosis as JsonRecord; assert.equal(noSessionClassification.category, "execution-hard-timeout"); assert.equal(noSessionClassification.providerEvidence, "observed-transport-disconnect"); assert.equal(noSessionClassification.providerInterruptionKnown, false); + assert.equal(noSessionDiagnosis.category, "terminal-command-open-run"); + assert.equal(noSessionDiagnosis.providerEvidence, "observed-transport-disconnect"); + assert.equal(((noSessionDiagnosis.session as JsonRecord).sessionRefNull), true); + assert.ok((noSessionDiagnosis.recoveryActions as JsonRecord[]).some((action) => action.action === "session-unavailable")); assert.match(String(noSessionClassification.providerInterruptionReason), /cannot distinguish provider outage/u); assert.equal((noSessionLive.transportDisconnect as JsonRecord).sourceSeq, 4); assert.equal((noSessionLive.recoveryActions as JsonRecord[]).some((action) => action.action === "resume-session"), false, "sessionId=null must not suggest session-only resume"); assert.equal((noSessionLive.recoveryActions as JsonRecord[]).some((action) => action.action === "poll-output"), false, "sessionId=null must not suggest session output path"); assert.ok((noSessionLive.recoveryActions as JsonRecord[]).some((action) => action.action === "poll-trace" && String(action.command).includes("runs events"))); + const stale = await createActiveRun(client, context, "timeout-liveness-stale-claimed", 120_000, { session: false, leaseMs: 1 }); + await store.saveRunnerJob({ + runId: stale.runId, + commandId: stale.commandId, + idempotencyKey: "stale-runner-lost", + payloadHash: "hash-stale-runner-lost", + attemptId: "attempt_stale_runner_lost", + runnerId: "runner_stale_claimed", + namespace: "agentrun-v01", + jobName: "agentrun-v01-runner-stale", + managerUrl: server.baseUrl, + image: "127.0.0.1:5000/agentrun/selftest:stale", + sourceCommit: "self-test", + serviceAccountName: "agentrun-v01-runner", + result: { runner: { logPath: "kubectl -n agentrun-v01 logs job/agentrun-v01-runner-stale" }, kubernetes: { created: true, valuesPrinted: false }, valuesPrinted: false }, + }); + await sleep(5); + const staleResult = await commandResult(client, stale); + const staleDiagnosis = staleResult.diagnosis as JsonRecord; + assert.equal(staleDiagnosis.category, "runner-lost"); + assert.equal(staleDiagnosis.staleClaimed, true); + assert.equal(staleDiagnosis.runnerLost, true); + assert.equal(((staleDiagnosis.runnerJob as JsonRecord).phase), "created"); + assert.equal(((staleDiagnosis.session as JsonRecord).sessionRefNull), true); + assert.ok(terminal.sessionId, "terminal fixture must have a session id"); const terminalSessionId = terminal.sessionId; const session = await client.get(`/api/v1/sessions/${terminalSessionId}?readerId=timeout-liveness`) as JsonRecord; @@ -82,6 +111,7 @@ const selfTest: SelfTestCase = async (context: SelfTestContext) => { const commander = await client.get("/api/v1/queue/commander?queue=timeout-liveness&readerId=timeout-liveness") as JsonRecord; const commanderItem = ((commander.items as JsonRecord[]) ?? []).find((item) => item.id === task.id) as JsonRecord; assert.equal(((commanderItem.supervisor as JsonRecord).phase), "terminal"); + assert.equal((((commanderItem.supervisor as JsonRecord).diagnosis as JsonRecord).category), "execution-hard-timeout"); assert.equal((((commanderItem.supervisor as JsonRecord).timeoutBudget as JsonRecord).state), "timed-out"); const commanderSummary = summarizeQueueCommanderSnapshot(commander, { limit: 5 }); const summaryItem = ((commanderSummary.items as JsonRecord[]) ?? []).find((item) => item.id === task.id) as JsonRecord; @@ -90,19 +120,19 @@ const selfTest: SelfTestCase = async (context: SelfTestContext) => { assert.equal((((summaryItem.supervisor as JsonRecord).terminalClassification as JsonRecord).providerEvidence), "insufficient"); assert.equal(JSON.stringify(commanderSummary).includes("hwpod workspace apply-patch"), false, "commander summary must stay compact and avoid dumping command bodies"); assert.equal(JSON.stringify(summaryItem).includes("fullRecordBytes"), false, "commander item must not add bookkeeping noise"); - assertNoSecretLeak({ toolResult, assistantLive, inactiveLive, terminalResult, noSessionResult, session, commanderSummary }); + assertNoSecretLeak({ toolResult, assistantLive, inactiveLive, terminalResult, noSessionResult, staleResult, session, commanderSummary }); - return { name: "timeout-liveness", tests: ["tool-in-flight-liveness", "assistant-progress-liveness", "stdio-inactive-timeout-budget", "terminal-timeout-recovery", "no-session-drilldown", "terminal-classification", "queue-commander-supervisor"] }; + return { name: "timeout-liveness", tests: ["tool-in-flight-liveness", "assistant-progress-liveness", "stdio-inactive-timeout-budget", "terminal-timeout-recovery", "no-session-drilldown", "terminal-classification", "queue-commander-supervisor", "diagnosis-visibility", "stale-claimed-runner-lost"] }; } finally { await new Promise((resolve) => server.server.close(() => resolve())); } }; -async function createActiveRun(client: ManagerClient, context: SelfTestContext, sessionSuffix: string, timeoutMs: number, options: { session?: boolean } = {}): Promise<{ runId: string; commandId: string; sessionId: string | null }> { +async function createActiveRun(client: ManagerClient, context: SelfTestContext, sessionSuffix: string, timeoutMs: number, options: { session?: boolean; leaseMs?: number } = {}): Promise<{ runId: string; commandId: string; sessionId: string | null }> { const sessionId = `selftest-${sessionSuffix}`; const run = await client.post("/api/v1/runs", runBody(context, options.session === false ? null : sessionId, timeoutMs)) as JsonRecord; const command = await client.post(`/api/v1/runs/${run.id}/commands`, { type: "turn", payload: { prompt: sessionSuffix }, idempotencyKey: sessionSuffix }) as JsonRecord; - await client.post(`/api/v1/runs/${run.id}/claim`, { runnerId: `runner_${sessionSuffix}`, leaseMs: 60_000 }); + await client.post(`/api/v1/runs/${run.id}/claim`, { runnerId: `runner_${sessionSuffix}`, leaseMs: options.leaseMs ?? 60_000 }); await client.post(`/api/v1/commands/${command.id}/ack`, {}); return { runId: String(run.id), commandId: String(command.id), sessionId: options.session === false ? null : sessionId }; } diff --git a/src/selftest/cases/60-hwlab-baseline-contract.ts b/src/selftest/cases/60-hwlab-baseline-contract.ts index c18275c..67b94bb 100644 --- a/src/selftest/cases/60-hwlab-baseline-contract.ts +++ b/src/selftest/cases/60-hwlab-baseline-contract.ts @@ -122,10 +122,13 @@ async function assertRunnerJobStatus(client: ManagerClient, context: SelfTestCon assert.equal(status.runId, item.runId); assert.equal(status.commandId, item.commandId); assert.equal(status.phase, "created"); + assert.equal(((status.diagnosis as JsonRecord).category), "runner-job-not-started"); + assert.equal(((status.diagnosis as JsonRecord).runnerLostSuspected), true); assert.equal(status.valuesPrinted, false); assert.equal(typeof status.logPath, "string"); const single = await client.get(`/api/v1/runs/${item.runId}/runner-jobs/${String(status.id)}`) as JsonRecord; assert.equal(single.jobName, status.jobName); + assert.equal(((single.diagnosis as JsonRecord).phase), "created"); assertNoSecretLeak({ list, single }); }