fix: return recovery action descriptors (#174)

Co-authored-by: AgentRun Codex <agentrun-codex@users.noreply.github.com>
2026-06-12 01:20:02 +08:00
parent 1a9b6debbb
commit 216209ca95
11 changed files with 298 additions and 74 deletions
@@ -465,7 +465,7 @@ export interface QueueDispatchResult extends JsonRecord {
  envImage: JsonRecord | null;
  workReady: JsonRecord | null;
  latestAttempt: QueueAttemptRef;
-  pollCommands: JsonRecord;
+  pollActions: JsonRecord[];
 }

 export interface BackendEvent {
@@ -86,9 +86,9 @@ export function runnerJobDiagnosis(job: RunnerJobRecord, events: RunEvent[] = []
    namespace: job.namespace,
    logPath: stringValue(recordAt(job.result, "runner")?.logPath),
    nextActions: [
-      { action: "inspect-run", command: `./scripts/agentrun runs show ${job.runId}`, valuesPrinted: false },
-      { action: "inspect-command", command: `./scripts/agentrun commands show ${job.commandId} --run-id ${job.runId}`, valuesPrinted: false },
-      { action: "poll-events", command: `./scripts/agentrun runs events ${job.runId} --after-seq 0 --limit 100 --tail-summary`, valuesPrinted: false },
+      recoveryDescriptor({ action: "inspect-run", operation: "describe", resourceKind: "run", resourceName: job.runId, runId: job.runId }),
+      recoveryDescriptor({ action: "inspect-command", operation: "describe", resourceKind: "command", resourceName: job.commandId, runId: job.runId, commandId: job.commandId }),
+      recoveryDescriptor({ action: "poll-events", operation: "events", resourceKind: "run", resourceName: job.runId, runId: job.runId, commandId: job.commandId, afterSeq: 0, limit: 100 }),
    ],
    valuesPrinted: false,
  };
@@ -167,16 +167,35 @@ function evidenceLevel(category: string, providerEvidence: string, runnerLost: b

 function recoveryActionsForDiagnosis(input: { run: RunRecord; command: CommandRecord | null; latestJob: RunnerJobRecord | null; session: JsonRecord; runnerLost: boolean; staleClaimed: boolean; terminalCommandOpenRun: boolean; failureKind: string | null; lastSeq: number }): JsonRecord[] {
  const actions: JsonRecord[] = [];
-  if (input.latestJob) actions.push({ action: "inspect-runner-job", runnerJobId: input.latestJob.id, command: `./scripts/agentrun runner job-status ${input.latestJob.id} --run-id ${input.run.id}`, valuesPrinted: false });
-  if (input.command) actions.push({ action: "inspect-command", commandId: input.command.id, command: `./scripts/agentrun commands result ${input.command.id} --run-id ${input.run.id}`, valuesPrinted: false });
-  actions.push({ action: "poll-events", runId: input.run.id, afterSeq: input.lastSeq, command: `./scripts/agentrun runs events ${input.run.id} --after-seq ${input.lastSeq} --limit 100 --tail-summary`, valuesPrinted: false });
+  if (input.latestJob) actions.push(recoveryDescriptor({ action: "inspect-runner-job", operation: "describe", resourceKind: "runnerjob", resourceName: input.latestJob.id, runId: input.run.id, commandId: input.command?.id ?? input.latestJob.commandId, runnerJobId: input.latestJob.id }));
+  if (input.command) actions.push(recoveryDescriptor({ action: "inspect-command", operation: "result", resourceKind: "command", resourceName: input.command.id, runId: input.run.id, commandId: input.command.id }));
+  actions.push(recoveryDescriptor({ action: "poll-events", operation: "events", resourceKind: "run", resourceName: input.run.id, runId: input.run.id, commandId: input.command?.id ?? null, afterSeq: input.lastSeq, limit: 100 }));
  const sessionId = stringValue(input.session.sessionId);
-  if (sessionId) actions.push({ action: "continue-session", sessionId, command: `./scripts/agentrun sessions send ${sessionId} --prompt-stdin`, valuesPrinted: false });
-  else actions.push({ action: "session-unavailable", reason: "sessionRef=null", hint: "当前 run 没有 sessionRef，管理者只能从 run/events/command/runner-job 读取 trace 后重新提交；这表示该任务不可同 session 续跑。", valuesPrinted: false });
-  if (input.runnerLost || input.staleClaimed || input.terminalCommandOpenRun) actions.push({ action: "refresh-queue-or-resubmit", reason: input.failureKind ?? "stale-runner-state", hint: "先用 queue refresh/show 对齐 attempt；有 sessionId 时继续同一 session，没有 sessionId 才重新派发。", valuesPrinted: false });
+  if (sessionId) actions.push(recoveryDescriptor({ action: "continue-session", operation: "send", resourceKind: "session", resourceName: sessionId, runId: input.run.id, commandId: input.command?.id ?? null, sessionId, inputKind: "prompt" }));
+  else actions.push({ action: "session-unavailable", operation: "operator-decision", resourceKind: "run", resourceName: input.run.id, runId: input.run.id, commandId: input.command?.id ?? null, reason: "sessionRef=null", reasonHint: "当前 run 没有 sessionRef，管理者只能从 run/events/command/runner-job 读取 trace 后重新提交；这表示该任务不可同 session 续跑。", valuesPrinted: false });
+  if (input.runnerLost || input.staleClaimed || input.terminalCommandOpenRun) actions.push(recoveryDescriptor({ action: "refresh-queue-or-resubmit", operation: "operator-decision", resourceKind: sessionId ? "session" : "run", resourceName: sessionId ?? input.run.id, runId: input.run.id, commandId: input.command?.id ?? null, sessionId, reason: input.failureKind ?? "stale-runner-state", reasonHint: "先用 queue refresh/show 对齐 attempt；有 sessionId 时继续同一 session，没有 sessionId 才重新派发。" }));
  return actions.slice(0, 6);
 }

+/**
+ * Normalises one recovery action into a descriptor record.
+ *
+ * The four identity fields are copied as given; runId, commandId, runnerJobId and
+ * sessionId are always present and default to null. afterSeq and limit are kept
+ * whenever they are not undefined (an explicit null is preserved); reason,
+ * reasonHint and inputKind are kept only when truthy. valuesPrinted is always
+ * false and is always the last key.
+ */
+function recoveryDescriptor(input: { action: string; operation: string; resourceKind: string; resourceName: string; runId?: string | null; commandId?: string | null; runnerJobId?: string | null; sessionId?: string | null; afterSeq?: number | null; limit?: number | null; reason?: string | null; reasonHint?: string | null; inputKind?: string | null }): JsonRecord {
+  const descriptor: JsonRecord = {
+    action: input.action,
+    operation: input.operation,
+    resourceKind: input.resourceKind,
+    resourceName: input.resourceName,
+    runId: input.runId ?? null,
+    commandId: input.commandId ?? null,
+    runnerJobId: input.runnerJobId ?? null,
+    sessionId: input.sessionId ?? null,
+  };
+  // Optional keys are appended in a fixed order so the serialised key order stays stable.
+  if (input.afterSeq !== undefined) descriptor.afterSeq = input.afterSeq;
+  if (input.limit !== undefined) descriptor.limit = input.limit;
+  if (input.reason) descriptor.reason = input.reason;
+  if (input.reasonHint) descriptor.reasonHint = input.reasonHint;
+  if (input.inputKind) descriptor.inputKind = input.inputKind;
+  descriptor.valuesPrinted = false;
+  return descriptor;
+}
+
 function runnerJobReference(job: RunnerJobRecord, events: RunEvent[]): JsonRecord {
  const observation = runnerJobObservation(job, events);
  const terminalStatus = stringValue(observation.terminalStatus);
@@ -213,11 +213,11 @@ export async function createKubernetesRunnerJob(options: { store: AgentRunStore;
    retention: {
      ttlSecondsAfterFinished: render.ttlSecondsAfterFinished,
    },
-    pollCommands: {
-      run: `./scripts/agentrun runs show ${run.id} --manager-url ${managerUrl}`,
-      command: `./scripts/agentrun commands show ${commandId} --run-id ${run.id} --manager-url ${managerUrl}`,
-      events: `./scripts/agentrun runs events ${run.id} --manager-url ${managerUrl} --after-seq 0 --limit 100`,
-    },
+    pollActions: [
+      runnerJobActionDescriptor({ action: "inspect-run", operation: "describe", resourceKind: "run", resourceName: run.id, runId: run.id }),
+      runnerJobActionDescriptor({ action: "inspect-command", operation: "describe", resourceKind: "command", resourceName: commandId, runId: run.id, commandId }),
+      runnerJobActionDescriptor({ action: "poll-events", operation: "events", resourceKind: "run", resourceName: run.id, runId: run.id, commandId, afterSeq: 0, limit: 100 }),
+    ],
    warnings: render.warnings,
    kubernetes: {
      created: true,
@@ -459,6 +459,20 @@ function stringField(record: JsonRecord, key: string): string {
  return value.trim();
 }

+/**
+ * Builds a poll action descriptor for a freshly created runner job.
+ *
+ * runId and commandId are always present (null when omitted); afterSeq and limit
+ * are included whenever they are not undefined. valuesPrinted is always false and
+ * is the last key.
+ */
+function runnerJobActionDescriptor(input: { action: string; operation: string; resourceKind: string; resourceName: string; runId?: string | null; commandId?: string | null; afterSeq?: number | null; limit?: number | null }): JsonRecord {
+  const { action, operation, resourceKind, resourceName, runId, commandId, afterSeq, limit } = input;
+  const descriptor: JsonRecord = { action, operation, resourceKind, resourceName, runId: runId ?? null, commandId: commandId ?? null };
+  // Paging fields are appended only when the caller supplied them.
+  if (afterSeq !== undefined) descriptor.afterSeq = afterSeq;
+  if (limit !== undefined) descriptor.limit = limit;
+  descriptor.valuesPrinted = false;
+  return descriptor;
+}
+
+/** Returns the trimmed value when it is a string with non-whitespace content; otherwise undefined. */
 function optionalString(value: unknown): string | undefined {
  return typeof value === "string" && value.trim().length > 0 ? value.trim() : undefined;
 }
@@ -126,12 +126,12 @@ export async function removeProviderProfile(profileValue: string, options: Provi
    configHashSuffix: hashDataKey(data, "config.toml") ?? stringPath(annotations, [`${credentialAnnotationPrefix}-config-hash-suffix`]),
    updatedAt: new Date().toISOString(),
    valuesPrinted: false,
-    pollCommands: {
-      list: "./scripts/agentrun provider-profiles list",
-      show: `./scripts/agentrun provider-profiles show ${profile}`,
-      setKey: `./scripts/agentrun provider-profiles set-key ${profile} --key-stdin`,
-      setConfig: `./scripts/agentrun provider-profiles set-config ${profile} --config-stdin`,
-    },
+    pollActions: [
+      providerActionDescriptor({ action: "list-provider-profiles", operation: "list", resourceKind: "provider-profile", resourceName: "*" }),
+      providerActionDescriptor({ action: "inspect-provider-profile", operation: "describe", resourceKind: "provider-profile", resourceName: profile, profile }),
+      providerActionDescriptor({ action: "set-provider-key", operation: "set-key", resourceKind: "provider-profile", resourceName: profile, profile, inputKind: "credential" }),
+      providerActionDescriptor({ action: "set-provider-config", operation: "set-config", resourceKind: "provider-profile", resourceName: profile, profile, inputKind: "config" }),
+    ],
  };
 }

@@ -184,12 +184,12 @@ export async function setProviderProfileConfig(profileValue: string, body: unkno
    configTomlPrinted: false,
    credentialValuesPrinted: false,
    valuesPrinted: false,
-    pollCommands: {
-      config: `./scripts/agentrun provider-profiles config ${profile}`,
-      show: `./scripts/agentrun provider-profiles show ${profile}`,
-      setKey: `./scripts/agentrun provider-profiles set-key ${profile} --key-stdin`,
-      validate: `./scripts/agentrun provider-profiles validate ${profile} --wait --timeout-ms 120000`,
-    },
+    pollActions: [
+      providerActionDescriptor({ action: "inspect-provider-config", operation: "config", resourceKind: "provider-profile", resourceName: profile, profile }),
+      providerActionDescriptor({ action: "inspect-provider-profile", operation: "describe", resourceKind: "provider-profile", resourceName: profile, profile }),
+      providerActionDescriptor({ action: "set-provider-key", operation: "set-key", resourceKind: "provider-profile", resourceName: profile, profile, inputKind: "credential" }),
+      providerActionDescriptor({ action: "validate-provider-profile", operation: "validate", resourceKind: "provider-profile", resourceName: profile, profile, wait: true, timeoutMs: 120_000 }),
+    ],
  };
 }

@@ -241,10 +241,24 @@ export async function setProviderProfileCredential(profileValue: string, body: u
    delegatedBy,
    requiresExternalBridgeUpdate: profileUsesMoonBridge(profile, renderedConfig.configToml),
    valuesPrinted: false,
-    pollCommands: {
-      show: `./scripts/agentrun provider-profiles show ${profile}`,
-      validate: `./scripts/agentrun provider-profiles validate ${profile} --wait --timeout-ms 120000`,
-    },
+    pollActions: [
+      providerActionDescriptor({ action: "inspect-provider-profile", operation: "describe", resourceKind: "provider-profile", resourceName: profile, profile }),
+      providerActionDescriptor({ action: "validate-provider-profile", operation: "validate", resourceKind: "provider-profile", resourceName: profile, profile, wait: true, timeoutMs: 120_000 }),
+    ],
+  };
+}
+
+/**
+ * Builds a follow-up action descriptor for provider-profile responses.
+ *
+ * profile is always present (null when omitted). inputKind is included only when
+ * truthy, wait only when exactly true, and timeoutMs whenever it is not undefined.
+ * valuesPrinted is always false and is the last key.
+ */
+function providerActionDescriptor(input: { action: string; operation: string; resourceKind: string; resourceName: string; profile?: string | null; inputKind?: string | null; wait?: boolean; timeoutMs?: number | null }): JsonRecord {
+  const { action, operation, resourceKind, resourceName, profile, inputKind, wait, timeoutMs } = input;
+  // Each optional field becomes either a one-key object or an empty object to spread.
+  const inputKindPart = inputKind ? { inputKind } : {};
+  const waitPart = wait === true ? { wait: true } : {};
+  const timeoutPart = timeoutMs !== undefined ? { timeoutMs } : {};
+  return {
+    action,
+    operation,
+    resourceKind,
+    resourceName,
+    profile: profile ?? null,
+    ...inputKindPart,
+    ...waitPart,
+    ...timeoutPart,
+    valuesPrinted: false,
  };
 }

@@ -52,12 +52,26 @@ export async function dispatchQueueTask(options: DispatchQueueTaskOptions): Prom
    envImage: jsonRecordOrNull(runnerJob.envImage),
    workReady: jsonRecordOrNull(runnerJob.workReady),
    latestAttempt,
-    pollCommands: {
-      queue: `./scripts/agentrun queue show ${task.id}`,
-      run: `./scripts/agentrun runs show ${run.id}`,
-      command: `./scripts/agentrun commands show ${command.id} --run-id ${run.id}`,
-      events: `./scripts/agentrun runs events ${run.id} --after-seq 0 --limit 100`,
-    },
+    pollActions: [
+      dispatchActionDescriptor({ action: "inspect-task", operation: "describe", resourceKind: "task", resourceName: task.id }),
+      dispatchActionDescriptor({ action: "inspect-run", operation: "describe", resourceKind: "run", resourceName: run.id, runId: run.id }),
+      dispatchActionDescriptor({ action: "inspect-command", operation: "describe", resourceKind: "command", resourceName: command.id, runId: run.id, commandId: command.id }),
+      dispatchActionDescriptor({ action: "poll-events", operation: "events", resourceKind: "run", resourceName: run.id, runId: run.id, commandId: command.id, afterSeq: 0, limit: 100 }),
+    ],
+  };
+}
+
+/**
+ * Builds a poll action descriptor for a queue dispatch result.
+ *
+ * runId and commandId are always present (null when omitted); afterSeq and limit
+ * are included whenever they are not undefined. valuesPrinted is always false and
+ * is the last key.
+ */
+function dispatchActionDescriptor(input: { action: string; operation: string; resourceKind: string; resourceName: string; runId?: string | null; commandId?: string | null; afterSeq?: number | null; limit?: number | null }): JsonRecord {
+  const { action, operation, resourceKind, resourceName, runId, commandId, afterSeq, limit } = input;
+  // Paging fields become one-key objects when supplied, otherwise empty objects.
+  const afterSeqPart = afterSeq !== undefined ? { afterSeq } : {};
+  const limitPart = limit !== undefined ? { limit } : {};
+  return {
+    action,
+    operation,
+    resourceKind,
+    resourceName,
+    runId: runId ?? null,
+    commandId: commandId ?? null,
+    ...afterSeqPart,
+    ...limitPart,
+    valuesPrinted: false,
  };
 }

@@ -442,39 +442,57 @@ function ageMs(value: string, nowMs: number): number | null {
 function recoveryActions(input: { run: RunRecord; command: CommandRecord | null; afterSeq: number; active: boolean; terminal: TerminalStatus | null; failureKind: FailureKind | null; failureMessage: string | null; needsContinuation: boolean; finalResponseAuthority: string }): JsonRecord[] {
  const { run, command, afterSeq, active, terminal, failureKind, failureMessage, needsContinuation, finalResponseAuthority } = input;
  const sessionId = run.sessionRef?.sessionId ?? null;
-  const traceCommand = sessionId ? `./scripts/agentrun sessions trace ${sessionId} --after-seq ${afterSeq} --limit 100 --run-id ${run.id}` : `./scripts/agentrun runs events ${run.id} --after-seq ${afterSeq} --limit 100 --summary`;
-  const outputCommand = sessionId ? `./scripts/agentrun sessions output ${sessionId} --after-seq ${afterSeq} --limit 100 --run-id ${run.id}` : null;
  const actions: JsonRecord[] = [
-    { action: "poll-trace", runId: run.id, commandId: command?.id ?? null, afterSeq, command: traceCommand, valuesPrinted: false },
+    recoveryDescriptor({ action: "poll-trace", operation: "events", resourceKind: "run", resourceName: run.id, runId: run.id, commandId: command?.id ?? null, sessionId, afterSeq, limit: 100 }),
  ];
-  if (outputCommand) actions.push({ action: "poll-output", runId: run.id, commandId: command?.id ?? null, afterSeq, command: outputCommand, valuesPrinted: false });
+  if (sessionId) actions.push(recoveryDescriptor({ action: "poll-output", operation: "logs", resourceKind: "session", resourceName: sessionId, runId: run.id, commandId: command?.id ?? null, sessionId, afterSeq, limit: 100 }));
  if (active) {
-    if (sessionId) actions.push({ action: "send-session", sessionId, runId: run.id, commandId: command?.id ?? null, command: `./scripts/agentrun sessions send ${sessionId} --prompt-stdin`, hint: "manager 会按当前 session 状态自动决定内部 steer 或新 turn", valuesPrinted: false });
-    if (command) actions.push({ action: "cancel-command", runId: run.id, commandId: command.id, command: `./scripts/agentrun commands cancel ${command.id} --reason <reason>`, valuesPrinted: false });
-    else actions.push({ action: "cancel-run", runId: run.id, command: `./scripts/agentrun runs cancel ${run.id} --reason <reason>`, valuesPrinted: false });
+    if (sessionId) actions.push(recoveryDescriptor({ action: "send-session", operation: "send", resourceKind: "session", resourceName: sessionId, runId: run.id, commandId: command?.id ?? null, sessionId, inputKind: "prompt", reasonHint: "manager 会按当前 session 状态自动决定内部 steer 或新 turn" }));
+    if (command) actions.push(recoveryDescriptor({ action: "cancel-command", operation: "cancel", resourceKind: "command", resourceName: command.id, runId: run.id, commandId: command.id, sessionId, reasonRequired: true, reasonHint: "operator supplied cancel reason" }));
+    else actions.push(recoveryDescriptor({ action: "cancel-run", operation: "cancel", resourceKind: "run", resourceName: run.id, runId: run.id, sessionId, reasonRequired: true, reasonHint: "operator supplied cancel reason" }));
    return actions;
  }
  if (needsContinuation && sessionId) {
-    if (command) actions.push({ action: "inspect-result", runId: run.id, commandId: command.id, command: `./scripts/agentrun commands result ${command.id} --run-id ${run.id}`, valuesPrinted: false });
-    actions.push({ action: "continue-session", reason: `final-response-${finalResponseAuthority}`, sessionId, command: `./scripts/agentrun sessions send ${sessionId} --prompt-stdin`, hint: "命令已 terminal completed，但没有 authoritative final response；管理者应先读 trace/output，再用同一 session 发送后续 prompt。", valuesPrinted: false });
+    if (command) actions.push(recoveryDescriptor({ action: "inspect-result", operation: "result", resourceKind: "command", resourceName: command.id, runId: run.id, commandId: command.id, sessionId }));
+    actions.push(recoveryDescriptor({ action: "continue-session", operation: "send", resourceKind: "session", resourceName: sessionId, runId: run.id, commandId: command?.id ?? null, sessionId, reason: `final-response-${finalResponseAuthority}`, inputKind: "prompt", reasonHint: "命令已 terminal completed，但没有 authoritative final response；管理者应先读 trace/output，再用同一 session 发送后续 prompt。" }));
    return actions;
  }
  if (terminal === "failed" || terminal === "blocked" || terminal === "cancelled") {
-    if (command) actions.push({ action: "inspect-result", runId: run.id, commandId: command.id, command: `./scripts/agentrun commands result ${command.id} --run-id ${run.id}`, valuesPrinted: false });
-    if (sessionId) actions.push({ action: "continue-session", sessionId, command: `./scripts/agentrun sessions send ${sessionId} --prompt-stdin`, valuesPrinted: false });
-    if (failureKind === "backend-timeout") actions.push({ action: "split-task", reason: "backend-timeout", hint: "先由管理者读取 trace/result，总结下一步，再把后续 prompt 发到同一 session；必要时把大 patch / 长工具链拆成更短 turn。", failureMessage: failureMessage ? boundedTextSummary(failureMessage, { limitChars: 200 }).text as string : null, valuesPrinted: false });
-    else actions.push({ action: "retry-or-split", reason: failureKind ?? "terminal", hint: "先读 trace/output 的 detail id，再决定继续同 session、重跑或拆分", valuesPrinted: false });
+    if (command) actions.push(recoveryDescriptor({ action: "inspect-result", operation: "result", resourceKind: "command", resourceName: command.id, runId: run.id, commandId: command.id, sessionId }));
+    if (sessionId) actions.push(recoveryDescriptor({ action: "continue-session", operation: "send", resourceKind: "session", resourceName: sessionId, runId: run.id, commandId: command?.id ?? null, sessionId, inputKind: "prompt" }));
+    if (failureKind === "backend-timeout") actions.push(recoveryDescriptor({ action: "split-task", operation: "operator-decision", resourceKind: sessionId ? "session" : "run", resourceName: sessionId ?? run.id, runId: run.id, commandId: command?.id ?? null, sessionId, reason: "backend-timeout", reasonHint: "先由管理者读取 trace/result，总结下一步，再把后续 prompt 发到同一 session；必要时把大 patch / 长工具链拆成更短 turn。", failureMessage: failureMessage ? boundedTextSummary(failureMessage, { limitChars: 200 }).text as string : null }));
+    else actions.push(recoveryDescriptor({ action: "retry-or-split", operation: "operator-decision", resourceKind: sessionId ? "session" : "run", resourceName: sessionId ?? run.id, runId: run.id, commandId: command?.id ?? null, sessionId, reason: failureKind ?? "terminal", reasonHint: "先读 trace/output 的 detail id，再决定继续同 session、重跑或拆分" }));
  }
  return actions;
 }

+/**
+ * Normalises one liveness recovery action into a descriptor record.
+ *
+ * runId, commandId and sessionId are always present and default to null. afterSeq
+ * and limit are kept whenever they are not undefined; reason, reasonHint,
+ * inputKind and failureMessage are kept only when truthy; reasonRequired is kept
+ * only when exactly true. valuesPrinted is always false and is the last key.
+ */
+function recoveryDescriptor(input: { action: string; operation: string; resourceKind: string; resourceName: string; runId?: string | null; commandId?: string | null; sessionId?: string | null; afterSeq?: number | null; limit?: number | null; reason?: string | null; reasonHint?: string | null; reasonRequired?: boolean; inputKind?: string | null; failureMessage?: string | null }): JsonRecord {
+  const descriptor: JsonRecord = {
+    action: input.action,
+    operation: input.operation,
+    resourceKind: input.resourceKind,
+    resourceName: input.resourceName,
+    runId: input.runId ?? null,
+    commandId: input.commandId ?? null,
+    sessionId: input.sessionId ?? null,
+  };
+  // Optional keys are appended in a fixed order so the serialised key order stays stable.
+  if (input.afterSeq !== undefined) descriptor.afterSeq = input.afterSeq;
+  if (input.limit !== undefined) descriptor.limit = input.limit;
+  if (input.reason) descriptor.reason = input.reason;
+  if (input.reasonHint) descriptor.reasonHint = input.reasonHint;
+  if (input.reasonRequired === true) descriptor.reasonRequired = true;
+  if (input.inputKind) descriptor.inputKind = input.inputKind;
+  if (input.failureMessage) descriptor.failureMessage = input.failureMessage;
+  descriptor.valuesPrinted = false;
+  return descriptor;
+}
+
+/**
+ * Classifies how trustworthy the assistant reply is as a final response:
+ * "authoritative" when replyAuthority or final is truthy, otherwise "fallback"
+ * when the reply text is non-empty, otherwise "missing".
+ */
 function finalResponseAuthority(reply: AssistantReplySummary): "authoritative" | "fallback" | "missing" {
  if (reply.replyAuthority || reply.final) return "authoritative";
  return reply.text.length > 0 ? "fallback" : "missing";
 }

 function completionEvidenceSummary(input: { terminal: TerminalStatus | null; terminalSource: string; reply: AssistantReplySummary; responseAuthority: string; needsContinuation: boolean; sessionId: string | null }): JsonRecord {
-  const recommendedAction = input.needsContinuation && input.sessionId ? `./scripts/agentrun sessions send ${input.sessionId} --prompt-stdin` : null;
+  const recommendedAction = input.needsContinuation && input.sessionId ? recoveryDescriptor({ action: "continue-session", operation: "send", resourceKind: "session", resourceName: input.sessionId, sessionId: input.sessionId, inputKind: "prompt", reason: `final-response-${input.responseAuthority}` }) : null;
  return {
    terminalStatus: input.terminal,
    terminalSource: input.terminalSource,
@@ -165,6 +165,7 @@ async function queueTaskSupervisor(store: AgentRunStore, task: JsonRecord): Prom
    const lastActivity = asJsonRecord(liveness?.lastActivity ?? liveness?.lastCommandActivity);
    const timeoutBudget = asJsonRecord(liveness?.timeoutBudget);
    const terminalClassification = asJsonRecord(result.terminalClassification ?? liveness?.terminalClassification);
+    const lease = asJsonRecord(liveness?.lease);
    return {
      runId: stringJsonValue(result.runId),
      commandId: stringJsonValue(result.commandId),
@@ -176,8 +177,13 @@ async function queueTaskSupervisor(store: AgentRunStore, task: JsonRecord): Prom
      phase: stringJsonValue(liveness?.phase),
      active: liveness?.active === true,
      lastSeq: numberJsonValue(liveness?.lastSeq ?? result.lastSeq),
+      lastEventAt: stringJsonValue(liveness?.lastEventAt),
+      lastEventAgeMs: numberJsonValue(liveness?.lastEventAgeMs),
      lastActivity: lastActivity ? compactActivity(lastActivity) : null,
      timeoutBudget: timeoutBudget ? compactTimeoutBudget(timeoutBudget) : null,
+      lease: lease ? compactLease(lease) : null,
+      leaseRemainingMs: numberJsonValue(lease?.leaseRemainingMs),
+      leaseExpired: lease?.leaseExpired === true,
      recoveryActions: compactRecoveryActions(liveness?.recoveryActions),
      valuesPrinted: false,
    };
@@ -220,6 +226,16 @@ function compactTimeoutBudget(budget: JsonRecord): JsonRecord {
  };
 }

+/**
+ * Projects a lease record onto the four fields the supervisor summary reports.
+ *
+ * claimedBy and leaseExpiresAt pass through stringJsonValue, leaseRemainingMs
+ * through numberJsonValue, and leaseExpired is true only when the source value is
+ * exactly true. valuesPrinted is always false.
+ */
+function compactLease(lease: JsonRecord): JsonRecord {
+  const { claimedBy, leaseExpiresAt, leaseExpired, leaseRemainingMs } = lease;
+  const summary: JsonRecord = {
+    claimedBy: stringJsonValue(claimedBy),
+    leaseExpiresAt: stringJsonValue(leaseExpiresAt),
+    leaseExpired: leaseExpired === true,
+    leaseRemainingMs: numberJsonValue(leaseRemainingMs),
+    valuesPrinted: false,
+  };
+  return summary;
+}
+
 function compactTerminalClassification(record: JsonRecord): JsonRecord {
  return {
    category: stringJsonValue(record.category),
@@ -248,13 +264,20 @@ function compactRecoveryActions(value: JsonValue | undefined): JsonValue[] {
    if (!action) return { action: "unknown", valuesPrinted: false };
    return {
      action: stringJsonValue(action.action),
+      operation: stringJsonValue(action.operation),
+      resourceKind: stringJsonValue(action.resourceKind),
+      resourceName: stringJsonValue(action.resourceName),
      reason: stringJsonValue(action.reason),
+      reasonHint: boundedJsonString(action.reasonHint, 220),
+      reasonRequired: action.reasonRequired === true,
+      inputKind: stringJsonValue(action.inputKind),
      runId: stringJsonValue(action.runId),
      commandId: stringJsonValue(action.commandId),
+      runnerJobId: stringJsonValue(action.runnerJobId),
      sessionId: stringJsonValue(action.sessionId),
      afterSeq: numberJsonValue(action.afterSeq),
-      command: boundedJsonString(action.command, 220),
-      hint: boundedJsonString(action.hint, 220),
+      limit: numberJsonValue(action.limit),
+      failureMessage: boundedJsonString(action.failureMessage, 220),
      valuesPrinted: false,
    };
  });
@@ -710,7 +733,7 @@ function sessionSendPlan(sessionId: string, decision: "steer" | "turn", active:
    activeBefore: active ? activeBeforeSummary(active) : null,
    request,
    ...(runBody ? { run: { bodyBytes: jsonByteLength(runBody), sessionRef: summarizeSendSessionRef(runBody), valuesPrinted: false } } : {}),
-    next: { confirm: `./scripts/agentrun sessions send ${sessionId} --prompt-stdin`, note: "Remove --dry-run to perform the mutation. Manager will decide internal steer vs turn from durable session state." },
+    next: { confirm: managerActionDescriptor({ action: "send-session", operation: "send", resourceKind: "session", resourceName: sessionId, sessionId, inputKind: "prompt" }), note: "Remove --dry-run to perform the mutation. Manager will decide internal steer vs turn from durable session state." },
    valuesPrinted: false,
  };
 }
@@ -727,13 +750,31 @@ function sessionSendResponse(input: { sessionId: string; decision: "steer" | "tu
    command: input.command as unknown as JsonRecord,
    runnerJob: input.runnerJob,
    activeBefore: input.activeBefore ? activeBeforeSummary(input.activeBefore) : null,
-    pollCommands: {
-      show: `./scripts/agentrun sessions show ${input.sessionId} --reader-id cli`,
-      trace: `./scripts/agentrun sessions trace ${input.sessionId} --after-seq 0 --limit 100`,
-      output: `./scripts/agentrun sessions output ${input.sessionId} --after-seq 0 --limit 100`,
-      read: `./scripts/agentrun sessions read ${input.sessionId} --reader-id cli`,
-      cancel: `./scripts/agentrun sessions cancel ${input.sessionId}`,
-    },
+    pollActions: [
+      managerActionDescriptor({ action: "inspect-session", operation: "describe", resourceKind: "session", resourceName: input.sessionId, sessionId: input.sessionId, readerId: "cli" }),
+      managerActionDescriptor({ action: "poll-trace", operation: "events", resourceKind: "run", resourceName: input.run.id, runId: input.run.id, commandId: input.command.id, sessionId: input.sessionId, afterSeq: 0, limit: 100 }),
+      managerActionDescriptor({ action: "poll-output", operation: "logs", resourceKind: "session", resourceName: input.sessionId, runId: input.run.id, commandId: input.command.id, sessionId: input.sessionId, afterSeq: 0, limit: 100 }),
+      managerActionDescriptor({ action: "read-session", operation: "read", resourceKind: "session", resourceName: input.sessionId, sessionId: input.sessionId, readerId: "cli" }),
+      managerActionDescriptor({ action: "cancel-session", operation: "cancel", resourceKind: "session", resourceName: input.sessionId, sessionId: input.sessionId, reasonRequired: true }),
+    ],
+    valuesPrinted: false,
+  };
+}
+
+/**
+ * Builds an action descriptor for session send plans and responses.
+ *
+ * runId, commandId and sessionId are always present (null when omitted). afterSeq
+ * and limit are included whenever they are not undefined; readerId and inputKind
+ * only when truthy; reasonRequired only when exactly true. valuesPrinted is always
+ * false and is the last key.
+ */
+function managerActionDescriptor(input: { action: string; operation: string; resourceKind: string; resourceName: string; runId?: string | null; commandId?: string | null; sessionId?: string | null; afterSeq?: number | null; limit?: number | null; readerId?: string | null; reasonRequired?: boolean; inputKind?: string | null }): JsonRecord {
+  const { action, operation, resourceKind, resourceName, afterSeq, limit, readerId, reasonRequired, inputKind } = input;
+  // Each optional field becomes either a one-key object or an empty object to spread.
+  const afterSeqPart = afterSeq !== undefined ? { afterSeq } : {};
+  const limitPart = limit !== undefined ? { limit } : {};
+  const readerIdPart = readerId ? { readerId } : {};
+  const reasonRequiredPart = reasonRequired === true ? { reasonRequired: true } : {};
+  const inputKindPart = inputKind ? { inputKind } : {};
+  return {
+    action,
+    operation,
+    resourceKind,
+    resourceName,
+    runId: input.runId ?? null,
+    commandId: input.commandId ?? null,
+    sessionId: input.sessionId ?? null,
+    ...afterSeqPart,
+    ...limitPart,
+    ...readerIdPart,
+    ...reasonRequiredPart,
+    ...inputKindPart,
    valuesPrinted: false,
  };
 }
@@ -92,10 +92,21 @@ export async function setGithubSshToolCredential(body: unknown, options: ToolCre
    updatedAt: stringPath(applied, ["metadata", "annotations", `${annotationPrefix}-updated-at`]) ?? updatedAt,
    credentialValuesPrinted: false,
    valuesPrinted: false,
-    pollCommands: {
-      show: "./scripts/agentrun tool-credentials show github-ssh",
-      list: "./scripts/agentrun tool-credentials list",
-    },
+    pollActions: [
+      toolCredentialActionDescriptor({ action: "inspect-tool-credential", operation: "describe", resourceKind: "tool-credential", resourceName: spec.name, tool: spec.tool }),
+      toolCredentialActionDescriptor({ action: "list-tool-credentials", operation: "list", resourceKind: "tool-credential", resourceName: "*", tool: spec.tool }),
+    ],
+  };
+}
+
+/**
+ * Builds a follow-up action descriptor for tool-credential responses.
+ *
+ * The four identity fields are copied as given; tool is always present and is
+ * null when omitted. valuesPrinted is always false and is the last key.
+ */
+function toolCredentialActionDescriptor(input: { action: string; operation: string; resourceKind: string; resourceName: string; tool?: string | null }): JsonRecord {
+  const { action, operation, resourceKind, resourceName, tool } = input;
+  return {
+    action,
+    operation,
+    resourceKind,
+    resourceName,
+    tool: tool ?? null,
+    valuesPrinted: false,
  };
 }

@@ -422,7 +422,39 @@ async function runFailureCase(options: { client: ManagerClient; managerUrl: stri
  }
  const command = await options.client.get(`/api/v1/runs/${item.runId}/commands/${item.commandId}`) as { state?: string };
  assert.equal(command.state, "failed", options.mode);
-  assertNoSecretLeak(events);
+  const envelope = await options.client.get(`/api/v1/runs/${item.runId}/commands/${item.commandId}/result`) as JsonRecord;
+  if (options.mode === "provider-503-terminal") {
+    const classification = envelope.terminalClassification as JsonRecord;
+    const liveness = envelope.liveness as JsonRecord;
+    const timeoutBudget = liveness.timeoutBudget as JsonRecord;
+    assert.equal(classification.category, "provider-failed");
+    assert.equal(classification.providerEvidence, "failure-kind");
+    assert.equal(classification.providerInterruptionKnown, true);
+    assert.equal(classification.failureKind, "provider-http-error");
+    assert.equal(liveness.phase, "terminal");
+    assert.equal(typeof liveness.lastEventAgeMs, "number");
+    assert.equal(timeoutBudget.timeoutKind, "idle");
+    assert.equal(typeof timeoutBudget.idleElapsedMs, "number");
+    assertRecoveryActionDescriptors(liveness.recoveryActions);
+  }
+  assertNoSecretLeak({ events, envelope });
+}
+
+function assertRecoveryActionDescriptors(value: unknown): void { // Test helper: fails unless `value` is an array of structured action descriptors with no rendered CLI text.
+  assert.ok(Array.isArray(value), "recoveryActions must be an array"); // reject non-arrays before anything is iterated
+  const text = JSON.stringify(value); // serialized once; the substring checks below scan the whole payload, nested fields included
+  assert.equal(text.includes("./scripts/agentrun sessions"), false, "server recoveryActions must not expose old sessions CLI paths");
+  assert.equal(text.includes("./scripts/agentrun commands"), false, "server recoveryActions must not expose old commands CLI paths");
+  assert.equal(text.includes("bun scripts/cli.ts agentrun"), false, "server recoveryActions must not hardcode render-only client commands");
+  for (const item of value) { // per-item shape checks; an empty array passes without entering the loop
+    const action = item as JsonRecord; // cast only; the typeof assertions below do the actual validation
+    assert.equal(Object.prototype.hasOwnProperty.call(action, "command"), false, "recovery action must be a descriptor, not a rendered command string"); // own-property test: a "command" key is rejected whatever its value
+    assert.equal(typeof action.action, "string");
+    assert.equal(typeof action.operation, "string");
+    assert.equal(typeof action.resourceKind, "string");
+    assert.equal(typeof action.resourceName, "string");
+    assert.equal(action.valuesPrinted, false); // every descriptor is expected to carry valuesPrinted: false
+  }
 }

 function eventPayload(event: { payload: unknown }): JsonRecord {
@@ -29,6 +29,25 @@ const selfTest: SelfTestCase = async (context: SelfTestContext) => {
    assert.equal(assistantLive.phase, "waiting-model-output");
    assert.equal(((assistantLive.lastActivity as JsonRecord).activityKind), "assistant-progress");

+    const retry = await createActiveRun(client, context, "timeout-liveness-provider-retry", 120_000);
+    await client.post(`/api/v1/runs/${retry.runId}/events`, { type: "error", payload: { commandId: retry.commandId, failureKind: "provider-stream-disconnected", willRetry: true, message: "provider stream disconnected; retrying" } });
+    const retryResult = await commandResult(client, retry);
+    const retryLive = retryResult.liveness as JsonRecord;
+    const retryClassification = retryResult.terminalClassification as JsonRecord;
+    assert.equal(retryLive.active, true);
+    assert.equal(retryClassification.category, "active-retry-interruption");
+    assert.equal(retryClassification.providerEvidence, "retry-event");
+    assert.equal(retryClassification.retryInterruptionObserved, true);
+    assert.equal(((retryLive.timeoutBudget as JsonRecord).state), "within-budget");
+    assert.equal(typeof retryLive.lastEventAgeMs, "number");
+    const retryCancelAction = (retryLive.recoveryActions as JsonRecord[]).find((action) => action.action === "cancel-command") as JsonRecord;
+    assert.equal(retryCancelAction.operation, "cancel");
+    assert.equal(retryCancelAction.resourceKind, "command");
+    assert.equal(retryCancelAction.resourceName, retry.commandId);
+    assert.equal(retryCancelAction.runId, retry.runId);
+    assert.equal(retryCancelAction.commandId, retry.commandId);
+    assertRecoveryActionDescriptors(retryLive.recoveryActions);
+
    const inactive = await createActiveRun(client, context, "timeout-liveness-inactive", 40);
    await sleep(36);
    const inactiveLive = (await commandResult(client, inactive)).liveness as JsonRecord;
@@ -72,7 +91,21 @@ const selfTest: SelfTestCase = async (context: SelfTestContext) => {
    assert.equal((noSessionLive.transportDisconnect as JsonRecord).sourceSeq, 4);
    assert.equal((noSessionLive.recoveryActions as JsonRecord[]).some((action) => action.action === "continue-session"), false, "sessionId=null must not suggest session-only continuation");
    assert.equal((noSessionLive.recoveryActions as JsonRecord[]).some((action) => action.action === "poll-output"), false, "sessionId=null must not suggest session output path");
-    assert.ok((noSessionLive.recoveryActions as JsonRecord[]).some((action) => action.action === "poll-trace" && String(action.command).includes("runs events")));
+    assert.ok((noSessionLive.recoveryActions as JsonRecord[]).some((action) => action.action === "poll-trace" && action.operation === "events" && action.resourceKind === "run" && action.resourceName === noSession.runId));
+    assertRecoveryActionDescriptors(noSessionLive.recoveryActions);
+
+    const manualCancel = await createActiveRun(client, context, "timeout-liveness-manual-command-cancel", 120_000);
+    await client.post(`/api/v1/commands/${manualCancel.commandId}/cancel`, { reason: "self-test manual command cancel" });
+    const manualCancelResult = await commandResult(client, manualCancel);
+    const manualCancelLive = manualCancelResult.liveness as JsonRecord;
+    const manualCancelClassification = manualCancelResult.terminalClassification as JsonRecord;
+    assert.equal(manualCancelResult.terminalStatus, "cancelled");
+    assert.equal(manualCancelResult.failureKind, "cancelled");
+    assert.equal(manualCancelLive.phase, "terminal");
+    assert.equal(manualCancelClassification.category, "cancelled");
+    assert.equal(manualCancelClassification.reason, "terminal status or failureKind is cancelled");
+    assert.ok((manualCancelLive.recoveryActions as JsonRecord[]).some((action) => action.action === "inspect-result" && action.operation === "result" && action.resourceKind === "command" && action.resourceName === manualCancel.commandId));
+    assertRecoveryActionDescriptors(manualCancelLive.recoveryActions);

    const stale = await createActiveRun(client, context, "timeout-liveness-stale-claimed", 120_000, { session: false, leaseMs: 1 });
    await store.saveRunnerJob({
@@ -98,12 +131,14 @@ const selfTest: SelfTestCase = async (context: SelfTestContext) => {
    assert.equal(staleDiagnosis.runnerLost, true);
    assert.equal(((staleDiagnosis.runnerJob as JsonRecord).phase), "created");
    assert.equal(((staleDiagnosis.session as JsonRecord).sessionRefNull), true);
+    assertRecoveryActionDescriptors(staleDiagnosis.recoveryActions);

    assert.ok(terminal.sessionId, "terminal fixture must have a session id");
    const terminalSessionId = terminal.sessionId;
    const session = await client.get(`/api/v1/sessions/${terminalSessionId}?readerId=timeout-liveness`) as JsonRecord;
    assert.equal(((session.liveness as JsonRecord).phase), "terminal");
    assert.ok(Array.isArray(((session.supervisor as JsonRecord).recoveryActions)), "session show must keep terminal recovery actions");
+    assertRecoveryActionDescriptors((session.supervisor as JsonRecord).recoveryActions);

    const task = await client.post("/api/v1/queue/tasks", queueTask(context, terminalSessionId, 50)) as JsonRecord;
    store.updateQueueTaskAttempt(String(task.id), {
@@ -117,16 +152,21 @@ const selfTest: SelfTestCase = async (context: SelfTestContext) => {
    assert.equal((((commanderItem.supervisor as JsonRecord).diagnosis as JsonRecord).category), "execution-idle-timeout");
    assert.equal((((commanderItem.supervisor as JsonRecord).timeoutBudget as JsonRecord).state), "timed-out");
    assert.equal((((commanderItem.supervisor as JsonRecord).timeoutBudget as JsonRecord).timeoutKind), "idle");
+    assert.equal(typeof ((commanderItem.supervisor as JsonRecord).lastEventAgeMs), "number");
+    assert.equal(typeof ((commanderItem.supervisor as JsonRecord).leaseRemainingMs), "number");
    const commanderSummary = summarizeQueueCommanderSnapshot(commander, { limit: 5 });
    const summaryItem = ((commanderSummary.items as JsonRecord[]) ?? []).find((item) => item.id === task.id) as JsonRecord;
    assert.equal(((summaryItem.supervisor as JsonRecord).phase), "terminal");
    assert.equal((((summaryItem.supervisor as JsonRecord).terminalClassification as JsonRecord).category), "execution-idle-timeout");
    assert.equal((((summaryItem.supervisor as JsonRecord).terminalClassification as JsonRecord).providerEvidence), "insufficient");
+    assert.equal(typeof ((summaryItem.supervisor as JsonRecord).lastEventAgeMs), "number");
+    assert.equal(typeof (((summaryItem.supervisor as JsonRecord).timeoutBudget as JsonRecord).idleElapsedMs), "number");
+    assertRecoveryActionDescriptors((summaryItem.supervisor as JsonRecord).recoveryActions);
    assert.equal(JSON.stringify(commanderSummary).includes("hwpod workspace apply-patch"), false, "commander summary must stay compact and avoid dumping command bodies");
    assert.equal(JSON.stringify(summaryItem).includes("fullRecordBytes"), false, "commander item must not add bookkeeping noise");
-    assertNoSecretLeak({ toolResult, assistantLive, inactiveLive, terminalResult, noSessionResult, staleResult, session, commanderSummary });
+    assertNoSecretLeak({ toolResult, assistantLive, retryResult, inactiveLive, terminalResult, noSessionResult, manualCancelResult, staleResult, session, commanderSummary });

-    return { name: "timeout-liveness", tests: ["tool-in-flight-liveness", "assistant-progress-liveness", "stdio-inactive-timeout-budget", "terminal-timeout-recovery", "no-session-drilldown", "terminal-classification", "queue-commander-supervisor", "diagnosis-visibility", "stale-claimed-runner-lost"] };
+    return { name: "timeout-liveness", tests: ["tool-in-flight-liveness", "assistant-progress-liveness", "active-provider-retry-summary", "stdio-inactive-timeout-budget", "terminal-timeout-recovery", "no-session-drilldown", "manual-command-cancel-summary", "terminal-classification", "queue-commander-supervisor", "diagnosis-visibility", "stale-claimed-runner-lost"] };
  } finally {
    await new Promise<void>((resolve) => server.server.close(() => resolve()));
  }
@@ -188,4 +228,21 @@ function executionPolicy(timeoutMs: number, codexHome: string): JsonRecord {
  };
 }

+function assertRecoveryActionDescriptors(value: unknown): void { // Self-test helper: fails unless `value` is an array of structured action descriptors with no rendered CLI text.
+  assert.ok(Array.isArray(value), "recoveryActions must be an array"); // reject non-arrays before anything is iterated
+  const text = JSON.stringify(value); // serialized once; the substring checks below scan the whole payload, nested fields included
+  assert.equal(text.includes("./scripts/agentrun sessions"), false, "server recoveryActions must not expose old sessions CLI paths");
+  assert.equal(text.includes("./scripts/agentrun commands"), false, "server recoveryActions must not expose old commands CLI paths");
+  assert.equal(text.includes("bun scripts/cli.ts agentrun"), false, "server recoveryActions must not hardcode render-only client commands");
+  for (const item of value) { // per-item shape checks; an empty array passes without entering the loop
+    const action = item as JsonRecord; // cast only; the typeof assertions below do the actual validation
+    assert.equal(Object.prototype.hasOwnProperty.call(action, "command"), false, "recovery action must be a descriptor, not a rendered command string"); // own-property test: a "command" key is rejected whatever its value
+    assert.equal(typeof action.action, "string");
+    assert.equal(typeof action.operation, "string");
+    assert.equal(typeof action.resourceKind, "string");
+    assert.equal(typeof action.resourceName, "string");
+    assert.equal(action.valuesPrinted, false); // every descriptor is expected to carry valuesPrinted: false
+  }
+}
+
 export default selfTest;