fix: return recovery action descriptors (#174)
Co-authored-by: AgentRun Codex <agentrun-codex@users.noreply.github.com>
This commit is contained in:
+1
-1
@@ -465,7 +465,7 @@ export interface QueueDispatchResult extends JsonRecord {
|
||||
envImage: JsonRecord | null;
|
||||
workReady: JsonRecord | null;
|
||||
latestAttempt: QueueAttemptRef;
|
||||
pollCommands: JsonRecord;
|
||||
pollActions: JsonRecord[];
|
||||
}
|
||||
|
||||
export interface BackendEvent {
|
||||
|
||||
+28
-9
@@ -86,9 +86,9 @@ export function runnerJobDiagnosis(job: RunnerJobRecord, events: RunEvent[] = []
|
||||
namespace: job.namespace,
|
||||
logPath: stringValue(recordAt(job.result, "runner")?.logPath),
|
||||
nextActions: [
|
||||
{ action: "inspect-run", command: `./scripts/agentrun runs show ${job.runId}`, valuesPrinted: false },
|
||||
{ action: "inspect-command", command: `./scripts/agentrun commands show ${job.commandId} --run-id ${job.runId}`, valuesPrinted: false },
|
||||
{ action: "poll-events", command: `./scripts/agentrun runs events ${job.runId} --after-seq 0 --limit 100 --tail-summary`, valuesPrinted: false },
|
||||
recoveryDescriptor({ action: "inspect-run", operation: "describe", resourceKind: "run", resourceName: job.runId, runId: job.runId }),
|
||||
recoveryDescriptor({ action: "inspect-command", operation: "describe", resourceKind: "command", resourceName: job.commandId, runId: job.runId, commandId: job.commandId }),
|
||||
recoveryDescriptor({ action: "poll-events", operation: "events", resourceKind: "run", resourceName: job.runId, runId: job.runId, commandId: job.commandId, afterSeq: 0, limit: 100 }),
|
||||
],
|
||||
valuesPrinted: false,
|
||||
};
|
||||
@@ -167,16 +167,35 @@ function evidenceLevel(category: string, providerEvidence: string, runnerLost: b
|
||||
|
||||
/**
 * Builds the ordered list of recovery actions attached to a run diagnosis.
 *
 * Each entry is a structured descriptor (action / operation / resource
 * identity) rather than a pre-rendered CLI command string. The legacy
 * command-string entries were dropped: they duplicated every descriptor and
 * consumed the six-entry cap applied at the end.
 *
 * @param input Diagnosis context: the run, its command and latest runner job
 *   (either may be null), the session record, the stale-state flags, the
 *   failure kind and the last seen event sequence number.
 * @returns At most six action records, inspection actions first.
 */
function recoveryActionsForDiagnosis(input: { run: RunRecord; command: CommandRecord | null; latestJob: RunnerJobRecord | null; session: JsonRecord; runnerLost: boolean; staleClaimed: boolean; terminalCommandOpenRun: boolean; failureKind: string | null; lastSeq: number }): JsonRecord[] {
  const runId = input.run.id;
  const commandId = input.command?.id ?? null;
  const actions: JsonRecord[] = [];

  if (input.latestJob) {
    // Fall back to the job's own command id when no command record was resolved.
    actions.push(recoveryDescriptor({ action: "inspect-runner-job", operation: "describe", resourceKind: "runnerjob", resourceName: input.latestJob.id, runId, commandId: input.command?.id ?? input.latestJob.commandId, runnerJobId: input.latestJob.id }));
  }
  if (input.command) {
    actions.push(recoveryDescriptor({ action: "inspect-command", operation: "result", resourceKind: "command", resourceName: input.command.id, runId, commandId: input.command.id }));
  }
  actions.push(recoveryDescriptor({ action: "poll-events", operation: "events", resourceKind: "run", resourceName: runId, runId, commandId, afterSeq: input.lastSeq, limit: 100 }));

  const sessionId = stringValue(input.session.sessionId);
  if (sessionId) {
    actions.push(recoveryDescriptor({ action: "continue-session", operation: "send", resourceKind: "session", resourceName: sessionId, runId, commandId, sessionId, inputKind: "prompt" }));
  } else {
    actions.push({ action: "session-unavailable", operation: "operator-decision", resourceKind: "run", resourceName: runId, runId, commandId, reason: "sessionRef=null", reasonHint: "当前 run 没有 sessionRef,管理者只能从 run/events/command/runner-job 读取 trace 后重新提交;这表示该任务不可同 session 续跑。", valuesPrinted: false });
  }

  if (input.runnerLost || input.staleClaimed || input.terminalCommandOpenRun) {
    // Pick kind and name with the same truthiness test so they can never disagree.
    actions.push(recoveryDescriptor({ action: "refresh-queue-or-resubmit", operation: "operator-decision", resourceKind: sessionId ? "session" : "run", resourceName: sessionId ? sessionId : runId, runId, commandId, sessionId, reason: input.failureKind ?? "stale-runner-state", reasonHint: "先用 queue refresh/show 对齐 attempt;有 sessionId 时继续同一 session,没有 sessionId 才重新派发。" }));
  }

  return actions.slice(0, 6);
}
|
||||
|
||||
/**
 * Builds one structured recovery-action descriptor.
 *
 * The identity keys (`runId`, `commandId`, `runnerJobId`, `sessionId`) are
 * always present and default to `null`. `afterSeq` and `limit` are emitted
 * whenever they were passed (including an explicit `null`). `reason`,
 * `reasonHint` and `inputKind` are emitted only when truthy. `valuesPrinted`
 * is always `false`.
 *
 * @param input Action name, operation, target resource and optional context.
 * @returns A plain JSON record describing the action.
 */
function recoveryDescriptor(input: { action: string; operation: string; resourceKind: string; resourceName: string; runId?: string | null; commandId?: string | null; runnerJobId?: string | null; sessionId?: string | null; afterSeq?: number | null; limit?: number | null; reason?: string | null; reasonHint?: string | null; inputKind?: string | null }): JsonRecord {
  return {
    action: input.action,
    operation: input.operation,
    resourceKind: input.resourceKind,
    resourceName: input.resourceName,
    runId: input.runId ?? null,
    commandId: input.commandId ?? null,
    runnerJobId: input.runnerJobId ?? null,
    sessionId: input.sessionId ?? null,
    ...(input.afterSeq !== undefined ? { afterSeq: input.afterSeq } : {}),
    ...(input.limit !== undefined ? { limit: input.limit } : {}),
    ...(input.reason ? { reason: input.reason } : {}),
    ...(input.reasonHint ? { reasonHint: input.reasonHint } : {}),
    ...(input.inputKind ? { inputKind: input.inputKind } : {}),
    valuesPrinted: false,
  };
}
|
||||
|
||||
function runnerJobReference(job: RunnerJobRecord, events: RunEvent[]): JsonRecord {
|
||||
const observation = runnerJobObservation(job, events);
|
||||
const terminalStatus = stringValue(observation.terminalStatus);
|
||||
|
||||
@@ -213,11 +213,11 @@ export async function createKubernetesRunnerJob(options: { store: AgentRunStore;
|
||||
retention: {
|
||||
ttlSecondsAfterFinished: render.ttlSecondsAfterFinished,
|
||||
},
|
||||
pollCommands: {
|
||||
run: `./scripts/agentrun runs show ${run.id} --manager-url ${managerUrl}`,
|
||||
command: `./scripts/agentrun commands show ${commandId} --run-id ${run.id} --manager-url ${managerUrl}`,
|
||||
events: `./scripts/agentrun runs events ${run.id} --manager-url ${managerUrl} --after-seq 0 --limit 100`,
|
||||
},
|
||||
pollActions: [
|
||||
runnerJobActionDescriptor({ action: "inspect-run", operation: "describe", resourceKind: "run", resourceName: run.id, runId: run.id }),
|
||||
runnerJobActionDescriptor({ action: "inspect-command", operation: "describe", resourceKind: "command", resourceName: commandId, runId: run.id, commandId }),
|
||||
runnerJobActionDescriptor({ action: "poll-events", operation: "events", resourceKind: "run", resourceName: run.id, runId: run.id, commandId, afterSeq: 0, limit: 100 }),
|
||||
],
|
||||
warnings: render.warnings,
|
||||
kubernetes: {
|
||||
created: true,
|
||||
@@ -459,6 +459,20 @@ function stringField(record: JsonRecord, key: string): string {
|
||||
return value.trim();
|
||||
}
|
||||
|
||||
/**
 * Builds one structured poll-action descriptor for a runner-job response.
 *
 * `runId` and `commandId` are always present and default to `null`.
 * `afterSeq` and `limit` are emitted whenever they were passed (including an
 * explicit `null`). `valuesPrinted` is always `false`.
 *
 * @param input Action name, operation, target resource and optional paging.
 * @returns A plain JSON record describing the action.
 */
function runnerJobActionDescriptor(input: { action: string; operation: string; resourceKind: string; resourceName: string; runId?: string | null; commandId?: string | null; afterSeq?: number | null; limit?: number | null }): JsonRecord {
  return {
    action: input.action,
    operation: input.operation,
    resourceKind: input.resourceKind,
    resourceName: input.resourceName,
    runId: input.runId ?? null,
    commandId: input.commandId ?? null,
    ...(input.afterSeq !== undefined ? { afterSeq: input.afterSeq } : {}),
    ...(input.limit !== undefined ? { limit: input.limit } : {}),
    valuesPrinted: false,
  };
}
|
||||
|
||||
/**
 * Normalizes an arbitrary value to a trimmed, non-empty string.
 *
 * @param value Any value.
 * @returns The trimmed string, or `undefined` when `value` is not a string
 *   or is empty after trimming.
 */
function optionalString(value: unknown): string | undefined {
  if (typeof value !== "string") return undefined;
  // Trim once and reuse the result for both the emptiness check and the return.
  const trimmed = value.trim();
  return trimmed.length > 0 ? trimmed : undefined;
}
|
||||
|
||||
@@ -126,12 +126,12 @@ export async function removeProviderProfile(profileValue: string, options: Provi
|
||||
configHashSuffix: hashDataKey(data, "config.toml") ?? stringPath(annotations, [`${credentialAnnotationPrefix}-config-hash-suffix`]),
|
||||
updatedAt: new Date().toISOString(),
|
||||
valuesPrinted: false,
|
||||
pollCommands: {
|
||||
list: "./scripts/agentrun provider-profiles list",
|
||||
show: `./scripts/agentrun provider-profiles show ${profile}`,
|
||||
setKey: `./scripts/agentrun provider-profiles set-key ${profile} --key-stdin`,
|
||||
setConfig: `./scripts/agentrun provider-profiles set-config ${profile} --config-stdin`,
|
||||
},
|
||||
pollActions: [
|
||||
providerActionDescriptor({ action: "list-provider-profiles", operation: "list", resourceKind: "provider-profile", resourceName: "*" }),
|
||||
providerActionDescriptor({ action: "inspect-provider-profile", operation: "describe", resourceKind: "provider-profile", resourceName: profile, profile }),
|
||||
providerActionDescriptor({ action: "set-provider-key", operation: "set-key", resourceKind: "provider-profile", resourceName: profile, profile, inputKind: "credential" }),
|
||||
providerActionDescriptor({ action: "set-provider-config", operation: "set-config", resourceKind: "provider-profile", resourceName: profile, profile, inputKind: "config" }),
|
||||
],
|
||||
};
|
||||
}
|
||||
|
||||
@@ -184,12 +184,12 @@ export async function setProviderProfileConfig(profileValue: string, body: unkno
|
||||
configTomlPrinted: false,
|
||||
credentialValuesPrinted: false,
|
||||
valuesPrinted: false,
|
||||
pollCommands: {
|
||||
config: `./scripts/agentrun provider-profiles config ${profile}`,
|
||||
show: `./scripts/agentrun provider-profiles show ${profile}`,
|
||||
setKey: `./scripts/agentrun provider-profiles set-key ${profile} --key-stdin`,
|
||||
validate: `./scripts/agentrun provider-profiles validate ${profile} --wait --timeout-ms 120000`,
|
||||
},
|
||||
pollActions: [
|
||||
providerActionDescriptor({ action: "inspect-provider-config", operation: "config", resourceKind: "provider-profile", resourceName: profile, profile }),
|
||||
providerActionDescriptor({ action: "inspect-provider-profile", operation: "describe", resourceKind: "provider-profile", resourceName: profile, profile }),
|
||||
providerActionDescriptor({ action: "set-provider-key", operation: "set-key", resourceKind: "provider-profile", resourceName: profile, profile, inputKind: "credential" }),
|
||||
providerActionDescriptor({ action: "validate-provider-profile", operation: "validate", resourceKind: "provider-profile", resourceName: profile, profile, wait: true, timeoutMs: 120_000 }),
|
||||
],
|
||||
};
|
||||
}
|
||||
|
||||
@@ -241,10 +241,24 @@ export async function setProviderProfileCredential(profileValue: string, body: u
|
||||
delegatedBy,
|
||||
requiresExternalBridgeUpdate: profileUsesMoonBridge(profile, renderedConfig.configToml),
|
||||
valuesPrinted: false,
|
||||
pollCommands: {
|
||||
show: `./scripts/agentrun provider-profiles show ${profile}`,
|
||||
validate: `./scripts/agentrun provider-profiles validate ${profile} --wait --timeout-ms 120000`,
|
||||
},
|
||||
pollActions: [
|
||||
providerActionDescriptor({ action: "inspect-provider-profile", operation: "describe", resourceKind: "provider-profile", resourceName: profile, profile }),
|
||||
providerActionDescriptor({ action: "validate-provider-profile", operation: "validate", resourceKind: "provider-profile", resourceName: profile, profile, wait: true, timeoutMs: 120_000 }),
|
||||
],
|
||||
};
|
||||
}
|
||||
|
||||
/**
 * Builds one structured follow-up action descriptor for a provider-profile
 * response.
 *
 * `profile` is always present and defaults to `null`. `inputKind` is emitted
 * only when truthy, `wait` only when exactly `true`, and `timeoutMs` whenever
 * it was passed (including an explicit `null`). `valuesPrinted` is always
 * `false`.
 *
 * @param input Action name, operation, target resource and optional flags.
 * @returns A plain JSON record describing the action.
 */
function providerActionDescriptor(input: { action: string; operation: string; resourceKind: string; resourceName: string; profile?: string | null; inputKind?: string | null; wait?: boolean; timeoutMs?: number | null }): JsonRecord {
  return {
    action: input.action,
    operation: input.operation,
    resourceKind: input.resourceKind,
    resourceName: input.resourceName,
    profile: input.profile ?? null,
    ...(input.inputKind ? { inputKind: input.inputKind } : {}),
    ...(input.wait === true ? { wait: true } : {}),
    ...(input.timeoutMs !== undefined ? { timeoutMs: input.timeoutMs } : {}),
    valuesPrinted: false,
  };
}
|
||||
|
||||
|
||||
@@ -52,12 +52,26 @@ export async function dispatchQueueTask(options: DispatchQueueTaskOptions): Prom
|
||||
envImage: jsonRecordOrNull(runnerJob.envImage),
|
||||
workReady: jsonRecordOrNull(runnerJob.workReady),
|
||||
latestAttempt,
|
||||
pollCommands: {
|
||||
queue: `./scripts/agentrun queue show ${task.id}`,
|
||||
run: `./scripts/agentrun runs show ${run.id}`,
|
||||
command: `./scripts/agentrun commands show ${command.id} --run-id ${run.id}`,
|
||||
events: `./scripts/agentrun runs events ${run.id} --after-seq 0 --limit 100`,
|
||||
},
|
||||
pollActions: [
|
||||
dispatchActionDescriptor({ action: "inspect-task", operation: "describe", resourceKind: "task", resourceName: task.id }),
|
||||
dispatchActionDescriptor({ action: "inspect-run", operation: "describe", resourceKind: "run", resourceName: run.id, runId: run.id }),
|
||||
dispatchActionDescriptor({ action: "inspect-command", operation: "describe", resourceKind: "command", resourceName: command.id, runId: run.id, commandId: command.id }),
|
||||
dispatchActionDescriptor({ action: "poll-events", operation: "events", resourceKind: "run", resourceName: run.id, runId: run.id, commandId: command.id, afterSeq: 0, limit: 100 }),
|
||||
],
|
||||
};
|
||||
}
|
||||
|
||||
/**
 * Builds one structured poll-action descriptor for a queue dispatch result.
 *
 * `runId` and `commandId` are always present and default to `null`.
 * `afterSeq` and `limit` are emitted whenever they were passed (including an
 * explicit `null`). `valuesPrinted` is always `false`.
 *
 * @param input Action name, operation, target resource and optional paging.
 * @returns A plain JSON record describing the action.
 */
function dispatchActionDescriptor(input: { action: string; operation: string; resourceKind: string; resourceName: string; runId?: string | null; commandId?: string | null; afterSeq?: number | null; limit?: number | null }): JsonRecord {
  return {
    action: input.action,
    operation: input.operation,
    resourceKind: input.resourceKind,
    resourceName: input.resourceName,
    runId: input.runId ?? null,
    commandId: input.commandId ?? null,
    ...(input.afterSeq !== undefined ? { afterSeq: input.afterSeq } : {}),
    ...(input.limit !== undefined ? { limit: input.limit } : {}),
    valuesPrinted: false,
  };
}
|
||||
|
||||
|
||||
+32
-14
@@ -442,39 +442,57 @@ function ageMs(value: string, nowMs: number): number | null {
|
||||
/**
 * Builds the recovery actions reported alongside a command's liveness state.
 *
 * Every entry is a structured descriptor; no pre-rendered CLI command strings
 * are produced. The list always starts with a `poll-trace` action, followed
 * by `poll-output` when the run has a session. What follows depends on state:
 *
 * - active: optional `send-session`, then `cancel-command` or `cancel-run`;
 * - completed but needing continuation (with a session): optional
 *   `inspect-result`, then `continue-session`;
 * - terminal failed / blocked / cancelled: optional `inspect-result`,
 *   optional `continue-session`, then `split-task` for a backend timeout or
 *   `retry-or-split` otherwise.
 *
 * @param input Run, command (may be null), paging cursor and terminal /
 *   failure classification for the command.
 * @returns The ordered list of action records.
 */
function recoveryActions(input: { run: RunRecord; command: CommandRecord | null; afterSeq: number; active: boolean; terminal: TerminalStatus | null; failureKind: FailureKind | null; failureMessage: string | null; needsContinuation: boolean; finalResponseAuthority: string }): JsonRecord[] {
  const { run, command, afterSeq, active, terminal, failureKind, failureMessage, needsContinuation, finalResponseAuthority } = input;
  const sessionId = run.sessionRef?.sessionId ?? null;
  const commandId = command?.id ?? null;

  const actions: JsonRecord[] = [
    recoveryDescriptor({ action: "poll-trace", operation: "events", resourceKind: "run", resourceName: run.id, runId: run.id, commandId, sessionId, afterSeq, limit: 100 }),
  ];
  if (sessionId) actions.push(recoveryDescriptor({ action: "poll-output", operation: "logs", resourceKind: "session", resourceName: sessionId, runId: run.id, commandId, sessionId, afterSeq, limit: 100 }));

  if (active) {
    if (sessionId) actions.push(recoveryDescriptor({ action: "send-session", operation: "send", resourceKind: "session", resourceName: sessionId, runId: run.id, commandId, sessionId, inputKind: "prompt", reasonHint: "manager 会按当前 session 状态自动决定内部 steer 或新 turn" }));
    if (command) actions.push(recoveryDescriptor({ action: "cancel-command", operation: "cancel", resourceKind: "command", resourceName: command.id, runId: run.id, commandId: command.id, sessionId, reasonRequired: true, reasonHint: "operator supplied cancel reason" }));
    else actions.push(recoveryDescriptor({ action: "cancel-run", operation: "cancel", resourceKind: "run", resourceName: run.id, runId: run.id, sessionId, reasonRequired: true, reasonHint: "operator supplied cancel reason" }));
    return actions;
  }

  if (needsContinuation && sessionId) {
    if (command) actions.push(recoveryDescriptor({ action: "inspect-result", operation: "result", resourceKind: "command", resourceName: command.id, runId: run.id, commandId: command.id, sessionId }));
    actions.push(recoveryDescriptor({ action: "continue-session", operation: "send", resourceKind: "session", resourceName: sessionId, runId: run.id, commandId, sessionId, reason: `final-response-${finalResponseAuthority}`, inputKind: "prompt", reasonHint: "命令已 terminal completed,但没有 authoritative final response;管理者应先读 trace/output,再用同一 session 发送后续 prompt。" }));
    return actions;
  }

  if (terminal === "failed" || terminal === "blocked" || terminal === "cancelled") {
    // Pick kind and name with the same truthiness test so they can never disagree.
    const decisionKind = sessionId ? "session" : "run";
    const decisionName = sessionId ? sessionId : run.id;
    if (command) actions.push(recoveryDescriptor({ action: "inspect-result", operation: "result", resourceKind: "command", resourceName: command.id, runId: run.id, commandId: command.id, sessionId }));
    if (sessionId) actions.push(recoveryDescriptor({ action: "continue-session", operation: "send", resourceKind: "session", resourceName: sessionId, runId: run.id, commandId, sessionId, inputKind: "prompt" }));
    if (failureKind === "backend-timeout") actions.push(recoveryDescriptor({ action: "split-task", operation: "operator-decision", resourceKind: decisionKind, resourceName: decisionName, runId: run.id, commandId, sessionId, reason: "backend-timeout", reasonHint: "先由管理者读取 trace/result,总结下一步,再把后续 prompt 发到同一 session;必要时把大 patch / 长工具链拆成更短 turn。", failureMessage: failureMessage ? boundedTextSummary(failureMessage, { limitChars: 200 }).text as string : null }));
    else actions.push(recoveryDescriptor({ action: "retry-or-split", operation: "operator-decision", resourceKind: decisionKind, resourceName: decisionName, runId: run.id, commandId, sessionId, reason: failureKind ?? "terminal", reasonHint: "先读 trace/output 的 detail id,再决定继续同 session、重跑或拆分" }));
  }

  return actions;
}
|
||||
|
||||
/**
 * Builds one structured recovery-action descriptor for liveness output.
 *
 * The identity keys (`runId`, `commandId`, `sessionId`) are always present
 * and default to `null`. `afterSeq` and `limit` are emitted whenever they
 * were passed (including an explicit `null`). `reason`, `reasonHint`,
 * `inputKind` and `failureMessage` are emitted only when truthy, and
 * `reasonRequired` only when exactly `true`. `valuesPrinted` is always
 * `false`.
 *
 * @param input Action name, operation, target resource and optional context.
 * @returns A plain JSON record describing the action.
 */
function recoveryDescriptor(input: { action: string; operation: string; resourceKind: string; resourceName: string; runId?: string | null; commandId?: string | null; sessionId?: string | null; afterSeq?: number | null; limit?: number | null; reason?: string | null; reasonHint?: string | null; reasonRequired?: boolean; inputKind?: string | null; failureMessage?: string | null }): JsonRecord {
  return {
    action: input.action,
    operation: input.operation,
    resourceKind: input.resourceKind,
    resourceName: input.resourceName,
    runId: input.runId ?? null,
    commandId: input.commandId ?? null,
    sessionId: input.sessionId ?? null,
    ...(input.afterSeq !== undefined ? { afterSeq: input.afterSeq } : {}),
    ...(input.limit !== undefined ? { limit: input.limit } : {}),
    ...(input.reason ? { reason: input.reason } : {}),
    ...(input.reasonHint ? { reasonHint: input.reasonHint } : {}),
    ...(input.reasonRequired === true ? { reasonRequired: true } : {}),
    ...(input.inputKind ? { inputKind: input.inputKind } : {}),
    ...(input.failureMessage ? { failureMessage: input.failureMessage } : {}),
    valuesPrinted: false,
  };
}
|
||||
|
||||
/**
 * Classifies how trustworthy an assistant reply is as the final response.
 *
 * @param reply Reply summary to classify.
 * @returns `"authoritative"` when `replyAuthority` or `final` is truthy;
 *   otherwise `"fallback"` when the reply has non-empty text, else
 *   `"missing"`.
 */
function finalResponseAuthority(reply: AssistantReplySummary): "authoritative" | "fallback" | "missing" {
  if (reply.replyAuthority || reply.final) return "authoritative";
  return reply.text.length > 0 ? "fallback" : "missing";
}
|
||||
|
||||
function completionEvidenceSummary(input: { terminal: TerminalStatus | null; terminalSource: string; reply: AssistantReplySummary; responseAuthority: string; needsContinuation: boolean; sessionId: string | null }): JsonRecord {
|
||||
const recommendedAction = input.needsContinuation && input.sessionId ? `./scripts/agentrun sessions send ${input.sessionId} --prompt-stdin` : null;
|
||||
const recommendedAction = input.needsContinuation && input.sessionId ? recoveryDescriptor({ action: "continue-session", operation: "send", resourceKind: "session", resourceName: input.sessionId, sessionId: input.sessionId, inputKind: "prompt", reason: `final-response-${input.responseAuthority}` }) : null;
|
||||
return {
|
||||
terminalStatus: input.terminal,
|
||||
terminalSource: input.terminalSource,
|
||||
|
||||
+51
-10
@@ -165,6 +165,7 @@ async function queueTaskSupervisor(store: AgentRunStore, task: JsonRecord): Prom
|
||||
const lastActivity = asJsonRecord(liveness?.lastActivity ?? liveness?.lastCommandActivity);
|
||||
const timeoutBudget = asJsonRecord(liveness?.timeoutBudget);
|
||||
const terminalClassification = asJsonRecord(result.terminalClassification ?? liveness?.terminalClassification);
|
||||
const lease = asJsonRecord(liveness?.lease);
|
||||
return {
|
||||
runId: stringJsonValue(result.runId),
|
||||
commandId: stringJsonValue(result.commandId),
|
||||
@@ -176,8 +177,13 @@ async function queueTaskSupervisor(store: AgentRunStore, task: JsonRecord): Prom
|
||||
phase: stringJsonValue(liveness?.phase),
|
||||
active: liveness?.active === true,
|
||||
lastSeq: numberJsonValue(liveness?.lastSeq ?? result.lastSeq),
|
||||
lastEventAt: stringJsonValue(liveness?.lastEventAt),
|
||||
lastEventAgeMs: numberJsonValue(liveness?.lastEventAgeMs),
|
||||
lastActivity: lastActivity ? compactActivity(lastActivity) : null,
|
||||
timeoutBudget: timeoutBudget ? compactTimeoutBudget(timeoutBudget) : null,
|
||||
lease: lease ? compactLease(lease) : null,
|
||||
leaseRemainingMs: numberJsonValue(lease?.leaseRemainingMs),
|
||||
leaseExpired: lease?.leaseExpired === true,
|
||||
recoveryActions: compactRecoveryActions(liveness?.recoveryActions),
|
||||
valuesPrinted: false,
|
||||
};
|
||||
@@ -220,6 +226,16 @@ function compactTimeoutBudget(budget: JsonRecord): JsonRecord {
|
||||
};
|
||||
}
|
||||
|
||||
/**
 * Reduces a lease record to the fields surfaced in the supervisor summary.
 *
 * Only `claimedBy`, `leaseExpiresAt`, `leaseExpired` and `leaseRemainingMs`
 * are carried over. `leaseExpired` is `true` only when the source value is
 * exactly `true`. `valuesPrinted` is always `false`.
 *
 * @param lease Raw lease record.
 * @returns The compacted lease record.
 */
function compactLease(lease: JsonRecord): JsonRecord {
  return {
    claimedBy: stringJsonValue(lease.claimedBy),
    leaseExpiresAt: stringJsonValue(lease.leaseExpiresAt),
    leaseExpired: lease.leaseExpired === true,
    leaseRemainingMs: numberJsonValue(lease.leaseRemainingMs),
    valuesPrinted: false,
  };
}
|
||||
|
||||
function compactTerminalClassification(record: JsonRecord): JsonRecord {
|
||||
return {
|
||||
category: stringJsonValue(record.category),
|
||||
@@ -248,13 +264,20 @@ function compactRecoveryActions(value: JsonValue | undefined): JsonValue[] {
|
||||
if (!action) return { action: "unknown", valuesPrinted: false };
|
||||
return {
|
||||
action: stringJsonValue(action.action),
|
||||
operation: stringJsonValue(action.operation),
|
||||
resourceKind: stringJsonValue(action.resourceKind),
|
||||
resourceName: stringJsonValue(action.resourceName),
|
||||
reason: stringJsonValue(action.reason),
|
||||
reasonHint: boundedJsonString(action.reasonHint, 220),
|
||||
reasonRequired: action.reasonRequired === true,
|
||||
inputKind: stringJsonValue(action.inputKind),
|
||||
runId: stringJsonValue(action.runId),
|
||||
commandId: stringJsonValue(action.commandId),
|
||||
runnerJobId: stringJsonValue(action.runnerJobId),
|
||||
sessionId: stringJsonValue(action.sessionId),
|
||||
afterSeq: numberJsonValue(action.afterSeq),
|
||||
command: boundedJsonString(action.command, 220),
|
||||
hint: boundedJsonString(action.hint, 220),
|
||||
limit: numberJsonValue(action.limit),
|
||||
failureMessage: boundedJsonString(action.failureMessage, 220),
|
||||
valuesPrinted: false,
|
||||
};
|
||||
});
|
||||
@@ -710,7 +733,7 @@ function sessionSendPlan(sessionId: string, decision: "steer" | "turn", active:
|
||||
activeBefore: active ? activeBeforeSummary(active) : null,
|
||||
request,
|
||||
...(runBody ? { run: { bodyBytes: jsonByteLength(runBody), sessionRef: summarizeSendSessionRef(runBody), valuesPrinted: false } } : {}),
|
||||
next: { confirm: `./scripts/agentrun sessions send ${sessionId} --prompt-stdin`, note: "Remove --dry-run to perform the mutation. Manager will decide internal steer vs turn from durable session state." },
|
||||
next: { confirm: managerActionDescriptor({ action: "send-session", operation: "send", resourceKind: "session", resourceName: sessionId, sessionId, inputKind: "prompt" }), note: "Remove --dry-run to perform the mutation. Manager will decide internal steer vs turn from durable session state." },
|
||||
valuesPrinted: false,
|
||||
};
|
||||
}
|
||||
@@ -727,13 +750,31 @@ function sessionSendResponse(input: { sessionId: string; decision: "steer" | "tu
|
||||
command: input.command as unknown as JsonRecord,
|
||||
runnerJob: input.runnerJob,
|
||||
activeBefore: input.activeBefore ? activeBeforeSummary(input.activeBefore) : null,
|
||||
pollCommands: {
|
||||
show: `./scripts/agentrun sessions show ${input.sessionId} --reader-id cli`,
|
||||
trace: `./scripts/agentrun sessions trace ${input.sessionId} --after-seq 0 --limit 100`,
|
||||
output: `./scripts/agentrun sessions output ${input.sessionId} --after-seq 0 --limit 100`,
|
||||
read: `./scripts/agentrun sessions read ${input.sessionId} --reader-id cli`,
|
||||
cancel: `./scripts/agentrun sessions cancel ${input.sessionId}`,
|
||||
},
|
||||
pollActions: [
|
||||
managerActionDescriptor({ action: "inspect-session", operation: "describe", resourceKind: "session", resourceName: input.sessionId, sessionId: input.sessionId, readerId: "cli" }),
|
||||
managerActionDescriptor({ action: "poll-trace", operation: "events", resourceKind: "run", resourceName: input.run.id, runId: input.run.id, commandId: input.command.id, sessionId: input.sessionId, afterSeq: 0, limit: 100 }),
|
||||
managerActionDescriptor({ action: "poll-output", operation: "logs", resourceKind: "session", resourceName: input.sessionId, runId: input.run.id, commandId: input.command.id, sessionId: input.sessionId, afterSeq: 0, limit: 100 }),
|
||||
managerActionDescriptor({ action: "read-session", operation: "read", resourceKind: "session", resourceName: input.sessionId, sessionId: input.sessionId, readerId: "cli" }),
|
||||
managerActionDescriptor({ action: "cancel-session", operation: "cancel", resourceKind: "session", resourceName: input.sessionId, sessionId: input.sessionId, reasonRequired: true }),
|
||||
],
|
||||
valuesPrinted: false,
|
||||
};
|
||||
}
|
||||
|
||||
/**
 * Builds one structured follow-up action descriptor for manager session
 * responses.
 *
 * The identity keys (`runId`, `commandId`, `sessionId`) are always present
 * and default to `null`. `afterSeq` and `limit` are emitted whenever they
 * were passed (including an explicit `null`). `readerId` and `inputKind` are
 * emitted only when truthy, and `reasonRequired` only when exactly `true`.
 * `valuesPrinted` is always `false`.
 *
 * @param input Action name, operation, target resource and optional context.
 * @returns A plain JSON record describing the action.
 */
function managerActionDescriptor(input: { action: string; operation: string; resourceKind: string; resourceName: string; runId?: string | null; commandId?: string | null; sessionId?: string | null; afterSeq?: number | null; limit?: number | null; readerId?: string | null; reasonRequired?: boolean; inputKind?: string | null }): JsonRecord {
  return {
    action: input.action,
    operation: input.operation,
    resourceKind: input.resourceKind,
    resourceName: input.resourceName,
    runId: input.runId ?? null,
    commandId: input.commandId ?? null,
    sessionId: input.sessionId ?? null,
    ...(input.afterSeq !== undefined ? { afterSeq: input.afterSeq } : {}),
    ...(input.limit !== undefined ? { limit: input.limit } : {}),
    ...(input.readerId ? { readerId: input.readerId } : {}),
    ...(input.reasonRequired === true ? { reasonRequired: true } : {}),
    ...(input.inputKind ? { inputKind: input.inputKind } : {}),
    valuesPrinted: false,
  };
}
|
||||
|
||||
@@ -92,10 +92,21 @@ export async function setGithubSshToolCredential(body: unknown, options: ToolCre
|
||||
updatedAt: stringPath(applied, ["metadata", "annotations", `${annotationPrefix}-updated-at`]) ?? updatedAt,
|
||||
credentialValuesPrinted: false,
|
||||
valuesPrinted: false,
|
||||
pollCommands: {
|
||||
show: "./scripts/agentrun tool-credentials show github-ssh",
|
||||
list: "./scripts/agentrun tool-credentials list",
|
||||
},
|
||||
pollActions: [
|
||||
toolCredentialActionDescriptor({ action: "inspect-tool-credential", operation: "describe", resourceKind: "tool-credential", resourceName: spec.name, tool: spec.tool }),
|
||||
toolCredentialActionDescriptor({ action: "list-tool-credentials", operation: "list", resourceKind: "tool-credential", resourceName: "*", tool: spec.tool }),
|
||||
],
|
||||
};
|
||||
}
|
||||
|
||||
/**
 * Builds a structured descriptor for a tool-credential follow-up action
 * instead of a rendered CLI command string.
 *
 * The four identity fields are copied verbatim, `tool` is normalised to
 * `null` when missing or undefined, and `valuesPrinted` is always `false`.
 *
 * @param input - The action identity and, optionally, the tool it concerns.
 * @returns A plain record describing the action.
 */
function toolCredentialActionDescriptor(input: { action: string; operation: string; resourceKind: string; resourceName: string; tool?: string | null }): JsonRecord {
  return {
    action: input.action,
    operation: input.operation,
    resourceKind: input.resourceKind,
    resourceName: input.resourceName,
    tool: input.tool ?? null,
    valuesPrinted: false,
  };
}
|
||||
|
||||
|
||||
@@ -422,7 +422,39 @@ async function runFailureCase(options: { client: ManagerClient; managerUrl: stri
|
||||
}
|
||||
const command = await options.client.get(`/api/v1/runs/${item.runId}/commands/${item.commandId}`) as { state?: string };
|
||||
assert.equal(command.state, "failed", options.mode);
|
||||
assertNoSecretLeak(events);
|
||||
const envelope = await options.client.get(`/api/v1/runs/${item.runId}/commands/${item.commandId}/result`) as JsonRecord;
|
||||
if (options.mode === "provider-503-terminal") {
|
||||
const classification = envelope.terminalClassification as JsonRecord;
|
||||
const liveness = envelope.liveness as JsonRecord;
|
||||
const timeoutBudget = liveness.timeoutBudget as JsonRecord;
|
||||
assert.equal(classification.category, "provider-failed");
|
||||
assert.equal(classification.providerEvidence, "failure-kind");
|
||||
assert.equal(classification.providerInterruptionKnown, true);
|
||||
assert.equal(classification.failureKind, "provider-http-error");
|
||||
assert.equal(liveness.phase, "terminal");
|
||||
assert.equal(typeof liveness.lastEventAgeMs, "number");
|
||||
assert.equal(timeoutBudget.timeoutKind, "idle");
|
||||
assert.equal(typeof timeoutBudget.idleElapsedMs, "number");
|
||||
assertRecoveryActionDescriptors(liveness.recoveryActions);
|
||||
}
|
||||
assertNoSecretLeak({ events, envelope });
|
||||
}
|
||||
|
||||
/**
 * Asserts that `value` is an array of structured recovery-action descriptors
 * rather than rendered CLI command strings.
 *
 * Checks performed:
 * - `value` is an array;
 * - its JSON serialisation contains none of the legacy CLI fragments
 *   (`./scripts/agentrun sessions`, `./scripts/agentrun commands`,
 *   `bun scripts/cli.ts agentrun`);
 * - every element is a non-null object with no own `command` property,
 *   string `action` / `operation` / `resourceKind` / `resourceName` fields,
 *   and `valuesPrinted === false`.
 *
 * @param value - The candidate `recoveryActions` payload.
 * @throws An assertion error describing the first violated expectation.
 */
function assertRecoveryActionDescriptors(value: unknown): void {
  assert.ok(Array.isArray(value), "recoveryActions must be an array");
  const text = JSON.stringify(value);
  assert.equal(text.includes("./scripts/agentrun sessions"), false, "server recoveryActions must not expose old sessions CLI paths");
  assert.equal(text.includes("./scripts/agentrun commands"), false, "server recoveryActions must not expose old commands CLI paths");
  assert.equal(text.includes("bun scripts/cli.ts agentrun"), false, "server recoveryActions must not hardcode render-only client commands");
  for (const item of value) {
    // Guard first: a null/undefined element would otherwise surface as a raw
    // TypeError from hasOwnProperty.call instead of a readable assertion failure.
    assert.ok(typeof item === "object" && item !== null, "recovery action must be a descriptor object");
    const action = item as JsonRecord;
    assert.equal(Object.prototype.hasOwnProperty.call(action, "command"), false, "recovery action must be a descriptor, not a rendered command string");
    assert.equal(typeof action.action, "string");
    assert.equal(typeof action.operation, "string");
    assert.equal(typeof action.resourceKind, "string");
    assert.equal(typeof action.resourceName, "string");
    assert.equal(action.valuesPrinted, false);
  }
}
|
||||
|
||||
function eventPayload(event: { payload: unknown }): JsonRecord {
|
||||
|
||||
@@ -29,6 +29,25 @@ const selfTest: SelfTestCase = async (context: SelfTestContext) => {
|
||||
assert.equal(assistantLive.phase, "waiting-model-output");
|
||||
assert.equal(((assistantLive.lastActivity as JsonRecord).activityKind), "assistant-progress");
|
||||
|
||||
const retry = await createActiveRun(client, context, "timeout-liveness-provider-retry", 120_000);
|
||||
await client.post(`/api/v1/runs/${retry.runId}/events`, { type: "error", payload: { commandId: retry.commandId, failureKind: "provider-stream-disconnected", willRetry: true, message: "provider stream disconnected; retrying" } });
|
||||
const retryResult = await commandResult(client, retry);
|
||||
const retryLive = retryResult.liveness as JsonRecord;
|
||||
const retryClassification = retryResult.terminalClassification as JsonRecord;
|
||||
assert.equal(retryLive.active, true);
|
||||
assert.equal(retryClassification.category, "active-retry-interruption");
|
||||
assert.equal(retryClassification.providerEvidence, "retry-event");
|
||||
assert.equal(retryClassification.retryInterruptionObserved, true);
|
||||
assert.equal(((retryLive.timeoutBudget as JsonRecord).state), "within-budget");
|
||||
assert.equal(typeof retryLive.lastEventAgeMs, "number");
|
||||
const retryCancelAction = (retryLive.recoveryActions as JsonRecord[]).find((action) => action.action === "cancel-command") as JsonRecord;
|
||||
assert.equal(retryCancelAction.operation, "cancel");
|
||||
assert.equal(retryCancelAction.resourceKind, "command");
|
||||
assert.equal(retryCancelAction.resourceName, retry.commandId);
|
||||
assert.equal(retryCancelAction.runId, retry.runId);
|
||||
assert.equal(retryCancelAction.commandId, retry.commandId);
|
||||
assertRecoveryActionDescriptors(retryLive.recoveryActions);
|
||||
|
||||
const inactive = await createActiveRun(client, context, "timeout-liveness-inactive", 40);
|
||||
await sleep(36);
|
||||
const inactiveLive = (await commandResult(client, inactive)).liveness as JsonRecord;
|
||||
@@ -72,7 +91,21 @@ const selfTest: SelfTestCase = async (context: SelfTestContext) => {
|
||||
assert.equal((noSessionLive.transportDisconnect as JsonRecord).sourceSeq, 4);
|
||||
assert.equal((noSessionLive.recoveryActions as JsonRecord[]).some((action) => action.action === "continue-session"), false, "sessionId=null must not suggest session-only continuation");
|
||||
assert.equal((noSessionLive.recoveryActions as JsonRecord[]).some((action) => action.action === "poll-output"), false, "sessionId=null must not suggest session output path");
|
||||
assert.ok((noSessionLive.recoveryActions as JsonRecord[]).some((action) => action.action === "poll-trace" && String(action.command).includes("runs events")));
|
||||
assert.ok((noSessionLive.recoveryActions as JsonRecord[]).some((action) => action.action === "poll-trace" && action.operation === "events" && action.resourceKind === "run" && action.resourceName === noSession.runId));
|
||||
assertRecoveryActionDescriptors(noSessionLive.recoveryActions);
|
||||
|
||||
const manualCancel = await createActiveRun(client, context, "timeout-liveness-manual-command-cancel", 120_000);
|
||||
await client.post(`/api/v1/commands/${manualCancel.commandId}/cancel`, { reason: "self-test manual command cancel" });
|
||||
const manualCancelResult = await commandResult(client, manualCancel);
|
||||
const manualCancelLive = manualCancelResult.liveness as JsonRecord;
|
||||
const manualCancelClassification = manualCancelResult.terminalClassification as JsonRecord;
|
||||
assert.equal(manualCancelResult.terminalStatus, "cancelled");
|
||||
assert.equal(manualCancelResult.failureKind, "cancelled");
|
||||
assert.equal(manualCancelLive.phase, "terminal");
|
||||
assert.equal(manualCancelClassification.category, "cancelled");
|
||||
assert.equal(manualCancelClassification.reason, "terminal status or failureKind is cancelled");
|
||||
assert.ok((manualCancelLive.recoveryActions as JsonRecord[]).some((action) => action.action === "inspect-result" && action.operation === "result" && action.resourceKind === "command" && action.resourceName === manualCancel.commandId));
|
||||
assertRecoveryActionDescriptors(manualCancelLive.recoveryActions);
|
||||
|
||||
const stale = await createActiveRun(client, context, "timeout-liveness-stale-claimed", 120_000, { session: false, leaseMs: 1 });
|
||||
await store.saveRunnerJob({
|
||||
@@ -98,12 +131,14 @@ const selfTest: SelfTestCase = async (context: SelfTestContext) => {
|
||||
assert.equal(staleDiagnosis.runnerLost, true);
|
||||
assert.equal(((staleDiagnosis.runnerJob as JsonRecord).phase), "created");
|
||||
assert.equal(((staleDiagnosis.session as JsonRecord).sessionRefNull), true);
|
||||
assertRecoveryActionDescriptors(staleDiagnosis.recoveryActions);
|
||||
|
||||
assert.ok(terminal.sessionId, "terminal fixture must have a session id");
|
||||
const terminalSessionId = terminal.sessionId;
|
||||
const session = await client.get(`/api/v1/sessions/${terminalSessionId}?readerId=timeout-liveness`) as JsonRecord;
|
||||
assert.equal(((session.liveness as JsonRecord).phase), "terminal");
|
||||
assert.ok(Array.isArray(((session.supervisor as JsonRecord).recoveryActions)), "session show must keep terminal recovery actions");
|
||||
assertRecoveryActionDescriptors((session.supervisor as JsonRecord).recoveryActions);
|
||||
|
||||
const task = await client.post("/api/v1/queue/tasks", queueTask(context, terminalSessionId, 50)) as JsonRecord;
|
||||
store.updateQueueTaskAttempt(String(task.id), {
|
||||
@@ -117,16 +152,21 @@ const selfTest: SelfTestCase = async (context: SelfTestContext) => {
|
||||
assert.equal((((commanderItem.supervisor as JsonRecord).diagnosis as JsonRecord).category), "execution-idle-timeout");
|
||||
assert.equal((((commanderItem.supervisor as JsonRecord).timeoutBudget as JsonRecord).state), "timed-out");
|
||||
assert.equal((((commanderItem.supervisor as JsonRecord).timeoutBudget as JsonRecord).timeoutKind), "idle");
|
||||
assert.equal(typeof ((commanderItem.supervisor as JsonRecord).lastEventAgeMs), "number");
|
||||
assert.equal(typeof ((commanderItem.supervisor as JsonRecord).leaseRemainingMs), "number");
|
||||
const commanderSummary = summarizeQueueCommanderSnapshot(commander, { limit: 5 });
|
||||
const summaryItem = ((commanderSummary.items as JsonRecord[]) ?? []).find((item) => item.id === task.id) as JsonRecord;
|
||||
assert.equal(((summaryItem.supervisor as JsonRecord).phase), "terminal");
|
||||
assert.equal((((summaryItem.supervisor as JsonRecord).terminalClassification as JsonRecord).category), "execution-idle-timeout");
|
||||
assert.equal((((summaryItem.supervisor as JsonRecord).terminalClassification as JsonRecord).providerEvidence), "insufficient");
|
||||
assert.equal(typeof ((summaryItem.supervisor as JsonRecord).lastEventAgeMs), "number");
|
||||
assert.equal(typeof (((summaryItem.supervisor as JsonRecord).timeoutBudget as JsonRecord).idleElapsedMs), "number");
|
||||
assertRecoveryActionDescriptors((summaryItem.supervisor as JsonRecord).recoveryActions);
|
||||
assert.equal(JSON.stringify(commanderSummary).includes("hwpod workspace apply-patch"), false, "commander summary must stay compact and avoid dumping command bodies");
|
||||
assert.equal(JSON.stringify(summaryItem).includes("fullRecordBytes"), false, "commander item must not add bookkeeping noise");
|
||||
assertNoSecretLeak({ toolResult, assistantLive, inactiveLive, terminalResult, noSessionResult, staleResult, session, commanderSummary });
|
||||
assertNoSecretLeak({ toolResult, assistantLive, retryResult, inactiveLive, terminalResult, noSessionResult, manualCancelResult, staleResult, session, commanderSummary });
|
||||
|
||||
return { name: "timeout-liveness", tests: ["tool-in-flight-liveness", "assistant-progress-liveness", "stdio-inactive-timeout-budget", "terminal-timeout-recovery", "no-session-drilldown", "terminal-classification", "queue-commander-supervisor", "diagnosis-visibility", "stale-claimed-runner-lost"] };
|
||||
return { name: "timeout-liveness", tests: ["tool-in-flight-liveness", "assistant-progress-liveness", "active-provider-retry-summary", "stdio-inactive-timeout-budget", "terminal-timeout-recovery", "no-session-drilldown", "manual-command-cancel-summary", "terminal-classification", "queue-commander-supervisor", "diagnosis-visibility", "stale-claimed-runner-lost"] };
|
||||
} finally {
|
||||
await new Promise<void>((resolve) => server.server.close(() => resolve()));
|
||||
}
|
||||
@@ -188,4 +228,21 @@ function executionPolicy(timeoutMs: number, codexHome: string): JsonRecord {
|
||||
};
|
||||
}
|
||||
|
||||
/**
 * Verifies that a `recoveryActions` payload consists solely of structured
 * action descriptors and carries no rendered CLI command text.
 *
 * The payload must be an array whose JSON form contains none of the legacy
 * CLI fragments (`./scripts/agentrun sessions`, `./scripts/agentrun commands`,
 * `bun scripts/cli.ts agentrun`). Each element must be a non-null object
 * without an own `command` property, with string `action`, `operation`,
 * `resourceKind` and `resourceName` fields, and with `valuesPrinted` equal to
 * `false`.
 *
 * @param value - The candidate `recoveryActions` payload.
 * @throws An assertion error describing the first violated expectation.
 */
function assertRecoveryActionDescriptors(value: unknown): void {
  assert.ok(Array.isArray(value), "recoveryActions must be an array");
  const text = JSON.stringify(value);
  assert.equal(text.includes("./scripts/agentrun sessions"), false, "server recoveryActions must not expose old sessions CLI paths");
  assert.equal(text.includes("./scripts/agentrun commands"), false, "server recoveryActions must not expose old commands CLI paths");
  assert.equal(text.includes("bun scripts/cli.ts agentrun"), false, "server recoveryActions must not hardcode render-only client commands");
  for (const item of value) {
    // Reject null/undefined/primitive-less entries up front so the failure is
    // an assertion with a message rather than a TypeError from hasOwnProperty.call.
    assert.ok(typeof item === "object" && item !== null, "recovery action must be a descriptor object");
    const action = item as JsonRecord;
    assert.equal(Object.prototype.hasOwnProperty.call(action, "command"), false, "recovery action must be a descriptor, not a rendered command string");
    assert.equal(typeof action.action, "string");
    assert.equal(typeof action.operation, "string");
    assert.equal(typeof action.resourceKind, "string");
    assert.equal(typeof action.resourceName, "string");
    assert.equal(action.valuesPrinted, false);
  }
}
|
||||
|
||||
export default selfTest;
|
||||
|
||||
Reference in New Issue
Block a user