Make the runner idle timeout configurable end to end.

* mgr: `runnerIdleTimeoutMs` can be supplied per request; otherwise the manager's runner job
  defaults are used (server option, else the AGENTRUN_RUNNER_IDLE_TIMEOUT_MS environment variable).
* runner/k8s-job: the resolved value is rendered into the job's AGENTRUN_RUNNER_IDLE_TIMEOUT_MS
  env entry (600000 when nothing is configured) and reported under `retention`.
* runner/run-once: `idleSince` is reset after each processed command.

Timeout values are validated as positive safe integers.

diff --git a/src/mgr/kubernetes-runner-job.ts b/src/mgr/kubernetes-runner-job.ts
index b3ac10f..72a7554 100644
--- a/src/mgr/kubernetes-runner-job.ts
+++ b/src/mgr/kubernetes-runner-job.ts
@@ -44,6 +44,7 @@ export interface RunnerJobDefaults {
   envIdentity?: string;
   artifactCatalogFile?: string;
   serviceAccountName?: string;
+  runnerIdleTimeoutMs?: number;
   kubectlCommand?: string;
   unideskSshEndpointEnv?: JsonRecord;
 }
@@ -57,6 +58,7 @@ export interface CreateRunnerJobInput extends JsonRecord {
   runnerId?: string;
   sourceCommit?: string;
   serviceAccountName?: string;
+  runnerIdleTimeoutMs?: number;
   idempotencyKey?: string;
   imageRef?: JsonRecord;
   transientEnv?: JsonRecord[];
@@ -85,6 +87,7 @@ export async function createKubernetesRunnerJob(options: { store: AgentRunStore;
   const transientEnv = assembleToolContextTransientEnv(run.executionPolicy, transientEnvField(options.input.transientEnv), options.defaults);
   const attemptId = optionalString(options.input.attemptId) ?? `attempt_${Date.now().toString(36)}`;
   const runnerId = optionalString(options.input.runnerId);
+  const runnerIdleTimeoutMs = optionalPositiveInteger(options.input.runnerIdleTimeoutMs, "runnerIdleTimeoutMs") ?? options.defaults.runnerIdleTimeoutMs;
   const transientEnvSecretName = transientEnv.length > 0 ? transientEnvSecretNameForRun(run.id, commandId, attemptId) : null;
   const renderTransientEnv = transientEnvSecretName ? transientEnvWithSecretRefs(transientEnv, transientEnvSecretName) : transientEnv;
   const normalizedPayload = {
@@ -97,6 +100,7 @@ export async function createKubernetesRunnerJob(options: { store: AgentRunStore;
     serviceAccountName: serviceAccountName ?? null,
     attemptId: optionalString(options.input.attemptId) ?? null,
     runnerId: optionalString(options.input.runnerId) ?? null,
+    runnerIdleTimeoutMs: runnerIdleTimeoutMs ?? null,
     transientEnv: transientEnv.map((item) => ({ name: item.name, valueHash: stableHash(item.value), sensitive: true })),
   };
   const payloadHash = stableHash(normalizedPayload);
@@ -150,6 +154,7 @@ export async function createKubernetesRunnerJob(options: { store: AgentRunStore;
     namespace,
     sourceCommit,
     transientEnv: renderTransientEnv,
+    ...(runnerIdleTimeoutMs !== undefined ? { runnerIdleTimeoutMs } : {}),
     ...(serviceAccountName ? { serviceAccountName } : {}),
     ...(sessionPvc ? { sessionPvc } : {}),
   };
@@ -214,6 +219,7 @@ export async function createKubernetesRunnerJob(options: { store: AgentRunStore;
     workReady: staticWorkReadyCapabilitySummary(),
     retention: {
       ttlSecondsAfterFinished: render.ttlSecondsAfterFinished,
+      runnerIdleTimeoutMs: render.runnerIdleTimeoutMs,
     },
     pollActions: [
       runnerJobActionDescriptor({ action: "inspect-run", operation: "describe", resourceKind: "run", resourceName: run.id, runId: run.id }),
@@ -479,6 +485,16 @@ function optionalString(value: unknown): string | undefined {
   return typeof value === "string" && value.trim().length > 0 ? value.trim() : undefined;
 }
 
+/**
+ * Validates an optional numeric request field. `undefined`/`null` mean "not provided";
+ * anything else must be a positive safe integer, or a 400 `schema-invalid` error is thrown.
+ */
+function optionalPositiveInteger(value: unknown, key: string): number | undefined {
+  if (value === undefined || value === null) return undefined;
+  if (typeof value !== "number" || !Number.isSafeInteger(value) || value <= 0) throw new AgentRunError("schema-invalid", `${key} must be a positive integer`, { httpStatus: 400 });
+  return value;
+}
+
 function objectPath(value: unknown, path: string[]): string | null {
   let current: unknown = value;
   for (const key of path) {
diff --git a/src/mgr/server.ts b/src/mgr/server.ts
index 8e4abd9..116a3b8 100644
--- a/src/mgr/server.ts
+++ b/src/mgr/server.ts
@@ -46,6 +46,7 @@ function runnerJobDefaultsForRequest(defaults: ManagerServerOptions["runnerJobDe
     ...optionalStringRecord("envIdentity", defaults?.envIdentity ?? process.env.AGENTRUN_ENV_IDENTITY),
     ...optionalStringRecord("artifactCatalogFile", defaults?.artifactCatalogFile ?? process.env.AGENTRUN_ARTIFACT_CATALOG_FILE),
     serviceAccountName: defaults?.serviceAccountName ?? process.env.AGENTRUN_RUNNER_SERVICE_ACCOUNT ?? "agentrun-v01-runner",
+    ...(defaults?.runnerIdleTimeoutMs !== undefined ? { runnerIdleTimeoutMs: defaults.runnerIdleTimeoutMs } : optionalPositiveIntegerRecord("runnerIdleTimeoutMs", process.env.AGENTRUN_RUNNER_IDLE_TIMEOUT_MS)),
     ...(defaults?.kubectlCommand ? { kubectlCommand: defaults.kubectlCommand } : {}),
     ...(defaults?.unideskSshEndpointEnv ? { unideskSshEndpointEnv: defaults.unideskSshEndpointEnv } : {}),
   };
@@ -64,6 +65,7 @@ export interface ManagerServerOptions {
     envIdentity?: string;
     artifactCatalogFile?: string;
     serviceAccountName?: string;
+    runnerIdleTimeoutMs?: number;
     kubectlCommand?: string;
     unideskSshEndpointEnv?: JsonRecord;
   };
@@ -947,6 +949,18 @@ function optionalStringRecord(key: string, value: unknown): JsonRecord {
   return typeof value === "string" && value.trim().length > 0 ? { [key]: value.trim() } : {};
 }
 
+/**
+ * Parses an optional positive-integer setting (for example an environment variable) into a
+ * single-key record. Unset, empty, or whitespace-only values yield `{}`, matching `optionalStringRecord`.
+ */
+function optionalPositiveIntegerRecord(key: string, value: unknown): JsonRecord {
+  const raw = typeof value === "string" ? value.trim() : value;
+  if (raw === undefined || raw === null || raw === "") return {};
+  const parsed = Number(raw);
+  if (!Number.isSafeInteger(parsed) || parsed <= 0) throw new AgentRunError("schema-invalid", `${key} must be a positive integer`, { httpStatus: 400 });
+  return { [key]: parsed };
+}
+
 function normalizeError(error: unknown): AgentRunError {
   if (error instanceof AgentRunError) return error;
   return new AgentRunError("infra-failed", error instanceof Error ? error.message : String(error), { httpStatus: 500 });
diff --git a/src/runner/k8s-job.ts b/src/runner/k8s-job.ts
index 5d1d8f9..cea28bc 100644
--- a/src/runner/k8s-job.ts
+++ b/src/runner/k8s-job.ts
@@ -7,6 +7,7 @@ import { gitTransportSummary, runnerGitTransportEnvVars } from "../common/git-tr
 const defaultBootRepoUrl = "http://git-mirror-http.devops-infra.svc.cluster.local/pikasTech/agentrun.git";
 const defaultResourceBinPath = "/usr/local/bin";
 const defaultCodexShellSandbox = "danger-full-access";
+const defaultRunnerIdleTimeoutMs = 600_000;
 const fallbackRunnerEgressProxyUrl = "http://g14-provider-egress-proxy.unidesk.svc.cluster.local:18789";
 const defaultRunnerNoProxyItems = [
   "localhost",
@@ -52,6 +53,7 @@ export interface RunnerJobRenderOptions {
   imagePullPolicy?: string;
   backoffLimit?: number;
   ttlSecondsAfterFinished?: number;
+  runnerIdleTimeoutMs?: number;
   transientEnv?: RunnerTransientEnv[];
   sessionPvc?: RunnerSessionPvcOptions;
   dryRun?: boolean;
@@ -133,6 +135,7 @@ export function renderRunnerJobDryRun(options: RunnerJobRenderOptions): JsonReco
     workReady: staticWorkReadyCapabilitySummary(),
     retention: {
       ttlSecondsAfterFinished: render.ttlSecondsAfterFinished,
+      runnerIdleTimeoutMs: render.runnerIdleTimeoutMs,
     },
     pollCommands: {
       run: `./scripts/agentrun runs show ${options.run.id} --manager-url ${options.managerUrl}`,
@@ -143,20 +146,21 @@ export function renderRunnerJobDryRun(options: RunnerJobRenderOptions): JsonReco
   };
 }
 
-export function renderRunnerJobManifest(options: RunnerJobRenderOptions): { manifest: JsonRecord; namespace: string; jobName: string; runnerId: string; attemptId: string; sourceCommit: string; serviceAccountName: string; secretRefs: CredentialProjection[]; toolCredentials: ToolCredentialProjection[]; warnings: string[]; ttlSecondsAfterFinished: number } {
+export function renderRunnerJobManifest(options: RunnerJobRenderOptions): { manifest: JsonRecord; namespace: string; jobName: string; runnerId: string; attemptId: string; sourceCommit: string; serviceAccountName: string; secretRefs: CredentialProjection[]; toolCredentials: ToolCredentialProjection[]; warnings: string[]; ttlSecondsAfterFinished: number; runnerIdleTimeoutMs: number } {
   const namespace = options.namespace ?? "agentrun-v01";
   const attemptId = options.attemptId ?? `attempt_${Date.now().toString(36)}`;
   const runnerId = options.runnerId ?? `runner_${shortHash(`${options.run.id}:${attemptId}:${options.commandId}`)}`;
   const sourceCommit = options.sourceCommit ?? process.env.AGENTRUN_SOURCE_COMMIT ?? "unknown";
   const serviceAccountName = options.serviceAccountName ?? "agentrun-v01-runner";
   const ttlSecondsAfterFinished = options.ttlSecondsAfterFinished ?? 86_400;
+  const runnerIdleTimeoutMs = normalizeRunnerIdleTimeoutMs(options.runnerIdleTimeoutMs);
   const jobName = `agentrun-v01-runner-${shortDnsHash(options.run.id, attemptId)}`;
   const secretRefs = credentialProjections(options.run, namespace);
   const toolCredentials = toolCredentialProjections(options.run, namespace);
   const sessionPvc = options.sessionPvc;
   const warnings: string[] = [];
   if (secretRefs.length === 0) warnings.push("run executionPolicy.secretScope 未声明 provider SecretRef;runner 将按 secret-unavailable 上报,而不会降级直连外部凭据");
-  const env = runnerEnv(options, { namespace, jobName, runnerId, attemptId, sourceCommit, secretRefs, toolCredentials, sessionPvc });
+  const env = runnerEnv(options, { namespace, jobName, runnerId, attemptId, sourceCommit, secretRefs, toolCredentials, sessionPvc, runnerIdleTimeoutMs });
   const manifest: JsonRecord = {
     apiVersion: "batch/v1",
     kind: "Job",
@@ -219,10 +223,10 @@ export function renderRunnerJobManifest(options: RunnerJobRenderOptions): { mani
       },
     },
   };
-  return { manifest, namespace, jobName, runnerId, attemptId, sourceCommit, serviceAccountName, secretRefs, toolCredentials, warnings, ttlSecondsAfterFinished };
+  return { manifest, namespace, jobName, runnerId, attemptId, sourceCommit, serviceAccountName, secretRefs, toolCredentials, warnings, ttlSecondsAfterFinished, runnerIdleTimeoutMs };
 }
 
-function runnerEnv(options: RunnerJobRenderOptions, context: { namespace: string; jobName: string; runnerId: string; attemptId: string; sourceCommit: string; secretRefs: CredentialProjection[]; toolCredentials: ToolCredentialProjection[]; sessionPvc: RunnerSessionPvcOptions | undefined }): JsonRecord[] {
+function runnerEnv(options: RunnerJobRenderOptions, context: { namespace: string; jobName: string; runnerId: string; attemptId: string; sourceCommit: string; secretRefs: CredentialProjection[]; toolCredentials: ToolCredentialProjection[]; sessionPvc: RunnerSessionPvcOptions | undefined; runnerIdleTimeoutMs: number }): JsonRecord[] {
   const selectedSecret = context.secretRefs.find((item) => item.profile === options.run.backendProfile);
   const codexHome = selectedSecret?.runtimeMountPath ?? defaultRuntimeHome(options.run.backendProfile);
   const bootRepoUrl = optionalString(options.bootRepoUrl) ?? defaultBootRepoUrl;
@@ -250,7 +254,7 @@ function runnerEnv(options: RunnerJobRenderOptions, context: { namespace: string
     { name: "AGENTRUN_LOG_PATH", value: "/tmp/agentrun-runner.jsonl" },
     { name: "AGENTRUN_WORK_READY_VERSION", value: String(staticWorkReadyCapabilitySummary().version) },
     { name: "AGENTRUN_PROJECT_DEPENDENCY_POLICY", value: "explicit-cache-or-derived-image-only" },
-    { name: "AGENTRUN_RUNNER_IDLE_TIMEOUT_MS", value: "600000" },
+    { name: "AGENTRUN_RUNNER_IDLE_TIMEOUT_MS", value: String(context.runnerIdleTimeoutMs) },
     { name: "AGENTRUN_RUNNER_POLL_INTERVAL_MS", value: "250" },
     { name: "HOME", value: "/home/agentrun" },
     { name: "CODEX_HOME", value: codexHome },
@@ -268,6 +272,16 @@ function runnerEnv(options: RunnerJobRenderOptions, context: { namespace: string
   ]);
 }
 
+/**
+ * Resolves the idle timeout rendered into AGENTRUN_RUNNER_IDLE_TIMEOUT_MS: falls back to
+ * `defaultRunnerIdleTimeoutMs` when unset, otherwise requires a positive safe integer.
+ */
+function normalizeRunnerIdleTimeoutMs(value: number | undefined): number {
+  if (value === undefined) return defaultRunnerIdleTimeoutMs;
+  if (!Number.isSafeInteger(value) || value <= 0) throw new Error("runnerIdleTimeoutMs must be a positive integer");
+  return value;
+}
+
 function codexShellSandbox(policy: ExecutionPolicy): string {
   if (policy.sandbox === "workspace-write") return defaultCodexShellSandbox;
   return policy.sandbox;
diff --git a/src/runner/run-once.ts b/src/runner/run-once.ts
index 8922951..57f6e03 100644
--- a/src/runner/run-once.ts
+++ b/src/runner/run-once.ts
@@ -135,6 +135,7 @@ export async function runOnce(options: RunnerOnceOptions): Promise {
       ? await reportCommandFailure(api, options.runId, command.id, runner, attemptId, materializationFailure, "runner:resource-bundle", { terminalRun: true })
       : await executeCommand(api, withResourceAssembly(options, resourceEnv, initialPrompt), command, runner, attemptId, workspacePath, backendSession ?? (backendSession = createBackendSession(currentRun, withResourceAssembly(options, resourceEnv, initialPrompt))));
     commandResults.push(result);
+    idleSince = Date.now();
     if (options.oneShot === true) {
       const run = await api.reportStatus(options.runId, { terminalStatus: result.terminalStatus, failureKind: result.failureKind, failureMessage: null });
       return { runner, commandId: command.id, terminalStatus: result.terminalStatus, failureKind: result.failureKind, run, commandsProcessed: commandResults.length, commandResults, stopped: "one-shot" } as JsonRecord;