Merge pull request #190 from pikasTech/fix/issue-1326-runner-persistence

fix: 持久化 runner 空闲保活策略
This commit is contained in:
Lyon
2026-06-16 13:01:58 +08:00
committed by GitHub
4 changed files with 37 additions and 5 deletions
+12
View File
@@ -44,6 +44,7 @@ export interface RunnerJobDefaults {
envIdentity?: string;
artifactCatalogFile?: string;
serviceAccountName?: string;
runnerIdleTimeoutMs?: number;
kubectlCommand?: string;
unideskSshEndpointEnv?: JsonRecord;
}
@@ -57,6 +58,7 @@ export interface CreateRunnerJobInput extends JsonRecord {
runnerId?: string;
sourceCommit?: string;
serviceAccountName?: string;
runnerIdleTimeoutMs?: number;
idempotencyKey?: string;
imageRef?: JsonRecord;
transientEnv?: JsonRecord[];
@@ -85,6 +87,7 @@ export async function createKubernetesRunnerJob(options: { store: AgentRunStore;
const transientEnv = assembleToolContextTransientEnv(run.executionPolicy, transientEnvField(options.input.transientEnv), options.defaults);
const attemptId = optionalString(options.input.attemptId) ?? `attempt_${Date.now().toString(36)}`;
const runnerId = optionalString(options.input.runnerId);
const runnerIdleTimeoutMs = optionalPositiveInteger(options.input.runnerIdleTimeoutMs, "runnerIdleTimeoutMs") ?? options.defaults.runnerIdleTimeoutMs;
const transientEnvSecretName = transientEnv.length > 0 ? transientEnvSecretNameForRun(run.id, commandId, attemptId) : null;
const renderTransientEnv = transientEnvSecretName ? transientEnvWithSecretRefs(transientEnv, transientEnvSecretName) : transientEnv;
const normalizedPayload = {
@@ -97,6 +100,7 @@ export async function createKubernetesRunnerJob(options: { store: AgentRunStore;
serviceAccountName: serviceAccountName ?? null,
attemptId: optionalString(options.input.attemptId) ?? null,
runnerId: optionalString(options.input.runnerId) ?? null,
runnerIdleTimeoutMs: runnerIdleTimeoutMs ?? null,
transientEnv: transientEnv.map((item) => ({ name: item.name, valueHash: stableHash(item.value), sensitive: true })),
};
const payloadHash = stableHash(normalizedPayload);
@@ -150,6 +154,7 @@ export async function createKubernetesRunnerJob(options: { store: AgentRunStore;
namespace,
sourceCommit,
transientEnv: renderTransientEnv,
...(runnerIdleTimeoutMs !== undefined ? { runnerIdleTimeoutMs } : {}),
...(serviceAccountName ? { serviceAccountName } : {}),
...(sessionPvc ? { sessionPvc } : {}),
};
@@ -214,6 +219,7 @@ export async function createKubernetesRunnerJob(options: { store: AgentRunStore;
workReady: staticWorkReadyCapabilitySummary(),
retention: {
ttlSecondsAfterFinished: render.ttlSecondsAfterFinished,
runnerIdleTimeoutMs: render.runnerIdleTimeoutMs,
},
pollActions: [
runnerJobActionDescriptor({ action: "inspect-run", operation: "describe", resourceKind: "run", resourceName: run.id, runId: run.id }),
@@ -479,6 +485,12 @@ function optionalString(value: unknown): string | undefined {
return typeof value === "string" && value.trim().length > 0 ? value.trim() : undefined;
}
function optionalPositiveInteger(value: unknown, key: string): number | undefined {
if (value === undefined || value === null) return undefined;
if (!Number.isInteger(value) || Number(value) <= 0) throw new AgentRunError("schema-invalid", `${key} must be a positive integer`, { httpStatus: 400 });
return Number(value);
}
function objectPath(value: unknown, path: string[]): string | null {
let current: unknown = value;
for (const key of path) {
+9
View File
@@ -46,6 +46,7 @@ function runnerJobDefaultsForRequest(defaults: ManagerServerOptions["runnerJobDe
...optionalStringRecord("envIdentity", defaults?.envIdentity ?? process.env.AGENTRUN_ENV_IDENTITY),
...optionalStringRecord("artifactCatalogFile", defaults?.artifactCatalogFile ?? process.env.AGENTRUN_ARTIFACT_CATALOG_FILE),
serviceAccountName: defaults?.serviceAccountName ?? process.env.AGENTRUN_RUNNER_SERVICE_ACCOUNT ?? "agentrun-v01-runner",
...(defaults?.runnerIdleTimeoutMs !== undefined ? { runnerIdleTimeoutMs: defaults.runnerIdleTimeoutMs } : optionalPositiveIntegerRecord("runnerIdleTimeoutMs", process.env.AGENTRUN_RUNNER_IDLE_TIMEOUT_MS)),
...(defaults?.kubectlCommand ? { kubectlCommand: defaults.kubectlCommand } : {}),
...(defaults?.unideskSshEndpointEnv ? { unideskSshEndpointEnv: defaults.unideskSshEndpointEnv } : {}),
};
@@ -64,6 +65,7 @@ export interface ManagerServerOptions {
envIdentity?: string;
artifactCatalogFile?: string;
serviceAccountName?: string;
runnerIdleTimeoutMs?: number;
kubectlCommand?: string;
unideskSshEndpointEnv?: JsonRecord;
};
@@ -947,6 +949,13 @@ function optionalStringRecord(key: string, value: unknown): JsonRecord {
return typeof value === "string" && value.trim().length > 0 ? { [key]: value.trim() } : {};
}
function optionalPositiveIntegerRecord(key: string, value: unknown): JsonRecord {
if (value === undefined || value === null || value === "") return {};
const parsed = Number(value);
if (!Number.isInteger(parsed) || parsed <= 0) throw new AgentRunError("schema-invalid", `${key} must be a positive integer`, { httpStatus: 400 });
return { [key]: parsed };
}
function normalizeError(error: unknown): AgentRunError {
if (error instanceof AgentRunError) return error;
return new AgentRunError("infra-failed", error instanceof Error ? error.message : String(error), { httpStatus: 500 });
+15 -5
View File
@@ -7,6 +7,7 @@ import { gitTransportSummary, runnerGitTransportEnvVars } from "../common/git-tr
const defaultBootRepoUrl = "http://git-mirror-http.devops-infra.svc.cluster.local/pikasTech/agentrun.git";
const defaultResourceBinPath = "/usr/local/bin";
const defaultCodexShellSandbox = "danger-full-access";
const defaultRunnerIdleTimeoutMs = 600_000;
const fallbackRunnerEgressProxyUrl = "http://g14-provider-egress-proxy.unidesk.svc.cluster.local:18789";
const defaultRunnerNoProxyItems = [
"localhost",
@@ -52,6 +53,7 @@ export interface RunnerJobRenderOptions {
imagePullPolicy?: string;
backoffLimit?: number;
ttlSecondsAfterFinished?: number;
runnerIdleTimeoutMs?: number;
transientEnv?: RunnerTransientEnv[];
sessionPvc?: RunnerSessionPvcOptions;
dryRun?: boolean;
@@ -133,6 +135,7 @@ export function renderRunnerJobDryRun(options: RunnerJobRenderOptions): JsonReco
workReady: staticWorkReadyCapabilitySummary(),
retention: {
ttlSecondsAfterFinished: render.ttlSecondsAfterFinished,
runnerIdleTimeoutMs: render.runnerIdleTimeoutMs,
},
pollCommands: {
run: `./scripts/agentrun runs show ${options.run.id} --manager-url ${options.managerUrl}`,
@@ -143,20 +146,21 @@ export function renderRunnerJobDryRun(options: RunnerJobRenderOptions): JsonReco
};
}
export function renderRunnerJobManifest(options: RunnerJobRenderOptions): { manifest: JsonRecord; namespace: string; jobName: string; runnerId: string; attemptId: string; sourceCommit: string; serviceAccountName: string; secretRefs: CredentialProjection[]; toolCredentials: ToolCredentialProjection[]; warnings: string[]; ttlSecondsAfterFinished: number } {
export function renderRunnerJobManifest(options: RunnerJobRenderOptions): { manifest: JsonRecord; namespace: string; jobName: string; runnerId: string; attemptId: string; sourceCommit: string; serviceAccountName: string; secretRefs: CredentialProjection[]; toolCredentials: ToolCredentialProjection[]; warnings: string[]; ttlSecondsAfterFinished: number; runnerIdleTimeoutMs: number } {
const namespace = options.namespace ?? "agentrun-v01";
const attemptId = options.attemptId ?? `attempt_${Date.now().toString(36)}`;
const runnerId = options.runnerId ?? `runner_${shortHash(`${options.run.id}:${attemptId}:${options.commandId}`)}`;
const sourceCommit = options.sourceCommit ?? process.env.AGENTRUN_SOURCE_COMMIT ?? "unknown";
const serviceAccountName = options.serviceAccountName ?? "agentrun-v01-runner";
const ttlSecondsAfterFinished = options.ttlSecondsAfterFinished ?? 86_400;
const runnerIdleTimeoutMs = normalizeRunnerIdleTimeoutMs(options.runnerIdleTimeoutMs);
const jobName = `agentrun-v01-runner-${shortDnsHash(options.run.id, attemptId)}`;
const secretRefs = credentialProjections(options.run, namespace);
const toolCredentials = toolCredentialProjections(options.run, namespace);
const sessionPvc = options.sessionPvc;
const warnings: string[] = [];
if (secretRefs.length === 0) warnings.push("run executionPolicy.secretScope 未声明 provider SecretRefrunner 将按 secret-unavailable 上报,而不会降级直连外部凭据");
const env = runnerEnv(options, { namespace, jobName, runnerId, attemptId, sourceCommit, secretRefs, toolCredentials, sessionPvc });
const env = runnerEnv(options, { namespace, jobName, runnerId, attemptId, sourceCommit, secretRefs, toolCredentials, sessionPvc, runnerIdleTimeoutMs });
const manifest: JsonRecord = {
apiVersion: "batch/v1",
kind: "Job",
@@ -219,10 +223,10 @@ export function renderRunnerJobManifest(options: RunnerJobRenderOptions): { mani
},
},
};
return { manifest, namespace, jobName, runnerId, attemptId, sourceCommit, serviceAccountName, secretRefs, toolCredentials, warnings, ttlSecondsAfterFinished };
return { manifest, namespace, jobName, runnerId, attemptId, sourceCommit, serviceAccountName, secretRefs, toolCredentials, warnings, ttlSecondsAfterFinished, runnerIdleTimeoutMs };
}
function runnerEnv(options: RunnerJobRenderOptions, context: { namespace: string; jobName: string; runnerId: string; attemptId: string; sourceCommit: string; secretRefs: CredentialProjection[]; toolCredentials: ToolCredentialProjection[]; sessionPvc: RunnerSessionPvcOptions | undefined }): JsonRecord[] {
function runnerEnv(options: RunnerJobRenderOptions, context: { namespace: string; jobName: string; runnerId: string; attemptId: string; sourceCommit: string; secretRefs: CredentialProjection[]; toolCredentials: ToolCredentialProjection[]; sessionPvc: RunnerSessionPvcOptions | undefined; runnerIdleTimeoutMs: number }): JsonRecord[] {
const selectedSecret = context.secretRefs.find((item) => item.profile === options.run.backendProfile);
const codexHome = selectedSecret?.runtimeMountPath ?? defaultRuntimeHome(options.run.backendProfile);
const bootRepoUrl = optionalString(options.bootRepoUrl) ?? defaultBootRepoUrl;
@@ -250,7 +254,7 @@ function runnerEnv(options: RunnerJobRenderOptions, context: { namespace: string
{ name: "AGENTRUN_LOG_PATH", value: "/tmp/agentrun-runner.jsonl" },
{ name: "AGENTRUN_WORK_READY_VERSION", value: String(staticWorkReadyCapabilitySummary().version) },
{ name: "AGENTRUN_PROJECT_DEPENDENCY_POLICY", value: "explicit-cache-or-derived-image-only" },
{ name: "AGENTRUN_RUNNER_IDLE_TIMEOUT_MS", value: "600000" },
{ name: "AGENTRUN_RUNNER_IDLE_TIMEOUT_MS", value: String(context.runnerIdleTimeoutMs) },
{ name: "AGENTRUN_RUNNER_POLL_INTERVAL_MS", value: "250" },
{ name: "HOME", value: "/home/agentrun" },
{ name: "CODEX_HOME", value: codexHome },
@@ -268,6 +272,12 @@ function runnerEnv(options: RunnerJobRenderOptions, context: { namespace: string
]);
}
function normalizeRunnerIdleTimeoutMs(value: number | undefined): number {
if (value === undefined) return defaultRunnerIdleTimeoutMs;
if (!Number.isInteger(value) || value <= 0) throw new Error("runnerIdleTimeoutMs must be a positive integer");
return value;
}
function codexShellSandbox(policy: ExecutionPolicy): string {
if (policy.sandbox === "workspace-write") return defaultCodexShellSandbox;
return policy.sandbox;
+1
View File
@@ -135,6 +135,7 @@ export async function runOnce(options: RunnerOnceOptions): Promise<JsonRecord> {
? await reportCommandFailure(api, options.runId, command.id, runner, attemptId, materializationFailure, "runner:resource-bundle", { terminalRun: true })
: await executeCommand(api, withResourceAssembly(options, resourceEnv, initialPrompt), command, runner, attemptId, workspacePath, backendSession ?? (backendSession = createBackendSession(currentRun, withResourceAssembly(options, resourceEnv, initialPrompt))));
commandResults.push(result);
idleSince = Date.now();
if (options.oneShot === true) {
const run = await api.reportStatus(options.runId, { terminalStatus: result.terminalStatus, failureKind: result.failureKind, failureMessage: null });
return { runner, commandId: command.id, terminalStatus: result.terminalStatus, failureKind: result.failureKind, run, commandsProcessed: commandResults.length, commandResults, stopped: "one-shot" } as JsonRecord;