fix: 持久化 runner 空闲保活策略
This commit is contained in:
@@ -44,6 +44,7 @@ export interface RunnerJobDefaults {
|
||||
envIdentity?: string;
|
||||
artifactCatalogFile?: string;
|
||||
serviceAccountName?: string;
|
||||
runnerIdleTimeoutMs?: number;
|
||||
kubectlCommand?: string;
|
||||
unideskSshEndpointEnv?: JsonRecord;
|
||||
}
|
||||
@@ -57,6 +58,7 @@ export interface CreateRunnerJobInput extends JsonRecord {
|
||||
runnerId?: string;
|
||||
sourceCommit?: string;
|
||||
serviceAccountName?: string;
|
||||
runnerIdleTimeoutMs?: number;
|
||||
idempotencyKey?: string;
|
||||
imageRef?: JsonRecord;
|
||||
transientEnv?: JsonRecord[];
|
||||
@@ -85,6 +87,7 @@ export async function createKubernetesRunnerJob(options: { store: AgentRunStore;
|
||||
const transientEnv = assembleToolContextTransientEnv(run.executionPolicy, transientEnvField(options.input.transientEnv), options.defaults);
|
||||
const attemptId = optionalString(options.input.attemptId) ?? `attempt_${Date.now().toString(36)}`;
|
||||
const runnerId = optionalString(options.input.runnerId);
|
||||
const runnerIdleTimeoutMs = optionalPositiveInteger(options.input.runnerIdleTimeoutMs, "runnerIdleTimeoutMs") ?? options.defaults.runnerIdleTimeoutMs;
|
||||
const transientEnvSecretName = transientEnv.length > 0 ? transientEnvSecretNameForRun(run.id, commandId, attemptId) : null;
|
||||
const renderTransientEnv = transientEnvSecretName ? transientEnvWithSecretRefs(transientEnv, transientEnvSecretName) : transientEnv;
|
||||
const normalizedPayload = {
|
||||
@@ -97,6 +100,7 @@ export async function createKubernetesRunnerJob(options: { store: AgentRunStore;
|
||||
serviceAccountName: serviceAccountName ?? null,
|
||||
attemptId: optionalString(options.input.attemptId) ?? null,
|
||||
runnerId: optionalString(options.input.runnerId) ?? null,
|
||||
runnerIdleTimeoutMs: runnerIdleTimeoutMs ?? null,
|
||||
transientEnv: transientEnv.map((item) => ({ name: item.name, valueHash: stableHash(item.value), sensitive: true })),
|
||||
};
|
||||
const payloadHash = stableHash(normalizedPayload);
|
||||
@@ -150,6 +154,7 @@ export async function createKubernetesRunnerJob(options: { store: AgentRunStore;
|
||||
namespace,
|
||||
sourceCommit,
|
||||
transientEnv: renderTransientEnv,
|
||||
...(runnerIdleTimeoutMs !== undefined ? { runnerIdleTimeoutMs } : {}),
|
||||
...(serviceAccountName ? { serviceAccountName } : {}),
|
||||
...(sessionPvc ? { sessionPvc } : {}),
|
||||
};
|
||||
@@ -214,6 +219,7 @@ export async function createKubernetesRunnerJob(options: { store: AgentRunStore;
|
||||
workReady: staticWorkReadyCapabilitySummary(),
|
||||
retention: {
|
||||
ttlSecondsAfterFinished: render.ttlSecondsAfterFinished,
|
||||
runnerIdleTimeoutMs: render.runnerIdleTimeoutMs,
|
||||
},
|
||||
pollActions: [
|
||||
runnerJobActionDescriptor({ action: "inspect-run", operation: "describe", resourceKind: "run", resourceName: run.id, runId: run.id }),
|
||||
@@ -479,6 +485,12 @@ function optionalString(value: unknown): string | undefined {
|
||||
return typeof value === "string" && value.trim().length > 0 ? value.trim() : undefined;
|
||||
}
|
||||
|
||||
/**
 * Reads an optional request field that must be a positive integer.
 *
 * @param value - Raw field value; `undefined` and `null` both mean "not provided".
 * @param key - Field name, used only to build the error message.
 * @returns The validated number, or `undefined` when the field was not provided.
 * @throws AgentRunError (`schema-invalid`, HTTP 400) when the value is present but is not a
 *   positive safe integer. Numeric strings are rejected rather than coerced.
 */
function optionalPositiveInteger(value: unknown, key: string): number | undefined {
  if (value === undefined || value === null) return undefined;
  // isSafeInteger (rather than isInteger) also rejects magnitudes above 2^53 - 1: they cannot be
  // represented exactly, and from 1e21 upward they stringify in exponent notation ("1e+21").
  if (typeof value !== "number" || !Number.isSafeInteger(value) || value <= 0) {
    throw new AgentRunError("schema-invalid", `${key} must be a positive integer`, { httpStatus: 400 });
  }
  return value;
}
|
||||
|
||||
function objectPath(value: unknown, path: string[]): string | null {
|
||||
let current: unknown = value;
|
||||
for (const key of path) {
|
||||
|
||||
@@ -46,6 +46,7 @@ function runnerJobDefaultsForRequest(defaults: ManagerServerOptions["runnerJobDe
|
||||
...optionalStringRecord("envIdentity", defaults?.envIdentity ?? process.env.AGENTRUN_ENV_IDENTITY),
|
||||
...optionalStringRecord("artifactCatalogFile", defaults?.artifactCatalogFile ?? process.env.AGENTRUN_ARTIFACT_CATALOG_FILE),
|
||||
serviceAccountName: defaults?.serviceAccountName ?? process.env.AGENTRUN_RUNNER_SERVICE_ACCOUNT ?? "agentrun-v01-runner",
|
||||
...(defaults?.runnerIdleTimeoutMs !== undefined ? { runnerIdleTimeoutMs: defaults.runnerIdleTimeoutMs } : optionalPositiveIntegerRecord("runnerIdleTimeoutMs", process.env.AGENTRUN_RUNNER_IDLE_TIMEOUT_MS)),
|
||||
...(defaults?.kubectlCommand ? { kubectlCommand: defaults.kubectlCommand } : {}),
|
||||
...(defaults?.unideskSshEndpointEnv ? { unideskSshEndpointEnv: defaults.unideskSshEndpointEnv } : {}),
|
||||
};
|
||||
@@ -64,6 +65,7 @@ export interface ManagerServerOptions {
|
||||
envIdentity?: string;
|
||||
artifactCatalogFile?: string;
|
||||
serviceAccountName?: string;
|
||||
runnerIdleTimeoutMs?: number;
|
||||
kubectlCommand?: string;
|
||||
unideskSshEndpointEnv?: JsonRecord;
|
||||
};
|
||||
@@ -947,6 +949,13 @@ function optionalStringRecord(key: string, value: unknown): JsonRecord {
|
||||
return typeof value === "string" && value.trim().length > 0 ? { [key]: value.trim() } : {};
|
||||
}
|
||||
|
||||
/**
 * Builds a single-key record for an optional positive-integer setting, suitable for spreading
 * into a defaults object.
 *
 * @param key - Property name for the resulting record; also used in the error message.
 * @param value - Raw setting, typically an environment-variable string or a number.
 * @returns `{ [key]: parsed }` when a valid value is present, or `{}` when the value is
 *   `undefined`, `null`, or a blank string.
 * @throws AgentRunError (`schema-invalid`, HTTP 400) when a value is present but does not parse
 *   to a positive safe integer.
 */
function optionalPositiveIntegerRecord(key: string, value: unknown): JsonRecord {
  if (value === undefined || value === null) return {};

  let parsed: number;
  if (typeof value === "string") {
    const text = value.trim();
    // A whitespace-only string counts as "unset"; Number("  ") would otherwise yield 0 and throw.
    if (text.length === 0) return {};
    // Number() keeps the original string parsing rules (it rejects trailing garbage such as "10ms").
    parsed = Number(text);
  } else if (typeof value === "number") {
    parsed = value;
  } else {
    // Booleans, arrays and objects are never valid here; do not let Number() coerce true -> 1 or [5] -> 5.
    parsed = Number.NaN;
  }

  // isSafeInteger also rejects magnitudes above 2^53 - 1, which cannot be represented exactly.
  if (!Number.isSafeInteger(parsed) || parsed <= 0) {
    throw new AgentRunError("schema-invalid", `${key} must be a positive integer`, { httpStatus: 400 });
  }
  return { [key]: parsed };
}
|
||||
|
||||
function normalizeError(error: unknown): AgentRunError {
|
||||
if (error instanceof AgentRunError) return error;
|
||||
return new AgentRunError("infra-failed", error instanceof Error ? error.message : String(error), { httpStatus: 500 });
|
||||
|
||||
+15
-5
@@ -7,6 +7,7 @@ import { gitTransportSummary, runnerGitTransportEnvVars } from "../common/git-tr
|
||||
const defaultBootRepoUrl = "http://git-mirror-http.devops-infra.svc.cluster.local/pikasTech/agentrun.git";
|
||||
const defaultResourceBinPath = "/usr/local/bin";
|
||||
const defaultCodexShellSandbox = "danger-full-access";
|
||||
const defaultRunnerIdleTimeoutMs = 600_000;
|
||||
const fallbackRunnerEgressProxyUrl = "http://g14-provider-egress-proxy.unidesk.svc.cluster.local:18789";
|
||||
const defaultRunnerNoProxyItems = [
|
||||
"localhost",
|
||||
@@ -52,6 +53,7 @@ export interface RunnerJobRenderOptions {
|
||||
imagePullPolicy?: string;
|
||||
backoffLimit?: number;
|
||||
ttlSecondsAfterFinished?: number;
|
||||
runnerIdleTimeoutMs?: number;
|
||||
transientEnv?: RunnerTransientEnv[];
|
||||
sessionPvc?: RunnerSessionPvcOptions;
|
||||
dryRun?: boolean;
|
||||
@@ -133,6 +135,7 @@ export function renderRunnerJobDryRun(options: RunnerJobRenderOptions): JsonReco
|
||||
workReady: staticWorkReadyCapabilitySummary(),
|
||||
retention: {
|
||||
ttlSecondsAfterFinished: render.ttlSecondsAfterFinished,
|
||||
runnerIdleTimeoutMs: render.runnerIdleTimeoutMs,
|
||||
},
|
||||
pollCommands: {
|
||||
run: `./scripts/agentrun runs show ${options.run.id} --manager-url ${options.managerUrl}`,
|
||||
@@ -143,20 +146,21 @@ export function renderRunnerJobDryRun(options: RunnerJobRenderOptions): JsonReco
|
||||
};
|
||||
}
|
||||
|
||||
export function renderRunnerJobManifest(options: RunnerJobRenderOptions): { manifest: JsonRecord; namespace: string; jobName: string; runnerId: string; attemptId: string; sourceCommit: string; serviceAccountName: string; secretRefs: CredentialProjection[]; toolCredentials: ToolCredentialProjection[]; warnings: string[]; ttlSecondsAfterFinished: number } {
|
||||
export function renderRunnerJobManifest(options: RunnerJobRenderOptions): { manifest: JsonRecord; namespace: string; jobName: string; runnerId: string; attemptId: string; sourceCommit: string; serviceAccountName: string; secretRefs: CredentialProjection[]; toolCredentials: ToolCredentialProjection[]; warnings: string[]; ttlSecondsAfterFinished: number; runnerIdleTimeoutMs: number } {
|
||||
const namespace = options.namespace ?? "agentrun-v01";
|
||||
const attemptId = options.attemptId ?? `attempt_${Date.now().toString(36)}`;
|
||||
const runnerId = options.runnerId ?? `runner_${shortHash(`${options.run.id}:${attemptId}:${options.commandId}`)}`;
|
||||
const sourceCommit = options.sourceCommit ?? process.env.AGENTRUN_SOURCE_COMMIT ?? "unknown";
|
||||
const serviceAccountName = options.serviceAccountName ?? "agentrun-v01-runner";
|
||||
const ttlSecondsAfterFinished = options.ttlSecondsAfterFinished ?? 86_400;
|
||||
const runnerIdleTimeoutMs = normalizeRunnerIdleTimeoutMs(options.runnerIdleTimeoutMs);
|
||||
const jobName = `agentrun-v01-runner-${shortDnsHash(options.run.id, attemptId)}`;
|
||||
const secretRefs = credentialProjections(options.run, namespace);
|
||||
const toolCredentials = toolCredentialProjections(options.run, namespace);
|
||||
const sessionPvc = options.sessionPvc;
|
||||
const warnings: string[] = [];
|
||||
if (secretRefs.length === 0) warnings.push("run executionPolicy.secretScope 未声明 provider SecretRef;runner 将按 secret-unavailable 上报,而不会降级直连外部凭据");
|
||||
const env = runnerEnv(options, { namespace, jobName, runnerId, attemptId, sourceCommit, secretRefs, toolCredentials, sessionPvc });
|
||||
const env = runnerEnv(options, { namespace, jobName, runnerId, attemptId, sourceCommit, secretRefs, toolCredentials, sessionPvc, runnerIdleTimeoutMs });
|
||||
const manifest: JsonRecord = {
|
||||
apiVersion: "batch/v1",
|
||||
kind: "Job",
|
||||
@@ -219,10 +223,10 @@ export function renderRunnerJobManifest(options: RunnerJobRenderOptions): { mani
|
||||
},
|
||||
},
|
||||
};
|
||||
return { manifest, namespace, jobName, runnerId, attemptId, sourceCommit, serviceAccountName, secretRefs, toolCredentials, warnings, ttlSecondsAfterFinished };
|
||||
return { manifest, namespace, jobName, runnerId, attemptId, sourceCommit, serviceAccountName, secretRefs, toolCredentials, warnings, ttlSecondsAfterFinished, runnerIdleTimeoutMs };
|
||||
}
|
||||
|
||||
function runnerEnv(options: RunnerJobRenderOptions, context: { namespace: string; jobName: string; runnerId: string; attemptId: string; sourceCommit: string; secretRefs: CredentialProjection[]; toolCredentials: ToolCredentialProjection[]; sessionPvc: RunnerSessionPvcOptions | undefined }): JsonRecord[] {
|
||||
function runnerEnv(options: RunnerJobRenderOptions, context: { namespace: string; jobName: string; runnerId: string; attemptId: string; sourceCommit: string; secretRefs: CredentialProjection[]; toolCredentials: ToolCredentialProjection[]; sessionPvc: RunnerSessionPvcOptions | undefined; runnerIdleTimeoutMs: number }): JsonRecord[] {
|
||||
const selectedSecret = context.secretRefs.find((item) => item.profile === options.run.backendProfile);
|
||||
const codexHome = selectedSecret?.runtimeMountPath ?? defaultRuntimeHome(options.run.backendProfile);
|
||||
const bootRepoUrl = optionalString(options.bootRepoUrl) ?? defaultBootRepoUrl;
|
||||
@@ -250,7 +254,7 @@ function runnerEnv(options: RunnerJobRenderOptions, context: { namespace: string
|
||||
{ name: "AGENTRUN_LOG_PATH", value: "/tmp/agentrun-runner.jsonl" },
|
||||
{ name: "AGENTRUN_WORK_READY_VERSION", value: String(staticWorkReadyCapabilitySummary().version) },
|
||||
{ name: "AGENTRUN_PROJECT_DEPENDENCY_POLICY", value: "explicit-cache-or-derived-image-only" },
|
||||
{ name: "AGENTRUN_RUNNER_IDLE_TIMEOUT_MS", value: "600000" },
|
||||
{ name: "AGENTRUN_RUNNER_IDLE_TIMEOUT_MS", value: String(context.runnerIdleTimeoutMs) },
|
||||
{ name: "AGENTRUN_RUNNER_POLL_INTERVAL_MS", value: "250" },
|
||||
{ name: "HOME", value: "/home/agentrun" },
|
||||
{ name: "CODEX_HOME", value: codexHome },
|
||||
@@ -268,6 +272,12 @@ function runnerEnv(options: RunnerJobRenderOptions, context: { namespace: string
|
||||
]);
|
||||
}
|
||||
|
||||
/**
 * Resolves the runner idle timeout used when rendering a runner job.
 *
 * @param value - Caller-supplied timeout in milliseconds, or `undefined` to use the default.
 * @returns `defaultRunnerIdleTimeoutMs` when `value` is `undefined`, otherwise `value` unchanged.
 * @throws Error when `value` is provided but is not a positive safe integer.
 */
function normalizeRunnerIdleTimeoutMs(value: number | undefined): number {
  if (value === undefined) return defaultRunnerIdleTimeoutMs;
  // isSafeInteger (rather than isInteger) rejects magnitudes above 2^53 - 1; those lose precision
  // and, from 1e21 upward, String(value) produces exponent notation instead of plain digits.
  if (!Number.isSafeInteger(value) || value <= 0) {
    throw new Error("runnerIdleTimeoutMs must be a positive integer");
  }
  return value;
}
|
||||
|
||||
function codexShellSandbox(policy: ExecutionPolicy): string {
|
||||
if (policy.sandbox === "workspace-write") return defaultCodexShellSandbox;
|
||||
return policy.sandbox;
|
||||
|
||||
@@ -135,6 +135,7 @@ export async function runOnce(options: RunnerOnceOptions): Promise<JsonRecord> {
|
||||
? await reportCommandFailure(api, options.runId, command.id, runner, attemptId, materializationFailure, "runner:resource-bundle", { terminalRun: true })
|
||||
: await executeCommand(api, withResourceAssembly(options, resourceEnv, initialPrompt), command, runner, attemptId, workspacePath, backendSession ?? (backendSession = createBackendSession(currentRun, withResourceAssembly(options, resourceEnv, initialPrompt))));
|
||||
commandResults.push(result);
|
||||
idleSince = Date.now();
|
||||
if (options.oneShot === true) {
|
||||
const run = await api.reportStatus(options.runId, { terminalStatus: result.terminalStatus, failureKind: result.failureKind, failureMessage: null });
|
||||
return { runner, commandId: command.id, terminalStatus: result.terminalStatus, failureKind: result.failureKind, run, commandsProcessed: commandResults.length, commandResults, stopped: "one-shot" } as JsonRecord;
|
||||
|
||||
Reference in New Issue
Block a user