From d709b74bb32a3d136a8633e2cb4b19d1e518a5ee Mon Sep 17 00:00:00 2001 From: Codex Date: Thu, 21 May 2026 12:55:11 +0000 Subject: [PATCH] fix: route artifact registry health via remote control plane --- ...-preflight-classification-contract-test.ts | 187 ++++++++++++-- scripts/src/artifact-registry.ts | 234 +++++++++++++++++- scripts/src/remote.ts | 12 +- 3 files changed, 411 insertions(+), 22 deletions(-) diff --git a/scripts/artifact-registry-preflight-classification-contract-test.ts b/scripts/artifact-registry-preflight-classification-contract-test.ts index b9617c1b..ebd48033 100644 --- a/scripts/artifact-registry-preflight-classification-contract-test.ts +++ b/scripts/artifact-registry-preflight-classification-contract-test.ts @@ -1,7 +1,9 @@ import { + artifactRegistryReadonlyAutoRemotePlan, artifactRegistryReadonlyResultFromCommand, buildArtifactRegistryReadonlyProbe, parseArtifactRegistryOptions, + runArtifactRegistryCommand, } from "./src/artifact-registry"; import type { CommandResult } from "./src/command"; @@ -114,18 +116,173 @@ assertCondition(asStringArray(success.failedScopes, "success.failedScopes").leng assertCondition(success.recommendedAction === "none", "healthy registry recommendedAction should be none", success); assertCondition(success.remoteCommandShape === probe.remoteCommandShape, "healthy registry should echo remote command shape", success); -process.stdout.write(`${JSON.stringify({ - ok: true, - checks: [ - "provider-ssh-command missing is classified distinctly", - "oversized host.ssh command shape is classified distinctly", - "remote host.ssh timeout is classified distinctly", - "successful registry readonly probe has no failed scopes", - ], - classifications: { - missing: missing.failureClassification, - commandShape: commandShape.failureClassification, - timeout: timeout.failureClassification, - success: success.failureClassification, - }, -}, null, 2)}\n`); +const driftStdout = [ + "readonly=true", + "unit_exists=true", + "compose_exists=true", + "config_exists=true", + "storage_exists=true", + "systemctl_available=true", + "unit_active=active", + "unit_enabled=enabled", + "docker_available=true", + "container_running=true", + "container_status=running", + "container_image=registry:2.8.2", + "container_restart_policy=unless-stopped", + "listener_count=1", + "bad_listener_count=0", + "loopback_only=true", + "curl_available=true", + "v2_http_code=200", + "config_hash=old-config", + "compose_hash=old-compose", + "unit_hash=old-unit", + "config_hash_matches=false", + "compose_hash_matches=false", + "unit_hash_matches=false", + "image_matches=false", + "", +].join("\n"); + +const drift = asRecord(artifactRegistryReadonlyResultFromCommand(probe, command({ + stdout: driftStdout, +})), "registry drift result"); +assertCondition(drift.ok === false, "health should fail when rendered config/image drift exists", drift); +assertCondition(drift.failureClassification === "registry-unhealthy", "registry drift should classify as registry-unhealthy", drift); +const driftScopes = asStringArray(drift.failedScopes, "drift.failedScopes"); +assertCondition(driftScopes.includes("rendered-config"), "registry drift should include rendered-config scope", drift); +assertCondition(driftScopes.includes("registry-image"), "registry drift should include registry-image scope", drift); +assertCondition(!driftScopes.includes("control-plane-missing"), "registry drift must not be classified as control-plane missing", drift); + +async function main(): Promise { + const localMissingWithNoRemote = await runArtifactRegistryCommand(["health", "--provider-id", "D601"], { + env: { + CODE_QUEUE_DEV_CONTAINER_MASTER_HOST: "203.0.113.10", + CODE_QUEUE_SERVICE_ROLE: "scheduler", + }, + runRemoteScriptForTest: () => command({ + exitCode: 1, + stderr: "Error response from daemon: No such container: unidesk-backend-core\n", + }), + runCliForTest: () => command({ + exitCode: 1, + stdout: JSON.stringify({ + ok: false, + command: "artifact-registry health --provider-id D601", + data: { + transport: "frontend", + readonly: true, + dispatch: { ok: false, status: 502, body: { ok: false, error: "backend-core proxy unavailable" } }, + wait: null, + result: { + ok: false, + readonly: true, + installed: false, + healthy: false, + decision: "infra-blocked", + retryable: true, + runnerDisposition: "infra-blocked", + failureClassification: "control-plane-missing", + failedScopes: ["control-plane-missing", "backend-core-api"], + runtimeApiHealthy: false, + }, + }, + }), + }), + }); + const controlPlaneMissing = asRecord(localMissingWithNoRemote, "control-plane missing result"); + assertCondition(controlPlaneMissing.ok === false, "missing local and remote control planes should fail", controlPlaneMissing); + assertCondition(controlPlaneMissing.failureClassification === "control-plane-missing", "missing remote control plane should classify control-plane-missing", controlPlaneMissing); + assertCondition(asStringArray(controlPlaneMissing.failedScopes, "controlPlaneMissing.failedScopes").includes("control-plane-missing"), "control-plane missing scope should be reported", controlPlaneMissing); + assertCondition(asRecord(controlPlaneMissing.controlPlane, "controlPlane").localBackendCoreMissing === true, "local backend-core absence should remain evidence", controlPlaneMissing); + + const remoteFallback = await runArtifactRegistryCommand(["health", "--provider-id", "D601"], { + env: { + CODE_QUEUE_DEV_CONTAINER_MASTER_HOST: "74.48.78.17", + }, + runRemoteScriptForTest: () => command({ + exitCode: 1, + stderr: "Error response from daemon: No such container: unidesk-backend-core\n", + }), + runCliForTest: () => command({ + stdout: JSON.stringify({ + ok: true, + command: "artifact-registry health --provider-id D601", + data: { + transport: "frontend", + readonly: true, + result: success, + }, + }), + }), + }); + const remoteFallbackRecord = asRecord(remoteFallback, "remote fallback result"); + assertCondition(remoteFallbackRecord.ok === true, "remote fallback should return the remote registry result", remoteFallbackRecord); + const fallbackControlPlane = asRecord(remoteFallbackRecord.controlPlane, "remote fallback controlPlane"); + assertCondition(fallbackControlPlane.remoteFallbackUsed === true, "remote fallback should be marked", fallbackControlPlane); + assertCondition(fallbackControlPlane.localBackendCoreMissing === true, "local backend-core absence should remain evidence only", fallbackControlPlane); + assertCondition(asStringArray(remoteFallbackRecord.failedScopes, "remoteFallback.failedScopes").length === 0, "remote fallback should preserve registry scopes", remoteFallbackRecord); + + let remoteFirstLocalSshCalls = 0; + const remoteFirst = await runArtifactRegistryCommand(["health", "--provider-id", "D601"], { + env: { + CODE_QUEUE_SERVICE_ROLE: "scheduler", + CODE_QUEUE_DEV_CONTAINER_MASTER_HOST: "74.48.78.17", + }, + runRemoteScriptForTest: () => { + remoteFirstLocalSshCalls += 1; + return command({ exitCode: 1, stderr: "unexpected local ssh path" }); + }, + runCliForTest: () => command({ + stdout: JSON.stringify({ + ok: true, + command: "artifact-registry health --provider-id D601", + data: { + transport: "frontend", + readonly: true, + result: success, + }, + }), + }), + }); + const remoteFirstRecord = asRecord(remoteFirst, "remote first result"); + assertCondition(remoteFirstRecord.ok === true, "runner-like env should succeed through remote frontend first", remoteFirstRecord); + assertCondition(remoteFirstLocalSshCalls === 0, "runner-like env should not require local backend-core before remote frontend", { remoteFirstLocalSshCalls }); + assertCondition(asRecord(remoteFirstRecord.controlPlane, "remoteFirst.controlPlane").remoteFirst === true, "remote-first controlPlane should be marked", remoteFirstRecord.controlPlane); + + const autoPlan = artifactRegistryReadonlyAutoRemotePlan("health", options, { + CODE_QUEUE_SERVICE_ROLE: "scheduler", + CODE_QUEUE_DEV_CONTAINER_MASTER_HOST: "74.48.78.17", + }); + assertCondition(autoPlan.enabled === true, "runner-like env should auto-select remote frontend for readonly registry health", autoPlan); + assertCondition(String(autoPlan.command ?? "").includes("--main-server-ip 74.48.78.17"), "auto remote plan should expose command shape", autoPlan); + + process.stdout.write(`${JSON.stringify({ + ok: true, + checks: [ + "provider-ssh-command missing is classified distinctly", + "oversized host.ssh command shape is classified distinctly", + "remote host.ssh timeout is classified distinctly", + "successful registry readonly probe has no failed scopes", + "runner-like env uses remote frontend before local backend-core", + "local backend-core absence can fall back to remote frontend control plane", + "missing local and remote control planes classify as control-plane-missing", + "rendered-config and registry-image drift classify as registry-unhealthy", + ], + classifications: { + missing: missing.failureClassification, + commandShape: commandShape.failureClassification, + timeout: timeout.failureClassification, + success: success.failureClassification, + drift: drift.failureClassification, + controlPlaneMissing: controlPlaneMissing.failureClassification, + remoteFallback: remoteFallbackRecord.failureClassification, + remoteFirst: remoteFirstRecord.failureClassification, + }, + }, null, 2)}\n`); +} + +if (import.meta.main) { + await main(); +} diff --git a/scripts/src/artifact-registry.ts b/scripts/src/artifact-registry.ts index bc818c8e..4d9fe4b0 100644 --- a/scripts/src/artifact-registry.ts +++ b/scripts/src/artifact-registry.ts @@ -4,6 +4,18 @@ import { join } from "node:path"; import { runCommand, type CommandResult } from "./command"; import { readConfig, type UniDeskConfig, repoRoot, rootPath } from "./config"; import { startJob } from "./jobs"; +import { + compareDeployJsonExecutorMirrors, + deployJsonCommitImage, + deployJsonDriftResult, + deployJsonSourceOfTruth, + hasDeployJsonExecutorContract, + k3sManifestExecutorMirror, + parseDeployJsonServiceContractBase64, + readDeployJsonServiceContractFromFile, + type DeployJsonExecutorMirror, + type DeployJsonServiceContract, +} from "./deploy-json-contract"; export type ArtifactRegistryAction = "plan" | "render" | "status" | "health" | "install" | "deploy-backend-core" | "deploy-service"; type ArtifactDeployEnvironment = "prod" | "dev"; @@ -29,6 +41,13 @@ export interface ArtifactRegistryOptions { sourceRepo: string; sourceRepoExplicit: boolean; deployRef: string | null; + deployJsonService: DeployJsonServiceContract | null; +} + +export interface ArtifactRegistryCommandRuntime { + env?: NodeJS.ProcessEnv; + runRemoteScriptForTest?: (options: ArtifactRegistryOptions, script: string, timeoutMs: number) => CommandResult; + runCliForTest?: (command: string[], timeoutMs: number) => CommandResult; } interface RenderedFile { @@ -60,6 +79,7 @@ export interface ArtifactRegistryReadonlyProbe { } export type ArtifactRegistryFailureClassification = + | "control-plane-missing" | "local-docker-required" | "provider-ssh-command-missing" | "ssh-helper-command-shape-incompatible" @@ -88,6 +108,7 @@ const defaultOptions: ArtifactRegistryOptions = { sourceRepo: "https://github.com/pikasTech/unidesk", sourceRepoExplicit: false, deployRef: null, + deployJsonService: null, }; const supportedArtifactConsumerServices = [ "backend-core", @@ -1150,6 +1171,7 @@ function asBool(value: string | undefined): boolean { function registryRecommendedAction(classification: ArtifactRegistryFailureClassification | null): string { if (classification === null) return "none"; + if (classification === "control-plane-missing") return "restore the artifact registry readonly control plane, or rerun through --main-server-ip once frontend/backend-core dispatch is reachable"; if (classification === "local-docker-required") return "run the read-only check through --main-server-ip or from a main-server CLI with backend-core available"; if (classification === "provider-ssh-command-missing") return "restore D601 provider-gateway host.ssh dispatch/capability before retrying artifact registry health"; if (classification === "ssh-helper-command-shape-incompatible") return "upgrade the CLI/control-plane host.ssh helper shape so it can run bash -lc readonly probes"; @@ -1182,6 +1204,7 @@ function classifyProviderSshCommandFailure(command: CommandResult): ArtifactRegi function providerSshCommandFailureScopes(classification: ArtifactRegistryFailureClassification): string[] { const scopes = ["provider-ssh-command"]; + if (classification === "control-plane-missing") return ["control-plane-missing", ...scopes]; if (classification === "remote-command-timeout") scopes.push("remote-command-timeout"); if (classification === "local-docker-required") scopes.push("local-docker-control-plane"); if (classification === "ssh-helper-command-shape-incompatible") scopes.push("ssh-helper-command-shape"); @@ -1354,8 +1377,9 @@ function deployRefFor(options: ArtifactRegistryOptions, spec: ArtifactConsumerSp return options.deployRef ?? target?.deployRef ?? `deploy.json#environments.${options.environment ?? "prod"}.services.${spec.serviceId}`; } -function runRemoteScript(options: ArtifactRegistryOptions, script: string, timeoutMs = options.timeoutMs): CommandResult { +function runRemoteScript(options: ArtifactRegistryOptions, script: string, timeoutMs = options.timeoutMs, runtime: ArtifactRegistryCommandRuntime = {}): CommandResult { const command = [process.execPath, "scripts/cli.ts", "ssh", options.providerId, "argv", "bash", "-lc", script]; + if (runtime.runRemoteScriptForTest !== undefined) return runtime.runRemoteScriptForTest(options, script, timeoutMs); return runCommand(command, repoRoot, { timeoutMs }); } @@ -1389,6 +1413,204 @@ function readonlyCommandFailureResult( }; } +function remoteFrontendHostFromEnv(env: NodeJS.ProcessEnv = process.env): string | null { + for (const key of ["UNIDESK_MAIN_SERVER_IP", "UNIDESK_MAIN_SERVER_HOST", "CODE_QUEUE_DEV_CONTAINER_MASTER_HOST"]) { + const value = env[key]?.trim() ?? ""; + if (value.length === 0) continue; + if (value === "localhost" || value === "127.0.0.1" || value === "::1") continue; + return value.replace(/\/+$/u, ""); + } + if (env.CODE_QUEUE_SERVICE_ROLE || env.CODE_QUEUE_INSTANCE_ID || env.KUBERNETES_SERVICE_HOST) { + const publicHost = readConfig().network.publicHost.trim(); + if (publicHost.length > 0 && publicHost !== "localhost" && publicHost !== "127.0.0.1" && publicHost !== "::1") return publicHost; + } + return null; +} + +function controlPlaneMissingResult( + options: ArtifactRegistryOptions, + action: "status" | "health", + localResult: Record, + remoteResult: Record | null, + remoteCommand: CommandResult | null, + remoteHost: string | null, +): Record { + const bundle = renderBundle(options); + return { + ok: false, + readonly: true, + installed: false, + healthy: false, + decision: "infra-blocked", + retryable: true, + runnerDisposition: "infra-blocked", + failureClassification: "control-plane-missing", + recommendedAction: registryRecommendedAction("control-plane-missing"), + remoteCommandShape: readonlyRemoteCommandShape(action, options), + healthyScopes: [], + failedScopes: providerSshCommandFailureScopes("control-plane-missing"), + runtimeApiHealthy: false, + checks: {}, + expected: { + endpoint: `http://${options.host}:${options.port}`, + image: options.image, + paths: bundle.paths, + }, + controlPlane: { + preferred: "remote-frontend", + remoteFallbackAttempted: remoteHost !== null, + remoteFallbackUsed: false, + remoteHost, + localBackendCoreMissing: true, + classification: "control-plane-missing", + retryCommand: remoteHost === null + ? `bun scripts/cli.ts --main-server-ip artifact-registry ${action} --provider-id ${options.providerId}` + : `bun scripts/cli.ts --main-server-ip ${remoteHost} artifact-registry ${action} --provider-id ${options.providerId}`, + }, + localObservation: localResult, + remoteObservation: remoteResult, + remoteCommand: remoteCommand === null ? null : artifactRegistryCommandTail(remoteCommand), + }; +} + +function parsedCliData(stdout: string): Record | null { + if (stdout.trim().length === 0) return null; + try { + const parsed = JSON.parse(stdout) as unknown; + if (typeof parsed !== "object" || parsed === null || Array.isArray(parsed)) return null; + const record = parsed as Record; + const data = record.data; + return typeof data === "object" && data !== null && !Array.isArray(data) ? data as Record : record; + } catch { + return null; + } +} + +function unwrapRemoteArtifactRegistryResult(data: Record): Record | null { + const directResult = data.result; + if (typeof directResult === "object" && directResult !== null && !Array.isArray(directResult)) return directResult as Record; + const nested = data.data; + if (typeof nested === "object" && nested !== null && !Array.isArray(nested)) { + const nestedRecord = nested as Record; + const nestedResult = nestedRecord.result; + if (typeof nestedResult === "object" && nestedResult !== null && !Array.isArray(nestedResult)) return nestedResult as Record; + } + return null; +} + +function annotateRemoteReadonlyResult( + result: Record, + localResult: Record, + remoteHost: string, + command: CommandResult, +): Record { + return { + ...result, + controlPlane: { + preferred: "remote-frontend", + remoteFallbackAttempted: true, + remoteFallbackUsed: true, + remoteHost, + localBackendCoreMissing: true, + classification: null, + }, + localObservation: localResult, + remoteCommand: artifactRegistryCommandTail(command), + }; +} + +function annotateRemoteFirstReadonlyResult( + result: Record, + remoteHost: string, + command: CommandResult, +): Record { + return { + ...result, + controlPlane: { + preferred: "remote-frontend", + remoteFallbackAttempted: false, + remoteFallbackUsed: true, + remoteFirst: true, + remoteHost, + localBackendCoreMissing: null, + classification: null, + }, + remoteCommand: artifactRegistryCommandTail(command), + }; +} + +export function artifactRegistryReadonlyAutoRemotePlan( + action: "status" | "health", + options: ArtifactRegistryOptions, + env: NodeJS.ProcessEnv = process.env, +): Record { + const remoteHost = remoteFrontendHostFromEnv(env); + return { + enabled: remoteHost !== null, + action, + providerId: options.providerId, + host: remoteHost, + transport: "frontend", + command: remoteHost === null ? null : `bun scripts/cli.ts --main-server-ip ${remoteHost} artifact-registry ${action} --provider-id ${options.providerId}`, + failureClassification: remoteHost === null ? "control-plane-missing" : null, + }; +} + +function runReadonlyStatusWithRemoteFallback(options: ArtifactRegistryOptions, healthMode: boolean, runtime: ArtifactRegistryCommandRuntime = {}): Record { + const action = healthMode ? "health" : "status"; + const remoteHost = remoteFrontendHostFromEnv(runtime.env ?? process.env); + const shouldPreferRemote = remoteHost !== null && ( + Boolean((runtime.env ?? process.env).CODE_QUEUE_SERVICE_ROLE) + || Boolean((runtime.env ?? process.env).CODE_QUEUE_INSTANCE_ID) + || Boolean((runtime.env ?? process.env).KUBERNETES_SERVICE_HOST) + || Boolean((runtime.env ?? process.env).UNIDESK_ARTIFACT_REGISTRY_REMOTE_FIRST) + ); + if (shouldPreferRemote) { + const remoteResult = runReadonlyStatusViaRemoteFrontend(options, action, remoteHost, null, runtime); + if (remoteResult !== null) return remoteResult; + } + const localResult = runReadonlyStatus(options, healthMode, runtime); + if (localResult.failureClassification !== "local-docker-required") return localResult; + if (remoteHost === null) return controlPlaneMissingResult(options, action, localResult, null, null, remoteHost); + const remoteResult = runReadonlyStatusViaRemoteFrontend(options, action, remoteHost, localResult, runtime); + if (remoteResult !== null) return remoteResult; + return controlPlaneMissingResult(options, action, localResult, null, null, remoteHost); +} + +function runReadonlyStatusViaRemoteFrontend( + options: ArtifactRegistryOptions, + action: "status" | "health", + remoteHost: string, + localResult: Record | null, + runtime: ArtifactRegistryCommandRuntime, +): Record | null { + const remoteCommandArgs = [ + process.execPath, + rootPath("scripts", "cli.ts"), + "--main-server-ip", + remoteHost, + "artifact-registry", + action, + "--provider-id", + options.providerId, + "--timeout-ms", + String(options.timeoutMs), + ]; + const remoteTimeoutMs = Math.max(options.timeoutMs + 15_000, 45_000); + const remoteCommand = runtime.runCliForTest === undefined + ? runCommand(remoteCommandArgs, repoRoot, { timeoutMs: remoteTimeoutMs }) + : runtime.runCliForTest(remoteCommandArgs, remoteTimeoutMs); + const parsed = parsedCliData(remoteCommand.stdout); + const remoteResult = parsed === null ? null : unwrapRemoteArtifactRegistryResult(parsed); + if (remoteCommand.exitCode === 0 && !remoteCommand.timedOut && remoteResult !== null) { + return localResult === null + ? annotateRemoteFirstReadonlyResult(remoteResult, remoteHost, remoteCommand) + : annotateRemoteReadonlyResult(remoteResult, localResult, remoteHost, remoteCommand); + } + if (localResult !== null) return controlPlaneMissingResult(options, action, localResult, remoteResult, remoteCommand, remoteHost); + return null; +} + function statusFromValues(options: ArtifactRegistryOptions, values: Record, command: CommandResult, healthMode: boolean): Record { const commandOk = command.exitCode === 0 && !command.timedOut; const bundle = renderBundle(options); @@ -1466,10 +1688,10 @@ function statusFromValues(options: ArtifactRegistryOptions, values: Record { +function runReadonlyStatus(options: ArtifactRegistryOptions, healthMode: boolean, runtime: ArtifactRegistryCommandRuntime = {}): Record { const bundle = renderBundle(options); const script = statusScript(options, bundle); - const result = runRemoteScript(options, script); + const result = runRemoteScript(options, script, options.timeoutMs, runtime); if (result.exitCode !== 0 || result.timedOut) { return readonlyCommandFailureResult(options, result, healthMode ? "health" : "status"); } @@ -2819,7 +3041,7 @@ function localHelp(): Record { }; } -export async function runArtifactRegistryCommand(args: string[]): Promise { +export async function runArtifactRegistryCommand(args: string[], runtime: ArtifactRegistryCommandRuntime = {}): Promise { const action = args[0]; if (isHelpArg(action)) return localHelp(); if (action !== "plan" && action !== "render" && action !== "status" && action !== "health" && action !== "install" && action !== "deploy-backend-core" && action !== "deploy-service") { @@ -2829,8 +3051,8 @@ export async function runArtifactRegistryCommand(args: string[]): Promise | null | undefined; + const dispatchError = typeof dispatchBody?.error === "string" ? dispatchBody.error : null; return { transport: "frontend", readonly: true, @@ -706,13 +708,21 @@ async function remoteArtifactRegistry(session: FrontendSession, args: string[]): decision: "infra-blocked", retryable: true, runnerDisposition: "infra-blocked", + failureClassification: "control-plane-missing", + recommendedAction: "restore the remote frontend/backend-core dispatch control plane, then rerun artifact-registry status|health", healthyScopes: [], - failedScopes: ["backend-core-api"], + failedScopes: ["control-plane-missing", "backend-core-api"], runtimeApiHealthy: false, channels: [ { channel: "backend-core-api", ok: false, requiredFor: "frontend /api/dispatch backend-core session creation", detail: dispatched.dispatch }, { channel: "provider-dispatch", ok: false, requiredFor: "host.ssh task creation", detail: dispatched.dispatch }, ], + controlPlane: { + transport: "remote-frontend", + failureClassification: "control-plane-missing", + dispatchStatus: dispatched.dispatch.status ?? null, + dispatchError, + }, registry: registryResult, } : registryResult,