fix: route artifact registry health via remote control plane

This commit is contained in:
Codex
2026-05-21 12:55:11 +00:00
parent 569cf7a74c
commit d709b74bb3
3 changed files with 411 additions and 22 deletions
@@ -1,7 +1,9 @@
import {
artifactRegistryReadonlyAutoRemotePlan,
artifactRegistryReadonlyResultFromCommand,
buildArtifactRegistryReadonlyProbe,
parseArtifactRegistryOptions,
runArtifactRegistryCommand,
} from "./src/artifact-registry";
import type { CommandResult } from "./src/command";
@@ -114,18 +116,173 @@ assertCondition(asStringArray(success.failedScopes, "success.failedScopes").leng
// A healthy readonly probe must recommend no action and echo the probe's remote command shape.
assertCondition(success.recommendedAction === "none", "healthy registry recommendedAction should be none", success);
assertCondition(success.remoteCommandShape === probe.remoteCommandShape, "healthy registry should echo remote command shape", success);
// Emit a machine-readable JSON summary on stdout: the names of the checks performed and the
// failure classification observed for each scenario result.
process.stdout.write(`${JSON.stringify({
ok: true,
checks: [
"provider-ssh-command missing is classified distinctly",
"oversized host.ssh command shape is classified distinctly",
"remote host.ssh timeout is classified distinctly",
"successful registry readonly probe has no failed scopes",
],
classifications: {
missing: missing.failureClassification,
commandShape: commandShape.failureClassification,
timeout: timeout.failureClassification,
success: success.failureClassification,
},
}, null, 2)}\n`);
// Simulated key=value probe output for a registry that is running and reachable
// (unit active, container running, /v2 returns 200) but whose config/compose/unit hashes
// and image are all reported as not matching.
const driftStdout = [
"readonly=true",
"unit_exists=true",
"compose_exists=true",
"config_exists=true",
"storage_exists=true",
"systemctl_available=true",
"unit_active=active",
"unit_enabled=enabled",
"docker_available=true",
"container_running=true",
"container_status=running",
"container_image=registry:2.8.2",
"container_restart_policy=unless-stopped",
"listener_count=1",
"bad_listener_count=0",
"loopback_only=true",
"curl_available=true",
"v2_http_code=200",
"config_hash=old-config",
"compose_hash=old-compose",
"unit_hash=old-unit",
"config_hash_matches=false",
"compose_hash_matches=false",
"unit_hash_matches=false",
"image_matches=false",
"",
].join("\n");
// Feed the drift output through the readonly result builder using the shared probe fixture.
const drift = asRecord(artifactRegistryReadonlyResultFromCommand(probe, command({
stdout: driftStdout,
})), "registry drift result");
// Drift is a registry health failure, not a control-plane failure: it must surface the
// rendered-config and registry-image scopes and must not report control-plane-missing.
assertCondition(drift.ok === false, "health should fail when rendered config/image drift exists", drift);
assertCondition(drift.failureClassification === "registry-unhealthy", "registry drift should classify as registry-unhealthy", drift);
const driftScopes = asStringArray(drift.failedScopes, "drift.failedScopes");
assertCondition(driftScopes.includes("rendered-config"), "registry drift should include rendered-config scope", drift);
assertCondition(driftScopes.includes("registry-image"), "registry drift should include registry-image scope", drift);
assertCondition(!driftScopes.includes("control-plane-missing"), "registry drift must not be classified as control-plane missing", drift);
/**
 * Runs the asynchronous control-plane routing scenarios for `artifact-registry health`
 * and then writes a JSON summary of all checks to stdout.
 *
 * Every scenario injects stubs through the runtime argument of `runArtifactRegistryCommand`
 * (`env`, `runRemoteScriptForTest`, `runCliForTest`), so the stubbed commands are answered
 * by canned `command(...)` results.
 */
async function main(): Promise<void> {
// Scenario 1: the local probe stub reports the backend-core container as missing and the
// remote CLI stub also fails (exit code 1, dispatch status 502). The combined outcome must be
// classified as control-plane-missing.
const localMissingWithNoRemote = await runArtifactRegistryCommand(["health", "--provider-id", "D601"], {
env: {
CODE_QUEUE_DEV_CONTAINER_MASTER_HOST: "203.0.113.10",
CODE_QUEUE_SERVICE_ROLE: "scheduler",
},
runRemoteScriptForTest: () => command({
exitCode: 1,
stderr: "Error response from daemon: No such container: unidesk-backend-core\n",
}),
runCliForTest: () => command({
exitCode: 1,
stdout: JSON.stringify({
ok: false,
command: "artifact-registry health --provider-id D601",
data: {
transport: "frontend",
readonly: true,
dispatch: { ok: false, status: 502, body: { ok: false, error: "backend-core proxy unavailable" } },
wait: null,
result: {
ok: false,
readonly: true,
installed: false,
healthy: false,
decision: "infra-blocked",
retryable: true,
runnerDisposition: "infra-blocked",
failureClassification: "control-plane-missing",
failedScopes: ["control-plane-missing", "backend-core-api"],
runtimeApiHealthy: false,
},
},
}),
}),
});
const controlPlaneMissing = asRecord(localMissingWithNoRemote, "control-plane missing result");
assertCondition(controlPlaneMissing.ok === false, "missing local and remote control planes should fail", controlPlaneMissing);
assertCondition(controlPlaneMissing.failureClassification === "control-plane-missing", "missing remote control plane should classify control-plane-missing", controlPlaneMissing);
assertCondition(asStringArray(controlPlaneMissing.failedScopes, "controlPlaneMissing.failedScopes").includes("control-plane-missing"), "control-plane missing scope should be reported", controlPlaneMissing);
assertCondition(asRecord(controlPlaneMissing.controlPlane, "controlPlane").localBackendCoreMissing === true, "local backend-core absence should remain evidence", controlPlaneMissing);
// Scenario 2: only the master host is set in env (no service-role marker). The local probe stub
// fails with a missing backend-core container, and the remote CLI stub returns the healthy
// `success` result, which must be returned and annotated as a remote fallback.
const remoteFallback = await runArtifactRegistryCommand(["health", "--provider-id", "D601"], {
env: {
CODE_QUEUE_DEV_CONTAINER_MASTER_HOST: "74.48.78.17",
},
runRemoteScriptForTest: () => command({
exitCode: 1,
stderr: "Error response from daemon: No such container: unidesk-backend-core\n",
}),
runCliForTest: () => command({
stdout: JSON.stringify({
ok: true,
command: "artifact-registry health --provider-id D601",
data: {
transport: "frontend",
readonly: true,
result: success,
},
}),
}),
});
const remoteFallbackRecord = asRecord(remoteFallback, "remote fallback result");
assertCondition(remoteFallbackRecord.ok === true, "remote fallback should return the remote registry result", remoteFallbackRecord);
const fallbackControlPlane = asRecord(remoteFallbackRecord.controlPlane, "remote fallback controlPlane");
assertCondition(fallbackControlPlane.remoteFallbackUsed === true, "remote fallback should be marked", fallbackControlPlane);
assertCondition(fallbackControlPlane.localBackendCoreMissing === true, "local backend-core absence should remain evidence only", fallbackControlPlane);
assertCondition(asStringArray(remoteFallbackRecord.failedScopes, "remoteFallback.failedScopes").length === 0, "remote fallback should preserve registry scopes", remoteFallbackRecord);
// Scenario 3: a runner-like env (service role + master host) must go to the remote frontend
// first. The local probe stub counts its invocations so the test can prove it was never called.
let remoteFirstLocalSshCalls = 0;
const remoteFirst = await runArtifactRegistryCommand(["health", "--provider-id", "D601"], {
env: {
CODE_QUEUE_SERVICE_ROLE: "scheduler",
CODE_QUEUE_DEV_CONTAINER_MASTER_HOST: "74.48.78.17",
},
runRemoteScriptForTest: () => {
remoteFirstLocalSshCalls += 1;
return command({ exitCode: 1, stderr: "unexpected local ssh path" });
},
runCliForTest: () => command({
stdout: JSON.stringify({
ok: true,
command: "artifact-registry health --provider-id D601",
data: {
transport: "frontend",
readonly: true,
result: success,
},
}),
}),
});
const remoteFirstRecord = asRecord(remoteFirst, "remote first result");
assertCondition(remoteFirstRecord.ok === true, "runner-like env should succeed through remote frontend first", remoteFirstRecord);
assertCondition(remoteFirstLocalSshCalls === 0, "runner-like env should not require local backend-core before remote frontend", { remoteFirstLocalSshCalls });
assertCondition(asRecord(remoteFirstRecord.controlPlane, "remoteFirst.controlPlane").remoteFirst === true, "remote-first controlPlane should be marked", remoteFirstRecord.controlPlane);
// Scenario 4: the auto-remote plan is computed from env only (nothing is executed) and must
// expose the equivalent --main-server-ip command line.
const autoPlan = artifactRegistryReadonlyAutoRemotePlan("health", options, {
CODE_QUEUE_SERVICE_ROLE: "scheduler",
CODE_QUEUE_DEV_CONTAINER_MASTER_HOST: "74.48.78.17",
});
assertCondition(autoPlan.enabled === true, "runner-like env should auto-select remote frontend for readonly registry health", autoPlan);
assertCondition(String(autoPlan.command ?? "").includes("--main-server-ip 74.48.78.17"), "auto remote plan should expose command shape", autoPlan);
// Final summary: list every check performed and the classification each scenario produced.
process.stdout.write(`${JSON.stringify({
ok: true,
checks: [
"provider-ssh-command missing is classified distinctly",
"oversized host.ssh command shape is classified distinctly",
"remote host.ssh timeout is classified distinctly",
"successful registry readonly probe has no failed scopes",
"runner-like env uses remote frontend before local backend-core",
"local backend-core absence can fall back to remote frontend control plane",
"missing local and remote control planes classify as control-plane-missing",
"rendered-config and registry-image drift classify as registry-unhealthy",
],
classifications: {
missing: missing.failureClassification,
commandShape: commandShape.failureClassification,
timeout: timeout.failureClassification,
success: success.failureClassification,
drift: drift.failureClassification,
controlPlaneMissing: controlPlaneMissing.failureClassification,
remoteFallback: remoteFallbackRecord.failureClassification,
remoteFirst: remoteFirstRecord.failureClassification,
},
}, null, 2)}\n`);
}
// Run the async scenarios only when `import.meta.main` is truthy.
// NOTE(review): presumably this means the file is the directly executed entry script — confirm for the runtime in use.
if (import.meta.main) {
await main();
}
+228 -6
View File
@@ -4,6 +4,18 @@ import { join } from "node:path";
import { runCommand, type CommandResult } from "./command";
import { readConfig, type UniDeskConfig, repoRoot, rootPath } from "./config";
import { startJob } from "./jobs";
import {
compareDeployJsonExecutorMirrors,
deployJsonCommitImage,
deployJsonDriftResult,
deployJsonSourceOfTruth,
hasDeployJsonExecutorContract,
k3sManifestExecutorMirror,
parseDeployJsonServiceContractBase64,
readDeployJsonServiceContractFromFile,
type DeployJsonExecutorMirror,
type DeployJsonServiceContract,
} from "./deploy-json-contract";
export type ArtifactRegistryAction = "plan" | "render" | "status" | "health" | "install" | "deploy-backend-core" | "deploy-service";
type ArtifactDeployEnvironment = "prod" | "dev";
@@ -29,6 +41,13 @@ export interface ArtifactRegistryOptions {
sourceRepo: string;
sourceRepoExplicit: boolean;
deployRef: string | null;
deployJsonService: DeployJsonServiceContract | null;
}
/**
 * Optional runtime overrides accepted by the artifact registry command entry points.
 * Every member is optional; an empty object means "use the real environment and real commands".
 */
export interface ArtifactRegistryCommandRuntime {
/** Environment consulted for remote-host and runner detection; callers fall back to `process.env` when omitted. */
env?: NodeJS.ProcessEnv;
/** When defined, its return value is used for the provider SSH probe instead of spawning the real command. */
runRemoteScriptForTest?: (options: ArtifactRegistryOptions, script: string, timeoutMs: number) => CommandResult;
/** When defined, its return value is used for the remote-frontend CLI invocation instead of spawning the real command. */
runCliForTest?: (command: string[], timeoutMs: number) => CommandResult;
}
interface RenderedFile {
@@ -60,6 +79,7 @@ export interface ArtifactRegistryReadonlyProbe {
}
export type ArtifactRegistryFailureClassification =
| "control-plane-missing"
| "local-docker-required"
| "provider-ssh-command-missing"
| "ssh-helper-command-shape-incompatible"
@@ -88,6 +108,7 @@ const defaultOptions: ArtifactRegistryOptions = {
sourceRepo: "https://github.com/pikasTech/unidesk",
sourceRepoExplicit: false,
deployRef: null,
deployJsonService: null,
};
const supportedArtifactConsumerServices = [
"backend-core",
@@ -1150,6 +1171,7 @@ function asBool(value: string | undefined): boolean {
function registryRecommendedAction(classification: ArtifactRegistryFailureClassification | null): string {
if (classification === null) return "none";
if (classification === "control-plane-missing") return "restore the artifact registry readonly control plane, or rerun through --main-server-ip <host> once frontend/backend-core dispatch is reachable";
if (classification === "local-docker-required") return "run the read-only check through --main-server-ip <host> or from a main-server CLI with backend-core available";
if (classification === "provider-ssh-command-missing") return "restore D601 provider-gateway host.ssh dispatch/capability before retrying artifact registry health";
if (classification === "ssh-helper-command-shape-incompatible") return "upgrade the CLI/control-plane host.ssh helper shape so it can run bash -lc readonly probes";
@@ -1182,6 +1204,7 @@ function classifyProviderSshCommandFailure(command: CommandResult): ArtifactRegi
function providerSshCommandFailureScopes(classification: ArtifactRegistryFailureClassification): string[] {
const scopes = ["provider-ssh-command"];
if (classification === "control-plane-missing") return ["control-plane-missing", ...scopes];
if (classification === "remote-command-timeout") scopes.push("remote-command-timeout");
if (classification === "local-docker-required") scopes.push("local-docker-control-plane");
if (classification === "ssh-helper-command-shape-incompatible") scopes.push("ssh-helper-command-shape");
@@ -1354,8 +1377,9 @@ function deployRefFor(options: ArtifactRegistryOptions, spec: ArtifactConsumerSp
return options.deployRef ?? target?.deployRef ?? `deploy.json#environments.${options.environment ?? "prod"}.services.${spec.serviceId}`;
}
function runRemoteScript(options: ArtifactRegistryOptions, script: string, timeoutMs = options.timeoutMs): CommandResult {
/**
 * Runs a bash script on the provider through the CLI `ssh` subcommand.
 *
 * @param options   Registry options; `providerId` selects the SSH target.
 * @param script    Script text handed to `bash -lc`.
 * @param timeoutMs Timeout for the spawned command (defaults to `options.timeoutMs`).
 * @param runtime   Optional overrides; a defined `runRemoteScriptForTest` short-circuits the spawn.
 * @returns The command result from the test hook or from the spawned CLI.
 */
function runRemoteScript(options: ArtifactRegistryOptions, script: string, timeoutMs = options.timeoutMs, runtime: ArtifactRegistryCommandRuntime = {}): CommandResult {
  // Test hook wins: no process is spawned when a stub is supplied.
  if (runtime.runRemoteScriptForTest !== undefined) {
    return runtime.runRemoteScriptForTest(options, script, timeoutMs);
  }
  const sshArgv = [
    process.execPath,
    "scripts/cli.ts",
    "ssh",
    options.providerId,
    "argv",
    "bash",
    "-lc",
    script,
  ];
  return runCommand(sshArgv, repoRoot, { timeoutMs });
}
@@ -1389,6 +1413,204 @@ function readonlyCommandFailureResult(
};
}
/**
 * Resolves the remote frontend (main server) host to use for readonly registry checks.
 *
 * Resolution order:
 *  1. The first usable value among `UNIDESK_MAIN_SERVER_IP`, `UNIDESK_MAIN_SERVER_HOST`
 *     and `CODE_QUEUE_DEV_CONTAINER_MASTER_HOST`.
 *  2. When the env looks like a runner (`CODE_QUEUE_SERVICE_ROLE`, `CODE_QUEUE_INSTANCE_ID`
 *     or `KUBERNETES_SERVICE_HOST` is set), the configured `network.publicHost`.
 *
 * A value is usable when, after trimming whitespace and trailing slashes, it is non-empty
 * and is not a loopback name (`localhost`, `127.0.0.1`, `::1`, compared case-insensitively).
 *
 * @param env Environment to inspect; defaults to `process.env`.
 * @returns The resolved host, or `null` when no non-loopback remote host is known.
 */
function remoteFrontendHostFromEnv(env: NodeJS.ProcessEnv = process.env): string | null {
  const isLoopback = (host: string): boolean => {
    const lowered = host.toLowerCase();
    return lowered === "localhost" || lowered === "127.0.0.1" || lowered === "::1";
  };
  for (const key of ["UNIDESK_MAIN_SERVER_IP", "UNIDESK_MAIN_SERVER_HOST", "CODE_QUEUE_DEV_CONTAINER_MASTER_HOST"]) {
    // Normalize BEFORE filtering so "localhost/" is rejected as loopback and a
    // slash-only value is treated as unset instead of yielding an empty host.
    const value = (env[key] ?? "").trim().replace(/\/+$/u, "");
    if (value.length === 0 || isLoopback(value)) continue;
    return value;
  }
  if (env.CODE_QUEUE_SERVICE_ROLE || env.CODE_QUEUE_INSTANCE_ID || env.KUBERNETES_SERVICE_HOST) {
    // Runner-like environments without an explicit host fall back to the configured public host.
    const publicHost = readConfig().network.publicHost.trim();
    if (publicHost.length > 0 && !isLoopback(publicHost)) return publicHost;
  }
  return null;
}
/**
 * Builds the readonly result reported when neither the local nor the remote control plane
 * could produce a registry observation.
 *
 * @param options       Registry options (provider id, expected endpoint/image).
 * @param action        Readonly action that was attempted.
 * @param localResult   Result of the local probe, kept as evidence.
 * @param remoteResult  Parsed remote registry result, if any was returned.
 * @param remoteCommand Raw remote CLI command result, if a remote attempt was made.
 * @param remoteHost    Remote frontend host that was (or would be) used, or `null`.
 * @returns An `ok: false`, retryable, `infra-blocked` record classified `control-plane-missing`.
 */
function controlPlaneMissingResult(
  options: ArtifactRegistryOptions,
  action: "status" | "health",
  localResult: Record<string, unknown>,
  remoteResult: Record<string, unknown> | null,
  remoteCommand: CommandResult | null,
  remoteHost: string | null,
): Record<string, unknown> {
  const rendered = renderBundle(options);
  // With no known host the retry hint keeps a literal <host> placeholder.
  const retryHost = remoteHost ?? "<host>";
  const controlPlane = {
    preferred: "remote-frontend",
    remoteFallbackAttempted: remoteHost !== null,
    remoteFallbackUsed: false,
    remoteHost,
    localBackendCoreMissing: true,
    classification: "control-plane-missing",
    retryCommand: `bun scripts/cli.ts --main-server-ip ${retryHost} artifact-registry ${action} --provider-id ${options.providerId}`,
  };
  return {
    ok: false,
    readonly: true,
    installed: false,
    healthy: false,
    decision: "infra-blocked",
    retryable: true,
    runnerDisposition: "infra-blocked",
    failureClassification: "control-plane-missing",
    recommendedAction: registryRecommendedAction("control-plane-missing"),
    remoteCommandShape: readonlyRemoteCommandShape(action, options),
    healthyScopes: [],
    failedScopes: providerSshCommandFailureScopes("control-plane-missing"),
    runtimeApiHealthy: false,
    checks: {},
    expected: {
      endpoint: `http://${options.host}:${options.port}`,
      image: options.image,
      paths: rendered.paths,
    },
    controlPlane,
    localObservation: localResult,
    remoteObservation: remoteResult,
    remoteCommand: remoteCommand !== null ? artifactRegistryCommandTail(remoteCommand) : null,
  };
}
/**
 * Parses CLI stdout as JSON and returns its payload record.
 *
 * @param stdout Raw stdout text from a CLI invocation.
 * @returns The `data` member when it is a plain object; otherwise the top-level object itself.
 *          `null` when stdout is blank, is not valid JSON, or is not a plain (non-array) object.
 */
function parsedCliData(stdout: string): Record<string, unknown> | null {
  const isPlainRecord = (candidate: unknown): candidate is Record<string, unknown> =>
    typeof candidate === "object" && candidate !== null && !Array.isArray(candidate);

  if (stdout.trim() === "") return null;

  let decoded: unknown;
  try {
    decoded = JSON.parse(stdout);
  } catch {
    // Non-JSON output is treated as "no data" rather than an error.
    return null;
  }
  if (!isPlainRecord(decoded)) return null;

  const payload = decoded.data;
  return isPlainRecord(payload) ? payload : decoded;
}
/**
 * Extracts the registry result record from a remote CLI payload.
 *
 * @param data Payload record as returned by the remote CLI.
 * @returns `data.result` when it is a plain object; otherwise `data.data.result` when that is a
 *          plain object; otherwise `null`.
 */
function unwrapRemoteArtifactRegistryResult(data: Record<string, unknown>): Record<string, unknown> | null {
  const recordOrNull = (candidate: unknown): Record<string, unknown> | null =>
    typeof candidate === "object" && candidate !== null && !Array.isArray(candidate)
      ? candidate as Record<string, unknown>
      : null;

  // Prefer the direct result; fall back to one extra level of `data` wrapping.
  return recordOrNull(data.result) ?? recordOrNull(recordOrNull(data.data)?.result);
}
/**
 * Decorates a remote registry result obtained as a FALLBACK after the local probe failed.
 *
 * @param result      Registry result returned by the remote frontend.
 * @param localResult Failed local observation, preserved as evidence.
 * @param remoteHost  Remote frontend host that served the result.
 * @param command     Remote CLI command result (summarized into `remoteCommand`).
 * @returns A copy of `result` with `controlPlane`, `localObservation` and `remoteCommand` set
 *          (any same-named keys already in `result` are overwritten).
 */
function annotateRemoteReadonlyResult(
  result: Record<string, unknown>,
  localResult: Record<string, unknown>,
  remoteHost: string,
  command: CommandResult,
): Record<string, unknown> {
  const controlPlane = {
    preferred: "remote-frontend",
    remoteFallbackAttempted: true,
    remoteFallbackUsed: true,
    remoteHost,
    localBackendCoreMissing: true,
    classification: null,
  };
  const remoteCommand = artifactRegistryCommandTail(command);
  return { ...result, controlPlane, localObservation: localResult, remoteCommand };
}
/**
 * Decorates a remote registry result obtained on the REMOTE-FIRST path, i.e. without any
 * local probe having been run (hence `localBackendCoreMissing: null`).
 *
 * NOTE(review): `remoteFallbackUsed` is `true` while `remoteFallbackAttempted` is `false`;
 * confirm consumers expect that combination for the remote-first path.
 *
 * @param result     Registry result returned by the remote frontend.
 * @param remoteHost Remote frontend host that served the result.
 * @param command    Remote CLI command result (summarized into `remoteCommand`).
 * @returns A copy of `result` with `controlPlane` and `remoteCommand` set.
 */
function annotateRemoteFirstReadonlyResult(
  result: Record<string, unknown>,
  remoteHost: string,
  command: CommandResult,
): Record<string, unknown> {
  const controlPlane = {
    preferred: "remote-frontend",
    remoteFallbackAttempted: false,
    remoteFallbackUsed: true,
    remoteFirst: true,
    remoteHost,
    localBackendCoreMissing: null,
    classification: null,
  };
  const remoteCommand = artifactRegistryCommandTail(command);
  return { ...result, controlPlane, remoteCommand };
}
/**
 * Describes, without executing anything, whether a readonly registry action would be routed
 * through the remote frontend for the given environment.
 *
 * @param action  Readonly action being planned.
 * @param options Registry options; only `providerId` is read here.
 * @param env     Environment to inspect; defaults to `process.env`.
 * @returns A plan record: `enabled` is true when a remote host was resolved, `command` holds the
 *          equivalent CLI invocation (or `null`), and `failureClassification` is
 *          `"control-plane-missing"` when no remote host is available.
 */
export function artifactRegistryReadonlyAutoRemotePlan(
  action: "status" | "health",
  options: ArtifactRegistryOptions,
  env: NodeJS.ProcessEnv = process.env,
): Record<string, unknown> {
  const host = remoteFrontendHostFromEnv(env);
  const hasRemote = host !== null;
  const command = hasRemote
    ? `bun scripts/cli.ts --main-server-ip ${host} artifact-registry ${action} --provider-id ${options.providerId}`
    : null;
  return {
    enabled: hasRemote,
    action,
    providerId: options.providerId,
    host,
    transport: "frontend",
    command,
    failureClassification: hasRemote ? null : "control-plane-missing",
  };
}
/**
 * Runs a readonly `status`/`health` check, choosing between the local probe and the remote
 * frontend control plane.
 *
 * Flow:
 *  1. If a remote host is known and the env carries a runner/remote-first marker, try the remote
 *     frontend first and return its result when it succeeds.
 *  2. Otherwise (or when that attempt yields nothing) run the local probe; any outcome other
 *     than `local-docker-required` is returned as-is.
 *  3. On `local-docker-required`, fall back to the remote frontend, or report
 *     `control-plane-missing` when no remote host is known.
 *
 * NOTE(review): when step 1 fails and step 2 yields `local-docker-required`, the remote
 * frontend is invoked a second time in step 3 — confirm the retry is intended.
 *
 * @param options    Registry options.
 * @param healthMode `true` for `health`, `false` for `status`.
 * @param runtime    Optional env/test-hook overrides.
 * @returns The readonly result record for whichever control plane answered.
 */
function runReadonlyStatusWithRemoteFallback(options: ArtifactRegistryOptions, healthMode: boolean, runtime: ArtifactRegistryCommandRuntime = {}): Record<string, unknown> {
  const env = runtime.env ?? process.env;
  const action = healthMode ? "health" : "status";
  const remoteHost = remoteFrontendHostFromEnv(env);

  // Any one of these markers means "prefer the remote frontend before touching local docker".
  const remoteFirstMarkers = [
    env.CODE_QUEUE_SERVICE_ROLE,
    env.CODE_QUEUE_INSTANCE_ID,
    env.KUBERNETES_SERVICE_HOST,
    env.UNIDESK_ARTIFACT_REGISTRY_REMOTE_FIRST,
  ];
  if (remoteHost !== null && remoteFirstMarkers.some(Boolean)) {
    const viaRemoteFirst = runReadonlyStatusViaRemoteFrontend(options, action, remoteHost, null, runtime);
    if (viaRemoteFirst !== null) return viaRemoteFirst;
  }

  const localResult = runReadonlyStatus(options, healthMode, runtime);
  if (localResult.failureClassification !== "local-docker-required") return localResult;
  if (remoteHost === null) {
    return controlPlaneMissingResult(options, action, localResult, null, null, remoteHost);
  }

  return runReadonlyStatusViaRemoteFrontend(options, action, remoteHost, localResult, runtime)
    ?? controlPlaneMissingResult(options, action, localResult, null, null, remoteHost);
}
/**
 * Re-invokes the CLI with `--main-server-ip <remoteHost>` so the readonly action is served by
 * the remote frontend, then interprets its JSON output.
 *
 * @param options     Registry options (`providerId` and `timeoutMs` are forwarded).
 * @param action      Readonly action to run remotely.
 * @param remoteHost  Remote frontend host.
 * @param localResult Prior local observation, or `null` on the remote-first path.
 * @param runtime     Optional overrides; a defined `runCliForTest` replaces the real spawn.
 * @returns On success, the remote result annotated for the fallback or remote-first path.
 *          On failure, a `control-plane-missing` result when a local observation exists,
 *          otherwise `null` so the caller can continue with the local probe.
 */
function runReadonlyStatusViaRemoteFrontend(
  options: ArtifactRegistryOptions,
  action: "status" | "health",
  remoteHost: string,
  localResult: Record<string, unknown> | null,
  runtime: ArtifactRegistryCommandRuntime,
): Record<string, unknown> | null {
  const cliArgs = [
    process.execPath,
    rootPath("scripts", "cli.ts"),
    "--main-server-ip",
    remoteHost,
    "artifact-registry",
    action,
    "--provider-id",
    options.providerId,
    "--timeout-ms",
    String(options.timeoutMs),
  ];
  // Outer timeout: 15s on top of the inner probe timeout, and never below 45s.
  const cliTimeoutMs = Math.max(options.timeoutMs + 15_000, 45_000);
  const cliRun = runtime.runCliForTest !== undefined
    ? runtime.runCliForTest(cliArgs, cliTimeoutMs)
    : runCommand(cliArgs, repoRoot, { timeoutMs: cliTimeoutMs });

  const payload = parsedCliData(cliRun.stdout);
  const registryResult = payload === null ? null : unwrapRemoteArtifactRegistryResult(payload);

  // Failure: non-zero exit, timeout, or no parsable registry result in the output.
  if (cliRun.exitCode !== 0 || cliRun.timedOut || registryResult === null) {
    return localResult === null
      ? null
      : controlPlaneMissingResult(options, action, localResult, registryResult, cliRun, remoteHost);
  }

  return localResult === null
    ? annotateRemoteFirstReadonlyResult(registryResult, remoteHost, cliRun)
    : annotateRemoteReadonlyResult(registryResult, localResult, remoteHost, cliRun);
}
function statusFromValues(options: ArtifactRegistryOptions, values: Record<string, string>, command: CommandResult, healthMode: boolean): Record<string, unknown> {
const commandOk = command.exitCode === 0 && !command.timedOut;
const bundle = renderBundle(options);
@@ -1466,10 +1688,10 @@ function statusFromValues(options: ArtifactRegistryOptions, values: Record<strin
};
}
function runReadonlyStatus(options: ArtifactRegistryOptions, healthMode: boolean): Record<string, unknown> {
function runReadonlyStatus(options: ArtifactRegistryOptions, healthMode: boolean, runtime: ArtifactRegistryCommandRuntime = {}): Record<string, unknown> {
const bundle = renderBundle(options);
const script = statusScript(options, bundle);
const result = runRemoteScript(options, script);
const result = runRemoteScript(options, script, options.timeoutMs, runtime);
if (result.exitCode !== 0 || result.timedOut) {
return readonlyCommandFailureResult(options, result, healthMode ? "health" : "status");
}
@@ -2819,7 +3041,7 @@ function localHelp(): Record<string, unknown> {
};
}
export async function runArtifactRegistryCommand(args: string[]): Promise<unknown> {
export async function runArtifactRegistryCommand(args: string[], runtime: ArtifactRegistryCommandRuntime = {}): Promise<unknown> {
const action = args[0];
if (isHelpArg(action)) return localHelp();
if (action !== "plan" && action !== "render" && action !== "status" && action !== "health" && action !== "install" && action !== "deploy-backend-core" && action !== "deploy-service") {
@@ -2829,8 +3051,8 @@ export async function runArtifactRegistryCommand(args: string[]): Promise<unknow
const options = parseArtifactRegistryOptions(args.slice(1));
if (action === "plan") return plan(options);
if (action === "render") return { ok: true, providerId: options.providerId, render: renderBundle(options) };
if (action === "status") return runReadonlyStatus(options, false);
if (action === "health") return runReadonlyStatus(options, true);
if (action === "status") return runReadonlyStatusWithRemoteFallback(options, false, runtime);
if (action === "health") return runReadonlyStatusWithRemoteFallback(options, true, runtime);
if (action === "install") {
return options.dryRun ? installDryRun(options) : install(options);
}
+11 -1
View File
@@ -692,6 +692,8 @@ async function remoteArtifactRegistry(session: FrontendSession, args: string[]):
const command = ["frontend", "/api/dispatch", probe.providerId, "host.ssh", action];
const result = commandResultFromFrontendTask(command, dispatched.task);
const registryResult = artifactRegistryReadonlyResultFromCommand(probe, result);
const dispatchBody = dispatched.dispatch.body as Record<string, unknown> | null | undefined;
const dispatchError = typeof dispatchBody?.error === "string" ? dispatchBody.error : null;
return {
transport: "frontend",
readonly: true,
@@ -706,13 +708,21 @@ async function remoteArtifactRegistry(session: FrontendSession, args: string[]):
decision: "infra-blocked",
retryable: true,
runnerDisposition: "infra-blocked",
failureClassification: "control-plane-missing",
recommendedAction: "restore the remote frontend/backend-core dispatch control plane, then rerun artifact-registry status|health",
healthyScopes: [],
failedScopes: ["backend-core-api"],
failedScopes: ["control-plane-missing", "backend-core-api"],
runtimeApiHealthy: false,
channels: [
{ channel: "backend-core-api", ok: false, requiredFor: "frontend /api/dispatch backend-core session creation", detail: dispatched.dispatch },
{ channel: "provider-dispatch", ok: false, requiredFor: "host.ssh task creation", detail: dispatched.dispatch },
],
controlPlane: {
transport: "remote-frontend",
failureClassification: "control-plane-missing",
dispatchStatus: dispatched.dispatch.status ?? null,
dispatchError,
},
registry: registryResult,
}
: registryResult,