fix(cicd): add branch follower refresh gate

This commit is contained in:
Codex
2026-07-04 11:30:54 +00:00
parent 09d7633f3d
commit 08d5e91c5e
5 changed files with 191 additions and 5 deletions
@@ -18,6 +18,7 @@ bun scripts/cli.ts cicd branch-follower debug-step --follower <id> --step decide
bun scripts/cli.ts cicd branch-follower debug-step --follower <id> --step state-write --confirm
bun scripts/cli.ts cicd branch-follower events --follower <id>
bun scripts/cli.ts cicd branch-follower logs --follower <id>
bun scripts/cli.ts cicd branch-follower gate --follower hwlab-jd01-v03 --gate control-plane-refresh --source-commit <sha> --confirm --json
```
`apply --confirm --wait` is the one-command deploy/update entry for the K8s controller. `status` is the default intermediate-state query. `status --live` and local `run-once` submit a bounded K8s reconcile Job; the Job performs all source, Tekton, Argo and runtime reads inside the cluster and may write only the compact state summary. `events` and `logs` are read-only drill-downs for the same Kubernetes-native state. `run-once --confirm --wait` is the manual one-command trigger and closeout path.
@@ -36,7 +37,7 @@ When a branch-follower issue remains ambiguous after a debug step or drill-down,
For HWLAB native `control-plane-refresh`, the bounded evidence chain must preserve both the rendered Pipeline summary and the applied cluster object summary for the same source commit: rendered Pipeline name, bounded `runtime-ready` task/when summary, source commit/stage ref, applied Pipeline name, resourceVersion, and a short annotation/label subset proving which object was patched. If the Job TTL has already removed the original Job, status/events/logs must show `-` or a bounded missing reason from stored state instead of inferring the missing edge.
CI/CD validation must be decomposable into ordered single-step gates before a full rollout observation is accepted: first validate the reuse plan, then CI parallelism/TaskRun plan, then CD rollout plan, then post-deploy monitoring/health evidence. "Single-step" means an independently triggerable and independently executable target-side CLI/debug-step/drill-down entry, not a passive observation extracted from one end-to-end follower run. Each gate must be runnable against a selected follower/source snapshot, must emit bounded evidence, and must be retryable/fixable without creating a new source PR or replaying the full follower loop. Do not use issue comments, repeated PR merges, or end-to-end follower loops as substitutes for a missing single-step validator; add the missing bounded CLI step first.
CI/CD validation must be decomposable into ordered single-step gates before a full rollout observation is accepted: first validate the reuse plan, then CI parallelism/TaskRun plan, then CD rollout plan, then post-deploy monitoring/health evidence. "Single-step" means an independently triggerable and independently executable target-side CLI/debug-step/drill-down entry, not a passive observation extracted from one end-to-end follower run. Each gate must be runnable against a selected follower/source snapshot, must emit bounded evidence, and must be retryable/fixable without creating a new source PR or replaying the full follower loop. For HWLAB Pipeline render changes, `gate --gate control-plane-refresh --source-commit <sha> --confirm` is the independently triggerable native refresh gate. Do not use issue comments, repeated PR merges, or end-to-end follower loops as substitutes for a missing single-step validator; add the missing bounded CLI step first.
PRs that change branch-follower convergence, reuse, Tekton/Argo closeout, runtime readiness or gate visibility must be submitted only after the author has run the affected independently triggerable single-step gates on the target NODE/k8s and captured bounded pass evidence. If a required gate cannot be triggered independently or does not pass, do not open the PR as a validation vehicle; leave a short issue comment with the missing gate, target object names and next minimal fix scope, then fix the gate first.
@@ -169,6 +170,8 @@ The controller automatic loop submits trigger work without a blocking wait; late
State ConfigMaps must stay bounded and human-queryable. Store compact summaries, stage refs, conditions, short messages, and drill-down object names; do not store full API payloads or long log dumps. Cleanup is an explicit operator operation for stale/broken state and must not be required for normal convergence.
When retesting the same source sha after fixing controller/render inputs, `cleanup-state` only deletes the stored follower state. It does not delete deterministic native objects such as an existing PipelineRun, and decision logic may still treat that sha as already triggered. Do not loop on `cleanup-state` plus `run-once`; use an independently triggerable gate such as `gate --gate control-plane-refresh`, or explicitly rerun/clean up the native object, then re-read status.
Status readers must compute near the data. When the operator CLI reaches a target node or k8s route through `trans`, the target NODE/k8s side must parse ConfigMap values, Kubernetes objects and log/event lists locally, then return only the bounded follower summary, timing rows, object names, counts and short tails needed by the CLI. Do not transmit complete ConfigMap entries, full API objects or long logs back to the host just so host-side TypeScript can parse and trim them.
Operator transport timing warnings such as `UNIDESK_SSH_TIMING` measure CLI/trans latency, not branch-follower CI/CD stage time or end-to-end convergence time. Do not mix those warnings into `timings.totalSeconds`, stage rows, or performance closeout evidence; when transport cost becomes noisy, reduce round trips by adding a target-side debug/status summary instead of pulling more raw output to the host.
+2 -2
View File
@@ -213,8 +213,8 @@ function debugStepOption(value: string): BranchFollowerDebugStep {
}
/**
 * Validates the raw `--gate` CLI value and narrows it to a known gate name.
 *
 * The earlier four-gate check/throw pair ran first and made the
 * `control-plane-refresh` branch unreachable; only the current five-gate check remains.
 *
 * @param value - raw string supplied to `--gate`.
 * @returns the same string, narrowed to a supported gate name.
 * @throws Error when `value` is not one of the supported gate names.
 */
function gateOption(value: string): BranchFollowerGate {
  switch (value) {
    case "reuse-plan":
    case "ci-taskrun-plan":
    case "cd-rollout-plan":
    case "post-deploy-health":
    case "control-plane-refresh":
      return value;
    default:
      throw new Error("--gate must be reuse-plan, ci-taskrun-plan, cd-rollout-plan, post-deploy-health, or control-plane-refresh");
  }
}
function isInClusterRuntime(): boolean {
+183 -1
View File
@@ -2,6 +2,7 @@
// Responsibility: submit bounded target-side gate Jobs and return compact evidence.
import type { CommandResult } from "./command";
import { resolveAgentRunLaneTarget } from "./agentrun-lanes";
import { runNativeHwlabControlPlaneRefresh } from "./cicd-hwlab-refresh";
import { nativeCicdScriptLoadShell } from "./cicd-native-bundle";
import { waitForJobShell } from "./cicd-controller-render";
import type { BranchFollowerRegistry, FollowerSpec, ParsedOptions } from "./cicd-types";
@@ -11,7 +12,12 @@ import { shQuote, redactText } from "./platform-infra-ops-library";
type KubeScriptRunner = (registry: BranchFollowerRegistry, options: ParsedOptions, script: string, input: string, timeoutMs: number) => CommandResult;
export async function runBranchFollowerGate(registry: BranchFollowerRegistry, follower: FollowerSpec, options: ParsedOptions, runKubeScript: KubeScriptRunner): Promise<Record<string, unknown>> {
if (options.gate === null) throw new Error("gate requires --gate <reuse-plan|ci-taskrun-plan|cd-rollout-plan|post-deploy-health>");
if (options.gate === null) throw new Error("gate requires --gate <reuse-plan|ci-taskrun-plan|cd-rollout-plan|post-deploy-health|control-plane-refresh>");
if (options.gate === "control-plane-refresh") {
return options.inCluster
? runControlPlaneRefreshGate(registry, follower, options)
: runTargetControlPlaneRefreshGateJob(registry, follower, options, runKubeScript);
}
if (options.inCluster) return { ok: false, action: "gate", gate: options.gate, follower: follower.id, degradedReason: "operator-entry-required" };
const timeoutSeconds = options.timeoutSeconds ?? follower.budgets.statusSeconds;
const jobName = `bf-gate-${safeName(follower.id)}-${safeName(options.gate)}-${Date.now().toString(36)}`.slice(0, 63);
@@ -50,6 +56,177 @@ export async function runBranchFollowerGate(registry: BranchFollowerRegistry, fo
};
}
/**
 * Operator-side entry for the `control-plane-refresh` gate: submits a one-shot
 * Kubernetes Job that re-runs the same gate in-cluster, then waits for it.
 *
 * Requests that cannot run for real (follower adapter is not `hwlab-node-runtime`,
 * no `--source-commit`, or no `--confirm`) are delegated to
 * `runControlPlaneRefreshGate`, which produces the degraded or dry-run record
 * without submitting a Job.
 *
 * @param registry - controller registry; supplies namespace, field manager and budgets.
 * @param follower - follower whose refresh gate is being triggered.
 * @param options - parsed CLI options (`sourceCommit`, `confirm`, `timeoutSeconds`, `gate`).
 * @param runKubeScript - runner that executes the generated shell script and returns its result.
 * @returns a compact gate record: the Job target, the first JSON object parsed from
 *   stdout (when the command succeeded), and command diagnostics. Redacted
 *   stdout/stderr tails are included only when the gate did not pass.
 */
function runTargetControlPlaneRefreshGateJob(registry: BranchFollowerRegistry, follower: FollowerSpec, options: ParsedOptions, runKubeScript: KubeScriptRunner): Record<string, unknown> {
  if (follower.adapter !== "hwlab-node-runtime" || options.sourceCommit === null || !options.confirm) {
    return runControlPlaneRefreshGate(registry, follower, options);
  }
  const timeoutSeconds = options.timeoutSeconds ?? follower.budgets.controlPlaneRefreshSeconds;
  const jobName = `bf-gate-${safeName(follower.id)}-control-refresh-${Date.now().toString(36)}`.slice(0, 63);
  const manifest = controllerGateJobManifest(registry, follower, options, jobName, timeoutSeconds);
  const manifestYaml = `${Bun.YAML.stringify(manifest).trim()}\n`;
  const script = [
    "set -eu",
    "tmp=$(mktemp)",
    // Remove the manifest temp file on every exit path (success, failure or timeout kill of the shell).
    "trap 'rm -f \"$tmp\"' EXIT",
    // The manifest travels base64-encoded inside a quoted heredoc so its content is never shell-interpreted.
    "base64 -d >\"$tmp\" <<'UNIDESK_CONTROL_PLANE_REFRESH_GATE_JOB'",
    Buffer.from(manifestYaml, "utf8").toString("base64"),
    "UNIDESK_CONTROL_PLANE_REFRESH_GATE_JOB",
    // Best-effort removal of a same-named Job before applying the new one.
    `kubectl -n ${shQuote(registry.controller.namespace)} delete job ${shQuote(jobName)} --ignore-not-found=true >/dev/null 2>&1 || true`,
    `kubectl apply --server-side --force-conflicts --field-manager=${shQuote(registry.controller.fieldManager)} -f "$tmp" >/dev/null`,
    waitForJobShell(registry.controller.namespace, jobName, timeoutSeconds),
  ].join("\n");
  const startedAt = Date.now();
  // The transport gets extra grace on top of the Job timeout.
  const command = runKubeScript(registry, options, script, "", (timeoutSeconds + registry.controller.budgets.reconcileTransportGraceSeconds) * 1000);
  // stdout is only parsed when the command itself succeeded.
  const parseAttempted = command.exitCode === 0;
  const parsed = parseAttempted ? parseFirstJsonObject(command.stdout) : null;
  const ok = parseAttempted && parsed !== null && parsed.ok !== false;
  return {
    ok,
    action: "gate",
    gate: options.gate,
    follower: follower.id,
    target: { name: jobName, namespace: registry.controller.namespace, execution: "k8s-native-gate-job" },
    result: parsed,
    command: {
      exitCode: command.exitCode,
      timedOut: command.timedOut,
      elapsedMs: Date.now() - startedAt,
      // Report a parse failure only when parsing actually ran; a failed command is
      // already described by exitCode/timedOut and the tails below.
      parseError: parseAttempted && parsed === null ? "stdout-json-parse-failed" : null,
      stdoutTail: ok ? "" : redactText(tailText(command.stdout, 1600)),
      stderrTail: ok ? "" : redactText(tailText(command.stderr, 1200)),
    },
    parsedDownstreamCliOutput: false,
  };
}
/**
 * Runs (or dry-runs) the native HWLAB control-plane refresh for one source commit.
 *
 * Outcomes, in order of evaluation:
 *  - follower adapter is not `hwlab-node-runtime` -> `unsupported-follower-adapter`
 *  - no `--source-commit`                        -> `source-commit-required`
 *  - no `--confirm`                              -> dry-run record naming the Job that would run
 *  - otherwise the refresh is executed via `runNativeHwlabControlPlaneRefresh`
 *    and its result is returned together with follow-up drill-down commands.
 *
 * @param registry - controller registry; its namespace is reported for the dry-run target.
 * @param follower - follower the gate applies to.
 * @param options - parsed CLI options (`gate`, `sourceCommit`, `confirm`, `timeoutSeconds`).
 * @returns a compact gate record; `ok` mirrors the refresh result when the gate actually ran.
 */
function runControlPlaneRefreshGate(registry: BranchFollowerRegistry, follower: FollowerSpec, options: ParsedOptions): Record<string, unknown> {
  // Fields shared by every record this gate returns.
  const identity = { action: "gate", gate: options.gate, follower: follower.id };
  const refuse = (degradedReason: string, message: string): Record<string, unknown> => ({
    ok: false,
    ...identity,
    degradedReason,
    message,
    parsedDownstreamCliOutput: false,
  });

  if (follower.adapter !== "hwlab-node-runtime") {
    return refuse("unsupported-follower-adapter", "control-plane-refresh gate is only available for hwlab-node-runtime followers");
  }
  const sourceCommit = options.sourceCommit;
  if (sourceCommit === null) {
    return refuse("source-commit-required", "control-plane-refresh gate requires --source-commit <sha>");
  }

  // The lane spec is resolved before the confirm check, so lane resolution also runs on dry-runs.
  const laneSpec = hwlabRuntimeLaneSpecForNode(follower.target.lane, follower.target.node);
  const timeoutSeconds = options.timeoutSeconds ?? follower.budgets.controlPlaneRefreshSeconds;
  const jobName = nativeCapabilityJobName(follower.id, "control-plane-refresh", sourceCommit);
  const execution = "k8s-native-control-plane-refresh";

  if (!options.confirm) {
    return {
      ok: true,
      ...identity,
      dryRun: true,
      target: { name: jobName, namespace: registry.controller.namespace, execution },
      sourceCommit,
      message: "add --confirm to run the native control-plane refresh gate",
      parsedDownstreamCliOutput: false,
    };
  }

  const startedAt = Date.now();
  const refresh = runNativeHwlabControlPlaneRefresh(registry, follower, laneSpec, sourceCommit, timeoutSeconds, jobName);
  const elapsedMs = Date.now() - startedAt;
  const cli = "bun scripts/cli.ts cicd branch-follower";
  return {
    ok: refresh.result.ok,
    ...identity,
    dryRun: false,
    sourceCommit,
    target: { name: refresh.jobName, namespace: refresh.namespace, execution },
    result: refresh.result,
    command: { elapsedMs, timeoutSeconds },
    parsedDownstreamCliOutput: false,
    // Ready-to-paste drill-down commands for the operator.
    next: {
      statusRead: `${cli} debug-step --follower ${follower.id} --step status-read --json`,
      job: `${cli} job --follower ${follower.id} --source-commit ${sourceCommit} --job control-plane-refresh --json`,
    },
  };
}
/**
 * Builds the `batch/v1` Job manifest that re-invokes the `control-plane-refresh`
 * gate with `--in-cluster --confirm` through the controller one-shot script.
 *
 * Image, service account, labels, volumes and source/SSH environment are all
 * taken from `registry.controller`.
 *
 * @param registry - controller registry providing image, namespace, budgets and source settings.
 * @param follower - follower whose id is passed to the in-cluster CLI.
 * @param options - parsed CLI options; only `sourceCommit` is read (empty string when null).
 * @param jobName - name given to the Job object.
 * @param timeoutSeconds - gate timeout forwarded to the CLI; the Job deadline adds the
 *   configured deadline grace on top of it.
 * @returns the Job manifest as a plain object.
 */
function controllerGateJobManifest(registry: BranchFollowerRegistry, follower: FollowerSpec, options: ParsedOptions, jobName: string, timeoutSeconds: number): Record<string, unknown> {
  const { controller } = registry;
  const { budgets, source } = controller;
  const ssh = source.githubSsh;
  const labels = { ...controller.labels, "app.kubernetes.io/component": "cicd-gate-job" };
  const registryMount = "/etc/unidesk-cicd-branch-follower";

  // Arguments handed to the one-shot script: the gate CLI invocation to run in-cluster.
  const cliArgs = [
    "bun", "scripts/cli.ts", "cicd", "branch-follower", "gate",
    "--follower", follower.id,
    "--gate", "control-plane-refresh",
    "--source-commit", options.sourceCommit ?? "",
    "--confirm",
    "--in-cluster",
    "--config", "config/cicd-branch-followers.yaml",
    "--timeout-seconds", String(timeoutSeconds),
    "--json",
  ];

  // Controller source settings exposed to the container, as [name, value] pairs.
  const envPairs: Array<[string, string]> = [
    ["UNIDESK_CONTROLLER_SOURCE_BRANCH", source.branch],
    ["UNIDESK_CONTROLLER_SOURCE_REPOSITORY", source.repository],
    ["UNIDESK_CONTROLLER_SOURCE_SNAPSHOT_PREFIX", source.sourceSnapshot.stageRefPrefix.replaceAll("{branch}", source.branch)],
    ["UNIDESK_CONTROLLER_GITHUB_SSH_PRIVATE_KEY", `/git-ssh/${ssh.privateKeySecretKey}`],
    ["UNIDESK_CONTROLLER_GITHUB_PROXY_HOST", ssh.proxyHost],
    ["UNIDESK_CONTROLLER_GITHUB_PROXY_PORT", String(ssh.proxyPort)],
  ];

  const volumes = [
    { name: "registry", configMap: { name: controller.configMapName, defaultMode: 0o755 } },
    { name: "git-mirror-cache", persistentVolumeClaim: { claimName: source.gitMirrorCachePvcName } },
    { name: "git-ssh", secret: { secretName: ssh.secretName, defaultMode: 0o400 } },
    { name: "work", emptyDir: {} },
  ];

  const gateContainer = {
    name: "gate",
    image: controller.image,
    imagePullPolicy: "IfNotPresent",
    command: ["/bin/sh", `${registryMount}/controller-one-shot.sh`],
    args: cliArgs,
    env: envPairs.map(([name, value]) => ({ name, value })),
    volumeMounts: [
      { name: "registry", mountPath: registryMount, readOnly: true },
      { name: "git-mirror-cache", mountPath: "/cache" },
      { name: "git-ssh", mountPath: "/git-ssh", readOnly: true },
      { name: "work", mountPath: "/work" },
    ],
  };

  return {
    apiVersion: "batch/v1",
    kind: "Job",
    metadata: { name: jobName, namespace: controller.namespace, labels },
    spec: {
      backoffLimit: budgets.reconcileJobBackoffLimit,
      ttlSecondsAfterFinished: budgets.reconcileJobTtlSeconds,
      activeDeadlineSeconds: timeoutSeconds + budgets.reconcileJobDeadlineGraceSeconds,
      template: {
        metadata: { labels },
        spec: {
          restartPolicy: "Never",
          serviceAccountName: controller.serviceAccountName,
          volumes,
          containers: [gateContainer],
        },
      },
    },
  };
}
function gateJobManifest(registry: BranchFollowerRegistry, follower: FollowerSpec, options: ParsedOptions, jobName: string, timeoutSeconds: number): Record<string, unknown> {
const labels = { ...registry.controller.labels, "app.kubernetes.io/component": "cicd-gate-job" };
const agentrun = follower.adapter === "agentrun-yaml-lane" ? resolveAgentRunLaneTarget({ node: follower.target.node, lane: follower.target.lane }).spec : null;
@@ -124,6 +301,11 @@ function gateJobManifest(registry: BranchFollowerRegistry, follower: FollowerSpe
};
}
/**
 * Derives the deterministic Job name for a native capability run:
 * `<follower>-<action>-<first 12 chars of sha>`, with runs of hyphens collapsed,
 * edge hyphens removed, and the result capped at 63 characters.
 *
 * Truncating to 63 characters can cut the name right after a separator, so the
 * trailing hyphen is stripped again after the cut. Without that, a long follower
 * id could yield a name ending in `-`, which is not a valid Kubernetes object name.
 *
 * @param followerId - follower identifier, normalised through `safeName`.
 * @param action - capability/action name, normalised through `safeName`.
 * @param sha - source commit; only its first 12 characters are used.
 * @returns a name of at most 63 characters that does not start or end with a hyphen.
 */
function nativeCapabilityJobName(followerId: string, action: string, sha: string): string {
  const prefix = `${safeName(followerId)}-${safeName(action)}`;
  return `${prefix}-${sha.slice(0, 12)}`
    .replace(/-+/gu, "-")
    .replace(/^-|-$/gu, "")
    .slice(0, 63)
    // Hyphens were collapsed above, so at most one can remain at the cut point.
    .replace(/-$/u, "");
}
function gateHealthUrl(follower: FollowerSpec): string {
if (follower.adapter === "agentrun-yaml-lane") {
return resolveAgentRunLaneTarget({ node: follower.target.node, lane: follower.target.lane }).spec.runtime.internalBaseUrl;
+1
View File
@@ -20,6 +20,7 @@ export function buildCicdHelp(configPath: string, spec: string): unknown {
"bun scripts/cli.ts cicd branch-follower logs --follower web-probe-sentinel-master",
"bun scripts/cli.ts cicd branch-follower taskrun --follower hwlab-jd01-v03 --taskrun runtime-ready --logs-tail 120 --json",
"bun scripts/cli.ts cicd branch-follower job --follower agentrun-jd01-v02 --source-commit <sha> --job image-build --json",
"bun scripts/cli.ts cicd branch-follower gate --follower hwlab-jd01-v03 --gate control-plane-refresh --source-commit <sha> --confirm --json",
"bun scripts/cli.ts cicd branch-follower gate --follower agentrun-jd01-v02 --gate reuse-plan --source-commit <sha> --json",
],
config: configPath,
+1 -1
View File
@@ -4,7 +4,7 @@
export type OutputMode = "human" | "json" | "yaml";
export type BranchFollowerAction = "help" | "plan" | "apply" | "status" | "run-once" | "debug-step" | "cleanup-state" | "events" | "logs" | "taskrun" | "job" | "runtime" | "gate";
export type BranchFollowerDebugStep = "state-read" | "controller-source" | "status-read" | "decide" | "state-write";
export type BranchFollowerGate = "reuse-plan" | "ci-taskrun-plan" | "cd-rollout-plan" | "post-deploy-health";
export type BranchFollowerGate = "reuse-plan" | "ci-taskrun-plan" | "cd-rollout-plan" | "post-deploy-health" | "control-plane-refresh";
export type BranchFollowerPhase =
| "Observed"
| "Noop"