fix: improve sentinel publish recovery diagnostics

This commit is contained in:
Codex
2026-07-02 01:04:19 +00:00
parent b3a24e3f79
commit 7ca5d6be33
+212 -12
View File
@@ -296,6 +296,7 @@ function runSentinelControlPlane(state: SentinelCicdState, options: Extract<WebP
const observedReady = options.action !== "status" || sentinelObservedReady(record(observed));
const observedWarnings = options.action === "status" ? sentinelObservedWarnings(record(observed)) : [];
const pipelineRun = sentinelPipelineRunName(state, options.rerun);
const statusDiagnosis = options.action === "status" && !observedReady ? sentinelObservedStatusDiagnosis(state, observed, pipelineRun) : null;
const result = {
ok: state.configReady && state.sourceHead.ok && observedReady,
command,
@@ -344,8 +345,12 @@ function runSentinelControlPlane(state: SentinelCicdState, options: Extract<WebP
cicd: state.cicd,
}),
observed,
warnings: observedWarnings,
blocker: null,
statusDiagnosis,
warnings: mergeWarnings(observedWarnings, record(statusDiagnosis).warning),
blocker: observedReady
? null
: record(statusDiagnosis).blocker ?? { code: "sentinel-control-plane-observed-not-ready", reason: "one or more source, registry, GitOps, Argo, runtime or cadence checks did not pass", valuesRedacted: true },
recoveryNext: record(statusDiagnosis).recoveryNext ?? null,
next: controlPlaneNext(state, options.action),
valuesRedacted: true,
};
@@ -390,6 +395,24 @@ function runSentinelPublishCurrent(state: SentinelCicdState, options: Extract<We
}
/**
 * Runs the confirmed publish-current flow with an interrupt handler installed around it.
 *
 * A mutable context record (initial phase "starting") is shared between the interrupt handler and
 * the inner implementation, so the handler can report whatever phase the inner flow last recorded.
 * The handler is always uninstalled, whether the inner flow returns or throws.
 *
 * @param state Sentinel CI/CD state for the selected node/lane.
 * @param options Publish options; `options.rerun` feeds the PipelineRun name.
 * @returns The rendered CLI result produced by the inner publish flow.
 */
function runSentinelPublishCurrentConfirmed(state: SentinelCicdState, options: Extract<WebProbeSentinelOptions, { kind: "publish" }>): RenderedCliResult {
  const pipelineRun = sentinelPipelineRunName(state, options.rerun);
  // Same object instance goes to both the handler and the inner flow: phase updates must be visible to the handler.
  const interruptContext: Record<string, unknown> = {
    phase: "starting",
    pipelineRun,
    sourceCommit: state.sourceHead.commit,
    node: state.spec.nodeId,
    lane: state.spec.lane,
    sentinelId: state.sentinelId,
    valuesRedacted: true,
  };
  const removeInterruptHandler = installSentinelPublishInterruptHandler(state, interruptContext);
  try {
    const outcome = runSentinelPublishCurrentConfirmedInner(state, options, interruptContext);
    return outcome;
  } finally {
    removeInterruptHandler();
  }
}
function runSentinelPublishCurrentConfirmedInner(state: SentinelCicdState, options: Extract<WebProbeSentinelOptions, { kind: "publish" }>, interruptContext: Record<string, unknown>): RenderedCliResult {
const startedAt = Date.now();
const command = "web-probe sentinel publish-current";
const budget = publishCurrentBudget(state);
@@ -397,11 +420,22 @@ function runSentinelPublishCurrentConfirmed(state: SentinelCicdState, options: E
const deadline = startedAt + budgetSeconds * 1000;
const remainingBudgetSeconds = () => strictRemainingSeconds(deadline, budgetSeconds);
let controlResult: Record<string, unknown> | null = null;
sentinelProgressEvent("sentinel.publish.progress", {
phase: "publish-current-start",
status: "running",
pipelineRun: interruptContext.pipelineRun,
sourceCommit: state.sourceHead.commit,
node: state.spec.nodeId,
lane: state.spec.lane,
sentinelId: state.sentinelId,
});
if (state.configReady && state.sourceHead.ok && remainingBudgetSeconds() >= 5) {
interruptContext.phase = "already-current-registry-probe";
const registryProbe = probeImageRegistry(state, Math.max(1, Math.min(remainingBudgetSeconds(), 5)));
if (record(record(registryProbe).probe).present === true && remainingBudgetSeconds() >= 5) {
const preflightStartedAt = Date.now();
const preflightTimeoutSeconds = Math.max(1, Math.min(remainingBudgetSeconds(), 10));
interruptContext.phase = "already-current-observed-probe";
const preflightObserved = withObservedWait(
collectSentinelObservedStatus(state, preflightTimeoutSeconds, undefined, true),
preflightStartedAt,
@@ -413,6 +447,7 @@ function runSentinelPublishCurrentConfirmed(state: SentinelCicdState, options: E
}
}
}
interruptContext.phase = "control-plane-trigger-current";
controlResult ??= sentinelControlPlaneConfirmedResult(state, {
kind: "control-plane",
action: "trigger-current",
@@ -432,6 +467,7 @@ function runSentinelPublishCurrentConfirmed(state: SentinelCicdState, options: E
} else if (remainingBudgetSeconds() < 2) {
health = { ok: false, skipped: true, reason: "end-to-end-budget-exhausted-before-health", valuesRedacted: true };
} else {
interruptContext.phase = "health-endpoint-validation";
const healthStartedAt = Date.now();
health = probeSentinelRuntimeHealthEndpoint(state, remainingBudgetSeconds());
healthElapsedMs = Date.now() - healthStartedAt;
@@ -470,6 +506,7 @@ function runSentinelPublishCurrentConfirmed(state: SentinelCicdState, options: E
next: publishCurrentNext(state),
valuesRedacted: true,
};
interruptContext.phase = "completed";
return rendered(ok, command, renderPublishCurrentResult(result));
}
@@ -1045,6 +1082,7 @@ function publishCurrentNext(state: SentinelCicdState): Record<string, string> {
controlPlaneStatus: `bun scripts/cli.ts web-probe sentinel control-plane status --node ${node} --lane ${lane}${suffix}`,
dashboardVerify: `bun scripts/cli.ts web-probe sentinel dashboard verify --node ${node} --lane ${lane}${suffix}`,
gitMirrorStatus: `bun scripts/cli.ts hwlab nodes git-mirror status --node ${node} --lane ${lane}`,
gitMirrorSync: `bun scripts/cli.ts hwlab nodes git-mirror sync --node ${node} --lane ${lane} --confirm`,
gitMirrorFlush: `bun scripts/cli.ts hwlab nodes git-mirror flush --node ${node} --lane ${lane} --confirm --wait`,
};
}
@@ -1901,6 +1939,92 @@ function sentinelObservedWarnings(value: Record<string, unknown> | SentinelObser
return mergeWarnings(argo.warning, cadence.warning);
}
/**
 * Builds a diagnosis for an observed control-plane status that is not ready.
 *
 * Returns `null` when `sentinelObservedReady` accepts the observed record. Otherwise the per-stage
 * flags read from the observed record are mapped to exactly one code, checked in this order:
 *   1. source mirror ok but registry tag missing,
 *   2. registry tag present but GitOps not ok,
 *   3. GitOps ok but Argo or runtime not ok,
 *   4. GitOps, Argo and runtime ok but git-mirror neither ok nor skipped,
 *   5. GitOps, Argo and runtime ok but cadence not ok,
 *   6. generic "observed not ready".
 *
 * The git-mirror and cadence codes carry reason/phase/warning text stating that the runtime is
 * aligned, so they are only selected when GitOps, Argo and runtime all report ok. Any other
 * fall-through (for example source, registry and GitOps all not ready) gets the generic code,
 * whose reason makes no claim about runtime alignment.
 *
 * @param state Sentinel CI/CD state; used for the source commit, image tag and follow-up commands.
 * @param value Observed status; coerced with `record`.
 * @param pipelineRun PipelineRun name echoed into the diagnosis and the recovery hints.
 * @returns A record with `code`, `phase`, `reason`, per-stage summary strings, an optional
 *          `warning`, a `blocker` and `recoveryNext`, or `null` when the observed status is ready.
 */
function sentinelObservedStatusDiagnosis(state: SentinelCicdState, value: unknown, pipelineRun: string): Record<string, unknown> | null {
  const observed = record(value);
  if (sentinelObservedReady(observed)) return null;

  const sourceMirror = record(observed.sourceMirror);
  const sourceProbe = record(sourceMirror.probe);
  const registryProbe = record(record(observed.registry).probe);
  const gitMirror = record(observed.gitMirror);
  const gitops = record(observed.gitops);
  const argo = record(observed.argo);
  const runtimeDeployment = record(record(record(observed.runtime).probe).deployment);
  const cadence = record(observed.cadence);

  const sourceReady = sourceMirror.ok === true;
  const registryPresent = registryProbe.present === true;
  // A skipped git-mirror check counts as ready.
  const gitMirrorReady = gitMirror.skipped === true || gitMirror.ok === true;
  const gitopsReady = gitops.ok === true;
  const argoReady = argo.ok === true;
  const runtimeReady = record(observed.runtime).ok === true;
  const cadenceReady = cadence.ok === true;
  // Guard for the two "runtime is aligned, but ..." codes so their text is never emitted falsely.
  const runtimeAligned = gitopsReady && argoReady && runtimeReady;

  const code = sourceReady && !registryPresent
    ? "sentinel-publish-half-state-registry-missing"
    : registryPresent && !gitopsReady
      ? "sentinel-gitops-manifest-not-updated"
      : gitopsReady && (!argoReady || !runtimeReady)
        ? "sentinel-runtime-not-aligned"
        : runtimeAligned && !gitMirrorReady
          ? "sentinel-git-mirror-not-in-sync"
          : runtimeAligned && !cadenceReady
            ? "sentinel-cadence-not-ready"
            : "sentinel-control-plane-observed-not-ready";

  // Single source of truth for the phase label and human-readable reason of each code.
  const details: Record<typeof code, { phase: string; reason: string }> = {
    "sentinel-publish-half-state-registry-missing": {
      phase: "source-ready-registry-missing",
      reason: "source mirror contains the selected commit, but the expected registry tag is missing; retry publish-current so Tekton builds/pushes the image and advances GitOps/runtime",
    },
    "sentinel-gitops-manifest-not-updated": {
      phase: "registry-ready-gitops-pending",
      reason: "registry contains the selected image, but GitOps manifest is not yet updated to a digest-pinned runtime image",
    },
    "sentinel-runtime-not-aligned": {
      phase: "gitops-ready-runtime-pending",
      reason: "GitOps manifest is present, but Argo or runtime objects have not converged to the selected image yet",
    },
    "sentinel-git-mirror-not-in-sync": {
      phase: "runtime-ready-git-mirror-pending",
      reason: "runtime is aligned, but git-mirror status is not in sync; run the controlled git-mirror sync/status path before treating the control plane as fully healthy",
    },
    "sentinel-cadence-not-ready": {
      phase: "runtime-ready-cadence-pending",
      reason: "runtime is aligned, but cadence CronJob validation did not pass",
    },
    "sentinel-control-plane-observed-not-ready": {
      phase: "observed-not-ready",
      reason: "one or more source, registry, GitOps, Argo, runtime or cadence checks did not pass",
    },
  };
  const { phase, reason } = details[code];

  const next = publishCurrentNext(state);
  const controlNext = controlPlaneNext(state, "trigger-current");
  // Re-running publish-current is only suggested while the image or the GitOps manifest is still missing.
  const shouldRetryPublish = code === "sentinel-publish-half-state-registry-missing" || code === "sentinel-gitops-manifest-not-updated";

  return {
    code,
    phase,
    reason,
    sourceMirror: sourceReady
      ? `ready ${short(sourceProbe.commit ?? sourceProbe.expectedCommit)}`
      : `blocked ${short(sourceProbe.commit)}/${short(sourceProbe.expectedCommit)}`,
    registry: registryPresent ? `present ${short(registryProbe.digest)}` : "missing -",
    gitMirror: gitMirrorReady ? "ready" : "pending",
    gitops: gitopsReady ? `ready ${short(gitops.image)}` : `pending ${short(gitops.image)}`,
    argo: `${argo.syncStatus ?? "-"} ${argo.healthStatus ?? "-"} ${short(argo.revision)}/${short(argo.expectedRevision)}`,
    runtime: `ready=${runtimeDeployment.readyReplicas ?? "-"}/${runtimeDeployment.desiredReplicas ?? "-"} image=${short(runtimeDeployment.image)} expected=${short(runtimeDeployment.expectedImage)}`,
    pipelineRun,
    // Only the two codes with a direct operator action produce a warning line.
    warning: code === "sentinel-publish-half-state-registry-missing"
      ? `source mirror already exposes ${short(state.sourceHead.commit)} but registry tag ${state.image.tag} is missing; rerun ${next.publishCurrent} and then recheck ${next.controlPlaneStatus}.`
      : code === "sentinel-git-mirror-not-in-sync"
        ? `runtime is aligned but git-mirror is not in sync; run ${next.gitMirrorSync} and then recheck ${next.controlPlaneStatus}.`
        : null,
    blocker: { code, reason, valuesRedacted: true },
    recoveryNext: {
      reason,
      pipelineRun,
      digestRef: registryPresent ? expectedRuntimeImageFromRegistry(state, record(observed.registry)) : null,
      gitopsCommit: gitops.revision ?? null,
      publishCurrent: shouldRetryPublish ? next.publishCurrent : null,
      nextStatus: next.controlPlaneStatus,
      gitMirrorStatus: next.gitMirrorStatus,
      // Offered whenever git-mirror is not ready, independent of which code was selected.
      gitMirrorSync: !gitMirrorReady ? next.gitMirrorSync : null,
      gitMirrorFlush: null,
      controlPlaneApply: shouldRetryPublish || code === "sentinel-runtime-not-aligned" ? controlNext.apply : null,
      valuesRedacted: true,
    },
    valuesRedacted: true,
  };
}
function probeSourceMirror(state: SentinelCicdState, timeoutSeconds: number): Record<string, unknown> {
const result = probeSourceMirrorCache(state.cicd, state.controlPlaneNode, timeoutSeconds, state.sourceHead.commit);
return { ...result, result: compactCommand(result.result) };
@@ -3141,8 +3265,10 @@ function sentinelRemoteJobDiagnostics(state: SentinelCicdState, result: Sentinel
? `trans ${stringAt(state.controlPlaneNode, "kubeRoute")} kubectl -n ${namespace} describe pipelinerun/${result.jobName}`
: `trans ${stringAt(state.controlPlaneNode, "kubeRoute")} kubectl -n ${namespace} describe job/${result.jobName}`,
gitMirrorStatus: `bun scripts/cli.ts hwlab nodes git-mirror status --node ${state.spec.nodeId} --lane ${state.spec.lane}`,
gitMirrorSync: `bun scripts/cli.ts hwlab nodes git-mirror sync --node ${state.spec.nodeId} --lane ${state.spec.lane} --confirm`,
gitMirrorFlush: `bun scripts/cli.ts hwlab nodes git-mirror flush --node ${state.spec.nodeId} --lane ${state.spec.lane} --confirm --wait`,
controlPlaneApply: `bun scripts/cli.ts web-probe sentinel control-plane apply --node ${state.spec.nodeId} --lane ${state.spec.lane}${sentinelCliSuffix(state)} --confirm --wait`,
publishCurrent: `bun scripts/cli.ts web-probe sentinel publish-current --node ${state.spec.nodeId} --lane ${state.spec.lane}${sentinelCliSuffix(state)} --confirm --wait`,
valuesRedacted: true,
};
return {
@@ -3336,6 +3462,42 @@ function sentinelProgressEvent(event: string, payload: Record<string, unknown>):
console.error(JSON.stringify({ event, at: new Date().toISOString(), ...payload, valuesRedacted: true }));
}
/**
 * Registers one-shot SIGTERM and SIGINT listeners for the duration of a publish run.
 *
 * On the first signal the listener emits a "sentinel.publish.interrupted" progress event carrying
 * the phase and PipelineRun read from `context` plus follow-up commands, then exits the process
 * with 130 for SIGINT and 143 for any other signal. Later signals are ignored.
 *
 * @param state Sentinel CI/CD state; used for identifiers and follow-up commands.
 * @param context Mutable record read at signal time (`phase`, `pipelineRun`, `sourceCommit`).
 * @returns A function that removes both listeners.
 */
function installSentinelPublishInterruptHandler(state: SentinelCicdState, context: Record<string, unknown>): () => void {
  const watchedSignals = ["SIGTERM", "SIGINT"] as const;
  let alreadyFired = false;

  function onSignal(signal: string): void {
    // The same listener is attached to both signals; only the first delivery is reported.
    if (alreadyFired) return;
    alreadyFired = true;

    const exitCode = signal === "SIGINT" ? 130 : 143;
    const commands = publishCurrentNext(state);
    const recoveryNext = {
      status: commands.controlPlaneStatus,
      retry: commands.publishCurrent,
      gitMirrorStatus: commands.gitMirrorStatus,
      gitMirrorSync: commands.gitMirrorSync,
      gitMirrorFlush: commands.gitMirrorFlush,
      valuesRedacted: true,
    };
    sentinelProgressEvent("sentinel.publish.interrupted", {
      signal,
      exitCode,
      // Context fields are read at signal time so the most recently recorded phase is reported.
      phase: context.phase ?? "unknown",
      pipelineRun: context.pipelineRun ?? sentinelPipelineRunName(state, false),
      sourceCommit: context.sourceCommit ?? state.sourceHead.commit,
      node: state.spec.nodeId,
      lane: state.spec.lane,
      sentinelId: state.sentinelId,
      recoveryNext,
      valuesRedacted: true,
    });
    process.exit(exitCode);
  }

  for (const name of watchedSignals) {
    process.once(name, onSignal);
  }
  return () => {
    for (const name of watchedSignals) {
      process.off(name, onSignal);
    }
  };
}
function confirmBlocked(action: string, state: SentinelCicdState): Record<string, unknown> {
return {
code: "sentinel-cicd-confirm-requires-tekton-pipelinerun",
@@ -3365,6 +3527,7 @@ function controlPlaneNext(state: SentinelCicdState, action: WebProbeSentinelCont
validate: `bun scripts/cli.ts web-probe sentinel validate --node ${node} --lane ${lane}${suffix}`,
quickVerify: `bun scripts/cli.ts web-probe sentinel validate --node ${node} --lane ${lane}${suffix} --quick-verify --confirm --wait`,
gitMirrorStatus: `bun scripts/cli.ts hwlab nodes git-mirror status --node ${node} --lane ${lane}`,
gitMirrorSync: `bun scripts/cli.ts hwlab nodes git-mirror sync --node ${node} --lane ${lane} --confirm`,
gitMirrorFlush: `bun scripts/cli.ts hwlab nodes git-mirror flush --node ${node} --lane ${lane} --confirm --wait`,
issue: "https://github.com/pikasTech/unidesk/issues/1285",
currentAction: action,
@@ -3379,12 +3542,15 @@ function controlPlaneRecoveryNext(state: SentinelCicdState, ok: boolean, publish
const observedRecord = record(observed);
return {
reason: "publish produced an image digest, but GitOps/git-mirror/Argo/runtime alignment is not complete yet",
pipelineRun: record(publish).jobName ?? null,
digestRef: payload.digestRef,
gitopsCommit: payload.gitopsCommit ?? null,
flushMode: flushRecord.mode ?? null,
observedReady: sentinelObservedReady(observedRecord),
publishCurrent: `bun scripts/cli.ts web-probe sentinel publish-current --node ${state.spec.nodeId} --lane ${state.spec.lane}${sentinelCliSuffix(state)} --confirm --wait`,
nextStatus: next.status,
gitMirrorStatus: next.gitMirrorStatus,
gitMirrorSync: next.gitMirrorSync,
gitMirrorFlush: next.gitMirrorFlush,
controlPlaneApply: next.apply,
valuesRedacted: true,
@@ -3965,9 +4131,11 @@ function renderPublishResult(publish: Record<string, unknown>): string {
"PUBLISH_DRILLDOWN",
` status: ${commands.cliStatus ?? "-"}`,
` logs: ${commands.logs ?? "-"}`,
` describe: ${commands.describe ?? "-"}`,
` git-mirror: ${commands.gitMirrorStatus ?? "-"}`,
` flush: ${commands.gitMirrorFlush ?? "-"}`,
` describe: ${commands.describe ?? "-"}`,
` publish-current: ${commands.publishCurrent ?? "-"}`,
` git-mirror: ${commands.gitMirrorStatus ?? "-"}`,
` sync: ${commands.gitMirrorSync ?? "-"}`,
` flush: ${commands.gitMirrorFlush ?? "-"}`,
` apply: ${commands.controlPlaneApply ?? "-"}`,
);
}
@@ -3992,6 +4160,7 @@ function renderPublishCurrentResult(result: Record<string, unknown>): string {
const stageBudgets = record(result.stageBudgets);
const validationPlan = record(result.validationPlan);
const blocker = record(result.blocker);
const recoveryNext = record(controlPlane.recoveryNext);
const next = record(result.next);
const warnings = Array.isArray(result.warnings) ? result.warnings : [];
const slowStages = Array.isArray(result.slowStages) ? result.slowStages.map(record) : [];
@@ -4084,13 +4253,25 @@ function renderPublishCurrentResult(result: Record<string, unknown>): string {
"",
warnings.length === 0 ? "WARNINGS\n-" : ["WARNINGS", ...warnings.map((item) => `- ${text(item)}`)].join("\n"),
"",
Object.keys(blocker).length === 0 ? "BLOCKER\n-" : table(["CODE", "REASON"], [[blocker.code, blocker.reason]]),
Object.keys(blocker).length === 0 ? "BLOCKER\n-" : ["BLOCKER", table(["CODE", "REASON"], [[blocker.code, blocker.reason]])].join("\n"),
"",
Object.keys(recoveryNext).length === 0 ? "RECOVERY_NEXT\n-" : [
"RECOVERY_NEXT",
table(["REASON", "PIPELINERUN", "DIGEST", "GITOPS"], [[recoveryNext.reason, recoveryNext.pipelineRun ?? "-", short(recoveryNext.digestRef), short(recoveryNext.gitopsCommit)]]),
` publish-current: ${recoveryNext.publishCurrent ?? "-"}`,
` status: ${recoveryNext.nextStatus ?? "-"}`,
` git-mirror: ${recoveryNext.gitMirrorStatus ?? "-"}`,
` sync: ${recoveryNext.gitMirrorSync ?? "-"}`,
` flush: ${recoveryNext.gitMirrorFlush ?? "-"}`,
` apply: ${recoveryNext.controlPlaneApply ?? "-"}`,
].join("\n"),
"",
"NEXT",
` publish-current: ${next.publishCurrent ?? "-"}`,
` status: ${next.controlPlaneStatus ?? "-"}`,
` post-deploy-dashboard: ${next.dashboardVerify ?? "-"}`,
` git-mirror: ${next.gitMirrorStatus ?? "-"}`,
` sync: ${next.gitMirrorSync ?? "-"}`,
` flush: ${next.gitMirrorFlush ?? "-"}`,
"",
"DISCLOSURE",
@@ -4133,7 +4314,7 @@ function renderImageResult(result: Record<string, unknown>): string {
"",
warnings.length === 0 ? "WARNINGS\n-" : ["WARNINGS", ...warnings.map((item) => `- ${text(item)}`)].join("\n"),
"",
Object.keys(blocker).length === 0 ? "BLOCKER\n-" : table(["CODE", "REASON"], [[blocker.code, blocker.reason]]),
Object.keys(blocker).length === 0 ? "BLOCKER\n-" : ["BLOCKER", table(["CODE", "REASON"], [[blocker.code, blocker.reason]])].join("\n"),
"",
"NEXT",
` status: ${next.status ?? "-"}`,
@@ -4163,6 +4344,7 @@ function renderControlPlaneResult(result: Record<string, unknown>): string {
const publicExposureCaddy = record(publicExposureApply.caddy);
const argoApply = record(result.argoApply);
const blocker = record(result.blocker);
const statusDiagnosis = record(result.statusDiagnosis);
const targetValidation = record(result.targetValidation);
const targetValidationBusiness = record(targetValidation.businessStatus);
const recoveryNext = record(result.recoveryNext);
@@ -4183,6 +4365,21 @@ function renderControlPlaneResult(result: Record<string, unknown>): string {
"",
renderObservedStatus(observed),
"",
Object.keys(statusDiagnosis).length === 0 ? "STATUS_DIAGNOSIS\n-" : [
"STATUS_DIAGNOSIS",
table(["CODE", "PHASE", "PIPELINERUN", "SOURCE", "REGISTRY", "GIT_MIRROR", "GITOPS", "ARGO", "RUNTIME"], [[
statusDiagnosis.code,
statusDiagnosis.phase,
statusDiagnosis.pipelineRun,
statusDiagnosis.sourceMirror,
statusDiagnosis.registry,
statusDiagnosis.gitMirror,
statusDiagnosis.gitops,
statusDiagnosis.argo,
statusDiagnosis.runtime,
]]),
].join("\n"),
"",
Object.keys(sourceMirrorSync).length === 0 ? "SOURCE_MIRROR_SYNC\n-" : table(["OK", "PHASE", "JOB", "COMMIT", "STAGE_REF", "ELAPSED"], [[sourceMirrorSync.ok, sourceMirrorSync.phase, sourceMirrorSync.jobName, short(record(sourceMirrorSync.payload).mirrorCommit), short(record(sourceMirrorSync.payload).stageRef), sourceMirrorSync.elapsedMs ?? "-"]]),
"",
Object.keys(targetValidation).length === 0 ? "TARGET_VALIDATION\n-" : table(["OK", "STATUS", "BUSINESS", "SCENARIO", "RUN", "OBSERVER", "REPORT", "FINDINGS", "ARTIFACTS"], [[
@@ -4217,13 +4414,15 @@ function renderControlPlaneResult(result: Record<string, unknown>): string {
"",
warnings.length === 0 ? "WARNINGS\n-" : ["WARNINGS", ...warnings.map((item) => `- ${text(item)}`)].join("\n"),
"",
Object.keys(blocker).length === 0 ? "BLOCKER\n-" : table(["CODE", "REASON"], [[blocker.code, blocker.reason]]),
Object.keys(blocker).length === 0 ? "BLOCKER\n-" : ["BLOCKER", table(["CODE", "REASON"], [[blocker.code, blocker.reason]])].join("\n"),
"",
Object.keys(recoveryNext).length === 0 ? "RECOVERY_NEXT\n-" : [
"RECOVERY_NEXT",
table(["REASON", "DIGEST", "GITOPS"], [[recoveryNext.reason, short(recoveryNext.digestRef), short(recoveryNext.gitopsCommit)]]),
table(["REASON", "PIPELINERUN", "DIGEST", "GITOPS"], [[recoveryNext.reason, recoveryNext.pipelineRun ?? "-", short(recoveryNext.digestRef), short(recoveryNext.gitopsCommit)]]),
` publish-current: ${recoveryNext.publishCurrent ?? "-"}`,
` status: ${recoveryNext.nextStatus ?? "-"}`,
` git-mirror: ${recoveryNext.gitMirrorStatus ?? "-"}`,
` sync: ${recoveryNext.gitMirrorSync ?? "-"}`,
` flush: ${recoveryNext.gitMirrorFlush ?? "-"}`,
` apply: ${recoveryNext.controlPlaneApply ?? "-"}`,
].join("\n"),
@@ -4235,9 +4434,10 @@ function renderControlPlaneResult(result: Record<string, unknown>): string {
` trigger-current: ${next.triggerCurrent ?? "-"}`,
` apply: ${next.apply ?? "-"}`,
` validate: ${next.validate ?? "-"}`,
` quick-verify: ${next.quickVerify ?? "-"}`,
` git-mirror: ${next.gitMirrorStatus ?? "-"}`,
` flush: ${next.gitMirrorFlush ?? "-"}`,
` quick-verify: ${next.quickVerify ?? "-"}`,
` git-mirror: ${next.gitMirrorStatus ?? "-"}`,
` sync: ${next.gitMirrorSync ?? "-"}`,
` flush: ${next.gitMirrorFlush ?? "-"}`,
"",
"DISCLOSURE",
" default view is a bounded CI/CD summary; full manifest content is represented by object counts and sha256.",