diff --git a/scripts/src/hwlab-node-web-sentinel-cicd.ts b/scripts/src/hwlab-node-web-sentinel-cicd.ts index 3e1d7fc6..d808ecde 100644 --- a/scripts/src/hwlab-node-web-sentinel-cicd.ts +++ b/scripts/src/hwlab-node-web-sentinel-cicd.ts @@ -296,6 +296,7 @@ function runSentinelControlPlane(state: SentinelCicdState, options: Extract): RenderedCliResult { + const interruptContext: Record = { + phase: "starting", + pipelineRun: sentinelPipelineRunName(state, options.rerun), + sourceCommit: state.sourceHead.commit, + node: state.spec.nodeId, + lane: state.spec.lane, + sentinelId: state.sentinelId, + valuesRedacted: true, + }; + const uninstallInterruptHandler = installSentinelPublishInterruptHandler(state, interruptContext); + try { + return runSentinelPublishCurrentConfirmedInner(state, options, interruptContext); + } finally { + uninstallInterruptHandler(); + } +} + +function runSentinelPublishCurrentConfirmedInner(state: SentinelCicdState, options: Extract, interruptContext: Record): RenderedCliResult { const startedAt = Date.now(); const command = "web-probe sentinel publish-current"; const budget = publishCurrentBudget(state); @@ -397,11 +420,22 @@ function runSentinelPublishCurrentConfirmed(state: SentinelCicdState, options: E const deadline = startedAt + budgetSeconds * 1000; const remainingBudgetSeconds = () => strictRemainingSeconds(deadline, budgetSeconds); let controlResult: Record | null = null; + sentinelProgressEvent("sentinel.publish.progress", { + phase: "publish-current-start", + status: "running", + pipelineRun: interruptContext.pipelineRun, + sourceCommit: state.sourceHead.commit, + node: state.spec.nodeId, + lane: state.spec.lane, + sentinelId: state.sentinelId, + }); if (state.configReady && state.sourceHead.ok && remainingBudgetSeconds() >= 5) { + interruptContext.phase = "already-current-registry-probe"; const registryProbe = probeImageRegistry(state, Math.max(1, Math.min(remainingBudgetSeconds(), 5))); if 
(record(record(registryProbe).probe).present === true && remainingBudgetSeconds() >= 5) { const preflightStartedAt = Date.now(); const preflightTimeoutSeconds = Math.max(1, Math.min(remainingBudgetSeconds(), 10)); + interruptContext.phase = "already-current-observed-probe"; const preflightObserved = withObservedWait( collectSentinelObservedStatus(state, preflightTimeoutSeconds, undefined, true), preflightStartedAt, @@ -413,6 +447,7 @@ function runSentinelPublishCurrentConfirmed(state: SentinelCicdState, options: E } } } + interruptContext.phase = "control-plane-trigger-current"; controlResult ??= sentinelControlPlaneConfirmedResult(state, { kind: "control-plane", action: "trigger-current", @@ -432,6 +467,7 @@ function runSentinelPublishCurrentConfirmed(state: SentinelCicdState, options: E } else if (remainingBudgetSeconds() < 2) { health = { ok: false, skipped: true, reason: "end-to-end-budget-exhausted-before-health", valuesRedacted: true }; } else { + interruptContext.phase = "health-endpoint-validation"; const healthStartedAt = Date.now(); health = probeSentinelRuntimeHealthEndpoint(state, remainingBudgetSeconds()); healthElapsedMs = Date.now() - healthStartedAt; @@ -470,6 +506,7 @@ function runSentinelPublishCurrentConfirmed(state: SentinelCicdState, options: E next: publishCurrentNext(state), valuesRedacted: true, }; + interruptContext.phase = "completed"; return rendered(ok, command, renderPublishCurrentResult(result)); } @@ -1045,6 +1082,7 @@ function publishCurrentNext(state: SentinelCicdState): Record { controlPlaneStatus: `bun scripts/cli.ts web-probe sentinel control-plane status --node ${node} --lane ${lane}${suffix}`, dashboardVerify: `bun scripts/cli.ts web-probe sentinel dashboard verify --node ${node} --lane ${lane}${suffix}`, gitMirrorStatus: `bun scripts/cli.ts hwlab nodes git-mirror status --node ${node} --lane ${lane}`, + gitMirrorSync: `bun scripts/cli.ts hwlab nodes git-mirror sync --node ${node} --lane ${lane} --confirm`, gitMirrorFlush: `bun 
/**
 * Explains why an observed sentinel control-plane snapshot is not ready.
 *
 * Returns `null` when `sentinelObservedReady` accepts the snapshot. Otherwise the
 * first matching rule in the ordered table below selects the blocker code, phase
 * and reason, and the result bundles a one-line summary of every observed stage
 * together with the recovery commands produced by `publishCurrentNext` and
 * `controlPlaneNext`.
 *
 * @param state - Sentinel CI/CD state; read for the source commit, the image tag
 *   and the follow-up command strings.
 * @param value - Observed status payload; it is passed through `record` before any
 *   field is read.
 * @param pipelineRun - PipelineRun name echoed into the diagnosis and its `recoveryNext`.
 * @returns The diagnosis record, or `null` when the observed status is ready.
 */
function sentinelObservedStatusDiagnosis(state: SentinelCicdState, value: unknown, pipelineRun: string): Record<string, unknown> | null {
  const observed = record(value);
  if (sentinelObservedReady(observed)) return null;

  // Unwrap every observed stage once; all later reads go through these records.
  const sourceMirror = record(observed.sourceMirror);
  const sourceProbe = record(sourceMirror.probe);
  const registry = record(observed.registry);
  const registryProbe = record(registry.probe);
  const gitMirror = record(observed.gitMirror);
  const gitops = record(observed.gitops);
  const argo = record(observed.argo);
  const runtime = record(observed.runtime);
  const deployment = record(record(runtime.probe).deployment);
  const cadence = record(observed.cadence);

  const sourceReady = sourceMirror.ok === true;
  const registryPresent = registryProbe.present === true;
  // A skipped git-mirror check counts as ready.
  const gitMirrorReady = gitMirror.skipped === true || gitMirror.ok === true;
  const gitopsReady = gitops.ok === true;
  const argoAndRuntimeReady = argo.ok === true && runtime.ok === true;
  const cadenceReady = cadence.ok === true;

  // Ordered rules: the first entry whose `applies` flag is true wins.
  // NOTE(review): the git-mirror and cadence rules describe the runtime as aligned,
  // but they are also reachable when earlier stages are not ready (for example when
  // neither the source mirror nor the registry is ready). Confirm that is intended.
  const rules: ReadonlyArray<{ applies: boolean; code: string; phase: string; reason: string }> = [
    {
      applies: sourceReady && !registryPresent,
      code: "sentinel-publish-half-state-registry-missing",
      phase: "source-ready-registry-missing",
      reason: "source mirror contains the selected commit, but the expected registry tag is missing; retry publish-current so Tekton builds/pushes the image and advances GitOps/runtime",
    },
    {
      applies: registryPresent && !gitopsReady,
      code: "sentinel-gitops-manifest-not-updated",
      phase: "registry-ready-gitops-pending",
      reason: "registry contains the selected image, but GitOps manifest is not yet updated to a digest-pinned runtime image",
    },
    {
      applies: gitopsReady && !argoAndRuntimeReady,
      code: "sentinel-runtime-not-aligned",
      phase: "gitops-ready-runtime-pending",
      reason: "GitOps manifest is present, but Argo or runtime objects have not converged to the selected image yet",
    },
    {
      applies: !gitMirrorReady,
      code: "sentinel-git-mirror-not-in-sync",
      phase: "runtime-ready-git-mirror-pending",
      reason: "runtime is aligned, but git-mirror status is not in sync; run the controlled git-mirror sync/status path before treating the control plane as fully healthy",
    },
    {
      applies: !cadenceReady,
      code: "sentinel-cadence-not-ready",
      phase: "runtime-ready-cadence-pending",
      reason: "runtime is aligned, but cadence CronJob validation did not pass",
    },
  ];
  const fallback = {
    code: "sentinel-control-plane-observed-not-ready",
    phase: "observed-not-ready",
    reason: "one or more source, registry, GitOps, Argo, runtime or cadence checks did not pass",
  };
  const { code, phase, reason } = rules.find((rule) => rule.applies) ?? fallback;

  const next = publishCurrentNext(state);
  const controlNext = controlPlaneNext(state, "trigger-current");
  const registryMissing = code === "sentinel-publish-half-state-registry-missing";
  const gitMirrorBehind = code === "sentinel-git-mirror-not-in-sync";
  // Re-publishing is only suggested while the image or the GitOps manifest is still missing.
  const shouldRetryPublish = registryMissing || code === "sentinel-gitops-manifest-not-updated";
  const shouldReapply = shouldRetryPublish || code === "sentinel-runtime-not-aligned";

  // Only two of the codes carry a warning line; every other code reports `null`.
  let warning: string | null = null;
  if (registryMissing) {
    warning = `source mirror already exposes ${short(state.sourceHead.commit)} but registry tag ${state.image.tag} is missing; rerun ${next.publishCurrent} and then recheck ${next.controlPlaneStatus}.`;
  } else if (gitMirrorBehind) {
    warning = `runtime is aligned but git-mirror is not in sync; run ${next.gitMirrorSync} and then recheck ${next.controlPlaneStatus}.`;
  }

  return {
    code,
    phase,
    reason,
    sourceMirror: sourceReady
      ? `ready ${short(sourceProbe.commit ?? sourceProbe.expectedCommit)}`
      : `blocked ${short(sourceProbe.commit)}/${short(sourceProbe.expectedCommit)}`,
    registry: registryPresent ? `present ${short(registryProbe.digest)}` : "missing -",
    gitMirror: gitMirrorReady ? "ready" : "pending",
    gitops: gitopsReady ? `ready ${short(gitops.image)}` : `pending ${short(gitops.image)}`,
    argo: `${argo.syncStatus ?? "-"} ${argo.healthStatus ?? "-"} ${short(argo.revision)}/${short(argo.expectedRevision)}`,
    runtime: `ready=${deployment.readyReplicas ?? "-"}/${deployment.desiredReplicas ?? "-"} image=${short(deployment.image)} expected=${short(deployment.expectedImage)}`,
    pipelineRun,
    warning,
    blocker: { code, reason, valuesRedacted: true },
    recoveryNext: {
      reason,
      pipelineRun,
      // The digest reference is only derived when the registry probe found the image.
      digestRef: registryPresent ? expectedRuntimeImageFromRegistry(state, registry) : null,
      gitopsCommit: gitops.revision ?? null,
      publishCurrent: shouldRetryPublish ? next.publishCurrent : null,
      nextStatus: next.controlPlaneStatus,
      gitMirrorStatus: next.gitMirrorStatus,
      gitMirrorSync: gitMirrorReady ? null : next.gitMirrorSync,
      gitMirrorFlush: null,
      controlPlaneApply: shouldReapply ? controlNext.apply : null,
      valuesRedacted: true,
    },
    valuesRedacted: true,
  };
}
/**
 * Registers one-shot SIGTERM and SIGINT listeners that report an interrupted
 * publish and then terminate the process.
 *
 * On the first signal the listener emits a `sentinel.publish.interrupted`
 * progress event (current phase, PipelineRun, source commit and recovery
 * commands) and calls `process.exit` with 130 for SIGINT or 143 for any other
 * signal (128 + signal number).
 *
 * NOTE(review): signal listeners are dispatched from the event loop. The visible
 * caller runs the publish synchronously inside `try/finally` and uninstalls the
 * listeners in `finally`; confirm the handler can actually fire before it is removed.
 *
 * @param state - Sentinel CI/CD state used for identifiers and follow-up commands.
 * @param context - Mutable record read when the signal arrives; its `phase`,
 *   `pipelineRun` and `sourceCommit` entries are used, with fallbacks when unset.
 * @returns A function that removes both listeners again.
 */
function installSentinelPublishInterruptHandler(state: SentinelCicdState, context: Record<string, unknown>): () => void {
  // One list drives both registration and removal so the two cannot diverge.
  const signals = ["SIGTERM", "SIGINT"] as const;
  let reported = false;

  function onSignal(signal: string): void {
    // The listener is shared by both signals; report only the first delivery.
    if (reported) return;
    reported = true;

    const exitCode = signal === "SIGINT" ? 130 : 143;
    const next = publishCurrentNext(state);
    sentinelProgressEvent("sentinel.publish.interrupted", {
      signal,
      exitCode,
      phase: context.phase ?? "unknown",
      pipelineRun: context.pipelineRun ?? sentinelPipelineRunName(state, false),
      sourceCommit: context.sourceCommit ?? state.sourceHead.commit,
      node: state.spec.nodeId,
      lane: state.spec.lane,
      sentinelId: state.sentinelId,
      recoveryNext: {
        status: next.controlPlaneStatus,
        retry: next.publishCurrent,
        gitMirrorStatus: next.gitMirrorStatus,
        gitMirrorSync: next.gitMirrorSync,
        gitMirrorFlush: next.gitMirrorFlush,
        valuesRedacted: true,
      },
      valuesRedacted: true,
    });
    process.exit(exitCode);
  }

  for (const name of signals) process.once(name, onSignal);

  return () => {
    for (const name of signals) process.off(name, onSignal);
  };
}
null, observedReady: sentinelObservedReady(observedRecord), + publishCurrent: `bun scripts/cli.ts web-probe sentinel publish-current --node ${state.spec.nodeId} --lane ${state.spec.lane}${sentinelCliSuffix(state)} --confirm --wait`, nextStatus: next.status, gitMirrorStatus: next.gitMirrorStatus, + gitMirrorSync: next.gitMirrorSync, gitMirrorFlush: next.gitMirrorFlush, controlPlaneApply: next.apply, valuesRedacted: true, @@ -3965,9 +4131,11 @@ function renderPublishResult(publish: Record): string { "PUBLISH_DRILLDOWN", ` status: ${commands.cliStatus ?? "-"}`, ` logs: ${commands.logs ?? "-"}`, - ` describe: ${commands.describe ?? "-"}`, - ` git-mirror: ${commands.gitMirrorStatus ?? "-"}`, - ` flush: ${commands.gitMirrorFlush ?? "-"}`, + ` describe: ${commands.describe ?? "-"}`, + ` publish-current: ${commands.publishCurrent ?? "-"}`, + ` git-mirror: ${commands.gitMirrorStatus ?? "-"}`, + ` sync: ${commands.gitMirrorSync ?? "-"}`, + ` flush: ${commands.gitMirrorFlush ?? "-"}`, ` apply: ${commands.controlPlaneApply ?? "-"}`, ); } @@ -3992,6 +4160,7 @@ function renderPublishCurrentResult(result: Record): string { const stageBudgets = record(result.stageBudgets); const validationPlan = record(result.validationPlan); const blocker = record(result.blocker); + const recoveryNext = record(controlPlane.recoveryNext); const next = record(result.next); const warnings = Array.isArray(result.warnings) ? result.warnings : []; const slowStages = Array.isArray(result.slowStages) ? result.slowStages.map(record) : []; @@ -4084,13 +4253,25 @@ function renderPublishCurrentResult(result: Record): string { "", warnings.length === 0 ? "WARNINGS\n-" : ["WARNINGS", ...warnings.map((item) => `- ${text(item)}`)].join("\n"), "", - Object.keys(blocker).length === 0 ? "BLOCKER\n-" : table(["CODE", "REASON"], [[blocker.code, blocker.reason]]), + Object.keys(blocker).length === 0 ? 
"BLOCKER\n-" : ["BLOCKER", table(["CODE", "REASON"], [[blocker.code, blocker.reason]])].join("\n"), + "", + Object.keys(recoveryNext).length === 0 ? "RECOVERY_NEXT\n-" : [ + "RECOVERY_NEXT", + table(["REASON", "PIPELINERUN", "DIGEST", "GITOPS"], [[recoveryNext.reason, recoveryNext.pipelineRun ?? "-", short(recoveryNext.digestRef), short(recoveryNext.gitopsCommit)]]), + ` publish-current: ${recoveryNext.publishCurrent ?? "-"}`, + ` status: ${recoveryNext.nextStatus ?? "-"}`, + ` git-mirror: ${recoveryNext.gitMirrorStatus ?? "-"}`, + ` sync: ${recoveryNext.gitMirrorSync ?? "-"}`, + ` flush: ${recoveryNext.gitMirrorFlush ?? "-"}`, + ` apply: ${recoveryNext.controlPlaneApply ?? "-"}`, + ].join("\n"), "", "NEXT", ` publish-current: ${next.publishCurrent ?? "-"}`, ` status: ${next.controlPlaneStatus ?? "-"}`, ` post-deploy-dashboard: ${next.dashboardVerify ?? "-"}`, ` git-mirror: ${next.gitMirrorStatus ?? "-"}`, + ` sync: ${next.gitMirrorSync ?? "-"}`, ` flush: ${next.gitMirrorFlush ?? "-"}`, "", "DISCLOSURE", @@ -4133,7 +4314,7 @@ function renderImageResult(result: Record): string { "", warnings.length === 0 ? "WARNINGS\n-" : ["WARNINGS", ...warnings.map((item) => `- ${text(item)}`)].join("\n"), "", - Object.keys(blocker).length === 0 ? "BLOCKER\n-" : table(["CODE", "REASON"], [[blocker.code, blocker.reason]]), + Object.keys(blocker).length === 0 ? "BLOCKER\n-" : ["BLOCKER", table(["CODE", "REASON"], [[blocker.code, blocker.reason]])].join("\n"), "", "NEXT", ` status: ${next.status ?? 
"-"}`, @@ -4163,6 +4344,7 @@ function renderControlPlaneResult(result: Record): string { const publicExposureCaddy = record(publicExposureApply.caddy); const argoApply = record(result.argoApply); const blocker = record(result.blocker); + const statusDiagnosis = record(result.statusDiagnosis); const targetValidation = record(result.targetValidation); const targetValidationBusiness = record(targetValidation.businessStatus); const recoveryNext = record(result.recoveryNext); @@ -4183,6 +4365,21 @@ function renderControlPlaneResult(result: Record): string { "", renderObservedStatus(observed), "", + Object.keys(statusDiagnosis).length === 0 ? "STATUS_DIAGNOSIS\n-" : [ + "STATUS_DIAGNOSIS", + table(["CODE", "PHASE", "PIPELINERUN", "SOURCE", "REGISTRY", "GIT_MIRROR", "GITOPS", "ARGO", "RUNTIME"], [[ + statusDiagnosis.code, + statusDiagnosis.phase, + statusDiagnosis.pipelineRun, + statusDiagnosis.sourceMirror, + statusDiagnosis.registry, + statusDiagnosis.gitMirror, + statusDiagnosis.gitops, + statusDiagnosis.argo, + statusDiagnosis.runtime, + ]]), + ].join("\n"), + "", Object.keys(sourceMirrorSync).length === 0 ? "SOURCE_MIRROR_SYNC\n-" : table(["OK", "PHASE", "JOB", "COMMIT", "STAGE_REF", "ELAPSED"], [[sourceMirrorSync.ok, sourceMirrorSync.phase, sourceMirrorSync.jobName, short(record(sourceMirrorSync.payload).mirrorCommit), short(record(sourceMirrorSync.payload).stageRef), sourceMirrorSync.elapsedMs ?? "-"]]), "", Object.keys(targetValidation).length === 0 ? "TARGET_VALIDATION\n-" : table(["OK", "STATUS", "BUSINESS", "SCENARIO", "RUN", "OBSERVER", "REPORT", "FINDINGS", "ARTIFACTS"], [[ @@ -4217,13 +4414,15 @@ function renderControlPlaneResult(result: Record): string { "", warnings.length === 0 ? "WARNINGS\n-" : ["WARNINGS", ...warnings.map((item) => `- ${text(item)}`)].join("\n"), "", - Object.keys(blocker).length === 0 ? "BLOCKER\n-" : table(["CODE", "REASON"], [[blocker.code, blocker.reason]]), + Object.keys(blocker).length === 0 ? 
"BLOCKER\n-" : ["BLOCKER", table(["CODE", "REASON"], [[blocker.code, blocker.reason]])].join("\n"), "", Object.keys(recoveryNext).length === 0 ? "RECOVERY_NEXT\n-" : [ "RECOVERY_NEXT", - table(["REASON", "DIGEST", "GITOPS"], [[recoveryNext.reason, short(recoveryNext.digestRef), short(recoveryNext.gitopsCommit)]]), + table(["REASON", "PIPELINERUN", "DIGEST", "GITOPS"], [[recoveryNext.reason, recoveryNext.pipelineRun ?? "-", short(recoveryNext.digestRef), short(recoveryNext.gitopsCommit)]]), + ` publish-current: ${recoveryNext.publishCurrent ?? "-"}`, ` status: ${recoveryNext.nextStatus ?? "-"}`, ` git-mirror: ${recoveryNext.gitMirrorStatus ?? "-"}`, + ` sync: ${recoveryNext.gitMirrorSync ?? "-"}`, ` flush: ${recoveryNext.gitMirrorFlush ?? "-"}`, ` apply: ${recoveryNext.controlPlaneApply ?? "-"}`, ].join("\n"), @@ -4235,9 +4434,10 @@ function renderControlPlaneResult(result: Record): string { ` trigger-current: ${next.triggerCurrent ?? "-"}`, ` apply: ${next.apply ?? "-"}`, ` validate: ${next.validate ?? "-"}`, - ` quick-verify: ${next.quickVerify ?? "-"}`, - ` git-mirror: ${next.gitMirrorStatus ?? "-"}`, - ` flush: ${next.gitMirrorFlush ?? "-"}`, + ` quick-verify: ${next.quickVerify ?? "-"}`, + ` git-mirror: ${next.gitMirrorStatus ?? "-"}`, + ` sync: ${next.gitMirrorSync ?? "-"}`, + ` flush: ${next.gitMirrorFlush ?? "-"}`, "", "DISCLOSURE", " default view is a bounded CI/CD summary; full manifest content is represented by object counts and sha256.",