fix: decouple sentinel rollout status from target validation (#984)

Co-authored-by: Codex <codex@noreply.local>
This commit is contained in:
Lyon
2026-06-26 14:21:48 +08:00
committed by GitHub
parent 6985387702
commit 5dd0cc2d7d
2 changed files with 9 additions and 13 deletions
@@ -277,7 +277,7 @@ sequenceDiagram
Pod-->>Val: scheduler heartbeat and PVC writable
```
哨兵自身 rollout 只验证哨兵服务健康、配置装载、PVC/SQLite 可写、metrics、dashboard 和调度循环;它不能把另一个哨兵当作 HWLAB Web 业务观察对象。
哨兵自身 rollout 只验证哨兵服务健康、配置装载、PVC/SQLite 可写、metrics、dashboard 和调度循环;它不能把另一个哨兵当作 HWLAB Web 业务观察对象,也不能因为 HWLAB 业务 quick verify blocked 就把哨兵自身发布状态标成失败。
### 5.7 哨兵不可用结构化失败时序图
@@ -486,7 +486,7 @@ P8 恢复判定必须把 Workbench 业务失败继续 drill-down 到运行面依
Web 哨兵自身必须纳入受控且独立的 sentinel control-plane:source 来自 UniDesk `master`,镜像、GitOps path、Argo Application、publicExposure 和 targetValidation 由 Web 哨兵 owning YAML 声明。D601/v03 当前可通过 `web-probe sentinel image|control-plane` 的独立 publish Job 实现构建、推送、GitOps 写回和 Argo 收敛;后续也可以切换到 Tekton Pipeline,但 builder 类型必须来自 YAML,不得依赖 operator 本地 dirty worktree。
哨兵 rollout 与 HWLAB runtime rollout 不是同一个滚动单元。哨兵 dashboard/API/服务代码变更应通过 Web 哨兵独立 control-plane 滚动;HWLAB runtime 发布流水只调用当前已部署哨兵的 `maintenance/start`、`maintenance/stop` 和 quick verify 作为恢复判定。哨兵 validate、maintenance 和 quick verify 控制路径必须优先走 k3s 内部 Service DNS,不绕 `monitor.pikapython.com` 公网入口。
哨兵 rollout 与 HWLAB runtime rollout 不是同一个滚动单元。哨兵 dashboard/API/服务代码变更应通过 Web 哨兵独立 control-plane 滚动;HWLAB runtime 发布流水只调用当前已部署哨兵的 `maintenance/start`、`maintenance/stop` 和 quick verify 作为恢复判定。哨兵 control-plane 的顶层状态只表达哨兵自身 source、镜像、GitOps、Argo、runtime、metrics 和 dashboard 是否发布成功;HWLAB quick verify 必须作为独立 `targetValidation` 状态、warning 和 report 证据输出。哨兵 validate、maintenance 和 quick verify 控制路径必须优先走 k3s 内部 Service DNS,不绕 `monitor.pikapython.com` 公网入口。
哨兵镜像构建应使用 YAML 声明的 tools image、base image、registry、egress proxy 和 env-reuse 配方。Node/Bun/Playwright/Chromium 依赖不得在 runtime Pod 中临时下载。Secret 与 env 复用只走 sourceRef/keyMapping;日志、status、dashboard 和 issue closeout 只输出 object/key/presence/fingerprint/digest。
+7 -11
View File
@@ -591,7 +591,7 @@ function runSentinelControlPlaneConfirmed(state: SentinelCicdState, options: Ext
reason: "runtime-not-ready",
valuesRedacted: true,
};
const targetValidationOk = applyOnly || record(targetValidation).ok === true;
const targetValidationBlocked = !applyOnly && record(targetValidation).ok !== true;
const ok = state.configReady
&& state.sourceHead.ok
&& (applyOnly || record(sourceMirrorSync).ok === true)
@@ -599,18 +599,13 @@ function runSentinelControlPlaneConfirmed(state: SentinelCicdState, options: Ext
&& (applyOnly || record(flush).ok === true)
&& record(publicExposureApply).ok === true
&& record(argoApply).ok === true
&& observedReady
&& targetValidationOk;
&& observedReady;
const elapsedMs = Date.now() - startedAt;
const blocker = ok ? null : {
code: targetValidationOk
? record(sourceMirrorSync).ok === false ? "sentinel-source-mirror-sync-failed" : "sentinel-control-plane-not-ready"
: "sentinel-target-validation-failed",
reason: targetValidationOk
? record(sourceMirrorSync).ok === false
? "source mirror sync did not complete; investigate git mirror/proxy before control-plane publish"
: "one or more publish, publicExposure, Argo or runtime observation checks did not pass"
: text(record(targetValidation).failure ?? record(targetValidation).reason ?? "quick verify did not pass"),
code: record(sourceMirrorSync).ok === false ? "sentinel-source-mirror-sync-failed" : "sentinel-control-plane-not-ready",
reason: record(sourceMirrorSync).ok === false
? "source mirror sync did not complete; investigate git mirror/proxy before control-plane publish"
: "one or more publish, publicExposure, Argo or runtime observation checks did not pass",
};
const result = {
ok,
@@ -657,6 +652,7 @@ function runSentinelControlPlaneConfirmed(state: SentinelCicdState, options: Ext
...sentinelElapsedWarnings(record(publish).elapsedMs),
...sentinelElapsedWarnings(record(flush).result === undefined ? null : record(record(flush).result).durationMs),
...(Array.isArray(record(targetValidation).warnings) ? record(targetValidation).warnings.map(text) : []),
...(targetValidationBlocked ? ["targetValidation is blocked; top-level STATUS only covers sentinel control-plane rollout. HWLAB business recovery remains pending; rerun quick verify after internal DB switch completes, without public fallback or a second execution path."] : []),
])),
blocker,
next: controlPlaneNext(state, options.action),