fix: decouple sentinel rollout status from target validation (#984)
Co-authored-by: Codex <codex@noreply.local>
This commit is contained in:
@@ -277,7 +277,7 @@ sequenceDiagram
|
||||
Pod-->>Val: scheduler heartbeat and PVC writable
|
||||
```
|
||||
|
||||
哨兵自身 rollout 只验证哨兵服务健康、配置装载、PVC/SQLite 可写、metrics、dashboard 和调度循环;它不能把另一个哨兵当作 HWLAB Web 业务观察对象。
|
||||
哨兵自身 rollout 只验证哨兵服务健康、配置装载、PVC/SQLite 可写、metrics、dashboard 和调度循环;它不能把另一个哨兵当作 HWLAB Web 业务观察对象,也不能因为 HWLAB 业务 quick verify blocked 就把哨兵自身发布状态标成失败。
|
||||
|
||||
### 5.7 哨兵不可用结构化失败时序图
|
||||
|
||||
@@ -486,7 +486,7 @@ P8 恢复判定必须把 Workbench 业务失败继续 drill-down 到运行面依
|
||||
|
||||
Web 哨兵自身必须纳入受控且独立的 sentinel control-plane:source 来自 UniDesk `master`,镜像、GitOps path、Argo Application、publicExposure 和 targetValidation 由 Web 哨兵 owning YAML 声明。D601/v03 当前可通过 `web-probe sentinel image|control-plane` 的独立 publish Job 实现构建、推送、GitOps 写回和 Argo 收敛;后续也可以切换到 Tekton Pipeline,但 builder 类型必须来自 YAML,不得依赖 operator 本地 dirty worktree。
|
||||
|
||||
哨兵 rollout 与 HWLAB runtime rollout 不是同一个滚动单元。哨兵 dashboard/API/服务代码变更应通过 Web 哨兵独立 control-plane 滚动;HWLAB runtime 发布流水只调用当前已部署哨兵的 `maintenance/start`、`maintenance/stop` 和 quick verify 作为恢复判定。哨兵 validate、maintenance 和 quick verify 控制路径必须优先走 k3s 内部 Service DNS,不绕 `monitor.pikapython.com` 公网入口。
|
||||
哨兵 rollout 与 HWLAB runtime rollout 不是同一个滚动单元。哨兵 dashboard/API/服务代码变更应通过 Web 哨兵独立 control-plane 滚动;HWLAB runtime 发布流水只调用当前已部署哨兵的 `maintenance/start`、`maintenance/stop` 和 quick verify 作为恢复判定。哨兵 control-plane 的顶层状态只表达哨兵自身 source、镜像、GitOps、Argo、runtime、metrics 和 dashboard 是否发布成功;HWLAB quick verify 必须作为独立 `targetValidation` 状态、warning 和 report 证据输出。哨兵 validate、maintenance 和 quick verify 控制路径必须优先走 k3s 内部 Service DNS,不绕 `monitor.pikapython.com` 公网入口。
|
||||
|
||||
哨兵镜像构建应使用 YAML 声明的 tools image、base image、registry、egress proxy 和 env-reuse 配方。Node/Bun/Playwright/Chromium 依赖不得在 runtime Pod 中临时下载。Secret 与 env 复用只走 sourceRef/keyMapping;日志、status、dashboard 和 issue closeout 只输出 object/key/presence/fingerprint/digest。
|
||||
|
||||
|
||||
@@ -591,7 +591,7 @@ function runSentinelControlPlaneConfirmed(state: SentinelCicdState, options: Ext
|
||||
reason: "runtime-not-ready",
|
||||
valuesRedacted: true,
|
||||
};
|
||||
const targetValidationOk = applyOnly || record(targetValidation).ok === true;
|
||||
const targetValidationBlocked = !applyOnly && record(targetValidation).ok !== true;
|
||||
const ok = state.configReady
|
||||
&& state.sourceHead.ok
|
||||
&& (applyOnly || record(sourceMirrorSync).ok === true)
|
||||
@@ -599,18 +599,13 @@ function runSentinelControlPlaneConfirmed(state: SentinelCicdState, options: Ext
|
||||
&& (applyOnly || record(flush).ok === true)
|
||||
&& record(publicExposureApply).ok === true
|
||||
&& record(argoApply).ok === true
|
||||
&& observedReady
|
||||
&& targetValidationOk;
|
||||
&& observedReady;
|
||||
const elapsedMs = Date.now() - startedAt;
|
||||
const blocker = ok ? null : {
|
||||
code: targetValidationOk
|
||||
? record(sourceMirrorSync).ok === false ? "sentinel-source-mirror-sync-failed" : "sentinel-control-plane-not-ready"
|
||||
: "sentinel-target-validation-failed",
|
||||
reason: targetValidationOk
|
||||
? record(sourceMirrorSync).ok === false
|
||||
? "source mirror sync did not complete; investigate git mirror/proxy before control-plane publish"
|
||||
: "one or more publish, publicExposure, Argo or runtime observation checks did not pass"
|
||||
: text(record(targetValidation).failure ?? record(targetValidation).reason ?? "quick verify did not pass"),
|
||||
code: record(sourceMirrorSync).ok === false ? "sentinel-source-mirror-sync-failed" : "sentinel-control-plane-not-ready",
|
||||
reason: record(sourceMirrorSync).ok === false
|
||||
? "source mirror sync did not complete; investigate git mirror/proxy before control-plane publish"
|
||||
: "one or more publish, publicExposure, Argo or runtime observation checks did not pass",
|
||||
};
|
||||
const result = {
|
||||
ok,
|
||||
@@ -657,6 +652,7 @@ function runSentinelControlPlaneConfirmed(state: SentinelCicdState, options: Ext
|
||||
...sentinelElapsedWarnings(record(publish).elapsedMs),
|
||||
...sentinelElapsedWarnings(record(flush).result === undefined ? null : record(record(flush).result).durationMs),
|
||||
...(Array.isArray(record(targetValidation).warnings) ? record(targetValidation).warnings.map(text) : []),
|
||||
...(targetValidationBlocked ? ["targetValidation is blocked; top-level STATUS only covers sentinel control-plane rollout. HWLAB business recovery remains pending; rerun quick verify after internal DB switch completes, without public fallback or a second execution path."] : []),
|
||||
])),
|
||||
blocker,
|
||||
next: controlPlaneNext(state, options.action),
|
||||
|
||||
Reference in New Issue
Block a user