diff --git a/project-management/PJ2026-01/specs/PJ2026-01060508-web-probe-sentinel.md b/project-management/PJ2026-01/specs/PJ2026-01060508-web-probe-sentinel.md index d7f2d539..6c39a473 100644 --- a/project-management/PJ2026-01/specs/PJ2026-01060508-web-probe-sentinel.md +++ b/project-management/PJ2026-01/specs/PJ2026-01060508-web-probe-sentinel.md @@ -277,7 +277,7 @@ sequenceDiagram Pod-->>Val: scheduler heartbeat and PVC writable ``` -哨兵自身 rollout 只验证哨兵服务健康、配置装载、PVC/SQLite 可写、metrics、dashboard 和调度循环;它不能把另一个哨兵当作 HWLAB Web 业务观察对象。 +哨兵自身 rollout 只验证哨兵服务健康、配置装载、PVC/SQLite 可写、metrics、dashboard 和调度循环;它不能把另一个哨兵当作 HWLAB Web 业务观察对象,也不能因为 HWLAB 业务 quick verify blocked 就把哨兵自身发布状态标成失败。 ### 5.7 哨兵不可用结构化失败时序图 @@ -486,7 +486,7 @@ P8 恢复判定必须把 Workbench 业务失败继续 drill-down 到运行面依 Web哨兵自身必须纳入受控且独立的 sentinel control-plane:source 来自 UniDesk `master`,镜像、GitOps path、Argo Application、publicExposure 和 targetValidation 由 Web 哨兵 owning YAML 声明。D601/v03 当前可通过 `web-probe sentinel image|control-plane` 的独立 publish Job 实现构建、推送、GitOps 写回和 Argo 收敛;后续也可以切换到 Tekton Pipeline,但 builder 类型必须来自 YAML,不得依赖 operator 本地 dirty worktree。 -哨兵 rollout 与 HWLAB runtime rollout 不是同一个滚动单元。哨兵 dashboard/API/服务代码变更应通过 Web 哨兵独立 control-plane 滚动;HWLAB runtime 发布流水只调用当前已部署哨兵的 `maintenance/start`、`maintenance/stop` 和 quick verify 作为恢复判定。哨兵 validate、maintenance 和 quick verify 控制路径必须优先走 k3s 内部 Service DNS,不绕 `monitor.pikapython.com` 公网入口。 +哨兵 rollout 与 HWLAB runtime rollout 不是同一个滚动单元。哨兵 dashboard/API/服务代码变更应通过 Web 哨兵独立 control-plane 滚动;HWLAB runtime 发布流水只调用当前已部署哨兵的 `maintenance/start`、`maintenance/stop` 和 quick verify 作为恢复判定。哨兵 control-plane 的顶层状态只表达哨兵自身 source、镜像、GitOps、Argo、runtime、metrics 和 dashboard 是否发布成功;HWLAB quick verify 必须作为独立 `targetValidation` 状态、warning 和 report 证据输出。哨兵 validate、maintenance 和 quick verify 控制路径必须优先走 k3s 内部 Service DNS,不绕 `monitor.pikapython.com` 公网入口。 哨兵镜像构建应使用 YAML 声明的 tools image、base image、registry、egress proxy 和 env-reuse 配方。Node/Bun/Playwright/Chromium 依赖不得在 runtime Pod 中临时下载。Secret 与 env 复用只走 sourceRef/keyMapping;日志、status、dashboard 和 issue closeout 只输出 object/key/presence/fingerprint/digest。 diff --git a/scripts/src/hwlab-node-web-sentinel-cicd.ts b/scripts/src/hwlab-node-web-sentinel-cicd.ts index c5ffa702..ba4ccc4e 100644 --- a/scripts/src/hwlab-node-web-sentinel-cicd.ts +++ b/scripts/src/hwlab-node-web-sentinel-cicd.ts @@ -591,7 +591,7 @@ function runSentinelControlPlaneConfirmed(state: SentinelCicdState, options: Ext reason: "runtime-not-ready", valuesRedacted: true, }; - const targetValidationOk = applyOnly || record(targetValidation).ok === true; + const targetValidationBlocked = !applyOnly && record(targetValidation).ok !== true; const ok = state.configReady && state.sourceHead.ok && (applyOnly || record(sourceMirrorSync).ok === true) @@ -599,18 +599,13 @@ function runSentinelControlPlaneConfirmed(state: SentinelCicdState, options: Ext && (applyOnly || record(flush).ok === true) && record(publicExposureApply).ok === true && record(argoApply).ok === true - && observedReady - && targetValidationOk; + && observedReady; const elapsedMs = Date.now() - startedAt; const blocker = ok ? null : { - code: targetValidationOk - ? record(sourceMirrorSync).ok === false ? "sentinel-source-mirror-sync-failed" : "sentinel-control-plane-not-ready" - : "sentinel-target-validation-failed", - reason: targetValidationOk - ? record(sourceMirrorSync).ok === false - ? "source mirror sync did not complete; investigate git mirror/proxy before control-plane publish" - : "one or more publish, publicExposure, Argo or runtime observation checks did not pass" - : text(record(targetValidation).failure ?? record(targetValidation).reason ?? "quick verify did not pass"), + code: record(sourceMirrorSync).ok === false ? "sentinel-source-mirror-sync-failed" : "sentinel-control-plane-not-ready", + reason: record(sourceMirrorSync).ok === false + ? "source mirror sync did not complete; investigate git mirror/proxy before control-plane publish" + : "one or more publish, publicExposure, Argo or runtime observation checks did not pass", }; const result = { ok, @@ -657,6 +652,7 @@ function runSentinelControlPlaneConfirmed(state: SentinelCicdState, options: Ext ...sentinelElapsedWarnings(record(publish).elapsedMs), ...sentinelElapsedWarnings(record(flush).result === undefined ? null : record(record(flush).result).durationMs), ...(Array.isArray(record(targetValidation).warnings) ? record(targetValidation).warnings.map(text) : []), + ...(targetValidationBlocked ? ["targetValidation is blocked; top-level STATUS only covers sentinel control-plane rollout. HWLAB business recovery remains pending; rerun quick verify after internal DB switch completes, without public fallback or a second execution path."] : []), ])), blocker, next: controlPlaneNext(state, options.action),