From ee0c3d19f30d58f535e1f623ac139990287ee5fc Mon Sep 17 00:00:00 2001 From: Codex Date: Wed, 1 Jul 2026 02:11:33 +0000 Subject: [PATCH] fix: gate sentinel publish on health endpoint --- .agents/skills/unidesk-cicd/SKILL.md | 7 +- .../skills/unidesk-cicd/references/full.md | 2 +- scripts/src/hwlab-node-web-sentinel-cicd.ts | 124 +++++++++--------- scripts/src/hwlab-node-web-sentinel-p5.ts | 22 ++++ 4 files changed, 92 insertions(+), 63 deletions(-) diff --git a/.agents/skills/unidesk-cicd/SKILL.md b/.agents/skills/unidesk-cicd/SKILL.md index 664f4880..eb7fe3a3 100644 --- a/.agents/skills/unidesk-cicd/SKILL.md +++ b/.agents/skills/unidesk-cicd/SKILL.md @@ -24,10 +24,11 @@ bun scripts/cli.ts agentrun control-plane status - CI/CD、GitOps、rollout、PipelineRun、Argo、git-mirror 和 AgentRun 部署必须走受控 CLI;不要用裸 `kubectl`、`argo`、`tkn`、`curl` 当正式控制入口。 - CI/CD、rollout、publish、image build 和部署链路禁止新引入 Docker 依赖;不得依赖 Docker socket、Docker daemon、host Docker、`docker build`、`docker push` 或等价 Docker-only 路径。 - 正式 CI/CD、publish、image build 和 rollout 必须走 Tekton Task/Pipeline/PipelineRun 承担 CI,并通过 GitOps/Argo 承担部署收敛;普通 Kubernetes Job 只允许用于 bounded helper、source sync、diagnostic、cleanup 或 bootstrap,不得作为正式发布、镜像构建或 rollout 入口。 -- 正式 CI/CD 必须提供一键完成入口:同一受控命令应完成 source sync、构建、发布、GitOps/Argo 收敛、runtime provenance 校验和用户入口验证;不要要求操作者手动串联多个 publish/apply/status 命令才能完成一次交付。 -- CI/CD 一键交付的端到端 wall-clock 目标是低于 2 分钟;计时从操作者触发受控命令开始,到 runtime ready 且入口验证完成为止。具体 wait/timeout/budget 字段必须从 YAML/source-of-truth 读取并配置到满足该目标。 +- 正式 CI/CD 必须提供一键完成入口:同一受控命令应完成 source sync、构建、发布、GitOps/Argo 收敛、runtime provenance 校验和 `/health` 端点验证;不要要求操作者手动串联多个 publish/apply/status 命令才能完成一次交付。 +- CI/CD 一键交付的端到端 wall-clock 目标是低于 2 分钟;计时从操作者触发受控命令开始,到 runtime ready 且 `/health` 端点验证完成为止。具体 wait/timeout/budget 字段必须从 YAML/source-of-truth 读取并配置到满足该目标。 +- CI/CD validation 阶段只能验证部署对象的 `/health` 端点和必要 provenance;禁止在 CI/CD gate 中运行 web-probe、Playwright、远程浏览器截图、用户路径 E2E 或等价重型业务探针。业务/用户入口验证只能作为发布后的独立 post-deploy validation 证据,不得阻塞 CI/CD 一键交付。 - 任一 CI/CD 阶段或总耗时超过 2 分钟时,不要继续死等或把超长等待视为正常;先输出阶段耗时分解,并优先从 env reuse、git mirror、BuildKit/cache、GitOps/Argo watch 和 runtime readiness 探测方向优化后再继续交付。 -- 触发或验收 rollout 时必须绑定 lane、source commit、PipelineRun/GitOps revision 和用户入口验证结果。 +- 触发或验收 rollout 时必须绑定 lane、source commit、PipelineRun/GitOps revision、runtime ready 和 `/health` 端点验证结果;web-probe/Playwright 结果只能作为单独的 post-deploy 证据。 - Secret 只通过 YAML sourceRef/targetKey 和受控 CLI 下发;输出只披露 presence/fingerprint。 - 长命令用异步 job 或短轮询;不要长时间挂住 trans/ssh。 diff --git a/.agents/skills/unidesk-cicd/references/full.md b/.agents/skills/unidesk-cicd/references/full.md index aa5aafeb..b778421a 100644 --- a/.agents/skills/unidesk-cicd/references/full.md +++ b/.agents/skills/unidesk-cicd/references/full.md @@ -37,7 +37,7 @@ bun scripts/cli.ts hwlab g14 monitor-prs --lane v02 [--once] [--dry-run] bun scripts/cli.ts hwlab g14 monitor-prs --lane v03 [--once] [--dry-run] ``` -只监控 base=`v0.3` 的 PR。ready PR 经 UniDesk `gh pr merge` 合并后触发 runtime lane CD,检查 PipelineRun、Argo、`hwlab-v03` runtime public probes 和 Git mirror flush,并对失败 check、冲突、CD failure/timeout 创建或更新 failure issue。public probe 必须使用 `config/hwlab-node-lanes.yaml` 选中 node/lane 的 formal public URL;D601 `v0.3` 当前是 `https://hwlab.pikapython.com`,裸 IP、FRP 端口和 legacy `20666/20667` 只作为边缘诊断证据,不能作为用户入口验收口径。 +只监控 base=`v0.3` 的 PR。ready PR 经 UniDesk `gh pr merge` 合并后触发 runtime lane CD,检查 PipelineRun、Argo、`hwlab-v03` runtime `/health` endpoint 和 Git mirror flush,并对失败 check、冲突、CD failure/timeout 创建或更新 failure issue。CI/CD validation 只允许使用部署对象的 `/health` 端点和必要 provenance;禁止在 CI/CD gate 中运行 web-probe、Playwright、远程浏览器截图或用户路径 E2E。public health probe 必须使用 `config/hwlab-node-lanes.yaml` 选中 node/lane 的 formal public URL;D601 `v0.3` 当前是 `https://hwlab.pikapython.com`,裸 IP、FRP 端口和 legacy `20666/20667` 只作为边缘诊断证据,不能作为 CI/CD 验收口径。 --- diff --git a/scripts/src/hwlab-node-web-sentinel-cicd.ts b/scripts/src/hwlab-node-web-sentinel-cicd.ts index 7209da7d..adb74ecc 100644 --- a/scripts/src/hwlab-node-web-sentinel-cicd.ts +++ b/scripts/src/hwlab-node-web-sentinel-cicd.ts @@ -18,7 +18,7 @@ import { readWebProbeSentinelConfigRefTarget } from "./hwlab-node-web-sentinel-c import { effectiveWebProbeSentinelPublicExposure, requireSentinelIdForRegistry, resolveWebProbeSentinel } from "./hwlab-node-web-sentinel-resolver"; import type { HwlabRuntimeLaneSpec } from "./hwlab-node-lanes"; import type { RenderedCliResult } from "./output"; -import { probeSentinelDashboardBrowser, runSentinelDashboard, runSentinelMaintenance, runSentinelReport, runSentinelValidate } from "./hwlab-node-web-sentinel-p5"; +import { probeSentinelRuntimeHealthEndpoint, runSentinelDashboard, runSentinelMaintenance, runSentinelReport, runSentinelValidate } from "./hwlab-node-web-sentinel-p5"; import { runChildCli, sentinelP5Next } from "./hwlab-node-web-sentinel-p5-observe"; export type WebProbeSentinelConfigAction = "plan" | "status"; @@ -353,7 +353,7 @@ function runSentinelPublishCurrent(state: SentinelCicdState, options: Extract; - let dashboardElapsedMs: number | null = null; + let health: Record; + let healthElapsedMs: number | null = null; if (controlResult.ok !== true) { - dashboard = { ok: false, skipped: true, reason: "control-plane-blocked", valuesRedacted: true }; - } else if (!publishCurrentDashboardEnabled(state)) { - dashboard = { ok: !dashboardRequired, skipped: true, reason: "disabled-by-yaml", valuesRedacted: true }; + health = { ok: false, skipped: true, reason: "control-plane-blocked", valuesRedacted: true }; } else if (remainingBudgetSeconds() < 2) { - dashboard = { ok: false, skipped: true, reason: "end-to-end-budget-exhausted-before-dashboard", valuesRedacted: true }; + health = { ok: false, skipped: true, reason: "end-to-end-budget-exhausted-before-health", valuesRedacted: true }; } else { - const dashboardStartedAt = Date.now(); - dashboard = probeSentinelDashboardBrowser(state, publishCurrentDashboardOptions(state, remainingBudgetSeconds())); - dashboardElapsedMs = Date.now() - dashboardStartedAt; - dashboard = { ...dashboard, elapsedMs: dashboardElapsedMs, valuesRedacted: true }; + const healthStartedAt = Date.now(); + health = probeSentinelRuntimeHealthEndpoint(state, remainingBudgetSeconds()); + healthElapsedMs = Date.now() - healthStartedAt; + health = { ...health, elapsedMs: healthElapsedMs, valuesRedacted: true }; } const elapsedMs = Date.now() - startedAt; - const timings = publishCurrentStageTimings(controlResult, dashboard, elapsedMs); + const timings = publishCurrentStageTimings(controlResult, health, elapsedMs); const slowStages = publishCurrentSlowStages(state, timings, budgetSeconds); const withinBudget = elapsedMs <= budgetSeconds * 1000; - const dashboardOk = dashboardRequired ? dashboard.ok === true : dashboard.ok !== false; - const ok = controlResult.ok === true && dashboardOk && withinBudget; - const blocker = ok ? null : publishCurrentBlocker(controlResult, dashboard, withinBudget); + const healthOk = health.ok === true; + const ok = controlResult.ok === true && healthOk && withinBudget; + const blocker = ok ? null : publishCurrentBlocker(controlResult, health, withinBudget); const result = { ok, command, @@ -433,9 +429,9 @@ function runSentinelPublishCurrentConfirmed(state: SentinelCicdState, options: E image: state.image, pipelineRun: record(controlResult).pipelineRun ?? sentinelPipelineRunName(state), controlPlane: controlResult, - dashboard, + health, budget, - dashboardPlan: publishCurrentDashboardPlan(state), + validationPlan: publishCurrentHealthValidationPlan(state), stageBudgets: publishCurrentStageBudgets(state), elapsedMs, withinBudget, @@ -552,7 +548,7 @@ function sentinelAlreadyCurrentControlResult(state: SentinelCicdState, observed: targetValidation: null, elapsedMs, warnings: [ - "publish-current already-current fast path: source mirror, registry, GitOps, Argo and runtime already match the selected source; skipped Tekton publish and used dashboard verification only.", + "publish-current already-current fast path: source mirror, registry, GitOps, Argo and runtime already match the selected source; skipped Tekton publish and used health endpoint validation only.", ...sentinelObservedWarnings(observed), ...targetValidationDeferredWarnings(state, false, controlPlaneWaitWarningSeconds(state)), ], @@ -754,6 +750,19 @@ function publishCurrentDashboardRequired(state: SentinelCicdState): boolean { return booleanAt(recordTarget(valueAtPath(state.cicd, "publishCurrent.dashboard"), "publishCurrent.dashboard"), "required"); } +function publishCurrentHealthValidationPlan(state: SentinelCicdState): Record { + return { + enabled: true, + required: true, + endpoint: stringAt(state.runtime, "healthPath"), + source: "runtime.healthPath", + browser: false, + playwright: false, + webProbe: false, + valuesRedacted: true, + }; +} + function publishCurrentDashboardReserveSeconds(state: SentinelCicdState): number { if (!publishCurrentDashboardEnabled(state)) return 0; const dashboard = publishCurrentDashboardPlan(state); @@ -788,7 +797,7 @@ function publishCurrentDashboardOptions(state: SentinelCicdState, timeoutSeconds }; } -function publishCurrentStageTimings(controlResult: Record, dashboard: Record, elapsedMs: number): Record { +function publishCurrentStageTimings(controlResult: Record, health: Record, elapsedMs: number): Record { const publish = record(controlResult.publish); const payload = record(publish.payload); const payloadStageTimings = record(payload.stageTimings); @@ -802,7 +811,7 @@ function publishCurrentStageTimings(controlResult: Record, dash imageBuildMs: finiteNumberOrNull(stageTimings.imageBuildMs), gitopsMs: finiteNumberOrNull(stageTimings.gitopsMs), argoRuntimeMs: finiteNumberOrNull(observedWait.elapsedMs), - dashboardVerifyMs: finiteNumberOrNull(dashboard.elapsedMs), + healthValidationMs: finiteNumberOrNull(health.elapsedMs), totalMs: elapsedMs, valuesRedacted: true, }; @@ -817,7 +826,7 @@ function publishCurrentSlowStages(state: SentinelCicdState, timings: Record { const elapsed = finiteNumberOrNull(timings[timingKey]); @@ -831,7 +840,7 @@ function publishCurrentSlowStages(state: SentinelCicdState, timings: Record, dashboard: Record, withinBudget: boolean): Record { +function publishCurrentBlocker(controlResult: Record, health: Record, withinBudget: boolean): Record { if (controlResult.ok !== true) { const blocker = record(controlResult.blocker); return { @@ -853,22 +862,22 @@ function publishCurrentBlocker(controlResult: Record, dashboard valuesRedacted: true, }; } - if (dashboard.ok !== true) { - const degradedReason = text(dashboard.degradedReason); + if (health.ok !== true) { + const degradedReason = text(health.degradedReason); return { - code: dashboard.skipped === true ? text(dashboard.reason) : "sentinel-publish-current-dashboard-verify-failed", - reason: dashboard.skipped === true - ? "dashboard verification did not run" + code: health.skipped === true ? text(health.reason) : "sentinel-publish-current-health-endpoint-failed", + reason: health.skipped === true + ? "health endpoint validation did not run" : degradedReason === "-" - ? "dashboard verification did not pass" - : `dashboard verification did not pass: ${degradedReason}`, + ? "health endpoint validation did not pass" + : `health endpoint validation did not pass: ${degradedReason}`, valuesRedacted: true, }; } if (!withinBudget) { return { code: "sentinel-publish-current-over-budget", - reason: "runtime and dashboard converged, but the one-click CI/CD path exceeded the YAML end-to-end budget", + reason: "runtime and health endpoint converged, but the one-click CI/CD path exceeded the YAML end-to-end budget", valuesRedacted: true, }; } @@ -2886,7 +2895,7 @@ function sentinelSourceMirrorAlreadyPresentResult(state: SentinelCicdState, prob function targetValidationDeferredWarnings(state: SentinelCicdState, applyOnly: boolean, budgetSeconds: number): string[] { if (applyOnly) return []; const next = sentinelP5Next(state); - return [`targetValidation quick verify is deferred from control-plane confirm-wait to keep CI/CD wait under ${Math.round(budgetSeconds)}s; run ${next.quickVerify}.`]; + return [`targetValidation quick verify is outside the CI/CD validation gate; run ${next.quickVerify} only as separate post-deploy evidence if needed.`]; } export function targetValidationElapsedWarnings(value: unknown, subject: string, budgetSeconds: number): string[] { @@ -3559,15 +3568,12 @@ function renderPublishCurrentResult(result: Record): string { const argo = record(observed.argo); const runtime = record(observed.runtime); const runtimeDeployment = record(record(runtime.probe).deployment); - const dashboard = record(result.dashboard); - const dashboardPage = record(dashboard.page); - const dashboardDom = record(dashboardPage.dom); - const latestRunCounts = record(dashboardDom.latestRunCounts); - const checkScope = record(dashboardDom.checkScope); + const health = record(result.health); + const healthBody = record(record(health.health).bodyJson); const timings = record(result.timings); const budget = record(result.budget); const stageBudgets = record(result.stageBudgets); - const dashboardPlan = record(result.dashboardPlan); + const validationPlan = record(result.validationPlan); const blocker = record(result.blocker); const next = record(result.next); const warnings = Array.isArray(result.warnings) ? result.warnings : []; @@ -3593,27 +3599,27 @@ function renderPublishCurrentResult(result: Record): string { result.pipelineRun ?? publish.jobName ?? "-", ]]), "", - table(["GITOPS_REV", "ARGO_REV", "ARGO", "RUNTIME_IMAGE", "RUNTIME_READY", "DASHBOARD"], [[ + table(["GITOPS_REV", "ARGO_REV", "ARGO", "RUNTIME_IMAGE", "RUNTIME_READY", "HEALTH"], [[ short(gitops.revision), short(argo.revision), `${argo.syncStatus ?? "-"}/${argo.healthStatus ?? "-"}`, short(runtimeDeployment.image), `${runtimeDeployment.readyReplicas ?? "-"}/${runtimeDeployment.desiredReplicas ?? "-"}`, - dashboard.ok === true ? "pass" : dashboard.skipped === true ? `skipped:${text(dashboard.reason)}` : Object.keys(dashboard).length === 0 ? "planned" : "blocked", + health.ok === true ? "pass" : health.skipped === true ? `skipped:${text(health.reason)}` : Object.keys(health).length === 0 ? "planned" : "blocked", ]]), "", - table(["SOURCE_SYNC_MS", "SOURCE_FETCH_MS", "VERIFY_MS", "IMAGE_MS", "GITOPS_MS", "ARGO_RUNTIME_MS", "DASHBOARD_MS", "TOTAL_MS"], [[ + table(["SOURCE_SYNC_MS", "SOURCE_FETCH_MS", "VERIFY_MS", "IMAGE_MS", "GITOPS_MS", "ARGO_RUNTIME_MS", "VALIDATION_MS", "TOTAL_MS"], [[ timings.sourceSyncMs ?? "-", timings.sourceFetchMs ?? "-", timings.monitorWebVerifyMs ?? "-", timings.imageBuildMs ?? "-", timings.gitopsMs ?? "-", timings.argoRuntimeMs ?? "-", - timings.dashboardVerifyMs ?? "-", + timings.healthValidationMs ?? "-", timings.totalMs ?? "-", ]]), "", - table(["BUDGET_SOURCE", "SOURCE_SYNC", "SOURCE_FETCH", "VERIFY", "IMAGE", "GITOPS", "ARGO_RUNTIME", "DASHBOARD"], [[ + table(["BUDGET_SOURCE", "SOURCE_SYNC", "SOURCE_FETCH", "VERIFY", "IMAGE", "GITOPS", "ARGO_RUNTIME", "VALIDATION"], [[ "YAML publishCurrent", stageBudgets.sourceSyncSeconds ?? "-", stageBudgets.sourceFetchSeconds ?? "-", @@ -3641,16 +3647,15 @@ function renderPublishCurrentResult(result: Record): string { } lines.push( "", - Object.keys(dashboard).length === 0 - ? "DASHBOARD_VERIFY\n-" - : table(["URL", "HTTP", "LATEST_RUN", "CHECK_SCOPE", "CHECK_MATCH", "REQ_FAIL", "CONSOLE_ERR"], [[ - dashboard.publicUrl ?? "-", - dashboardPage.httpStatus ?? "-", - latestRunCounts.runId ?? "-", - checkScope.scope ?? "-", - checkScope.matchesRunDetail ?? "-", - dashboardPage.requestFailureCount ?? "-", - dashboardPage.consoleErrorCount ?? "-", + Object.keys(health).length === 0 + ? "HEALTH_VALIDATION\n-" + : table(["ENDPOINT", "HTTP", "OK", "STATUS", "PUBLIC_URL", "INTERNAL_URL"], [[ + health.endpoint ?? validationPlan.endpoint ?? "-", + health.httpStatus ?? "-", + healthBody.ok ?? "-", + healthBody.status ?? "-", + health.publicUrl ?? "-", + health.internalUrl ?? "-", ]]), "", slowStages.length === 0 ? "SLOW_STAGES\n-" : [ @@ -3665,12 +3670,13 @@ function renderPublishCurrentResult(result: Record): string { "NEXT", ` publish-current: ${next.publishCurrent ?? "-"}`, ` status: ${next.controlPlaneStatus ?? "-"}`, - ` dashboard: ${next.dashboardVerify ?? "-"}`, + ` post-deploy-dashboard: ${next.dashboardVerify ?? "-"}`, ` git-mirror: ${next.gitMirrorStatus ?? "-"}`, ` flush: ${next.gitMirrorFlush ?? "-"}`, "", "DISCLOSURE", - ` end-to-end and stage budgets are read from ${Object.keys(dashboardPlan).length > 0 ? "publishCurrent YAML" : "YAML-required publishCurrent fields"}.`, + ` end-to-end and stage budgets are read from ${Object.keys(validationPlan).length > 0 ? "publishCurrent YAML and runtime.healthPath" : "YAML-required publishCurrent fields"}.`, + " CI/CD validation only checks the configured health endpoint; web-probe, Playwright and browser dashboard checks are post-deploy evidence, not this gate.", " image build uses Tekton PipelineRun and BuildKit; this command does not require Docker daemon/socket/build.", ); return lines.join("\n"); diff --git a/scripts/src/hwlab-node-web-sentinel-p5.ts b/scripts/src/hwlab-node-web-sentinel-p5.ts index c90cf215..a47781a8 100644 --- a/scripts/src/hwlab-node-web-sentinel-p5.ts +++ b/scripts/src/hwlab-node-web-sentinel-p5.ts @@ -198,6 +198,28 @@ export function runSentinelValidate(state: SentinelCicdState, options: Extract { + const endpoint = stringAt(state.runtime, "healthPath"); + const serviceProbeTimeoutSeconds = Math.min(timeoutSeconds, 20); + const health = callSentinelService(state, "GET", endpoint, null, serviceProbeTimeoutSeconds); + const publicHealth = health.ok ? null : probePublicSentinelService(state, endpoint, serviceProbeTimeoutSeconds); + const effectiveHealth = health.ok ? health : record(publicHealth).ok === true ? record(publicHealth) : health; + const bodyJson = record(effectiveHealth.bodyJson); + const ok = effectiveHealth.ok === true && bodyJson.ok === true; + return { + ok, + endpoint, + health: effectiveHealth, + internalHealth: health, + publicHealth, + httpStatus: effectiveHealth.httpStatus ?? null, + publicUrl: effectiveHealth.publicUrl ?? null, + internalUrl: effectiveHealth.internalUrl ?? null, + degradedReason: ok ? null : "sentinel-runtime-health-endpoint-failed", + valuesRedacted: true, + }; +} + export function runSentinelReport(state: SentinelCicdState, options: Extract): RenderedCliResult { const command = `web-probe sentinel report ${options.latest ? "--latest " : ""}--view ${options.view}`; const query = new URLSearchParams({ view: options.view });