fix: gate sentinel publish on health endpoint

This commit is contained in:
Codex
2026-07-01 02:11:33 +00:00
parent fbcf6c02bf
commit ee0c3d19f3
4 changed files with 92 additions and 63 deletions
+4 -3
View File
@@ -24,10 +24,11 @@ bun scripts/cli.ts agentrun control-plane status
- CI/CD、GitOps、rollout、PipelineRun、Argo、git-mirror 和 AgentRun 部署必须走受控 CLI;不要用裸 `kubectl`、`argo`、`tkn`、`curl` 当正式控制入口。
- CI/CD、rollout、publish、image build 和部署链路禁止新引入 Docker 依赖;不得依赖 Docker socket、Docker daemon、host Docker、`docker build`、`docker push` 或等价 Docker-only 路径。
- 正式 CI/CD、publish、image build 和 rollout 必须走 Tekton Task/Pipeline/PipelineRun 承担 CI,并通过 GitOps/Argo 承担部署收敛;普通 Kubernetes Job 只允许用于 bounded helper、source sync、diagnostic、cleanup 或 bootstrap,不得作为正式发布、镜像构建或 rollout 入口。
- 正式 CI/CD 必须提供一键完成入口:同一受控命令应完成 source sync、构建、发布、GitOps/Argo 收敛、runtime provenance 校验和用户入口验证;不要要求操作者手动串联多个 publish/apply/status 命令才能完成一次交付。
- CI/CD 一键交付的端到端 wall-clock 目标是低于 2 分钟;计时从操作者触发受控命令开始,到 runtime ready 且入口验证完成为止。具体 wait/timeout/budget 字段必须从 YAML/source-of-truth 读取并配置到满足该目标。
- 正式 CI/CD 必须提供一键完成入口:同一受控命令应完成 source sync、构建、发布、GitOps/Argo 收敛、runtime provenance 校验和 `/health` 端点验证;不要要求操作者手动串联多个 publish/apply/status 命令才能完成一次交付。
- CI/CD 一键交付的端到端 wall-clock 目标是低于 2 分钟;计时从操作者触发受控命令开始,到 runtime ready 且 `/health` 端点验证完成为止。具体 wait/timeout/budget 字段必须从 YAML/source-of-truth 读取并配置到满足该目标。
- CI/CD validation 阶段只能验证部署对象的 `/health` 端点和必要 provenance;禁止在 CI/CD gate 中运行 web-probe、Playwright、远程浏览器截图、用户路径 E2E 或等价重型业务探针。业务/用户入口验证只能作为发布后的独立 post-deploy validation 证据,不得阻塞 CI/CD 一键交付。
- 任一 CI/CD 阶段或总耗时超过 2 分钟时,不要继续死等或把超长等待视为正常;先输出阶段耗时分解,并优先从 env reuse、git mirror、BuildKit/cache、GitOps/Argo watch 和 runtime readiness 探测方向优化后再继续交付。
- 触发或验收 rollout 时必须绑定 lane、source commit、PipelineRun/GitOps revision 和用户入口验证结果
- 触发或验收 rollout 时必须绑定 lane、source commit、PipelineRun/GitOps revision、runtime ready 和 `/health` 端点验证结果;web-probe/Playwright 结果只能作为单独的 post-deploy 证据。
- Secret 只通过 YAML sourceRef/targetKey 和受控 CLI 下发;输出只披露 presence/fingerprint。
- 长命令用异步 job 或短轮询;不要长时间挂住 trans/ssh。
@@ -37,7 +37,7 @@ bun scripts/cli.ts hwlab g14 monitor-prs --lane v02 [--once] [--dry-run]
bun scripts/cli.ts hwlab g14 monitor-prs --lane v03 [--once] [--dry-run]
```
只监控 base=`v0.3` 的 PR。ready PR 经 UniDesk `gh pr merge` 合并后触发 runtime lane CD,检查 PipelineRun、Argo、`hwlab-v03` runtime public probes 和 Git mirror flush,并对失败 check、冲突、CD failure/timeout 创建或更新 failure issue。public probe 必须使用 `config/hwlab-node-lanes.yaml` 选中 node/lane 的 formal public URL;D601 `v0.3` 当前是 `https://hwlab.pikapython.com`,裸 IP、FRP 端口和 legacy `20666/20667` 只作为边缘诊断证据,不能作为用户入口验收口径。
只监控 base=`v0.3` 的 PR。ready PR 经 UniDesk `gh pr merge` 合并后触发 runtime lane CD,检查 PipelineRun、Argo、`hwlab-v03` runtime `/health` endpoint 和 Git mirror flush,并对失败 check、冲突、CD failure/timeout 创建或更新 failure issue。CI/CD validation 只允许使用部署对象的 `/health` 端点和必要 provenance;禁止在 CI/CD gate 中运行 web-probe、Playwright、远程浏览器截图或用户路径 E2E。public health probe 必须使用 `config/hwlab-node-lanes.yaml` 选中 node/lane 的 formal public URL;D601 `v0.3` 当前是 `https://hwlab.pikapython.com`,裸 IP、FRP 端口和 legacy `20666/20667` 只作为边缘诊断证据,不能作为 CI/CD 验收口径。
---
+65 -59
View File
@@ -18,7 +18,7 @@ import { readWebProbeSentinelConfigRefTarget } from "./hwlab-node-web-sentinel-c
import { effectiveWebProbeSentinelPublicExposure, requireSentinelIdForRegistry, resolveWebProbeSentinel } from "./hwlab-node-web-sentinel-resolver";
import type { HwlabRuntimeLaneSpec } from "./hwlab-node-lanes";
import type { RenderedCliResult } from "./output";
import { probeSentinelDashboardBrowser, runSentinelDashboard, runSentinelMaintenance, runSentinelReport, runSentinelValidate } from "./hwlab-node-web-sentinel-p5";
import { probeSentinelRuntimeHealthEndpoint, runSentinelDashboard, runSentinelMaintenance, runSentinelReport, runSentinelValidate } from "./hwlab-node-web-sentinel-p5";
import { runChildCli, sentinelP5Next } from "./hwlab-node-web-sentinel-p5-observe";
export type WebProbeSentinelConfigAction = "plan" | "status";
@@ -353,7 +353,7 @@ function runSentinelPublishCurrent(state: SentinelCicdState, options: Extract<We
applicationName: stringAt(state.cicd, "argo.applicationName"),
},
budget: publishCurrentBudget(state),
dashboardPlan: publishCurrentDashboardPlan(state),
validationPlan: publishCurrentHealthValidationPlan(state),
stageBudgets: publishCurrentStageBudgets(state),
blocker: state.configReady && state.sourceHead.ok ? null : { code: "sentinel-publish-current-plan-blocked", reason: "sentinel config or source head is not ready" },
next: publishCurrentNext(state),
@@ -386,7 +386,6 @@ function runSentinelPublishCurrentConfirmed(state: SentinelCicdState, options: E
}
}
}
const dashboardReserveSeconds = publishCurrentDashboardReserveSeconds(state);
controlResult ??= sentinelControlPlaneConfirmedResult(state, {
kind: "control-plane",
action: "trigger-current",
@@ -396,30 +395,27 @@ function runSentinelPublishCurrentConfirmed(state: SentinelCicdState, options: E
dryRun: false,
confirm: true,
wait: true,
timeoutSeconds: Math.max(1, remainingBudgetSeconds() - dashboardReserveSeconds),
timeoutSeconds: Math.max(1, remainingBudgetSeconds()),
});
const dashboardRequired = publishCurrentDashboardRequired(state);
let dashboard: Record<string, unknown>;
let dashboardElapsedMs: number | null = null;
let health: Record<string, unknown>;
let healthElapsedMs: number | null = null;
if (controlResult.ok !== true) {
dashboard = { ok: false, skipped: true, reason: "control-plane-blocked", valuesRedacted: true };
} else if (!publishCurrentDashboardEnabled(state)) {
dashboard = { ok: !dashboardRequired, skipped: true, reason: "disabled-by-yaml", valuesRedacted: true };
health = { ok: false, skipped: true, reason: "control-plane-blocked", valuesRedacted: true };
} else if (remainingBudgetSeconds() < 2) {
dashboard = { ok: false, skipped: true, reason: "end-to-end-budget-exhausted-before-dashboard", valuesRedacted: true };
health = { ok: false, skipped: true, reason: "end-to-end-budget-exhausted-before-health", valuesRedacted: true };
} else {
const dashboardStartedAt = Date.now();
dashboard = probeSentinelDashboardBrowser(state, publishCurrentDashboardOptions(state, remainingBudgetSeconds()));
dashboardElapsedMs = Date.now() - dashboardStartedAt;
dashboard = { ...dashboard, elapsedMs: dashboardElapsedMs, valuesRedacted: true };
const healthStartedAt = Date.now();
health = probeSentinelRuntimeHealthEndpoint(state, remainingBudgetSeconds());
healthElapsedMs = Date.now() - healthStartedAt;
health = { ...health, elapsedMs: healthElapsedMs, valuesRedacted: true };
}
const elapsedMs = Date.now() - startedAt;
const timings = publishCurrentStageTimings(controlResult, dashboard, elapsedMs);
const timings = publishCurrentStageTimings(controlResult, health, elapsedMs);
const slowStages = publishCurrentSlowStages(state, timings, budgetSeconds);
const withinBudget = elapsedMs <= budgetSeconds * 1000;
const dashboardOk = dashboardRequired ? dashboard.ok === true : dashboard.ok !== false;
const ok = controlResult.ok === true && dashboardOk && withinBudget;
const blocker = ok ? null : publishCurrentBlocker(controlResult, dashboard, withinBudget);
const healthOk = health.ok === true;
const ok = controlResult.ok === true && healthOk && withinBudget;
const blocker = ok ? null : publishCurrentBlocker(controlResult, health, withinBudget);
const result = {
ok,
command,
@@ -433,9 +429,9 @@ function runSentinelPublishCurrentConfirmed(state: SentinelCicdState, options: E
image: state.image,
pipelineRun: record(controlResult).pipelineRun ?? sentinelPipelineRunName(state),
controlPlane: controlResult,
dashboard,
health,
budget,
dashboardPlan: publishCurrentDashboardPlan(state),
validationPlan: publishCurrentHealthValidationPlan(state),
stageBudgets: publishCurrentStageBudgets(state),
elapsedMs,
withinBudget,
@@ -552,7 +548,7 @@ function sentinelAlreadyCurrentControlResult(state: SentinelCicdState, observed:
targetValidation: null,
elapsedMs,
warnings: [
"publish-current already-current fast path: source mirror, registry, GitOps, Argo and runtime already match the selected source; skipped Tekton publish and used dashboard verification only.",
"publish-current already-current fast path: source mirror, registry, GitOps, Argo and runtime already match the selected source; skipped Tekton publish and used health endpoint validation only.",
...sentinelObservedWarnings(observed),
...targetValidationDeferredWarnings(state, false, controlPlaneWaitWarningSeconds(state)),
],
@@ -754,6 +750,19 @@ function publishCurrentDashboardRequired(state: SentinelCicdState): boolean {
return booleanAt(recordTarget(valueAtPath(state.cicd, "publishCurrent.dashboard"), "publishCurrent.dashboard"), "required");
}
/**
 * Describes the validation step of publish-current as a plain, redacted record.
 *
 * The plan is always enabled and required, points at the path stored under
 * `runtime.healthPath`, and explicitly reports that no browser, Playwright or
 * web-probe check takes part in it.
 *
 * @param state - Sentinel CI/CD state whose `runtime.healthPath` supplies the endpoint.
 * @returns The validation plan record included in publish-current results.
 */
function publishCurrentHealthValidationPlan(state: SentinelCicdState): Record<string, unknown> {
  const healthEndpoint = stringAt(state.runtime, "healthPath");
  const plan: Record<string, unknown> = {
    enabled: true,
    required: true,
    endpoint: healthEndpoint,
    // Names the configuration path the endpoint value was read from.
    source: "runtime.healthPath",
    browser: false,
    playwright: false,
    webProbe: false,
    valuesRedacted: true,
  };
  return plan;
}
function publishCurrentDashboardReserveSeconds(state: SentinelCicdState): number {
if (!publishCurrentDashboardEnabled(state)) return 0;
const dashboard = publishCurrentDashboardPlan(state);
@@ -788,7 +797,7 @@ function publishCurrentDashboardOptions(state: SentinelCicdState, timeoutSeconds
};
}
function publishCurrentStageTimings(controlResult: Record<string, unknown>, dashboard: Record<string, unknown>, elapsedMs: number): Record<string, unknown> {
function publishCurrentStageTimings(controlResult: Record<string, unknown>, health: Record<string, unknown>, elapsedMs: number): Record<string, unknown> {
const publish = record(controlResult.publish);
const payload = record(publish.payload);
const payloadStageTimings = record(payload.stageTimings);
@@ -802,7 +811,7 @@ function publishCurrentStageTimings(controlResult: Record<string, unknown>, dash
imageBuildMs: finiteNumberOrNull(stageTimings.imageBuildMs),
gitopsMs: finiteNumberOrNull(stageTimings.gitopsMs),
argoRuntimeMs: finiteNumberOrNull(observedWait.elapsedMs),
dashboardVerifyMs: finiteNumberOrNull(dashboard.elapsedMs),
healthValidationMs: finiteNumberOrNull(health.elapsedMs),
totalMs: elapsedMs,
valuesRedacted: true,
};
@@ -817,7 +826,7 @@ function publishCurrentSlowStages(state: SentinelCicdState, timings: Record<stri
["image-build", "imageBuildMs", "imageBuildSeconds", "verify env reuse node_modules hit, BuildKit layer cache, copy-only Containerfile and image-build proxy route"],
["gitops", "gitopsMs", "gitopsSeconds", "inspect GitOps mirror cache, commit/writeback latency and post-flush state"],
["argo-runtime", "argoRuntimeMs", "argoRuntimeSeconds", "inspect Argo refresh, runtime Deployment readiness and image digest alignment probes"],
["dashboard-verify", "dashboardVerifyMs", "dashboardVerifySeconds", "inspect remote browser startup, monitor-web public route and dashboard API latency"],
["health-validation", "healthValidationMs", "dashboardVerifySeconds", "inspect runtime health endpoint latency and service routing"],
];
const slow = stageMap.flatMap(([stage, timingKey, budgetKey, suggestion]) => {
const elapsed = finiteNumberOrNull(timings[timingKey]);
@@ -831,7 +840,7 @@ function publishCurrentSlowStages(state: SentinelCicdState, timings: Record<stri
stage: "total",
elapsedMs: total,
budgetSeconds,
suggestion: "stop blind waiting; use the stage table to optimize the largest source sync, BuildKit/cache, GitOps/Argo or dashboard segment before rerun",
suggestion: "stop blind waiting; use the stage table to optimize the largest source sync, BuildKit/cache, GitOps/Argo or health endpoint segment before rerun",
valuesRedacted: true,
});
}
@@ -844,7 +853,7 @@ function publishCurrentBudgetWarnings(slowStages: readonly Record<string, unknow
return warnings;
}
function publishCurrentBlocker(controlResult: Record<string, unknown>, dashboard: Record<string, unknown>, withinBudget: boolean): Record<string, unknown> {
function publishCurrentBlocker(controlResult: Record<string, unknown>, health: Record<string, unknown>, withinBudget: boolean): Record<string, unknown> {
if (controlResult.ok !== true) {
const blocker = record(controlResult.blocker);
return {
@@ -853,22 +862,22 @@ function publishCurrentBlocker(controlResult: Record<string, unknown>, dashboard
valuesRedacted: true,
};
}
if (dashboard.ok !== true) {
const degradedReason = text(dashboard.degradedReason);
if (health.ok !== true) {
const degradedReason = text(health.degradedReason);
return {
code: dashboard.skipped === true ? text(dashboard.reason) : "sentinel-publish-current-dashboard-verify-failed",
reason: dashboard.skipped === true
? "dashboard verification did not run"
code: health.skipped === true ? text(health.reason) : "sentinel-publish-current-health-endpoint-failed",
reason: health.skipped === true
? "health endpoint validation did not run"
: degradedReason === "-"
? "dashboard verification did not pass"
: `dashboard verification did not pass: ${degradedReason}`,
? "health endpoint validation did not pass"
: `health endpoint validation did not pass: ${degradedReason}`,
valuesRedacted: true,
};
}
if (!withinBudget) {
return {
code: "sentinel-publish-current-over-budget",
reason: "runtime and dashboard converged, but the one-click CI/CD path exceeded the YAML end-to-end budget",
reason: "runtime and health endpoint converged, but the one-click CI/CD path exceeded the YAML end-to-end budget",
valuesRedacted: true,
};
}
@@ -2886,7 +2895,7 @@ function sentinelSourceMirrorAlreadyPresentResult(state: SentinelCicdState, prob
/**
 * Builds the warning list explaining that target-validation quick verify is not
 * run as part of this control-plane path.
 *
 * @param state - Sentinel CI/CD state used to resolve the follow-up commands via `sentinelP5Next`.
 * @param applyOnly - When true no warning is produced and an empty list is returned.
 * @param budgetSeconds - Wait budget in seconds; only the first return statement below interpolates it.
 * @returns An empty array for apply-only runs, otherwise a single warning that names `next.quickVerify`.
 */
function targetValidationDeferredWarnings(state: SentinelCicdState, applyOnly: boolean, budgetSeconds: number): string[] {
if (applyOnly) return [];
const next = sentinelP5Next(state);
// NOTE(review): two consecutive return statements appear here, presumably the removed and added
// lines of a diff rendered without +/- markers. As written only the first executes and the
// second is unreachable; confirm which message is intended before relying on this text.
return [`targetValidation quick verify is deferred from control-plane confirm-wait to keep CI/CD wait under ${Math.round(budgetSeconds)}s; run ${next.quickVerify}.`];
return [`targetValidation quick verify is outside the CI/CD validation gate; run ${next.quickVerify} only as separate post-deploy evidence if needed.`];
}
export function targetValidationElapsedWarnings(value: unknown, subject: string, budgetSeconds: number): string[] {
@@ -3559,15 +3568,12 @@ function renderPublishCurrentResult(result: Record<string, unknown>): string {
const argo = record(observed.argo);
const runtime = record(observed.runtime);
const runtimeDeployment = record(record(runtime.probe).deployment);
const dashboard = record(result.dashboard);
const dashboardPage = record(dashboard.page);
const dashboardDom = record(dashboardPage.dom);
const latestRunCounts = record(dashboardDom.latestRunCounts);
const checkScope = record(dashboardDom.checkScope);
const health = record(result.health);
const healthBody = record(record(health.health).bodyJson);
const timings = record(result.timings);
const budget = record(result.budget);
const stageBudgets = record(result.stageBudgets);
const dashboardPlan = record(result.dashboardPlan);
const validationPlan = record(result.validationPlan);
const blocker = record(result.blocker);
const next = record(result.next);
const warnings = Array.isArray(result.warnings) ? result.warnings : [];
@@ -3593,27 +3599,27 @@ function renderPublishCurrentResult(result: Record<string, unknown>): string {
result.pipelineRun ?? publish.jobName ?? "-",
]]),
"",
table(["GITOPS_REV", "ARGO_REV", "ARGO", "RUNTIME_IMAGE", "RUNTIME_READY", "DASHBOARD"], [[
table(["GITOPS_REV", "ARGO_REV", "ARGO", "RUNTIME_IMAGE", "RUNTIME_READY", "HEALTH"], [[
short(gitops.revision),
short(argo.revision),
`${argo.syncStatus ?? "-"}/${argo.healthStatus ?? "-"}`,
short(runtimeDeployment.image),
`${runtimeDeployment.readyReplicas ?? "-"}/${runtimeDeployment.desiredReplicas ?? "-"}`,
dashboard.ok === true ? "pass" : dashboard.skipped === true ? `skipped:${text(dashboard.reason)}` : Object.keys(dashboard).length === 0 ? "planned" : "blocked",
health.ok === true ? "pass" : health.skipped === true ? `skipped:${text(health.reason)}` : Object.keys(health).length === 0 ? "planned" : "blocked",
]]),
"",
table(["SOURCE_SYNC_MS", "SOURCE_FETCH_MS", "VERIFY_MS", "IMAGE_MS", "GITOPS_MS", "ARGO_RUNTIME_MS", "DASHBOARD_MS", "TOTAL_MS"], [[
table(["SOURCE_SYNC_MS", "SOURCE_FETCH_MS", "VERIFY_MS", "IMAGE_MS", "GITOPS_MS", "ARGO_RUNTIME_MS", "VALIDATION_MS", "TOTAL_MS"], [[
timings.sourceSyncMs ?? "-",
timings.sourceFetchMs ?? "-",
timings.monitorWebVerifyMs ?? "-",
timings.imageBuildMs ?? "-",
timings.gitopsMs ?? "-",
timings.argoRuntimeMs ?? "-",
timings.dashboardVerifyMs ?? "-",
timings.healthValidationMs ?? "-",
timings.totalMs ?? "-",
]]),
"",
table(["BUDGET_SOURCE", "SOURCE_SYNC", "SOURCE_FETCH", "VERIFY", "IMAGE", "GITOPS", "ARGO_RUNTIME", "DASHBOARD"], [[
table(["BUDGET_SOURCE", "SOURCE_SYNC", "SOURCE_FETCH", "VERIFY", "IMAGE", "GITOPS", "ARGO_RUNTIME", "VALIDATION"], [[
"YAML publishCurrent",
stageBudgets.sourceSyncSeconds ?? "-",
stageBudgets.sourceFetchSeconds ?? "-",
@@ -3641,16 +3647,15 @@ function renderPublishCurrentResult(result: Record<string, unknown>): string {
}
lines.push(
"",
Object.keys(dashboard).length === 0
? "DASHBOARD_VERIFY\n-"
: table(["URL", "HTTP", "LATEST_RUN", "CHECK_SCOPE", "CHECK_MATCH", "REQ_FAIL", "CONSOLE_ERR"], [[
dashboard.publicUrl ?? "-",
dashboardPage.httpStatus ?? "-",
latestRunCounts.runId ?? "-",
checkScope.scope ?? "-",
checkScope.matchesRunDetail ?? "-",
dashboardPage.requestFailureCount ?? "-",
dashboardPage.consoleErrorCount ?? "-",
Object.keys(health).length === 0
? "HEALTH_VALIDATION\n-"
: table(["ENDPOINT", "HTTP", "OK", "STATUS", "PUBLIC_URL", "INTERNAL_URL"], [[
health.endpoint ?? validationPlan.endpoint ?? "-",
health.httpStatus ?? "-",
healthBody.ok ?? "-",
healthBody.status ?? "-",
health.publicUrl ?? "-",
health.internalUrl ?? "-",
]]),
"",
slowStages.length === 0 ? "SLOW_STAGES\n-" : [
@@ -3665,12 +3670,13 @@ function renderPublishCurrentResult(result: Record<string, unknown>): string {
"NEXT",
` publish-current: ${next.publishCurrent ?? "-"}`,
` status: ${next.controlPlaneStatus ?? "-"}`,
` dashboard: ${next.dashboardVerify ?? "-"}`,
` post-deploy-dashboard: ${next.dashboardVerify ?? "-"}`,
` git-mirror: ${next.gitMirrorStatus ?? "-"}`,
` flush: ${next.gitMirrorFlush ?? "-"}`,
"",
"DISCLOSURE",
` end-to-end and stage budgets are read from ${Object.keys(dashboardPlan).length > 0 ? "publishCurrent YAML" : "YAML-required publishCurrent fields"}.`,
` end-to-end and stage budgets are read from ${Object.keys(validationPlan).length > 0 ? "publishCurrent YAML and runtime.healthPath" : "YAML-required publishCurrent fields"}.`,
" CI/CD validation only checks the configured health endpoint; web-probe, Playwright and browser dashboard checks are post-deploy evidence, not this gate.",
" image build uses Tekton PipelineRun and BuildKit; this command does not require Docker daemon/socket/build.",
);
return lines.join("\n");
+22
View File
@@ -198,6 +198,28 @@ export function runSentinelValidate(state: SentinelCicdState, options: Extract<W
return rendered(ok, command, renderValidateResult(result));
}
/**
 * Probes the sentinel runtime health endpoint for the CI/CD validation gate.
 *
 * The service probe (`callSentinelService`) runs first. Only when it is not ok is the
 * public probe (`probePublicSentinelService`) tried as a fallback. The gate passes only
 * when the selected probe is ok AND its JSON body carries `ok: true`.
 *
 * Both probes share one time budget: the fallback receives only what is left of
 * `timeoutSeconds` after the first probe (floored to whole seconds, at least 1s, at most
 * 20s), so a slow first probe cannot make the validation run for twice the caller's budget.
 *
 * @param state - Sentinel CI/CD state; `runtime.healthPath` supplies the endpoint path.
 * @param timeoutSeconds - Overall budget in seconds for this validation. A non-finite value
 *   falls back to the 20s per-probe cap; values below 1 are raised to 1.
 * @returns A redacted record with the gate verdict (`ok`), the endpoint, the selected probe
 *   (`health`), both raw probe results, and `degradedReason` when the gate did not pass.
 */
export function probeSentinelRuntimeHealthEndpoint(state: SentinelCicdState, timeoutSeconds: number): Record<string, unknown> {
  const maxProbeSeconds = 20;
  const startedAt = Date.now();
  const endpoint = stringAt(state.runtime, "healthPath");
  // Guard against NaN/Infinity so a malformed budget never reaches the probe as a timeout.
  const budgetSeconds = Number.isFinite(timeoutSeconds) ? timeoutSeconds : maxProbeSeconds;
  const serviceProbeTimeoutSeconds = Math.max(1, Math.min(budgetSeconds, maxProbeSeconds));
  const health = callSentinelService(state, "GET", endpoint, null, serviceProbeTimeoutSeconds);
  // Charge the time already spent on the first probe against the shared budget; the fallback
  // keeps a 1s floor so it is still attempted, but can overrun the budget by at most that floor.
  const remainingSeconds = budgetSeconds - (Date.now() - startedAt) / 1000;
  const publicProbeTimeoutSeconds = Math.max(1, Math.floor(Math.min(remainingSeconds, maxProbeSeconds)));
  const publicHealth = health.ok ? null : probePublicSentinelService(state, endpoint, publicProbeTimeoutSeconds);
  // Prefer the service probe; use the public probe only when it succeeded where the service probe failed.
  const effectiveHealth = health.ok ? health : record(publicHealth).ok === true ? record(publicHealth) : health;
  const bodyJson = record(effectiveHealth.bodyJson);
  // A successful probe is not enough: the health payload itself must report ok.
  const ok = effectiveHealth.ok === true && bodyJson.ok === true;
  return {
    ok,
    endpoint,
    health: effectiveHealth,
    internalHealth: health,
    publicHealth,
    httpStatus: effectiveHealth.httpStatus ?? null,
    publicUrl: effectiveHealth.publicUrl ?? null,
    internalUrl: effectiveHealth.internalUrl ?? null,
    degradedReason: ok ? null : "sentinel-runtime-health-endpoint-failed",
    valuesRedacted: true,
  };
}
export function runSentinelReport(state: SentinelCicdState, options: Extract<WebProbeSentinelOptions, { kind: "report" }>): RenderedCliResult {
const command = `web-probe sentinel report ${options.latest ? "--latest " : ""}--view ${options.view}`;
const query = new URLSearchParams({ view: options.view });