feat: surface g14 observability resource snapshot
This commit is contained in:
@@ -60,7 +60,7 @@ CI/CD、GitOps、rollout、artifact 发布、PR 合并后的 DEV/PROD 滚动、P
|
||||
- `hwlab g14 control-plane cleanup-released-pvs --lane all [--limit N] [--dry-run|--confirm]` 是 local-path 未自动回收后的补充 retention 入口;只列并删除 `Released`、`local-path`、`Delete`、`claimNamespace=hwlab-ci` 且 claim 名称形如 Tekton 临时 `pvc-*` 的 PV。
|
||||
- `hwlab g14 git-mirror status|apply|sync|flush [--dry-run|--confirm]` 是 `devops-infra` git mirror/relay 的受控维护入口:`apply` 渲染并 server-side apply `devops-infra/git-mirror.yaml`,同时删除遗留 `git-mirror-hwlab-sync` CronJob;`sync` 创建一次性 manual Job,把 GitHub allowlist refs 拉入本地 mirror;`flush` 创建一次性 manual Job,把本地 `v0.2-gitops` 快进推回 GitHub。
|
||||
`status` 返回 read/write URL、last sync/write/flush、本地 ref、GitHub staging ref 和 pending flush 状态,并在 `cache.summary` 给出 `localV02`、`localGitops`、`githubGitops`、`pendingFlush`、`flushNeeded`、`githubInSync` 和下一条受控 `flushCommand`。confirmed `sync` 和 `flush` 默认创建 `.state/jobs/` 异步 job 并立刻返回可查询状态,只有现场同步调试才显式加 `--wait`;mirror 不设置 CronJob。
|
||||
- `hwlab g14 observability status|apply|query|targets|boundary|closeout [--lane v02] [--promql <expr>] [--expect-count N] [--expect-value V] [--dry-run|--confirm]` 是 G14 `devops-infra` 共享监控基础设施和 HWLAB v0.2 监控 closeout 的受控入口。`apply` 固定安装 Prometheus Operator `v0.91.0`、Prometheus `v3.12.0`、Prometheus 发现 RBAC、`devops-infra` 内 Prometheus 实例和 ClusterIP query Service,并给被允许发现的 workload namespace 打低风险 label;它不把 Prometheus、Grafana 或 Alertmanager 部署到 `hwlab-v02`,也不接管 HWLAB runtime Deployment/Service。`status` 只读汇总 CRD、operator Deployment、Prometheus CR/pod/service、`hwlab-v02` ServiceMonitor/PrometheusRule 和 bounded `up` 查询;`query` 只通过 Kubernetes service proxy 查询 Prometheus,支持 `--expect-count` / `--expect-value` 输出 `assertion`、bad values 和 missing/extra series;`targets` 汇总 ServiceMonitor/PrometheusRule、metrics sidecar readiness/restart 和三层指标值;`boundary` 验证 workload namespace 没有 Prometheus/Alertmanager,并对 `19666/19667` 公网 `/metrics` 做负向验证;`closeout` 聚合平台 ready、scrape reachable、sidecar serving、business health probe、namespace boundary 和 public metrics exposure 语义结论。长期边界见 `docs/reference/g14-observability-infra.md`。
|
||||
- `hwlab g14 observability status|apply|query|targets|boundary|closeout [--lane v02] [--promql <expr>] [--expect-count N] [--expect-value V] [--dry-run|--confirm]` 是 G14 `devops-infra` 共享监控基础设施和 HWLAB v0.2 监控 closeout 的受控入口。`apply` 固定安装 Prometheus Operator `v0.91.0`、Prometheus `v3.12.0`、Prometheus 发现 RBAC、`devops-infra` 内 Prometheus 实例和 ClusterIP query Service,并给被允许发现的 workload namespace 打低风险 label;它不把 Prometheus、Grafana 或 Alertmanager 部署到 `hwlab-v02`,也不接管 HWLAB runtime Deployment/Service。`status` 只读汇总 CRD、operator Deployment、Prometheus CR/pod/service、`hwlab-v02` ServiceMonitor/PrometheusRule 和 bounded `up` 查询;`query` 只通过 Kubernetes service proxy 查询 Prometheus,支持 `--expect-count` / `--expect-value` 输出 `assertion`、bad values 和 missing/extra series;`targets` 汇总 ServiceMonitor/PrometheusRule、metrics sidecar readiness/restart、三层指标值和 `metrics.k8s.io` 当前 CPU/内存资源快照;`boundary` 验证 workload namespace 没有 Prometheus/Alertmanager,并对 `19666/19667` 公网 `/metrics` 做负向验证;`closeout` 聚合平台 ready、scrape reachable、sidecar serving、business health probe、resource snapshot、namespace boundary 和 public metrics exposure 语义结论。长期边界见 `docs/reference/g14-observability-infra.md`。
|
||||
- `hwlab g14 tools-image status|build --name ci-node-tools --tag <tag> [--dockerfile deploy/ci/hwlab-ci-node-tools.Dockerfile] [--dry-run|--confirm]` 是 G14 固定 HWLAB CI tools image 的受控 host build/push 入口;构建和 push 只发生在 G14 host 与本地 registry,不在 master server 构建,也不把 `apk add`/runtime install 塞进 Tekton PipelineRun。
|
||||
- `trans gh:/owner/repo ...` 把 GitHub issue/PR 映射成只读/受控写入的虚拟文本目录,适合日报、PR 正文和 issue 正文的小补丁维护:`trans gh:/pikasTech/HWLAB ls` 展示 `pr/` 与 `issue/`,`trans gh:/pikasTech/HWLAB/pr ls [--limit N] [--full]` 和 `trans gh:/pikasTech/HWLAB/issue ls [--limit N] [--full]` 展示条目状态、楼层数、正文长度和标题,`trans gh:/pikasTech/HWLAB/pr/507 ls` 展示单个 PR 的一楼正文文件,`trans gh:/pikasTech/HWLAB/505/1 cat|rg|patch-apply` 兼容旧式 issue/PR number route。`patch-apply` 使用 UniDesk 默认 apply-patch v2 的虚拟文件 executor,把正文一楼映射为 `body.md`,写回仍走 `bun scripts/cli.ts gh issue/pr update` 的 guard/concurrency 规则;`rm` 对正文一楼结构化拒绝,避免误删 issue/PR 正文。大正文读取必须展开 UniDesk gh dump 文件,否则 `cat/rg/patch-apply` 会误读为空,这是 `gh:` 虚拟文件接口的 P0 可见性契约。
|
||||
- `hwlab cd status|audit|preflight|apply --env dev [--dry-run]` 是旧 D601 HWLAB DEV CD 指挥侧 wrapper,仅用于显式 legacy 诊断和迁移对照。默认通过 UniDesk provider `host.ssh` 进入 D601,再调用 HWLAB repo-owned `scripts/dev-cd-apply.mjs`,不内嵌发布 kubectl 逻辑:`status` 汇总固定 CD mirror、Git clean/main/origin-main、`deploy/deploy.json`/artifact catalog/report、D601 native k3s guard 和 CD Lease lock,并用 `scripts/dev-cd-apply.mjs --status --skip-live-verify` 取得 target/promotion 摘要;`audit` 在 k3s/CD 恢复后做只读健康审计,返回有界 JSON 的 blocker 分类、D601 guard/node、SecretRef 存在性、registry 可达性、Lease phase/holder/staleness、deploy.json 与 artifact/workload image 收敛、current Deployment image/revision/rollout、16666/16667 public health commit/readiness 和 DB/runtime durability 摘要;`preflight` 进一步检查必需 SecretRef 对象/键存在性并运行 HWLAB `scripts/dev-cd-apply.mjs --dry-run --skip-live-verify` 受控事务摘要。完整远端 stdout/stderr 写入 D601 `~/.state/unidesk-hwlab-cd/<run-id>/` 和本地 `.state/hwlab-cd/<run-id>/` task dump,stdout 只返回有界摘要。默认 HWLAB CD repo 是 `/home/ubuntu/hwlab_cd`,`/home/ubuntu/hwlab` runner 历史目录不得作为发布真相。wrapper 强制 `KUBECONFIG=/etc/rancher/k3s/k3s.yaml` 并只以这个显式目标作为 gate;显式目标出现 `docker-desktop`、`desktop-control-plane` 或 `127.0.0.1:11700` 信号会结构化拒绝,audit/preflight/apply --dry-run 都必须观察到 node `d601`。真实 apply 只暴露 `scripts/dev-cd-apply.mjs --apply --confirm-dev --confirmed-non-production --write-report` 命令形状并标注 host-commander-only,本 runner 不执行 live apply、rollout、Lease mutation 或 DEV deploy apply。长期规则见 `docs/reference/hwlab.md`。
|
||||
|
||||
@@ -103,13 +103,15 @@ A durable closeout must include:
|
||||
- `hwlab g14 observability status` or the equivalent controlled infrastructure status showing CRDs, Prometheus Operator and Prometheus Ready in `devops-infra`.
|
||||
- Explicit PromQL assertions for the workload namespace, not only the infrastructure status summary. Use `hwlab g14 observability query --promql <expr> --expect-count <N> --expect-value <V>` so the CLI returns `assertion.ok`, actual count, bad values and missing/extra series instead of requiring manual vector inspection.
|
||||
- For HWLAB v0.2, the current application-owned PromQL checks are `up{namespace="hwlab-v02"}`, `hwlab_service_up{namespace="hwlab-v02"}` and `hwlab_service_health_probe_success{namespace="hwlab-v02"}`. `up=1` proves Prometheus can scrape the sidecar; it does not prove the sidecar can reach the business health endpoint.
|
||||
- `hwlab g14 observability targets --lane v02` for the high-level target view: discovered service/pod, metrics sidecar readiness and restart count, selected monitor declarations and the latest `up` / `hwlab_service_up` / `hwlab_service_health_probe_success` values.
|
||||
- `hwlab g14 observability targets --lane v02` for the high-level target view: discovered service/pod, metrics sidecar readiness and restart count, selected monitor declarations, the latest `up` / `hwlab_service_up` / `hwlab_service_health_probe_success` values, synthetic health/scrape duration summaries and the current CPU/memory resource snapshot from `metrics.k8s.io`.
|
||||
- `hwlab g14 observability boundary --lane v02` for the namespace and public ingress boundary: the workload namespace may contain application `ServiceMonitor` / `PodMonitor` / `PrometheusRule` declarations only, must not contain shared Prometheus or Alertmanager instances, and public `19666/19667` `/metrics` must be denied or must return non-Prometheus text.
|
||||
- `hwlab g14 observability closeout --lane v02` as the standard monitoring closeout summary. It should report semantic fields such as `platformReady`, `scrapeReachable`, `sidecarServing`, `businessHealthProbe`, `namespaceControlPlaneBoundary` and `publicMetricsExposure`, plus bounded drill-down evidence and next diagnostic commands on failure. Public `/metrics` denial is represented as `publicMetricsExposure=pass` with `publicMetricsExposureState=denied`.
|
||||
- `hwlab g14 observability closeout --lane v02` as the standard monitoring closeout summary. It should report semantic fields such as `platformReady`, `scrapeReachable`, `sidecarServing`, `businessHealthProbe`, `resourceSnapshot`, `namespaceControlPlaneBoundary` and `publicMetricsExposure`, plus bounded drill-down evidence and next diagnostic commands on failure. Public `/metrics` denial is represented as `publicMetricsExposure=pass` with `publicMetricsExposureState=denied`.
|
||||
- CI/CD and GitOps provenance when the workload desired state changed. For HWLAB v0.2 this includes the target source commit, PipelineRun, Argo sync revision and git mirror `pendingFlush=false` / `githubInSync=true`.
|
||||
|
||||
Issue comments should lead with the semantic conclusion and then list the commands, result counts and target values. A raw metrics dump or a green `status` command alone is not a closeout. When runtime desired state changed, CI/CD provenance still comes from `hwlab g14 control-plane closeout --lane v02 --source-commit <full-sha>` or the equivalent high-level control-plane entry.
|
||||
|
||||
The current HWLAB v0.2 monitoring surface is intentionally split by source. Prometheus provides sidecar availability, business health probe success/status/duration, scrape duration and sidecar uptime; `metrics.k8s.io` provides current pod/container CPU and memory snapshots for the same monitored services. Request throughput, error rate, per-route latency percentiles and business-operation latency are application-owned signals and require HWLAB service instrumentation before Prometheus can answer them.
|
||||
|
||||
## Failure Modes
|
||||
|
||||
The following regressions are common enough to require explicit checks in future monitoring work:
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
import { activeV02PipelineRuns, g14ObservabilityQueryAssertion, gitMirrorFlushJobManifest, gitMirrorStatusSummary, gitMirrorSyncJobManifest, gitMirrorV02SyncRequirement, hwlabG14Help, hwlabG14MonitorStateFileName, parseGitMirrorStatusRefs, parsePipelineTaskRunMetrics, parseV02TriggerSnapshot, rolloutRecordBody, semanticChangelogBullets, summarizeV02CdStatus, v02CloseoutVerdict, v02CommitAlignment, v02ControlPlaneRefreshScriptHash, v02ControlPlaneRenderScript, v02ExistingPipelineRunReuseDecision, v02FalseGreenGuard, v02GitMirrorPreSyncWaitMs, v02LatestOnlyTargetValidation, v02PipelineServiceIds, v02PrAutomationCommentBody, v02ReusableGitMirrorPreSyncMarker, v02ReusableRefreshMarker, v02TaskRunPerformanceSummary } from "./src/hwlab-g14";
|
||||
import { activeV02PipelineRuns, g14ObservabilityQueryAssertion, gitMirrorFlushJobManifest, gitMirrorStatusSummary, gitMirrorSyncJobManifest, gitMirrorV02SyncRequirement, hwlabG14Help, hwlabG14MonitorStateFileName, parseGitMirrorStatusRefs, parseK8sCpuMillicores, parseK8sMemoryMiB, parsePipelineTaskRunMetrics, parseV02TriggerSnapshot, rolloutRecordBody, semanticChangelogBullets, summarizeV02CdStatus, v02CloseoutVerdict, v02CommitAlignment, v02ControlPlaneRefreshScriptHash, v02ControlPlaneRenderScript, v02ExistingPipelineRunReuseDecision, v02FalseGreenGuard, v02GitMirrorPreSyncWaitMs, v02LatestOnlyTargetValidation, v02PipelineServiceIds, v02PrAutomationCommentBody, v02ReusableGitMirrorPreSyncMarker, v02ReusableRefreshMarker, v02TaskRunPerformanceSummary } from "./src/hwlab-g14";
|
||||
import { runCommand } from "./src/command";
|
||||
|
||||
function assertCondition(condition: unknown, message: string, detail: unknown = {}): void {
|
||||
@@ -114,6 +114,20 @@ assertCondition(
|
||||
"observability CLI must fail visibly on unsupported options instead of silently ignoring friction-prone flags",
|
||||
unsupportedObservabilityJson,
|
||||
);
|
||||
// Checks parseK8sCpuMillicores against three quantity spellings:
//   - an `n`-suffixed value ("46095136n"), expected to be non-null and within
//     1e-6 of 46.095136;
//   - an `m`-suffixed value ("47m"), expected to equal 47 exactly;
//   - a bare number ("1"), expected to equal 1000 exactly.
// The `?? 0` fallback only keeps Math.abs well-typed; the preceding `!== null`
// check already fails the assertion when the parser returns null.
assertCondition(
|
||||
parseK8sCpuMillicores("46095136n") !== null
|
||||
&& Math.abs((parseK8sCpuMillicores("46095136n") ?? 0) - 46.095136) < 0.000001
|
||||
&& parseK8sCpuMillicores("47m") === 47
|
||||
&& parseK8sCpuMillicores("1") === 1000,
|
||||
"observability resource snapshot must convert metrics.k8s.io CPU quantities to millicores",
|
||||
);
|
||||
// Checks parseK8sMemoryMiB against three quantity spellings:
//   - a `Ki`-suffixed value ("99860Ki"), expected to be non-null and within
//     1e-6 of 97.51953125 (99860 / 1024);
//   - a `Mi`-suffixed value ("97Mi"), expected to equal 97 exactly;
//   - a bare number ("1048576"), expected to equal 1 exactly.
// The `?? 0` fallback only keeps Math.abs well-typed; the preceding `!== null`
// check already fails the assertion when the parser returns null.
assertCondition(
|
||||
parseK8sMemoryMiB("99860Ki") !== null
|
||||
&& Math.abs((parseK8sMemoryMiB("99860Ki") ?? 0) - 97.51953125) < 0.000001
|
||||
&& parseK8sMemoryMiB("97Mi") === 97
|
||||
&& parseK8sMemoryMiB("1048576") === 1,
|
||||
"observability resource snapshot must convert metrics.k8s.io memory quantities to MiB",
|
||||
);
|
||||
|
||||
const v02CommentBody = v02PrAutomationCommentBody({
|
||||
pr: {
|
||||
@@ -741,6 +755,8 @@ console.log(JSON.stringify({
|
||||
"observability help exposes assertion, target, boundary, and closeout entrypoints",
|
||||
"observability query assertions report count and terminal value pass/fail",
|
||||
"observability CLI rejects unsupported options with visible JSON errors",
|
||||
"observability resource snapshot converts metrics.k8s.io CPU quantities to millicores",
|
||||
"observability resource snapshot converts metrics.k8s.io memory quantities to MiB",
|
||||
"git mirror sync is a manual devops-infra Job, not a CronJob",
|
||||
"git mirror flush is a manual devops-infra Job, not a CronJob",
|
||||
"trigger-current can decide whether v0.2 git mirror pre-sync is required",
|
||||
|
||||
@@ -5681,6 +5681,7 @@ function closeoutAdvice(summary: Record<string, unknown>): string[] {
|
||||
if (summary.scrapeReachable !== "pass") advice.push("scrapeReachable failed -> check ServiceMonitor labels, metrics sidecar port name, and Prometheus target discovery");
|
||||
if (summary.sidecarServing !== "pass") advice.push("sidecarServing failed -> check hwlab-metrics sidecar readiness, restartCount, and metrics script/container logs");
|
||||
if (summary.businessHealthProbe !== "pass") advice.push("businessHealthProbe failed -> up=1 but health_probe=0 usually means sidecar can be scraped but cannot reach the business health endpoint");
|
||||
if (summary.resourceSnapshot !== "pass") advice.push("resourceSnapshot failed -> check metrics.k8s.io APIService and metrics-server availability on G14 k3s");
|
||||
if (summary.namespaceControlPlaneBoundary !== "pass") advice.push("namespaceControlPlaneBoundary failed -> remove Prometheus/Alertmanager from workload namespace; shared control plane belongs in devops-infra");
|
||||
if (summary.publicMetricsExposure !== "pass") advice.push("publicMetricsExposure failed -> public /metrics returned Prometheus text; remove FRP/edge exposure or add an authenticated internal-only route");
|
||||
return advice;
|
||||
@@ -5697,6 +5698,7 @@ function runG14ObservabilityCloseout(options: G14ObservabilityOptions): Record<s
|
||||
const platformReady = record(status.crds).ok === true && record(status.operator).ok === true && record(status.prometheus).ok === true && record(status.query).ok === true;
|
||||
const namespaceBoundaryOk = record(record(boundary.namespaceBoundary)).ok === true && record(record(boundary.infraControlPlane)).ok === true;
|
||||
const publicDenied = record(record(boundary.publicMetricsExposure)).ok === true;
|
||||
const resourceOk = record(record(targets.resourceSnapshot)).ok === true;
|
||||
const summary = {
|
||||
platformReady: passFail(platformReady),
|
||||
workloadMonitorCount: numericValue(record(status.workloadMonitors).count) ?? numericValue(record(targets.monitors).count) ?? 0,
|
||||
@@ -5705,6 +5707,7 @@ function runG14ObservabilityCloseout(options: G14ObservabilityOptions): Record<s
|
||||
sidecarServing: passFail(queryOk("sidecarServing") && sidecarsOk),
|
||||
businessHealthProbe: passFail(queryOk("businessHealthProbe")),
|
||||
sidecarReady: passFail(sidecarsOk),
|
||||
resourceSnapshot: passFail(resourceOk),
|
||||
namespaceControlPlaneBoundary: passFail(namespaceBoundaryOk),
|
||||
publicMetricsExposure: passFail(publicDenied),
|
||||
publicMetricsExposureState: publicDenied ? "denied" : "exposed-or-unknown",
|
||||
@@ -5739,12 +5742,18 @@ function runG14ObservabilityCloseout(options: G14ObservabilityOptions): Record<s
|
||||
readySidecarCount: record(targets.sidecars).readyCount ?? null,
|
||||
healthProbeDuration: record(record(targets.levelSummary).healthProbeDuration),
|
||||
scrapeDuration: record(record(targets.levelSummary).scrapeDuration),
|
||||
resourceSnapshot: record(targets.resourceSnapshot),
|
||||
resourceUsage: record(record(targets.levelSummary).resourceUsage),
|
||||
services: arrayRecords(record(targets.levelSummary).services).map((service) => ({
|
||||
serviceId: service.serviceId ?? null,
|
||||
scrapeReachable: service.scrapeReachable ?? null,
|
||||
sidecarServing: service.sidecarServing ?? null,
|
||||
businessHealthProbe: service.businessHealthProbe ?? null,
|
||||
statusCode: service.statusCode ?? null,
|
||||
totalCpuMillicores: service.totalCpuMillicores ?? null,
|
||||
totalMemoryMiB: service.totalMemoryMiB ?? null,
|
||||
businessCpuMillicores: service.businessCpuMillicores ?? null,
|
||||
businessMemoryMiB: service.businessMemoryMiB ?? null,
|
||||
})),
|
||||
},
|
||||
boundary: {
|
||||
|
||||
Reference in New Issue
Block a user