feat: surface g14 observability resource snapshot

2026-06-05 02:38:00 +00:00
parent 64c936bf0c
commit 73a4fb57ee
4 changed files with 31 additions and 4 deletions
@@ -60,7 +60,7 @@ CI/CD、GitOps、rollout、artifact 发布、PR 合并后的 DEV/PROD 滚动、P
 - `hwlab g14 control-plane cleanup-released-pvs --lane all [--limit N] [--dry-run|--confirm]` 是 local-path 未自动回收后的补充 retention 入口；只列并删除 `Released`、`local-path`、`Delete`、`claimNamespace=hwlab-ci` 且 claim 名称形如 Tekton 临时 `pvc-*` 的 PV。
 - `hwlab g14 git-mirror status|apply|sync|flush [--dry-run|--confirm]` 是 `devops-infra` git mirror/relay 的受控维护入口：`apply` 渲染并 server-side apply `devops-infra/git-mirror.yaml`，同时删除遗留 `git-mirror-hwlab-sync` CronJob；`sync` 创建一次性 manual Job，把 GitHub allowlist refs 拉入本地 mirror；`flush` 创建一次性 manual Job，把本地 `v0.2-gitops` 快进推回 GitHub。
  `status` 返回 read/write URL、last sync/write/flush、本地 ref、GitHub staging ref 和 pending flush 状态，并在 `cache.summary` 给出 `localV02`、`localGitops`、`githubGitops`、`pendingFlush`、`flushNeeded`、`githubInSync` 和下一条受控 `flushCommand`。confirmed `sync` 和 `flush` 默认创建 `.state/jobs/` 异步 job 并立刻返回可查询状态，只有现场同步调试才显式加 `--wait`；mirror 不设置 CronJob。
- `hwlab g14 observability status|apply|query|targets|boundary|closeout [--lane v02] [--promql <expr>] [--expect-count N] [--expect-value V] [--dry-run|--confirm]` 是 G14 `devops-infra` 共享监控基础设施和 HWLAB v0.2 监控 closeout 的受控入口。`apply` 固定安装 Prometheus Operator `v0.91.0`、Prometheus `v3.12.0`、Prometheus 发现 RBAC、`devops-infra` 内 Prometheus 实例和 ClusterIP query Service，并给被允许发现的 workload namespace 打低风险 label；它不把 Prometheus、Grafana 或 Alertmanager 部署到 `hwlab-v02`，也不接管 HWLAB runtime Deployment/Service。`status` 只读汇总 CRD、operator Deployment、Prometheus CR/pod/service、`hwlab-v02` ServiceMonitor/PrometheusRule 和 bounded `up` 查询；`query` 只通过 Kubernetes service proxy 查询 Prometheus，支持 `--expect-count` / `--expect-value` 输出 `assertion`、bad values 和 missing/extra series；`targets` 汇总 ServiceMonitor/PrometheusRule、metrics sidecar readiness/restart 和三层指标值；`boundary` 验证 workload namespace 没有 Prometheus/Alertmanager，并对 `19666/19667` 公网 `/metrics` 做负向验证；`closeout` 聚合平台 ready、scrape reachable、sidecar serving、business health probe、namespace boundary 和 public metrics exposure 语义结论。长期边界见 `docs/reference/g14-observability-infra.md`。
+- `hwlab g14 observability status|apply|query|targets|boundary|closeout [--lane v02] [--promql <expr>] [--expect-count N] [--expect-value V] [--dry-run|--confirm]` 是 G14 `devops-infra` 共享监控基础设施和 HWLAB v0.2 监控 closeout 的受控入口。`apply` 固定安装 Prometheus Operator `v0.91.0`、Prometheus `v3.12.0`、Prometheus 发现 RBAC、`devops-infra` 内 Prometheus 实例和 ClusterIP query Service，并给被允许发现的 workload namespace 打低风险 label；它不把 Prometheus、Grafana 或 Alertmanager 部署到 `hwlab-v02`，也不接管 HWLAB runtime Deployment/Service。`status` 只读汇总 CRD、operator Deployment、Prometheus CR/pod/service、`hwlab-v02` ServiceMonitor/PrometheusRule 和 bounded `up` 查询；`query` 只通过 Kubernetes service proxy 查询 Prometheus，支持 `--expect-count` / `--expect-value` 输出 `assertion`、bad values 和 missing/extra series；`targets` 汇总 ServiceMonitor/PrometheusRule、metrics sidecar readiness/restart、三层指标值、health probe/scrape duration 摘要和 `metrics.k8s.io` 当前 CPU/内存资源快照；`boundary` 验证 workload namespace 没有 Prometheus/Alertmanager，并对 `19666/19667` 公网 `/metrics` 做负向验证；`closeout` 聚合平台 ready、scrape reachable、sidecar serving、business health probe、resource snapshot、namespace boundary 和 public metrics exposure 语义结论。长期边界见 `docs/reference/g14-observability-infra.md`。
 - `hwlab g14 tools-image status|build --name ci-node-tools --tag <tag> [--dockerfile deploy/ci/hwlab-ci-node-tools.Dockerfile] [--dry-run|--confirm]` 是 G14 固定 HWLAB CI tools image 的受控 host build/push 入口；构建和 push 只发生在 G14 host 与本地 registry，不在 master server 构建，也不把 `apk add`/runtime install 塞进 Tekton PipelineRun。
 - `trans gh:/owner/repo ...` 把 GitHub issue/PR 映射成只读/受控写入的虚拟文本目录，适合日报、PR 正文和 issue 正文的小补丁维护：`trans gh:/pikasTech/HWLAB ls` 展示 `pr/` 与 `issue/`，`trans gh:/pikasTech/HWLAB/pr ls [--limit N] [--full]` 和 `trans gh:/pikasTech/HWLAB/issue ls [--limit N] [--full]` 展示条目状态、楼层数、正文长度和标题，`trans gh:/pikasTech/HWLAB/pr/507 ls` 展示单个 PR 的一楼正文文件，`trans gh:/pikasTech/HWLAB/505/1 cat|rg|patch-apply` 兼容旧式 issue/PR number route。`patch-apply` 使用 UniDesk 默认 apply-patch v2 的虚拟文件 executor，把正文一楼映射为 `body.md`，写回仍走 `bun scripts/cli.ts gh issue/pr update` 的 guard/concurrency 规则；`rm` 对正文一楼结构化拒绝，避免误删 issue/PR 正文。大正文读取必须展开 UniDesk gh dump 文件，否则 `cat/rg/patch-apply` 会误读为空，这是 `gh:` 虚拟文件接口的 P0 可见性契约。
 - `hwlab cd status|audit|preflight|apply --env dev [--dry-run]` 是旧 D601 HWLAB DEV CD 指挥侧 wrapper，仅用于显式 legacy 诊断和迁移对照。默认通过 UniDesk provider `host.ssh` 进入 D601，再调用 HWLAB repo-owned `scripts/dev-cd-apply.mjs`，不内嵌发布 kubectl 逻辑：`status` 汇总固定 CD mirror、Git clean/main/origin-main、`deploy/deploy.json`/artifact catalog/report、D601 native k3s guard 和 CD Lease lock，并用 `scripts/dev-cd-apply.mjs --status --skip-live-verify` 取得 target/promotion 摘要；`audit` 在 k3s/CD 恢复后做只读健康审计，返回有界 JSON 的 blocker 分类、D601 guard/node、SecretRef 存在性、registry 可达性、Lease phase/holder/staleness、deploy.json 与 artifact/workload image 收敛、current Deployment image/revision/rollout、16666/16667 public health commit/readiness 和 DB/runtime durability 摘要；`preflight` 进一步检查必需 SecretRef 对象/键存在性并运行 HWLAB `scripts/dev-cd-apply.mjs --dry-run --skip-live-verify` 受控事务摘要。完整远端 stdout/stderr 写入 D601 `~/.state/unidesk-hwlab-cd/<run-id>/` 和本地 `.state/hwlab-cd/<run-id>/` task dump，stdout 只返回有界摘要。默认 HWLAB CD repo 是 `/home/ubuntu/hwlab_cd`，`/home/ubuntu/hwlab` runner 历史目录不得作为发布真相。wrapper 强制 `KUBECONFIG=/etc/rancher/k3s/k3s.yaml` 并只以这个显式目标作为 gate；显式目标出现 `docker-desktop`、`desktop-control-plane` 或 `127.0.0.1:11700` 信号会结构化拒绝，audit/preflight/apply --dry-run 都必须观察到 node `d601`。真实 apply 只暴露 `scripts/dev-cd-apply.mjs --apply --confirm-dev --confirmed-non-production --write-report` 命令形状并标注 host-commander-only，本 runner 不执行 live apply、rollout、Lease mutation 或 DEV deploy apply。长期规则见 `docs/reference/hwlab.md`。
@@ -103,13 +103,15 @@ A durable closeout must include:
 - `hwlab g14 observability status` or the equivalent controlled infrastructure status showing CRDs, Prometheus Operator and Prometheus Ready in `devops-infra`.
 - Explicit PromQL assertions for the workload namespace, not only the infrastructure status summary. Use `hwlab g14 observability query --promql <expr> --expect-count <N> --expect-value <V>` so the CLI returns `assertion.ok`, actual count, bad values and missing/extra series instead of requiring manual vector inspection.
 - For HWLAB v0.2, the current application-owned PromQL checks are `up{namespace="hwlab-v02"}`, `hwlab_service_up{namespace="hwlab-v02"}` and `hwlab_service_health_probe_success{namespace="hwlab-v02"}`. `up=1` proves Prometheus can scrape the sidecar; it does not prove the sidecar can reach the business health endpoint.
- `hwlab g14 observability targets --lane v02` for the high-level target view: discovered service/pod, metrics sidecar readiness and restart count, selected monitor declarations and the latest `up` / `hwlab_service_up` / `hwlab_service_health_probe_success` values.
+- `hwlab g14 observability targets --lane v02` for the high-level target view: discovered service/pod, metrics sidecar readiness and restart count, selected monitor declarations, the latest `up` / `hwlab_service_up` / `hwlab_service_health_probe_success` values, synthetic health/scrape duration summaries and the current CPU/memory resource snapshot from `metrics.k8s.io`.
 - `hwlab g14 observability boundary --lane v02` for the namespace and public ingress boundary: the workload namespace may contain application `ServiceMonitor` / `PodMonitor` / `PrometheusRule` declarations only, must not contain shared Prometheus or Alertmanager instances, and public `19666/19667` `/metrics` must be denied or non-Prometheus text.
- `hwlab g14 observability closeout --lane v02` as the standard monitoring closeout summary. It should report semantic fields such as `platformReady`, `scrapeReachable`, `sidecarServing`, `businessHealthProbe`, `namespaceControlPlaneBoundary` and `publicMetricsExposure`, plus bounded drill-down evidence and next diagnostic commands on failure. Public `/metrics` denial is represented as `publicMetricsExposure=pass` with `publicMetricsExposureState=denied`.
+- `hwlab g14 observability closeout --lane v02` as the standard monitoring closeout summary. It should report semantic fields such as `platformReady`, `scrapeReachable`, `sidecarServing`, `businessHealthProbe`, `resourceSnapshot`, `namespaceControlPlaneBoundary` and `publicMetricsExposure`, plus bounded drill-down evidence and next diagnostic commands on failure. Public `/metrics` denial is represented as `publicMetricsExposure=pass` with `publicMetricsExposureState=denied`.
 - CI/CD and GitOps provenance when the workload desired state changed. For HWLAB v0.2 this includes the target source commit, PipelineRun, Argo sync revision and git mirror `pendingFlush=false` / `githubInSync=true`.

 Issue comments should lead with the semantic conclusion and then list the commands, result counts and target values. A raw metrics dump or a green `status` command alone is not a closeout, and CI/CD provenance still comes from `hwlab g14 control-plane closeout --lane v02 --source-commit <full-sha>` or the equivalent high-level control-plane entry when runtime desired state changed.

+The current HWLAB v0.2 monitoring surface is intentionally split by source. Prometheus provides sidecar availability, business health probe success/status/duration, scrape duration and sidecar uptime; `metrics.k8s.io` provides current (point-in-time) pod/container CPU and memory snapshots for the same monitored services. Request throughput, error rate, per-route latency percentiles and business-operation latency are application-owned signals: they require HWLAB service instrumentation before Prometheus can answer questions about them.
+
 ## Failure Modes

 The following regressions are common enough to require explicit checks in future monitoring work:
@@ -1,4 +1,4 @@
-import { activeV02PipelineRuns, g14ObservabilityQueryAssertion, gitMirrorFlushJobManifest, gitMirrorStatusSummary, gitMirrorSyncJobManifest, gitMirrorV02SyncRequirement, hwlabG14Help, hwlabG14MonitorStateFileName, parseGitMirrorStatusRefs, parsePipelineTaskRunMetrics, parseV02TriggerSnapshot, rolloutRecordBody, semanticChangelogBullets, summarizeV02CdStatus, v02CloseoutVerdict, v02CommitAlignment, v02ControlPlaneRefreshScriptHash, v02ControlPlaneRenderScript, v02ExistingPipelineRunReuseDecision, v02FalseGreenGuard, v02GitMirrorPreSyncWaitMs, v02LatestOnlyTargetValidation, v02PipelineServiceIds, v02PrAutomationCommentBody, v02ReusableGitMirrorPreSyncMarker, v02ReusableRefreshMarker, v02TaskRunPerformanceSummary } from "./src/hwlab-g14";
+import { activeV02PipelineRuns, g14ObservabilityQueryAssertion, gitMirrorFlushJobManifest, gitMirrorStatusSummary, gitMirrorSyncJobManifest, gitMirrorV02SyncRequirement, hwlabG14Help, hwlabG14MonitorStateFileName, parseGitMirrorStatusRefs, parseK8sCpuMillicores, parseK8sMemoryMiB, parsePipelineTaskRunMetrics, parseV02TriggerSnapshot, rolloutRecordBody, semanticChangelogBullets, summarizeV02CdStatus, v02CloseoutVerdict, v02CommitAlignment, v02ControlPlaneRefreshScriptHash, v02ControlPlaneRenderScript, v02ExistingPipelineRunReuseDecision, v02FalseGreenGuard, v02GitMirrorPreSyncWaitMs, v02LatestOnlyTargetValidation, v02PipelineServiceIds, v02PrAutomationCommentBody, v02ReusableGitMirrorPreSyncMarker, v02ReusableRefreshMarker, v02TaskRunPerformanceSummary } from "./src/hwlab-g14";
 import { runCommand } from "./src/command";

 function assertCondition(condition: unknown, message: string, detail: unknown = {}): void {
@@ -114,6 +114,20 @@ assertCondition(
  "observability CLI must fail visibly on unsupported options instead of silently ignoring friction-prone flags",
  unsupportedObservabilityJson,
 );
+assertCondition(
+  parseK8sCpuMillicores("46095136n") !== null
+    && Math.abs((parseK8sCpuMillicores("46095136n") ?? 0) - 46.095136) < 0.000001
+    && parseK8sCpuMillicores("47m") === 47
+    && parseK8sCpuMillicores("1") === 1000,
+  "observability resource snapshot must convert metrics.k8s.io CPU quantities to millicores",
+);
+assertCondition(
+  parseK8sMemoryMiB("99860Ki") !== null
+    && Math.abs((parseK8sMemoryMiB("99860Ki") ?? 0) - 97.51953125) < 0.000001
+    && parseK8sMemoryMiB("97Mi") === 97
+    && parseK8sMemoryMiB("1048576") === 1,
+  "observability resource snapshot must convert metrics.k8s.io memory quantities to MiB",
+);

 const v02CommentBody = v02PrAutomationCommentBody({
  pr: {
@@ -741,6 +755,8 @@ console.log(JSON.stringify({
    "observability help exposes assertion, target, boundary, and closeout entrypoints",
    "observability query assertions report count and terminal value pass/fail",
    "observability CLI rejects unsupported options with visible JSON errors",
+    "observability resource snapshot converts metrics.k8s.io CPU quantities to millicores",
+    "observability resource snapshot converts metrics.k8s.io memory quantities to MiB",
    "git mirror sync is a manual devops-infra Job, not a CronJob",
    "git mirror flush is a manual devops-infra Job, not a CronJob",
    "trigger-current can decide whether v0.2 git mirror pre-sync is required",
@@ -5681,6 +5681,7 @@ function closeoutAdvice(summary: Record<string, unknown>): string[] {
  if (summary.scrapeReachable !== "pass") advice.push("scrapeReachable failed -> check ServiceMonitor labels, metrics sidecar port name, and Prometheus target discovery");
  if (summary.sidecarServing !== "pass") advice.push("sidecarServing failed -> check hwlab-metrics sidecar readiness, restartCount, and metrics script/container logs");
  if (summary.businessHealthProbe !== "pass") advice.push("businessHealthProbe failed -> up=1 but health_probe=0 usually means sidecar can be scraped but cannot reach the business health endpoint");
+  if (summary.resourceSnapshot !== "pass") advice.push("resourceSnapshot failed -> check metrics.k8s.io APIService and metrics-server availability on G14 k3s");
  if (summary.namespaceControlPlaneBoundary !== "pass") advice.push("namespaceControlPlaneBoundary failed -> remove Prometheus/Alertmanager from workload namespace; shared control plane belongs in devops-infra");
  if (summary.publicMetricsExposure !== "pass") advice.push("publicMetricsExposure failed -> public /metrics returned Prometheus text; remove FRP/edge exposure or add an authenticated internal-only route");
  return advice;
@@ -5697,6 +5698,7 @@ function runG14ObservabilityCloseout(options: G14ObservabilityOptions): Record<s
  const platformReady = record(status.crds).ok === true && record(status.operator).ok === true && record(status.prometheus).ok === true && record(status.query).ok === true;
  const namespaceBoundaryOk = record(record(boundary.namespaceBoundary)).ok === true && record(record(boundary.infraControlPlane)).ok === true;
  const publicDenied = record(record(boundary.publicMetricsExposure)).ok === true;
+  const resourceOk = record(record(targets.resourceSnapshot)).ok === true;
  const summary = {
    platformReady: passFail(platformReady),
    workloadMonitorCount: numericValue(record(status.workloadMonitors).count) ?? numericValue(record(targets.monitors).count) ?? 0,
@@ -5705,6 +5707,7 @@ function runG14ObservabilityCloseout(options: G14ObservabilityOptions): Record<s
    sidecarServing: passFail(queryOk("sidecarServing") && sidecarsOk),
    businessHealthProbe: passFail(queryOk("businessHealthProbe")),
    sidecarReady: passFail(sidecarsOk),
+    resourceSnapshot: passFail(resourceOk),
    namespaceControlPlaneBoundary: passFail(namespaceBoundaryOk),
    publicMetricsExposure: passFail(publicDenied),
    publicMetricsExposureState: publicDenied ? "denied" : "exposed-or-unknown",
@@ -5739,12 +5742,18 @@ function runG14ObservabilityCloseout(options: G14ObservabilityOptions): Record<s
        readySidecarCount: record(targets.sidecars).readyCount ?? null,
        healthProbeDuration: record(record(targets.levelSummary).healthProbeDuration),
        scrapeDuration: record(record(targets.levelSummary).scrapeDuration),
+        resourceSnapshot: record(targets.resourceSnapshot),
+        resourceUsage: record(record(targets.levelSummary).resourceUsage),
        services: arrayRecords(record(targets.levelSummary).services).map((service) => ({
          serviceId: service.serviceId ?? null,
          scrapeReachable: service.scrapeReachable ?? null,
          sidecarServing: service.sidecarServing ?? null,
          businessHealthProbe: service.businessHealthProbe ?? null,
          statusCode: service.statusCode ?? null,
+          totalCpuMillicores: service.totalCpuMillicores ?? null,
+          totalMemoryMiB: service.totalMemoryMiB ?? null,
+          businessCpuMillicores: service.businessCpuMillicores ?? null,
+          businessMemoryMiB: service.businessMemoryMiB ?? null,
        })),
      },
      boundary: {