diff --git a/docs/reference/cli.md b/docs/reference/cli.md index 6210dbcc..10103586 100644 --- a/docs/reference/cli.md +++ b/docs/reference/cli.md @@ -60,7 +60,7 @@ CI/CD、GitOps、rollout、artifact 发布、PR 合并后的 DEV/PROD 滚动、P - `hwlab g14 control-plane cleanup-released-pvs --lane all [--limit N] [--dry-run|--confirm]` 是 local-path 未自动回收后的补充 retention 入口;只列并删除 `Released`、`local-path`、`Delete`、`claimNamespace=hwlab-ci` 且 claim 名称形如 Tekton 临时 `pvc-*` 的 PV。 - `hwlab g14 git-mirror status|apply|sync|flush [--dry-run|--confirm]` 是 `devops-infra` git mirror/relay 的受控维护入口:`apply` 渲染并 server-side apply `devops-infra/git-mirror.yaml`,同时删除遗留 `git-mirror-hwlab-sync` CronJob;`sync` 创建一次性 manual Job,把 GitHub allowlist refs 拉入本地 mirror;`flush` 创建一次性 manual Job,把本地 `v0.2-gitops` 快进推回 GitHub。 `status` 返回 read/write URL、last sync/write/flush、本地 ref、GitHub staging ref 和 pending flush 状态,并在 `cache.summary` 给出 `localV02`、`localGitops`、`githubGitops`、`pendingFlush`、`flushNeeded`、`githubInSync` 和下一条受控 `flushCommand`。confirmed `sync` 和 `flush` 默认创建 `.state/jobs/` 异步 job 并立刻返回可查询状态,只有现场同步调试才显式加 `--wait`;mirror 不设置 CronJob。 -- `hwlab g14 observability status|apply|query|targets|boundary|closeout [--lane v02] [--promql ] [--expect-count N] [--expect-value V] [--dry-run|--confirm]` 是 G14 `devops-infra` 共享监控基础设施和 HWLAB v0.2 监控 closeout 的受控入口。`apply` 固定安装 Prometheus Operator `v0.91.0`、Prometheus `v3.12.0`、Prometheus 发现 RBAC、`devops-infra` 内 Prometheus 实例和 ClusterIP query Service,并给被允许发现的 workload namespace 打低风险 label;它不把 Prometheus、Grafana 或 Alertmanager 部署到 `hwlab-v02`,也不接管 HWLAB runtime Deployment/Service。`status` 只读汇总 CRD、operator Deployment、Prometheus CR/pod/service、`hwlab-v02` ServiceMonitor/PrometheusRule 和 bounded `up` 查询;`query` 只通过 Kubernetes service proxy 查询 Prometheus,支持 `--expect-count` / `--expect-value` 输出 `assertion`、bad values 和 missing/extra series;`targets` 汇总 ServiceMonitor/PrometheusRule、metrics sidecar readiness/restart 和三层指标值;`boundary` 验证 workload namespace 没有 
Prometheus/Alertmanager,并对 `19666/19667` 公网 `/metrics` 做负向验证;`closeout` 聚合平台 ready、scrape reachable、sidecar serving、business health probe、namespace boundary 和 public metrics exposure 语义结论。长期边界见 `docs/reference/g14-observability-infra.md`。 +- `hwlab g14 observability status|apply|query|targets|boundary|closeout [--lane v02] [--promql ] [--expect-count N] [--expect-value V] [--dry-run|--confirm]` 是 G14 `devops-infra` 共享监控基础设施和 HWLAB v0.2 监控 closeout 的受控入口。`apply` 固定安装 Prometheus Operator `v0.91.0`、Prometheus `v3.12.0`、Prometheus 发现 RBAC、`devops-infra` 内 Prometheus 实例和 ClusterIP query Service,并给被允许发现的 workload namespace 打低风险 label;它不把 Prometheus、Grafana 或 Alertmanager 部署到 `hwlab-v02`,也不接管 HWLAB runtime Deployment/Service。`status` 只读汇总 CRD、operator Deployment、Prometheus CR/pod/service、`hwlab-v02` ServiceMonitor/PrometheusRule 和 bounded `up` 查询;`query` 只通过 Kubernetes service proxy 查询 Prometheus,支持 `--expect-count` / `--expect-value` 输出 `assertion`、bad values 和 missing/extra series;`targets` 汇总 ServiceMonitor/PrometheusRule、metrics sidecar readiness/restart、三层指标值和 `metrics.k8s.io` 当前 CPU/内存资源快照;`boundary` 验证 workload namespace 没有 Prometheus/Alertmanager,并对 `19666/19667` 公网 `/metrics` 做负向验证;`closeout` 聚合平台 ready、scrape reachable、sidecar serving、business health probe、resource snapshot、namespace boundary 和 public metrics exposure 语义结论。长期边界见 `docs/reference/g14-observability-infra.md`。 - `hwlab g14 tools-image status|build --name ci-node-tools --tag [--dockerfile deploy/ci/hwlab-ci-node-tools.Dockerfile] [--dry-run|--confirm]` 是 G14 固定 HWLAB CI tools image 的受控 host build/push 入口;构建和 push 只发生在 G14 host 与本地 registry,不在 master server 构建,也不把 `apk add`/runtime install 塞进 Tekton PipelineRun。 - `trans gh:/owner/repo ...` 把 GitHub issue/PR 映射成只读/受控写入的虚拟文本目录,适合日报、PR 正文和 issue 正文的小补丁维护:`trans gh:/pikasTech/HWLAB ls` 展示 `pr/` 与 `issue/`,`trans gh:/pikasTech/HWLAB/pr ls [--limit N] [--full]` 和 `trans gh:/pikasTech/HWLAB/issue ls [--limit N] [--full]` 展示条目状态、楼层数、正文长度和标题,`trans 
gh:/pikasTech/HWLAB/pr/507 ls` 展示单个 PR 的一楼正文文件,`trans gh:/pikasTech/HWLAB/505/1 cat|rg|patch-apply` 兼容旧式 issue/PR number route。`patch-apply` 使用 UniDesk 默认 apply-patch v2 的虚拟文件 executor,把正文一楼映射为 `body.md`,写回仍走 `bun scripts/cli.ts gh issue/pr update` 的 guard/concurrency 规则;`rm` 对正文一楼结构化拒绝,避免误删 issue/PR 正文。大正文读取必须展开 UniDesk gh dump 文件,否则 `cat/rg/patch-apply` 会误读为空,这是 `gh:` 虚拟文件接口的 P0 可见性契约。 - `hwlab cd status|audit|preflight|apply --env dev [--dry-run]` 是旧 D601 HWLAB DEV CD 指挥侧 wrapper,仅用于显式 legacy 诊断和迁移对照。默认通过 UniDesk provider `host.ssh` 进入 D601,再调用 HWLAB repo-owned `scripts/dev-cd-apply.mjs`,不内嵌发布 kubectl 逻辑:`status` 汇总固定 CD mirror、Git clean/main/origin-main、`deploy/deploy.json`/artifact catalog/report、D601 native k3s guard 和 CD Lease lock,并用 `scripts/dev-cd-apply.mjs --status --skip-live-verify` 取得 target/promotion 摘要;`audit` 在 k3s/CD 恢复后做只读健康审计,返回有界 JSON 的 blocker 分类、D601 guard/node、SecretRef 存在性、registry 可达性、Lease phase/holder/staleness、deploy.json 与 artifact/workload image 收敛、current Deployment image/revision/rollout、16666/16667 public health commit/readiness 和 DB/runtime durability 摘要;`preflight` 进一步检查必需 SecretRef 对象/键存在性并运行 HWLAB `scripts/dev-cd-apply.mjs --dry-run --skip-live-verify` 受控事务摘要。完整远端 stdout/stderr 写入 D601 `~/.state/unidesk-hwlab-cd//` 和本地 `.state/hwlab-cd//` task dump,stdout 只返回有界摘要。默认 HWLAB CD repo 是 `/home/ubuntu/hwlab_cd`,`/home/ubuntu/hwlab` runner 历史目录不得作为发布真相。wrapper 强制 `KUBECONFIG=/etc/rancher/k3s/k3s.yaml` 并只以这个显式目标作为 gate;显式目标出现 `docker-desktop`、`desktop-control-plane` 或 `127.0.0.1:11700` 信号会结构化拒绝,audit/preflight/apply --dry-run 都必须观察到 node `d601`。真实 apply 只暴露 `scripts/dev-cd-apply.mjs --apply --confirm-dev --confirmed-non-production --write-report` 命令形状并标注 host-commander-only,本 runner 不执行 live apply、rollout、Lease mutation 或 DEV deploy apply。长期规则见 `docs/reference/hwlab.md`。 diff --git a/docs/reference/g14-observability-infra.md b/docs/reference/g14-observability-infra.md index d911db34..156526f6 100644 --- 
a/docs/reference/g14-observability-infra.md +++ b/docs/reference/g14-observability-infra.md @@ -103,13 +103,15 @@ A durable closeout must include: - `hwlab g14 observability status` or the equivalent controlled infrastructure status showing CRDs, Prometheus Operator and Prometheus Ready in `devops-infra`. - Explicit PromQL assertions for the workload namespace, not only the infrastructure status summary. Use `hwlab g14 observability query --promql <query> --expect-count <N> --expect-value <V>` so the CLI returns `assertion.ok`, actual count, bad values and missing/extra series instead of requiring manual vector inspection. - For HWLAB v0.2, the current application-owned PromQL checks are `up{namespace="hwlab-v02"}`, `hwlab_service_up{namespace="hwlab-v02"}` and `hwlab_service_health_probe_success{namespace="hwlab-v02"}`. `up=1` proves Prometheus can scrape the sidecar; it does not prove the sidecar can reach the business health endpoint. -- `hwlab g14 observability targets --lane v02` for the high-level target view: discovered service/pod, metrics sidecar readiness and restart count, selected monitor declarations and the latest `up` / `hwlab_service_up` / `hwlab_service_health_probe_success` values. +- `hwlab g14 observability targets --lane v02` for the high-level target view: discovered service/pod, metrics sidecar readiness and restart count, selected monitor declarations, the latest `up` / `hwlab_service_up` / `hwlab_service_health_probe_success` values, synthetic health/scrape duration summaries and the current CPU/memory resource snapshot from `metrics.k8s.io`. - `hwlab g14 observability boundary --lane v02` for the namespace and public ingress boundary: the workload namespace may contain application `ServiceMonitor` / `PodMonitor` / `PrometheusRule` declarations only, must not contain shared Prometheus or Alertmanager instances, and public `19666/19667` `/metrics` must be denied or non-Prometheus text. 
-- `hwlab g14 observability closeout --lane v02` as the standard monitoring closeout summary. It should report semantic fields such as `platformReady`, `scrapeReachable`, `sidecarServing`, `businessHealthProbe`, `namespaceControlPlaneBoundary` and `publicMetricsExposure`, plus bounded drill-down evidence and next diagnostic commands on failure. Public `/metrics` denial is represented as `publicMetricsExposure=pass` with `publicMetricsExposureState=denied`. +- `hwlab g14 observability closeout --lane v02` as the standard monitoring closeout summary. It should report semantic fields such as `platformReady`, `scrapeReachable`, `sidecarServing`, `businessHealthProbe`, `resourceSnapshot`, `namespaceControlPlaneBoundary` and `publicMetricsExposure`, plus bounded drill-down evidence and next diagnostic commands on failure. Public `/metrics` denial is represented as `publicMetricsExposure=pass` with `publicMetricsExposureState=denied`. - CI/CD and GitOps provenance when the workload desired state changed. For HWLAB v0.2 this includes the target source commit, PipelineRun, Argo sync revision and git mirror `pendingFlush=false` / `githubInSync=true`. Issue comments should lead with the semantic conclusion and then list the commands, result counts and target values. A raw metrics dump or a green `status` command alone is not a closeout, and CI/CD provenance still comes from `hwlab g14 control-plane closeout --lane v02 --source-commit <sha>` or the equivalent high-level control-plane entry when runtime desired state changed. +The current HWLAB v0.2 monitoring surface is intentionally split by source. Prometheus provides sidecar availability, business health probe success/status/duration, scrape duration and sidecar uptime; `metrics.k8s.io` provides current pod/container CPU and memory snapshots for the same monitored services. 
Request throughput, error rate, per-route latency percentiles and business-operation latency are application-owned signals and require HWLAB service instrumentation before Prometheus can answer them. + ## Failure Modes The following regressions are common enough to require explicit checks in future monitoring work: diff --git a/scripts/hwlab-g14-contract-test.ts b/scripts/hwlab-g14-contract-test.ts index 0846d96d..ecc1edca 100644 --- a/scripts/hwlab-g14-contract-test.ts +++ b/scripts/hwlab-g14-contract-test.ts @@ -1,4 +1,4 @@ -import { activeV02PipelineRuns, g14ObservabilityQueryAssertion, gitMirrorFlushJobManifest, gitMirrorStatusSummary, gitMirrorSyncJobManifest, gitMirrorV02SyncRequirement, hwlabG14Help, hwlabG14MonitorStateFileName, parseGitMirrorStatusRefs, parsePipelineTaskRunMetrics, parseV02TriggerSnapshot, rolloutRecordBody, semanticChangelogBullets, summarizeV02CdStatus, v02CloseoutVerdict, v02CommitAlignment, v02ControlPlaneRefreshScriptHash, v02ControlPlaneRenderScript, v02ExistingPipelineRunReuseDecision, v02FalseGreenGuard, v02GitMirrorPreSyncWaitMs, v02LatestOnlyTargetValidation, v02PipelineServiceIds, v02PrAutomationCommentBody, v02ReusableGitMirrorPreSyncMarker, v02ReusableRefreshMarker, v02TaskRunPerformanceSummary } from "./src/hwlab-g14"; +import { activeV02PipelineRuns, g14ObservabilityQueryAssertion, gitMirrorFlushJobManifest, gitMirrorStatusSummary, gitMirrorSyncJobManifest, gitMirrorV02SyncRequirement, hwlabG14Help, hwlabG14MonitorStateFileName, parseGitMirrorStatusRefs, parseK8sCpuMillicores, parseK8sMemoryMiB, parsePipelineTaskRunMetrics, parseV02TriggerSnapshot, rolloutRecordBody, semanticChangelogBullets, summarizeV02CdStatus, v02CloseoutVerdict, v02CommitAlignment, v02ControlPlaneRefreshScriptHash, v02ControlPlaneRenderScript, v02ExistingPipelineRunReuseDecision, v02FalseGreenGuard, v02GitMirrorPreSyncWaitMs, v02LatestOnlyTargetValidation, v02PipelineServiceIds, v02PrAutomationCommentBody, v02ReusableGitMirrorPreSyncMarker, 
v02ReusableRefreshMarker, v02TaskRunPerformanceSummary } from "./src/hwlab-g14"; import { runCommand } from "./src/command"; function assertCondition(condition: unknown, message: string, detail: unknown = {}): void { @@ -114,6 +114,20 @@ assertCondition( "observability CLI must fail visibly on unsupported options instead of silently ignoring friction-prone flags", unsupportedObservabilityJson, ); +assertCondition( + parseK8sCpuMillicores("46095136n") !== null + && Math.abs((parseK8sCpuMillicores("46095136n") ?? 0) - 46.095136) < 0.000001 + && parseK8sCpuMillicores("47m") === 47 + && parseK8sCpuMillicores("1") === 1000, + "observability resource snapshot must convert metrics.k8s.io CPU quantities to millicores", +); +assertCondition( + parseK8sMemoryMiB("99860Ki") !== null + && Math.abs((parseK8sMemoryMiB("99860Ki") ?? 0) - 97.51953125) < 0.000001 + && parseK8sMemoryMiB("97Mi") === 97 + && parseK8sMemoryMiB("1048576") === 1, + "observability resource snapshot must convert metrics.k8s.io memory quantities to MiB", +); const v02CommentBody = v02PrAutomationCommentBody({ pr: { @@ -741,6 +755,8 @@ console.log(JSON.stringify({ "observability help exposes assertion, target, boundary, and closeout entrypoints", "observability query assertions report count and terminal value pass/fail", "observability CLI rejects unsupported options with visible JSON errors", + "observability resource snapshot converts metrics.k8s.io CPU quantities to millicores", + "observability resource snapshot converts metrics.k8s.io memory quantities to MiB", "git mirror sync is a manual devops-infra Job, not a CronJob", "git mirror flush is a manual devops-infra Job, not a CronJob", "trigger-current can decide whether v0.2 git mirror pre-sync is required", diff --git a/scripts/src/hwlab-g14.ts b/scripts/src/hwlab-g14.ts index a3f461d5..f86e8131 100644 --- a/scripts/src/hwlab-g14.ts +++ b/scripts/src/hwlab-g14.ts @@ -5681,6 +5681,7 @@ function closeoutAdvice(summary: Record): string[] { if 
(summary.scrapeReachable !== "pass") advice.push("scrapeReachable failed -> check ServiceMonitor labels, metrics sidecar port name, and Prometheus target discovery"); if (summary.sidecarServing !== "pass") advice.push("sidecarServing failed -> check hwlab-metrics sidecar readiness, restartCount, and metrics script/container logs"); if (summary.businessHealthProbe !== "pass") advice.push("businessHealthProbe failed -> up=1 but health_probe=0 usually means sidecar can be scraped but cannot reach the business health endpoint"); + if (summary.resourceSnapshot !== "pass") advice.push("resourceSnapshot failed -> check metrics.k8s.io APIService and metrics-server availability on G14 k3s"); if (summary.namespaceControlPlaneBoundary !== "pass") advice.push("namespaceControlPlaneBoundary failed -> remove Prometheus/Alertmanager from workload namespace; shared control plane belongs in devops-infra"); if (summary.publicMetricsExposure !== "pass") advice.push("publicMetricsExposure failed -> public /metrics returned Prometheus text; remove FRP/edge exposure or add an authenticated internal-only route"); return advice; @@ -5697,6 +5698,7 @@ function runG14ObservabilityCloseout(options: G14ObservabilityOptions): Record ({ serviceId: service.serviceId ?? null, scrapeReachable: service.scrapeReachable ?? null, sidecarServing: service.sidecarServing ?? null, businessHealthProbe: service.businessHealthProbe ?? null, statusCode: service.statusCode ?? null, + totalCpuMillicores: service.totalCpuMillicores ?? null, + totalMemoryMiB: service.totalMemoryMiB ?? null, + businessCpuMillicores: service.businessCpuMillicores ?? null, + businessMemoryMiB: service.businessMemoryMiB ?? null, })), }, boundary: {