diff --git a/docs/reference/g14-observability-infra.md b/docs/reference/g14-observability-infra.md
index 5f6b43f5..0662b5a9 100644
--- a/docs/reference/g14-observability-infra.md
+++ b/docs/reference/g14-observability-infra.md
@@ -37,13 +37,16 @@ The shared Prometheus stack may discover application monitors across namespaces
 
 Monitoring infrastructure must be declared as Git-backed desired state and applied through a controlled UniDesk or G14 GitOps path. A temporary `kubectl apply` may be used only as a `$dad-dev` P2 experiment; it must be followed by a durable source change and GitOps/CLI validation.
 
-Recommended durable shape:
+Current durable control surface:
 
-- A dedicated G14 infrastructure desired-state path for `devops-infra` observability resources.
-- A dedicated Argo CD Application or an equivalent UniDesk-controlled apply surface for that path.
+- `bun scripts/cli.ts hwlab g14 observability status` reads the G14 monitoring state through the controlled `G14:k3s` route and reports CRDs, Prometheus Operator readiness, Prometheus readiness, selected workload monitors and a bounded `up` query.
+- `bun scripts/cli.ts hwlab g14 observability apply --dry-run|--confirm` is the standard write path for the shared stack. It installs Prometheus Operator `v0.91.0`, Prometheus `v3.12.0`, Prometheus RBAC, the `devops-infra` Prometheus instance and the internal query Service.
+- `bun scripts/cli.ts hwlab g14 observability query --promql <expr>` is the controlled query path. It uses Kubernetes service proxy to the internal ClusterIP Service and must not expose Prometheus through FRP, NodePort or LoadBalancer.
 - Cluster-scoped CRDs and ClusterRole/ClusterRoleBinding resources owned by the infrastructure path, not by a HWLAB lane Application whose destination is only `hwlab-v02`.
 - Runtime workloads in `devops-infra` labeled with `app.kubernetes.io/part-of=devops-infra` and component labels such as `observability`, `prometheus`, `operator` or `query`.
 
+Future GitOps work may move the same desired state behind a dedicated G14 infrastructure Argo CD Application. Until that exists, the UniDesk CLI source is the stable audited desired-state entry, and direct native `kubectl` remains only an implementation detail inside that CLI.
+
 Do not attach Prometheus Operator CRDs, Prometheus Deployments, Grafana or Alertmanager to `hwlab-g14-v02`. That Argo Application is scoped to the HWLAB v0.2 runtime namespace and must remain a lane-specific application rollout controller.
 
 ## Security
diff --git a/scripts/src/hwlab-g14.ts b/scripts/src/hwlab-g14.ts
index c014bb0b..f8474311 100644
--- a/scripts/src/hwlab-g14.ts
+++ b/scripts/src/hwlab-g14.ts
@@ -38,6 +38,15 @@ const GIT_MIRROR_NAMESPACE = "devops-infra";
 const GIT_MIRROR_MANIFEST_FIELD_MANAGER = "unidesk-hwlab-git-mirror";
 const GIT_MIRROR_SYNC_JOB_PREFIX = "git-mirror-hwlab-sync-manual";
 const GIT_MIRROR_LEGACY_CRONJOB = "git-mirror-hwlab-sync";
+// Shared G14 observability stack: namespace, server-side-apply field manager, pinned versions and resource names.
+const G14_OBSERVABILITY_NAMESPACE = "devops-infra";
+const G14_OBSERVABILITY_FIELD_MANAGER = "unidesk-g14-observability";
+const G14_PROMETHEUS_OPERATOR_VERSION = "v0.91.0";
+const G14_PROMETHEUS_VERSION = "v3.12.0";
+const G14_PROMETHEUS_NAME = "g14-shared";
+const G14_PROMETHEUS_SERVICE = "prometheus-g14-shared";
+const G14_PROMETHEUS_SERVICE_ACCOUNT = "g14-observability-prometheus";
+const G14_PROMETHEUS_OPERATOR_RELEASE_ASSET = `https://github.com/prometheus-operator/prometheus-operator/releases/download/${G14_PROMETHEUS_OPERATOR_VERSION}/bundle.yaml`;
 const V02_SERVICE_IDS = [
   "hwlab-cloud-api",
   "hwlab-cloud-web",
@@ -131,6 +140,16 @@ interface G14GitMirrorOptions {
   timeoutSeconds: number;
 }
 
+/** Parsed arguments for `hwlab g14 observability`; `wait` is accepted but not read by the actions added in this patch. */
+interface G14ObservabilityOptions {
+  action: "status" | "apply" | "query";
+  dryRun: boolean;
+  confirm: boolean;
+  wait: boolean;
+  timeoutSeconds: number;
+  query: string;
+}
+
 interface G14SecretOptions {
   action: "status" | "ensure";
   lane: "v02";
@@ -370,6 +389,29 @@ function parseGitMirrorOptions(args: string[]): G14GitMirrorOptions {
   };
 }
 
+/** Parses `observability status|apply|query` arguments; apply stays a dry run unless --confirm is given. */
+function parseObservabilityOptions(args: string[]): G14ObservabilityOptions {
+  const [actionRaw] = args;
+  if (actionRaw !== "status" && actionRaw !== "apply" && actionRaw !== "query") {
+    throw new Error("observability usage: status|apply|query [--promql <expr>] [--dry-run|--confirm]");
+  }
+  const confirm = args.includes("--confirm");
+  const explicitDryRun = args.includes("--dry-run");
+  if (confirm && explicitDryRun) throw new Error("observability accepts only one of --confirm or --dry-run");
+  const query = optionValue(args, "--promql") ?? optionValue(args, "--query") ?? 'up{namespace="hwlab-v02"}';
+  if (query.trim().length === 0) throw new Error("--promql must not be empty");
+  if (query.length > 500) throw new Error("--promql is limited to 500 characters");
+  if (query.includes("\n") || query.includes("\r")) throw new Error("--promql must be a single-line expression");
+  return {
+    action: actionRaw,
+    confirm,
+    wait: args.includes("--wait"),
+    dryRun: actionRaw === "status" || actionRaw === "query" ? true : explicitDryRun || !confirm,
+    timeoutSeconds: positiveIntegerOption(args, "--timeout-seconds", actionRaw === "apply" ? 240 : 120, 900),
+    query,
+  };
+}
+
 function parseSecretOptions(args: string[]): G14SecretOptions {
   const [actionRaw] = args;
   if (actionRaw !== "status" && actionRaw !== "ensure") {
@@ -4121,6 +4163,487 @@ function runG14GitMirror(options: G14GitMirrorOptions): Record<string, unknown>
   return runGitMirrorSync(options);
 }
 
+/** Common devops-infra observability labels plus a per-resource component label. */
+function observabilityLabels(component: string): Record<string, string> {
+  return {
+    "app.kubernetes.io/part-of": "devops-infra",
+    "app.kubernetes.io/component": "observability",
+    "g14.pikastech.local/observability-component": component,
+  };
+}
+
+/** Namespace label that the shared Prometheus namespace selectors match on. */
+function observabilityNamespaceLabel(): Record<string, string> {
+  return {
+    "g14.pikastech.local/observability-discovery": "enabled",
+  };
+}
+
+/** Builds the List manifest for the shared stack: namespace labels, Prometheus ServiceAccount/RBAC, the Prometheus CR and its ClusterIP query Service. */
+function g14PrometheusManifest(): Record<string, unknown> {
+  const namespaceSelector = { matchLabels: observabilityNamespaceLabel() };
+  const monitorSelector = { matchLabels: { "hwlab.pikastech.local/monitoring": "enabled" } };
+  return {
+    apiVersion: "v1",
+    kind: "List",
+    items: [
+      {
+        apiVersion: "v1",
+        kind: "Namespace",
+        metadata: {
+          name: G14_OBSERVABILITY_NAMESPACE,
+          labels: {
+            ...observabilityNamespaceLabel(),
+            ...observabilityLabels("namespace"),
+          },
+        },
+      },
+      {
+        apiVersion: "v1",
+        kind: "Namespace",
+        metadata: {
+          name: V02_RUNTIME_NAMESPACE,
+          labels: observabilityNamespaceLabel(),
+        },
+      },
+      {
+        apiVersion: "v1",
+        kind: "ServiceAccount",
+        metadata: {
+          name: G14_PROMETHEUS_SERVICE_ACCOUNT,
+          namespace: G14_OBSERVABILITY_NAMESPACE,
+          labels: observabilityLabels("prometheus"),
+        },
+      },
+      {
+        apiVersion: "rbac.authorization.k8s.io/v1",
+        kind: "ClusterRole",
+        metadata: {
+          name: G14_PROMETHEUS_SERVICE_ACCOUNT,
+          labels: observabilityLabels("prometheus"),
+        },
+        rules: [
+          {
+            apiGroups: [""],
+            resources: ["nodes", "nodes/metrics", "services", "endpoints", "pods"],
+            verbs: ["get", "list", "watch"],
+          },
+          {
+            apiGroups: ["discovery.k8s.io"],
+            resources: ["endpointslices"],
+            verbs: ["get", "list", "watch"],
+          },
+          {
+            apiGroups: ["networking.k8s.io"],
+            resources: ["ingresses"],
+            verbs: ["get", "list", "watch"],
+          },
+          {
+            nonResourceURLs: ["/metrics"],
+            verbs: ["get"],
+          },
+        ],
+      },
+      {
+        apiVersion: "rbac.authorization.k8s.io/v1",
+        kind: "ClusterRoleBinding",
+        metadata: {
+          name: G14_PROMETHEUS_SERVICE_ACCOUNT,
+          labels: observabilityLabels("prometheus"),
+        },
+        roleRef: {
+          apiGroup: "rbac.authorization.k8s.io",
+          kind: "ClusterRole",
+          name: G14_PROMETHEUS_SERVICE_ACCOUNT,
+        },
+        subjects: [{
+          kind: "ServiceAccount",
+          name: G14_PROMETHEUS_SERVICE_ACCOUNT,
+          namespace: G14_OBSERVABILITY_NAMESPACE,
+        }],
+      },
+      {
+        apiVersion: "monitoring.coreos.com/v1",
+        kind: "Prometheus",
+        metadata: {
+          name: G14_PROMETHEUS_NAME,
+          namespace: G14_OBSERVABILITY_NAMESPACE,
+          labels: observabilityLabels("prometheus"),
+        },
+        spec: {
+          replicas: 1,
+          version: G14_PROMETHEUS_VERSION,
+          serviceAccountName: G14_PROMETHEUS_SERVICE_ACCOUNT,
+          scrapeInterval: "30s",
+          evaluationInterval: "30s",
+          retention: "7d",
+          resources: {
+            requests: { cpu: "100m", memory: "256Mi" },
+            limits: { cpu: "500m", memory: "1Gi" },
+          },
+          storage: {
+            volumeClaimTemplate: {
+              spec: {
+                storageClassName: "local-path",
+                accessModes: ["ReadWriteOnce"],
+                resources: { requests: { storage: "10Gi" } },
+              },
+            },
+          },
+          serviceMonitorSelector: monitorSelector,
+          serviceMonitorNamespaceSelector: namespaceSelector,
+          podMonitorSelector: monitorSelector,
+          podMonitorNamespaceSelector: namespaceSelector,
+          ruleSelector: monitorSelector,
+          ruleNamespaceSelector: namespaceSelector,
+          probeSelector: monitorSelector,
+          probeNamespaceSelector: namespaceSelector,
+        },
+      },
+      {
+        apiVersion: "v1",
+        kind: "Service",
+        metadata: {
+          name: G14_PROMETHEUS_SERVICE,
+          namespace: G14_OBSERVABILITY_NAMESPACE,
+          labels: observabilityLabels("query"),
+        },
+        spec: {
+          type: "ClusterIP",
+          selector: {
+            prometheus: G14_PROMETHEUS_NAME,
+          },
+          ports: [{
+            name: "web",
+            port: 9090,
+            targetPort: "web",
+          }],
+        },
+      },
+    ],
+  };
+}
+
+/** Parses a shell section's stdout as a JSON object; returns {} for missing, empty or unparsable output. */
+function parseSectionJson(section: ShellSection | undefined): Record<string, unknown> {
+  const text = String(section?.stdout ?? "").trim();
+  if (text.length === 0) return {};
+  try {
+    return record(JSON.parse(text) as unknown);
+  } catch {
+    return {};
+  }
+}
+
+/** Returns the `items` array of a section's JSON output as records, or [] when it is absent. */
+function parseSectionJsonArray(section: ShellSection | undefined): Record<string, unknown>[] {
+  const parsed = parseSectionJson(section);
+  const items = Array.isArray(parsed.items) ? parsed.items : [];
+  return items.map((item) => record(item));
+}
+
+/** Returns the status string of the first condition with the given type, or null when none matches. */
+function conditionStatus(items: Record<string, unknown>[], type: string): string | null {
+  for (const item of items) {
+    if (item.type === type) return typeof item.status === "string" ? item.status : null;
+  }
+  return null;
+}
+
+/** True when ready and available replicas both reach spec.replicas (defaulting to 1). */
+function deploymentReady(deployment: Record<string, unknown>): boolean {
+  const spec = record(deployment.spec);
+  const status = record(deployment.status);
+  const desired = numericValue(spec.replicas) ?? 1;
+  const ready = numericValue(status.readyReplicas) ?? 0;
+  const available = numericValue(status.availableReplicas) ?? 0;
+  return ready >= desired && available >= desired;
+}
+
+/** True only when the Prometheus custom resource reports condition Available=True. */
+function prometheusReady(prometheus: Record<string, unknown>): boolean {
+  const rawConditions = record(prometheus.status).conditions;
+  const conditions = Array.isArray(rawConditions) ? rawConditions.map((item) => record(item)) : [];
+  // Reconciled=True only means the operator applied the spec; Available is the condition that reflects ready pods.
+  return conditionStatus(conditions, "Available") === "True";
+}
+
+/** Builds the API-server service-proxy path for an instant PromQL query against the internal Prometheus Service. */
+function g14PrometheusQueryPath(promql: string): string {
+  return `/api/v1/namespaces/${G14_OBSERVABILITY_NAMESPACE}/services/http:${G14_PROMETHEUS_SERVICE}:9090/proxy/api/v1/query?query=${encodeURIComponent(promql)}`;
+}
+
+/** Collects CRD, operator, Prometheus, workload-monitor and `up` query state in one G14:k3s script round trip. */
+function g14ObservabilityStatus(): Record<string, unknown> {
+  const startedAtMs = Date.now();
+  const queryPath = g14PrometheusQueryPath("up");
+  const crds = [
+    "servicemonitors.monitoring.coreos.com",
+    "podmonitors.monitoring.coreos.com",
+    "prometheusrules.monitoring.coreos.com",
+    "prometheuses.monitoring.coreos.com",
+    "alertmanagers.monitoring.coreos.com",
+  ];
+  const script = [
+    "set +e",
+    "section() {",
+    "  name=\"$1\"",
+    "  shift",
+    "  printf '__UNIDESK_SECTION_BEGIN__ %s\\n' \"$name\"",
+    "  \"$@\"",
+    "  code=$?",
+    "  printf '\\n__UNIDESK_SECTION_END__ %s exit=%s\\n' \"$name\" \"$code\"",
+    "}",
+    `section namespace kubectl get namespace ${shellQuote(G14_OBSERVABILITY_NAMESPACE)} -o json`,
+    `section discoveryNamespace kubectl get namespace ${shellQuote(V02_RUNTIME_NAMESPACE)} -o json`,
+    `section crds kubectl get crd ${crds.map(shellQuote).join(" ")} -o json`,
+    `section operator kubectl get deploy -n ${shellQuote(G14_OBSERVABILITY_NAMESPACE)} prometheus-operator -o json`,
+    `section operatorPods kubectl get pods -n ${shellQuote(G14_OBSERVABILITY_NAMESPACE)} -l app.kubernetes.io/name=prometheus-operator -o json`,
+    `section prometheus kubectl get prometheus -n ${shellQuote(G14_OBSERVABILITY_NAMESPACE)} ${shellQuote(G14_PROMETHEUS_NAME)} -o json`,
+    `section prometheusPods kubectl get pods -n ${shellQuote(G14_OBSERVABILITY_NAMESPACE)} -l prometheus=${shellQuote(G14_PROMETHEUS_NAME)} -o json`,
+    `section prometheusService kubectl get service -n ${shellQuote(G14_OBSERVABILITY_NAMESPACE)} ${shellQuote(G14_PROMETHEUS_SERVICE)} -o json`,
+    `section workloadMonitors kubectl get servicemonitor,prometheusrule -n ${shellQuote(V02_RUNTIME_NAMESPACE)} -l hwlab.pikastech.local/monitoring=enabled -o json`,
+    `section query kubectl get --raw ${shellQuote(queryPath)}`,
+  ].join("\n");
+  const bundle = g14K3s(["script", "--", script], 120_000);
+  const sections = parseShellSections(statusText(bundle));
+  const namespace = parseSectionJson(sections.namespace);
+  const discoveryNamespace = parseSectionJson(sections.discoveryNamespace);
+  const crdItems = parseSectionJsonArray(sections.crds);
+  const crdNames = crdItems.map((item) => String(record(item.metadata).name ?? "")).filter(Boolean);
+  const operator = parseSectionJson(sections.operator);
+  const operatorPods = parseSectionJsonArray(sections.operatorPods);
+  const prometheus = parseSectionJson(sections.prometheus);
+  const prometheusPods = parseSectionJsonArray(sections.prometheusPods);
+  const prometheusService = parseSectionJson(sections.prometheusService);
+  const workloadMonitorItems = parseSectionJsonArray(sections.workloadMonitors);
+  const query = parseSectionJson(sections.query);
+  const requiredCrdsPresent = crds.every((name) => crdNames.includes(name));
+  const namespaceLabel = stringOrNull(record(record(namespace.metadata).labels)["g14.pikastech.local/observability-discovery"]);
+  const workloadNamespaceLabel = stringOrNull(record(record(discoveryNamespace.metadata).labels)["g14.pikastech.local/observability-discovery"]);
+  const operatorIsReady = Object.keys(operator).length > 0 && deploymentReady(operator);
+  const prometheusExists = Object.keys(prometheus).length > 0;
+  const prometheusIsReady = prometheusExists && (
+    prometheusReady(prometheus)
+    || prometheusPods.some((pod) => {
+      const statuses = Array.isArray(record(pod.status).containerStatuses) ? record(pod.status).containerStatuses.map((item) => record(item)) : [];
+      return statuses.length > 0 && statuses.every((status) => status.ready === true);
+    })
+  );
+  const queryOk = sections.query?.exitCode === 0 && query.status === "success";
+  return {
+    ok: isCommandSuccess(bundle) && requiredCrdsPresent && operatorIsReady && prometheusExists,
+    command: "hwlab g14 observability status",
+    namespace: G14_OBSERVABILITY_NAMESPACE,
+    mode: "status",
+    elapsedMs: Date.now() - startedAtMs,
+    versions: {
+      prometheusOperator: G14_PROMETHEUS_OPERATOR_VERSION,
+      prometheus: G14_PROMETHEUS_VERSION,
+      operatorBundle: G14_PROMETHEUS_OPERATOR_RELEASE_ASSET,
+    },
+    discovery: {
+      namespaceLabel,
+      workloadNamespace: V02_RUNTIME_NAMESPACE,
+      workloadNamespaceLabel,
+      selectorLabel: "hwlab.pikastech.local/monitoring=enabled",
+    },
+    crds: {
+      ok: requiredCrdsPresent,
+      required: crds,
+      present: crdNames,
+      missing: crds.filter((name) => !crdNames.includes(name)),
+      sectionOk: shellSectionOk(sections.crds),
+    },
+    operator: {
+      ok: operatorIsReady,
+      deployment: stringOrNull(record(operator.metadata).name),
+      desiredReplicas: numericValue(record(operator.spec).replicas) ?? 1,
+      readyReplicas: numericValue(record(operator.status).readyReplicas) ?? 0,
+      availableReplicas: numericValue(record(operator.status).availableReplicas) ?? 0,
+      pods: operatorPods.map((pod) => ({
+        name: stringOrNull(record(pod.metadata).name),
+        phase: stringOrNull(record(pod.status).phase),
+      })),
+      sectionOk: shellSectionOk(sections.operator),
+    },
+    prometheus: {
+      ok: prometheusExists && prometheusIsReady,
+      name: G14_PROMETHEUS_NAME,
+      service: G14_PROMETHEUS_SERVICE,
+      serviceExists: Object.keys(prometheusService).length > 0,
+      ready: prometheusIsReady,
+      conditions: Array.isArray(record(prometheus.status).conditions) ? record(prometheus.status).conditions : [],
+      pods: prometheusPods.map((pod) => ({
+        name: stringOrNull(record(pod.metadata).name),
+        phase: stringOrNull(record(pod.status).phase),
+        ready: Array.isArray(record(pod.status).containerStatuses)
+          ? record(pod.status).containerStatuses.map((item) => record(item)).every((status) => status.ready === true)
+          : null,
+      })),
+      sectionOk: shellSectionOk(sections.prometheus),
+    },
+    workloadMonitors: {
+      ok: shellSectionOk(sections.workloadMonitors),
+      namespace: V02_RUNTIME_NAMESPACE,
+      count: workloadMonitorItems.length,
+      items: workloadMonitorItems.map((item) => ({
+        kind: item.kind ?? null,
+        name: stringOrNull(record(item.metadata).name),
+      })),
+      stderr: shellSectionOk(sections.workloadMonitors) ? "" : commandErrorSummary(bundle),
+    },
+    query: {
+      ok: queryOk,
+      promql: "up",
+      serviceProxyPath: queryPath,
+      resultType: nested(query, ["data", "resultType"]) ?? null,
+      resultCount: Array.isArray(nested(query, ["data", "result"])) ? (nested(query, ["data", "result"]) as unknown[]).length : null,
+      status: query.status ?? null,
+      sectionOk: shellSectionOk(sections.query),
+      stderr: shellSectionOk(sections.query) ? "" : commandErrorSummary(bundle),
+    },
+    result: compactCommandResult(bundle),
+    next: requiredCrdsPresent && operatorIsReady && prometheusExists
+      ? { query: 'bun scripts/cli.ts hwlab g14 observability query --promql \'up{namespace="hwlab-v02"}\'' }
+      : { apply: "bun scripts/cli.ts hwlab g14 observability apply --confirm" },
+  };
+}
+
+/** Renders the shell script that installs the operator bundle and the shared stack, or dry-runs both. */
+function g14ObservabilityApplyScript(options: G14ObservabilityOptions, manifestB64: string): string {
+  const dryRunArg = options.dryRun ? "--dry-run=server" : "";
+  const stackDryRunCommand = options.dryRun
+    ? [
+      "core_stack_path=\"$tmpdir/g14-prometheus-core-stack.json\"",
+      "node - \"$stack_path\" \"$core_stack_path\" <<'NODE'",
+      "const fs = require('node:fs');",
+      "const input = process.argv[2];",
+      "const output = process.argv[3];",
+      "const stack = JSON.parse(fs.readFileSync(input, 'utf8'));",
+      "stack.items = (stack.items || []).filter((item) => item.kind !== 'Prometheus');",
+      "fs.writeFileSync(output, JSON.stringify(stack));",
+      "NODE",
+      "kubectl apply --dry-run=client --validate=false -f \"$core_stack_path\"",
+      "echo prometheus_cr_dry_run=skipped_until_monitoring_crds_are_installed",
+    ].join("\n")
+    : `kubectl apply --server-side --force-conflicts --field-manager=${shellQuote(G14_OBSERVABILITY_FIELD_MANAGER)} -f "$stack_path"`;
+  const preStackWaitCommands = options.dryRun
+    ? "echo observability_wait=skipped_dry_run"
+    : [
+      "kubectl wait --for=condition=Established --timeout=120s crd/servicemonitors.monitoring.coreos.com crd/podmonitors.monitoring.coreos.com crd/prometheusrules.monitoring.coreos.com crd/prometheuses.monitoring.coreos.com",
+      `kubectl -n ${shellQuote(G14_OBSERVABILITY_NAMESPACE)} rollout status deploy/prometheus-operator --timeout=${options.timeoutSeconds}s`,
+    ].join("\n");
+  const postStackWaitCommands = options.dryRun
+    ? "echo prometheus_wait=skipped_dry_run"
+    : [
+      `kubectl -n ${shellQuote(G14_OBSERVABILITY_NAMESPACE)} wait --for=condition=Available --timeout=${options.timeoutSeconds}s ${shellQuote(`prometheus/${G14_PROMETHEUS_NAME}`)} || true`,
+      `kubectl -n ${shellQuote(G14_OBSERVABILITY_NAMESPACE)} get deploy,pod,svc,prometheus -l app.kubernetes.io/component=observability -o wide || true`,
+      `kubectl -n ${shellQuote(G14_OBSERVABILITY_NAMESPACE)} get pod -l prometheus=${shellQuote(G14_PROMETHEUS_NAME)} -o wide || true`,
+    ].join("\n");
+  return [
+    "set -eu",
+    `namespace=${shellQuote(G14_OBSERVABILITY_NAMESPACE)}`,
+    `bundle_url=${shellQuote(G14_PROMETHEUS_OPERATOR_RELEASE_ASSET)}`,
+    `operator_version=${shellQuote(G14_PROMETHEUS_OPERATOR_VERSION)}`,
+    `prometheus_version=${shellQuote(G14_PROMETHEUS_VERSION)}`,
+    `stack_b64=${shellQuote(manifestB64)}`,
+    "tmpdir=$(mktemp -d /tmp/g14-observability-XXXXXX)",
+    "cleanup() { rm -rf \"$tmpdir\"; }",
+    "trap cleanup EXIT",
+    "bundle_path=\"$tmpdir/operator-bundle.yaml\"",
+    "operator_path=\"$tmpdir/operator-rendered.yaml\"",
+    "stack_path=\"$tmpdir/g14-prometheus-stack.json\"",
+    "printf '%s' \"$stack_b64\" | base64 -d > \"$stack_path\"",
+    "export HTTP_PROXY=${HTTP_PROXY:-http://127.0.0.1:10808}",
+    "export HTTPS_PROXY=${HTTPS_PROXY:-http://127.0.0.1:10808}",
+    "export http_proxy=$HTTP_PROXY",
+    "export https_proxy=$HTTPS_PROXY",
+    "export NO_PROXY=${NO_PROXY:-localhost,127.0.0.1,::1,10.0.0.0/8,10.42.0.0/16,10.43.0.0/16,.svc,.svc.cluster.local,.cluster.local,kubernetes,kubernetes.default,kubernetes.default.svc}",
+    "export no_proxy=$NO_PROXY",
+    "curl -fsSL --connect-timeout 20 --retry 3 --retry-delay 2 -o \"$bundle_path\" \"$bundle_url\"",
+    "cat > \"$tmpdir/kustomization.yaml\" <<'YAML'",
+    "apiVersion: kustomize.config.k8s.io/v1beta1",
+    "kind: Kustomization",
+    `namespace: ${G14_OBSERVABILITY_NAMESPACE}`,
+    "resources:",
+    "- operator-bundle.yaml",
+    "YAML",
+    "kubectl kustomize \"$tmpdir\" > \"$operator_path\"",
+    `grep -q ${shellQuote(`namespace: ${G14_OBSERVABILITY_NAMESPACE}`)} "$operator_path"`,
+    `kubectl create namespace ${shellQuote(G14_OBSERVABILITY_NAMESPACE)} --dry-run=client -o yaml | kubectl apply --server-side --force-conflicts --field-manager=${shellQuote(G14_OBSERVABILITY_FIELD_MANAGER)} ${dryRunArg} -f -`,
+    `kubectl apply --server-side --force-conflicts --field-manager=${shellQuote(G14_OBSERVABILITY_FIELD_MANAGER)} ${dryRunArg} -f "$operator_path"`,
+    preStackWaitCommands,
+    stackDryRunCommand,
+    postStackWaitCommands,
+    `printf 'observability_apply=ok namespace=%s operator=%s prometheus=%s dryRun=%s\\n' "$namespace" "$operator_version" "$prometheus_version" ${shellQuote(String(options.dryRun))}`,
+  ].join("\n");
+}
+
+/** Runs the apply script through G14:k3s and, after a confirmed successful apply, appends a status snapshot. */
+function runG14ObservabilityApply(options: G14ObservabilityOptions): Record<string, unknown> {
+  const startedAtMs = Date.now();
+  const manifest = g14PrometheusManifest();
+  const manifestB64 = Buffer.from(JSON.stringify(manifest), "utf8").toString("base64");
+  const script = g14ObservabilityApplyScript(options, manifestB64);
+  // The script can wait 120s for CRDs plus timeoutSeconds twice (operator rollout, Prometheus Available); budget for all of it.
+  const scriptTimeoutMs = (120 + options.timeoutSeconds * 2) * 1000 + 90_000;
+  const result = g14K3s(["script", "--", script], scriptTimeoutMs);
+  const ok = isCommandSuccess(result);
+  return {
+    ok,
+    command: "hwlab g14 observability apply",
+    mode: options.dryRun ? "dry-run" : "confirmed-apply",
+    namespace: G14_OBSERVABILITY_NAMESPACE,
+    versions: {
+      prometheusOperator: G14_PROMETHEUS_OPERATOR_VERSION,
+      prometheus: G14_PROMETHEUS_VERSION,
+      operatorBundle: G14_PROMETHEUS_OPERATOR_RELEASE_ASSET,
+    },
+    manifest: options.dryRun ? manifest : undefined,
+    elapsedMs: Date.now() - startedAtMs,
+    result: compactCommandResult(result),
+    status: ok && !options.dryRun ? g14ObservabilityStatus() : undefined,
+    next: options.dryRun
+      ? { apply: "bun scripts/cli.ts hwlab g14 observability apply --confirm" }
+      : { status: "bun scripts/cli.ts hwlab g14 observability status", query: 'bun scripts/cli.ts hwlab g14 observability query --promql \'up{namespace="hwlab-v02"}\'' },
+  };
+}
+
+/** Runs one instant PromQL query through the Kubernetes service proxy and summarises the response. */
+function runG14ObservabilityQuery(options: G14ObservabilityOptions): Record<string, unknown> {
+  const serviceProxyPath = g14PrometheusQueryPath(options.query);
+  const result = g14K3s(["kubectl", "get", "--raw", serviceProxyPath], options.timeoutSeconds * 1000);
+  const parsed = (() => {
+    try {
+      return record(JSON.parse(statusText(result)) as unknown);
+    } catch {
+      return {};
+    }
+  })();
+  return {
+    ok: isCommandSuccess(result) && parsed.status === "success",
+    command: "hwlab g14 observability query",
+    namespace: G14_OBSERVABILITY_NAMESPACE,
+    service: G14_PROMETHEUS_SERVICE,
+    promql: options.query,
+    serviceProxyPath,
+    status: parsed.status ?? null,
+    resultType: nested(parsed, ["data", "resultType"]) ?? null,
+    resultCount: Array.isArray(nested(parsed, ["data", "result"])) ? (nested(parsed, ["data", "result"]) as unknown[]).length : null,
+    data: parsed.data ?? null,
+    raw: Object.keys(parsed).length === 0 ? tailText(statusText(result), 4000) : undefined,
+    commandResult: compactCommandResult(result),
+  };
+}
+
+/** Dispatches the parsed observability action to its status, query or apply implementation. */
+function runG14Observability(options: G14ObservabilityOptions): Record<string, unknown> {
+  if (options.action === "status") return g14ObservabilityStatus();
+  if (options.action === "query") return runG14ObservabilityQuery(options);
+  return runG14ObservabilityApply(options);
+}
+
 function startAsyncHwlabG14Job(name: string, command: string[], note: string): Record<string, unknown> {
   const job = startJob(name, command, note);
   const statusCommand = `bun scripts/cli.ts job status ${job.id} --tail-bytes 12000`;
@@ -5658,11 +6181,15 @@ export function hwlabG14Help(): Record<string, unknown> {
       "bun scripts/cli.ts hwlab g14 git-mirror flush --confirm",
       "bun scripts/cli.ts hwlab g14 git-mirror sync --confirm --wait",
       "bun scripts/cli.ts hwlab g14 git-mirror flush --confirm --wait",
+      "bun scripts/cli.ts hwlab g14 observability status",
+      "bun scripts/cli.ts hwlab g14 observability apply --dry-run",
+      "bun scripts/cli.ts hwlab g14 observability apply --confirm",
+      "bun scripts/cli.ts hwlab g14 observability query --promql 'up{namespace=\"hwlab-v02\"}'",
       "bun scripts/cli.ts hwlab g14 tools-image status --name ci-node-tools --tag node22-alpine-bun-v1",
       "bun scripts/cli.ts hwlab g14 tools-image build --name ci-node-tools --tag node22-alpine-bun-v1 --confirm",
       "bun scripts/cli.ts job status --tail-bytes 30000",
     ],
-    description: "G14 HWLAB PR monitor, DEV rollout command, bounded v0.2 control-plane bootstrap/cleanup/runtime-migration helper, v0.2 runtime SecretRef bootstrap, devops-infra git mirror maintenance, and controlled CI tools image build/status entry. The public monitor starts a fire-and-forget job. Default monitor lane is base=G14; --lane v02 monitors base=v0.2 PRs, waits for GitHub preflight/CI readiness, automatically merges ready PRs without waiting for other active v0.2 PipelineRuns, triggers v0.2 CD with latest-only GitOps writeback, flushes the git mirror when needed, and posts deduplicated PR comments for pending, blocked/conflict, success, superseded, failure, or timeout states. confirmed control-plane trigger-current and git-mirror sync/flush also return async jobs by default, with --wait reserved for explicit synchronous debugging. control-plane status/closeout/apply/cleanup-runs/cleanup-released-pvs/runtime-migration uses UniDesk G14:k3s routes for v0.2 Tekton/Argo control resources, runtime migration, historical PipelineRun/source-commit closeout verdicts, GitOps mirror flush state, and completed CI workspace retention only. secret status/ensure is the standard v0.2 runtime SecretRef bootstrap path; it never reads or prints secret values. git-mirror status/apply/sync/flush is the manual devops-infra mirror/relay control path and does not install a CronJob.",
+    description: "G14 HWLAB PR monitor, DEV rollout command, bounded v0.2 control-plane bootstrap/cleanup/runtime-migration helper, v0.2 runtime SecretRef bootstrap, devops-infra git mirror and observability maintenance, and controlled CI tools image build/status entry. The public monitor starts a fire-and-forget job. Default monitor lane is base=G14; --lane v02 monitors base=v0.2 PRs, waits for GitHub preflight/CI readiness, automatically merges ready PRs without waiting for other active v0.2 PipelineRuns, triggers v0.2 CD with latest-only GitOps writeback, flushes the git mirror when needed, and posts deduplicated PR comments for pending, blocked/conflict, success, superseded, failure, or timeout states. confirmed control-plane trigger-current and git-mirror sync/flush also return async jobs by default, with --wait reserved for explicit synchronous debugging. control-plane status/closeout/apply/cleanup-runs/cleanup-released-pvs/runtime-migration uses UniDesk G14:k3s routes for v0.2 Tekton/Argo control resources, runtime migration, historical PipelineRun/source-commit closeout verdicts, GitOps mirror flush state, and completed CI workspace retention only. secret status/ensure is the standard v0.2 runtime SecretRef bootstrap path; it never reads or prints secret values. git-mirror status/apply/sync/flush is the manual devops-infra mirror/relay control path and does not install a CronJob. observability status/apply/query owns the shared Prometheus Operator and Prometheus instance in devops-infra, while HWLAB lane manifests own only ServiceMonitor and PrometheusRule objects.",
     defaults: {
       repo: HWLAB_REPO,
       base: G14_SOURCE_BRANCH,
@@ -5675,6 +6202,9 @@ export function hwlabG14Help(): Record<string, unknown> {
       devApplication: DEV_APP,
       v02Application: V02_APP,
       briefIndexIssue: G14_BRIEF_INDEX_ISSUE,
+      observabilityNamespace: G14_OBSERVABILITY_NAMESPACE,
+      prometheusOperatorVersion: G14_PROMETHEUS_OPERATOR_VERSION,
+      prometheusVersion: G14_PROMETHEUS_VERSION,
     },
     stateFiles: {
       monitor: ".state/hwlab-g14/latest-monitor-job.json",
@@ -5780,8 +6310,12 @@ export async function runHwlabG14Command(_config: Config, args: string[]): Promi
     }
     return runG14GitMirror(options);
   }
+  if (action === "observability") {
+    const options = parseObservabilityOptions(args.slice(1));
+    return runG14Observability(options);
+  }
   if (action !== "monitor-prs") {
-    return { ok: false, command: `hwlab g14 ${action ?? ""}`.trim(), degradedReason: "unsupported-command", message: "supported commands: hwlab g14 monitor-prs, hwlab g14 record-rollout, hwlab g14 control-plane, hwlab g14 secret, hwlab g14 git-mirror, hwlab g14 tools-image" };
+    return { ok: false, command: `hwlab g14 ${action ?? ""}`.trim(), degradedReason: "unsupported-command", message: "supported commands: hwlab g14 monitor-prs, hwlab g14 record-rollout, hwlab g14 control-plane, hwlab g14 secret, hwlab g14 git-mirror, hwlab g14 observability, hwlab g14 tools-image" };
   }
   const options = parseOptions(args.slice(1));
   if (options.worker) return runMonitorWorker(options);