feat: add G14 observability control CLI
This commit is contained in:
@@ -37,13 +37,16 @@ The shared Prometheus stack may discover application monitors across namespaces
|
||||
|
||||
Monitoring infrastructure must be declared as Git-backed desired state and applied through a controlled UniDesk or G14 GitOps path. A temporary `kubectl apply` may be used only as a `$dad-dev` P2 experiment; it must be followed by a durable source change and GitOps/CLI validation.
|
||||
|
||||
Recommended durable shape:
|
||||
Current durable control surface:
|
||||
|
||||
- A dedicated G14 infrastructure desired-state path for `devops-infra` observability resources.
|
||||
- A dedicated Argo CD Application or an equivalent UniDesk-controlled apply surface for that path.
|
||||
- `bun scripts/cli.ts hwlab g14 observability status` reads the G14 monitoring state through the controlled `G14:k3s` route and reports CRDs, Prometheus Operator readiness, Prometheus readiness, selected workload monitors and a bounded `up` query.
|
||||
- `bun scripts/cli.ts hwlab g14 observability apply --dry-run|--confirm` is the standard write path for the shared stack. It installs Prometheus Operator `v0.91.0`, Prometheus `v3.12.0`, Prometheus RBAC, the `devops-infra` Prometheus instance and the internal query Service.
|
||||
- `bun scripts/cli.ts hwlab g14 observability query --promql <expr>` is the controlled query path. It uses Kubernetes service proxy to the internal ClusterIP Service and must not expose Prometheus through FRP, NodePort or LoadBalancer.
|
||||
- Cluster-scoped CRDs and ClusterRole/ClusterRoleBinding resources are owned by the infrastructure path, not by a HWLAB lane Application whose destination is only `hwlab-v02`.
|
||||
- Runtime workloads in `devops-infra` labeled with `app.kubernetes.io/part-of=devops-infra` and component labels such as `observability`, `prometheus`, `operator` or `query`.
|
||||
|
||||
Future GitOps work may move the same desired state behind a dedicated G14 infrastructure Argo CD Application. Until that exists, the UniDesk CLI source is the stable audited desired-state entry, and direct native `kubectl` remains only an implementation detail inside that CLI.
|
||||
|
||||
Do not attach Prometheus Operator CRDs, Prometheus Deployments, Grafana or Alertmanager to `hwlab-g14-v02`. That Argo Application is scoped to the HWLAB v0.2 runtime namespace and must remain a lane-specific application rollout controller.
|
||||
|
||||
## Security
|
||||
|
||||
+518
-2
@@ -38,6 +38,14 @@ const GIT_MIRROR_NAMESPACE = "devops-infra";
|
||||
// --- devops-infra git mirror identifiers ---
const GIT_MIRROR_MANIFEST_FIELD_MANAGER = "unidesk-hwlab-git-mirror";
const GIT_MIRROR_SYNC_JOB_PREFIX = "git-mirror-hwlab-sync-manual";
const GIT_MIRROR_LEGACY_CRONJOB = "git-mirror-hwlab-sync";

// --- G14 shared observability stack ---
/** Namespace that hosts the Prometheus Operator and the shared Prometheus instance. */
const G14_OBSERVABILITY_NAMESPACE = "devops-infra";
/** Value passed to `kubectl apply --field-manager` by the observability apply script. */
const G14_OBSERVABILITY_FIELD_MANAGER = "unidesk-g14-observability";
/** Pinned Prometheus Operator release; also selects the bundle download URL below. */
const G14_PROMETHEUS_OPERATOR_VERSION = "v0.91.0";
/** Pinned Prometheus version written into the Prometheus custom resource. */
const G14_PROMETHEUS_VERSION = "v3.12.0";
/** Name of the Prometheus custom resource. */
const G14_PROMETHEUS_NAME = "g14-shared";
/** ClusterIP Service used for service-proxy queries. */
const G14_PROMETHEUS_SERVICE = "prometheus-g14-shared";
/** Name shared by the ServiceAccount, ClusterRole and ClusterRoleBinding of the stack. */
const G14_PROMETHEUS_SERVICE_ACCOUNT = "g14-observability-prometheus";
/** Upstream release asset (`bundle.yaml`) for the pinned operator version. */
const G14_PROMETHEUS_OPERATOR_RELEASE_ASSET =
  "https://github.com/prometheus-operator/prometheus-operator/releases/download/"
  + G14_PROMETHEUS_OPERATOR_VERSION
  + "/bundle.yaml";
|
||||
const V02_SERVICE_IDS = [
|
||||
"hwlab-cloud-api",
|
||||
"hwlab-cloud-web",
|
||||
@@ -131,6 +139,15 @@ interface G14GitMirrorOptions {
|
||||
timeoutSeconds: number;
|
||||
}
|
||||
|
||||
/**
 * Parsed options for the `hwlab g14 observability` command family.
 */
interface G14ObservabilityOptions {
  /** Sub-command selected by the first positional argument. */
  action: "status" | "apply" | "query";
  /** True when no cluster mutation should happen. */
  dryRun: boolean;
  /** True when `--confirm` was passed. */
  confirm: boolean;
  /** True when `--wait` was passed. */
  wait: boolean;
  /** Timeout budget in seconds for the remote command. */
  timeoutSeconds: number;
  /** PromQL expression used by the `query` action. */
  query: string;
}
|
||||
|
||||
interface G14SecretOptions {
|
||||
action: "status" | "ensure";
|
||||
lane: "v02";
|
||||
@@ -370,6 +387,27 @@ function parseGitMirrorOptions(args: string[]): G14GitMirrorOptions {
|
||||
};
|
||||
}
|
||||
|
||||
/**
 * Parses the arguments that follow `hwlab g14 observability`.
 *
 * The first positional argument selects the action. `status` and `query` are
 * always reported as dry runs; `apply` is a dry run unless `--confirm` is
 * given. The PromQL expression comes from `--promql` (or `--query`) and falls
 * back to `up{namespace="hwlab-v02"}`.
 *
 * @param args Arguments after the `observability` keyword.
 * @returns The normalized option set.
 * @throws Error when the action is unknown, when `--confirm` and `--dry-run`
 *   are combined, or when the expression is empty, longer than 500 characters
 *   or spans more than one line.
 */
function parseObservabilityOptions(args: string[]): G14ObservabilityOptions {
  const [actionRaw] = args;
  if (actionRaw !== "status" && actionRaw !== "apply" && actionRaw !== "query") {
    throw new Error("observability usage: status|apply|query [--promql <expr>] [--dry-run|--confirm]");
  }

  const confirm = args.includes("--confirm");
  const explicitDryRun = args.includes("--dry-run");
  if (confirm && explicitDryRun) throw new Error("observability accepts only one of --confirm or --dry-run");

  const query = optionValue(args, "--promql") ?? optionValue(args, "--query") ?? 'up{namespace="hwlab-v02"}';
  // An empty expression can only produce an API error downstream; reject it
  // here alongside the other expression checks.
  if (query.trim().length === 0) throw new Error("--promql must not be empty");
  if (query.length > 500) throw new Error("--promql is limited to 500 characters");
  if (query.includes("\n") || query.includes("\r")) throw new Error("--promql must be a single-line expression");

  const readOnlyAction = actionRaw === "status" || actionRaw === "query";
  return {
    action: actionRaw,
    confirm,
    wait: args.includes("--wait"),
    dryRun: readOnlyAction ? true : explicitDryRun || !confirm,
    // apply gets a longer default budget; the last argument is presumably the
    // accepted upper bound (verify against positiveIntegerOption).
    timeoutSeconds: positiveIntegerOption(args, "--timeout-seconds", actionRaw === "apply" ? 240 : 120, 900),
    query,
  };
}
|
||||
|
||||
function parseSecretOptions(args: string[]): G14SecretOptions {
|
||||
const [actionRaw] = args;
|
||||
if (actionRaw !== "status" && actionRaw !== "ensure") {
|
||||
@@ -4121,6 +4159,473 @@ function runG14GitMirror(options: G14GitMirrorOptions): Record<string, unknown>
|
||||
return runGitMirrorSync(options);
|
||||
}
|
||||
|
||||
/**
 * Builds the label set attached to resources of the shared observability stack.
 *
 * @param component Value stored under the per-component label key.
 * @returns A fresh label map on every call.
 */
function observabilityLabels(component: string): Record<string, string> {
  const labels: Record<string, string> = {};
  labels["app.kubernetes.io/part-of"] = "devops-infra";
  labels["app.kubernetes.io/component"] = "observability";
  labels["g14.pikastech.local/observability-component"] = component;
  return labels;
}
|
||||
|
||||
/**
 * Returns the single label that marks a namespace as opted in to discovery by
 * the shared Prometheus instance.
 *
 * @returns A fresh one-entry label map on every call.
 */
function observabilityNamespaceLabel(): Record<string, string> {
  const discoveryLabelKey = "g14.pikastech.local/observability-discovery";
  return { [discoveryLabelKey]: "enabled" };
}
|
||||
|
||||
/**
 * Assembles the desired state of the shared Prometheus stack as a Kubernetes
 * `v1/List`: the two labeled namespaces, the Prometheus ServiceAccount with
 * its ClusterRole and ClusterRoleBinding, the `Prometheus` custom resource and
 * the ClusterIP query Service.
 *
 * @returns A plain JSON-serializable object; nothing is applied here.
 */
function g14PrometheusManifest(): Record<string, unknown> {
  // One selector object is shared by all namespace selectors and one by all
  // object selectors of the Prometheus spec.
  const namespaceSelector = { matchLabels: observabilityNamespaceLabel() };
  const monitorSelector = { matchLabels: { "hwlab.pikastech.local/monitoring": "enabled" } };

  const infraNamespace = {
    apiVersion: "v1",
    kind: "Namespace",
    metadata: {
      name: G14_OBSERVABILITY_NAMESPACE,
      labels: {
        ...observabilityNamespaceLabel(),
        ...observabilityLabels("namespace"),
      },
    },
  };

  // The workload namespace only receives the discovery label.
  const workloadNamespace = {
    apiVersion: "v1",
    kind: "Namespace",
    metadata: {
      name: V02_RUNTIME_NAMESPACE,
      labels: observabilityNamespaceLabel(),
    },
  };

  const serviceAccount = {
    apiVersion: "v1",
    kind: "ServiceAccount",
    metadata: {
      name: G14_PROMETHEUS_SERVICE_ACCOUNT,
      namespace: G14_OBSERVABILITY_NAMESPACE,
      labels: observabilityLabels("prometheus"),
    },
  };

  // Read-only discovery permissions plus access to the `/metrics` endpoint.
  const clusterRole = {
    apiVersion: "rbac.authorization.k8s.io/v1",
    kind: "ClusterRole",
    metadata: {
      name: G14_PROMETHEUS_SERVICE_ACCOUNT,
      labels: observabilityLabels("prometheus"),
    },
    rules: [
      {
        apiGroups: [""],
        resources: ["nodes", "nodes/metrics", "services", "endpoints", "pods"],
        verbs: ["get", "list", "watch"],
      },
      {
        apiGroups: ["discovery.k8s.io"],
        resources: ["endpointslices"],
        verbs: ["get", "list", "watch"],
      },
      {
        apiGroups: ["networking.k8s.io"],
        resources: ["ingresses"],
        verbs: ["get", "list", "watch"],
      },
      {
        nonResourceURLs: ["/metrics"],
        verbs: ["get"],
      },
    ],
  };

  const clusterRoleBinding = {
    apiVersion: "rbac.authorization.k8s.io/v1",
    kind: "ClusterRoleBinding",
    metadata: {
      name: G14_PROMETHEUS_SERVICE_ACCOUNT,
      labels: observabilityLabels("prometheus"),
    },
    roleRef: {
      apiGroup: "rbac.authorization.k8s.io",
      kind: "ClusterRole",
      name: G14_PROMETHEUS_SERVICE_ACCOUNT,
    },
    subjects: [{
      kind: "ServiceAccount",
      name: G14_PROMETHEUS_SERVICE_ACCOUNT,
      namespace: G14_OBSERVABILITY_NAMESPACE,
    }],
  };

  const prometheus = {
    apiVersion: "monitoring.coreos.com/v1",
    kind: "Prometheus",
    metadata: {
      name: G14_PROMETHEUS_NAME,
      namespace: G14_OBSERVABILITY_NAMESPACE,
      labels: observabilityLabels("prometheus"),
    },
    spec: {
      replicas: 1,
      version: G14_PROMETHEUS_VERSION,
      serviceAccountName: G14_PROMETHEUS_SERVICE_ACCOUNT,
      scrapeInterval: "30s",
      evaluationInterval: "30s",
      retention: "7d",
      resources: {
        requests: { cpu: "100m", memory: "256Mi" },
        limits: { cpu: "500m", memory: "1Gi" },
      },
      storage: {
        volumeClaimTemplate: {
          spec: {
            storageClassName: "local-path",
            accessModes: ["ReadWriteOnce"],
            resources: { requests: { storage: "10Gi" } },
          },
        },
      },
      serviceMonitorSelector: monitorSelector,
      serviceMonitorNamespaceSelector: namespaceSelector,
      podMonitorSelector: monitorSelector,
      podMonitorNamespaceSelector: namespaceSelector,
      ruleSelector: monitorSelector,
      ruleNamespaceSelector: namespaceSelector,
      probeSelector: monitorSelector,
      probeNamespaceSelector: namespaceSelector,
    },
  };

  // Internal-only Service that selects pods by their `prometheus` label.
  const queryService = {
    apiVersion: "v1",
    kind: "Service",
    metadata: {
      name: G14_PROMETHEUS_SERVICE,
      namespace: G14_OBSERVABILITY_NAMESPACE,
      labels: observabilityLabels("query"),
    },
    spec: {
      type: "ClusterIP",
      selector: {
        prometheus: G14_PROMETHEUS_NAME,
      },
      ports: [{
        name: "web",
        port: 9090,
        targetPort: "web",
      }],
    },
  };

  return {
    apiVersion: "v1",
    kind: "List",
    items: [
      infraNamespace,
      workloadNamespace,
      serviceAccount,
      clusterRole,
      clusterRoleBinding,
      prometheus,
      queryService,
    ],
  };
}
|
||||
|
||||
/**
 * Decodes the stdout of one shell section as a JSON object.
 *
 * @param section Section captured from the remote script, or `undefined`.
 * @returns The decoded value passed through `record`, or `{}` when the
 *   section is missing, its stdout is blank, or decoding throws.
 */
function parseSectionJson(section: ShellSection | undefined): Record<string, unknown> {
  const payload = String(section?.stdout ?? "").trim();
  if (payload === "") {
    return {};
  }
  try {
    const decoded: unknown = JSON.parse(payload);
    return record(decoded);
  } catch {
    // Undecodable output is treated the same as no output.
    return {};
  }
}
|
||||
|
||||
/**
 * Decodes the stdout of one shell section and returns its `items` array.
 *
 * Any object with an array-valued `items` field is accepted. The previous
 * `kind === "List"` check selected the same array in both arms of the
 * ternary, so it is dropped.
 *
 * @param section Section captured from the remote script, or `undefined`.
 * @returns Each entry of `items` passed through `record`, or `[]` when the
 *   section has no array-valued `items`.
 */
function parseSectionJsonArray(section: ShellSection | undefined): Record<string, unknown>[] {
  const rawItems: unknown = parseSectionJson(section).items;
  if (!Array.isArray(rawItems)) return [];
  return rawItems.map((item) => record(item));
}
|
||||
|
||||
/**
 * Looks up the `status` of the first condition whose `type` matches.
 *
 * @param items Condition objects to search, in order.
 * @param type Condition type to look for.
 * @returns The status when it is a string; `null` when no condition matches
 *   or the first match carries a non-string status.
 */
function conditionStatus(items: Record<string, unknown>[], type: string): string | null {
  const match = items.find((candidate) => candidate.type === type);
  if (match === undefined) return null;
  const { status } = match;
  return typeof status === "string" ? status : null;
}
|
||||
|
||||
/**
 * Reports whether a Deployment object has reached its desired replica count.
 *
 * @param deployment Deployment object as decoded from `kubectl get -o json`.
 * @returns True when both `status.readyReplicas` and
 *   `status.availableReplicas` (each defaulting to 0) are at least
 *   `spec.replicas` (defaulting to 1).
 */
function deploymentReady(deployment: Record<string, unknown>): boolean {
  const { replicas } = record(deployment.spec);
  const { readyReplicas, availableReplicas } = record(deployment.status);
  const wanted = numericValue(replicas) ?? 1;
  const observedCounts = [
    numericValue(readyReplicas) ?? 0,
    numericValue(availableReplicas) ?? 0,
  ];
  return observedCounts.every((count) => count >= wanted);
}
|
||||
|
||||
/**
 * Reports whether a `Prometheus` custom resource is serving.
 *
 * Only the `Available` condition counts. `Reconciled=True` only says the
 * operator has synced the underlying resources with the spec; it can be true
 * while no pod is ready, so it is no longer accepted as a readiness signal.
 * This matches the apply script, which waits on `condition=Available`.
 *
 * @param prometheus Prometheus object as decoded from `kubectl get -o json`.
 * @returns True when `status.conditions` has an `Available` condition whose
 *   status is the string `"True"`.
 */
function prometheusReady(prometheus: Record<string, unknown>): boolean {
  // Read the field once so the array check and the mapping see the same value.
  const rawConditions: unknown = record(prometheus.status).conditions;
  const conditions = Array.isArray(rawConditions)
    ? rawConditions.map((item) => record(item))
    : [];
  return conditionStatus(conditions, "Available") === "True";
}
|
||||
|
||||
/**
 * Collects a read-only snapshot of the shared observability stack.
 *
 * One remote script runs a series of `kubectl get` probes, each wrapped in
 * begin/end markers so that the output can be split per probe afterwards.
 * The script uses `set +e`, so a failing probe does not stop later ones.
 *
 * @returns A report with per-area sub-objects (`crds`, `operator`,
 *   `prometheus`, `workloadMonitors`, `query`). The top-level `ok` requires a
 *   successful command, all required CRDs, a ready operator Deployment and an
 *   existing Prometheus object; Prometheus readiness and the `up` query are
 *   reported but do not gate it.
 */
function g14ObservabilityStatus(): Record<string, unknown> {
  const startedAtMs = Date.now();
  const discoveryLabelKey = "g14.pikastech.local/observability-discovery";
  const infraNs = shellQuote(G14_OBSERVABILITY_NAMESPACE);
  const workloadNs = shellQuote(V02_RUNTIME_NAMESPACE);
  const queryPath = `/api/v1/namespaces/${G14_OBSERVABILITY_NAMESPACE}/services/http:${G14_PROMETHEUS_SERVICE}:9090/proxy/api/v1/query?query=${encodeURIComponent("up")}`;
  const crds = [
    "servicemonitors.monitoring.coreos.com",
    "podmonitors.monitoring.coreos.com",
    "prometheusrules.monitoring.coreos.com",
    "prometheuses.monitoring.coreos.com",
    "alertmanagers.monitoring.coreos.com",
  ];

  // Each entry is "<section name> <command>"; the name becomes the key in `sections`.
  const probes = [
    `namespace kubectl get namespace ${infraNs} -o json`,
    `discoveryNamespace kubectl get namespace ${workloadNs} -o json`,
    `crds kubectl get crd ${crds.map(shellQuote).join(" ")} -o json`,
    `operator kubectl get deploy -n ${infraNs} prometheus-operator -o json`,
    `operatorPods kubectl get pods -n ${infraNs} -l app.kubernetes.io/name=prometheus-operator -o json`,
    `prometheus kubectl get prometheus -n ${infraNs} ${shellQuote(G14_PROMETHEUS_NAME)} -o json`,
    `prometheusPods kubectl get pods -n ${infraNs} -l prometheus=${shellQuote(G14_PROMETHEUS_NAME)} -o json`,
    `prometheusService kubectl get service -n ${infraNs} ${shellQuote(G14_PROMETHEUS_SERVICE)} -o json`,
    `workloadMonitors kubectl get servicemonitor,prometheusrule -n ${workloadNs} -l hwlab.pikastech.local/monitoring=enabled -o json`,
    `query kubectl get --raw ${shellQuote(queryPath)}`,
  ];
  const script = [
    "set +e",
    "section() {",
    " name=\"$1\"",
    " shift",
    " printf '__UNIDESK_SECTION_BEGIN__ %s\\n' \"$name\"",
    " \"$@\"",
    " code=$?",
    " printf '\\n__UNIDESK_SECTION_END__ %s exit=%s\\n' \"$name\" \"$code\"",
    "}",
    ...probes.map((probe) => `section ${probe}`),
  ].join("\n");

  const bundle = g14K3s(["script", "--", script], 120_000);
  const sections = parseShellSections(statusText(bundle));

  const namespace = parseSectionJson(sections.namespace);
  const discoveryNamespace = parseSectionJson(sections.discoveryNamespace);
  const crdNames = parseSectionJsonArray(sections.crds)
    .map((item) => String(record(item.metadata).name ?? ""))
    .filter(Boolean);
  const operator = parseSectionJson(sections.operator);
  const operatorPods = parseSectionJsonArray(sections.operatorPods);
  const prometheus = parseSectionJson(sections.prometheus);
  const prometheusPods = parseSectionJsonArray(sections.prometheusPods);
  const prometheusService = parseSectionJson(sections.prometheusService);
  const workloadMonitorItems = parseSectionJsonArray(sections.workloadMonitors);
  const query = parseSectionJson(sections.query);

  // Container statuses of a pod, or null when the pod reports none as an array.
  const containerStatusesOf = (pod: Record<string, unknown>): Record<string, unknown>[] | null => {
    const raw: unknown = record(pod.status).containerStatuses;
    return Array.isArray(raw) ? raw.map((item) => record(item)) : null;
  };
  const allContainersReady = (statuses: Record<string, unknown>[]): boolean =>
    statuses.every((status) => status.ready === true);
  const labelOf = (ns: Record<string, unknown>) =>
    stringOrNull(record(record(ns.metadata).labels)[discoveryLabelKey]);

  const requiredCrdsPresent = crds.every((name) => crdNames.includes(name));
  const namespaceLabel = labelOf(namespace);
  const workloadNamespaceLabel = labelOf(discoveryNamespace);
  const operatorIsReady = Object.keys(operator).length > 0 && deploymentReady(operator);
  const prometheusExists = Object.keys(prometheus).length > 0;
  // Ready when the custom resource says so, or when at least one pod has a
  // non-empty list of containers that are all ready.
  const prometheusIsReady = prometheusExists && (
    prometheusReady(prometheus)
    || prometheusPods.some((pod) => {
      const statuses = containerStatusesOf(pod) ?? [];
      return statuses.length > 0 && allContainersReady(statuses);
    })
  );
  const queryOk = sections.query?.exitCode === 0 && query.status === "success";
  const stackInstalled = requiredCrdsPresent && operatorIsReady && prometheusExists;

  const rawConditions: unknown = record(prometheus.status).conditions;
  const queryResult = nested(query, ["data", "result"]);
  const workloadMonitorsOk = shellSectionOk(sections.workloadMonitors);
  const querySectionOk = shellSectionOk(sections.query);

  return {
    ok: isCommandSuccess(bundle) && stackInstalled,
    command: "hwlab g14 observability status",
    namespace: G14_OBSERVABILITY_NAMESPACE,
    mode: "status",
    elapsedMs: Date.now() - startedAtMs,
    versions: {
      prometheusOperator: G14_PROMETHEUS_OPERATOR_VERSION,
      prometheus: G14_PROMETHEUS_VERSION,
      operatorBundle: G14_PROMETHEUS_OPERATOR_RELEASE_ASSET,
    },
    discovery: {
      namespaceLabel,
      workloadNamespace: V02_RUNTIME_NAMESPACE,
      workloadNamespaceLabel,
      selectorLabel: "hwlab.pikastech.local/monitoring=enabled",
    },
    crds: {
      ok: requiredCrdsPresent,
      required: crds,
      present: crdNames,
      missing: crds.filter((name) => !crdNames.includes(name)),
      sectionOk: shellSectionOk(sections.crds),
    },
    operator: {
      ok: operatorIsReady,
      deployment: stringOrNull(record(operator.metadata).name),
      desiredReplicas: numericValue(record(operator.spec).replicas) ?? 1,
      readyReplicas: numericValue(record(operator.status).readyReplicas) ?? 0,
      availableReplicas: numericValue(record(operator.status).availableReplicas) ?? 0,
      pods: operatorPods.map((pod) => ({
        name: stringOrNull(record(pod.metadata).name),
        phase: stringOrNull(record(pod.status).phase),
      })),
      sectionOk: shellSectionOk(sections.operator),
    },
    prometheus: {
      ok: prometheusIsReady,
      name: G14_PROMETHEUS_NAME,
      service: G14_PROMETHEUS_SERVICE,
      serviceExists: Object.keys(prometheusService).length > 0,
      ready: prometheusIsReady,
      conditions: Array.isArray(rawConditions) ? rawConditions : [],
      pods: prometheusPods.map((pod) => {
        const statuses = containerStatusesOf(pod);
        return {
          name: stringOrNull(record(pod.metadata).name),
          phase: stringOrNull(record(pod.status).phase),
          // null means the pod reported no container status array at all.
          ready: statuses === null ? null : allContainersReady(statuses),
        };
      }),
      sectionOk: shellSectionOk(sections.prometheus),
    },
    workloadMonitors: {
      ok: workloadMonitorsOk,
      namespace: V02_RUNTIME_NAMESPACE,
      count: workloadMonitorItems.length,
      items: workloadMonitorItems.map((item) => ({
        kind: item.kind ?? null,
        name: stringOrNull(record(item.metadata).name),
      })),
      stderr: workloadMonitorsOk ? "" : commandErrorSummary(bundle),
    },
    query: {
      ok: queryOk,
      promql: "up",
      serviceProxyPath: queryPath,
      resultType: nested(query, ["data", "resultType"]) ?? null,
      resultCount: Array.isArray(queryResult) ? queryResult.length : null,
      status: query.status ?? null,
      sectionOk: querySectionOk,
      stderr: querySectionOk ? "" : commandErrorSummary(bundle),
    },
    result: compactCommandResult(bundle),
    next: stackInstalled
      ? { query: 'bun scripts/cli.ts hwlab g14 observability query --promql \'up{namespace="hwlab-v02"}\'' }
      : { apply: "bun scripts/cli.ts hwlab g14 observability apply --confirm" },
  };
}
|
||||
|
||||
/**
 * Renders the POSIX shell script that installs (or dry-runs) the shared
 * observability stack on the remote host.
 *
 * Script outline: decode the stack manifest into a temp directory, download
 * the pinned operator bundle, re-namespace it with `kubectl kustomize`,
 * apply namespace and operator server-side, wait for CRDs and the operator
 * rollout, apply the stack, then print diagnostics. In dry-run mode the
 * namespace and operator use `--dry-run=server`, the `Prometheus` custom
 * resource is filtered out of the stack, the rest is validated client-side,
 * and all waits are skipped.
 *
 * The rendered-bundle sanity check now greps for the script's `$namespace`
 * variable instead of the literal `devops-infra`, so it stays in step with
 * the kustomization, which already uses `G14_OBSERVABILITY_NAMESPACE`.
 *
 * @param options Parsed options; `dryRun` and `timeoutSeconds` are read.
 * @param manifestB64 Base64-encoded JSON of the stack manifest.
 * @returns The script text, one command per line.
 */
function g14ObservabilityApplyScript(options: G14ObservabilityOptions, manifestB64: string): string {
  const quotedNamespace = shellQuote(G14_OBSERVABILITY_NAMESPACE);
  const serverSideApply = `kubectl apply --server-side --force-conflicts --field-manager=${shellQuote(G14_OBSERVABILITY_FIELD_MANAGER)}`;
  const dryRunArg = options.dryRun ? "--dry-run=server" : "";

  // NOTE(review): the dry-run filter below runs `node` on the remote host;
  // confirm that host provides it.
  const stackDryRunCommand = options.dryRun
    ? [
      "core_stack_path=\"$tmpdir/g14-prometheus-core-stack.json\"",
      "node - \"$stack_path\" \"$core_stack_path\" <<'NODE'",
      "const fs = require('node:fs');",
      "const input = process.argv[2];",
      "const output = process.argv[3];",
      "const stack = JSON.parse(fs.readFileSync(input, 'utf8'));",
      "stack.items = (stack.items || []).filter((item) => item.kind !== 'Prometheus');",
      "fs.writeFileSync(output, JSON.stringify(stack));",
      "NODE",
      "kubectl apply --dry-run=client --validate=false -f \"$core_stack_path\"",
      "echo prometheus_cr_dry_run=skipped_until_monitoring_crds_are_installed",
    ].join("\n")
    : `${serverSideApply} -f "$stack_path"`;

  // The Prometheus custom resource can only be applied once its CRD is
  // established and the operator is running.
  const preStackWaitCommands = options.dryRun
    ? "echo observability_wait=skipped_dry_run"
    : [
      "kubectl wait --for=condition=Established --timeout=120s crd/servicemonitors.monitoring.coreos.com crd/podmonitors.monitoring.coreos.com crd/prometheusrules.monitoring.coreos.com crd/prometheuses.monitoring.coreos.com",
      `kubectl -n ${quotedNamespace} rollout status deploy/prometheus-operator --timeout=${options.timeoutSeconds}s`,
    ].join("\n");

  // Post-apply commands are best effort (`|| true`): they only add diagnostics.
  const postStackWaitCommands = options.dryRun
    ? "echo prometheus_wait=skipped_dry_run"
    : [
      `kubectl -n ${quotedNamespace} wait --for=condition=Available --timeout=${options.timeoutSeconds}s prometheus/${G14_PROMETHEUS_NAME} || true`,
      `kubectl -n ${quotedNamespace} get deploy,pod,svc,prometheus -l app.kubernetes.io/component=observability -o wide || true`,
      `kubectl -n ${quotedNamespace} get pod -l prometheus=${shellQuote(G14_PROMETHEUS_NAME)} -o wide || true`,
    ].join("\n");

  return [
    "set -eu",
    `namespace=${quotedNamespace}`,
    `bundle_url=${shellQuote(G14_PROMETHEUS_OPERATOR_RELEASE_ASSET)}`,
    `operator_version=${shellQuote(G14_PROMETHEUS_OPERATOR_VERSION)}`,
    `prometheus_version=${shellQuote(G14_PROMETHEUS_VERSION)}`,
    `stack_b64=${shellQuote(manifestB64)}`,
    "tmpdir=$(mktemp -d /tmp/g14-observability-XXXXXX)",
    "cleanup() { rm -rf \"$tmpdir\"; }",
    "trap cleanup EXIT",
    "bundle_path=\"$tmpdir/operator-bundle.yaml\"",
    "operator_path=\"$tmpdir/operator-rendered.yaml\"",
    "stack_path=\"$tmpdir/g14-prometheus-stack.json\"",
    "printf '%s' \"$stack_b64\" | base64 -d > \"$stack_path\"",
    // Proxy defaults apply only when the environment does not set them already.
    "export HTTP_PROXY=${HTTP_PROXY:-http://127.0.0.1:10808}",
    "export HTTPS_PROXY=${HTTPS_PROXY:-http://127.0.0.1:10808}",
    "export http_proxy=$HTTP_PROXY",
    "export https_proxy=$HTTPS_PROXY",
    "export NO_PROXY=${NO_PROXY:-localhost,127.0.0.1,::1,10.0.0.0/8,10.42.0.0/16,10.43.0.0/16,.svc,.svc.cluster.local,.cluster.local,kubernetes,kubernetes.default,kubernetes.default.svc}",
    "export no_proxy=$NO_PROXY",
    "curl -fsSL --connect-timeout 20 --retry 3 --retry-delay 2 -o \"$bundle_path\" \"$bundle_url\"",
    "cat > \"$tmpdir/kustomization.yaml\" <<'YAML'",
    "apiVersion: kustomize.config.k8s.io/v1beta1",
    "kind: Kustomization",
    `namespace: ${G14_OBSERVABILITY_NAMESPACE}`,
    "resources:",
    "- operator-bundle.yaml",
    "YAML",
    "kubectl kustomize \"$tmpdir\" > \"$operator_path\"",
    // Under `set -e` this aborts the script when the re-namespacing had no effect.
    "grep -q \"namespace: $namespace\" \"$operator_path\"",
    `kubectl create namespace ${quotedNamespace} --dry-run=client -o yaml | ${serverSideApply} ${dryRunArg} -f -`,
    `${serverSideApply} ${dryRunArg} -f "$operator_path"`,
    preStackWaitCommands,
    stackDryRunCommand,
    postStackWaitCommands,
    `printf 'observability_apply=ok namespace=%s operator=%s prometheus=%s dryRun=%s\\n' "$namespace" "$operator_version" "$prometheus_version" ${shellQuote(String(options.dryRun))}`,
  ].join("\n");
}
|
||||
|
||||
/**
 * Executes the observability apply script over the G14 k3s route and builds
 * the command report.
 *
 * @param options Parsed options; `dryRun` selects the mode and
 *   `timeoutSeconds` (plus a fixed 90 s margin) bounds the remote command.
 * @returns A report that echoes the manifest in dry-run mode and embeds a
 *   fresh status snapshot after a successful confirmed apply.
 */
function runG14ObservabilityApply(options: G14ObservabilityOptions): Record<string, unknown> {
  const startedAtMs = Date.now();
  const { dryRun, timeoutSeconds } = options;

  const manifest = g14PrometheusManifest();
  const encodedManifest = Buffer.from(JSON.stringify(manifest), "utf8").toString("base64");
  const remoteScript = g14ObservabilityApplyScript(options, encodedManifest);
  const result = g14K3s(["script", "--", remoteScript], timeoutSeconds * 1000 + 90_000);
  const ok = isCommandSuccess(result);

  const queryHint = 'bun scripts/cli.ts hwlab g14 observability query --promql \'up{namespace="hwlab-v02"}\'';
  const next = dryRun
    ? { apply: "bun scripts/cli.ts hwlab g14 observability apply --confirm" }
    : { status: "bun scripts/cli.ts hwlab g14 observability status", query: queryHint };

  return {
    ok,
    command: "hwlab g14 observability apply",
    mode: dryRun ? "dry-run" : "confirmed-apply",
    namespace: G14_OBSERVABILITY_NAMESPACE,
    versions: {
      prometheusOperator: G14_PROMETHEUS_OPERATOR_VERSION,
      prometheus: G14_PROMETHEUS_VERSION,
      operatorBundle: G14_PROMETHEUS_OPERATOR_RELEASE_ASSET,
    },
    manifest: dryRun ? manifest : undefined,
    // Measured before the follow-up status snapshot below runs.
    elapsedMs: Date.now() - startedAtMs,
    result: compactCommandResult(result),
    status: ok && !dryRun ? g14ObservabilityStatus() : undefined,
    next,
  };
}
|
||||
|
||||
/**
 * Runs one PromQL instant query through the Kubernetes service proxy of the
 * internal Prometheus Service.
 *
 * @param options Parsed options; `query` is URL-encoded into the proxy path
 *   and `timeoutSeconds` bounds the command.
 * @returns A report with the decoded response. When the response does not
 *   decode to a non-empty object, `raw` carries the tail of the command
 *   output instead.
 */
function runG14ObservabilityQuery(options: G14ObservabilityOptions): Record<string, unknown> {
  const encodedQuery = encodeURIComponent(options.query);
  const serviceProxyPath = `/api/v1/namespaces/${G14_OBSERVABILITY_NAMESPACE}/services/http:${G14_PROMETHEUS_SERVICE}:9090/proxy/api/v1/query?query=${encodedQuery}`;
  const result = g14K3s(["kubectl", "get", "--raw", serviceProxyPath], options.timeoutSeconds * 1000);

  let parsed: Record<string, unknown>;
  try {
    parsed = record(JSON.parse(statusText(result)) as unknown);
  } catch {
    // Non-JSON output (for example a kubectl error) is reported through `raw`.
    parsed = {};
  }

  const series = nested(parsed, ["data", "result"]);
  const undecoded = Object.keys(parsed).length === 0;

  return {
    ok: isCommandSuccess(result) && parsed.status === "success",
    command: "hwlab g14 observability query",
    namespace: G14_OBSERVABILITY_NAMESPACE,
    service: G14_PROMETHEUS_SERVICE,
    promql: options.query,
    serviceProxyPath,
    status: parsed.status ?? null,
    resultType: nested(parsed, ["data", "resultType"]) ?? null,
    resultCount: Array.isArray(series) ? series.length : null,
    data: parsed.data ?? null,
    raw: undecoded ? tailText(statusText(result), 4000) : undefined,
    commandResult: compactCommandResult(result),
  };
}
|
||||
|
||||
/**
 * Dispatches an observability sub-command to its implementation.
 *
 * @param options Parsed options; `action` selects the handler.
 * @returns The report produced by the selected handler.
 */
function runG14Observability(options: G14ObservabilityOptions): Record<string, unknown> {
  switch (options.action) {
    case "status":
      return g14ObservabilityStatus();
    case "query":
      return runG14ObservabilityQuery(options);
    default:
      // Only "apply" remains.
      return runG14ObservabilityApply(options);
  }
}
|
||||
|
||||
function startAsyncHwlabG14Job(name: string, command: string[], note: string): Record<string, unknown> {
|
||||
const job = startJob(name, command, note);
|
||||
const statusCommand = `bun scripts/cli.ts job status ${job.id} --tail-bytes 12000`;
|
||||
@@ -5658,11 +6163,15 @@ export function hwlabG14Help(): Record<string, unknown> {
|
||||
"bun scripts/cli.ts hwlab g14 git-mirror flush --confirm",
|
||||
"bun scripts/cli.ts hwlab g14 git-mirror sync --confirm --wait",
|
||||
"bun scripts/cli.ts hwlab g14 git-mirror flush --confirm --wait",
|
||||
"bun scripts/cli.ts hwlab g14 observability status",
|
||||
"bun scripts/cli.ts hwlab g14 observability apply --dry-run",
|
||||
"bun scripts/cli.ts hwlab g14 observability apply --confirm",
|
||||
"bun scripts/cli.ts hwlab g14 observability query --promql 'up{namespace=\"hwlab-v02\"}'",
|
||||
"bun scripts/cli.ts hwlab g14 tools-image status --name ci-node-tools --tag node22-alpine-bun-v1",
|
||||
"bun scripts/cli.ts hwlab g14 tools-image build --name ci-node-tools --tag node22-alpine-bun-v1 --confirm",
|
||||
"bun scripts/cli.ts job status <jobId> --tail-bytes 30000",
|
||||
],
|
||||
description: "G14 HWLAB PR monitor, DEV rollout command, bounded v0.2 control-plane bootstrap/cleanup/runtime-migration helper, v0.2 runtime SecretRef bootstrap, devops-infra git mirror maintenance, and controlled CI tools image build/status entry. The public monitor starts a fire-and-forget job. Default monitor lane is base=G14; --lane v02 monitors base=v0.2 PRs, waits for GitHub preflight/CI readiness, automatically merges ready PRs without waiting for other active v0.2 PipelineRuns, triggers v0.2 CD with latest-only GitOps writeback, flushes the git mirror when needed, and posts deduplicated PR comments for pending, blocked/conflict, success, superseded, failure, or timeout states. confirmed control-plane trigger-current and git-mirror sync/flush also return async jobs by default, with --wait reserved for explicit synchronous debugging. control-plane status/closeout/apply/cleanup-runs/cleanup-released-pvs/runtime-migration uses UniDesk G14:k3s routes for v0.2 Tekton/Argo control resources, runtime migration, historical PipelineRun/source-commit closeout verdicts, GitOps mirror flush state, and completed CI workspace retention only. secret status/ensure is the standard v0.2 runtime SecretRef bootstrap path; it never reads or prints secret values. git-mirror status/apply/sync/flush is the manual devops-infra mirror/relay control path and does not install a CronJob.",
|
||||
description: "G14 HWLAB PR monitor, DEV rollout command, bounded v0.2 control-plane bootstrap/cleanup/runtime-migration helper, v0.2 runtime SecretRef bootstrap, devops-infra git mirror and observability maintenance, and controlled CI tools image build/status entry. The public monitor starts a fire-and-forget job. Default monitor lane is base=G14; --lane v02 monitors base=v0.2 PRs, waits for GitHub preflight/CI readiness, automatically merges ready PRs without waiting for other active v0.2 PipelineRuns, triggers v0.2 CD with latest-only GitOps writeback, flushes the git mirror when needed, and posts deduplicated PR comments for pending, blocked/conflict, success, superseded, failure, or timeout states. confirmed control-plane trigger-current and git-mirror sync/flush also return async jobs by default, with --wait reserved for explicit synchronous debugging. control-plane status/closeout/apply/cleanup-runs/cleanup-released-pvs/runtime-migration uses UniDesk G14:k3s routes for v0.2 Tekton/Argo control resources, runtime migration, historical PipelineRun/source-commit closeout verdicts, GitOps mirror flush state, and completed CI workspace retention only. secret status/ensure is the standard v0.2 runtime SecretRef bootstrap path; it never reads or prints secret values. git-mirror status/apply/sync/flush is the manual devops-infra mirror/relay control path and does not install a CronJob. observability status/apply/query owns the shared Prometheus Operator and Prometheus instance in devops-infra, while HWLAB lane manifests own only ServiceMonitor and PrometheusRule objects.",
|
||||
defaults: {
|
||||
repo: HWLAB_REPO,
|
||||
base: G14_SOURCE_BRANCH,
|
||||
@@ -5675,6 +6184,9 @@ export function hwlabG14Help(): Record<string, unknown> {
|
||||
devApplication: DEV_APP,
|
||||
v02Application: V02_APP,
|
||||
briefIndexIssue: G14_BRIEF_INDEX_ISSUE,
|
||||
observabilityNamespace: G14_OBSERVABILITY_NAMESPACE,
|
||||
prometheusOperatorVersion: G14_PROMETHEUS_OPERATOR_VERSION,
|
||||
prometheusVersion: G14_PROMETHEUS_VERSION,
|
||||
},
|
||||
stateFiles: {
|
||||
monitor: ".state/hwlab-g14/latest-monitor-job.json",
|
||||
@@ -5780,8 +6292,12 @@ export async function runHwlabG14Command(_config: Config, args: string[]): Promi
|
||||
}
|
||||
return runG14GitMirror(options);
|
||||
}
|
||||
if (action === "observability") {
|
||||
const options = parseObservabilityOptions(args.slice(1));
|
||||
return runG14Observability(options);
|
||||
}
|
||||
if (action !== "monitor-prs") {
|
||||
return { ok: false, command: `hwlab g14 ${action ?? ""}`.trim(), degradedReason: "unsupported-command", message: "supported commands: hwlab g14 monitor-prs, hwlab g14 record-rollout, hwlab g14 control-plane, hwlab g14 secret, hwlab g14 git-mirror, hwlab g14 tools-image" };
|
||||
return { ok: false, command: `hwlab g14 ${action ?? ""}`.trim(), degradedReason: "unsupported-command", message: "supported commands: hwlab g14 monitor-prs, hwlab g14 record-rollout, hwlab g14 control-plane, hwlab g14 secret, hwlab g14 git-mirror, hwlab g14 observability, hwlab g14 tools-image" };
|
||||
}
|
||||
const options = parseOptions(args.slice(1));
|
||||
if (options.worker) return runMonitorWorker(options);
|
||||
|
||||
Reference in New Issue
Block a user