diff --git a/config/hwlab-node-lanes.yaml b/config/hwlab-node-lanes.yaml index 4d7853b6..0538da0c 100644 --- a/config/hwlab-node-lanes.yaml +++ b/config/hwlab-node-lanes.yaml @@ -169,6 +169,7 @@ lanes: workbench: enabled: true summaryPath: /v1/web-performance/summary + lowSampleThreshold: 5 metricPrefixes: - hwlab_workbench_ - hwlab_webui_ @@ -180,6 +181,68 @@ lanes: backendLabelDenylist: - unknown maxUnknownEventLines: 0 + recordingRules: + - id: workbench_submit_first_visible_p95 + metric: hwlab:workbench_submit_first_visible:p95_seconds + sourceMetric: hwlab_workbench_journey_duration_seconds + quantile: 0.95 + window: 5m + minSamples: 5 + groupBy: [namespace, gitops_target, journey, route, backend, transport, entry, outcome] + matchLabels: + journey: submit_to_first_visible + - id: workbench_backend_event_visible_p95 + metric: hwlab:workbench_backend_event_visible:p95_seconds + sourceMetric: hwlab_workbench_backend_event_visible_latency_seconds + quantile: 0.95 + window: 5m + minSamples: 5 + groupBy: [namespace, gitops_target, event_type, backend, transport, outcome] + - id: workbench_session_switch_p95 + metric: hwlab:workbench_session_switch:p95_seconds + sourceMetric: hwlab_workbench_journey_duration_seconds + quantile: 0.95 + window: 5m + minSamples: 5 + groupBy: [namespace, gitops_target, journey, route, target_state, cache, source, outcome] + matchLabels: + journey: session_switch_first_visible|session_switch_full_load + - id: workbench_open_p95 + metric: hwlab:workbench_open:p95_seconds + sourceMetric: hwlab_workbench_journey_duration_seconds + quantile: 0.95 + window: 5m + minSamples: 5 + groupBy: [namespace, gitops_target, journey, route, cache, auth_state, outcome] + matchLabels: + journey: workbench_open_first_visible|workbench_open_full_load + warningAlerts: + - id: HWLABWorkbenchSubmitFirstVisibleSlow + ruleId: workbench_submit_first_visible_p95 + severity: warning + thresholdSeconds: 15 + minSamples: 5 + for: 10m + matchLabels: + journey: submit_to_first_visible + - id: HWLABWorkbenchBackendEventVisibleSlow + ruleId: workbench_backend_event_visible_p95 + severity: warning + thresholdSeconds: 10 + minSamples: 5 + for: 10m + - id: HWLABWorkbenchSessionSwitchSlow + ruleId: workbench_session_switch_p95 + severity: warning + thresholdSeconds: 8 + minSamples: 5 + for: 10m + - id: HWLABWorkbenchOpenSlow + ruleId: workbench_open_p95 + severity: warning + thresholdSeconds: 13 + minSamples: 5 + for: 10m runtimeImageRewrites: - source: fatedier/frpc:v0.68.1 target: 127.0.0.1:5000/hwlab/frpc:v0.68.1 diff --git a/scripts/src/hwlab-node-lanes.ts b/scripts/src/hwlab-node-lanes.ts index e5067df2..3cc920f3 100644 --- a/scripts/src/hwlab-node-lanes.ts +++ b/scripts/src/hwlab-node-lanes.ts @@ -87,6 +87,8 @@ export interface HwlabRuntimeObservabilitySpec { readonly prometheusOperator: boolean; readonly metricsEndpoint?: HwlabRuntimeObservabilityMetricsEndpointSpec; readonly workbench?: HwlabRuntimeObservabilityWorkbenchSpec; + readonly recordingRules: readonly HwlabRuntimeObservabilityRecordingRuleSpec[]; + readonly warningAlerts: readonly HwlabRuntimeObservabilityWarningAlertSpec[]; } export interface HwlabRuntimeObservabilityMetricsEndpointSpec { @@ -102,12 +104,34 @@ export interface HwlabRuntimeObservabilityMetricsEndpointSpec { export interface HwlabRuntimeObservabilityWorkbenchSpec { readonly enabled: boolean; readonly summaryPath: string; + readonly lowSampleThreshold: number; readonly metricPrefixes: readonly string[]; readonly requiredSeries: readonly string[]; readonly backendLabelDenylist: readonly string[]; readonly maxUnknownEventLines: number; } +export interface HwlabRuntimeObservabilityRecordingRuleSpec { + readonly id: string; + readonly metric: string; + readonly sourceMetric: string; + readonly quantile: number; + readonly window: string; + readonly minSamples: number; + readonly groupBy: readonly string[]; + readonly matchLabels: Record; +} + +export interface HwlabRuntimeObservabilityWarningAlertSpec { + readonly id: string; + readonly ruleId: string; + readonly severity: "warning"; + readonly thresholdSeconds: number; + readonly minSamples: number; + readonly for: string; + readonly matchLabels: Record; +} + export interface HwlabRuntimeImageRewriteSpec { readonly source: string; readonly target: string; @@ -295,6 +319,12 @@ function nonNegativeIntegerField(obj: Record, key: string, path return value; } +function positiveNumberField(obj: Record, key: string, path: string): number { + const value = obj[key]; + if (typeof value !== "number" || !Number.isFinite(value) || value <= 0) throw new Error(`${path}.${key} must be a positive number`); + return value; +} + function sortedRecordEntries(value: unknown, path: string): Array<[string, Record]> { return Object.entries(asRecord(value, path)).map(([key, item]) => [key, asRecord(item, `${path}.${key}`)]); } @@ -584,10 +614,18 @@ function publicExposureConfig(value: unknown, path: string): HwlabRuntimePublicE function observabilityConfig(value: unknown, path: string): HwlabRuntimeObservabilitySpec { const raw = asRecord(value, path); + const recordingRules = observabilityRecordingRulesConfig(raw.recordingRules, `${path}.recordingRules`); + const warningAlerts = observabilityWarningAlertsConfig(raw.warningAlerts, `${path}.warningAlerts`); + const recordingRuleIds = new Set(recordingRules.map((rule) => rule.id)); + for (const alert of warningAlerts) { + if (!recordingRuleIds.has(alert.ruleId)) throw new Error(`${path}.warningAlerts.${alert.id}.ruleId must reference a recordingRules id`); + } return { prometheusOperator: booleanField(raw, "prometheusOperator", path), metricsEndpoint: observabilityMetricsEndpointConfig(raw.metricsEndpoint, `${path}.metricsEndpoint`), workbench: observabilityWorkbenchConfig(raw.workbench, `${path}.workbench`), + recordingRules, + warningAlerts, }; } @@ -617,6 +655,7 @@ function observabilityWorkbenchConfig(value: unknown, path: string): HwlabRuntim return { enabled: booleanField(raw, "enabled", path), summaryPath: stringField(raw, "summaryPath", path), + lowSampleThreshold: numberField(raw, "lowSampleThreshold", path), metricPrefixes: stringArrayField(raw, "metricPrefixes", path), requiredSeries: stringArrayField(raw, "requiredSeries", path), backendLabelDenylist: stringArrayField(raw, "backendLabelDenylist", path), @@ -624,6 +663,47 @@ function observabilityWorkbenchConfig(value: unknown, path: string): HwlabRuntim }; } +function observabilityRecordingRulesConfig(value: unknown, path: string): HwlabRuntimeObservabilityRecordingRuleSpec[] { + if (value === undefined) return []; + if (!Array.isArray(value)) throw new Error(`${path} must be an array`); + return value.map((item, index) => { + const itemPath = `${path}[${index}]`; + const raw = asRecord(item, itemPath); + const quantile = positiveNumberField(raw, "quantile", itemPath); + if (quantile >= 1) throw new Error(`${itemPath}.quantile must be less than 1`); + return { + id: stringField(raw, "id", itemPath), + metric: stringField(raw, "metric", itemPath), + sourceMetric: stringField(raw, "sourceMetric", itemPath), + quantile, + window: stringField(raw, "window", itemPath), + minSamples: numberField(raw, "minSamples", itemPath), + groupBy: stringArrayField(raw, "groupBy", itemPath), + matchLabels: optionalStringRecord(raw.matchLabels, `${itemPath}.matchLabels`), + }; + }); +} + +function observabilityWarningAlertsConfig(value: unknown, path: string): HwlabRuntimeObservabilityWarningAlertSpec[] { + if (value === undefined) return []; + if (!Array.isArray(value)) throw new Error(`${path} must be an array`); + return value.map((item, index) => { + const itemPath = `${path}[${index}]`; + const raw = asRecord(item, itemPath); + const severity = stringField(raw, "severity", itemPath); + if (severity !== "warning") throw new Error(`${itemPath}.severity must be warning`); + return { + id: stringField(raw, "id", itemPath), + ruleId: stringField(raw, "ruleId", itemPath), + severity, + thresholdSeconds: positiveNumberField(raw, "thresholdSeconds", itemPath), + minSamples: numberField(raw, "minSamples", itemPath), + for: stringField(raw, "for", itemPath), + matchLabels: optionalStringRecord(raw.matchLabels, `${itemPath}.matchLabels`), + }; + }); +} + function runtimeImageRewritesConfig(value: unknown, path: string): HwlabRuntimeImageRewriteSpec[] { if (value === undefined) return []; if (!Array.isArray(value)) throw new Error(`${path} must be an array`); diff --git a/scripts/src/hwlab-node.ts b/scripts/src/hwlab-node.ts index 092bb413..6fa19d05 100644 --- a/scripts/src/hwlab-node.ts +++ b/scripts/src/hwlab-node.ts @@ -9,7 +9,7 @@ import { startJob } from "./jobs"; import { classifySshTcpPoolFailure } from "./ssh"; import { runHwlabG14Command } from "./hwlab-g14"; import { HWLAB_NODE_CONTROL_PLANE_CONFIG_PATH, hwlabNodeControlPlaneInfraHelp, runHwlabNodeControlPlaneInfra } from "./hwlab-node-control-plane"; -import { hwlabRuntimeLaneConfigPath, hwlabRuntimeLaneSpec, hwlabRuntimeLaneSpecForNode, isHwlabRuntimeLane, type HwlabRuntimeLane, type HwlabRuntimeLaneSpec, type HwlabRuntimeObservabilitySpec, type HwlabRuntimePublicExposureSpec } from "./hwlab-node-lanes"; +import { hwlabRuntimeLaneConfigPath, hwlabRuntimeLaneSpec, hwlabRuntimeLaneSpecForNode, isHwlabRuntimeLane, type HwlabRuntimeLane, type HwlabRuntimeLaneSpec, type HwlabRuntimeObservabilityRecordingRuleSpec, type HwlabRuntimeObservabilitySpec, type HwlabRuntimeObservabilityWarningAlertSpec, type HwlabRuntimePublicExposureSpec } from "./hwlab-node-lanes"; type SecretAction = "status" | "ensure" | "cleanup-owned-postgres" | "cleanup-obsolete"; type SecretPreset = "openfga" | "master-server-admin-api-key" | "bootstrap-admin" | "code-agent-provider" | "cloud-api-db" | "owned-postgres-cleanup" | "obsolete-secret-cleanup"; @@ -697,6 +697,8 @@ function nodeObservabilityWorkbenchSummary(options: NodeObservabilityOptions, ex podName, containerName: serviceStatus.containerName, }, + recordingRules: nodeObservabilityRecordingRuleSummaries(options.spec.observability), + warningAlerts: nodeObservabilityWarningAlertSummaries(options.spec.observability), metrics, probe: metricsProbe === null ? null : { ok: metricsProbe.ok, @@ -719,6 +721,8 @@ function summarizeNodeObservabilityStatus(status: Record): Reco const publicRawMetrics = record(status.publicRawMetrics); const workbenchSummary = record(status.workbenchSummary); const metrics = record(workbenchSummary.metrics); + const recordingRules = Array.isArray(workbenchSummary.recordingRules) ? workbenchSummary.recordingRules : []; + const warningAlerts = Array.isArray(workbenchSummary.warningAlerts) ? workbenchSummary.warningAlerts : []; return { ok: status.ok === true, command: status.command, @@ -744,8 +748,11 @@ function summarizeNodeObservabilityStatus(status: Record): Reco ok: workbenchSummary.ok === true, httpStatus: metrics.httpStatus ?? null, bodyBytes: metrics.bodyBytes ?? null, + recordingRuleCount: recordingRules.length, + warningAlertCount: warningAlerts.length, seriesByPrefix: metrics.seriesByPrefix ?? {}, missingSeries: metrics.missingSeries ?? [], + topSlowDimensions: metrics.topSlowDimensions ?? [], deniedBackendEventLineCount: metrics.deniedBackendEventLineCount ?? null, maxDeniedBackendEventLines: metrics.maxDeniedBackendEventLines ?? null, }, @@ -771,6 +778,8 @@ function nodeObservabilityRenderPlan(observability: HwlabRuntimeObservabilitySpe prometheusOperator: observability.prometheusOperator, metricsEndpoint: nodeObservabilityEndpointSummary(observability), workbench: observability.workbench ?? null, + recordingRules: nodeObservabilityRecordingRuleSummaries(observability), + warningAlerts: nodeObservabilityWarningAlertSummaries(observability), clusterResources: observability.prometheusOperator ? { source: "HWLAB GitOps rendered manifests", @@ -785,6 +794,68 @@ function nodeObservabilityRenderPlan(observability: HwlabRuntimeObservabilitySpe }; } +function nodeObservabilityRecordingRuleSummaries(observability: HwlabRuntimeObservabilitySpec): Array> { + return observability.recordingRules.map((rule) => ({ + ...rule, + expression: nodeObservabilityRecordingRuleExpression(rule), + sampleCountExpression: nodeObservabilitySampleCountExpression(rule, rule.matchLabels), + lowSampleGuardExpression: `${nodeObservabilitySampleCountExpression(rule, rule.matchLabels)} >= ${rule.minSamples}`, + })); +} + +function nodeObservabilityWarningAlertSummaries(observability: HwlabRuntimeObservabilitySpec): Array> { + const recordingRules = new Map(observability.recordingRules.map((rule) => [rule.id, rule])); + return observability.warningAlerts.map((alert) => { + const rule = recordingRules.get(alert.ruleId); + return { + ...alert, + expression: rule === undefined ? null : nodeObservabilityWarningAlertExpression(rule, alert), + recordingRule: rule === undefined ? null : rule.metric, + lowSampleGuardExpression: rule === undefined ? null : `${nodeObservabilitySampleCountExpression(rule, { ...rule.matchLabels, ...alert.matchLabels })} >= ${alert.minSamples}`, + }; + }); +} + +function nodeObservabilityRecordingRuleExpression(rule: HwlabRuntimeObservabilityRecordingRuleSpec): string { + const bucketGroupBy = uniqueStrings([...rule.groupBy, "le"]); + return `histogram_quantile(${formatPrometheusNumber(rule.quantile)}, sum by (${bucketGroupBy.join(", ")}) (rate(${prometheusMetricSelector(`${rule.sourceMetric}_bucket`, rule.matchLabels)}[${rule.window}])))`; +} + +function nodeObservabilitySampleCountExpression(rule: HwlabRuntimeObservabilityRecordingRuleSpec, matchLabels: Record): string { + const groupBy = uniqueStrings([...rule.groupBy]); + return `sum by (${groupBy.join(", ")}) (increase(${prometheusMetricSelector(`${rule.sourceMetric}_count`, matchLabels)}[${rule.window}]))`; +} + +function nodeObservabilityWarningAlertExpression(rule: HwlabRuntimeObservabilityRecordingRuleSpec, alert: HwlabRuntimeObservabilityWarningAlertSpec): string { + const sampleLabels = { ...rule.matchLabels, ...alert.matchLabels }; + const sampleGuard = `${nodeObservabilitySampleCountExpression(rule, sampleLabels)} >= ${alert.minSamples}`; + const vectorMatch = rule.groupBy.length === 0 ? "" : ` on (${uniqueStrings([...rule.groupBy]).join(", ")})`; + return `(${prometheusMetricSelector(rule.metric, alert.matchLabels)} > ${formatPrometheusNumber(alert.thresholdSeconds)}) and${vectorMatch} (${sampleGuard})`; +} + +function prometheusMetricSelector(metric: string, labels: Record): string { + const selector = prometheusLabelSelector(labels); + return selector.length === 0 ? metric : `${metric}${selector}`; +} + +function prometheusLabelSelector(labels: Record): string { + const entries = Object.entries(labels); + if (entries.length === 0) return ""; + return `{${entries.map(([key, value]) => `${key}${value.includes("|") ? "=~" : "="}"${escapePrometheusLabelValue(value)}"`).join(",")}}`; +} + +function escapePrometheusLabelValue(value: string): string { + return value.replace(/\\/gu, "\\\\").replace(/"/gu, "\\\""); +} + +function formatPrometheusNumber(value: number): string { + return Number.isInteger(value) ? String(value) : String(value); +} + +function uniqueStrings(values: readonly string[]): string[] { + return [...new Set(values)]; +} + function nodeObservabilityApplyPlan(observability: HwlabRuntimeObservabilitySpec): Record { const endpoint = observability.metricsEndpoint; if (endpoint === undefined) { @@ -903,6 +974,12 @@ function nodeObservabilityMetricsProbe(options: NodeObservabilityOptions, podNam requiredSeries: workbench?.requiredSeries ?? [], backendLabelDenylist: workbench?.backendLabelDenylist ?? [], maxDeniedBackendEventLines: workbench?.maxUnknownEventLines ?? 0, + lowSampleThreshold: workbench?.lowSampleThreshold ?? 0, + histogramMetrics: [ + { id: "workbench_journey", metric: "hwlab_workbench_journey_duration_seconds", kind: "workbench_journey", dimensionLabels: ["namespace", "gitops_target", "journey", "route", "backend", "transport", "target_state", "cache", "source", "entry", "outcome"] }, + { id: "workbench_event_phase", metric: "hwlab_workbench_event_phase_duration_seconds", kind: "workbench_event_phase", dimensionLabels: ["namespace", "gitops_target", "phase", "event_type", "backend", "transport", "outcome"] }, + { id: "workbench_backend_event_visible", metric: "hwlab_workbench_backend_event_visible_latency_seconds", kind: "workbench_backend_event_visible", dimensionLabels: ["namespace", "gitops_target", "event_type", "backend", "transport", "outcome"] }, + ], }), "utf8").toString("base64"); const source = [ "const http = require('node:http');", @@ -914,6 +991,42 @@ function nodeObservabilityMetricsProbe(options: NodeObservabilityOptions, podNam "function lineHasLabel(line, name, value) { return line.includes(`${name}=\\\"${String(value).replace(/\\\\/g, '\\\\\\\\').replace(/\\\"/g, '\\\\\\\"')}\\\"`); }", "function isBackendEventMetric(name) { return name.startsWith('hwlab_workbench_event_') || name.startsWith('hwlab_workbench_backend_event_'); }", "function compactLines(lines) { return lines.slice(0, 16).map((line) => line.length > 320 ? `${line.slice(0, 317)}...` : line); }", + "function parseLabels(raw) { const labels = {}; for (const part of String(raw || '').split(',')) { const index = part.indexOf('='); if (index <= 0) continue; const key = part.slice(0, index); let value = part.slice(index + 1); if (value.startsWith('\\\"') && value.endsWith('\\\"')) value = value.slice(1, -1); labels[key] = value.replace(/\\\\\\\"/g, '\\\"').replace(/\\\\\\\\/g, '\\\\'); } return labels; }", + "function labelKey(metric, labels) { const copy = { ...labels }; delete copy.le; return `${metric}|${Object.keys(copy).sort().map((key) => `${key}=${copy[key]}`).join('|')}`; }", + "function pickLabels(labels, names) { return Object.fromEntries(names.map((name) => [name, labels[name] || 'unknown'])); }", + "function quantile(row, q) { const count = Number(row.count || 0); if (count <= 0) return 0; const rank = Math.max(1, Math.ceil(count * q)); const buckets = row.buckets.filter((item) => item.le !== '+Inf').sort((left, right) => Number(left.le) - Number(right.le)); for (const bucket of buckets) { if (bucket.value >= rank) return Number(bucket.le); } return buckets.length ? Number(buckets[buckets.length - 1].le) : 0; }", + "function summarizeHistograms(sampleLines) {", + " const metricConfig = new Map(config.histogramMetrics.map((item) => [item.metric, item]));", + " const series = new Map();", + " for (const line of sampleLines) {", + " const match = /^([A-Za-z_:][A-Za-z0-9_:]*?)_(bucket|sum|count)\\{([^}]*)\\}\\s+([0-9.eE+-]+)/.exec(line);", + " if (!match || !metricConfig.has(match[1])) continue;", + " const metric = match[1];", + " const suffix = match[2];", + " const labels = parseLabels(match[3]);", + " const value = Number(match[4]);", + " if (!Number.isFinite(value)) continue;", + " const key = labelKey(metric, labels);", + " const row = series.get(key) || { metric, labels: { ...labels }, buckets: [], sum: 0, count: 0 };", + " if (suffix === 'bucket') row.buckets.push({ le: labels.le || '+Inf', value });", + " if (suffix === 'sum') row.sum = value;", + " if (suffix === 'count') row.count = value;", + " series.set(key, row);", + " }", + " const lowSampleThreshold = Number(config.lowSampleThreshold || 0);", + " const groups = Object.fromEntries(config.histogramMetrics.map((item) => [item.id, []]));", + " const rows = [];", + " for (const row of series.values()) {", + " const cfg = metricConfig.get(row.metric);", + " const sampleCount = Number(row.count || 0);", + " const output = { kind: cfg.kind, metric: row.metric, sampleCount, average: sampleCount > 0 ? Number((row.sum / sampleCount).toFixed(4)) : 0, p50: quantile(row, 0.5), p75: quantile(row, 0.75), p95: quantile(row, 0.95), lowSample: sampleCount > 0 && sampleCount < lowSampleThreshold, sampleState: sampleCount <= 0 ? 'empty' : sampleCount < lowSampleThreshold ? 'low-sample' : 'ok', dimensions: pickLabels(row.labels, cfg.dimensionLabels) };", + " groups[cfg.id].push(output);", + " rows.push(output);", + " }", + " for (const key of Object.keys(groups)) groups[key].sort((left, right) => right.p95 - left.p95 || right.sampleCount - left.sampleCount);", + " rows.sort((left, right) => right.p95 - left.p95 || right.sampleCount - left.sampleCount);", + " return { groups: Object.fromEntries(Object.entries(groups).map(([key, value]) => [key, value.slice(0, 24)])), topSlowDimensions: rows.slice(0, 12) };", + "}", "function summarize(text, statusCode) {", " const lines = text.split(/\\r?\\n/).map((line) => line.trim()).filter(Boolean);", " const sampleLines = lines.filter((line) => !line.startsWith('#'));", @@ -921,6 +1034,7 @@ function nodeObservabilityMetricsProbe(options: NodeObservabilityOptions, podNam " const requiredSeries = config.requiredSeries.map((name) => { const matching = sampleLines.filter((line) => lineMatchesMetric(line, name)); return { name, present: matching.length > 0, sampleCount: matching.length }; });", " const missingSeries = requiredSeries.filter((series) => !series.present).map((series) => series.name);", " const deniedBackendEventLines = sampleLines.filter((line) => { const name = metricNameFromLine(line) || ''; if (!isBackendEventMetric(name)) return false; return config.backendLabelDenylist.some((label) => lineHasLabel(line, 'backend', label)); });", + " const histogramSummary = summarizeHistograms(sampleLines);", " const ready = statusCode >= 200 && statusCode < 300 && missingSeries.length === 0 && deniedBackendEventLines.length <= config.maxDeniedBackendEventLines;", " return {", " ready,", @@ -933,6 +1047,9 @@ function nodeObservabilityMetricsProbe(options: NodeObservabilityOptions, podNam " requiredSeries,", " missingSeries,", " backendLabelDenylist: config.backendLabelDenylist,", + " lowSampleThreshold: config.lowSampleThreshold,", + " workbenchHistograms: histogramSummary.groups,", + " topSlowDimensions: histogramSummary.topSlowDimensions,", " deniedBackendEventLineCount: deniedBackendEventLines.length,", " maxDeniedBackendEventLines: config.maxDeniedBackendEventLines,", " deniedBackendEventLines: compactLines(deniedBackendEventLines),",