Merge pull request #460 from pikasTech/issue-1400-p7-workbench-rules

feat: add Workbench observability rule summaries
This commit is contained in:
Lyon
2026-06-17 19:56:51 +08:00
committed by GitHub
3 changed files with 261 additions and 1 deletions
+63
View File
@@ -169,6 +169,7 @@ lanes:
workbench:
enabled: true
summaryPath: /v1/web-performance/summary
lowSampleThreshold: 5
metricPrefixes:
- hwlab_workbench_
- hwlab_webui_
@@ -180,6 +181,68 @@ lanes:
backendLabelDenylist:
- unknown
maxUnknownEventLines: 0
recordingRules:
- id: workbench_submit_first_visible_p95
metric: hwlab:workbench_submit_first_visible:p95_seconds
sourceMetric: hwlab_workbench_journey_duration_seconds
quantile: 0.95
window: 5m
minSamples: 5
groupBy: [namespace, gitops_target, journey, route, backend, transport, entry, outcome]
matchLabels:
journey: submit_to_first_visible
- id: workbench_backend_event_visible_p95
metric: hwlab:workbench_backend_event_visible:p95_seconds
sourceMetric: hwlab_workbench_backend_event_visible_latency_seconds
quantile: 0.95
window: 5m
minSamples: 5
groupBy: [namespace, gitops_target, event_type, backend, transport, outcome]
- id: workbench_session_switch_p95
metric: hwlab:workbench_session_switch:p95_seconds
sourceMetric: hwlab_workbench_journey_duration_seconds
quantile: 0.95
window: 5m
minSamples: 5
groupBy: [namespace, gitops_target, journey, route, target_state, cache, source, outcome]
matchLabels:
journey: session_switch_first_visible|session_switch_full_load
- id: workbench_open_p95
metric: hwlab:workbench_open:p95_seconds
sourceMetric: hwlab_workbench_journey_duration_seconds
quantile: 0.95
window: 5m
minSamples: 5
groupBy: [namespace, gitops_target, journey, route, cache, auth_state, outcome]
matchLabels:
journey: workbench_open_first_visible|workbench_open_full_load
warningAlerts:
- id: HWLABWorkbenchSubmitFirstVisibleSlow
ruleId: workbench_submit_first_visible_p95
severity: warning
thresholdSeconds: 15
minSamples: 5
for: 10m
matchLabels:
journey: submit_to_first_visible
- id: HWLABWorkbenchBackendEventVisibleSlow
ruleId: workbench_backend_event_visible_p95
severity: warning
thresholdSeconds: 10
minSamples: 5
for: 10m
- id: HWLABWorkbenchSessionSwitchSlow
ruleId: workbench_session_switch_p95
severity: warning
thresholdSeconds: 8
minSamples: 5
for: 10m
- id: HWLABWorkbenchOpenSlow
ruleId: workbench_open_p95
severity: warning
thresholdSeconds: 13
minSamples: 5
for: 10m
runtimeImageRewrites:
- source: fatedier/frpc:v0.68.1
target: 127.0.0.1:5000/hwlab/frpc:v0.68.1
+80
View File
@@ -87,6 +87,8 @@ export interface HwlabRuntimeObservabilitySpec {
readonly prometheusOperator: boolean;
readonly metricsEndpoint?: HwlabRuntimeObservabilityMetricsEndpointSpec;
readonly workbench?: HwlabRuntimeObservabilityWorkbenchSpec;
readonly recordingRules: readonly HwlabRuntimeObservabilityRecordingRuleSpec[];
readonly warningAlerts: readonly HwlabRuntimeObservabilityWarningAlertSpec[];
}
export interface HwlabRuntimeObservabilityMetricsEndpointSpec {
@@ -102,12 +104,34 @@ export interface HwlabRuntimeObservabilityMetricsEndpointSpec {
/**
 * Workbench-specific observability settings of a runtime lane.
 */
export interface HwlabRuntimeObservabilityWorkbenchSpec {
  /** Flag read from the `enabled` key of the workbench config. */
  readonly enabled: boolean;
  /** Path of the workbench summary endpoint (string taken verbatim from config). */
  readonly summaryPath: string;
  /** Histogram series with a non-zero sample count below this value are reported as `low-sample` by the metrics probe. */
  readonly lowSampleThreshold: number;
  /** Metric name prefixes associated with the workbench. */
  readonly metricPrefixes: ReadonlyArray<string>;
  /** Series names the metrics probe expects to find; any missing one makes the probe not ready. */
  readonly requiredSeries: ReadonlyArray<string>;
  /** `backend` label values that count as denied on backend event metric lines. */
  readonly backendLabelDenylist: ReadonlyArray<string>;
  /** Upper bound on denied backend event lines tolerated by the metrics probe. */
  readonly maxUnknownEventLines: number;
}
/**
 * Declarative description of one quantile recording rule derived from a
 * histogram metric.
 */
export interface HwlabRuntimeObservabilityRecordingRuleSpec {
  /** Identifier that warning alerts point at through their `ruleId`. */
  readonly id: string;
  /** Name of the recorded metric; warning alert expressions select on it. */
  readonly metric: string;
  /** Base name of the source histogram; `_bucket` and `_count` suffixes are appended when expressions are built. */
  readonly sourceMetric: string;
  /** Quantile handed to `histogram_quantile`; config parsing requires 0 < quantile < 1. */
  readonly quantile: number;
  /** Range-vector window placed inside `[...]` of the generated expressions (e.g. `5m`). */
  readonly window: string;
  /** Minimum sample count used in the rule's low-sample guard expression. */
  readonly minSamples: number;
  /** Labels used in the `sum by (...)` aggregation. */
  readonly groupBy: ReadonlyArray<string>;
  /** Label matchers applied to the source metric; a value containing `|` is rendered as a regex matcher. */
  readonly matchLabels: Record<string, string>;
}
/**
 * Declarative description of one warning-level alert layered on top of a
 * recording rule.
 */
export interface HwlabRuntimeObservabilityWarningAlertSpec {
  /** Alert identifier. */
  readonly id: string;
  /** Id of the recording rule this alert evaluates; config parsing rejects ids that match no rule. */
  readonly ruleId: string;
  /** Only the literal `warning` is accepted. */
  readonly severity: "warning";
  /** The alert expression fires when the recorded metric is strictly greater than this value. */
  readonly thresholdSeconds: number;
  /** Minimum sample count required by the alert's low-sample guard. */
  readonly minSamples: number;
  /** Value of the `for` config key (e.g. `10m`); presumably the alert hold duration - confirm against the manifest renderer. */
  readonly for: string;
  /** Extra label matchers; applied to the recorded metric and merged over the rule's matchers for the sample guard. */
  readonly matchLabels: Record<string, string>;
}
export interface HwlabRuntimeImageRewriteSpec {
readonly source: string;
readonly target: string;
@@ -295,6 +319,12 @@ function nonNegativeIntegerField(obj: Record<string, unknown>, key: string, path
return value;
}
/**
 * Reads `obj[key]` and returns it when it is a finite number strictly greater
 * than zero.
 *
 * @param obj - Record holding the raw configuration values.
 * @param key - Property to read.
 * @param path - Dotted location of `obj`, used as the error message prefix.
 * @returns The validated number.
 * @throws Error when the value is missing, not a number, not finite, or not positive.
 */
function positiveNumberField(obj: Record<string, unknown>, key: string, path: string): number {
  const candidate = obj[key];
  if (typeof candidate === "number" && Number.isFinite(candidate) && candidate > 0) {
    return candidate;
  }
  throw new Error(`${path}.${key} must be a positive number`);
}
/**
 * Converts a record of records into `[key, record]` pairs, validating the
 * outer value and every member with `asRecord`.
 *
 * NOTE(review): despite the name, no sorting is applied here; pairs come back
 * in `Object.entries` order.
 *
 * @param value - Raw value expected to be a record whose members are records.
 * @param path - Dotted location of `value`, passed to `asRecord` for its errors.
 * @returns One pair per own enumerable key of `value`.
 */
function sortedRecordEntries(value: unknown, path: string): Array<[string, Record<string, unknown>]> {
  const pairs: Array<[string, Record<string, unknown>]> = [];
  for (const [key, item] of Object.entries(asRecord(value, path))) {
    pairs.push([key, asRecord(item, `${path}.${key}`)]);
  }
  return pairs;
}
@@ -584,10 +614,18 @@ function publicExposureConfig(value: unknown, path: string): HwlabRuntimePublicE
/**
 * Parses the observability section of a lane configuration.
 *
 * Recording rules and warning alerts are parsed first, and every alert must
 * reference the id of a parsed recording rule before the remaining fields are
 * read.
 *
 * @param value - Raw observability value from the configuration.
 * @param path - Dotted location of `value`, used in error messages.
 * @returns The validated observability spec.
 * @throws Error when a field is invalid or an alert's `ruleId` matches no recording rule.
 */
function observabilityConfig(value: unknown, path: string): HwlabRuntimeObservabilitySpec {
  const raw = asRecord(value, path);
  const recordingRules = observabilityRecordingRulesConfig(raw.recordingRules, `${path}.recordingRules`);
  const warningAlerts = observabilityWarningAlertsConfig(raw.warningAlerts, `${path}.warningAlerts`);

  // Reject the first alert that points at a rule id that was never declared.
  const knownRuleIds = new Set<string>();
  for (const rule of recordingRules) knownRuleIds.add(rule.id);
  const danglingAlert = warningAlerts.find((alert) => !knownRuleIds.has(alert.ruleId));
  if (danglingAlert !== undefined) {
    throw new Error(`${path}.warningAlerts.${danglingAlert.id}.ruleId must reference a recordingRules id`);
  }

  const prometheusOperator = booleanField(raw, "prometheusOperator", path);
  const metricsEndpoint = observabilityMetricsEndpointConfig(raw.metricsEndpoint, `${path}.metricsEndpoint`);
  const workbench = observabilityWorkbenchConfig(raw.workbench, `${path}.workbench`);
  return { prometheusOperator, metricsEndpoint, workbench, recordingRules, warningAlerts };
}
@@ -617,6 +655,7 @@ function observabilityWorkbenchConfig(value: unknown, path: string): HwlabRuntim
return {
enabled: booleanField(raw, "enabled", path),
summaryPath: stringField(raw, "summaryPath", path),
lowSampleThreshold: numberField(raw, "lowSampleThreshold", path),
metricPrefixes: stringArrayField(raw, "metricPrefixes", path),
requiredSeries: stringArrayField(raw, "requiredSeries", path),
backendLabelDenylist: stringArrayField(raw, "backendLabelDenylist", path),
@@ -624,6 +663,47 @@ function observabilityWorkbenchConfig(value: unknown, path: string): HwlabRuntim
};
}
/**
 * Parses the optional `recordingRules` list of the observability config.
 *
 * Rule ids must be unique: warning alerts resolve their rule through an
 * id-keyed lookup, so a repeated id would silently shadow an earlier rule.
 *
 * @param value - Raw `recordingRules` value; `undefined` yields an empty list.
 * @param path - Dotted location of `value`, used in error messages.
 * @returns The validated recording rule specs in declaration order.
 * @throws Error when `value` is not an array, a field is invalid, `quantile`
 *   is not within (0, 1), or two rules share the same id.
 */
function observabilityRecordingRulesConfig(value: unknown, path: string): HwlabRuntimeObservabilityRecordingRuleSpec[] {
  if (value === undefined) return [];
  if (!Array.isArray(value)) throw new Error(`${path} must be an array`);
  const seenIds = new Set<string>();
  return value.map((item, index) => {
    const itemPath = `${path}[${index}]`;
    const raw = asRecord(item, itemPath);
    // positiveNumberField already guarantees quantile > 0; only the upper bound is checked here.
    const quantile = positiveNumberField(raw, "quantile", itemPath);
    if (quantile >= 1) throw new Error(`${itemPath}.quantile must be less than 1`);
    const id = stringField(raw, "id", itemPath);
    if (seenIds.has(id)) throw new Error(`${itemPath}.id must be unique`);
    seenIds.add(id);
    return {
      id,
      metric: stringField(raw, "metric", itemPath),
      sourceMetric: stringField(raw, "sourceMetric", itemPath),
      quantile,
      window: stringField(raw, "window", itemPath),
      minSamples: numberField(raw, "minSamples", itemPath),
      groupBy: stringArrayField(raw, "groupBy", itemPath),
      matchLabels: optionalStringRecord(raw.matchLabels, `${itemPath}.matchLabels`),
    };
  });
}
/**
 * Parses the optional `warningAlerts` list of the observability config.
 *
 * @param value - Raw `warningAlerts` value; `undefined` yields an empty list.
 * @param path - Dotted location of `value`, used in error messages.
 * @returns The validated warning alert specs in declaration order.
 * @throws Error when `value` is not an array, a field is invalid, or
 *   `severity` is anything other than `warning`.
 */
function observabilityWarningAlertsConfig(value: unknown, path: string): HwlabRuntimeObservabilityWarningAlertSpec[] {
  if (value === undefined) return [];
  if (!Array.isArray(value)) throw new Error(`${path} must be an array`);

  const parseAlert = (item: unknown, index: number): HwlabRuntimeObservabilityWarningAlertSpec => {
    const itemPath = `${path}[${index}]`;
    const raw = asRecord(item, itemPath);
    // Severity is checked before the other fields so an unsupported level is reported first.
    const severity = stringField(raw, "severity", itemPath);
    if (severity !== "warning") throw new Error(`${itemPath}.severity must be warning`);
    const id = stringField(raw, "id", itemPath);
    const ruleId = stringField(raw, "ruleId", itemPath);
    const thresholdSeconds = positiveNumberField(raw, "thresholdSeconds", itemPath);
    const minSamples = numberField(raw, "minSamples", itemPath);
    const holdFor = stringField(raw, "for", itemPath);
    const matchLabels = optionalStringRecord(raw.matchLabels, `${itemPath}.matchLabels`);
    return { id, ruleId, severity, thresholdSeconds, minSamples, for: holdFor, matchLabels };
  };

  return value.map((item, index) => parseAlert(item, index));
}
function runtimeImageRewritesConfig(value: unknown, path: string): HwlabRuntimeImageRewriteSpec[] {
if (value === undefined) return [];
if (!Array.isArray(value)) throw new Error(`${path} must be an array`);
+118 -1
View File
@@ -9,7 +9,7 @@ import { startJob } from "./jobs";
import { classifySshTcpPoolFailure } from "./ssh";
import { runHwlabG14Command } from "./hwlab-g14";
import { HWLAB_NODE_CONTROL_PLANE_CONFIG_PATH, hwlabNodeControlPlaneInfraHelp, runHwlabNodeControlPlaneInfra } from "./hwlab-node-control-plane";
import { hwlabRuntimeLaneConfigPath, hwlabRuntimeLaneSpec, hwlabRuntimeLaneSpecForNode, isHwlabRuntimeLane, type HwlabRuntimeLane, type HwlabRuntimeLaneSpec, type HwlabRuntimeObservabilitySpec, type HwlabRuntimePublicExposureSpec } from "./hwlab-node-lanes";
import { hwlabRuntimeLaneConfigPath, hwlabRuntimeLaneSpec, hwlabRuntimeLaneSpecForNode, isHwlabRuntimeLane, type HwlabRuntimeLane, type HwlabRuntimeLaneSpec, type HwlabRuntimeObservabilityRecordingRuleSpec, type HwlabRuntimeObservabilitySpec, type HwlabRuntimeObservabilityWarningAlertSpec, type HwlabRuntimePublicExposureSpec } from "./hwlab-node-lanes";
type SecretAction = "status" | "ensure" | "cleanup-owned-postgres" | "cleanup-obsolete";
type SecretPreset = "openfga" | "master-server-admin-api-key" | "bootstrap-admin" | "code-agent-provider" | "cloud-api-db" | "owned-postgres-cleanup" | "obsolete-secret-cleanup";
@@ -697,6 +697,8 @@ function nodeObservabilityWorkbenchSummary(options: NodeObservabilityOptions, ex
podName,
containerName: serviceStatus.containerName,
},
recordingRules: nodeObservabilityRecordingRuleSummaries(options.spec.observability),
warningAlerts: nodeObservabilityWarningAlertSummaries(options.spec.observability),
metrics,
probe: metricsProbe === null ? null : {
ok: metricsProbe.ok,
@@ -719,6 +721,8 @@ function summarizeNodeObservabilityStatus(status: Record<string, unknown>): Reco
const publicRawMetrics = record(status.publicRawMetrics);
const workbenchSummary = record(status.workbenchSummary);
const metrics = record(workbenchSummary.metrics);
const recordingRules = Array.isArray(workbenchSummary.recordingRules) ? workbenchSummary.recordingRules : [];
const warningAlerts = Array.isArray(workbenchSummary.warningAlerts) ? workbenchSummary.warningAlerts : [];
return {
ok: status.ok === true,
command: status.command,
@@ -744,8 +748,11 @@ function summarizeNodeObservabilityStatus(status: Record<string, unknown>): Reco
ok: workbenchSummary.ok === true,
httpStatus: metrics.httpStatus ?? null,
bodyBytes: metrics.bodyBytes ?? null,
recordingRuleCount: recordingRules.length,
warningAlertCount: warningAlerts.length,
seriesByPrefix: metrics.seriesByPrefix ?? {},
missingSeries: metrics.missingSeries ?? [],
topSlowDimensions: metrics.topSlowDimensions ?? [],
deniedBackendEventLineCount: metrics.deniedBackendEventLineCount ?? null,
maxDeniedBackendEventLines: metrics.maxDeniedBackendEventLines ?? null,
},
@@ -771,6 +778,8 @@ function nodeObservabilityRenderPlan(observability: HwlabRuntimeObservabilitySpe
prometheusOperator: observability.prometheusOperator,
metricsEndpoint: nodeObservabilityEndpointSummary(observability),
workbench: observability.workbench ?? null,
recordingRules: nodeObservabilityRecordingRuleSummaries(observability),
warningAlerts: nodeObservabilityWarningAlertSummaries(observability),
clusterResources: observability.prometheusOperator
? {
source: "HWLAB GitOps rendered manifests",
@@ -785,6 +794,68 @@ function nodeObservabilityRenderPlan(observability: HwlabRuntimeObservabilitySpe
};
}
/**
 * Produces one summary object per recording rule: the rule's own fields plus
 * the generated quantile expression, sample-count expression and low-sample
 * guard (`<sample count> >= minSamples`).
 *
 * @param observability - Observability spec whose `recordingRules` are summarized.
 * @returns Summaries in the same order as the rules.
 */
function nodeObservabilityRecordingRuleSummaries(observability: HwlabRuntimeObservabilitySpec): Array<Record<string, unknown>> {
  const summaries: Array<Record<string, unknown>> = [];
  for (const rule of observability.recordingRules) {
    const expression = nodeObservabilityRecordingRuleExpression(rule);
    // The guard reuses the sample-count expression, so build it only once.
    const sampleCountExpression = nodeObservabilitySampleCountExpression(rule, rule.matchLabels);
    summaries.push({
      ...rule,
      expression,
      sampleCountExpression,
      lowSampleGuardExpression: `${sampleCountExpression} >= ${rule.minSamples}`,
    });
  }
  return summaries;
}
/**
 * Produces one summary object per warning alert: the alert's own fields plus
 * the generated alert expression, the recorded metric name of its rule and the
 * low-sample guard. The three derived fields are `null` when the alert's
 * `ruleId` matches no recording rule.
 *
 * @param observability - Observability spec providing rules and alerts.
 * @returns Summaries in the same order as the alerts.
 */
function nodeObservabilityWarningAlertSummaries(observability: HwlabRuntimeObservabilitySpec): Array<Record<string, unknown>> {
  const rulesById = new Map(observability.recordingRules.map((rule) => [rule.id, rule] as const));
  return observability.warningAlerts.map((alert) => {
    const matchedRule = rulesById.get(alert.ruleId);
    if (matchedRule === undefined) {
      return { ...alert, expression: null, recordingRule: null, lowSampleGuardExpression: null };
    }
    // Alert matchers override rule matchers that use the same label name.
    const guardLabels = { ...matchedRule.matchLabels, ...alert.matchLabels };
    return {
      ...alert,
      expression: nodeObservabilityWarningAlertExpression(matchedRule, alert),
      recordingRule: matchedRule.metric,
      lowSampleGuardExpression: `${nodeObservabilitySampleCountExpression(matchedRule, guardLabels)} >= ${alert.minSamples}`,
    };
  });
}
/**
 * Builds the `histogram_quantile(...)` expression of a recording rule over the
 * per-second rate of the source metric's `_bucket` series.
 *
 * @param rule - Recording rule to render.
 * @returns The expression string.
 */
function nodeObservabilityRecordingRuleExpression(rule: HwlabRuntimeObservabilityRecordingRuleSpec): string {
  // `le` is appended to the grouping labels (deduplicated) so bucket boundaries survive the aggregation.
  const aggregationLabels = uniqueStrings([...rule.groupBy, "le"]).join(", ");
  const bucketSelector = prometheusMetricSelector(`${rule.sourceMetric}_bucket`, rule.matchLabels);
  const quantileText = formatPrometheusNumber(rule.quantile);
  return `histogram_quantile(${quantileText}, sum by (${aggregationLabels}) (rate(${bucketSelector}[${rule.window}])))`;
}
/**
 * Builds the expression counting samples observed within the rule's window:
 * the summed `increase` of the source metric's `_count` series, grouped by the
 * rule's `groupBy` labels.
 *
 * @param rule - Recording rule providing source metric, window and grouping.
 * @param matchLabels - Label matchers applied to the `_count` selector.
 * @returns The expression string.
 */
function nodeObservabilitySampleCountExpression(rule: HwlabRuntimeObservabilityRecordingRuleSpec, matchLabels: Record<string, string>): string {
  const aggregationLabels = uniqueStrings([...rule.groupBy]).join(", ");
  const countSelector = prometheusMetricSelector(`${rule.sourceMetric}_count`, matchLabels);
  return `sum by (${aggregationLabels}) (increase(${countSelector}[${rule.window}]))`;
}
/**
 * Builds the alert expression
 * `(<recorded metric> > threshold) and on (<groupBy>) (<sample count> >= minSamples)`.
 * The `on (...)` clause is left out when the rule has no `groupBy` labels.
 *
 * @param rule - Recording rule the alert is based on.
 * @param alert - Alert providing threshold, minimum samples and extra matchers.
 * @returns The expression string.
 */
function nodeObservabilityWarningAlertExpression(rule: HwlabRuntimeObservabilityRecordingRuleSpec, alert: HwlabRuntimeObservabilityWarningAlertSpec): string {
  // Alert matchers override rule matchers that use the same label name.
  const guardLabels = { ...rule.matchLabels, ...alert.matchLabels };
  const guardExpression = `${nodeObservabilitySampleCountExpression(rule, guardLabels)} >= ${alert.minSamples}`;
  const thresholdExpression = `${prometheusMetricSelector(rule.metric, alert.matchLabels)} > ${formatPrometheusNumber(alert.thresholdSeconds)}`;
  let onClause = "";
  if (rule.groupBy.length !== 0) {
    onClause = ` on (${uniqueStrings([...rule.groupBy]).join(", ")})`;
  }
  return `(${thresholdExpression}) and${onClause} (${guardExpression})`;
}
/**
 * Renders a metric selector: the metric name followed by its `{...}` label
 * matchers, or the bare name when there are no labels.
 *
 * @param metric - Metric name.
 * @param labels - Label matchers to attach.
 * @returns The selector string.
 */
function prometheusMetricSelector(metric: string, labels: Record<string, string>): string {
  // An empty label set renders as "", so plain concatenation yields the bare metric name.
  return metric + prometheusLabelSelector(labels);
}
/**
 * Renders label matchers as `{name="value",...}`.
 *
 * A value containing `|` is emitted with the regex operator `=~`; every other
 * value uses the equality operator `=`. Values are escaped for use inside
 * double quotes.
 *
 * @param labels - Label name to value (or `|`-separated alternatives) mapping.
 * @returns The braces-wrapped matcher list, or an empty string when `labels` is empty.
 */
function prometheusLabelSelector(labels: Record<string, string>): string {
  const matchers: string[] = [];
  for (const [name, pattern] of Object.entries(labels)) {
    const operator = pattern.includes("|") ? "=~" : "=";
    matchers.push(`${name}${operator}"${escapePrometheusLabelValue(pattern)}"`);
  }
  if (matchers.length === 0) return "";
  return `{${matchers.join(",")}}`;
}
/**
 * Escapes a label value so it can be embedded in a double-quoted PromQL
 * string literal.
 *
 * Backslashes and double quotes are backslash-escaped, and a line feed is
 * emitted as the two-character sequence `\n`, because a raw line break inside
 * a double-quoted literal leaves the string unterminated.
 *
 * @param value - Raw label value.
 * @returns The escaped value, without surrounding quotes.
 */
function escapePrometheusLabelValue(value: string): string {
  // Backslashes go first so the escapes added by the later steps are not doubled.
  return value
    .replace(/\\/gu, "\\\\")
    .replace(/"/gu, "\\\"")
    .replace(/\n/gu, "\\n");
}
/**
 * Formats a number for embedding in a generated expression, using the default
 * JavaScript number-to-string conversion for integers and fractions alike.
 *
 * @param value - Number to format.
 * @returns The decimal string form of `value`.
 */
function formatPrometheusNumber(value: number): string {
  return `${value}`;
}
/**
 * Removes duplicate strings while keeping the position of each first
 * occurrence.
 *
 * @param values - Strings to deduplicate; the input is not modified.
 * @returns A new array holding each distinct string once.
 */
function uniqueStrings(values: readonly string[]): string[] {
  const seen = new Set<string>();
  const distinct: string[] = [];
  for (const entry of values) {
    if (seen.has(entry)) continue;
    seen.add(entry);
    distinct.push(entry);
  }
  return distinct;
}
function nodeObservabilityApplyPlan(observability: HwlabRuntimeObservabilitySpec): Record<string, unknown> {
const endpoint = observability.metricsEndpoint;
if (endpoint === undefined) {
@@ -903,6 +974,12 @@ function nodeObservabilityMetricsProbe(options: NodeObservabilityOptions, podNam
requiredSeries: workbench?.requiredSeries ?? [],
backendLabelDenylist: workbench?.backendLabelDenylist ?? [],
maxDeniedBackendEventLines: workbench?.maxUnknownEventLines ?? 0,
lowSampleThreshold: workbench?.lowSampleThreshold ?? 0,
histogramMetrics: [
{ id: "workbench_journey", metric: "hwlab_workbench_journey_duration_seconds", kind: "workbench_journey", dimensionLabels: ["namespace", "gitops_target", "journey", "route", "backend", "transport", "target_state", "cache", "source", "entry", "outcome"] },
{ id: "workbench_event_phase", metric: "hwlab_workbench_event_phase_duration_seconds", kind: "workbench_event_phase", dimensionLabels: ["namespace", "gitops_target", "phase", "event_type", "backend", "transport", "outcome"] },
{ id: "workbench_backend_event_visible", metric: "hwlab_workbench_backend_event_visible_latency_seconds", kind: "workbench_backend_event_visible", dimensionLabels: ["namespace", "gitops_target", "event_type", "backend", "transport", "outcome"] },
],
}), "utf8").toString("base64");
const source = [
"const http = require('node:http');",
@@ -914,6 +991,42 @@ function nodeObservabilityMetricsProbe(options: NodeObservabilityOptions, podNam
"function lineHasLabel(line, name, value) { return line.includes(`${name}=\\\"${String(value).replace(/\\\\/g, '\\\\\\\\').replace(/\\\"/g, '\\\\\\\"')}\\\"`); }",
"function isBackendEventMetric(name) { return name.startsWith('hwlab_workbench_event_') || name.startsWith('hwlab_workbench_backend_event_'); }",
"function compactLines(lines) { return lines.slice(0, 16).map((line) => line.length > 320 ? `${line.slice(0, 317)}...` : line); }",
"function parseLabels(raw) { const labels = {}; for (const part of String(raw || '').split(',')) { const index = part.indexOf('='); if (index <= 0) continue; const key = part.slice(0, index); let value = part.slice(index + 1); if (value.startsWith('\\\"') && value.endsWith('\\\"')) value = value.slice(1, -1); labels[key] = value.replace(/\\\\\\\"/g, '\\\"').replace(/\\\\\\\\/g, '\\\\'); } return labels; }",
"function labelKey(metric, labels) { const copy = { ...labels }; delete copy.le; return `${metric}|${Object.keys(copy).sort().map((key) => `${key}=${copy[key]}`).join('|')}`; }",
"function pickLabels(labels, names) { return Object.fromEntries(names.map((name) => [name, labels[name] || 'unknown'])); }",
"function quantile(row, q) { const count = Number(row.count || 0); if (count <= 0) return 0; const rank = Math.max(1, Math.ceil(count * q)); const buckets = row.buckets.filter((item) => item.le !== '+Inf').sort((left, right) => Number(left.le) - Number(right.le)); for (const bucket of buckets) { if (bucket.value >= rank) return Number(bucket.le); } return buckets.length ? Number(buckets[buckets.length - 1].le) : 0; }",
"function summarizeHistograms(sampleLines) {",
" const metricConfig = new Map(config.histogramMetrics.map((item) => [item.metric, item]));",
" const series = new Map();",
" for (const line of sampleLines) {",
" const match = /^([A-Za-z_:][A-Za-z0-9_:]*?)_(bucket|sum|count)\\{([^}]*)\\}\\s+([0-9.eE+-]+)/.exec(line);",
" if (!match || !metricConfig.has(match[1])) continue;",
" const metric = match[1];",
" const suffix = match[2];",
" const labels = parseLabels(match[3]);",
" const value = Number(match[4]);",
" if (!Number.isFinite(value)) continue;",
" const key = labelKey(metric, labels);",
" const row = series.get(key) || { metric, labels: { ...labels }, buckets: [], sum: 0, count: 0 };",
" if (suffix === 'bucket') row.buckets.push({ le: labels.le || '+Inf', value });",
" if (suffix === 'sum') row.sum = value;",
" if (suffix === 'count') row.count = value;",
" series.set(key, row);",
" }",
" const lowSampleThreshold = Number(config.lowSampleThreshold || 0);",
" const groups = Object.fromEntries(config.histogramMetrics.map((item) => [item.id, []]));",
" const rows = [];",
" for (const row of series.values()) {",
" const cfg = metricConfig.get(row.metric);",
" const sampleCount = Number(row.count || 0);",
" const output = { kind: cfg.kind, metric: row.metric, sampleCount, average: sampleCount > 0 ? Number((row.sum / sampleCount).toFixed(4)) : 0, p50: quantile(row, 0.5), p75: quantile(row, 0.75), p95: quantile(row, 0.95), lowSample: sampleCount > 0 && sampleCount < lowSampleThreshold, sampleState: sampleCount <= 0 ? 'empty' : sampleCount < lowSampleThreshold ? 'low-sample' : 'ok', dimensions: pickLabels(row.labels, cfg.dimensionLabels) };",
" groups[cfg.id].push(output);",
" rows.push(output);",
" }",
" for (const key of Object.keys(groups)) groups[key].sort((left, right) => right.p95 - left.p95 || right.sampleCount - left.sampleCount);",
" rows.sort((left, right) => right.p95 - left.p95 || right.sampleCount - left.sampleCount);",
" return { groups: Object.fromEntries(Object.entries(groups).map(([key, value]) => [key, value.slice(0, 24)])), topSlowDimensions: rows.slice(0, 12) };",
"}",
"function summarize(text, statusCode) {",
" const lines = text.split(/\\r?\\n/).map((line) => line.trim()).filter(Boolean);",
" const sampleLines = lines.filter((line) => !line.startsWith('#'));",
@@ -921,6 +1034,7 @@ function nodeObservabilityMetricsProbe(options: NodeObservabilityOptions, podNam
" const requiredSeries = config.requiredSeries.map((name) => { const matching = sampleLines.filter((line) => lineMatchesMetric(line, name)); return { name, present: matching.length > 0, sampleCount: matching.length }; });",
" const missingSeries = requiredSeries.filter((series) => !series.present).map((series) => series.name);",
" const deniedBackendEventLines = sampleLines.filter((line) => { const name = metricNameFromLine(line) || ''; if (!isBackendEventMetric(name)) return false; return config.backendLabelDenylist.some((label) => lineHasLabel(line, 'backend', label)); });",
" const histogramSummary = summarizeHistograms(sampleLines);",
" const ready = statusCode >= 200 && statusCode < 300 && missingSeries.length === 0 && deniedBackendEventLines.length <= config.maxDeniedBackendEventLines;",
" return {",
" ready,",
@@ -933,6 +1047,9 @@ function nodeObservabilityMetricsProbe(options: NodeObservabilityOptions, podNam
" requiredSeries,",
" missingSeries,",
" backendLabelDenylist: config.backendLabelDenylist,",
" lowSampleThreshold: config.lowSampleThreshold,",
" workbenchHistograms: histogramSummary.groups,",
" topSlowDimensions: histogramSummary.topSlowDimensions,",
" deniedBackendEventLineCount: deniedBackendEventLines.length,",
" maxDeniedBackendEventLines: config.maxDeniedBackendEventLines,",
" deniedBackendEventLines: compactLines(deniedBackendEventLines),",