feat: add hwlab workbench projection observability config

This commit is contained in:
Codex
2026-06-19 03:08:40 +00:00
parent 6f4bf0ac86
commit 08c3cf60d7
2 changed files with 97 additions and 0 deletions
+79
View File
@@ -172,12 +172,26 @@ lanes:
lowSampleThreshold: 5
# Metric-name prefixes for this lane. Every requiredSeries entry below starts
# with either hwlab_workbench_ or hwlab_agentrun_.
# NOTE(review): presumably these prefixes select which scraped sample lines the
# node observability probe inspects -- confirm against the consumer.
metricPrefixes:
- hwlab_workbench_
- hwlab_agentrun_
- hwlab_webui_
- hwlab_http_
# Series names listed as required for this lane.
# NOTE(review): presumably a name listed here that is absent from the scrape
# makes the probe report 'node-observability-required-series-missing' -- confirm.
requiredSeries:
# Journey / event-phase / backend-visible series, listed by _total or _count.
- hwlab_workbench_journey_total
- hwlab_workbench_event_phase_duration_seconds_count
- hwlab_workbench_backend_event_visible_latency_seconds_count
# Projection lag, turn read, agentrun result and projector series.
# NOTE(review): these entries use the _bucket suffix while the entries above use
# _count, and the probe's *CountLines summaries filter on _count names. Confirm
# the suffix mix is intentional.
# NOTE(review): hwlab_workbench_terminal_projection_delay_seconds has a p95 rule
# and an alert in this config but no entry here -- confirm whether it is
# deliberately optional.
- hwlab_workbench_projection_lag_events_bucket
- hwlab_workbench_projection_lag_seconds_bucket
- hwlab_workbench_projection_stuck_traces
- hwlab_workbench_projection_cursor_advance_total
- hwlab_workbench_turn_get_duration_seconds_bucket
- hwlab_workbench_turn_get_response_bytes_bucket
- hwlab_agentrun_result_duration_seconds_bucket
- hwlab_agentrun_result_pages_scanned_bucket
- hwlab_agentrun_result_events_scanned_bucket
- hwlab_workbench_projector_batch_duration_seconds_bucket
- hwlab_workbench_projector_candidates_total
- hwlab_workbench_projector_events_processed_total
- hwlab_workbench_projector_last_success_unixtime
backendLabelDenylist:
- unknown
maxUnknownEventLines: 0
@@ -216,6 +230,41 @@ lanes:
groupBy: [namespace, gitops_target, journey, route, cache, auth_state, outcome]
matchLabels:
journey: workbench_open_first_visible|workbench_open_full_load
# p95 rules for the projection / turn-read / agentrun-result / projector metrics.
# Each entry takes quantile 0.95 of `sourceMetric` over a 5m window with
# minSamples 1; `metric` is presumably the name of the derived series (confirm
# against the rule generator). Each `groupBy` list is identical to the
# dimensionLabels declared for the same source metric in
# nodeObservabilityMetricsProbe.
#
# Projection lag in seconds.
- id: workbench_projection_lag_p95
metric: hwlab:workbench_projection_lag:p95_seconds
sourceMetric: hwlab_workbench_projection_lag_seconds
quantile: 0.95
window: 5m
minSamples: 1
groupBy: [namespace, gitops_target, node, lane, projection_status, source, status, reason]
# Terminal projection delay in seconds; same grouping as projection lag.
- id: workbench_terminal_projection_delay_p95
metric: hwlab:workbench_terminal_projection_delay:p95_seconds
sourceMetric: hwlab_workbench_terminal_projection_delay_seconds
quantile: 0.95
window: 5m
minSamples: 1
groupBy: [namespace, gitops_target, node, lane, projection_status, source, status, reason]
# Turn GET duration, split by route, status and degraded_reason.
- id: workbench_turn_get_p95
metric: hwlab:workbench_turn_get:p95_seconds
sourceMetric: hwlab_workbench_turn_get_duration_seconds
quantile: 0.95
window: 5m
minSamples: 1
groupBy: [namespace, gitops_target, node, lane, route, status, degraded_reason]
# AgentRun result duration, split by event_count_bucket and status.
- id: agentrun_result_p95
metric: hwlab:agentrun_result:p95_seconds
sourceMetric: hwlab_agentrun_result_duration_seconds
quantile: 0.95
window: 5m
minSamples: 1
groupBy: [namespace, gitops_target, node, lane, event_count_bucket, status]
# Projector batch duration, split by phase and status.
- id: workbench_projector_batch_p95
metric: hwlab:workbench_projector_batch:p95_seconds
sourceMetric: hwlab_workbench_projector_batch_duration_seconds
quantile: 0.95
window: 5m
minSamples: 1
groupBy: [namespace, gitops_target, node, lane, phase, status]
warningAlerts:
- id: HWLABWorkbenchSubmitFirstVisibleSlow
ruleId: workbench_submit_first_visible_p95
@@ -243,6 +292,36 @@ lanes:
thresholdSeconds: 13
minSamples: 5
for: 10m
# Warning-severity alerts, each bound through `ruleId` to one of the p95 rules.
# Presumably an alert fires when the rule value stays above thresholdSeconds for
# the `for` duration -- confirm against the alert generator.
# NOTE(review): these ids lack the HWLAB prefix used by the sibling alert
# HWLABWorkbenchSubmitFirstVisibleSlow; confirm the naming is intentional.
# NOTE(review): minSamples is 1 here while the preceding alert uses 5; a single
# slow sample may be enough to drive the p95 -- confirm this is desired.
#
# Projection lag p95 above 60 s.
- id: WorkbenchProjectionStuck
ruleId: workbench_projection_lag_p95
severity: warning
thresholdSeconds: 60
minSamples: 1
for: 10m
# Terminal projection delay p95 above 60 s.
# NOTE(review): the id says "Missing" but the rule measures a delay quantile;
# a projection that never arrives may produce no sample at all -- confirm.
- id: WorkbenchTerminalProjectionMissing
ruleId: workbench_terminal_projection_delay_p95
severity: warning
thresholdSeconds: 60
minSamples: 1
for: 10m
# Turn GET p95 above 5 s.
- id: WorkbenchTurnReadSlow
ruleId: workbench_turn_get_p95
severity: warning
thresholdSeconds: 5
minSamples: 1
for: 10m
# AgentRun result p95 above 2.5 s.
# NOTE(review): the id says "LongTrace" but no label filter on
# event_count_bucket is given here; confirm whether the alert should be
# restricted to the large-event-count groups.
- id: AgentRunResultSlowLongTrace
ruleId: agentrun_result_p95
severity: warning
thresholdSeconds: 2.5
minSamples: 1
for: 10m
# Projector batch duration p95 above 10 s.
# NOTE(review): the id says "NoProgress" but the rule measures batch duration;
# a projector that stops running batches would presumably emit no new samples.
# Consider whether hwlab_workbench_projector_last_success_unixtime should back
# this alert instead -- confirm.
- id: WorkbenchProjectorNoProgress
ruleId: workbench_projector_batch_p95
severity: warning
thresholdSeconds: 10
minSamples: 1
for: 10m
runtimeImageRewrites:
- source: fatedier/frpc:v0.68.1
target: 127.0.0.1:5000/hwlab/frpc:v0.68.1
+18
View File
@@ -852,6 +852,14 @@ function nodeObservabilityMetricsProbe(options: NodeObservabilityOptions, podNam
{ id: "workbench_journey", metric: "hwlab_workbench_journey_duration_seconds", kind: "workbench_journey", dimensionLabels: ["namespace", "gitops_target", "journey", "route", "backend", "transport", "target_state", "cache", "source", "entry", "outcome"] },
{ id: "workbench_event_phase", metric: "hwlab_workbench_event_phase_duration_seconds", kind: "workbench_event_phase", dimensionLabels: ["namespace", "gitops_target", "phase", "event_type", "backend", "transport", "outcome"] },
{ id: "workbench_backend_event_visible", metric: "hwlab_workbench_backend_event_visible_latency_seconds", kind: "workbench_backend_event_visible", dimensionLabels: ["namespace", "gitops_target", "event_type", "backend", "transport", "outcome"] },
{ id: "workbench_projection_lag_events", metric: "hwlab_workbench_projection_lag_events", kind: "workbench_projection_lag_events", dimensionLabels: ["namespace", "gitops_target", "node", "lane", "projection_status", "source", "status", "reason"] },
{ id: "workbench_projection_lag_seconds", metric: "hwlab_workbench_projection_lag_seconds", kind: "workbench_projection_lag_seconds", dimensionLabels: ["namespace", "gitops_target", "node", "lane", "projection_status", "source", "status", "reason"] },
{ id: "workbench_terminal_projection_delay", metric: "hwlab_workbench_terminal_projection_delay_seconds", kind: "workbench_terminal_projection_delay", dimensionLabels: ["namespace", "gitops_target", "node", "lane", "projection_status", "source", "status", "reason"] },
{ id: "workbench_turn_get", metric: "hwlab_workbench_turn_get_duration_seconds", kind: "workbench_turn_get", dimensionLabels: ["namespace", "gitops_target", "node", "lane", "route", "status", "degraded_reason"] },
{ id: "agentrun_result_duration", metric: "hwlab_agentrun_result_duration_seconds", kind: "agentrun_result_duration", dimensionLabels: ["namespace", "gitops_target", "node", "lane", "event_count_bucket", "status"] },
{ id: "agentrun_result_pages", metric: "hwlab_agentrun_result_pages_scanned", kind: "agentrun_result_pages", dimensionLabels: ["namespace", "gitops_target", "node", "lane"] },
{ id: "agentrun_result_events", metric: "hwlab_agentrun_result_events_scanned", kind: "agentrun_result_events", dimensionLabels: ["namespace", "gitops_target", "node", "lane"] },
{ id: "workbench_projector_batch", metric: "hwlab_workbench_projector_batch_duration_seconds", kind: "workbench_projector_batch", dimensionLabels: ["namespace", "gitops_target", "node", "lane", "phase", "status"] },
],
}), "utf8").toString("base64");
const source = [
@@ -928,6 +936,11 @@ function nodeObservabilityMetricsProbe(options: NodeObservabilityOptions, podNam
" deniedBackendEventLines: compactLines(deniedBackendEventLines),",
" backendVisibleCountLines: compactLines(sampleLines.filter((line) => lineMatchesMetric(line, 'hwlab_workbench_backend_event_visible_latency_seconds_count'))),",
" phaseCountLines: compactLines(sampleLines.filter((line) => lineMatchesMetric(line, 'hwlab_workbench_event_phase_duration_seconds_count'))),",
" projectionLagCountLines: compactLines(sampleLines.filter((line) => lineMatchesMetric(line, 'hwlab_workbench_projection_lag_events_count') || lineMatchesMetric(line, 'hwlab_workbench_projection_lag_seconds_count'))),",
" turnGetCountLines: compactLines(sampleLines.filter((line) => lineMatchesMetric(line, 'hwlab_workbench_turn_get_duration_seconds_count'))),",
" agentRunResultCountLines: compactLines(sampleLines.filter((line) => lineMatchesMetric(line, 'hwlab_agentrun_result_duration_seconds_count'))),",
" projectorBatchCountLines: compactLines(sampleLines.filter((line) => lineMatchesMetric(line, 'hwlab_workbench_projector_batch_duration_seconds_count'))),",
" projectorLastSuccessLines: compactLines(sampleLines.filter((line) => lineMatchesMetric(line, 'hwlab_workbench_projector_last_success_unixtime'))),",
" degradedReason: ready ? undefined : missingSeries.length > 0 ? 'node-observability-required-series-missing' : 'node-observability-denied-backend-label-present',",
" };",
"}",
@@ -1043,6 +1056,11 @@ function summarizeNodeObservabilityMetrics(observability: HwlabRuntimeObservabil
deniedBackendEventLines: compactPrometheusLines(deniedBackendEventLines),
backendVisibleCountLines: compactPrometheusLines(sampleLines.filter((line) => prometheusLineMatchesMetric(line, "hwlab_workbench_backend_event_visible_latency_seconds_count"))),
phaseCountLines: compactPrometheusLines(sampleLines.filter((line) => prometheusLineMatchesMetric(line, "hwlab_workbench_event_phase_duration_seconds_count"))),
projectionLagCountLines: compactPrometheusLines(sampleLines.filter((line) => prometheusLineMatchesMetric(line, "hwlab_workbench_projection_lag_events_count") || prometheusLineMatchesMetric(line, "hwlab_workbench_projection_lag_seconds_count"))),
turnGetCountLines: compactPrometheusLines(sampleLines.filter((line) => prometheusLineMatchesMetric(line, "hwlab_workbench_turn_get_duration_seconds_count"))),
agentRunResultCountLines: compactPrometheusLines(sampleLines.filter((line) => prometheusLineMatchesMetric(line, "hwlab_agentrun_result_duration_seconds_count"))),
projectorBatchCountLines: compactPrometheusLines(sampleLines.filter((line) => prometheusLineMatchesMetric(line, "hwlab_workbench_projector_batch_duration_seconds_count"))),
projectorLastSuccessLines: compactPrometheusLines(sampleLines.filter((line) => prometheusLineMatchesMetric(line, "hwlab_workbench_projector_last_success_unixtime"))),
degradedReason: ready ? undefined : missingSeries.length > 0 ? "node-observability-required-series-missing" : "node-observability-denied-backend-label-present",
};
}