feat: add hwlab workbench projection observability config
This commit is contained in:
@@ -172,12 +172,26 @@ lanes:
|
||||
lowSampleThreshold: 5
|
||||
metricPrefixes:
|
||||
- hwlab_workbench_
|
||||
- hwlab_agentrun_
|
||||
- hwlab_webui_
|
||||
- hwlab_http_
|
||||
requiredSeries:
|
||||
- hwlab_workbench_journey_total
|
||||
- hwlab_workbench_event_phase_duration_seconds_count
|
||||
- hwlab_workbench_backend_event_visible_latency_seconds_count
|
||||
- hwlab_workbench_projection_lag_events_bucket
|
||||
- hwlab_workbench_projection_lag_seconds_bucket
|
||||
- hwlab_workbench_projection_stuck_traces
|
||||
- hwlab_workbench_projection_cursor_advance_total
|
||||
- hwlab_workbench_turn_get_duration_seconds_bucket
|
||||
- hwlab_workbench_turn_get_response_bytes_bucket
|
||||
- hwlab_agentrun_result_duration_seconds_bucket
|
||||
- hwlab_agentrun_result_pages_scanned_bucket
|
||||
- hwlab_agentrun_result_events_scanned_bucket
|
||||
- hwlab_workbench_projector_batch_duration_seconds_bucket
|
||||
- hwlab_workbench_projector_candidates_total
|
||||
- hwlab_workbench_projector_events_processed_total
|
||||
- hwlab_workbench_projector_last_success_unixtime
|
||||
backendLabelDenylist:
|
||||
- unknown
|
||||
maxUnknownEventLines: 0
|
||||
@@ -216,6 +230,41 @@ lanes:
|
||||
groupBy: [namespace, gitops_target, journey, route, cache, auth_state, outcome]
|
||||
matchLabels:
|
||||
journey: workbench_open_first_visible|workbench_open_full_load
|
||||
- id: workbench_projection_lag_p95
|
||||
metric: hwlab:workbench_projection_lag:p95_seconds
|
||||
sourceMetric: hwlab_workbench_projection_lag_seconds
|
||||
quantile: 0.95
|
||||
window: 5m
|
||||
minSamples: 1
|
||||
groupBy: [namespace, gitops_target, node, lane, projection_status, source, status, reason]
|
||||
# Recording rule: 0.95 quantile of hwlab_workbench_terminal_projection_delay_seconds over a 5m window.
# NOTE(review): no hwlab_workbench_terminal_projection_delay_seconds_* entry appears in the
# requiredSeries list visible earlier in this file, unlike the other histograms added with it —
# confirm whether that omission is intentional (e.g. the series only exists after a terminal event).
- id: workbench_terminal_projection_delay_p95
|
||||
metric: hwlab:workbench_terminal_projection_delay:p95_seconds
|
||||
sourceMetric: hwlab_workbench_terminal_projection_delay_seconds
|
||||
quantile: 0.95
|
||||
window: 5m
|
||||
minSamples: 1
|
||||
groupBy: [namespace, gitops_target, node, lane, projection_status, source, status, reason]
|
||||
- id: workbench_turn_get_p95
|
||||
metric: hwlab:workbench_turn_get:p95_seconds
|
||||
sourceMetric: hwlab_workbench_turn_get_duration_seconds
|
||||
quantile: 0.95
|
||||
window: 5m
|
||||
minSamples: 1
|
||||
groupBy: [namespace, gitops_target, node, lane, route, status, degraded_reason]
|
||||
- id: agentrun_result_p95
|
||||
metric: hwlab:agentrun_result:p95_seconds
|
||||
sourceMetric: hwlab_agentrun_result_duration_seconds
|
||||
quantile: 0.95
|
||||
window: 5m
|
||||
minSamples: 1
|
||||
groupBy: [namespace, gitops_target, node, lane, event_count_bucket, status]
|
||||
- id: workbench_projector_batch_p95
|
||||
metric: hwlab:workbench_projector_batch:p95_seconds
|
||||
sourceMetric: hwlab_workbench_projector_batch_duration_seconds
|
||||
quantile: 0.95
|
||||
window: 5m
|
||||
minSamples: 1
|
||||
groupBy: [namespace, gitops_target, node, lane, phase, status]
|
||||
warningAlerts:
|
||||
- id: HWLABWorkbenchSubmitFirstVisibleSlow
|
||||
ruleId: workbench_submit_first_visible_p95
|
||||
@@ -243,6 +292,36 @@ lanes:
|
||||
thresholdSeconds: 13
|
||||
minSamples: 5
|
||||
for: 10m
|
||||
# Warning alert bound to rule workbench_projection_lag_p95 (p95 of hwlab_workbench_projection_lag_seconds).
# NOTE(review): the alert is named "Stuck" but is driven by the lag quantile; the
# hwlab_workbench_projection_stuck_traces series listed under requiredSeries is not referenced by any
# rule visible in this hunk — confirm which signal is intended.
# NOTE(review): the neighbouring alert id HWLABWorkbenchSubmitFirstVisibleSlow carries an "HWLAB"
# prefix that this id does not — confirm the naming convention before alert routing depends on it.
- id: WorkbenchProjectionStuck
|
||||
ruleId: workbench_projection_lag_p95
|
||||
severity: warning
|
||||
thresholdSeconds: 60
|
||||
minSamples: 1
|
||||
for: 10m
|
||||
# Warning alert bound to rule workbench_terminal_projection_delay_p95
# (p95 of hwlab_workbench_terminal_projection_delay_seconds).
# NOTE(review): the name says "Missing", but the backing rule is a delay quantile with minSamples: 1;
# presumably a terminal projection that never happens records no sample and would not trip this
# alert — confirm against how the delay histogram is observed.
- id: WorkbenchTerminalProjectionMissing
|
||||
ruleId: workbench_terminal_projection_delay_p95
|
||||
severity: warning
|
||||
thresholdSeconds: 60
|
||||
minSamples: 1
|
||||
for: 10m
|
||||
- id: WorkbenchTurnReadSlow
|
||||
ruleId: workbench_turn_get_p95
|
||||
severity: warning
|
||||
thresholdSeconds: 5
|
||||
minSamples: 1
|
||||
for: 10m
|
||||
# Warning alert bound to rule agentrun_result_p95 (p95 of hwlab_agentrun_result_duration_seconds).
# NOTE(review): the name refers to long traces, but neither this alert nor the agentrun_result_p95
# rule shows a matchLabels filter; the rule only groups by event_count_bucket. As written it
# presumably evaluates every event_count_bucket group — confirm whether a filter is needed.
- id: AgentRunResultSlowLongTrace
|
||||
ruleId: agentrun_result_p95
|
||||
severity: warning
|
||||
thresholdSeconds: 2.5
|
||||
minSamples: 1
|
||||
for: 10m
|
||||
# Warning alert bound to rule workbench_projector_batch_p95
# (p95 of hwlab_workbench_projector_batch_duration_seconds).
# NOTE(review): the name says "NoProgress", but the backing rule measures batch duration, so this
# presumably fires on slow batches. A projector that stops running batches would emit no new
# samples and may never cross the threshold — confirm whether
# hwlab_workbench_projector_last_success_unixtime (already in requiredSeries) should back this alert.
- id: WorkbenchProjectorNoProgress
|
||||
ruleId: workbench_projector_batch_p95
|
||||
severity: warning
|
||||
thresholdSeconds: 10
|
||||
minSamples: 1
|
||||
for: 10m
|
||||
runtimeImageRewrites:
|
||||
- source: fatedier/frpc:v0.68.1
|
||||
target: 127.0.0.1:5000/hwlab/frpc:v0.68.1
|
||||
|
||||
@@ -852,6 +852,14 @@ function nodeObservabilityMetricsProbe(options: NodeObservabilityOptions, podNam
|
||||
{ id: "workbench_journey", metric: "hwlab_workbench_journey_duration_seconds", kind: "workbench_journey", dimensionLabels: ["namespace", "gitops_target", "journey", "route", "backend", "transport", "target_state", "cache", "source", "entry", "outcome"] },
|
||||
{ id: "workbench_event_phase", metric: "hwlab_workbench_event_phase_duration_seconds", kind: "workbench_event_phase", dimensionLabels: ["namespace", "gitops_target", "phase", "event_type", "backend", "transport", "outcome"] },
|
||||
{ id: "workbench_backend_event_visible", metric: "hwlab_workbench_backend_event_visible_latency_seconds", kind: "workbench_backend_event_visible", dimensionLabels: ["namespace", "gitops_target", "event_type", "backend", "transport", "outcome"] },
|
||||
{ id: "workbench_projection_lag_events", metric: "hwlab_workbench_projection_lag_events", kind: "workbench_projection_lag_events", dimensionLabels: ["namespace", "gitops_target", "node", "lane", "projection_status", "source", "status", "reason"] },
|
||||
{ id: "workbench_projection_lag_seconds", metric: "hwlab_workbench_projection_lag_seconds", kind: "workbench_projection_lag_seconds", dimensionLabels: ["namespace", "gitops_target", "node", "lane", "projection_status", "source", "status", "reason"] },
|
||||
{ id: "workbench_terminal_projection_delay", metric: "hwlab_workbench_terminal_projection_delay_seconds", kind: "workbench_terminal_projection_delay", dimensionLabels: ["namespace", "gitops_target", "node", "lane", "projection_status", "source", "status", "reason"] },
|
||||
{ id: "workbench_turn_get", metric: "hwlab_workbench_turn_get_duration_seconds", kind: "workbench_turn_get", dimensionLabels: ["namespace", "gitops_target", "node", "lane", "route", "status", "degraded_reason"] },
|
||||
{ id: "agentrun_result_duration", metric: "hwlab_agentrun_result_duration_seconds", kind: "agentrun_result_duration", dimensionLabels: ["namespace", "gitops_target", "node", "lane", "event_count_bucket", "status"] },
|
||||
{ id: "agentrun_result_pages", metric: "hwlab_agentrun_result_pages_scanned", kind: "agentrun_result_pages", dimensionLabels: ["namespace", "gitops_target", "node", "lane"] },
|
||||
{ id: "agentrun_result_events", metric: "hwlab_agentrun_result_events_scanned", kind: "agentrun_result_events", dimensionLabels: ["namespace", "gitops_target", "node", "lane"] },
|
||||
{ id: "workbench_projector_batch", metric: "hwlab_workbench_projector_batch_duration_seconds", kind: "workbench_projector_batch", dimensionLabels: ["namespace", "gitops_target", "node", "lane", "phase", "status"] },
|
||||
],
|
||||
}), "utf8").toString("base64");
|
||||
const source = [
|
||||
@@ -928,6 +936,11 @@ function nodeObservabilityMetricsProbe(options: NodeObservabilityOptions, podNam
|
||||
" deniedBackendEventLines: compactLines(deniedBackendEventLines),",
|
||||
" backendVisibleCountLines: compactLines(sampleLines.filter((line) => lineMatchesMetric(line, 'hwlab_workbench_backend_event_visible_latency_seconds_count'))),",
|
||||
" phaseCountLines: compactLines(sampleLines.filter((line) => lineMatchesMetric(line, 'hwlab_workbench_event_phase_duration_seconds_count'))),",
|
||||
" projectionLagCountLines: compactLines(sampleLines.filter((line) => lineMatchesMetric(line, 'hwlab_workbench_projection_lag_events_count') || lineMatchesMetric(line, 'hwlab_workbench_projection_lag_seconds_count'))),",
|
||||
" turnGetCountLines: compactLines(sampleLines.filter((line) => lineMatchesMetric(line, 'hwlab_workbench_turn_get_duration_seconds_count'))),",
|
||||
" agentRunResultCountLines: compactLines(sampleLines.filter((line) => lineMatchesMetric(line, 'hwlab_agentrun_result_duration_seconds_count'))),",
|
||||
" projectorBatchCountLines: compactLines(sampleLines.filter((line) => lineMatchesMetric(line, 'hwlab_workbench_projector_batch_duration_seconds_count'))),",
|
||||
" projectorLastSuccessLines: compactLines(sampleLines.filter((line) => lineMatchesMetric(line, 'hwlab_workbench_projector_last_success_unixtime'))),",
|
||||
" degradedReason: ready ? undefined : missingSeries.length > 0 ? 'node-observability-required-series-missing' : 'node-observability-denied-backend-label-present',",
|
||||
" };",
|
||||
"}",
|
||||
@@ -1043,6 +1056,11 @@ function summarizeNodeObservabilityMetrics(observability: HwlabRuntimeObservabil
|
||||
deniedBackendEventLines: compactPrometheusLines(deniedBackendEventLines),
|
||||
backendVisibleCountLines: compactPrometheusLines(sampleLines.filter((line) => prometheusLineMatchesMetric(line, "hwlab_workbench_backend_event_visible_latency_seconds_count"))),
|
||||
phaseCountLines: compactPrometheusLines(sampleLines.filter((line) => prometheusLineMatchesMetric(line, "hwlab_workbench_event_phase_duration_seconds_count"))),
|
||||
projectionLagCountLines: compactPrometheusLines(sampleLines.filter((line) => prometheusLineMatchesMetric(line, "hwlab_workbench_projection_lag_events_count") || prometheusLineMatchesMetric(line, "hwlab_workbench_projection_lag_seconds_count"))),
|
||||
turnGetCountLines: compactPrometheusLines(sampleLines.filter((line) => prometheusLineMatchesMetric(line, "hwlab_workbench_turn_get_duration_seconds_count"))),
|
||||
agentRunResultCountLines: compactPrometheusLines(sampleLines.filter((line) => prometheusLineMatchesMetric(line, "hwlab_agentrun_result_duration_seconds_count"))),
|
||||
projectorBatchCountLines: compactPrometheusLines(sampleLines.filter((line) => prometheusLineMatchesMetric(line, "hwlab_workbench_projector_batch_duration_seconds_count"))),
|
||||
projectorLastSuccessLines: compactPrometheusLines(sampleLines.filter((line) => prometheusLineMatchesMetric(line, "hwlab_workbench_projector_last_success_unixtime"))),
|
||||
degradedReason: ready ? undefined : missingSeries.length > 0 ? "node-observability-required-series-missing" : "node-observability-denied-backend-label-present",
|
||||
};
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user