From 08c3cf60d7b43dc2979b313b08916d53f76f0377 Mon Sep 17 00:00:00 2001 From: Codex Date: Fri, 19 Jun 2026 03:08:40 +0000 Subject: [PATCH] feat: add hwlab workbench projection observability config --- config/hwlab-node-lanes.yaml | 79 ++++++++++++++++++++++++++++++++++ scripts/src/hwlab-node-impl.ts | 18 ++++++++ 2 files changed, 97 insertions(+) diff --git a/config/hwlab-node-lanes.yaml b/config/hwlab-node-lanes.yaml index 65a6b0af..89233208 100644 --- a/config/hwlab-node-lanes.yaml +++ b/config/hwlab-node-lanes.yaml @@ -172,12 +172,26 @@ lanes: lowSampleThreshold: 5 metricPrefixes: - hwlab_workbench_ + - hwlab_agentrun_ - hwlab_webui_ - hwlab_http_ requiredSeries: - hwlab_workbench_journey_total - hwlab_workbench_event_phase_duration_seconds_count - hwlab_workbench_backend_event_visible_latency_seconds_count + - hwlab_workbench_projection_lag_events_bucket + - hwlab_workbench_projection_lag_seconds_bucket + - hwlab_workbench_projection_stuck_traces + - hwlab_workbench_projection_cursor_advance_total + - hwlab_workbench_turn_get_duration_seconds_bucket + - hwlab_workbench_turn_get_response_bytes_bucket + - hwlab_agentrun_result_duration_seconds_bucket + - hwlab_agentrun_result_pages_scanned_bucket + - hwlab_agentrun_result_events_scanned_bucket + - hwlab_workbench_projector_batch_duration_seconds_bucket + - hwlab_workbench_projector_candidates_total + - hwlab_workbench_projector_events_processed_total + - hwlab_workbench_projector_last_success_unixtime backendLabelDenylist: - unknown maxUnknownEventLines: 0 @@ -216,6 +230,41 @@ lanes: groupBy: [namespace, gitops_target, journey, route, cache, auth_state, outcome] matchLabels: journey: workbench_open_first_visible|workbench_open_full_load + - id: workbench_projection_lag_p95 + metric: hwlab:workbench_projection_lag:p95_seconds + sourceMetric: hwlab_workbench_projection_lag_seconds + quantile: 0.95 + window: 5m + minSamples: 1 + groupBy: [namespace, gitops_target, node, lane, projection_status, source, status, reason] + - id: workbench_terminal_projection_delay_p95 + metric: hwlab:workbench_terminal_projection_delay:p95_seconds + sourceMetric: hwlab_workbench_terminal_projection_delay_seconds + quantile: 0.95 + window: 5m + minSamples: 1 + groupBy: [namespace, gitops_target, node, lane, projection_status, source, status, reason] + - id: workbench_turn_get_p95 + metric: hwlab:workbench_turn_get:p95_seconds + sourceMetric: hwlab_workbench_turn_get_duration_seconds + quantile: 0.95 + window: 5m + minSamples: 1 + groupBy: [namespace, gitops_target, node, lane, route, status, degraded_reason] + - id: agentrun_result_p95 + metric: hwlab:agentrun_result:p95_seconds + sourceMetric: hwlab_agentrun_result_duration_seconds + quantile: 0.95 + window: 5m + minSamples: 1 + groupBy: [namespace, gitops_target, node, lane, event_count_bucket, status] + - id: workbench_projector_batch_p95 + metric: hwlab:workbench_projector_batch:p95_seconds + sourceMetric: hwlab_workbench_projector_batch_duration_seconds + quantile: 0.95 + window: 5m + minSamples: 1 + groupBy: [namespace, gitops_target, node, lane, phase, status] warningAlerts: - id: HWLABWorkbenchSubmitFirstVisibleSlow ruleId: workbench_submit_first_visible_p95 @@ -243,6 +292,36 @@ lanes: thresholdSeconds: 13 minSamples: 5 for: 10m + - id: WorkbenchProjectionStuck + ruleId: workbench_projection_lag_p95 + severity: warning + thresholdSeconds: 60 + minSamples: 1 + for: 10m + - id: WorkbenchTerminalProjectionMissing + ruleId: workbench_terminal_projection_delay_p95 + severity: warning + thresholdSeconds: 60 + minSamples: 1 + for: 10m + - id: WorkbenchTurnReadSlow + ruleId: workbench_turn_get_p95 + severity: warning + thresholdSeconds: 5 + minSamples: 1 + for: 10m + - id: AgentRunResultSlowLongTrace + ruleId: agentrun_result_p95 + severity: warning + thresholdSeconds: 2.5 + minSamples: 1 + for: 10m + - id: WorkbenchProjectorNoProgress + ruleId: workbench_projector_batch_p95 + severity: warning + thresholdSeconds: 10 + minSamples: 1 + for: 10m runtimeImageRewrites: - source: fatedier/frpc:v0.68.1 target: 127.0.0.1:5000/hwlab/frpc:v0.68.1 diff --git a/scripts/src/hwlab-node-impl.ts b/scripts/src/hwlab-node-impl.ts index d6bdd898..22074031 100644 --- a/scripts/src/hwlab-node-impl.ts +++ b/scripts/src/hwlab-node-impl.ts @@ -852,6 +852,14 @@ function nodeObservabilityMetricsProbe(options: NodeObservabilityOptions, podNam { id: "workbench_journey", metric: "hwlab_workbench_journey_duration_seconds", kind: "workbench_journey", dimensionLabels: ["namespace", "gitops_target", "journey", "route", "backend", "transport", "target_state", "cache", "source", "entry", "outcome"] }, { id: "workbench_event_phase", metric: "hwlab_workbench_event_phase_duration_seconds", kind: "workbench_event_phase", dimensionLabels: ["namespace", "gitops_target", "phase", "event_type", "backend", "transport", "outcome"] }, { id: "workbench_backend_event_visible", metric: "hwlab_workbench_backend_event_visible_latency_seconds", kind: "workbench_backend_event_visible", dimensionLabels: ["namespace", "gitops_target", "event_type", "backend", "transport", "outcome"] }, + { id: "workbench_projection_lag_events", metric: "hwlab_workbench_projection_lag_events", kind: "workbench_projection_lag_events", dimensionLabels: ["namespace", "gitops_target", "node", "lane", "projection_status", "source", "status", "reason"] }, + { id: "workbench_projection_lag_seconds", metric: "hwlab_workbench_projection_lag_seconds", kind: "workbench_projection_lag_seconds", dimensionLabels: ["namespace", "gitops_target", "node", "lane", "projection_status", "source", "status", "reason"] }, + { id: "workbench_terminal_projection_delay", metric: "hwlab_workbench_terminal_projection_delay_seconds", kind: "workbench_terminal_projection_delay", dimensionLabels: ["namespace", "gitops_target", "node", "lane", "projection_status", "source", "status", "reason"] }, + { id: "workbench_turn_get", metric: "hwlab_workbench_turn_get_duration_seconds", kind: "workbench_turn_get", dimensionLabels: ["namespace", "gitops_target", "node", "lane", "route", "status", "degraded_reason"] }, + { id: "agentrun_result_duration", metric: "hwlab_agentrun_result_duration_seconds", kind: "agentrun_result_duration", dimensionLabels: ["namespace", "gitops_target", "node", "lane", "event_count_bucket", "status"] }, + { id: "agentrun_result_pages", metric: "hwlab_agentrun_result_pages_scanned", kind: "agentrun_result_pages", dimensionLabels: ["namespace", "gitops_target", "node", "lane"] }, + { id: "agentrun_result_events", metric: "hwlab_agentrun_result_events_scanned", kind: "agentrun_result_events", dimensionLabels: ["namespace", "gitops_target", "node", "lane"] }, + { id: "workbench_projector_batch", metric: "hwlab_workbench_projector_batch_duration_seconds", kind: "workbench_projector_batch", dimensionLabels: ["namespace", "gitops_target", "node", "lane", "phase", "status"] }, ], }), "utf8").toString("base64"); const source = [ @@ -928,6 +936,11 @@ function nodeObservabilityMetricsProbe(options: NodeObservabilityOptions, podNam " deniedBackendEventLines: compactLines(deniedBackendEventLines),", " backendVisibleCountLines: compactLines(sampleLines.filter((line) => lineMatchesMetric(line, 'hwlab_workbench_backend_event_visible_latency_seconds_count'))),", " phaseCountLines: compactLines(sampleLines.filter((line) => lineMatchesMetric(line, 'hwlab_workbench_event_phase_duration_seconds_count'))),", + " projectionLagCountLines: compactLines(sampleLines.filter((line) => lineMatchesMetric(line, 'hwlab_workbench_projection_lag_events_count') || lineMatchesMetric(line, 'hwlab_workbench_projection_lag_seconds_count'))),", + " turnGetCountLines: compactLines(sampleLines.filter((line) => lineMatchesMetric(line, 'hwlab_workbench_turn_get_duration_seconds_count'))),", + " agentRunResultCountLines: compactLines(sampleLines.filter((line) => lineMatchesMetric(line, 'hwlab_agentrun_result_duration_seconds_count'))),", + " projectorBatchCountLines: compactLines(sampleLines.filter((line) => lineMatchesMetric(line, 'hwlab_workbench_projector_batch_duration_seconds_count'))),", + " projectorLastSuccessLines: compactLines(sampleLines.filter((line) => lineMatchesMetric(line, 'hwlab_workbench_projector_last_success_unixtime'))),", " degradedReason: ready ? undefined : missingSeries.length > 0 ? 'node-observability-required-series-missing' : 'node-observability-denied-backend-label-present',", " };", "}", @@ -1043,6 +1056,11 @@ function summarizeNodeObservabilityMetrics(observability: HwlabRuntimeObservabil deniedBackendEventLines: compactPrometheusLines(deniedBackendEventLines), backendVisibleCountLines: compactPrometheusLines(sampleLines.filter((line) => prometheusLineMatchesMetric(line, "hwlab_workbench_backend_event_visible_latency_seconds_count"))), phaseCountLines: compactPrometheusLines(sampleLines.filter((line) => prometheusLineMatchesMetric(line, "hwlab_workbench_event_phase_duration_seconds_count"))), + projectionLagCountLines: compactPrometheusLines(sampleLines.filter((line) => prometheusLineMatchesMetric(line, "hwlab_workbench_projection_lag_events_count") || prometheusLineMatchesMetric(line, "hwlab_workbench_projection_lag_seconds_count"))), + turnGetCountLines: compactPrometheusLines(sampleLines.filter((line) => prometheusLineMatchesMetric(line, "hwlab_workbench_turn_get_duration_seconds_count"))), + agentRunResultCountLines: compactPrometheusLines(sampleLines.filter((line) => prometheusLineMatchesMetric(line, "hwlab_agentrun_result_duration_seconds_count"))), + projectorBatchCountLines: compactPrometheusLines(sampleLines.filter((line) => prometheusLineMatchesMetric(line, "hwlab_workbench_projector_batch_duration_seconds_count"))), + projectorLastSuccessLines: compactPrometheusLines(sampleLines.filter((line) => prometheusLineMatchesMetric(line, "hwlab_workbench_projector_last_success_unixtime"))), degradedReason: ready ? undefined : missingSeries.length > 0 ? "node-observability-required-series-missing" : "node-observability-denied-backend-label-present", }; }