From 093eefe279638b4221b8d490903df50c55506189 Mon Sep 17 00:00:00 2001 From: Codex Date: Sun, 28 Jun 2026 00:31:39 +0000 Subject: [PATCH] fix: expose d518 sentinel cadence root cause --- config/platform-infra/observability.yaml | 29 +++++++++ .../src/hwlab-node-web-sentinel-service.ts | 60 ++++++++++++++++++- .../platform-infra-observability/options.ts | 16 ++--- 3 files changed, 95 insertions(+), 10 deletions(-) diff --git a/config/platform-infra/observability.yaml b/config/platform-infra/observability.yaml index aed32c0b..0108860b 100644 --- a/config/platform-infra/observability.yaml +++ b/config/platform-infra/observability.yaml @@ -7,6 +7,7 @@ metadata: spec: PJ2026-01060501 OTel追踪 draft-2026-06-19-p0 relatedIssues: - 489 + - 1181 defaults: targetId: D601 @@ -28,6 +29,12 @@ targets: role: active enabled: true createNamespace: true + - id: D518 + route: D518:k3s + namespace: platform-infra + role: active + enabled: true + createNamespace: true collector: deploymentName: otel-collector @@ -76,6 +83,20 @@ instrumentation: - projection_write - trace_events_read - turn_status_read + - serviceName: hwlab-cloud-api + owningRepo: pikasTech/HWLAB + configRefs: + targetNode: config/hwlab-node-lanes.yaml#lanes.v03.targets.D518.node + lane: config/hwlab-node-lanes.yaml#lanes.v03.version + namespace: config/hwlab-node-lanes.yaml#lanes.v03.targets.D518.runtime.namespace + requiredSpans: + - POST /v1/agent/chat + - durable_admission + - billing_preflight + - agentrun_dispatch + - projection_write + - trace_events_read + - turn_status_read - serviceName: user-billing owningRepo: pikasTech/HWLAB configRefs: @@ -84,6 +105,14 @@ instrumentation: namespace: config/hwlab-node-lanes.yaml#lanes.v03.targets.D601.runtime.namespace requiredSpans: - billing_preflight + - serviceName: user-billing + owningRepo: pikasTech/HWLAB + configRefs: + targetNode: config/hwlab-node-lanes.yaml#lanes.v03.targets.D518.node + lane: config/hwlab-node-lanes.yaml#lanes.v03.version + namespace: config/hwlab-node-lanes.yaml#lanes.v03.targets.D518.runtime.namespace + requiredSpans: + - billing_preflight - serviceName: agentrun-manager owningRepo: pikasTech/agentrun configRefs: diff --git a/scripts/src/hwlab-node-web-sentinel-service.ts b/scripts/src/hwlab-node-web-sentinel-service.ts index d867af50..c2ab84fb 100644 --- a/scripts/src/hwlab-node-web-sentinel-service.ts +++ b/scripts/src/hwlab-node-web-sentinel-service.ts @@ -414,14 +414,23 @@ function serviceHealth(config: WebProbeSentinelServiceConfig, db: Database, sche checks.sqlite = checkSqlite(db); const heartbeatAt = stringOrNull(scheduler.schedulerHeartbeatAt) ?? stringOrNull(readMetadata(db, "scheduler.heartbeat")?.at); const heartbeatAgeSeconds = heartbeatAt === null ? null : Math.max(0, Math.round((Date.now() - Date.parse(heartbeatAt)) / 1000)); + const planned = plannedRunBacklog(config, db); checks.scheduler = { - ok: scheduler.schedulerLastError === null && heartbeatAgeSeconds !== null && heartbeatAgeSeconds <= config.schedulerHeartbeatStaleSeconds, + ok: scheduler.schedulerLastError === null && heartbeatAgeSeconds !== null && heartbeatAgeSeconds <= config.schedulerHeartbeatStaleSeconds && !planned.stale, enabled: scheduler.schedulerEnabled === true, active: scheduler.schedulerTimerActive === true, heartbeatAt, heartbeatAgeSeconds, staleAfterSeconds: config.schedulerHeartbeatStaleSeconds, lastError: scheduler.schedulerLastError, + plannedRuns: planned.count, + oldestPlannedRunId: planned.oldestRunId, + oldestPlannedRunScenarioId: planned.oldestScenarioId, + oldestPlannedRunCreatedAt: planned.oldestCreatedAt, + oldestPlannedRunAgeSeconds: planned.oldestAgeSeconds, + plannedRunStaleAfterSeconds: planned.staleAfterSeconds, + plannedRunStale: planned.stale, + rootCause: planned.stale ? "planned-run-not-consumed-by-host-cadence" : null, }; checks.analyzer = { ok: true, @@ -534,12 +543,20 @@ function appendObserveCommandArgs(argv: string[], item: Record, } function schedulerSummary(config: WebProbeSentinelServiceConfig, db: Database): Record { + const planned = plannedRunBacklog(config, db); return { enabledScenarios: config.scenarios.filter((item) => boolAt(item, "enabled")).map((item) => stringAt(item, "id")), intervalMs: config.schedulerIntervalMs, maxConcurrentRuns: config.maxConcurrentRuns, activeRuns: countWhere(db, "status IN ('queued', 'running', 'analyzing')"), - plannedRuns: countWhere(db, "status = 'planned'"), + plannedRuns: planned.count, + oldestPlannedRunId: planned.oldestRunId, + oldestPlannedRunScenarioId: planned.oldestScenarioId, + oldestPlannedRunCreatedAt: planned.oldestCreatedAt, + oldestPlannedRunAgeSeconds: planned.oldestAgeSeconds, + plannedRunStaleAfterSeconds: planned.staleAfterSeconds, + plannedRunStale: planned.stale, + rootCause: planned.stale ? "planned-run-not-consumed-by-host-cadence" : null, heartbeat: readMetadata(db, "scheduler.heartbeat"), valuesRedacted: true, }; @@ -573,10 +590,42 @@ function renderMetrics(config: WebProbeSentinelServiceConfig, db: Database, heal "# HELP web_probe_sentinel_scheduler_heartbeat_age_seconds Scheduler heartbeat age.", "# TYPE web_probe_sentinel_scheduler_heartbeat_age_seconds gauge", `web_probe_sentinel_scheduler_heartbeat_age_seconds{${labels}} ${heartbeatAge}`, + "# HELP web_probe_sentinel_planned_runs Planned runs waiting for host cadence execution.", + "# TYPE web_probe_sentinel_planned_runs gauge", + `web_probe_sentinel_planned_runs{${labels}} ${countWhere(db, "status = 'planned'")}`, + "# HELP web_probe_sentinel_oldest_planned_run_age_seconds Oldest planned run age, or -1 when no planned run exists.", + "# TYPE web_probe_sentinel_oldest_planned_run_age_seconds gauge", + `web_probe_sentinel_oldest_planned_run_age_seconds{${labels}} ${plannedRunBacklog(config, db).oldestAgeSeconds ?? -1}`, ]; return `${lines.join("\n")}\n`; } +function plannedRunBacklog(config: WebProbeSentinelServiceConfig, db: Database): { + readonly count: number; + readonly oldestRunId: string | null; + readonly oldestScenarioId: string | null; + readonly oldestCreatedAt: string | null; + readonly oldestAgeSeconds: number | null; + readonly staleAfterSeconds: number; + readonly stale: boolean; +} { + const count = countWhere(db, "status = 'planned'"); + const oldest = db.query("SELECT id, scenario_id, created_at FROM runs WHERE status = 'planned' ORDER BY created_at ASC LIMIT 1") + .get() as Record | null; + const oldestCreatedAt = stringOrNull(oldest?.created_at); + const oldestAgeSeconds = oldestCreatedAt === null ? null : ageSeconds(oldestCreatedAt); + const staleAfterSeconds = Math.max(60, Math.round(config.schedulerIntervalMs / 1000)); + return { + count, + oldestRunId: stringOrNull(oldest?.id), + oldestScenarioId: stringOrNull(oldest?.scenario_id), + oldestCreatedAt, + oldestAgeSeconds, + staleAfterSeconds, + stale: oldestAgeSeconds !== null && oldestAgeSeconds >= staleAfterSeconds, + }; +} + function readConfigRefTarget(ref: string): unknown { const [file, path] = ref.split("#"); if (file === undefined || path === undefined) throw new Error(`invalid configRef: ${ref}`); @@ -634,7 +683,7 @@ function dashboardOverview(config: WebProbeSentinelServiceConfig, db: Database, targetValidation: { scenarioId: stringOrNull(record(config.cicd.targetValidation).scenarioId), maxSeconds: numberOr(record(config.cicd.targetValidation).maxSeconds, 120), - sourceRef: "config/hwlab-web-probe-sentinel/cicd.d601-v03.yaml#sentinel.cicd.targetValidation", + sourceRef: targetValidationSourceRef(config), }, traceability: { source: "sqlite-index+run-report-metadata", @@ -645,6 +694,11 @@ function dashboardOverview(config: WebProbeSentinelServiceConfig, db: Database, }; } +function targetValidationSourceRef(config: WebProbeSentinelServiceConfig): string | null { + const cicdRef = config.plan.refs.find((item) => item.key === "cicd")?.ref ?? null; + return cicdRef === null ? null : `${cicdRef}.targetValidation`; +} + function dashboardRunList(config: WebProbeSentinelServiceConfig, db: Database, url: URL): Record { const filters = dashboardRunFilters(url); const page = dashboardPage(url, config); diff --git a/scripts/src/platform-infra-observability/options.ts b/scripts/src/platform-infra-observability/options.ts index 656c689e..67c7ec09 100644 --- a/scripts/src/platform-infra-observability/options.ts +++ b/scripts/src/platform-infra-observability/options.ts @@ -31,15 +31,17 @@ export function observabilityHelp(): Record { spec: "PJ2026-01060501 OTel追踪 draft-2026-06-19-p0", usage: [ "bun scripts/cli.ts platform-infra observability plan --target D601", + "bun scripts/cli.ts platform-infra observability plan --target D518", "bun scripts/cli.ts platform-infra observability apply --target D601 --dry-run", - "bun scripts/cli.ts platform-infra observability apply --target D601 --confirm", + "bun scripts/cli.ts platform-infra observability apply --target D518 --confirm", "bun scripts/cli.ts platform-infra observability status --target D601 [--full|--raw]", - "bun scripts/cli.ts platform-infra observability validate --target D601 [--full|--raw]", - "bun scripts/cli.ts platform-infra observability trace --target D601 --trace-id [--grep provider-stream-disconnected] [--limit 40] [--full|--raw]", - "bun scripts/cli.ts platform-infra observability search --target D601 --grep 'no rollout found' [--lookback-minutes 360] [--candidate-limit 80] [--limit 20] [--full|--raw]", - "bun scripts/cli.ts platform-infra observability search --target D601 --path /v1/workbench/sessions --status 502 [--lookback-minutes 120] [--full|--raw]", - "bun scripts/cli.ts platform-infra observability diagnose-code-agent --target D601 --business-trace-id [--full|--raw]", - "bun scripts/cli.ts platform-infra observability diagnose-code-agent --target D601 --run-id [--command-id ] [--runner-job-id ] [--full|--raw]", + "bun scripts/cli.ts platform-infra observability status --target D518 [--full|--raw]", + "bun scripts/cli.ts platform-infra observability validate --target D518 [--full|--raw]", + "bun scripts/cli.ts platform-infra observability trace --target D518 --trace-id [--grep provider-stream-disconnected] [--limit 40] [--full|--raw]", + "bun scripts/cli.ts platform-infra observability search --target D518 --grep 'no rollout found' [--lookback-minutes 360] [--candidate-limit 80] [--limit 20] [--full|--raw]", + "bun scripts/cli.ts platform-infra observability search --target D518 --path /v1/workbench/sessions --status 502 [--lookback-minutes 120] [--full|--raw]", + "bun scripts/cli.ts platform-infra observability diagnose-code-agent --target D518 --business-trace-id [--full|--raw]", + "bun scripts/cli.ts platform-infra observability diagnose-code-agent --target D518 --run-id [--command-id ] [--runner-job-id ] [--full|--raw]", ], boundary: "Prometheus remains the metrics source; this command owns only platform-infra OTel Collector, trace backend readiness, and trace lookup.", };