Merge pull request #1182 from pikasTech/fix/1181-d518-sentinel-cadence

fix: expose D518 sentinel cadence root cause
This commit is contained in:
Lyon
2026-06-28 08:32:25 +08:00
committed by GitHub
3 changed files with 95 additions and 10 deletions
+29
View File
@@ -7,6 +7,7 @@ metadata:
spec: PJ2026-01060501 OTel追踪 draft-2026-06-19-p0
relatedIssues:
- 489
- 1181
defaults:
targetId: D601
@@ -28,6 +29,12 @@ targets:
role: active
enabled: true
createNamespace: true
- id: D518
route: D518:k3s
namespace: platform-infra
role: active
enabled: true
createNamespace: true
collector:
deploymentName: otel-collector
@@ -76,6 +83,20 @@ instrumentation:
- projection_write
- trace_events_read
- turn_status_read
- serviceName: hwlab-cloud-api
owningRepo: pikasTech/HWLAB
configRefs:
targetNode: config/hwlab-node-lanes.yaml#lanes.v03.targets.D518.node
lane: config/hwlab-node-lanes.yaml#lanes.v03.version
namespace: config/hwlab-node-lanes.yaml#lanes.v03.targets.D518.runtime.namespace
requiredSpans:
- POST /v1/agent/chat
- durable_admission
- billing_preflight
- agentrun_dispatch
- projection_write
- trace_events_read
- turn_status_read
- serviceName: user-billing
owningRepo: pikasTech/HWLAB
configRefs:
@@ -84,6 +105,14 @@ instrumentation:
namespace: config/hwlab-node-lanes.yaml#lanes.v03.targets.D601.runtime.namespace
requiredSpans:
- billing_preflight
- serviceName: user-billing
owningRepo: pikasTech/HWLAB
configRefs:
targetNode: config/hwlab-node-lanes.yaml#lanes.v03.targets.D518.node
lane: config/hwlab-node-lanes.yaml#lanes.v03.version
namespace: config/hwlab-node-lanes.yaml#lanes.v03.targets.D518.runtime.namespace
requiredSpans:
- billing_preflight
- serviceName: agentrun-manager
owningRepo: pikasTech/agentrun
configRefs:
+57 -3
View File
@@ -414,14 +414,23 @@ function serviceHealth(config: WebProbeSentinelServiceConfig, db: Database, sche
checks.sqlite = checkSqlite(db);
const heartbeatAt = stringOrNull(scheduler.schedulerHeartbeatAt) ?? stringOrNull(readMetadata(db, "scheduler.heartbeat")?.at);
const heartbeatAgeSeconds = heartbeatAt === null ? null : Math.max(0, Math.round((Date.now() - Date.parse(heartbeatAt)) / 1000));
const planned = plannedRunBacklog(config, db);
checks.scheduler = {
ok: scheduler.schedulerLastError === null && heartbeatAgeSeconds !== null && heartbeatAgeSeconds <= config.schedulerHeartbeatStaleSeconds,
ok: scheduler.schedulerLastError === null && heartbeatAgeSeconds !== null && heartbeatAgeSeconds <= config.schedulerHeartbeatStaleSeconds && !planned.stale,
enabled: scheduler.schedulerEnabled === true,
active: scheduler.schedulerTimerActive === true,
heartbeatAt,
heartbeatAgeSeconds,
staleAfterSeconds: config.schedulerHeartbeatStaleSeconds,
lastError: scheduler.schedulerLastError,
plannedRuns: planned.count,
oldestPlannedRunId: planned.oldestRunId,
oldestPlannedRunScenarioId: planned.oldestScenarioId,
oldestPlannedRunCreatedAt: planned.oldestCreatedAt,
oldestPlannedRunAgeSeconds: planned.oldestAgeSeconds,
plannedRunStaleAfterSeconds: planned.staleAfterSeconds,
plannedRunStale: planned.stale,
rootCause: planned.stale ? "planned-run-not-consumed-by-host-cadence" : null,
};
checks.analyzer = {
ok: true,
@@ -534,12 +543,20 @@ function appendObserveCommandArgs(argv: string[], item: Record<string, unknown>,
}
function schedulerSummary(config: WebProbeSentinelServiceConfig, db: Database): Record<string, unknown> {
const planned = plannedRunBacklog(config, db);
return {
enabledScenarios: config.scenarios.filter((item) => boolAt(item, "enabled")).map((item) => stringAt(item, "id")),
intervalMs: config.schedulerIntervalMs,
maxConcurrentRuns: config.maxConcurrentRuns,
activeRuns: countWhere(db, "status IN ('queued', 'running', 'analyzing')"),
plannedRuns: countWhere(db, "status = 'planned'"),
plannedRuns: planned.count,
oldestPlannedRunId: planned.oldestRunId,
oldestPlannedRunScenarioId: planned.oldestScenarioId,
oldestPlannedRunCreatedAt: planned.oldestCreatedAt,
oldestPlannedRunAgeSeconds: planned.oldestAgeSeconds,
plannedRunStaleAfterSeconds: planned.staleAfterSeconds,
plannedRunStale: planned.stale,
rootCause: planned.stale ? "planned-run-not-consumed-by-host-cadence" : null,
heartbeat: readMetadata(db, "scheduler.heartbeat"),
valuesRedacted: true,
};
@@ -573,10 +590,42 @@ function renderMetrics(config: WebProbeSentinelServiceConfig, db: Database, heal
"# HELP web_probe_sentinel_scheduler_heartbeat_age_seconds Scheduler heartbeat age.",
"# TYPE web_probe_sentinel_scheduler_heartbeat_age_seconds gauge",
`web_probe_sentinel_scheduler_heartbeat_age_seconds{${labels}} ${heartbeatAge}`,
"# HELP web_probe_sentinel_planned_runs Planned runs waiting for host cadence execution.",
"# TYPE web_probe_sentinel_planned_runs gauge",
`web_probe_sentinel_planned_runs{${labels}} ${countWhere(db, "status = 'planned'")}`,
"# HELP web_probe_sentinel_oldest_planned_run_age_seconds Oldest planned run age, or -1 when no planned run exists.",
"# TYPE web_probe_sentinel_oldest_planned_run_age_seconds gauge",
`web_probe_sentinel_oldest_planned_run_age_seconds{${labels}} ${plannedRunBacklog(config, db).oldestAgeSeconds ?? -1}`,
];
return `${lines.join("\n")}\n`;
}
/**
 * Summarizes the backlog of runs whose status is still `planned`.
 *
 * Issues two reads against the `runs` table: a count of planned rows (via `countWhere`) and a lookup of the
 * single oldest planned row ordered by `created_at`.
 *
 * @param config Service configuration; only `schedulerIntervalMs` is read here.
 * @param db Database handle holding the `runs` table.
 * @returns The planned-run count, the identity/creation time/age of the oldest planned run (all `null` when no
 *   planned run exists), the staleness threshold in seconds, and whether the oldest run has reached it.
 */
function plannedRunBacklog(config: WebProbeSentinelServiceConfig, db: Database): {
  readonly count: number;
  readonly oldestRunId: string | null;
  readonly oldestScenarioId: string | null;
  readonly oldestCreatedAt: string | null;
  readonly oldestAgeSeconds: number | null;
  readonly staleAfterSeconds: number;
  readonly stale: boolean;
} {
  const count = countWhere(db, "status = 'planned'");
  // Tie-break on id so the reported "oldest" run is deterministic when several planned runs share created_at.
  const oldest = db.query("SELECT id, scenario_id, created_at FROM runs WHERE status = 'planned' ORDER BY created_at ASC, id ASC LIMIT 1")
    .get() as Record<string, unknown> | null;
  const oldestCreatedAt = stringOrNull(oldest?.created_at);
  // NOTE(review): ageSeconds is defined elsewhere; presumably elapsed seconds since the timestamp — confirm.
  const oldestAgeSeconds = oldestCreatedAt === null ? null : ageSeconds(oldestCreatedAt);
  // The threshold is one scheduler interval, floored at 60 seconds. A non-finite interval would otherwise
  // propagate through Math.max as NaN and make every `>=` comparison false, silently disabling stale detection.
  const intervalSeconds = Math.round(config.schedulerIntervalMs / 1000);
  const staleAfterSeconds = Number.isFinite(intervalSeconds) ? Math.max(60, intervalSeconds) : 60;
  return {
    count,
    oldestRunId: stringOrNull(oldest?.id),
    oldestScenarioId: stringOrNull(oldest?.scenario_id),
    oldestCreatedAt,
    oldestAgeSeconds,
    staleAfterSeconds,
    stale: oldestAgeSeconds !== null && oldestAgeSeconds >= staleAfterSeconds,
  };
}
function readConfigRefTarget(ref: string): unknown {
const [file, path] = ref.split("#");
if (file === undefined || path === undefined) throw new Error(`invalid configRef: ${ref}`);
@@ -634,7 +683,7 @@ function dashboardOverview(config: WebProbeSentinelServiceConfig, db: Database,
targetValidation: {
scenarioId: stringOrNull(record(config.cicd.targetValidation).scenarioId),
maxSeconds: numberOr(record(config.cicd.targetValidation).maxSeconds, 120),
sourceRef: "config/hwlab-web-probe-sentinel/cicd.d601-v03.yaml#sentinel.cicd.targetValidation",
sourceRef: targetValidationSourceRef(config),
},
traceability: {
source: "sqlite-index+run-report-metadata",
@@ -645,6 +694,11 @@ function dashboardOverview(config: WebProbeSentinelServiceConfig, db: Database,
};
}
/**
 * Derives the source reference for the target-validation settings from the plan's `cicd` ref.
 *
 * @param config Service configuration; `config.plan.refs` is searched for the entry whose `key` is `"cicd"`.
 * @returns The `cicd` ref with `.targetValidation` appended, or `null` when no such entry exists or its
 *   `ref` is null/undefined.
 */
function targetValidationSourceRef(config: WebProbeSentinelServiceConfig): string | null {
  const cicdEntry = config.plan.refs.find((entry) => entry.key === "cicd");
  const baseRef = cicdEntry?.ref ?? null;
  if (baseRef === null) {
    return null;
  }
  return `${baseRef}.targetValidation`;
}
function dashboardRunList(config: WebProbeSentinelServiceConfig, db: Database, url: URL): Record<string, unknown> {
const filters = dashboardRunFilters(url);
const page = dashboardPage(url, config);
@@ -31,15 +31,17 @@ export function observabilityHelp(): Record<string, unknown> {
spec: "PJ2026-01060501 OTel追踪 draft-2026-06-19-p0",
usage: [
"bun scripts/cli.ts platform-infra observability plan --target D601",
"bun scripts/cli.ts platform-infra observability plan --target D518",
"bun scripts/cli.ts platform-infra observability apply --target D601 --dry-run",
"bun scripts/cli.ts platform-infra observability apply --target D601 --confirm",
"bun scripts/cli.ts platform-infra observability apply --target D518 --confirm",
"bun scripts/cli.ts platform-infra observability status --target D601 [--full|--raw]",
"bun scripts/cli.ts platform-infra observability validate --target D601 [--full|--raw]",
"bun scripts/cli.ts platform-infra observability trace --target D601 --trace-id <traceId> [--grep provider-stream-disconnected] [--limit 40] [--full|--raw]",
"bun scripts/cli.ts platform-infra observability search --target D601 --grep 'no rollout found' [--lookback-minutes 360] [--candidate-limit 80] [--limit 20] [--full|--raw]",
"bun scripts/cli.ts platform-infra observability search --target D601 --path /v1/workbench/sessions --status 502 [--lookback-minutes 120] [--full|--raw]",
"bun scripts/cli.ts platform-infra observability diagnose-code-agent --target D601 --business-trace-id <trc_...> [--full|--raw]",
"bun scripts/cli.ts platform-infra observability diagnose-code-agent --target D601 --run-id <run_...> [--command-id <cmd_...>] [--runner-job-id <rjob_...>] [--full|--raw]",
"bun scripts/cli.ts platform-infra observability status --target D518 [--full|--raw]",
"bun scripts/cli.ts platform-infra observability validate --target D518 [--full|--raw]",
"bun scripts/cli.ts platform-infra observability trace --target D518 --trace-id <traceId> [--grep provider-stream-disconnected] [--limit 40] [--full|--raw]",
"bun scripts/cli.ts platform-infra observability search --target D518 --grep 'no rollout found' [--lookback-minutes 360] [--candidate-limit 80] [--limit 20] [--full|--raw]",
"bun scripts/cli.ts platform-infra observability search --target D518 --path /v1/workbench/sessions --status 502 [--lookback-minutes 120] [--full|--raw]",
"bun scripts/cli.ts platform-infra observability diagnose-code-agent --target D518 --business-trace-id <trc_...> [--full|--raw]",
"bun scripts/cli.ts platform-infra observability diagnose-code-agent --target D518 --run-id <run_...> [--command-id <cmd_...>] [--runner-job-id <rjob_...>] [--full|--raw]",
],
boundary: "Prometheus remains the metrics source; this command owns only platform-infra OTel Collector, trace backend readiness, and trace lookup.",
};