Merge pull request #1182 from pikasTech/fix/1181-d518-sentinel-cadence
fix: expose D518 sentinel cadence root cause
This commit is contained in:
@@ -7,6 +7,7 @@ metadata:
|
||||
spec: PJ2026-01060501 OTel追踪 draft-2026-06-19-p0
|
||||
relatedIssues:
|
||||
- 489
|
||||
- 1181
|
||||
|
||||
defaults:
|
||||
targetId: D601
|
||||
@@ -28,6 +29,12 @@ targets:
|
||||
role: active
|
||||
enabled: true
|
||||
createNamespace: true
|
||||
- id: D518
|
||||
route: D518:k3s
|
||||
namespace: platform-infra
|
||||
role: active
|
||||
enabled: true
|
||||
createNamespace: true
|
||||
|
||||
collector:
|
||||
deploymentName: otel-collector
|
||||
@@ -76,6 +83,20 @@ instrumentation:
|
||||
- projection_write
|
||||
- trace_events_read
|
||||
- turn_status_read
|
||||
- serviceName: hwlab-cloud-api
|
||||
owningRepo: pikasTech/HWLAB
|
||||
configRefs:
|
||||
targetNode: config/hwlab-node-lanes.yaml#lanes.v03.targets.D518.node
|
||||
lane: config/hwlab-node-lanes.yaml#lanes.v03.version
|
||||
namespace: config/hwlab-node-lanes.yaml#lanes.v03.targets.D518.runtime.namespace
|
||||
requiredSpans:
|
||||
- POST /v1/agent/chat
|
||||
- durable_admission
|
||||
- billing_preflight
|
||||
- agentrun_dispatch
|
||||
- projection_write
|
||||
- trace_events_read
|
||||
- turn_status_read
|
||||
- serviceName: user-billing
|
||||
owningRepo: pikasTech/HWLAB
|
||||
configRefs:
|
||||
@@ -84,6 +105,14 @@ instrumentation:
|
||||
namespace: config/hwlab-node-lanes.yaml#lanes.v03.targets.D601.runtime.namespace
|
||||
requiredSpans:
|
||||
- billing_preflight
|
||||
- serviceName: user-billing
|
||||
owningRepo: pikasTech/HWLAB
|
||||
configRefs:
|
||||
targetNode: config/hwlab-node-lanes.yaml#lanes.v03.targets.D518.node
|
||||
lane: config/hwlab-node-lanes.yaml#lanes.v03.version
|
||||
namespace: config/hwlab-node-lanes.yaml#lanes.v03.targets.D518.runtime.namespace
|
||||
requiredSpans:
|
||||
- billing_preflight
|
||||
- serviceName: agentrun-manager
|
||||
owningRepo: pikasTech/agentrun
|
||||
configRefs:
|
||||
|
||||
@@ -414,14 +414,23 @@ function serviceHealth(config: WebProbeSentinelServiceConfig, db: Database, sche
|
||||
checks.sqlite = checkSqlite(db);
|
||||
const heartbeatAt = stringOrNull(scheduler.schedulerHeartbeatAt) ?? stringOrNull(readMetadata(db, "scheduler.heartbeat")?.at);
|
||||
const heartbeatAgeSeconds = heartbeatAt === null ? null : Math.max(0, Math.round((Date.now() - Date.parse(heartbeatAt)) / 1000));
|
||||
const planned = plannedRunBacklog(config, db);
|
||||
checks.scheduler = {
|
||||
ok: scheduler.schedulerLastError === null && heartbeatAgeSeconds !== null && heartbeatAgeSeconds <= config.schedulerHeartbeatStaleSeconds,
|
||||
ok: scheduler.schedulerLastError === null && heartbeatAgeSeconds !== null && heartbeatAgeSeconds <= config.schedulerHeartbeatStaleSeconds && !planned.stale,
|
||||
enabled: scheduler.schedulerEnabled === true,
|
||||
active: scheduler.schedulerTimerActive === true,
|
||||
heartbeatAt,
|
||||
heartbeatAgeSeconds,
|
||||
staleAfterSeconds: config.schedulerHeartbeatStaleSeconds,
|
||||
lastError: scheduler.schedulerLastError,
|
||||
plannedRuns: planned.count,
|
||||
oldestPlannedRunId: planned.oldestRunId,
|
||||
oldestPlannedRunScenarioId: planned.oldestScenarioId,
|
||||
oldestPlannedRunCreatedAt: planned.oldestCreatedAt,
|
||||
oldestPlannedRunAgeSeconds: planned.oldestAgeSeconds,
|
||||
plannedRunStaleAfterSeconds: planned.staleAfterSeconds,
|
||||
plannedRunStale: planned.stale,
|
||||
rootCause: planned.stale ? "planned-run-not-consumed-by-host-cadence" : null,
|
||||
};
|
||||
checks.analyzer = {
|
||||
ok: true,
|
||||
@@ -534,12 +543,20 @@ function appendObserveCommandArgs(argv: string[], item: Record<string, unknown>,
|
||||
}
|
||||
|
||||
function schedulerSummary(config: WebProbeSentinelServiceConfig, db: Database): Record<string, unknown> {
|
||||
const planned = plannedRunBacklog(config, db);
|
||||
return {
|
||||
enabledScenarios: config.scenarios.filter((item) => boolAt(item, "enabled")).map((item) => stringAt(item, "id")),
|
||||
intervalMs: config.schedulerIntervalMs,
|
||||
maxConcurrentRuns: config.maxConcurrentRuns,
|
||||
activeRuns: countWhere(db, "status IN ('queued', 'running', 'analyzing')"),
|
||||
plannedRuns: countWhere(db, "status = 'planned'"),
|
||||
plannedRuns: planned.count,
|
||||
oldestPlannedRunId: planned.oldestRunId,
|
||||
oldestPlannedRunScenarioId: planned.oldestScenarioId,
|
||||
oldestPlannedRunCreatedAt: planned.oldestCreatedAt,
|
||||
oldestPlannedRunAgeSeconds: planned.oldestAgeSeconds,
|
||||
plannedRunStaleAfterSeconds: planned.staleAfterSeconds,
|
||||
plannedRunStale: planned.stale,
|
||||
rootCause: planned.stale ? "planned-run-not-consumed-by-host-cadence" : null,
|
||||
heartbeat: readMetadata(db, "scheduler.heartbeat"),
|
||||
valuesRedacted: true,
|
||||
};
|
||||
@@ -573,10 +590,42 @@ function renderMetrics(config: WebProbeSentinelServiceConfig, db: Database, heal
|
||||
"# HELP web_probe_sentinel_scheduler_heartbeat_age_seconds Scheduler heartbeat age.",
|
||||
"# TYPE web_probe_sentinel_scheduler_heartbeat_age_seconds gauge",
|
||||
`web_probe_sentinel_scheduler_heartbeat_age_seconds{${labels}} ${heartbeatAge}`,
|
||||
"# HELP web_probe_sentinel_planned_runs Planned runs waiting for host cadence execution.",
|
||||
"# TYPE web_probe_sentinel_planned_runs gauge",
|
||||
`web_probe_sentinel_planned_runs{${labels}} ${countWhere(db, "status = 'planned'")}`,
|
||||
"# HELP web_probe_sentinel_oldest_planned_run_age_seconds Oldest planned run age, or -1 when no planned run exists.",
|
||||
"# TYPE web_probe_sentinel_oldest_planned_run_age_seconds gauge",
|
||||
`web_probe_sentinel_oldest_planned_run_age_seconds{${labels}} ${plannedRunBacklog(config, db).oldestAgeSeconds ?? -1}`,
|
||||
];
|
||||
return `${lines.join("\n")}\n`;
|
||||
}
|
||||
|
||||
/**
 * Summarizes the backlog of runs whose status is still 'planned'.
 *
 * Reads the number of planned runs and the planned row with the earliest
 * `created_at`, then derives that row's age (via `ageSeconds`) and whether it
 * has reached the staleness threshold.
 *
 * @param config Service configuration; only `schedulerIntervalMs` is read here.
 * @param db Database handle used for the count and the oldest-row lookup.
 * @returns The backlog snapshot. All `oldest*` fields are `null` when the
 *   lookup yields no usable values; `stale` is `false` when no age is known.
 */
function plannedRunBacklog(config: WebProbeSentinelServiceConfig, db: Database): {
  readonly count: number;
  readonly oldestRunId: string | null;
  readonly oldestScenarioId: string | null;
  readonly oldestCreatedAt: string | null;
  readonly oldestAgeSeconds: number | null;
  readonly staleAfterSeconds: number;
  readonly stale: boolean;
} {
  const plannedTotal = countWhere(db, "status = 'planned'");

  const oldestStatement = db.query("SELECT id, scenario_id, created_at FROM runs WHERE status = 'planned' ORDER BY created_at ASC LIMIT 1");
  const oldestRow = oldestStatement.get() as Record<string, unknown> | null;

  const createdAt = stringOrNull(oldestRow?.created_at);
  let age: number | null = null;
  if (createdAt !== null) {
    age = ageSeconds(createdAt);
  }

  // The threshold is the scheduler interval in whole seconds, but never less than 60 seconds.
  const intervalSeconds = Math.round(config.schedulerIntervalMs / 1000);
  const threshold = Math.max(60, intervalSeconds);

  return {
    count: plannedTotal,
    oldestRunId: stringOrNull(oldestRow?.id),
    oldestScenarioId: stringOrNull(oldestRow?.scenario_id),
    oldestCreatedAt: createdAt,
    oldestAgeSeconds: age,
    staleAfterSeconds: threshold,
    stale: age === null ? false : age >= threshold,
  };
}
|
||||
|
||||
function readConfigRefTarget(ref: string): unknown {
|
||||
const [file, path] = ref.split("#");
|
||||
if (file === undefined || path === undefined) throw new Error(`invalid configRef: ${ref}`);
|
||||
@@ -634,7 +683,7 @@ function dashboardOverview(config: WebProbeSentinelServiceConfig, db: Database,
|
||||
targetValidation: {
|
||||
scenarioId: stringOrNull(record(config.cicd.targetValidation).scenarioId),
|
||||
maxSeconds: numberOr(record(config.cicd.targetValidation).maxSeconds, 120),
|
||||
sourceRef: "config/hwlab-web-probe-sentinel/cicd.d601-v03.yaml#sentinel.cicd.targetValidation",
|
||||
sourceRef: targetValidationSourceRef(config),
|
||||
},
|
||||
traceability: {
|
||||
source: "sqlite-index+run-report-metadata",
|
||||
@@ -645,6 +694,11 @@ function dashboardOverview(config: WebProbeSentinelServiceConfig, db: Database,
|
||||
};
|
||||
}
|
||||
|
||||
/**
 * Builds the source reference string for the target-validation settings.
 *
 * Looks up the entry in `config.plan.refs` whose `key` is "cicd" and appends
 * `.targetValidation` to its `ref`.
 *
 * @param config Service configuration; only `plan.refs` is read here.
 * @returns The derived reference, or `null` when there is no "cicd" entry or
 *   its `ref` is null/undefined.
 */
function targetValidationSourceRef(config: WebProbeSentinelServiceConfig): string | null {
  const cicdEntry = config.plan.refs.find((entry) => entry.key === "cicd");
  const baseRef = cicdEntry?.ref ?? null;
  if (baseRef === null) {
    return null;
  }
  return `${baseRef}.targetValidation`;
}
|
||||
|
||||
function dashboardRunList(config: WebProbeSentinelServiceConfig, db: Database, url: URL): Record<string, unknown> {
|
||||
const filters = dashboardRunFilters(url);
|
||||
const page = dashboardPage(url, config);
|
||||
|
||||
@@ -31,15 +31,17 @@ export function observabilityHelp(): Record<string, unknown> {
|
||||
spec: "PJ2026-01060501 OTel追踪 draft-2026-06-19-p0",
|
||||
usage: [
|
||||
"bun scripts/cli.ts platform-infra observability plan --target D601",
|
||||
"bun scripts/cli.ts platform-infra observability plan --target D518",
|
||||
"bun scripts/cli.ts platform-infra observability apply --target D601 --dry-run",
|
||||
"bun scripts/cli.ts platform-infra observability apply --target D601 --confirm",
|
||||
"bun scripts/cli.ts platform-infra observability apply --target D518 --confirm",
|
||||
"bun scripts/cli.ts platform-infra observability status --target D601 [--full|--raw]",
|
||||
"bun scripts/cli.ts platform-infra observability validate --target D601 [--full|--raw]",
|
||||
"bun scripts/cli.ts platform-infra observability trace --target D601 --trace-id <traceId> [--grep provider-stream-disconnected] [--limit 40] [--full|--raw]",
|
||||
"bun scripts/cli.ts platform-infra observability search --target D601 --grep 'no rollout found' [--lookback-minutes 360] [--candidate-limit 80] [--limit 20] [--full|--raw]",
|
||||
"bun scripts/cli.ts platform-infra observability search --target D601 --path /v1/workbench/sessions --status 502 [--lookback-minutes 120] [--full|--raw]",
|
||||
"bun scripts/cli.ts platform-infra observability diagnose-code-agent --target D601 --business-trace-id <trc_...> [--full|--raw]",
|
||||
"bun scripts/cli.ts platform-infra observability diagnose-code-agent --target D601 --run-id <run_...> [--command-id <cmd_...>] [--runner-job-id <rjob_...>] [--full|--raw]",
|
||||
"bun scripts/cli.ts platform-infra observability status --target D518 [--full|--raw]",
|
||||
"bun scripts/cli.ts platform-infra observability validate --target D518 [--full|--raw]",
|
||||
"bun scripts/cli.ts platform-infra observability trace --target D518 --trace-id <traceId> [--grep provider-stream-disconnected] [--limit 40] [--full|--raw]",
|
||||
"bun scripts/cli.ts platform-infra observability search --target D518 --grep 'no rollout found' [--lookback-minutes 360] [--candidate-limit 80] [--limit 20] [--full|--raw]",
|
||||
"bun scripts/cli.ts platform-infra observability search --target D518 --path /v1/workbench/sessions --status 502 [--lookback-minutes 120] [--full|--raw]",
|
||||
"bun scripts/cli.ts platform-infra observability diagnose-code-agent --target D518 --business-trace-id <trc_...> [--full|--raw]",
|
||||
"bun scripts/cli.ts platform-infra observability diagnose-code-agent --target D518 --run-id <run_...> [--command-id <cmd_...>] [--runner-job-id <rjob_...>] [--full|--raw]",
|
||||
],
|
||||
boundary: "Prometheus remains the metrics source; this command owns only platform-infra OTel Collector, trace backend readiness, and trace lookup.",
|
||||
};
|
||||
|
||||
Reference in New Issue
Block a user