Merge pull request #1183 from pikasTech/fix/1181-sentinel-health-serving

fix: keep sentinel health serving during backlog alerts
This commit is contained in:
Lyon
2026-06-28 08:40:34 +08:00
committed by GitHub
@@ -282,7 +282,10 @@ export function startWebProbeSentinelHttpService(service: WebProbeSentinelServic
async function sentinelFetch(service: WebProbeSentinelService, request: Request): Promise<Response> {
const url = new URL(request.url);
const pathname = normalizedSentinelRequestPath(service, url.pathname);
if (request.method === "GET" && pathname === "/api/health") return jsonResponse(service.health(), service.health().ok === true ? 200 : 503);
if (request.method === "GET" && pathname === "/api/health") {
const health = service.health();
return jsonResponse(health, health.serving === false ? 503 : 200);
}
if (request.method === "GET" && pathname === "/api/status") return jsonResponse(service.status());
if (request.method === "GET" && pathname === "/api/overview") return jsonResponse(service.overview());
if (request.method === "GET" && pathname === "/api/runs") return jsonResponse(service.dashboardRuns(url));
@@ -415,8 +418,9 @@ function serviceHealth(config: WebProbeSentinelServiceConfig, db: Database, sche
const heartbeatAt = stringOrNull(scheduler.schedulerHeartbeatAt) ?? stringOrNull(readMetadata(db, "scheduler.heartbeat")?.at);
const heartbeatAgeSeconds = heartbeatAt === null ? null : Math.max(0, Math.round((Date.now() - Date.parse(heartbeatAt)) / 1000));
const planned = plannedRunBacklog(config, db);
const schedulerServing = scheduler.schedulerLastError === null && heartbeatAgeSeconds !== null && heartbeatAgeSeconds <= config.schedulerHeartbeatStaleSeconds;
checks.scheduler = {
ok: scheduler.schedulerLastError === null && heartbeatAgeSeconds !== null && heartbeatAgeSeconds <= config.schedulerHeartbeatStaleSeconds && !planned.stale,
ok: schedulerServing && !planned.stale,
enabled: scheduler.schedulerEnabled === true,
active: scheduler.schedulerTimerActive === true,
heartbeatAt,
@@ -438,7 +442,8 @@ function serviceHealth(config: WebProbeSentinelServiceConfig, db: Database, sche
command: `bun scripts/cli.ts web-probe observe analyze --node ${config.node} --lane ${config.lane} --state-dir <stateDir>`,
};
const ok = Object.values(checks).every((check) => check.ok === true);
return { ok, status: ok ? "healthy" : "degraded", node: config.node, lane: config.lane, sentinelId: config.sentinelId, checks, valuesRedacted: true };
const serving = checks.config.ok === true && checks.pvc.ok === true && checks.sqlite.ok === true && schedulerServing;
return { ok, serving, status: ok ? "healthy" : serving ? "degraded" : "unavailable", node: config.node, lane: config.lane, sentinelId: config.sentinelId, checks, valuesRedacted: true };
}
function checkWritable(stateRoot: string): Record<string, unknown> {