fix: restore web sentinel cadence and monitor hover

This commit is contained in:
Codex
2026-06-27 07:28:31 +00:00
parent a02fa6f6e5
commit 9ffb781c8b
5 changed files with 775 additions and 15 deletions
@@ -23,7 +23,7 @@
| Dashboard 实现引用版本 | draft-2026-06-26-p8-web-probe-sentinel-recovery |
| 多实例实现引用版本 | draft-2026-06-26-p9-multi-web-probe-sentinel |
| Monitor Web 聚合实现引用版本 | draft-2026-06-26-p10-monitor-web-aggregation |
| Monitor Web 观察面板治理实现引用版本 | draft-2026-06-27-p11-monitor-web-observability-dashboard |
| Monitor Web 观察面板治理实现引用版本 | draft-2026-06-27-p11-monitor-web-observability-dashboard; draft-2026-06-27-p12-cadence-scheduler-monitor-web |
| 需求规格模板 | [ISO/IEC/IEEE 29148 需求规格模板](../../templates/iso-iec-ieee-29148-requirements-spec-template.md) |
| 上级规格 | [PJ2026-010605 运维监控](PJ2026-010605-observability-monitoring.md) |
| 关联规格 | [PJ2026-010401 Web工作台](PJ2026-010401-web-workbench.md)、[PJ2026-0104010803 Workbench唯一投影](PJ2026-0104010803-workbench-unique-projection.md)、[PJ2026-010403 API契约](PJ2026-010403-api-contract.md)、[PJ2026-010601 发布流水](PJ2026-010601-controlled-release.md)、[PJ2026-010602 源码同步](PJ2026-010602-source-sync.md)、[PJ2026-010603 YAML运维](PJ2026-010603-yaml-first-ops.md)、[PJ2026-010604 公开入口](PJ2026-010604-public-entry.md)、[PJ2026-01060505 Workbench性能](PJ2026-01060505-workbench-performance.md) |
@@ -639,6 +639,10 @@ P10 dashboard 受控验收必须沉淀为 CLI 入口。`web-probe sentinel dashb
cadence freshness 必须成为 `monitor-web` 的一等状态。每个 sentinel 应显示 YAML expected cadence、scheduler heartbeat age、latest run age、latest analyzed report age、active run、planned/next run 和 stale 倍数。cadence stale 默认是非阻塞告警;只有 scheduler 停摆、run/report 长时间不产生、submit/command 失败、采样样本缺失、或 Code Agent 多轮业务链路不可继续时,才升级为 blocker。面板不得把 timing warning、terminal-boundary elapsed correction 或单纯超时预算告警伪装成业务 blocker。
P12 runner-served-bridge 形态下,sentinel runner Pod 可以只承载 API、PVC/SQLite index、health、metrics 和 dashboard 静态资源;若 Pod 内没有完整 repo 配置、`trans`、Chromium 或 observe 依赖,不得让 Pod 自行 SSH/回调宿主机触发巡检。周期巡检必须由受控宿主控制面调度器读取同一 YAML registry、scenario/workflow cadence、publicExposure 和 targetValidation timeout,按 stale 窗口触发现有 `web-probe sentinel validate --quick-verify --confirm --wait` 路径。该调度器只负责 due 判断、互斥锁、timeout、命令执行和 JSONL 事件日志,不实现第二套采样、analyze、finding 分类或 report 写入。
宿主控制面调度器必须能被 systemd timer 或等价受控入口周期调用,默认 tick 间隔不得替代 YAML cadence;每次 tick 必须输出 sentinel id、cadence、latest run age、due、trigger status、latest run id 和下一步 drill-down。触发失败要区分业务 finding、命令 submit/control 失败、overview/API 不可达、lock-held 和 timeout;业务 finding 已产生新 run 时不得把 scheduler 本身标为 infra blocker。`monitor-web` 应继续把 stale run 作为非阻塞告警展示,但 run/report 持续不更新或 submit/control 失败必须能在面板和 CLI 中直接看到根因。
`monitor-web` 前端必须使用 Vue 3 + TypeScript + Vite,并与 HWLAB Cloud Web/Sub2API 运维图表的组件化方式对齐:typed API client、format composable、auto refresh composable、chart component、timeline component、run table、detail tabs、finding groups、loading/empty/error 状态和深链路由。图表库不是前置结论;可选 Chart.js、ECharts 或原生 SVG/canvas,但 SPEC/PR 必须说明包体、构建耗时、交互能力和维护成本取舍。
Vue `monitor-web` 的 CI/CD 必须和架构一起交付。YAML 必须声明 source、build context、Node/Bun/Vite 构建环境、env image、dependency cache、registry image、GitOps path、Argo Application、Service、publicExposure、runner discovery selector 和 screenshot 验收命令。CI 读源码必须优先走 node/lane 声明的 git mirror read URL;触发 PipelineRun 前做受控 pre-sync,GitOps promotion 后做受控 post-flush,并在 status/closeout 中输出 `pendingFlush`、`githubInSync`、source commit、GitOps revision 和 PipelineRun 名称。
@@ -666,3 +670,5 @@ P9 多实例巡检与账号切换链路执行 issue 为 [#1017](https://github.c
P10 monitor-web 聚合执行 issue 为 [#1056](https://github.com/pikasTech/unidesk/issues/1056)。P10 closeout 必须回写:SPEC P10 引用、runner/web 职责拆分状态、Vue+TS monitor-web 迁移边界或短修边界、root 与至少一个 route prefix 的 browser render 证据、`web-probe sentinel dashboard verify|screenshot` 证据、publicExposure 和 runtime provenance、单哨兵 API 兼容性、未完成 monitor-web 架构项是否拆出后续 issue,以及 `unidesk-monitor` skill 是否记录当前操作面。
P11 monitor-web 观察面板治理执行 issue 为 [#1112](https://github.com/pikasTech/unidesk/issues/1112)。P11 的第一阶段必须先完成本 SPEC 收敛;SPEC 未合并前不得推进 Vue `monitor-web` 实现、CI/CD、GitOps、publicExposure 或部署代码。P11 实现 PR closeout 必须回写:SPEC P11 引用、Vue monitor-web 源码和 CI/CD 文件头部追溯、趋势曲线和运行时间线截图、固定视口三栏 overflow 摘要、cadence freshness 状态、env reuse/buildServices 证据、git mirror pre-sync/post-flush 证据、PipelineRun/Argo/GitOps/source alignment、root 与至少一个 sentinel detail 远程截图 localPath/SHA,以及超过两分钟 CI/CD 耗时是否已先从 env reuse/git mirror 方向优化。
P12 cadence 调度和 monitor-web 交互修复执行 issue 为 [#1123](https://github.com/pikasTech/unidesk/issues/1123)。P12 closeout 必须回写:SPEC P12 引用、两个 10m cadence sentinel 的 stale 证据、宿主控制面调度器 due 判断和触发记录、auth sentinel Argo/source alignment、趋势曲线 hover 数值和时间截图/DOM 证据、三栏 sticky header 遮盖复测、远程 PNG localPath/SHA、systemd timer 状态、以及两个目标 sentinel 最新 run 已刷新到当前窗口的证据。
@@ -1,3 +1,4 @@
/* SPEC: PJ2026-01060508 Web哨兵 draft-2026-06-27-p12-cadence-scheduler-monitor-web. */
:root {
color-scheme: light;
--bg: #f5f7f8;
@@ -320,7 +321,7 @@ select {
border: 1px solid var(--line);
border-radius: 8px;
background: linear-gradient(180deg, #ffffff 0%, #f7faf9 100%);
overflow: hidden;
overflow: visible;
}
.trend-chart {
@@ -377,6 +378,47 @@ select {
fill: var(--amber);
}
/* Focusable SVG group wrapping one trend point. The default outline is removed;
   keyboard focus is indicated by the :focus-visible rule on .trend-hit-area. */
.trend-dot-hit {
cursor: default;
outline: none;
}
/* Keyboard focus indicator: draw a blue ring on the hit circle of the focused point. */
.trend-dot-hit:focus-visible .trend-hit-area {
stroke: var(--blue);
stroke-width: 1.5;
}
/* Invisible circle that enlarges the hover/focus target around a trend point. */
.trend-hit-area {
fill: transparent;
stroke: transparent;
}
/* Floating tooltip for the hovered/focused trend point.
   left/top are supplied inline by the Vue template; the translate shifts the box so it is
   horizontally centered on that anchor and sits above it. pointer-events: none keeps the
   tooltip itself from receiving pointer events. */
.trend-tooltip {
position: absolute;
z-index: 8;
display: grid;
width: min(218px, calc(100% - 16px));
max-width: calc(100% - 16px);
gap: 3px;
transform: translate(-50%, -100%);
border: 1px solid var(--line-strong);
border-radius: 8px;
background: #ffffff;
box-shadow: 0 12px 28px rgba(32, 51, 48, 0.16);
color: var(--muted);
padding: 9px 10px;
pointer-events: none;
font-size: 12px;
line-height: 1.35;
}
/* Tooltip heading (the run id): kept on one line and truncated with an ellipsis. */
.trend-tooltip strong {
overflow: hidden;
color: var(--text);
text-overflow: ellipsis;
white-space: nowrap;
}
.trend-legend {
display: flex;
flex-wrap: wrap;
@@ -513,6 +555,8 @@ select {
}
.pane {
position: relative;
isolation: isolate;
display: flex;
min-width: 0;
min-height: 0;
@@ -523,16 +567,27 @@ select {
.pane-header {
position: sticky;
top: 0;
z-index: 2;
top: -12px;
z-index: 12;
display: flex;
align-items: start;
justify-content: space-between;
gap: 10px;
margin: -12px -12px 10px;
padding: 12px;
padding: 12px 12px 13px;
border-bottom: 1px solid var(--line);
background: rgba(255, 255, 255, 0.96);
background: var(--panel);
box-shadow: 0 10px 16px rgba(32, 51, 48, 0.08);
}
/* Panel-coloured strip, 18px tall and full header width, placed directly above the header.
   NOTE(review): presumably masks content that would otherwise show through above the
   sticky header while the pane scrolls - confirm against the pane padding. */
.pane-header::before {
position: absolute;
right: 0;
left: 0;
top: -18px;
height: 18px;
background: var(--panel);
content: "";
}
.pane-header h2 {
@@ -1,3 +1,5 @@
// SPEC: PJ2026-01060508 Web哨兵 draft-2026-06-27-p12-cadence-scheduler-monitor-web.
// Responsibility: Vue monitor-web runtime for sentinel trend, timeline, detail and finding observability.
import { createApp, computed, onMounted, ref } from "./vendor/vue.esm-browser.prod.js";
const bootstrap = readBootstrap();
@@ -17,6 +19,7 @@ createApp({
const autoRefresh = ref(true);
const refreshSeconds = ref(30);
const lastLoadedAt = ref("");
const hoveredTrendDot = ref(null);
let lastAutoRefreshAt = 0;
const sentinels = computed(() => {
@@ -43,14 +46,34 @@ createApp({
warning: trendPolyline((run) => warningCount(run)),
total: trendPolyline((run) => findingCount(run)),
}));
const trendDots = computed(() => trendRows.value.map((run, index) => ({
id: run.id || String(index),
x: trendX(index, trendRows.value.length),
redY: trendY(redCount(run)),
warningY: trendY(warningCount(run)),
severity: severityClass(run),
title: `${shortId(run.id)} ${formatDate(run.updatedAt || run.createdAt)}`,
})));
// One descriptor per trend row: SVG coordinates for the red/warning dots plus everything the
// hover tooltip and the aria-label need (counts, status, timestamps, shortened report hash).
const trendDots = computed(() => trendRows.value.map((run, index) => {
const red = redCount(run);
const warning = warningCount(run);
const total = findingCount(run);
const x = trendX(index, trendRows.value.length);
const redY = trendY(red);
const warningY = trendY(warning);
// Prefer the last update time; fall back to creation time, then to an empty string.
const rawTime = run.updatedAt || run.createdAt || "";
return {
// Falls back to the row index so the v-for key is never empty.
id: run.id || String(index),
runId: run.id || "",
x,
redY,
warningY,
// Tooltip anchor as percentages, clamped so the box stays inside the chart area.
// NOTE(review): 720 and 142 look like the SVG viewBox width/height - confirm against the template.
tooltipLeft: `${clamp((x / 720) * 100, 16, 84)}%`,
tooltipTop: `${clamp(((Math.min(redY, warningY) + 18) / 142) * 100, 24, 76)}%`,
red,
warning,
total,
status: run.status || "-",
severity: severityClass(run),
rawTime,
timeLabel: formatDate(rawTime),
absoluteTime: formatAbsoluteDate(rawTime),
// Accepts several spellings of the report hash field; empty string when none is present.
reportSha: shortHash(run.reportJsonSha256 || run.report_json_sha256 || run.reportSha256 || ""),
title: `${shortId(run.id)} ${formatAbsoluteDate(rawTime)} 红色 ${red} 警告 ${warning} 总量 ${total}`,
};
}));
const timelineRuns = computed(() => runs.value.slice(0, 16));
const rootCauseFindings = computed(() => {
const rows = findings.value.filter((item) => item.rootCause || item.nextAction || ["red", "warning"].includes(severityClass(item)));
@@ -142,6 +165,14 @@ createApp({
return Math.round(126 - (Number(value || 0) / trendMax.value) * 102);
}
// Marks the given trend dot as hovered/focused so the tooltip renders for it.
function showTrendTooltip(dot) {
hoveredTrendDot.value = dot;
}
// Clears the hovered dot, which hides the tooltip.
function hideTrendTooltip() {
hoveredTrendDot.value = null;
}
onMounted(() => {
void loadAll();
window.setInterval(() => {
@@ -165,6 +196,7 @@ createApp({
autoRefresh,
refreshSeconds,
lastLoadedAt,
hoveredTrendDot,
sentinels,
currentStatus,
latestRun,
@@ -182,11 +214,14 @@ createApp({
selectRun,
refreshNow,
currentHref,
showTrendTooltip,
hideTrendTooltip,
redCount,
warningCount,
findingCount,
severityClass,
formatDate,
formatAbsoluteDate,
formatDuration,
shortId,
rootCauseText,
@@ -258,7 +293,19 @@ createApp({
<polyline v-if="trendPolylines.total" class="trend-total" :points="trendPolylines.total"></polyline>
<polyline v-if="trendPolylines.warning" class="trend-warning" :points="trendPolylines.warning"></polyline>
<polyline v-if="trendPolylines.red" class="trend-red" :points="trendPolylines.red"></polyline>
<g v-for="dot in trendDots" :key="dot.id">
<g
v-for="dot in trendDots"
:key="dot.id"
class="trend-dot-hit"
tabindex="0"
role="button"
:aria-label="dot.title"
@mouseenter="showTrendTooltip(dot)"
@focusin="showTrendTooltip(dot)"
@mouseleave="hideTrendTooltip"
@focusout="hideTrendTooltip"
>
<circle class="trend-hit-area" :cx="dot.x" :cy="Math.min(dot.redY, dot.warningY)" r="12"></circle>
<circle class="trend-dot-warning" :cx="dot.x" :cy="dot.warningY" r="3">
<title>{{ dot.title }}</title>
</circle>
@@ -267,6 +314,18 @@ createApp({
</circle>
</g>
</svg>
<div
v-if="hoveredTrendDot"
class="trend-tooltip"
data-monitor-trend-tooltip="true"
:style="{ left: hoveredTrendDot.tooltipLeft, top: hoveredTrendDot.tooltipTop }"
>
<strong>{{ shortId(hoveredTrendDot.runId) }}</strong>
<span>{{ hoveredTrendDot.absoluteTime }}</span>
<span>状态 {{ hoveredTrendDot.status }}</span>
<span>红色 {{ hoveredTrendDot.red }} / 警告 {{ hoveredTrendDot.warning }} / 总量 {{ hoveredTrendDot.total }}</span>
<span v-if="hoveredTrendDot.reportSha">report {{ hoveredTrendDot.reportSha }}</span>
</div>
<div v-if="trendRows.length === 0" class="trend-empty">暂无运行数据</div>
</div>
<div class="trend-legend">
@@ -515,6 +574,13 @@ function formatDate(value) {
return date.toISOString().slice(5, 16).replace("T", " ");
}
/**
 * Renders a timestamp as "YYYY-MM-DD HH:MM:SS UTC".
 * Falsy input yields "-"; input that does not parse as a date is echoed back as a string.
 */
function formatAbsoluteDate(value) {
  if (!value) {
    return "-";
  }
  const parsed = new Date(value);
  const parseable = !Number.isNaN(parsed.getTime());
  if (!parseable) {
    return String(value);
  }
  // Keep date + time to the second from the ISO form and swap the "T" separator for a space.
  const stamp = parsed.toISOString().slice(0, 19);
  return stamp.replace("T", " ") + " UTC";
}
function formatDuration(seconds) {
const value = Math.max(0, Number(seconds || 0));
if (value < 90) return `${Math.round(value)}s`;
@@ -528,6 +594,16 @@ function shortId(value) {
return text.length > 18 ? `${text.slice(0, 10)}...${text.slice(-6)}` : text || "-";
}
/** Returns at most the first 12 characters of a hash; falsy input yields an empty string. */
function shortHash(value) {
  const digest = String(value || "");
  return digest.slice(0, 12);
}
/** Limits value to the [min, max] range; when min exceeds max, min wins. */
function clamp(value, min, max) {
  const upperBounded = Math.min(max, value);
  return Math.max(min, upperBounded);
}
/**
 * Picks the best available explanation for a finding: rootCause, then evidenceSummary,
 * then summary. Falls back to a fixed placeholder when none is set (or item is nullish).
 */
function rootCauseText(item) {
  for (const field of ["rootCause", "evidenceSummary", "summary"]) {
    const candidate = item?.[field];
    if (candidate) {
      return candidate;
    }
  }
  return "尚未记录根因,等待下一次 OTel/报告归因。";
}
@@ -3,6 +3,7 @@
// SPEC: PJ2026-01060508 Web哨兵 draft-2026-06-26-p9-multi-web-probe-sentinel.
// SPEC: PJ2026-01060508 Web哨兵 draft-2026-06-26-p10-monitor-web-aggregation.
// SPEC: PJ2026-01060508 Web哨兵 draft-2026-06-27-p11-monitor-web-observability-dashboard.
// SPEC: PJ2026-01060508 Web哨兵 draft-2026-06-27-p12-cadence-scheduler-monitor-web.
// Responsibility: YAML-first CI/CD, image, GitOps and Argo command plan for the web-probe sentinel.
import { createHash, randomUUID } from "node:crypto";
import { existsSync, readFileSync } from "node:fs";
@@ -1999,6 +2000,24 @@ for (let attempt = 1; attempt <= maxNavigationAttempts; attempt += 1) {
await page.waitForTimeout(750 * attempt);
}
// Scroll the detail pane down by up to 96px (capped at its scrollable range).
// NOTE(review): presumably so the sticky header is inspected with content scrolled under it.
// Errors from this evaluate call are deliberately ignored.
await page.evaluate(() => {
const detailPane = document.querySelector(".workspace-grid .pane-detail");
if (detailPane instanceof HTMLElement) detailPane.scrollTop = Math.min(96, Math.max(0, detailPane.scrollHeight - detailPane.clientHeight));
}).catch(() => {});
await page.waitForTimeout(150);
// Locate the centre of the first red trend dot (falling back to the first warning dot) in
// viewport coordinates; null when no dot exists, it has no rendered size, or evaluation fails.
const trendHoverPoint = await page.evaluate(() => {
const target = document.querySelector(".trend-dot-hit .trend-dot-red") || document.querySelector(".trend-dot-hit .trend-dot-warning");
if (!(target instanceof SVGElement)) return null;
const rect = target.getBoundingClientRect();
if (rect.width <= 0 || rect.height <= 0) return null;
return { x: rect.left + rect.width / 2, y: rect.top + rect.height / 2 };
}).catch(() => null);
// Hover that point so the tooltip is on screen for the steps that follow.
if (trendHoverPoint) {
await page.mouse.move(trendHoverPoint.x, trendHoverPoint.y);
await page.waitForTimeout(250);
}
if (captureScreenshot && screenshotPath) {
await page.screenshot({ path: screenshotPath, fullPage, animations: "disabled" }).catch((error) => {
pageErrors.push({ message: "screenshot failed: " + String(error?.message || error).slice(0, 400) });
@@ -2012,9 +2031,12 @@ const dom = await page.evaluate(() => {
const shell = document.querySelector("[data-monitor-shell='true']");
const error = document.querySelector("#monitor-web-error");
const trend = document.querySelector("[data-monitor-trend-curve]");
const trendTooltip = document.querySelector("[data-monitor-trend-tooltip='true']");
const timeline = document.querySelector("[data-monitor-timeline='true']");
const workspace = document.querySelector("[data-monitor-independent-scroll='true']");
const panes = Array.from(document.querySelectorAll(".workspace-grid .pane"));
const detailPane = document.querySelector(".workspace-grid .pane-detail");
const detailHeader = document.querySelector("#monitor-web-root > div > section.workspace-grid > main > div.pane-header");
const doc = document.documentElement;
const body = document.body;
const viewport = { width: window.innerWidth, height: window.innerHeight };
@@ -2063,6 +2085,8 @@ const dom = await page.evaluate(() => {
runRows: document.querySelectorAll(".run-list .run-row").length,
findingItems: document.querySelectorAll(".finding-list .finding-card").length,
trendCurve: Boolean(trend),
trendDotCount: document.querySelectorAll(".trend-dot-hit").length,
trendTooltip: tooltipSummary(trendTooltip),
trendPanelText: text("#trend-heading"),
timelineItems: document.querySelectorAll(".timeline-list .timeline-item").length,
timelineVisible: Boolean(timeline),
@@ -2089,6 +2113,7 @@ const dom = await page.evaluate(() => {
const style = window.getComputedStyle(pane);
return style.overflowY === "auto" || style.overflowY === "scroll";
}),
stickyHeader: stickyHeaderSummary(detailPane, detailHeader),
},
layout: {
viewport,
@@ -2098,6 +2123,46 @@ const dom = await page.evaluate(() => {
overflow,
},
};
/**
 * Summarises the trend tooltip element for the dashboard verify report.
 *
 * @param element Tooltip element, or null/undefined when no tooltip is rendered.
 * @returns visible  - the element exists and has non-blank text;
 *          text     - whitespace-collapsed text, capped at 240 characters;
 *          hasValues - the text carries all three labelled counts the tooltip renders
 *                      (红色 N, 警告 N, 总量 N);
 *          hasTime  - the text contains "UTC" or a YYYY-MM-DD date.
 */
function tooltipSummary(element) {
  const content = String(element?.textContent || "").replace(/\s+/g, " ").trim();
  // Each count is checked under its own label. Testing one unlabelled "space + digits"
  // pattern three times would accept any text with a single number in it (for example
  // the timestamp alone), so a tooltip missing its counts would still pass.
  const hasRed = /红色\s+\d+/u.test(content);
  const hasWarning = /警告\s+\d+/u.test(content);
  const hasTotal = /总量\s+\d+/u.test(content);
  return {
    visible: Boolean(element && content.length > 0),
    text: content.slice(0, 240),
    hasValues: hasRed && hasWarning && hasTotal,
    hasTime: /UTC/u.test(content) || /\d{4}-\d{2}-\d{2}/u.test(content),
  };
}
/**
 * Reports whether the detail pane header is present, is the top-most element at a sample
 * point near its top-left corner, and has an opaque background.
 * Returns a "not present" record when either argument is not an HTMLElement.
 */
function stickyHeaderSummary(pane, header) {
  const bothPresent = pane instanceof HTMLElement && header instanceof HTMLElement;
  if (!bothPresent) {
    return { present: false, coversScroll: false, backgroundOpaque: false, detailScrollTop: null };
  }

  const box = header.getBoundingClientRect();
  const computed = window.getComputedStyle(header);

  // Probe a point slightly inside the header: at most 32px right / 12px down, at least 2px,
  // and never past the middle of the header.
  const offsetX = Math.min(32, Math.max(2, box.width / 2));
  const offsetY = Math.min(12, Math.max(2, box.height / 2));
  const hit = document.elementFromPoint(Math.round(box.left + offsetX), Math.round(box.top + offsetY));
  const headerOnTop = Boolean(hit && header.contains(hit));

  return {
    present: true,
    detailScrollTop: pane.scrollTop,
    headerTop: Math.round(box.top),
    headerBottom: Math.round(box.bottom),
    zIndex: computed.zIndex,
    backgroundColor: computed.backgroundColor,
    coversScroll: headerOnTop,
    backgroundOpaque: backgroundIsOpaque(computed.backgroundColor),
    topElementClass: String(hit?.className || "").slice(0, 80),
  };
}
/**
 * Decides whether a computed background-color string is effectively opaque.
 * rgb()/rgba() with fewer than four comma-separated parts counts as opaque; with an alpha
 * part, alpha must be at least 0.99. Any other non-empty value except "transparent" counts
 * as opaque.
 */
function backgroundIsOpaque(value) {
  const match = /rgba?\(([^)]+)\)/u.exec(value);
  if (match === null) {
    return value.length > 0 && value !== "transparent";
  }
  const channels = match[1].split(",").map((channel) => channel.trim());
  const hasAlpha = channels.length >= 4;
  return !hasAlpha || Number(channels[3]) >= 0.99;
}
});
const consoleErrors = consoleMessages.filter((item) => item.type === "error");
@@ -2109,8 +2174,12 @@ const ok = !navigationError
&& dom.ready === true
&& dom.errorVisible !== true
&& dom.trendCurve === true
&& (dom.trendDotCount === 0 || (dom.trendTooltip?.visible === true && dom.trendTooltip?.hasValues === true && dom.trendTooltip?.hasTime === true))
&& dom.timelineVisible === true
&& dom.scrollModel?.independentScroll === true
&& dom.scrollModel?.stickyHeader?.present === true
&& dom.scrollModel?.stickyHeader?.coversScroll === true
&& dom.scrollModel?.stickyHeader?.backgroundOpaque === true
&& dom.layout?.horizontalOverflow !== true
&& pageErrors.length === 0;
+554
View File
@@ -0,0 +1,554 @@
#!/usr/bin/env bun
// SPEC: PJ2026-01060508 Web哨兵 draft-2026-06-27-p12-cadence-scheduler-monitor-web.
// Responsibility: Host-side cadence scheduler for YAML-first web-probe sentinels; it triggers the existing validate quick-verify path when runs become stale.
import { existsSync, mkdirSync, openSync, closeSync, statSync, unlinkSync, writeFileSync } from "node:fs";
import { join } from "node:path";
import { repoRoot, rootPath } from "./src/config";
import { runCommand, runCommandObserved, type CommandResult } from "./src/command";
import { hwlabDefaultRuntimeTarget, hwlabRuntimeLaneSpecForNode, isHwlabRuntimeLane } from "./src/hwlab-node-lanes";
import { readConfigRefTarget, resolveWebProbeSentinel, webProbeSentinelRegistryRows } from "./src/hwlab-node-web-sentinel-resolver";
/** Sub-command taken from the first positional argument; "run" is used when none is given. */
type SchedulerAction = "run" | "install-systemd" | "status-systemd";
/** Parsed command-line options shared by every scheduler action. */
interface SchedulerOptions {
/** Sub-command to execute. */
readonly action: SchedulerAction;
/** Target node, forwarded to the validate command as --node. */
readonly node: string;
/** Target lane, forwarded to the validate command as --lane. */
readonly lane: string;
/** Restrict the tick to one sentinel id; null means every registry row. */
readonly sentinelId: string | null;
/** Evaluate due state without triggering; for install-systemd, print the units instead of writing them. */
readonly dryRun: boolean;
/** Treat every enabled sentinel as due regardless of its latest run age. */
readonly force: boolean;
/** Required for install-systemd to actually write unit files and call systemctl. */
readonly confirm: boolean;
/** Factor applied to the cadence to obtain the due threshold. */
readonly staleMultiplier: number;
/** Overrides the YAML validation timeout when set; null defers to YAML, then to 300 seconds. */
readonly timeoutSeconds: number | null;
/** Abort timeout, in milliseconds, for each overview fetch. */
readonly fetchTimeoutMs: number;
}
/** Scheduling facts resolved for one sentinel from its configuration references. */
interface SentinelSchedule {
readonly sentinelId: string;
/** True only when the registry row and the sentinel are enabled and at least one scenario is enabled. */
readonly enabled: boolean;
/** Public base URL with trailing slashes removed; the overview API is read from it. */
readonly publicBaseUrl: string;
/** Smallest cadence among enabled scenarios, or the runtime scheduler interval when none declares one. */
readonly cadenceSeconds: number;
/** Timeout passed to the validate command as --timeout-seconds. */
readonly timeoutSeconds: number;
/** Ids of the enabled scenarios. */
readonly scenarioIds: readonly string[];
}
/** Result of one read of a sentinel's overview API. */
interface OverviewSnapshot {
/** False when the request failed, timed out or returned a non-2xx status; every other field except error is then null. */
readonly ok: boolean;
readonly latestRunId: string | null;
/** updatedAt of the latest run, falling back to its createdAt. */
readonly latestRunAt: string | null;
/** Age reported by the API freshness block, otherwise derived from latestRunAt. */
readonly latestRunAgeSeconds: number | null;
readonly schedulerHeartbeatAt: string | null;
readonly schedulerHeartbeatAgeSeconds: number | null;
/** Failure message when ok is false; null otherwise. */
readonly error: string | null;
}
/** Outcome of one attempt to trigger a sentinel validation. */
interface TriggerResult {
/** False when the attempt was skipped because the lock could not be acquired. */
readonly attempted: boolean;
readonly exitCode: number | null;
readonly timedOut: boolean;
readonly durationMs: number | null;
/** True when the overview read after the command shows a run that differs from the one seen before it. */
readonly recorded: boolean;
readonly latestRunIdBefore: string | null;
readonly latestRunIdAfter: string | null;
/** One of: timeout, recorded, recorded-with-findings, completed-no-new-run, infra-failed, or lock-held:<reason>. */
readonly status: string;
/** Trailing part of the command's stdout. */
readonly stdoutTail: string;
/** Trailing part of the command's stderr. */
readonly stderrTail: string;
}
// Cadence multiplier used when --stale-multiplier is not given.
const DEFAULT_STALE_MULTIPLIER = 1;
// Overview fetch abort timeout used when --fetch-timeout-ms is not given.
const DEFAULT_FETCH_TIMEOUT_MS = 15_000;
// Period written into the generated systemd timer (OnBootSec / OnUnitActiveSec).
const HOST_SCHEDULER_INTERVAL_SECONDS = 120;
// Directory that holds the lock files and the daily JSONL event log.
const STATE_DIR = rootPath(".state", "web-probe-sentinel-scheduler");
// Entry point: any error escaping main() is printed (stack when available) and the process exits with code 1.
await main().catch((error) => {
const message = error instanceof Error ? error.stack || error.message : String(error);
console.error(message);
process.exit(1);
});
/** Parses the command line and dispatches to the selected scheduler action. */
async function main(): Promise<void> {
  const options = parseArgs(process.argv.slice(2));
  switch (options.action) {
    case "install-systemd":
      installSystemd(options);
      break;
    case "status-systemd":
      statusSystemd(options);
      break;
    default:
      await runScheduler(options);
      break;
  }
}
/**
 * Runs one scheduler tick: for every selected sentinel, reads the overview, decides whether
 * a run is due, triggers validation under a per-sentinel lock, then prints a summary table
 * and appends one JSONL event per evaluated sentinel.
 *
 * Sets process.exitCode to 2 when any trigger ended as "infra-failed" or "timeout".
 * Disabled sentinels appear in the table but produce no event.
 */
async function runScheduler(options: SchedulerOptions): Promise<void> {
  const schedules = sentinelSchedules(specFor(options), options);
  const rows: Record<string, unknown>[] = [];
  let infraFailure = false;

  for (const schedule of schedules) {
    if (!schedule.enabled) {
      rows.push(rowFor(schedule, null, false, "disabled", null));
      continue;
    }

    const before = await readOverview(schedule, options.fetchTimeoutMs);
    const runAge = before.latestRunAgeSeconds;
    const dueThresholdSeconds = Math.max(1, Math.round(schedule.cadenceSeconds * options.staleMultiplier));
    // An unknown age (no run yet, or overview unreachable) counts as due.
    const due = options.force || runAge === null || runAge >= dueThresholdSeconds;

    let trigger: TriggerResult | null = null;
    if (due && !options.dryRun) {
      const lock = acquireLock(options, schedule.sentinelId, schedule.timeoutSeconds);
      if (!lock.acquired) {
        // Another tick holds this sentinel; report it without running anything.
        trigger = {
          attempted: false,
          exitCode: null,
          timedOut: false,
          durationMs: null,
          recorded: false,
          latestRunIdBefore: before.latestRunId,
          latestRunIdAfter: before.latestRunId,
          status: `lock-held:${lock.reason}`,
          stdoutTail: "",
          stderrTail: "",
        };
      } else {
        try {
          trigger = await triggerSentinel(options, schedule, before);
          if (trigger.status === "infra-failed" || trigger.status === "timeout") {
            infraFailure = true;
          }
        } finally {
          releaseLock(lock.path);
        }
      }
    }

    let status: string;
    if (!due) {
      status = "fresh";
    } else if (options.dryRun) {
      status = "due-dry-run";
    } else {
      status = trigger?.status ?? "due";
    }

    const row = rowFor(schedule, before, due, status, trigger);
    rows.push(row);
    appendEvent({ at: new Date().toISOString(), node: options.node, lane: options.lane, ...row, valuesRedacted: true });
  }

  printRows(rows);
  if (infraFailure) {
    process.exitCode = 2;
  }
}
/**
 * Resolves the runtime lane spec for the requested node and lane.
 * @throws Error when the lane is not a known runtime lane.
 */
function specFor(options: SchedulerOptions) {
  const { lane, node } = options;
  if (isHwlabRuntimeLane(lane)) {
    return hwlabRuntimeLaneSpecForNode(lane, node);
  }
  throw new Error(`unknown lane ${options.lane}`);
}
/**
 * Builds the schedule entry for every selected sentinel in the registry.
 *
 * Cadence is the smallest positive cadence declared by an enabled scenario; when none
 * declares one, the runtime scheduler.intervalMs (converted to whole seconds, minimum 1) is used.
 * Timeout is the CLI override, else targetValidation.maxSeconds from the cicd config, else 300.
 *
 * @throws Error when the selection matches no registry row.
 */
function sentinelSchedules(spec: ReturnType<typeof hwlabRuntimeLaneSpecForNode>, options: SchedulerOptions): SentinelSchedule[] {
  const registry = webProbeSentinelRegistryRows(spec);
  const wanted = options.sentinelId;
  const selectedRows = wanted === null ? registry : registry.filter((row) => row.id === wanted);
  if (selectedRows.length === 0) {
    const ids = registry.map((row) => row.id).join(", ");
    throw new Error(`unknown sentinel ${options.sentinelId ?? "-"}; available: ${ids}`);
  }

  const schedules: SentinelSchedule[] = [];
  for (const row of selectedRows) {
    const sentinel = resolveWebProbeSentinel(spec, row.id);
    const refs = sentinel.configRefs;
    const publicExposure = record(readConfigRefTarget(refs.publicExposure), refs.publicExposure);
    const runtime = record(readConfigRefTarget(refs.runtime), refs.runtime);
    const cicd = record(readConfigRefTarget(refs.cicd), refs.cicd);
    const scenarios = scenarioRows(readConfigRefTarget(refs.scenarios));
    const enabledScenarios = scenarios.filter((scenario) => scenario.enabled !== false);

    // Collect every positive, parseable cadence declared as a string.
    const scenarioCadences: number[] = [];
    for (const scenario of enabledScenarios) {
      if (typeof scenario.cadence !== "string") continue;
      const seconds = parseDurationSeconds(scenario.cadence);
      if (seconds !== null && seconds > 0) scenarioCadences.push(seconds);
    }

    const runtimeInterval = numberAt(runtime, "scheduler.intervalMs");
    const yamlTimeout = numberAtNullable(cicd, "targetValidation.maxSeconds");
    const cadenceCandidates = scenarioCadences.length > 0
      ? scenarioCadences
      : [Math.max(1, Math.round(runtimeInterval / 1000))];

    schedules.push({
      sentinelId: sentinel.id,
      enabled: row.enabled && sentinel.enabled && enabledScenarios.length > 0,
      publicBaseUrl: stringAt(publicExposure, "publicBaseUrl").replace(/\/+$/u, ""),
      cadenceSeconds: Math.min(...cadenceCandidates),
      timeoutSeconds: options.timeoutSeconds ?? yamlTimeout ?? 300,
      scenarioIds: enabledScenarios.map((scenario) => String(scenario.id || sentinel.id)),
    });
  }
  return schedules;
}
/**
 * Runs the existing `web-probe sentinel validate --quick-verify --confirm --wait` command for
 * one sentinel and classifies the outcome by comparing the overview before and after.
 *
 * @param options  CLI options; node, lane and fetchTimeoutMs are used.
 * @param schedule Sentinel to trigger; its timeoutSeconds is forwarded to the command and,
 *                 plus 90 seconds of slack (minimum 60 seconds total), bounds the process.
 * @param before   Overview snapshot taken before the trigger.
 * @returns Trigger outcome with status
 *          "timeout"                - the command exceeded its time limit;
 *          "recorded"               - a new run is visible and the command exited 0;
 *          "recorded-with-findings" - a new run is visible and the command exited non-zero;
 *          "completed-no-new-run"   - exit 0 but no new run is visible;
 *          "infra-failed"           - non-zero exit and no new run is visible.
 */
async function triggerSentinel(options: SchedulerOptions, schedule: SentinelSchedule, before: OverviewSnapshot): Promise<TriggerResult> {
  const command = [
    "bun",
    "scripts/cli.ts",
    "web-probe",
    "sentinel",
    "validate",
    "--node",
    options.node,
    "--lane",
    options.lane,
    "--sentinel",
    schedule.sentinelId,
    "--quick-verify",
    "--confirm",
    "--wait",
    "--timeout-seconds",
    String(schedule.timeoutSeconds),
  ];
  const result = await runCommandObserved(command, repoRoot, {
    timeoutMs: Math.max(60, schedule.timeoutSeconds + 90) * 1000,
    heartbeatMs: 30_000,
    maxCaptureChars: 24_000,
    env: { ...process.env, NO_COLOR: "1" },
  });
  const after = await readOverview(schedule, options.fetchTimeoutMs);

  // A run only counts as recorded when the post-trigger overview actually shows one.
  // Treating "no run id before" as sufficient would report a failed command against an
  // empty sentinel as "recorded-with-findings" and hide the infra failure.
  // NOTE(review): when the pre-trigger overview was unreachable there is no baseline, so any
  // run visible afterwards still counts as new - confirm whether that case needs a stricter check.
  const runIdChanged = after.latestRunId !== null && after.latestRunId !== before.latestRunId;
  const runTimeChanged = after.latestRunAt !== null && after.latestRunAt !== before.latestRunAt;
  const recorded = after.ok && (runIdChanged || runTimeChanged);

  let status: string;
  if (result.timedOut) {
    status = "timeout";
  } else if (recorded) {
    status = result.exitCode === 0 ? "recorded" : "recorded-with-findings";
  } else {
    status = result.exitCode === 0 ? "completed-no-new-run" : "infra-failed";
  }

  return {
    attempted: true,
    exitCode: result.exitCode,
    timedOut: result.timedOut,
    durationMs: result.durationMs ?? null,
    recorded,
    latestRunIdBefore: before.latestRunId,
    latestRunIdAfter: after.latestRunId,
    status,
    stdoutTail: tail(result.stdout, 900),
    stderrTail: tail(result.stderr, 900),
  };
}
/**
 * Fetches `<publicBaseUrl>/api/overview` and extracts the latest-run and heartbeat facts.
 * Never rejects: any failure (abort after timeoutMs, non-2xx status, unexpected payload) is
 * returned as a snapshot with ok=false, null data fields and the error message.
 */
async function readOverview(schedule: SentinelSchedule, timeoutMs: number): Promise<OverviewSnapshot> {
  const abort = new AbortController();
  const abortTimer = setTimeout(() => abort.abort(), timeoutMs);
  try {
    const response = await fetch(`${schedule.publicBaseUrl}/api/overview`, { cache: "no-store", signal: abort.signal });
    if (!response.ok) {
      throw new Error(`HTTP ${response.status}`);
    }
    const overview = record(await response.json(), `${schedule.publicBaseUrl}/api/overview`);
    const latestRun = isRecord(overview.latestRun) ? overview.latestRun : {};
    const freshness = isRecord(overview.freshness) ? overview.freshness : {};

    const latestRunId = stringAtNullable(latestRun, "id");
    const latestRunAt = stringAtNullable(latestRun, "updatedAt") ?? stringAtNullable(latestRun, "createdAt");
    // Prefer the age reported by the API; otherwise derive it from the run timestamp.
    const latestRunAgeSeconds = numberAtNullable(freshness, "latestRunAgeSeconds") ?? ageSeconds(latestRunAt);
    const schedulerHeartbeatAt = stringAtNullable(overview, "scheduler.heartbeatAt") ?? stringAtNullable(freshness, "schedulerHeartbeatAt");
    const schedulerHeartbeatAgeSeconds = numberAtNullable(freshness, "schedulerHeartbeatAgeSeconds");

    return {
      ok: true,
      latestRunId,
      latestRunAt,
      latestRunAgeSeconds,
      schedulerHeartbeatAt,
      schedulerHeartbeatAgeSeconds,
      error: null,
    };
  } catch (error) {
    const message = error instanceof Error ? error.message : String(error);
    return {
      ok: false,
      latestRunId: null,
      latestRunAt: null,
      latestRunAgeSeconds: null,
      schedulerHeartbeatAt: null,
      schedulerHeartbeatAgeSeconds: null,
      error: message,
    };
  } finally {
    clearTimeout(abortTimer);
  }
}
/**
 * Generates the systemd service and timer units for the host scheduler and, when confirmed,
 * writes them under /etc/systemd/system, reloads systemd and enables the timer.
 *
 * Without --confirm, or with --dry-run, nothing is written: the unit paths and contents are
 * printed as JSON instead. Sets process.exitCode to 2 when any systemctl command fails.
 *
 * Only --node, --lane and --stale-multiplier are forwarded into ExecStart; --sentinel,
 * --timeout-seconds and --fetch-timeout-ms given at install time are not.
 * NOTE(review): node and lane are interpolated into the unit text unescaped - confirm they
 * are validated upstream before they can contain whitespace or systemd specifiers.
 * NOTE(review): per systemd.timer documentation Persistent= only affects OnCalendar= timers,
 * so it is likely a no-op with the monotonic triggers used here - confirm.
 */
function installSystemd(options: SchedulerOptions): void {
const unit = systemdUnitName(options);
const servicePath = `/etc/systemd/system/${unit}.service`;
const timerPath = `/etc/systemd/system/${unit}.timer`;
// One-shot service that runs a single scheduler tick from the repository root.
const service = `[Unit]
Description=UniDesk web-probe sentinel host cadence scheduler for ${options.node}/${options.lane}
Wants=network-online.target
After=network-online.target
[Service]
Type=oneshot
WorkingDirectory=${repoRoot}
ExecStart=/usr/bin/env bun ${join(repoRoot, "scripts", "web-probe-sentinel-scheduler.ts")} run --node ${options.node} --lane ${options.lane} --stale-multiplier ${options.staleMultiplier}
`;
// Timer that fires the service after boot and then every HOST_SCHEDULER_INTERVAL_SECONDS.
const timer = `[Unit]
Description=Run UniDesk web-probe sentinel host cadence scheduler for ${options.node}/${options.lane}
[Timer]
OnBootSec=${HOST_SCHEDULER_INTERVAL_SECONDS}s
OnUnitActiveSec=${HOST_SCHEDULER_INTERVAL_SECONDS}s
AccuracySec=15s
Persistent=true
Unit=${unit}.service
[Install]
WantedBy=timers.target
`;
// Preview mode: show what would be installed and stop.
if (!options.confirm || options.dryRun) {
console.log(JSON.stringify({ ok: true, mode: "dry-run", servicePath, timerPath, service, timer, valuesRedacted: true }, null, 2));
return;
}
writeFileSync(servicePath, service, "utf8");
writeFileSync(timerPath, timer, "utf8");
const commands = [
["systemctl", "daemon-reload"],
["systemctl", "enable", "--now", `${unit}.timer`],
];
// Both commands always run; a failed daemon-reload does not stop the enable attempt.
const results = commands.map((command) => runCommand(command, "/"));
printSystemdResult(unit, servicePath, timerPath, results);
if (results.some((result) => result.exitCode !== 0)) process.exitCode = 2;
}
/**
 * Prints whether the scheduler timer is enabled and active, together with its next and last
 * trigger times. Sets process.exitCode to 2 when the timer is not active.
 */
function statusSystemd(options: SchedulerOptions): void {
  const unit = systemdUnitName(options);
  const timerUnit = `${unit}.timer`;

  const enabledCheck = runCommand(["systemctl", "is-enabled", timerUnit], "/");
  const activeCheck = runCommand(["systemctl", "is-active", timerUnit], "/");
  const timing = runCommand(["systemctl", "show", timerUnit, "--property=NextElapseUSecRealtime", "--property=LastTriggerUSec"], "/");

  printSystemdResult(unit, `/etc/systemd/system/${unit}.service`, `/etc/systemd/system/${unit}.timer`, [enabledCheck, activeCheck, timing]);
  // Only the is-active result decides the exit code.
  if (activeCheck?.exitCode !== 0) {
    process.exitCode = 2;
  }
}
/** Prints a JSON summary of systemctl command results; ok is true only when every command exited 0. */
function printSystemdResult(unit: string, servicePath: string, timerPath: string, results: readonly CommandResult[]): void {
  const allSucceeded = results.every(({ exitCode }) => exitCode === 0);
  const summary = {
    ok: allSucceeded,
    unit,
    servicePath,
    timerPath,
    results: results.map(compactCommand),
    valuesRedacted: true,
  };
  console.log(JSON.stringify(summary, null, 2));
}
/**
 * Builds the report row for one sentinel, used for both the console table and the JSONL event.
 * Missing ages are rendered as "-"; the run id prefers the value seen after a trigger.
 */
function rowFor(schedule: SentinelSchedule, overview: OverviewSnapshot | null, due: boolean, status: string, trigger: TriggerResult | null): Record<string, unknown> {
  const ageLabel = (seconds: number | null | undefined): string =>
    seconds === null || seconds === undefined ? "-" : formatSeconds(seconds);

  const cadence = formatSeconds(schedule.cadenceSeconds);
  const latestAge = ageLabel(overview?.latestRunAgeSeconds);
  const heartbeatAge = ageLabel(overview?.schedulerHeartbeatAgeSeconds);

  // The command output tails are intentionally left out of the row.
  const triggerSummary = trigger === null
    ? null
    : {
      attempted: trigger.attempted,
      exitCode: trigger.exitCode,
      timedOut: trigger.timedOut,
      durationMs: trigger.durationMs,
      recorded: trigger.recorded,
      latestRunIdBefore: trigger.latestRunIdBefore,
      latestRunIdAfter: trigger.latestRunIdAfter,
    };

  return {
    sentinelId: schedule.sentinelId,
    enabled: schedule.enabled,
    cadence,
    latestAge,
    heartbeatAge,
    due,
    status,
    latestRunId: trigger?.latestRunIdAfter ?? overview?.latestRunId ?? null,
    scenarios: schedule.scenarioIds.join(","),
    overviewOk: overview?.ok ?? null,
    overviewError: overview?.error ?? null,
    trigger: triggerSummary,
    valuesRedacted: true,
  };
}
/**
 * Prints the tick summary as a space-separated table whose columns are padded to the widest
 * cell (header included). Missing values print as an empty string, a missing run id as "-".
 */
function printRows(rows: readonly Record<string, unknown>[]): void {
  const headers = ["SENTINEL", "CADENCE", "LATEST_AGE", "DUE", "STATUS", "LATEST_RUN"];
  const table = rows.map((row) => [
    row.sentinelId ?? "",
    row.cadence ?? "",
    row.latestAge ?? "",
    row.due ?? "",
    row.status ?? "",
    row.latestRunId ?? "-",
  ].map((cell) => String(cell)));

  // Start from the header widths and grow each column to fit its longest cell.
  const widths = headers.map((header) => header.length);
  for (const cells of table) {
    cells.forEach((cell, column) => {
      widths[column] = Math.max(widths[column], cell.length);
    });
  }

  const render = (cells: readonly string[]): string =>
    cells.map((cell, column) => cell.padEnd(widths[column])).join(" ");

  console.log(render(headers));
  for (const cells of table) {
    console.log(render(cells));
  }
}
/**
 * Takes the per-sentinel lock by exclusively creating a lock file under STATE_DIR/locks.
 *
 * A lock file older than max(1 hour, timeoutSeconds + 300s) is treated as abandoned and
 * removed before the attempt.
 *
 * @param options        Node and lane are used to name the lock file.
 * @param sentinelId     Sentinel the lock protects.
 * @param timeoutSeconds Validation timeout, used to size the stale-lock age limit.
 * @returns `{ acquired: true, path }` on success, otherwise `{ acquired: false, path, reason }`
 *          where reason is the message of the error that prevented acquisition.
 */
function acquireLock(options: SchedulerOptions, sentinelId: string, timeoutSeconds: number): { acquired: true; path: string } | { acquired: false; path: string; reason: string } {
  const lockDir = join(STATE_DIR, "locks");
  mkdirSync(lockDir, { recursive: true });
  const lockPath = join(lockDir, `${safeSegment(options.node)}-${safeSegment(options.lane)}-${safeSegment(sentinelId)}.lock`);
  const maxLockAgeMs = Math.max(3_600_000, (timeoutSeconds + 300) * 1000);

  if (existsSync(lockPath)) {
    try {
      const ageMs = Date.now() - statSync(lockPath).mtimeMs;
      if (ageMs > maxLockAgeMs) unlinkSync(lockPath);
    } catch (error) {
      // The holder may release the lock between the existence check and stat/unlink;
      // a vanished file is not an error. Anything else is still surfaced.
      if ((error as { code?: unknown } | null)?.code !== "ENOENT") throw error;
    }
  }

  let fd: number;
  try {
    // "wx" fails when the file already exists, which is what makes the lock exclusive.
    fd = openSync(lockPath, "wx");
  } catch (error) {
    const reason = error instanceof Error ? error.message : String(error);
    return { acquired: false, path: lockPath, reason };
  }

  let failure: unknown;
  let failed = false;
  try {
    writeFileSync(fd, JSON.stringify({ pid: process.pid, at: new Date().toISOString(), sentinelId, valuesRedacted: true }));
  } catch (error) {
    failed = true;
    failure = error;
  }
  // Always close the descriptor, including when the write failed.
  try {
    closeSync(fd);
  } catch (error) {
    if (!failed) {
      failed = true;
      failure = error;
    }
  }
  if (failed) {
    // Do not leave a lock file behind that nobody owns; it would block this sentinel
    // until the stale-lock age limit expires.
    releaseLock(lockPath);
    const reason = failure instanceof Error ? failure.message : String(failure);
    return { acquired: false, path: lockPath, reason };
  }
  return { acquired: true, path: lockPath };
}
/** Removes the lock file. Failures are ignored on purpose. */
function releaseLock(lockPath: string): void {
  try {
    unlinkSync(lockPath);
  } catch (_ignored) {
    // Cleanup is best-effort: a lock left behind is aged out by a later tick.
  }
}
/**
 * Appends one event as a JSON line to the current day's log file.
 *
 * The file is `STATE_DIR/run-YYYYMMDD.jsonl`, where the date is taken from
 * `Date#toISOString()` (i.e. UTC). `STATE_DIR` is created if missing.
 *
 * @param event Event payload; serialized with `JSON.stringify`.
 */
function appendEvent(event: Record<string, unknown>): void {
  mkdirSync(STATE_DIR, { recursive: true });
  // First ten characters of the ISO timestamp are "YYYY-MM-DD"; drop the dashes.
  const dayStamp = new Date().toISOString().slice(0, 10).split("-").join("");
  const logFile = join(STATE_DIR, `run-${dayStamp}.jsonl`);
  const line = `${JSON.stringify(event)}\n`;
  writeFileSync(logFile, line, { flag: "a" });
}
/**
 * Parses scheduler command-line arguments into a `SchedulerOptions` value.
 *
 * An optional leading action word (`run`, `install-systemd`, `status-systemd`)
 * selects the action; it defaults to `run`. Node and lane default to the values
 * returned by `hwlabDefaultRuntimeTarget()`. `-h`/`--help` prints usage and
 * exits the process with code 0.
 *
 * @param argv Arguments after the script name.
 * @returns The parsed options.
 * @throws Error on an unknown option, a flag missing its value, or an invalid numeric value.
 */
function parseArgs(argv: readonly string[]): SchedulerOptions {
  const defaults = hwlabDefaultRuntimeTarget();
  let action: SchedulerAction = "run";
  let node = defaults.node;
  let lane = defaults.lane;
  let sentinelId: string | null = null;
  let dryRun = false;
  let force = false;
  let confirm = false;
  let staleMultiplier = DEFAULT_STALE_MULTIPLIER;
  let timeoutSeconds: number | null = null;
  let fetchTimeoutMs = DEFAULT_FETCH_TIMEOUT_MS;

  const pending = [...argv];
  const head = pending[0];
  if (head === "run" || head === "install-systemd" || head === "status-systemd") {
    pending.shift();
    action = head as SchedulerAction;
  }

  for (let flag = pending.shift(); flag !== undefined; flag = pending.shift()) {
    if (flag === "-h" || flag === "--help") {
      printUsage();
      process.exit(0);
    } else {
      switch (flag) {
        case "--node":
          node = requireValue(flag, pending);
          break;
        case "--lane":
          lane = requireValue(flag, pending);
          break;
        case "--sentinel":
          sentinelId = requireValue(flag, pending);
          break;
        case "--dry-run":
          dryRun = true;
          break;
        case "--force":
          force = true;
          break;
        case "--confirm":
          confirm = true;
          break;
        case "--stale-multiplier":
          staleMultiplier = positiveNumber(requireValue(flag, pending), flag);
          break;
        case "--timeout-seconds":
          timeoutSeconds = positiveInteger(requireValue(flag, pending), flag);
          break;
        case "--fetch-timeout-ms":
          fetchTimeoutMs = positiveInteger(requireValue(flag, pending), flag);
          break;
        default:
          throw new Error(`unknown option ${flag}`);
      }
    }
  }

  return { action, node, lane, sentinelId, dryRun, force, confirm, staleMultiplier, timeoutSeconds, fetchTimeoutMs };
}
/**
 * Prints the command-line usage summary to stdout.
 */
function printUsage(): void {
  const usageLines = [
    "Usage:",
    "bun scripts/web-probe-sentinel-scheduler.ts run [--node D601] [--lane v03] [--sentinel ID] [--dry-run] [--force]",
    "bun scripts/web-probe-sentinel-scheduler.ts install-systemd --node D601 --lane v03 --confirm",
    "bun scripts/web-probe-sentinel-scheduler.ts status-systemd --node D601 --lane v03",
    // Trailing empty entry keeps the final newline of the usage text.
    "",
  ];
  console.log(usageLines.join("\n"));
}
/**
 * Consumes and returns the value that follows a flag.
 *
 * @param flag Flag name, used only in the error message.
 * @param args Remaining arguments; the first element is removed.
 * @returns The removed argument.
 * @throws Error when there is no next argument or it is an empty string.
 */
function requireValue(flag: string, args: string[]): string {
  const next = args.shift();
  if (next !== undefined && next.length > 0) return next;
  throw new Error(`${flag} requires a value`);
}
/**
 * Converts a flag value to a strictly positive integer.
 *
 * @param value Raw text, converted with `Number(...)`.
 * @param flag Flag name, used only in the error message.
 * @returns The parsed integer.
 * @throws Error when the text is not an integer or is not greater than zero.
 */
function positiveInteger(value: string, flag: string): number {
  const candidate = Number(value);
  if (Number.isInteger(candidate) && candidate > 0) return candidate;
  throw new Error(`${flag} must be a positive integer`);
}
/**
 * Converts a flag value to a strictly positive finite number.
 *
 * @param value Raw text, converted with `Number(...)`.
 * @param flag Flag name, used only in the error message.
 * @returns The parsed number.
 * @throws Error when the text is not a finite number or is not greater than zero.
 */
function positiveNumber(value: string, flag: string): number {
  const candidate = Number(value);
  if (Number.isFinite(candidate) && candidate > 0) return candidate;
  throw new Error(`${flag} must be a positive number`);
}
/**
 * Normalizes a loaded scenario config into a list of scenario records.
 *
 * Accepted shapes, checked in this order:
 * - an array: every element must be an object;
 * - an object with an array `scenarios`: every element must be an object;
 * - an object with an object `workflow`: that workflow is the single row;
 * - any other object: the object itself is the single row.
 *
 * @param value Parsed config value.
 * @returns The scenario records.
 * @throws Error when `value` is neither an object nor an array, or when a list
 *   element is not an object.
 */
function scenarioRows(value: unknown): Record<string, unknown>[] {
  const asScenarios = (items: readonly unknown[]): Record<string, unknown>[] =>
    items.map((item) => record(item, "scenario"));

  if (Array.isArray(value)) return asScenarios(value);
  if (!isRecord(value)) throw new Error("scenario configRef must point to a YAML object or array");

  const { scenarios, workflow } = value;
  if (Array.isArray(scenarios)) return asScenarios(scenarios);
  return [isRecord(workflow) ? workflow : value];
}
/**
 * Parses a duration such as "500ms", "30s", "5m" or "2h" into seconds.
 *
 * Surrounding whitespace is ignored. Only a whole number directly followed by
 * one of the units `ms`, `s`, `m`, `h` is accepted. Millisecond values are
 * rounded up to whole seconds, with a minimum of one second.
 *
 * @param value Duration text.
 * @returns The duration in seconds, or `null` when the text does not match.
 */
function parseDurationSeconds(value: string): number | null {
  const parts = /^(\d+)(ms|s|m|h)$/u.exec(value.trim());
  if (parts === null) return null;
  const quantity = Number(parts[1]);
  switch (parts[2]) {
    case "ms":
      return Math.max(1, Math.ceil(quantity / 1000));
    case "s":
      return quantity;
    case "m":
      return quantity * 60;
    case "h":
      return quantity * 3600;
    default:
      return null;
  }
}
/**
 * Formats a duration in seconds as a short rounded label.
 *
 * Below 90 s the label is in seconds, below 2 h in minutes, below 48 h in
 * hours, and otherwise in days.
 *
 * @param seconds Duration in seconds.
 * @returns A label such as "45s", "12m", "5h" or "3d".
 */
function formatSeconds(seconds: number): string {
  const scales: readonly { below: number; divisor: number; suffix: string }[] = [
    { below: 90, divisor: 1, suffix: "s" },
    { below: 7200, divisor: 60, suffix: "m" },
    { below: 172800, divisor: 3600, suffix: "h" },
  ];
  for (const { below, divisor, suffix } of scales) {
    if (seconds < below) return `${Math.round(seconds / divisor)}${suffix}`;
  }
  return `${Math.round(seconds / 86400)}d`;
}
/**
 * Computes how many whole seconds ago a timestamp was, relative to now.
 *
 * @param value Timestamp text accepted by `Date.parse`, or `null`.
 * @returns The rounded age in seconds (never negative; timestamps in the
 *   future yield 0), or `null` when `value` is `null` or cannot be parsed.
 */
function ageSeconds(value: string | null): number | null {
  if (value === null) return null;
  const timestampMs = Date.parse(value);
  return Number.isFinite(timestampMs)
    ? Math.max(0, Math.round((Date.now() - timestampMs) / 1000))
    : null;
}
/**
 * Reads a required non-empty string at a dotted path.
 *
 * @param value Root value to look into.
 * @param path Dot-separated property path.
 * @returns The string found at `path`.
 * @throws Error when the value at `path` is not a string or is empty.
 */
function stringAt(value: unknown, path: string): string {
  const candidate = valueAtPath(value, path);
  if (typeof candidate === "string" && candidate.length > 0) return candidate;
  throw new Error(`${path} must be a non-empty string`);
}
/**
 * Reads an optional non-empty string at a dotted path.
 *
 * @param value Root value to look into.
 * @param path Dot-separated property path.
 * @returns The string found at `path`, or `null` when it is missing, not a
 *   string, or empty.
 */
function stringAtNullable(value: unknown, path: string): string | null {
  const candidate = valueAtPath(value, path);
  if (typeof candidate !== "string" || candidate.length === 0) return null;
  return candidate;
}
/**
 * Reads a required finite number at a dotted path.
 *
 * @param value Root value to look into.
 * @param path Dot-separated property path.
 * @returns The number found at `path`.
 * @throws Error when the value at `path` is not a finite number.
 */
function numberAt(value: unknown, path: string): number {
  const candidate = valueAtPath(value, path);
  if (typeof candidate === "number" && Number.isFinite(candidate)) return candidate;
  throw new Error(`${path} must be a number`);
}
/**
 * Reads an optional finite number at a dotted path.
 *
 * @param value Root value to look into.
 * @param path Dot-separated property path.
 * @returns The number found at `path`, or `null` when it is missing or not a
 *   finite number.
 */
function numberAtNullable(value: unknown, path: string): number | null {
  const candidate = valueAtPath(value, path);
  if (typeof candidate !== "number" || !Number.isFinite(candidate)) return null;
  return candidate;
}
/**
 * Walks a dot-separated property path through nested plain objects.
 *
 * @param value Root value to start from.
 * @param path Dot-separated property names, e.g. "a.b.c".
 * @returns The value at the end of the path, or `undefined` as soon as a step
 *   has to be taken on something that is not a non-array object.
 */
function valueAtPath(value: unknown, path: string): unknown {
  // Once a step yields a non-record, every later step keeps yielding undefined.
  return path
    .split(".")
    .reduce<unknown>((node, key) => (isRecord(node) ? node[key] : undefined), value);
}
/**
 * Asserts that a value is a non-array object and returns it typed as a record.
 *
 * @param value Value to check.
 * @param label Name used in the error message.
 * @returns The same value, typed as `Record<string, unknown>`.
 * @throws Error when `value` is not a non-array object.
 */
function record(value: unknown, label: string): Record<string, unknown> {
  if (isRecord(value)) return value;
  throw new Error(`${label} must be an object`);
}
/**
 * Type guard for "non-null object that is not an array".
 *
 * @param value Value to test.
 * @returns `true` when `value` can be indexed as a `Record<string, unknown>`.
 */
function isRecord(value: unknown): value is Record<string, unknown> {
  if (value === null || typeof value !== "object") return false;
  return !Array.isArray(value);
}
/**
 * Builds a compact, loggable summary of a command result.
 *
 * The command is joined into one space-separated string, a missing duration
 * becomes `null`, and stdout/stderr are cut down to their last 900 characters.
 *
 * @param result Command result to summarize.
 * @returns The summary record.
 */
function compactCommand(result: CommandResult): Record<string, unknown> {
  const { command, exitCode, timedOut, durationMs, stdout, stderr } = result;
  const outputTailChars = 900;
  return {
    command: command.join(" "),
    exitCode,
    timedOut,
    durationMs: durationMs ?? null,
    stdoutTail: tail(stdout, outputTailChars),
    stderrTail: tail(stderr, outputTailChars),
  };
}
/**
 * Returns at most the last `maxChars` characters of a string.
 *
 * @param value Text to truncate.
 * @param maxChars Maximum number of trailing characters to keep. Zero,
 *   negative and NaN limits yield an empty string.
 * @returns `value` unchanged when it already fits, otherwise its tail.
 */
function tail(value: string, maxChars: number): string {
  // Guard the non-positive case explicitly: `slice(-0)` is `slice(0)`, which
  // would return the whole string instead of an empty tail.
  if (Number.isNaN(maxChars) || maxChars <= 0) return "";
  return value.length <= maxChars ? value : value.slice(-maxChars);
}
/**
 * Derives the systemd unit name for a scheduler bound to a node and lane.
 *
 * @param options Scheduler options; `node` and `lane` are sanitized with `safeSegment`.
 * @returns `unidesk-web-probe-sentinel-scheduler-<node>-<lane>`.
 */
function systemdUnitName(options: SchedulerOptions): string {
  const scope = [options.node, options.lane].map((part) => safeSegment(part)).join("-");
  return `unidesk-web-probe-sentinel-scheduler-${scope}`;
}
/**
 * Sanitizes text for use as one segment of a file or unit name.
 *
 * The text is lower-cased, every run of characters outside `a-z 0-9 . _ -` is
 * replaced by a single "-", and leading/trailing "-" are stripped.
 *
 * @param value Raw text.
 * @returns The sanitized segment, or "default" when nothing is left.
 */
function safeSegment(value: string): string {
  const hyphenated = value.toLowerCase().replace(/[^a-z0-9._-]+/gu, "-");
  const trimmed = hyphenated.replace(/^-+|-+$/gu, "");
  return trimmed.length > 0 ? trimmed : "default";
}