diff --git a/config/hwlab-web-probe-sentinel/profiles.yaml b/config/hwlab-web-probe-sentinel/profiles.yaml
index 460be77c..ca72b576 100644
--- a/config/hwlab-web-probe-sentinel/profiles.yaml
+++ b/config/hwlab-web-probe-sentinel/profiles.yaml
@@ -26,6 +26,14 @@ baselines:
intervalMs: 600000
heartbeatStaleSeconds: 900
maxConcurrentRuns: 1
+ freshnessWarningMultiple: 2
+ observability:
+ otel:
+ enabled: true
+ serviceName: hwlab-web-probe-sentinel-${nodeLower}
+ tracesEndpoint: http://otel-collector.platform-infra.svc.cluster.local:4318/v1/traces
+ sampler: parentbased_traceidratio
+ samplerArg: "1"
scheduler15m: &scheduler-15m
intervalMs: 900000
heartbeatStaleSeconds: 900
@@ -229,8 +237,15 @@ nodes:
maxSeconds: 360
serviceUnavailablePolicy: structured-failure
cadenceScheduler:
- enabled: false
- reason: cicd-health-endpoint-only
+ enabled: true
+ reason: k8s-native-periodic-quick-verify
+ concurrencyPolicy: Forbid
+ startingDeadlineSeconds: 600
+ successfulJobsHistoryLimit: 3
+ failedJobsHistoryLimit: 5
+ activeDeadlineSlackSeconds: 60
+ ttlSecondsAfterFinished: 86400
+ backoffLimit: 0
secrets:
sources:
- <<: *jd01-bootstrap-source
diff --git a/config/hwlab-web-probe-sentinel/runtime.auth-session-switch.d601-v03.yaml b/config/hwlab-web-probe-sentinel/runtime.auth-session-switch.d601-v03.yaml
index 7a040d56..000ee70f 100644
--- a/config/hwlab-web-probe-sentinel/runtime.auth-session-switch.d601-v03.yaml
+++ b/config/hwlab-web-probe-sentinel/runtime.auth-session-switch.d601-v03.yaml
@@ -28,6 +28,7 @@ sentinel:
intervalMs: 600000
heartbeatStaleSeconds: 900
maxConcurrentRuns: 1
+ freshnessWarningMultiple: 2
sqlite:
path: /var/lib/web-probe-sentinel-auth-switch/index.sqlite
busyTimeoutMs: 2000
diff --git a/config/hwlab-web-probe-sentinel/runtime.d518-v03.yaml b/config/hwlab-web-probe-sentinel/runtime.d518-v03.yaml
index 96017718..d31a7986 100644
--- a/config/hwlab-web-probe-sentinel/runtime.d518-v03.yaml
+++ b/config/hwlab-web-probe-sentinel/runtime.d518-v03.yaml
@@ -28,6 +28,7 @@ sentinel:
intervalMs: 600000
heartbeatStaleSeconds: 900
maxConcurrentRuns: 1
+ freshnessWarningMultiple: 2
sqlite:
path: /var/lib/web-probe-sentinel-dsflash/index.sqlite
busyTimeoutMs: 2000
diff --git a/config/hwlab-web-probe-sentinel/runtime.d601-v03.yaml b/config/hwlab-web-probe-sentinel/runtime.d601-v03.yaml
index 9857686e..d2d216b3 100644
--- a/config/hwlab-web-probe-sentinel/runtime.d601-v03.yaml
+++ b/config/hwlab-web-probe-sentinel/runtime.d601-v03.yaml
@@ -28,6 +28,7 @@ sentinel:
intervalMs: 600000
heartbeatStaleSeconds: 900
maxConcurrentRuns: 1
+ freshnessWarningMultiple: 2
sqlite:
path: /var/lib/web-probe-sentinel/index.sqlite
busyTimeoutMs: 2000
diff --git a/config/hwlab-web-probe-sentinel/runtime.fake-echo.d518-v03.yaml b/config/hwlab-web-probe-sentinel/runtime.fake-echo.d518-v03.yaml
index 31efffb3..ad5538dc 100644
--- a/config/hwlab-web-probe-sentinel/runtime.fake-echo.d518-v03.yaml
+++ b/config/hwlab-web-probe-sentinel/runtime.fake-echo.d518-v03.yaml
@@ -29,6 +29,7 @@ sentinel:
intervalMs: 600000
heartbeatStaleSeconds: 900
maxConcurrentRuns: 1
+ freshnessWarningMultiple: 2
sqlite:
path: /var/lib/web-probe-sentinel-fake-echo/index.sqlite
busyTimeoutMs: 2000
diff --git a/config/hwlab-web-probe-sentinel/runtime.mdtodo.d601-v03.yaml b/config/hwlab-web-probe-sentinel/runtime.mdtodo.d601-v03.yaml
index 1de9c5d9..e5d3fee0 100644
--- a/config/hwlab-web-probe-sentinel/runtime.mdtodo.d601-v03.yaml
+++ b/config/hwlab-web-probe-sentinel/runtime.mdtodo.d601-v03.yaml
@@ -28,6 +28,7 @@ sentinel:
intervalMs: 900000
heartbeatStaleSeconds: 900
maxConcurrentRuns: 1
+ freshnessWarningMultiple: 2
sqlite:
path: /var/lib/web-probe-sentinel-mdtodo/index.sqlite
busyTimeoutMs: 2000
diff --git a/project-management/PJ2026-01/specs/PJ2026-01060508-web-probe-sentinel.md b/project-management/PJ2026-01/specs/PJ2026-01060508-web-probe-sentinel.md
index d87292f5..6da678bc 100644
--- a/project-management/PJ2026-01/specs/PJ2026-01060508-web-probe-sentinel.md
+++ b/project-management/PJ2026-01/specs/PJ2026-01060508-web-probe-sentinel.md
@@ -24,6 +24,7 @@
| 多实例实现引用版本 | draft-2026-06-26-p9-multi-web-probe-sentinel |
| Monitor Web 聚合实现引用版本 | draft-2026-06-26-p10-monitor-web-aggregation |
| Monitor Web 观察面板治理实现引用版本 | draft-2026-06-27-p11-monitor-web-observability-dashboard; draft-2026-06-27-p12-cadence-scheduler-monitor-web |
+| Cadence/OTel 稳定性实现引用版本 | draft-2026-07-01-p15-cadence-otel |
| 需求规格模板 | [ISO/IEC/IEEE 29148 需求规格模板](../../templates/iso-iec-ieee-29148-requirements-spec-template.md) |
| 上级规格 | [PJ2026-010605 运维监控](PJ2026-010605-observability-monitoring.md) |
| 关联规格 | [PJ2026-010401 Web工作台](PJ2026-010401-web-workbench.md)、[PJ2026-0104010803 Workbench唯一投影](PJ2026-0104010803-workbench-unique-projection.md)、[PJ2026-010403 API契约](PJ2026-010403-api-contract.md)、[PJ2026-010601 发布流水](PJ2026-010601-controlled-release.md)、[PJ2026-010602 源码同步](PJ2026-010602-source-sync.md)、[PJ2026-010603 YAML运维](PJ2026-010603-yaml-first-ops.md)、[PJ2026-010604 公开入口](PJ2026-010604-public-entry.md)、[PJ2026-01060505 Workbench性能](PJ2026-01060505-workbench-performance.md) |
@@ -126,6 +127,7 @@ Web哨兵必须遵循 UniDesk YAML-first ops。目标 node/lane、public origin
| PJ2026-0106050811 | Monitor Web 聚合 | 本规格 6.11 | runner/web 职责拆分、单 monitor-web 聚合、Kubernetes discovery、Vue+TS 前端和 public exposure 收敛 | 多实例与账号切换、Dashboard工作台、发布集成 | `monitor.pikapython.com` 统一值守入口 |
| PJ2026-0106050812 | Monitor Web 观察面板治理 | 本规格 6.12 | 趋势曲线、运行时间线、固定视口三栏、cadence freshness、Vue CI/CD/env reuse/git mirror | Monitor Web 聚合、Dashboard工作台、发布集成、源码同步 | 可滚动上线和值守的统一观察面板 |
| PJ2026-0106050814 | 哨兵 CI/CD 可见性 | 本规格 6.14 | publish 阶段耗时、env reuse、docker cache、超时诊断、git mirror/Argo/runtime 收敛下一步 | 发布集成、源码同步、Monitor Web 观察面板治理 | 小改动滚动上线可诊断、可续跑、可验收 |
+| PJ2026-0106050815 | Cadence/OTel 稳定性 | 本规格 6.15 | Kubernetes CronJob 周期巡检、状态缺口故障码、monitor-web cadence 可见性和 sentinel OTel span 合同 | Monitor Web 观察面板治理、发布集成、OTel、YAML运维 | JD01/v03 周期巡检恢复和后续防回归 |
### 5.1 目标架构图
@@ -567,6 +569,8 @@ P7 dashboard 增强范围内新增或修改的 dashboard API、frontend assets
P10/P11 monitor-web 范围内新增或修改的 Vue/TypeScript/Vite 前端、typed API client、聚合 API、runner discovery、dashboard verify/screenshot、CI/CD renderer、GitOps/publicExposure helper 和 env reuse 规划代码必须标注 `SPEC: PJ2026-01060508 Web哨兵 draft-2026-06-27-p11-monitor-web-observability-dashboard`。旧 `scripts/assets/web-probe-sentinel-dashboard/dashboard.js` 只能标注迁移前短修或兼容验证用途,不得作为 P11 新观察面板能力的主要承载面。
+P15 cadence/OTel 范围内新增或修改的 CronJob renderer/status probe、runner health/overview、quick verify record path、monitor-web cadence 展示和 OTel emitter 源码文件头部必须标注 `SPEC: PJ2026-01060508 Web哨兵 draft-2026-07-01-p15-cadence-otel`。
+
实现文件不得只写 issue 编号、`latest`、`current` 或“按最新方案”作为规格引用。自动生成文件、第三方 vendored 文件、纯 YAML/config、锁文件和无法承载注释头的二进制产物不要求加源码头部,但对应生成器、渲染器、owning YAML 或 CLI 入口必须能追溯到本 SPEC。
后续 P1-P6 阶段如果改变稳定需求、观察对象、数据流、接口、部署边界或验收口径,应先更新本规格和上级 [PJ2026-010605 运维监控](PJ2026-010605-observability-monitoring.md),再更新执行 issue。
@@ -689,6 +693,24 @@ publish Job 未在等待预算内结束时,CLI 必须输出 job 名称、pod
JD01/v03 `jd01-web-probe-sentinel` 的小改动滚动上线是本阶段验收入口。closeout 必须记录 SPEC P14 引用、source commit、publish job、digest、GitOps revision、git mirror pending/inSync、Argo/runtime alignment、`validate`、远程 dashboard screenshot 和 latest report 证据。若总等待仍超过 120s,closeout 必须记录阶段归因、env reuse/cache 摘要和下一步优化方向;不得通过单纯放宽 120s 预算收口。
+### 6.15 OPS-SENTINEL-REQ-015 Cadence 调度稳定性与 OTel 覆盖
+
+| 编号 | 短名 | 主责模块 | 关联模块 |
+| --- | --- | --- | --- |
+| OPS-SENTINEL-REQ-015 | Cadence/OTel 稳定性 | PJ2026-0106050815 Cadence/OTel 稳定性 | Monitor Web 观察面板治理、发布集成、OTel、YAML运维 |
+
+本阶段执行 issue 为 [#1372](https://github.com/pikasTech/unidesk/issues/1372),阶段子 issue 为 P0 [#1374](https://github.com/pikasTech/unidesk/issues/1374)、P1 [#1377](https://github.com/pikasTech/unidesk/issues/1377)、P2 [#1375](https://github.com/pikasTech/unidesk/issues/1375)、P3 [#1378](https://github.com/pikasTech/unidesk/issues/1378) 和 P4 [#1376](https://github.com/pikasTech/unidesk/issues/1376)。
+
+Web 哨兵周期巡检必须由目标 node/lane 的 Kubernetes CronJob/GitOps 受控对象承载。CronJob 的 enabled、scenarioId、cadence 来源、startingDeadlineSeconds、successfulJobsHistoryLimit、failedJobsHistoryLimit、activeDeadlineSlackSeconds、ttlSecondsAfterFinished、backoffLimit、concurrencyPolicy、targetValidation.maxSeconds、sampleInterval、screenshotInterval、maxRunSeconds、retention 和 OTel endpoint/sampling 都必须来自 owning YAML/configRef;代码只能解析、校验和渲染,不得用隐藏默认补阈值、历史保留、deadline、timeout、并发策略或采样策略。
+
+`web-probe sentinel control-plane status` 必须把 CronJob 作为独立 observed check,而不是只相信 Argo `Synced/Healthy`。当 YAML 启用 cadenceScheduler 但线上缺 CronJob 时,状态必须 blocked,故障码固定为 `sentinel-cadence-cronjob-missing`;schedule 不一致使用 `sentinel-cadence-cronjob-schedule-mismatch`;CronJob suspend 使用 `sentinel-cadence-cronjob-suspended`。状态输出至少展示 CronJob name、namespace、schedule/expectedSchedule、lastScheduleTime、lastSuccessfulTime、active job count、jobCount 和 latest job name。
+
+`monitor-web` 必须把 cadence freshness 作为一等状态:显示 YAML expected cadence、scheduler heartbeat age、latest run age、latest analyzed report age、active runs、planned runs、stale multiple、CronJob 观察状态和 OTel coverage/gap。runner API 不直接查询 Kubernetes CronJob 时,页面必须显式显示 `control-plane-status-required`,不得让用户误以为 CronJob 已观察通过。
+
+Web 哨兵必须向平台 OTel 后端发出有界、脱敏的 span 或在状态中显式标记 instrumentation gap。P15 span 名称固定为:`web_probe_sentinel.scheduler.heartbeat`、`web_probe_sentinel.cadence.expected`、`web_probe_sentinel.cadence.cronjob_rendered`、`web_probe_sentinel.cadence.cronjob_observed`、`web_probe_sentinel.quick_verify.job_start`、`web_probe_sentinel.quick_verify.job_finish`、`web_probe_sentinel.record_run`、`web_probe_sentinel.scheduler_gap.detected`。属性至少包含 node、lane、sentinelId、scenarioId、runId、cronJobName、jobName、podName、namespace、cadence、status、exitCode、failureKind、gitopsRevision、sourceCommit、imageDigest 和 valuesRedacted;不存在的属性可以省略但不得打印 Secret、prompt、cookie、provider payload 或完整 stdout/stderr。
+
+CI/CD rollout 门禁仍只验证配置声明的 `/health` endpoint。web-probe quick verify、Playwright/browser render、dashboard screenshot 和 OTel trace search 都是独立的 post-deploy evidence;不得重新塞回 `trigger-current --confirm --wait` 的同步门禁,也不得引入 Docker daemon/socket 依赖。
+
## 7. 过程控制
Web哨兵架构执行 issue 为 [#883](https://github.com/pikasTech/unidesk/issues/883)。阶段跟踪 issue 为 P0 [#885](https://github.com/pikasTech/unidesk/issues/885)、P1 [#886](https://github.com/pikasTech/unidesk/issues/886)、P2 [#887](https://github.com/pikasTech/unidesk/issues/887)、P3 [#888](https://github.com/pikasTech/unidesk/issues/888)、P4 [#889](https://github.com/pikasTech/unidesk/issues/889)、P5 [#890](https://github.com/pikasTech/unidesk/issues/890) 和 P6 [#891](https://github.com/pikasTech/unidesk/issues/891)。
@@ -714,3 +736,5 @@ P12 cadence 调度和 monitor-web 交互修复执行 issue 为 [#1123](https://g
P13 D518 多 runner 强边界与 OTel 根因收敛执行 issue 为 [#1206](https://github.com/pikasTech/unidesk/issues/1206)。P13 closeout 必须回写:SPEC P13 引用、[#1208](https://github.com/pikasTech/unidesk/issues/1208)-[#1216](https://github.com/pikasTech/unidesk/issues/1216) 阶段状态、D518 双 sentinel 独立 Deployment/Service/PVC/CronJob/GitOps/Argo/public route 证据、route/API sentinelId 强断言、report/index 不串线证据、dashboard verify/screenshot localPath/SHA、k3s CronJob 调度证据、latest selected run 与 historical trend 状态分层证据、以及 OTel AgentRun namespace/trace gap 是否已解除或拆入后续 issue。
P14 Web 哨兵 CI/CD 可见性执行 issue 为 [#1285](https://github.com/pikasTech/unidesk/issues/1285)。P14 closeout 必须回写:SPEC P14 引用、source commit、PR/merge commit、JD01/v03 `jd01-web-probe-sentinel` publish job、digest、GitOps revision、git mirror flush 状态、Argo/runtime observed alignment、`validate`、dashboard screenshot、latest report,以及超过 120s 时的结构化阶段归因和可续跑命令。
+
+P15 Cadence 调度稳定性与 OTel 覆盖执行 issue 为 [#1372](https://github.com/pikasTech/unidesk/issues/1372)。P15 closeout 必须回写:SPEC P15 引用、[#1374](https://github.com/pikasTech/unidesk/issues/1374)-[#1378](https://github.com/pikasTech/unidesk/issues/1378) 阶段状态、JD01/v03 `jd01-web-probe-sentinel` CronJob manifest/GitOps/Argo/runtime observed 证据、`sentinel-cadence-cronjob-missing` 防回归状态、monitor-web cadence/OTel coverage 显示、OTel trace search 或 instrumentation-gap 证据、受控 rollout/publish job、GitOps revision、source commit、dashboard/health 验收,以及 CI/CD 门禁仍只验证 `/health` 的证据。
diff --git a/scripts/assets/web-probe-sentinel-monitor-web/monitor-web.js b/scripts/assets/web-probe-sentinel-monitor-web/monitor-web.js
index 48fcb771..84f018de 100644
--- a/scripts/assets/web-probe-sentinel-monitor-web/monitor-web.js
+++ b/scripts/assets/web-probe-sentinel-monitor-web/monitor-web.js
@@ -1,4 +1,5 @@
// SPEC: PJ2026-01060508 Web哨兵 draft-2026-06-27-p12-cadence-scheduler-monitor-web.
+// SPEC: PJ2026-01060508 Web哨兵 draft-2026-07-01-p15-cadence-otel.
// Responsibility: Vue monitor-web runtime for sentinel trend, timeline, detail and finding observability.
import { createApp, computed, nextTick, onMounted, ref, watch } from "./vendor/vue.esm-browser.prod.js";
@@ -156,18 +157,28 @@ createApp({
return `运行记录 · ${checkScopeRun.value?.id || "未选择"}`;
});
const cadence = computed(() => {
+ const apiCadence = overview.value?.cadence || {};
const intervalMs = Number(overview.value?.scheduler?.intervalMs || 0);
- const latestAge = Number(overview.value?.freshness?.latestRunAgeSeconds ?? -1);
- const heartbeatAge = Number(overview.value?.freshness?.schedulerHeartbeatAgeSeconds ?? -1);
- const intervalSeconds = intervalMs > 0 ? Math.round(intervalMs / 1000) : 0;
- const stale = intervalSeconds > 0 && latestAge > intervalSeconds * 2;
+ const latestAge = Number(apiCadence.latestRunAgeSeconds ?? overview.value?.freshness?.latestRunAgeSeconds ?? -1);
+ const heartbeatAge = Number(apiCadence.schedulerHeartbeatAgeSeconds ?? overview.value?.freshness?.schedulerHeartbeatAgeSeconds ?? -1);
+ const intervalSeconds = Number(apiCadence.expectedCadenceSeconds || 0) || (intervalMs > 0 ? Math.round(intervalMs / 1000) : 0);
+ const status = String(apiCadence.status || "");
+ const stale = status === "warning" || status === "blocker";
+ const cronJob = apiCadence.cronJob || {};
+ const observability = overview.value?.observability || {};
return {
intervalSeconds,
latestAge,
heartbeatAge,
+ status,
stale,
+ blocker: status === "blocker",
+ cronJob,
+ observability,
label: intervalSeconds > 0 ? `${formatDuration(intervalSeconds)} 间隔` : "未配置",
- alert: stale ? `最近运行 ${formatDuration(latestAge)} 前,超过预设间隔 2 倍;按 SPEC 作为非阻塞报警展示。` : "运行新鲜度在预设窗口内",
+ alert: stale
+ ? `最近运行 ${formatDuration(latestAge)} 前;状态 ${status || "warning"},阈值来自 YAML。`
+ : "运行新鲜度在 YAML 窗口内",
};
});
const healthChecks = computed(() => {
@@ -668,6 +679,14 @@ createApp({
调度新鲜度
{{ cadence.latestAge >= 0 ? formatDuration(cadence.latestAge) : "-" }}
+
+ CronJob
+ {{ cadence.cronJob?.status || "-" }}
+
+
+ OTel
+ {{ cadence.observability?.coverage || "-" }}
+
历史错误样本
{{ redCount({ severityCounts: severityTotals }) }}
diff --git a/scripts/src/hwlab-node-web-sentinel-cicd.ts b/scripts/src/hwlab-node-web-sentinel-cicd.ts
index 7ffb91b6..6c23af31 100644
--- a/scripts/src/hwlab-node-web-sentinel-cicd.ts
+++ b/scripts/src/hwlab-node-web-sentinel-cicd.ts
@@ -6,6 +6,7 @@
// SPEC: PJ2026-01060508 Web哨兵 draft-2026-06-27-p12-cadence-scheduler-monitor-web.
// SPEC: PJ2026-01060508 Web哨兵 draft-2026-06-28-p13-1206-multi-runner-boundaries.
// SPEC: PJ2026-01060508 Web哨兵 draft-2026-06-30-p14-sentinel-cicd-visibility.
+// SPEC: PJ2026-01060508 Web哨兵 draft-2026-07-01-p15-cadence-otel.
// Responsibility: YAML-first CI/CD, image, GitOps and Argo command plan for the web-probe sentinel.
import { createHash, randomUUID } from "node:crypto";
import { existsSync, readFileSync } from "node:fs";
@@ -20,6 +21,7 @@ import type { HwlabRuntimeLaneSpec } from "./hwlab-node-lanes";
import type { RenderedCliResult } from "./output";
import { probeSentinelRuntimeHealthEndpoint, runSentinelDashboard, runSentinelMaintenance, runSentinelReport, runSentinelValidate } from "./hwlab-node-web-sentinel-p5";
import { runChildCli, sentinelP5Next } from "./hwlab-node-web-sentinel-p5-observe";
+import { emitWebProbeSentinelSpan, webProbeSentinelOtelSummary } from "./hwlab-node-web-sentinel-otel";
export type WebProbeSentinelConfigAction = "plan" | "status";
export type WebProbeSentinelImageAction = "status" | "build";
@@ -178,6 +180,7 @@ interface SentinelObservedStatus {
readonly gitops: Record;
readonly argo: Record;
readonly runtime: Record;
+ readonly cadence: Record;
readonly wait?: Record;
}
@@ -215,7 +218,7 @@ export interface ChildCliResult {
readonly result: CompactCommandResult & { stdoutTail: string; stderrTail: string };
}
-const SPEC_REF = "PJ2026-01060508 Web哨兵 draft-2026-06-30-p14-sentinel-cicd-visibility";
+const SPEC_REF = "PJ2026-01060508 Web哨兵 draft-2026-07-01-p15-cadence-otel";
export function runWebProbeSentinelCommand(spec: HwlabRuntimeLaneSpec, options: WebProbeSentinelOptions): RenderedCliResult {
if (options.kind === "config") return withWebProbeSentinelConfigRendered(webProbeSentinelConfigPlan(spec, options.action, options.sentinelId));
@@ -317,6 +320,14 @@ function runSentinelControlPlane(state: SentinelCicdState, options: Extract): readonly Record[] {
+function sentinelContainerEnv(sentinelId: string, runtime: Record, cicd: Record, secrets: Record): readonly Record[] {
const env: Record[] = [{ name: "UNIDESK_WEB_PROBE_SENTINEL_ID", value: sentinelId }];
+ const otelEnabled = booleanAtNullable(runtime, "observability.otel.enabled") ?? booleanAtNullable(cicd, "observability.otel.enabled") ?? false;
+ const otelEndpoint = stringAtNullable(runtime, "observability.otel.tracesEndpoint")
+ ?? stringAtNullable(runtime, "observability.otel.endpoint")
+ ?? stringAtNullable(cicd, "observability.otel.tracesEndpoint")
+ ?? stringAtNullable(cicd, "observability.otel.endpoint");
+ const otelServiceName = stringAtNullable(runtime, "observability.otel.serviceName") ?? stringAtNullable(cicd, "observability.otel.serviceName");
+ const otelSampler = stringAtNullable(runtime, "observability.otel.sampler") ?? stringAtNullable(cicd, "observability.otel.sampler");
+ const otelSamplerArg = stringAtNullable(runtime, "observability.otel.samplerArg") ?? stringAtNullable(cicd, "observability.otel.samplerArg");
const sourcesByPurpose = new Map>();
for (const source of arrayAt(secrets, "sources").map(record)) {
const purpose = stringAtNullable(source, "purpose");
@@ -1074,6 +1109,12 @@ function sentinelContainerEnv(sentinelId: string, secrets: Record[],
): Record | null {
+ const scheduler = record(valueAtPath(cicd, "targetValidation.cadenceScheduler"));
const cadenceSchedulerEnabled = booleanAtNullable(cicd, "targetValidation.cadenceScheduler.enabled") === true;
if (!cadenceSchedulerEnabled) return null;
const scenarioId = stringAtNullable(cicd, "targetValidation.scenarioId");
@@ -1114,9 +1156,12 @@ function sentinelCadenceCronJobPlan(
const namespace = stringAt(runtime, "namespace");
const deploymentName = stringAt(runtime, "deploymentName");
const serviceAccountName = stringAt(runtime, "serviceAccountName");
- const timeoutSeconds = numberAtNullable(cicd, "targetValidation.maxSeconds") ?? numberAtNullable(scenario, "maxRunSeconds") ?? 300;
+ const timeoutSeconds = numberAt(cicd, "targetValidation.maxSeconds");
+ const activeDeadlineSlackSeconds = numberAt(scheduler, "activeDeadlineSlackSeconds");
const mainServerHost = stringAtNullable(cicd, "scheduler.mainServerHost");
- const name = safeKubernetesSegment(`${deploymentName}-quick-verify`, 52);
+ const name = sentinelCadenceCronJobName(deploymentName);
+ const concurrencyPolicy = stringAt(scheduler, "concurrencyPolicy");
+ if (!["Allow", "Forbid", "Replace"].includes(concurrencyPolicy)) throw new Error("targetValidation.cadenceScheduler.concurrencyPolicy must be Allow, Forbid or Replace");
const labels = {
"app.kubernetes.io/name": name,
"app.kubernetes.io/part-of": "hwlab-web-probe-sentinel",
@@ -1137,19 +1182,20 @@ function sentinelCadenceCronJobPlan(
annotations: {
"unidesk.ai/cadence": String(scenario.cadence),
"unidesk.ai/target-validation-max-seconds": String(timeoutSeconds),
+ "unidesk.ai/source": "targetValidation.cadenceScheduler",
},
},
spec: {
schedule,
- concurrencyPolicy: "Forbid",
- successfulJobsHistoryLimit: 3,
- failedJobsHistoryLimit: 5,
- startingDeadlineSeconds: Math.max(60, cadenceSeconds),
+ concurrencyPolicy,
+ successfulJobsHistoryLimit: numberAt(scheduler, "successfulJobsHistoryLimit"),
+ failedJobsHistoryLimit: numberAt(scheduler, "failedJobsHistoryLimit"),
+ startingDeadlineSeconds: numberAt(scheduler, "startingDeadlineSeconds"),
jobTemplate: {
spec: {
- activeDeadlineSeconds: timeoutSeconds + 60,
- ttlSecondsAfterFinished: 86400,
- backoffLimit: 0,
+ activeDeadlineSeconds: timeoutSeconds + activeDeadlineSlackSeconds,
+ ttlSecondsAfterFinished: numberAt(scheduler, "ttlSecondsAfterFinished"),
+ backoffLimit: numberAt(scheduler, "backoffLimit"),
template: {
metadata: { labels },
spec: {
@@ -1190,6 +1236,10 @@ function sentinelCadenceCronJobPlan(
};
}
+// Derives the cadence CronJob name from the deployment name; shared by the renderer so the name is built in one place.
+function sentinelCadenceCronJobName(deploymentName: string): string {
+  const candidate = `${deploymentName}-quick-verify`;
+  // 52 is the length budget handed to safeKubernetesSegment for this name.
+  return safeKubernetesSegment(candidate, 52);
+}
+
function scenarioRows(value: unknown): Record[] {
if (Array.isArray(value)) return value.map(record);
if (!isRecord(value)) return [];
@@ -1605,6 +1655,7 @@ function sentinelSkippedObservedStatus(reason: string): SentinelObservedStatus {
gitops: skipped,
argo: skipped,
runtime: skipped,
+ cadence: skipped,
wait: {
polls: 0,
elapsedMs: 0,
@@ -1633,6 +1684,7 @@ function collectSentinelObservedStatus(state: SentinelCicdState, timeoutSeconds:
gitops,
argo: probeArgoApplication(state, timeoutSeconds, effectiveExpectation.gitopsRevision),
runtime: probeRuntimeObjects(state, timeoutSeconds, effectiveExpectation.runtimeImage),
+ cadence: probeCadenceCronJob(state, timeoutSeconds),
};
}
@@ -1668,13 +1720,15 @@ function sentinelObservedReady(value: Record | SentinelObserved
&& gitMirrorReady
&& record(observed.gitops).ok === true
&& record(observed.argo).ok === true
- && record(observed.runtime).ok === true;
+ && record(observed.runtime).ok === true
+ && record(observed.cadence).ok === true;
}
function sentinelObservedWarnings(value: Record | SentinelObservedStatus | null): string[] {
const observed = record(value);
const argo = record(observed.argo);
- return mergeWarnings(argo.warning);
+ const cadence = record(observed.cadence);
+ return mergeWarnings(argo.warning, cadence.warning);
}
function probeSourceMirror(state: SentinelCicdState, timeoutSeconds: number): Record {
@@ -1900,6 +1954,74 @@ function probeRuntimeObjects(state: SentinelCicdState, timeoutSeconds: number, e
return { ok: result.exitCode === 0 && probe?.ok === true, probe, result: compactCommand(result) };
}
+/**
+ * Observes the rendered cadence CronJob on the control-plane cluster and classifies its state.
+ *
+ * When no CronJob manifest was rendered the check is reported as ok/skipped. Otherwise a remote
+ * kubectl + node script reports presence, schedule match, suspend flag and recent jobs; the result
+ * is emitted as the `web_probe_sentinel.cadence.cronjob_observed` span and returned to the caller.
+ *
+ * @param state CI/CD state providing manifests, sentinel id, control-plane route and config records.
+ * @param timeoutSeconds Caller budget; the remote command timeout is capped at 60 seconds.
+ * @returns Record with `ok`, `probe`, compacted command `result`, `warning` and `valuesRedacted`.
+ */
+function probeCadenceCronJob(state: SentinelCicdState, timeoutSeconds: number): Record {
+  const expected = state.manifests.find((item) => item.kind === "CronJob") ?? null;
+  if (expected === null) {
+    return { ok: true, skipped: true, reason: "targetValidation.cadenceScheduler.disabled", valuesRedacted: true };
+  }
+  const metadata = record(expected.metadata);
+  const spec = record(expected.spec);
+  const namespace = stringAt(metadata, "namespace");
+  const name = stringAt(metadata, "name");
+  const expectedSchedule = stringAt(spec, "schedule");
+  const script = [
+    "set +e",
+    `namespace=${shellQuote(namespace)}`,
+    `cronjob=${shellQuote(name)}`,
+    `sentinel=${shellQuote(state.sentinelId)}`,
+    `expected_schedule=${shellQuote(expectedSchedule)}`,
+    "tmp=$(mktemp -d)",
+    // Remove the scratch directory when the shell exits so repeated status polls do not leak temp dirs.
+    "trap 'rm -rf \"$tmp\"' EXIT",
+    "kubectl -n \"$namespace\" get cronjob \"$cronjob\" -o json >\"$tmp/cronjob.json\" 2>/dev/null; echo $? >\"$tmp/cronjob.rc\"",
+    "kubectl -n \"$namespace\" get jobs -l \"unidesk.ai/web-probe-sentinel-id=$sentinel,app.kubernetes.io/component=cadence-scheduler\" -o json >\"$tmp/jobs.json\" 2>/dev/null; echo $? >\"$tmp/jobs.rc\"",
+    "node - \"$tmp\" \"$namespace\" \"$cronjob\" \"$expected_schedule\" <<'NODE'",
+    "const fs = require('node:fs');",
+    "const [dir, namespace, cronJobName, expectedSchedule] = process.argv.slice(2);",
+    "function rc(name){ try { return Number(fs.readFileSync(`${dir}/${name}.rc`, 'utf8').trim()); } catch { return 1; } }",
+    "function json(name){ try { return JSON.parse(fs.readFileSync(`${dir}/${name}.json`, 'utf8')); } catch { return null; } }",
+    "const cron = json('cronjob');",
+    // Parse jobs.json once instead of re-reading the file for the items lookup.
+    "const jobsDoc = json('jobs');",
+    "const jobs = Array.isArray(jobsDoc?.items) ? jobsDoc.items : [];",
+    "const present = rc('cronjob') === 0 && !!cron;",
+    "const schedule = cron?.spec?.schedule || null;",
+    "const scheduleMatches = present && schedule === expectedSchedule;",
+    "const suspended = cron?.spec?.suspend === true;",
+    "const active = Array.isArray(cron?.status?.active) ? cron.status.active.length : 0;",
+    "const sortedJobs = jobs.slice().sort((a,b)=>String(b?.metadata?.creationTimestamp||'').localeCompare(String(a?.metadata?.creationTimestamp||''))).slice(0,8);",
+    "let code = null;",
+    "if (!present) code = 'sentinel-cadence-cronjob-missing';",
+    "else if (!scheduleMatches) code = 'sentinel-cadence-cronjob-schedule-mismatch';",
+    "else if (suspended) code = 'sentinel-cadence-cronjob-suspended';",
+    "const latestJob = sortedJobs[0] || null;",
+    "console.log(JSON.stringify({ ok: code === null, code, present, namespace, name: cronJobName, schedule, expectedSchedule, scheduleMatches, suspended, lastScheduleTime: cron?.status?.lastScheduleTime || null, lastSuccessfulTime: cron?.status?.lastSuccessfulTime || null, active, jobCount: jobs.length, latestJobs: sortedJobs.map((job)=>({ name: job?.metadata?.name || null, createdAt: job?.metadata?.creationTimestamp || null, active: Number(job?.status?.active || 0), succeeded: Number(job?.status?.succeeded || 0), failed: Number(job?.status?.failed || 0), completionTime: job?.status?.completionTime || null, valuesRedacted:true })), latestJobName: latestJob?.metadata?.name || null, valuesRedacted: true }));",
+    "NODE",
+  ].join("\n");
+  const result = runCommand(["trans", stringAt(state.controlPlaneNode, "kubeRoute"), "sh", "--", script], repoRoot, { timeoutMs: Math.min(timeoutSeconds, 60) * 1000 });
+  const probe = parseJsonObject(result.stdout);
+  // Both the remote command and the classified probe must succeed for the check to pass.
+  const ok = result.exitCode === 0 && probe?.ok === true;
+  emitWebProbeSentinelSpan({
+    node: state.spec.nodeId,
+    lane: state.spec.lane,
+    sentinelId: state.sentinelId,
+    namespace,
+    runtime: state.runtime,
+    cicd: state.cicd,
+  }, "web_probe_sentinel.cadence.cronjob_observed", {
+    cronJobName: name,
+    namespace,
+    schedule: expectedSchedule,
+    status: ok ? "ok" : text(probe?.code ?? "unknown"),
+    jobName: probe?.latestJobName ?? null,
+    failureKind: probe?.code ?? null,
+    valuesRedacted: true,
+  }, ok);
+  return {
+    ok,
+    probe,
+    result: compactCommand(result),
+    warning: ok ? null : `cadence CronJob is not ready: ${text(probe?.code ?? "probe-failed")}`,
+    valuesRedacted: true,
+  };
+}
+
function expectedRuntimeImageFromRegistry(state: SentinelCicdState, registry: Record): string | null {
const digest = nonEmptyString(record(record(registry).probe).digest);
if (digest === null) return null;
@@ -3816,6 +3938,7 @@ function renderControlPlaneResult(result: Record): string {
const gitops = record(result.gitops);
const argo = record(result.argo);
const validation = record(result.validation);
+ const observability = record(result.observability);
const observed = record(result.observed);
const sourceMirrorSync = record(result.sourceMirrorSync);
const publish = record(result.publish);
@@ -3841,6 +3964,8 @@ function renderControlPlaneResult(result: Record): string {
"",
table(["SCENARIO", "MAX_SECONDS", "CI_WAIT", "QVERIFY", "SECOND_PATH"], [[validation.scenarioId, validation.maxSeconds, validation.controlPlaneWaitMaxSeconds ?? "-", validation.quickVerifyMode ?? "-", validation.automaticSecondPath]]),
"",
+ Object.keys(observability).length === 0 ? "OTEL\n-" : table(["ENABLED", "ENDPOINT", "SERVICE", "COVERAGE"], [[observability.enabled, observability.endpointConfigured, observability.serviceName, observability.coverage]]),
+ "",
renderObservedStatus(observed),
"",
Object.keys(sourceMirrorSync).length === 0 ? "SOURCE_MIRROR_SYNC\n-" : table(["OK", "PHASE", "JOB", "COMMIT", "ELAPSED"], [[sourceMirrorSync.ok, sourceMirrorSync.phase, sourceMirrorSync.jobName, short(record(sourceMirrorSync.payload).mirrorCommit), sourceMirrorSync.elapsedMs ?? "-"]]),
@@ -3913,6 +4038,7 @@ function renderObservedStatus(observed: Record): string {
observedStatusRow("gitops", observed.gitops),
observedStatusRow("argo", observed.argo),
observedStatusRow("runtime", observed.runtime),
+ observedStatusRow("cadence", observed.cadence),
].filter((row) => row !== null);
if (rows.length === 0) return "OBSERVED\n-";
return table(["CHECK", "OK", "DETAIL", "EXIT", "TIMED_OUT", "PREVIEW"], rows);
@@ -3944,6 +4070,11 @@ function observedDetail(name: string, item: Record): string {
const deployment = record(probe.deployment);
return `ready=${deployment.readyReplicas ?? "-"} image=${short(deployment.image)}/${short(deployment.expectedImage)}`;
}
+ if (name === "cadence") {
+ if (item.skipped === true) return `${item.reason ?? "skipped"}`;
+ const probe = record(item.probe);
+ return `${probe.code ?? "ok"} schedule=${probe.schedule ?? "-"}/${probe.expectedSchedule ?? "-"} last=${probe.lastScheduleTime ?? "-"} jobs=${probe.jobCount ?? "-"}`;
+ }
return "-";
}
diff --git a/scripts/src/hwlab-node-web-sentinel-otel.ts b/scripts/src/hwlab-node-web-sentinel-otel.ts
new file mode 100644
index 00000000..36fd774b
--- /dev/null
+++ b/scripts/src/hwlab-node-web-sentinel-otel.ts
@@ -0,0 +1,143 @@
+// SPEC: PJ2026-01060508 Web哨兵 draft-2026-07-01-p15-cadence-otel.
+// Responsibility: Best-effort OTLP span emitter for web-probe sentinel scheduler, cadence and quick-verify events.
+import { randomBytes } from "node:crypto";
+
+export interface SentinelOtelContext { // Identity and config inputs shared by the span emitter and the OTel summary.
+ readonly node: string; // Exported as `unidesk.node`; lowercased for the fallback service name.
+ readonly lane: string; // Exported as `hwlab.lane` and `deployment.environment`.
+ readonly sentinelId: string; // Exported as the `sentinelId` span attribute.
+ readonly namespace?: string | null; // Exported as `k8s.namespace.name`; falls back to `runtime.namespace` when absent.
+ readonly runtime?: Record; // Searched first for `observability.otel.*` settings.
+ readonly cicd?: Record; // Searched after `runtime` for `observability.otel.*` settings.
+}
+
+/**
+ * Posts one span as a JSON `resourceSpans` payload to the configured traces endpoint.
+ *
+ * The call is fire-and-forget: nothing is sent when OTel is disabled or no endpoint resolves, and
+ * a rejected request is swallowed so span emission never fails the caller.
+ *
+ * @param context Node/lane/sentinel identity plus the config records used to resolve OTel settings.
+ * @param name Span name.
+ * @param attributes Extra span attributes; spread last, so they override the built-in ones on key clash.
+ * @param ok Maps to span status code 1 when true and 2 when false.
+ */
+export function emitWebProbeSentinelSpan(context: SentinelOtelContext, name: string, attributes: Record = {}, ok = true): void {
+  const { enabled, endpoint, serviceName } = resolveOtelConfig(context);
+  if (endpoint === null || !enabled) return;
+  // The span is synthetic: it starts "now" and is given a fixed one-millisecond duration.
+  const startNanos = BigInt(Date.now()) * 1_000_000n;
+  const endNanos = startNanos + 1_000_000n;
+  const traceId = randomHex(16);
+  const spanId = randomHex(8);
+  const resource = {
+    attributes: otelAttributes({
+      "service.name": serviceName,
+      "deployment.environment": context.lane,
+      "unidesk.node": context.node,
+      "hwlab.lane": context.lane,
+      "k8s.namespace.name": context.namespace ?? stringAtNullable(context.runtime, "namespace"),
+      "unidesk.values_redacted": true,
+    }),
+  };
+  const span = {
+    traceId,
+    spanId,
+    name,
+    kind: 1,
+    startTimeUnixNano: startNanos.toString(),
+    endTimeUnixNano: endNanos.toString(),
+    attributes: otelAttributes({
+      "unidesk.node": context.node,
+      "hwlab.lane": context.lane,
+      "sentinelId": context.sentinelId,
+      "valuesRedacted": true,
+      ...attributes,
+    }),
+    status: { code: ok ? 1 : 2 },
+  };
+  const scope = { name: "unidesk.web_probe_sentinel", version: "PJ2026-01060508" };
+  const body = JSON.stringify({ resourceSpans: [{ resource, scopeSpans: [{ scope, spans: [span] }] }] });
+  void fetch(endpoint, {
+    method: "POST",
+    headers: { "content-type": "application/json" },
+    body,
+  }).catch(() => undefined);
+}
+
+/**
+ * Summarises the resolved OTel configuration for status output.
+ *
+ * Coverage is `best-effort-otlp-spans` only when OTel is enabled and an endpoint resolves;
+ * otherwise it is reported as `instrumentation-gap`.
+ *
+ * @param context Identity and config records used to resolve OTel settings.
+ * @returns Record with enabled flag, endpoint presence, service name, coverage and expected span names.
+ */
+export function webProbeSentinelOtelSummary(context: SentinelOtelContext): Record {
+  const { enabled, endpoint, serviceName } = resolveOtelConfig(context);
+  const endpointConfigured = endpoint !== null;
+  let coverage = "instrumentation-gap";
+  if (enabled && endpointConfigured) coverage = "best-effort-otlp-spans";
+  const expectedSpans = [
+    "web_probe_sentinel.scheduler.heartbeat",
+    "web_probe_sentinel.cadence.expected",
+    "web_probe_sentinel.cadence.cronjob_rendered",
+    "web_probe_sentinel.cadence.cronjob_observed",
+    "web_probe_sentinel.quick_verify.job_start",
+    "web_probe_sentinel.quick_verify.job_finish",
+    "web_probe_sentinel.record_run",
+    "web_probe_sentinel.scheduler_gap.detected",
+  ];
+  return {
+    enabled,
+    endpointConfigured,
+    serviceName,
+    coverage,
+    expectedSpans,
+    valuesRedacted: true,
+  };
+}
+
+/**
+ * Resolves the effective OTel settings from the runtime record, then the cicd record, then env.
+ *
+ * Enablement: `OTEL_SDK_DISABLED` (1/true) always disables. Otherwise an explicit YAML
+ * `observability.otel.enabled` value wins in both directions; only when YAML is silent does the
+ * presence of an endpoint imply enabled. Previously a configured endpoint overrode an explicit
+ * `enabled: false`.
+ *
+ * @param context Identity and optional `runtime` / `cicd` config records.
+ * @returns Effective `enabled` flag, traces endpoint (or null) and service name.
+ */
+function resolveOtelConfig(context: SentinelOtelContext): { readonly enabled: boolean; readonly endpoint: string | null; readonly serviceName: string } {
+  const runtime = context.runtime ?? {};
+  const cicd = context.cicd ?? {};
+  const enabledFromYaml = booleanAtNullable(runtime, "observability.otel.enabled")
+    ?? booleanAtNullable(cicd, "observability.otel.enabled");
+  const disabledByEnv = /^(1|true)$/iu.test(process.env.OTEL_SDK_DISABLED ?? "");
+  const endpoint = stringAtNullable(runtime, "observability.otel.tracesEndpoint")
+    ?? stringAtNullable(runtime, "observability.otel.endpoint")
+    ?? stringAtNullable(cicd, "observability.otel.tracesEndpoint")
+    ?? stringAtNullable(cicd, "observability.otel.endpoint")
+    ?? nonEmptyString(process.env.OTEL_EXPORTER_OTLP_TRACES_ENDPOINT);
+  const serviceName = stringAtNullable(runtime, "observability.otel.serviceName")
+    ?? stringAtNullable(cicd, "observability.otel.serviceName")
+    ?? nonEmptyString(process.env.OTEL_SERVICE_NAME)
+    ?? `hwlab-web-probe-sentinel-${context.node.toLowerCase()}`;
+  // Explicit YAML true/false is authoritative; fall back to endpoint presence only when unset.
+  const enabledByConfig = enabledFromYaml !== null ? enabledFromYaml : endpoint !== null;
+  return {
+    enabled: !disabledByEnv && enabledByConfig,
+    endpoint,
+    serviceName,
+  };
+}
+
+// Converts a plain key/value map into a key/value attribute list, dropping null and undefined entries.
+function otelAttributes(values: Record): readonly Record[] {
+  return Object.entries(values).flatMap(([key, value]) => (
+    value === undefined || value === null ? [] : [{ key, value: otelValue(value) }]
+  ));
+}
+
+/**
+ * Encodes one attribute value as a typed value object.
+ *
+ * Booleans become `boolValue`; finite safe integers become a decimal-string `intValue`; other
+ * finite numbers become `doubleValue`; strings pass through as `stringValue`. Everything else is
+ * stringified defensively so a bigint or cyclic object cannot throw out of a best-effort emitter.
+ *
+ * @param value Attribute value of any type.
+ * @returns Single-key record holding the encoded value.
+ */
+function otelValue(value: unknown): Record {
+  if (typeof value === "boolean") return { boolValue: value };
+  if (typeof value === "number" && Number.isFinite(value)) {
+    // Only safe integers stringify to a plain decimal; e.g. String(1e21) is "1e+21".
+    return Number.isSafeInteger(value) ? { intValue: String(value) } : { doubleValue: value };
+  }
+  if (typeof value === "string") return { stringValue: value };
+  if (typeof value === "bigint") return { stringValue: value.toString() };
+  try {
+    // JSON.stringify yields undefined for functions/symbols; fall back to String() in that case.
+    return { stringValue: JSON.stringify(value) ?? String(value) };
+  } catch {
+    // Cyclic structures and similar cannot be serialised; keep the span instead of throwing.
+    return { stringValue: String(value) };
+  }
+}
+
+// Returns `bytes` random bytes rendered as lowercase hex (two characters per byte).
+function randomHex(bytes: number): string {
+  const buffer = randomBytes(bytes);
+  return buffer.toString("hex");
+}
+
+// Reads a dotted path and returns the value only when it is a non-empty string; otherwise null.
+function stringAtNullable(value: unknown, path: string): string | null {
+  const candidate = valueAtPath(value, path);
+  if (typeof candidate !== "string") return null;
+  return candidate.length === 0 ? null : candidate;
+}
+
+// Reads a dotted path and returns the value only when it is a real boolean; otherwise null.
+function booleanAtNullable(value: unknown, path: string): boolean | null {
+  const candidate = valueAtPath(value, path);
+  if (typeof candidate === "boolean") return candidate;
+  return null;
+}
+
+// Passes through non-empty strings; every other input (including "") maps to null.
+function nonEmptyString(value: unknown): string | null {
+  if (typeof value !== "string" || value.length === 0) return null;
+  return value;
+}
+
+// Walks a dot-separated path through nested plain objects; yields undefined as soon as a step
+// lands on a non-object, null or an array.
+function valueAtPath(value: unknown, path: string): unknown {
+  return path.split(".").reduce<unknown>((node, segment) => {
+    const traversable = typeof node === "object" && node !== null && !Array.isArray(node);
+    // Once a step is not traversable the result stays undefined for all remaining segments.
+    return traversable ? (node as { [key: string]: unknown })[segment] : undefined;
+  }, value);
+}
diff --git a/scripts/src/hwlab-node-web-sentinel-p5-observe.ts b/scripts/src/hwlab-node-web-sentinel-p5-observe.ts
index 3afd6da2..a82f3218 100644
--- a/scripts/src/hwlab-node-web-sentinel-p5-observe.ts
+++ b/scripts/src/hwlab-node-web-sentinel-p5-observe.ts
@@ -1,4 +1,5 @@
// SPEC: PJ2026-01060508 Web哨兵 draft-2026-06-27-p11-monitor-web-observability-dashboard.
+// SPEC: PJ2026-01060508 Web哨兵 draft-2026-07-01-p15-cadence-otel.
// Responsibility: Quick-verify observe orchestration and artifact interpretation for web-probe sentinel P5 validation.
import { createHash, randomUUID } from "node:crypto";
import { existsSync, readFileSync } from "node:fs";
@@ -32,6 +33,7 @@ import {
text,
withWarnings,
} from "./hwlab-node-web-sentinel-cicd";
+import { emitWebProbeSentinelSpan } from "./hwlab-node-web-sentinel-otel";
function printQuickVerifyProgress(state: SentinelCicdState, runId: string | null, phase: string, status: string, extra: Record = {}): void {
const compactExtra = Object.fromEntries(Object.entries(extra).map(([key, value]) => {
@@ -60,12 +62,30 @@ export function runSentinelQuickVerify(state: SentinelCicdState, reason: string,
const maxSeconds = numberAt(state.cicd, "targetValidation.maxSeconds");
const scenario = findScenario(state, scenarioId);
if (scenario === null) return { ok: false, status: "blocked", reason: "scenario-not-found", scenarioId, valuesRedacted: true };
+ const runId = `sentinel-run-${Date.now().toString(36)}-${randomUUID().slice(0, 8)}`;
+ emitWebProbeSentinelSpan(sentinelOtelContext(state), "web_probe_sentinel.quick_verify.job_start", {
+ scenarioId,
+ runId,
+ cadence: stringAtNullable(scenario, "cadence"),
+ status: "running",
+ valuesRedacted: true,
+ });
const commandSequence = arrayAt(scenario, "commandSequence").map(record);
const needsPromptSet = commandSequence.some((item) => stringAt(item, "type") === "sendPrompt" && inlinePromptText(item) === null);
const prompts = needsPromptSet
? readPromptSetForScenario(state, scenario)
: { ok: true as const, prompts: [], summary: { source: "not-required", promptCount: 0, valuesRedacted: true } };
- if (!prompts.ok) return { ok: false, status: "blocked", reason: "prompt-source-unavailable", promptSource: prompts, valuesRedacted: true };
+ if (!prompts.ok) {
+ emitWebProbeSentinelSpan(sentinelOtelContext(state), "web_probe_sentinel.quick_verify.job_finish", {
+ scenarioId,
+ runId,
+ status: "blocked",
+ exitCode: 1,
+ failureKind: "prompt-source-unavailable",
+ valuesRedacted: true,
+ }, false);
+ return { ok: false, status: "blocked", reason: "prompt-source-unavailable", promptSource: prompts, valuesRedacted: true };
+ }
const accountEnv = quickVerifyAccountEnv(state);
if (!accountEnv.ok) {
const findings = [{
@@ -78,7 +98,7 @@ export function runSentinelQuickVerify(state: SentinelCicdState, reason: string,
}];
return recordQuickVerify(state, {
ok: false,
- runId: `sentinel-run-${Date.now().toString(36)}-${randomUUID().slice(0, 8)}`,
+ runId,
scenarioId,
reason,
status: "blocked",
@@ -104,7 +124,6 @@ export function runSentinelQuickVerify(state: SentinelCicdState, reason: string,
const hardBudgetSeconds = Math.min(timeoutSeconds, Math.max(maxSeconds, numberAt(scenario, "maxRunSeconds")));
const elapsedWarnings = () => targetValidationElapsedWarnings(elapsedMs(), "quick verify confirm-wait", warningBudgetSeconds);
const deadline = Date.now() + hardBudgetSeconds * 1000;
- const runId = `sentinel-run-${Date.now().toString(36)}-${randomUUID().slice(0, 8)}`;
printQuickVerifyProgress(state, runId, "start", "running", { scenarioId, reason, warningBudgetSeconds, hardBudgetSeconds, timeoutSeconds });
const steps: Record[] = [];
const startArgs = [
@@ -659,9 +678,29 @@ function recordQuickVerify(state: SentinelCicdState, payload: Record; readonly cicd: Record } {
+ return {
+ node: state.spec.nodeId,
+ lane: state.spec.lane,
+ sentinelId: state.sentinelId,
+ namespace: stringAtNullable(state.runtime, "namespace"),
+ runtime: state.runtime,
+ cicd: state.cicd,
+ };
+}
+
function compactQuickVerifyRecordViews(views: Record): Record {
const compacted: Record = {};
for (const [key, value] of Object.entries(views)) {
diff --git a/scripts/src/hwlab-node-web-sentinel-service.ts b/scripts/src/hwlab-node-web-sentinel-service.ts
index f64d45b1..82d7889c 100644
--- a/scripts/src/hwlab-node-web-sentinel-service.ts
+++ b/scripts/src/hwlab-node-web-sentinel-service.ts
@@ -4,6 +4,7 @@
// SPEC: PJ2026-01060508 Web哨兵 draft-2026-06-26-p9-multi-web-probe-sentinel.
// SPEC: PJ2026-01060508 Web哨兵 draft-2026-06-27-p11-monitor-web-observability-dashboard.
// SPEC: PJ2026-01060508 Web哨兵 draft-2026-06-28-p13-1206-multi-runner-boundaries.
+// SPEC: PJ2026-01060508 Web哨兵 draft-2026-07-01-p15-cadence-otel.
// Responsibility: Persistent HTTP wrapper service for web-probe observe scheduling, index, health, metrics, maintenance, and dashboard.
import { Buffer } from "node:buffer";
import { createHash, randomUUID } from "node:crypto";
@@ -14,6 +15,7 @@ import { renderWebProbeSentinelDashboardHtml, webProbeSentinelDashboardAssetResp
import { webProbeSentinelConfigPlan, type WebProbeSentinelConfigPlan } from "./hwlab-node-web-sentinel-config";
import type { HwlabRuntimeLaneSpec } from "./hwlab-node-lanes";
import { effectiveWebProbeSentinelPublicExposure, resolveWebProbeSentinel, readConfigRefTarget as readSentinelConfigRefTarget } from "./hwlab-node-web-sentinel-resolver";
+import { emitWebProbeSentinelSpan, webProbeSentinelOtelSummary } from "./hwlab-node-web-sentinel-otel";
const DASHBOARD_CONTRACT_VERSION = "draft-2026-06-27-p11-monitor-web-observability-dashboard";
const DASHBOARD_MAX_TEXT_BYTES = 16_000;
@@ -130,6 +132,7 @@ export function createWebProbeSentinelService(options: WebProbeSentinelServiceOp
let schedulerLastError: string | null = null;
writeMetadata(db, "service.boot", { at: schedulerHeartbeatAt, restoredInterruptedRuns: restored, valuesRedacted: true });
writeMetadata(db, "scheduler.heartbeat", { at: schedulerHeartbeatAt, loop: "boot" });
+ emitSchedulerHeartbeatSpan(config, "boot", schedulerHeartbeatAt, true);
const service: WebProbeSentinelService = {
config,
@@ -139,15 +142,21 @@ export function createWebProbeSentinelService(options: WebProbeSentinelServiceOp
if (!schedulerEnabled || schedulerTimer !== null) return;
schedulerHeartbeatAt = nowIso();
writeMetadata(db, "scheduler.heartbeat", { at: schedulerHeartbeatAt, loop: "started" });
+ emitSchedulerHeartbeatSpan(config, "started", schedulerHeartbeatAt, true);
schedulerTimer = setInterval(() => {
try {
schedulerHeartbeatAt = nowIso();
writeMetadata(db, "scheduler.heartbeat", { at: schedulerHeartbeatAt, loop: "tick" });
- writeMetadata(db, "scheduler.summary", schedulerSummary(config, db));
+ const summary = schedulerSummary(config, db);
+ writeMetadata(db, "scheduler.summary", summary);
+ emitSchedulerHeartbeatSpan(config, "tick", schedulerHeartbeatAt, true);
+ emitCadenceExpectedSpan(config, summary);
+ if (summary.rootCause === "planned-run-not-consumed-by-host-cadence") emitSchedulerGapSpan(config, summary);
schedulerLastError = null;
} catch (error) {
schedulerLastError = error instanceof Error ? error.message : String(error);
writeMetadata(db, "scheduler.error", { at: nowIso(), message: schedulerLastError });
+ emitSchedulerHeartbeatSpan(config, "tick-error", nowIso(), false, schedulerLastError);
}
}, config.schedulerIntervalMs);
},
@@ -249,7 +258,9 @@ export function createWebProbeSentinelService(options: WebProbeSentinelServiceOp
return { ok: true, runId, scenarioId, status: "planned", commandPlanSha256: sha256Json(commandPlan), valuesRedacted: true };
},
recordRun(input: Record) {
- return recordRunResult(config, db, input);
+ const result = recordRunResult(config, db, input);
+ emitRecordRunSpan(config, input, result);
+ return result;
},
report(view: string, runId: string | null) {
return reportRunView(config, db, view, runId);
@@ -636,6 +647,66 @@ function schedulerSummary(config: WebProbeSentinelServiceConfig, db: Database):
};
}
+/**
+ * Emits a `web_probe_sentinel.scheduler.heartbeat` span for one scheduler loop event.
+ *
+ * @param config service configuration; supplies the OTel context, the runtime namespace and the scenarios.
+ * @param loop loop phase label passed by the caller (e.g. "boot", "started", "tick", "tick-error"); recorded as the `loop` attribute.
+ * @param at heartbeat timestamp, recorded as `heartbeatAt`.
+ * @param ok whether the loop event succeeded; selects status "ok" or "error" and is forwarded as the emitter's last argument.
+ * @param failureKind failure detail for unsuccessful events, `null` otherwise.
+ */
+function emitSchedulerHeartbeatSpan(config: WebProbeSentinelServiceConfig, loop: string, at: string, ok: boolean, failureKind: string | null = null): void {
+  emitWebProbeSentinelSpan(sentinelOtelContext(config), "web_probe_sentinel.scheduler.heartbeat", {
+    status: ok ? "ok" : "error",
+    failureKind,
+    // Without the phase label, boot/started/tick heartbeats cannot be told apart in traces.
+    loop,
+    namespace: stringOrNull(config.runtime.namespace),
+    heartbeatAt: at,
+    cadence: firstEnabledScenarioCadence(config),
+    valuesRedacted: true,
+  }, ok);
+}
+
+/**
+ * Emits a `web_probe_sentinel.cadence.expected` span describing the expected cadence
+ * alongside the active/planned run counts taken from the scheduler summary.
+ * The span is reported as "ok" when the summary carries no root cause, "stale" otherwise.
+ */
+function emitCadenceExpectedSpan(config: WebProbeSentinelServiceConfig, summary: Record): void {
+  const context = sentinelOtelContext(config);
+  // Loose equality on purpose: both null and undefined count as "no root cause".
+  const onSchedule = summary.rootCause == null;
+  const attributes = {
+    cadence: firstEnabledScenarioCadence(config),
+    scenarioId: firstEnabledScenarioId(config),
+    status: onSchedule ? "ok" : "stale",
+    activeRunCount: summary.activeRuns ?? null,
+    plannedRunCount: summary.plannedRuns ?? null,
+    valuesRedacted: true,
+  };
+  emitWebProbeSentinelSpan(context, "web_probe_sentinel.cadence.expected", attributes, onSchedule);
+}
+
+/**
+ * Emits a failed `web_probe_sentinel.scheduler_gap.detected` span for a stale planned run.
+ * The scenario id comes from the summary's oldest planned run when present, otherwise
+ * from the first enabled scenario; the summary's root cause is reported as `failureKind`.
+ */
+function emitSchedulerGapSpan(config: WebProbeSentinelServiceConfig, summary: Record): void {
+  const context = sentinelOtelContext(config);
+  const attributes = {
+    cadence: firstEnabledScenarioCadence(config),
+    scenarioId: summary.oldestPlannedRunScenarioId ?? firstEnabledScenarioId(config),
+    runId: summary.oldestPlannedRunId ?? null,
+    status: "planned-run-stale",
+    failureKind: summary.rootCause,
+    valuesRedacted: true,
+  };
+  // A detected gap is always reported as an unsuccessful span.
+  emitWebProbeSentinelSpan(context, "web_probe_sentinel.scheduler_gap.detected", attributes, false);
+}
+
+/**
+ * Emits a `web_probe_sentinel.record_run` span for one record-run call.
+ * Identifiers and status prefer the values in `result` and fall back to `input`;
+ * the span succeeds only when `result.ok` is exactly `true`.
+ */
+function emitRecordRunSpan(config: WebProbeSentinelServiceConfig, input: Record, result: Record): void {
+  const context = sentinelOtelContext(config);
+  const succeeded = result.ok === true;
+  const attributes = {
+    scenarioId: result.scenarioId ?? input.scenarioId ?? null,
+    runId: result.runId ?? input.runId ?? null,
+    observerId: input.observerId ?? null,
+    status: result.status ?? input.status ?? null,
+    // Failures without an explicit error fall back to a generic kind.
+    failureKind: succeeded ? null : (result.error ?? "record-run-failed"),
+    valuesRedacted: true,
+  };
+  emitWebProbeSentinelSpan(context, "web_probe_sentinel.record_run", attributes, succeeded);
+}
+
+/**
+ * Projects the service configuration onto the context object handed to the OTel helpers:
+ * node, lane, sentinel id, the runtime namespace (or `null`), plus the raw
+ * runtime and cicd sections.
+ */
+function sentinelOtelContext(config: WebProbeSentinelServiceConfig): { readonly node: string; readonly lane: string; readonly sentinelId: string; readonly namespace: string | null; readonly runtime: Record; readonly cicd: Record } {
+  const { node, lane, sentinelId, runtime, cicd } = config;
+  const namespace = stringOrNull(runtime.namespace);
+  return { node, lane, sentinelId, namespace, runtime, cicd };
+}
+
+/**
+ * Returns the `cadence` of the first scenario flagged as enabled, passed through
+ * `stringOrNull`; `null` when no scenario is enabled.
+ */
+function firstEnabledScenarioCadence(config: WebProbeSentinelServiceConfig): string | null {
+  const enabled = config.scenarios.find((candidate) => boolAt(candidate, "enabled"));
+  if (enabled === undefined) return null;
+  return stringOrNull(enabled.cadence);
+}
+
function renderMetrics(config: WebProbeSentinelServiceConfig, db: Database, health: Record, maintenance: MaintenanceState): string {
const counts = runCounts(config, db);
const heartbeat = record(readMetadata(db, "scheduler.heartbeat"));
@@ -740,6 +811,12 @@ function dashboardOverview(config: WebProbeSentinelServiceConfig, db: Database,
const severityCounts = globalSeverityCounts(config, db);
const latestUpdatedAt = latestRow === null ? null : stringOrNull(latestRow.updated_at);
const latestRunAgeSeconds = latestUpdatedAt === null ? null : ageSeconds(latestUpdatedAt);
+ const heartbeatAgeSeconds = numberOr(record(record(health.checks).scheduler).heartbeatAgeSeconds, -1);
+ const expectedCadence = firstEnabledScenarioCadence(config);
+ const expectedCadenceSeconds = durationStringSeconds(expectedCadence);
+ const staleMultiple = expectedCadenceSeconds === null || latestRunAgeSeconds === null ? null : latestRunAgeSeconds / expectedCadenceSeconds;
+ const freshnessWarningMultiple = numberAt(config.runtime, "scheduler.freshnessWarningMultiple");
+ const scheduler = schedulerSummary(config, db);
return {
ok: health.ok === true,
contractVersion: DASHBOARD_CONTRACT_VERSION,
@@ -750,7 +827,7 @@ function dashboardOverview(config: WebProbeSentinelServiceConfig, db: Database,
publicOrigin: stringOrNull(config.publicExposure.publicBaseUrl),
configReady: config.plan.ok,
health,
- scheduler: schedulerSummary(config, db),
+ scheduler,
maintenance,
latestRun,
runCounts: runCounts(config, db),
@@ -758,8 +835,30 @@ function dashboardOverview(config: WebProbeSentinelServiceConfig, db: Database,
freshness: {
latestRunUpdatedAt: latestUpdatedAt,
latestRunAgeSeconds,
- schedulerHeartbeatAgeSeconds: numberOr(record(record(health.checks).scheduler).heartbeatAgeSeconds, -1),
+ schedulerHeartbeatAgeSeconds: heartbeatAgeSeconds,
+ latestAnalyzedReportAgeSeconds: latestRow === null || stringOrNull(latestRow.report_json_sha256) === null ? null : latestRunAgeSeconds,
},
+ cadence: {
+ expectedCadence,
+ expectedCadenceSeconds,
+ schedulerHeartbeatAgeSeconds: heartbeatAgeSeconds,
+ latestRunAgeSeconds,
+ latestAnalyzedReportAgeSeconds: latestRow === null || stringOrNull(latestRow.report_json_sha256) === null ? null : latestRunAgeSeconds,
+ activeRuns: scheduler.activeRuns ?? null,
+ plannedRuns: scheduler.plannedRuns ?? null,
+ nextRun: null,
+ staleMultiple,
+ freshnessWarningMultiple,
+ status: scheduler.rootCause === "planned-run-not-consumed-by-host-cadence" ? "blocker" : staleMultiple !== null && staleMultiple > freshnessWarningMultiple ? "warning" : "fresh",
+ cronJob: {
+ observed: false,
+ status: "control-plane-status-required",
+ reason: "runner API does not query Kubernetes CronJob objects; use web-probe sentinel control-plane status for CronJob counts, lastScheduleTime and latest Jobs.",
+ valuesRedacted: true,
+ },
+ valuesRedacted: true,
+ },
+ observability: webProbeSentinelOtelSummary(sentinelOtelContext(config)),
targetValidation: {
scenarioId: stringOrNull(record(config.cicd.targetValidation).scenarioId),
maxSeconds: numberOr(record(config.cicd.targetValidation).maxSeconds, 120),