From c7ea34f2532c49d55e414dc4f8244374fcd115d2 Mon Sep 17 00:00:00 2001 From: Codex Date: Wed, 1 Jul 2026 06:43:30 +0000 Subject: [PATCH] feat(sentinel): restore JD01 cadence cronjob visibility --- config/hwlab-web-probe-sentinel/profiles.yaml | 19 ++- .../runtime.auth-session-switch.d601-v03.yaml | 1 + .../runtime.d518-v03.yaml | 1 + .../runtime.d601-v03.yaml | 1 + .../runtime.fake-echo.d518-v03.yaml | 1 + .../runtime.mdtodo.d601-v03.yaml | 1 + .../PJ2026-01060508-web-probe-sentinel.md | 24 +++ .../monitor-web.js | 29 +++- scripts/src/hwlab-node-web-sentinel-cicd.ts | 159 ++++++++++++++++-- scripts/src/hwlab-node-web-sentinel-otel.ts | 143 ++++++++++++++++ .../src/hwlab-node-web-sentinel-p5-observe.ts | 45 ++++- .../src/hwlab-node-web-sentinel-service.ts | 107 +++++++++++- 12 files changed, 503 insertions(+), 28 deletions(-) create mode 100644 scripts/src/hwlab-node-web-sentinel-otel.ts diff --git a/config/hwlab-web-probe-sentinel/profiles.yaml b/config/hwlab-web-probe-sentinel/profiles.yaml index 460be77c..ca72b576 100644 --- a/config/hwlab-web-probe-sentinel/profiles.yaml +++ b/config/hwlab-web-probe-sentinel/profiles.yaml @@ -26,6 +26,14 @@ baselines: intervalMs: 600000 heartbeatStaleSeconds: 900 maxConcurrentRuns: 1 + freshnessWarningMultiple: 2 + observability: + otel: + enabled: true + serviceName: hwlab-web-probe-sentinel-${nodeLower} + tracesEndpoint: http://otel-collector.platform-infra.svc.cluster.local:4318/v1/traces + sampler: parentbased_traceidratio + samplerArg: "1" scheduler15m: &scheduler-15m intervalMs: 900000 heartbeatStaleSeconds: 900 @@ -229,8 +237,15 @@ nodes: maxSeconds: 360 serviceUnavailablePolicy: structured-failure cadenceScheduler: - enabled: false - reason: cicd-health-endpoint-only + enabled: true + reason: k8s-native-periodic-quick-verify + concurrencyPolicy: Forbid + startingDeadlineSeconds: 600 + successfulJobsHistoryLimit: 3 + failedJobsHistoryLimit: 5 + activeDeadlineSlackSeconds: 60 + ttlSecondsAfterFinished: 86400 + backoffLimit: 0 secrets: sources: - <<: *jd01-bootstrap-source diff --git a/config/hwlab-web-probe-sentinel/runtime.auth-session-switch.d601-v03.yaml b/config/hwlab-web-probe-sentinel/runtime.auth-session-switch.d601-v03.yaml index 7a040d56..000ee70f 100644 --- a/config/hwlab-web-probe-sentinel/runtime.auth-session-switch.d601-v03.yaml +++ b/config/hwlab-web-probe-sentinel/runtime.auth-session-switch.d601-v03.yaml @@ -28,6 +28,7 @@ sentinel: intervalMs: 600000 heartbeatStaleSeconds: 900 maxConcurrentRuns: 1 + freshnessWarningMultiple: 2 sqlite: path: /var/lib/web-probe-sentinel-auth-switch/index.sqlite busyTimeoutMs: 2000 diff --git a/config/hwlab-web-probe-sentinel/runtime.d518-v03.yaml b/config/hwlab-web-probe-sentinel/runtime.d518-v03.yaml index 96017718..d31a7986 100644 --- a/config/hwlab-web-probe-sentinel/runtime.d518-v03.yaml +++ b/config/hwlab-web-probe-sentinel/runtime.d518-v03.yaml @@ -28,6 +28,7 @@ sentinel: intervalMs: 600000 heartbeatStaleSeconds: 900 maxConcurrentRuns: 1 + freshnessWarningMultiple: 2 sqlite: path: /var/lib/web-probe-sentinel-dsflash/index.sqlite busyTimeoutMs: 2000 diff --git a/config/hwlab-web-probe-sentinel/runtime.d601-v03.yaml b/config/hwlab-web-probe-sentinel/runtime.d601-v03.yaml index 9857686e..d2d216b3 100644 --- a/config/hwlab-web-probe-sentinel/runtime.d601-v03.yaml +++ b/config/hwlab-web-probe-sentinel/runtime.d601-v03.yaml @@ -28,6 +28,7 @@ sentinel: intervalMs: 600000 heartbeatStaleSeconds: 900 maxConcurrentRuns: 1 + freshnessWarningMultiple: 2 sqlite: path: /var/lib/web-probe-sentinel/index.sqlite busyTimeoutMs: 2000 diff --git a/config/hwlab-web-probe-sentinel/runtime.fake-echo.d518-v03.yaml b/config/hwlab-web-probe-sentinel/runtime.fake-echo.d518-v03.yaml index 31efffb3..ad5538dc 100644 --- a/config/hwlab-web-probe-sentinel/runtime.fake-echo.d518-v03.yaml +++ b/config/hwlab-web-probe-sentinel/runtime.fake-echo.d518-v03.yaml @@ -29,6 +29,7 @@ sentinel: intervalMs: 600000 heartbeatStaleSeconds: 900 maxConcurrentRuns: 1 + freshnessWarningMultiple: 2 sqlite: path: /var/lib/web-probe-sentinel-fake-echo/index.sqlite busyTimeoutMs: 2000 diff --git a/config/hwlab-web-probe-sentinel/runtime.mdtodo.d601-v03.yaml b/config/hwlab-web-probe-sentinel/runtime.mdtodo.d601-v03.yaml index 1de9c5d9..e5d3fee0 100644 --- a/config/hwlab-web-probe-sentinel/runtime.mdtodo.d601-v03.yaml +++ b/config/hwlab-web-probe-sentinel/runtime.mdtodo.d601-v03.yaml @@ -28,6 +28,7 @@ sentinel: intervalMs: 900000 heartbeatStaleSeconds: 900 maxConcurrentRuns: 1 + freshnessWarningMultiple: 2 sqlite: path: /var/lib/web-probe-sentinel-mdtodo/index.sqlite busyTimeoutMs: 2000 diff --git a/project-management/PJ2026-01/specs/PJ2026-01060508-web-probe-sentinel.md b/project-management/PJ2026-01/specs/PJ2026-01060508-web-probe-sentinel.md index d87292f5..6da678bc 100644 --- a/project-management/PJ2026-01/specs/PJ2026-01060508-web-probe-sentinel.md +++ b/project-management/PJ2026-01/specs/PJ2026-01060508-web-probe-sentinel.md @@ -24,6 +24,7 @@ | 多实例实现引用版本 | draft-2026-06-26-p9-multi-web-probe-sentinel | | Monitor Web 聚合实现引用版本 | draft-2026-06-26-p10-monitor-web-aggregation | | Monitor Web 观察面板治理实现引用版本 | draft-2026-06-27-p11-monitor-web-observability-dashboard; draft-2026-06-27-p12-cadence-scheduler-monitor-web | +| Cadence/OTel 稳定性实现引用版本 | draft-2026-07-01-p15-cadence-otel | | 需求规格模板 | [ISO/IEC/IEEE 29148 需求规格模板](../../templates/iso-iec-ieee-29148-requirements-spec-template.md) | | 上级规格 | [PJ2026-010605 运维监控](PJ2026-010605-observability-monitoring.md) | | 关联规格 | [PJ2026-010401 Web工作台](PJ2026-010401-web-workbench.md)、[PJ2026-0104010803 Workbench唯一投影](PJ2026-0104010803-workbench-unique-projection.md)、[PJ2026-010403 API契约](PJ2026-010403-api-contract.md)、[PJ2026-010601 发布流水](PJ2026-010601-controlled-release.md)、[PJ2026-010602 源码同步](PJ2026-010602-source-sync.md)、[PJ2026-010603 YAML运维](PJ2026-010603-yaml-first-ops.md)、[PJ2026-010604 公开入口](PJ2026-010604-public-entry.md)、[PJ2026-01060505 Workbench性能](PJ2026-01060505-workbench-performance.md) | @@ -126,6 +127,7 @@ Web哨兵必须遵循 UniDesk YAML-first ops。目标 node/lane、public origin | PJ2026-0106050811 | Monitor Web 聚合 | 本规格 6.11 | runner/web 职责拆分、单 monitor-web 聚合、Kubernetes discovery、Vue+TS 前端和 public exposure 收敛 | 多实例与账号切换、Dashboard工作台、发布集成 | `monitor.pikapython.com` 统一值守入口 | | PJ2026-0106050812 | Monitor Web 观察面板治理 | 本规格 6.12 | 趋势曲线、运行时间线、固定视口三栏、cadence freshness、Vue CI/CD/env reuse/git mirror | Monitor Web 聚合、Dashboard工作台、发布集成、源码同步 | 可滚动上线和值守的统一观察面板 | | PJ2026-0106050814 | 哨兵 CI/CD 可见性 | 本规格 6.14 | publish 阶段耗时、env reuse、docker cache、超时诊断、git mirror/Argo/runtime 收敛下一步 | 发布集成、源码同步、Monitor Web 观察面板治理 | 小改动滚动上线可诊断、可续跑、可验收 | +| PJ2026-0106050815 | Cadence/OTel 稳定性 | 本规格 6.15 | Kubernetes CronJob 周期巡检、状态缺口故障码、monitor-web cadence 可见性和 sentinel OTel span 合同 | Monitor Web 观察面板治理、发布集成、OTel、YAML运维 | JD01/v03 周期巡检恢复和后续防回归 | ### 5.1 目标架构图 @@ -567,6 +569,8 @@ P7 dashboard 增强范围内新增或修改的 dashboard API、frontend assets P10/P11 monitor-web 范围内新增或修改的 Vue/TypeScript/Vite 前端、typed API client、聚合 API、runner discovery、dashboard verify/screenshot、CI/CD renderer、GitOps/publicExposure helper 和 env reuse 规划代码必须标注 `SPEC: PJ2026-01060508 Web哨兵 draft-2026-06-27-p11-monitor-web-observability-dashboard`。旧 `scripts/assets/web-probe-sentinel-dashboard/dashboard.js` 只能标注迁移前短修或兼容验证用途,不得作为 P11 新观察面板能力的主要承载面。 +P15 cadence/OTel 范围内新增或修改的 CronJob renderer/status probe、runner health/overview、quick verify record path、monitor-web cadence 展示和 OTel emitter 源码文件头部必须标注 `SPEC: PJ2026-01060508 Web哨兵 draft-2026-07-01-p15-cadence-otel`。 + 实现文件不得只写 issue 编号、`latest`、`current` 或“按最新方案”作为规格引用。自动生成文件、第三方 vendored 文件、纯 YAML/config、锁文件和无法承载注释头的二进制产物不要求加源码头部,但对应生成器、渲染器、owning YAML 或 CLI 入口必须能追溯到本 SPEC。 后续 P1-P6 阶段如果改变稳定需求、观察对象、数据流、接口、部署边界或验收口径,应先更新本规格和上级 [PJ2026-010605 运维监控](PJ2026-010605-observability-monitoring.md),再更新执行 issue。 @@ -689,6 +693,24 @@ publish Job 未在等待预算内结束时,CLI 必须输出 job 名称、pod JD01/v03 `jd01-web-probe-sentinel` 的小改动滚动上线是本阶段验收入口。closeout 必须记录 SPEC P14 引用、source commit、publish job、digest、GitOps revision、git mirror pending/inSync、Argo/runtime alignment、`validate`、远程 dashboard screenshot 和 latest report 证据。若总等待仍超过 120s,closeout 必须记录阶段归因、env reuse/cache 摘要和下一步优化方向;不得通过单纯放宽 120s 预算收口。 +### 6.15 OPS-SENTINEL-REQ-015 Cadence 调度稳定性与 OTel 覆盖 + +| 编号 | 短名 | 主责模块 | 关联模块 | +| --- | --- | --- | --- | +| OPS-SENTINEL-REQ-015 | Cadence/OTel 稳定性 | PJ2026-0106050815 Cadence/OTel 稳定性 | Monitor Web 观察面板治理、发布集成、OTel、YAML运维 | + +本阶段执行 issue 为 [#1372](https://github.com/pikasTech/unidesk/issues/1372),阶段子 issue 为 P0 [#1374](https://github.com/pikasTech/unidesk/issues/1374)、P1 [#1377](https://github.com/pikasTech/unidesk/issues/1377)、P2 [#1375](https://github.com/pikasTech/unidesk/issues/1375)、P3 [#1378](https://github.com/pikasTech/unidesk/issues/1378) 和 P4 [#1376](https://github.com/pikasTech/unidesk/issues/1376)。 + +Web 哨兵周期巡检必须由目标 node/lane 的 Kubernetes CronJob/GitOps 受控对象承载。CronJob 的 enabled、scenarioId、cadence 来源、startingDeadlineSeconds、successfulJobsHistoryLimit、failedJobsHistoryLimit、activeDeadlineSlackSeconds、ttlSecondsAfterFinished、backoffLimit、concurrencyPolicy、targetValidation.maxSeconds、sampleInterval、screenshotInterval、maxRunSeconds、retention 和 OTel endpoint/sampling 都必须来自 owning YAML/configRef;代码只能解析、校验和渲染,不得用隐藏默认补阈值、历史保留、deadline、timeout、并发策略或采样策略。 + +`web-probe sentinel control-plane status` 必须把 CronJob 作为独立 observed check,而不是只相信 Argo `Synced/Healthy`。当 YAML 启用 cadenceScheduler 但线上缺 CronJob 时,状态必须 blocked,故障码固定为 `sentinel-cadence-cronjob-missing`;schedule 不一致使用 `sentinel-cadence-cronjob-schedule-mismatch`;CronJob suspend 使用 `sentinel-cadence-cronjob-suspended`。状态输出至少展示 CronJob name、namespace、schedule/expectedSchedule、lastScheduleTime、lastSuccessfulTime、active job count、jobCount 和 latest job name。 + +`monitor-web` 必须把 cadence freshness 作为一等状态:显示 YAML expected cadence、scheduler heartbeat age、latest run age、latest analyzed report age、active runs、planned runs、stale multiple、CronJob 观察状态和 OTel coverage/gap。runner API 不直接查询 Kubernetes CronJob 时,页面必须显式显示 `control-plane-status-required`,不得让用户误以为 CronJob 已观察通过。 + +Web 哨兵必须向平台 OTel 后端发出有界、脱敏的 span 或在状态中显式标记 instrumentation gap。P15 span 名称固定为:`web_probe_sentinel.scheduler.heartbeat`、`web_probe_sentinel.cadence.expected`、`web_probe_sentinel.cadence.cronjob_rendered`、`web_probe_sentinel.cadence.cronjob_observed`、`web_probe_sentinel.quick_verify.job_start`、`web_probe_sentinel.quick_verify.job_finish`、`web_probe_sentinel.record_run`、`web_probe_sentinel.scheduler_gap.detected`。属性至少包含 node、lane、sentinelId、scenarioId、runId、cronJobName、jobName、podName、namespace、cadence、status、exitCode、failureKind、gitopsRevision、sourceCommit、imageDigest 和 valuesRedacted;不存在的属性可以省略但不得打印 Secret、prompt、cookie、provider payload 或完整 stdout/stderr。 + +CI/CD rollout 门禁仍只验证配置声明的 `/health` endpoint。web-probe quick verify、Playwright/browser render、dashboard screenshot 和 OTel trace search 都是独立的 post-deploy evidence;不得重新塞回 `trigger-current --confirm --wait` 的同步门禁,也不得引入 Docker daemon/socket 依赖。 + ## 7. 过程控制 Web哨兵架构执行 issue 为 [#883](https://github.com/pikasTech/unidesk/issues/883)。阶段跟踪 issue 为 P0 [#885](https://github.com/pikasTech/unidesk/issues/885)、P1 [#886](https://github.com/pikasTech/unidesk/issues/886)、P2 [#887](https://github.com/pikasTech/unidesk/issues/887)、P3 [#888](https://github.com/pikasTech/unidesk/issues/888)、P4 [#889](https://github.com/pikasTech/unidesk/issues/889)、P5 [#890](https://github.com/pikasTech/unidesk/issues/890) 和 P6 [#891](https://github.com/pikasTech/unidesk/issues/891)。 @@ -714,3 +736,5 @@ P12 cadence 调度和 monitor-web 交互修复执行 issue 为 [#1123](https://g P13 D518 多 runner 强边界与 OTel 根因收敛执行 issue 为 [#1206](https://github.com/pikasTech/unidesk/issues/1206)。P13 closeout 必须回写:SPEC P13 引用、[#1208](https://github.com/pikasTech/unidesk/issues/1208)-[#1216](https://github.com/pikasTech/unidesk/issues/1216) 阶段状态、D518 双 sentinel 独立 Deployment/Service/PVC/CronJob/GitOps/Argo/public route 证据、route/API sentinelId 强断言、report/index 不串线证据、dashboard verify/screenshot localPath/SHA、k3s CronJob 调度证据、latest selected run 与 historical trend 状态分层证据、以及 OTel AgentRun namespace/trace gap 是否已解除或拆入后续 issue。 P14 Web 哨兵 CI/CD 可见性执行 issue 为 [#1285](https://github.com/pikasTech/unidesk/issues/1285)。P14 closeout 必须回写:SPEC P14 引用、source commit、PR/merge commit、JD01/v03 `jd01-web-probe-sentinel` publish job、digest、GitOps revision、git mirror flush 状态、Argo/runtime observed alignment、`validate`、dashboard screenshot、latest report,以及超过 120s 时的结构化阶段归因和可续跑命令。 + +P15 Cadence 调度稳定性与 OTel 覆盖执行 issue 为 [#1372](https://github.com/pikasTech/unidesk/issues/1372)。P15 closeout 必须回写:SPEC P15 引用、[#1374](https://github.com/pikasTech/unidesk/issues/1374)-[#1378](https://github.com/pikasTech/unidesk/issues/1378) 阶段状态、JD01/v03 `jd01-web-probe-sentinel` CronJob manifest/GitOps/Argo/runtime observed 证据、`sentinel-cadence-cronjob-missing` 防回归状态、monitor-web cadence/OTel coverage 显示、OTel trace search 或 instrumentation-gap 证据、受控 rollout/publish job、GitOps revision、source commit、dashboard/health 验收,以及 CI/CD 门禁仍只验证 `/health` 的证据。 diff --git a/scripts/assets/web-probe-sentinel-monitor-web/monitor-web.js b/scripts/assets/web-probe-sentinel-monitor-web/monitor-web.js index 48fcb771..84f018de 100644 --- a/scripts/assets/web-probe-sentinel-monitor-web/monitor-web.js +++ b/scripts/assets/web-probe-sentinel-monitor-web/monitor-web.js @@ -1,4 +1,5 @@ // SPEC: PJ2026-01060508 Web哨兵 draft-2026-06-27-p12-cadence-scheduler-monitor-web. +// SPEC: PJ2026-01060508 Web哨兵 draft-2026-07-01-p15-cadence-otel. // Responsibility: Vue monitor-web runtime for sentinel trend, timeline, detail and finding observability. import { createApp, computed, nextTick, onMounted, ref, watch } from "./vendor/vue.esm-browser.prod.js"; @@ -156,18 +157,28 @@ createApp({ return `运行记录 · ${checkScopeRun.value?.id || "未选择"}`; }); const cadence = computed(() => { + const apiCadence = overview.value?.cadence || {}; const intervalMs = Number(overview.value?.scheduler?.intervalMs || 0); - const latestAge = Number(overview.value?.freshness?.latestRunAgeSeconds ?? -1); - const heartbeatAge = Number(overview.value?.freshness?.schedulerHeartbeatAgeSeconds ?? -1); - const intervalSeconds = intervalMs > 0 ? Math.round(intervalMs / 1000) : 0; - const stale = intervalSeconds > 0 && latestAge > intervalSeconds * 2; + const latestAge = Number(apiCadence.latestRunAgeSeconds ?? overview.value?.freshness?.latestRunAgeSeconds ?? -1); + const heartbeatAge = Number(apiCadence.schedulerHeartbeatAgeSeconds ?? overview.value?.freshness?.schedulerHeartbeatAgeSeconds ?? -1); + const intervalSeconds = Number(apiCadence.expectedCadenceSeconds || 0) || (intervalMs > 0 ? Math.round(intervalMs / 1000) : 0); + const status = String(apiCadence.status || ""); + const stale = status === "warning" || status === "blocker"; + const cronJob = apiCadence.cronJob || {}; + const observability = overview.value?.observability || {}; return { intervalSeconds, latestAge, heartbeatAge, + status, stale, + blocker: status === "blocker", + cronJob, + observability, label: intervalSeconds > 0 ? `${formatDuration(intervalSeconds)} 间隔` : "未配置", - alert: stale ? `最近运行 ${formatDuration(latestAge)} 前,超过预设间隔 2 倍;按 SPEC 作为非阻塞报警展示。` : "运行新鲜度在预设窗口内", + alert: stale + ? `最近运行 ${formatDuration(latestAge)} 前;状态 ${status || "warning"},阈值来自 YAML。` + : "运行新鲜度在 YAML 窗口内", }; }); const healthChecks = computed(() => { @@ -668,6 +679,14 @@ createApp({ 调度新鲜度 {{ cadence.latestAge >= 0 ? formatDuration(cadence.latestAge) : "-" }} +
+ CronJob + {{ cadence.cronJob?.status || "-" }} +
+
+ OTel + {{ cadence.observability?.coverage || "-" }} +
历史错误样本 {{ redCount({ severityCounts: severityTotals }) }} diff --git a/scripts/src/hwlab-node-web-sentinel-cicd.ts b/scripts/src/hwlab-node-web-sentinel-cicd.ts index 7ffb91b6..6c23af31 100644 --- a/scripts/src/hwlab-node-web-sentinel-cicd.ts +++ b/scripts/src/hwlab-node-web-sentinel-cicd.ts @@ -6,6 +6,7 @@ // SPEC: PJ2026-01060508 Web哨兵 draft-2026-06-27-p12-cadence-scheduler-monitor-web. // SPEC: PJ2026-01060508 Web哨兵 draft-2026-06-28-p13-1206-multi-runner-boundaries. // SPEC: PJ2026-01060508 Web哨兵 draft-2026-06-30-p14-sentinel-cicd-visibility. +// SPEC: PJ2026-01060508 Web哨兵 draft-2026-07-01-p15-cadence-otel. // Responsibility: YAML-first CI/CD, image, GitOps and Argo command plan for the web-probe sentinel. import { createHash, randomUUID } from "node:crypto"; import { existsSync, readFileSync } from "node:fs"; @@ -20,6 +21,7 @@ import type { HwlabRuntimeLaneSpec } from "./hwlab-node-lanes"; import type { RenderedCliResult } from "./output"; import { probeSentinelRuntimeHealthEndpoint, runSentinelDashboard, runSentinelMaintenance, runSentinelReport, runSentinelValidate } from "./hwlab-node-web-sentinel-p5"; import { runChildCli, sentinelP5Next } from "./hwlab-node-web-sentinel-p5-observe"; +import { emitWebProbeSentinelSpan, webProbeSentinelOtelSummary } from "./hwlab-node-web-sentinel-otel"; export type WebProbeSentinelConfigAction = "plan" | "status"; export type WebProbeSentinelImageAction = "status" | "build"; @@ -178,6 +180,7 @@ interface SentinelObservedStatus { readonly gitops: Record; readonly argo: Record; readonly runtime: Record; + readonly cadence: Record; readonly wait?: Record; } @@ -215,7 +218,7 @@ export interface ChildCliResult { readonly result: CompactCommandResult & { stdoutTail: string; stderrTail: string }; } -const SPEC_REF = "PJ2026-01060508 Web哨兵 draft-2026-06-30-p14-sentinel-cicd-visibility"; +const SPEC_REF = "PJ2026-01060508 Web哨兵 draft-2026-07-01-p15-cadence-otel"; export function runWebProbeSentinelCommand(spec: HwlabRuntimeLaneSpec, options: WebProbeSentinelOptions): RenderedCliResult { if (options.kind === "config") return withWebProbeSentinelConfigRendered(webProbeSentinelConfigPlan(spec, options.action, options.sentinelId)); @@ -317,6 +320,14 @@ function runSentinelControlPlane(state: SentinelCicdState, options: Extract): readonly Record[] { +function sentinelContainerEnv(sentinelId: string, runtime: Record, cicd: Record, secrets: Record): readonly Record[] { const env: Record[] = [{ name: "UNIDESK_WEB_PROBE_SENTINEL_ID", value: sentinelId }]; + const otelEnabled = booleanAtNullable(runtime, "observability.otel.enabled") ?? booleanAtNullable(cicd, "observability.otel.enabled") ?? false; + const otelEndpoint = stringAtNullable(runtime, "observability.otel.tracesEndpoint") + ?? stringAtNullable(runtime, "observability.otel.endpoint") + ?? stringAtNullable(cicd, "observability.otel.tracesEndpoint") + ?? stringAtNullable(cicd, "observability.otel.endpoint"); + const otelServiceName = stringAtNullable(runtime, "observability.otel.serviceName") ?? stringAtNullable(cicd, "observability.otel.serviceName"); + const otelSampler = stringAtNullable(runtime, "observability.otel.sampler") ?? stringAtNullable(cicd, "observability.otel.sampler"); + const otelSamplerArg = stringAtNullable(runtime, "observability.otel.samplerArg") ?? stringAtNullable(cicd, "observability.otel.samplerArg"); const sourcesByPurpose = new Map>(); for (const source of arrayAt(secrets, "sources").map(record)) { const purpose = stringAtNullable(source, "purpose"); @@ -1074,6 +1109,12 @@ function sentinelContainerEnv(sentinelId: string, secrets: Record[], ): Record | null { + const scheduler = record(valueAtPath(cicd, "targetValidation.cadenceScheduler")); const cadenceSchedulerEnabled = booleanAtNullable(cicd, "targetValidation.cadenceScheduler.enabled") === true; if (!cadenceSchedulerEnabled) return null; const scenarioId = stringAtNullable(cicd, "targetValidation.scenarioId"); @@ -1114,9 +1156,12 @@ function sentinelCadenceCronJobPlan( const namespace = stringAt(runtime, "namespace"); const deploymentName = stringAt(runtime, "deploymentName"); const serviceAccountName = stringAt(runtime, "serviceAccountName"); - const timeoutSeconds = numberAtNullable(cicd, "targetValidation.maxSeconds") ?? numberAtNullable(scenario, "maxRunSeconds") ?? 300; + const timeoutSeconds = numberAt(cicd, "targetValidation.maxSeconds"); + const activeDeadlineSlackSeconds = numberAt(scheduler, "activeDeadlineSlackSeconds"); const mainServerHost = stringAtNullable(cicd, "scheduler.mainServerHost"); - const name = safeKubernetesSegment(`${deploymentName}-quick-verify`, 52); + const name = sentinelCadenceCronJobName(deploymentName); + const concurrencyPolicy = stringAt(scheduler, "concurrencyPolicy"); + if (!["Allow", "Forbid", "Replace"].includes(concurrencyPolicy)) throw new Error("targetValidation.cadenceScheduler.concurrencyPolicy must be Allow, Forbid or Replace"); const labels = { "app.kubernetes.io/name": name, "app.kubernetes.io/part-of": "hwlab-web-probe-sentinel", @@ -1137,19 +1182,20 @@ function sentinelCadenceCronJobPlan( annotations: { "unidesk.ai/cadence": String(scenario.cadence), "unidesk.ai/target-validation-max-seconds": String(timeoutSeconds), + "unidesk.ai/source": "targetValidation.cadenceScheduler", }, }, spec: { schedule, - concurrencyPolicy: "Forbid", - successfulJobsHistoryLimit: 3, - failedJobsHistoryLimit: 5, - startingDeadlineSeconds: Math.max(60, cadenceSeconds), + concurrencyPolicy, + successfulJobsHistoryLimit: numberAt(scheduler, "successfulJobsHistoryLimit"), + failedJobsHistoryLimit: numberAt(scheduler, "failedJobsHistoryLimit"), + startingDeadlineSeconds: numberAt(scheduler, "startingDeadlineSeconds"), jobTemplate: { spec: { - activeDeadlineSeconds: timeoutSeconds + 60, - ttlSecondsAfterFinished: 86400, - backoffLimit: 0, + activeDeadlineSeconds: timeoutSeconds + activeDeadlineSlackSeconds, + ttlSecondsAfterFinished: numberAt(scheduler, "ttlSecondsAfterFinished"), + backoffLimit: numberAt(scheduler, "backoffLimit"), template: { metadata: { labels }, spec: { @@ -1190,6 +1236,10 @@ function sentinelCadenceCronJobPlan( }; } +function sentinelCadenceCronJobName(deploymentName: string): string { + return safeKubernetesSegment(`${deploymentName}-quick-verify`, 52); +} + function scenarioRows(value: unknown): Record[] { if (Array.isArray(value)) return value.map(record); if (!isRecord(value)) return []; @@ -1605,6 +1655,7 @@ function sentinelSkippedObservedStatus(reason: string): SentinelObservedStatus { gitops: skipped, argo: skipped, runtime: skipped, + cadence: skipped, wait: { polls: 0, elapsedMs: 0, @@ -1633,6 +1684,7 @@ function collectSentinelObservedStatus(state: SentinelCicdState, timeoutSeconds: gitops, argo: probeArgoApplication(state, timeoutSeconds, effectiveExpectation.gitopsRevision), runtime: probeRuntimeObjects(state, timeoutSeconds, effectiveExpectation.runtimeImage), + cadence: probeCadenceCronJob(state, timeoutSeconds), }; } @@ -1668,13 +1720,15 @@ function sentinelObservedReady(value: Record | SentinelObserved && gitMirrorReady && record(observed.gitops).ok === true && record(observed.argo).ok === true - && record(observed.runtime).ok === true; + && record(observed.runtime).ok === true + && record(observed.cadence).ok === true; } function sentinelObservedWarnings(value: Record | SentinelObservedStatus | null): string[] { const observed = record(value); const argo = record(observed.argo); - return mergeWarnings(argo.warning); + const cadence = record(observed.cadence); + return mergeWarnings(argo.warning, cadence.warning); } function probeSourceMirror(state: SentinelCicdState, timeoutSeconds: number): Record { @@ -1900,6 +1954,74 @@ function probeRuntimeObjects(state: SentinelCicdState, timeoutSeconds: number, e return { ok: result.exitCode === 0 && probe?.ok === true, probe, result: compactCommand(result) }; } +function probeCadenceCronJob(state: SentinelCicdState, timeoutSeconds: number): Record { + const expected = state.manifests.find((item) => item.kind === "CronJob") ?? null; + if (expected === null) { + return { ok: true, skipped: true, reason: "targetValidation.cadenceScheduler.disabled", valuesRedacted: true }; + } + const metadata = record(expected.metadata); + const spec = record(expected.spec); + const namespace = stringAt(metadata, "namespace"); + const name = stringAt(metadata, "name"); + const expectedSchedule = stringAt(spec, "schedule"); + const script = [ + "set +e", + `namespace=${shellQuote(namespace)}`, + `cronjob=${shellQuote(name)}`, + `sentinel=${shellQuote(state.sentinelId)}`, + `expected_schedule=${shellQuote(expectedSchedule)}`, + "tmp=$(mktemp -d)", + "kubectl -n \"$namespace\" get cronjob \"$cronjob\" -o json >\"$tmp/cronjob.json\" 2>/dev/null; echo $? >\"$tmp/cronjob.rc\"", + "kubectl -n \"$namespace\" get jobs -l \"unidesk.ai/web-probe-sentinel-id=$sentinel,app.kubernetes.io/component=cadence-scheduler\" -o json >\"$tmp/jobs.json\" 2>/dev/null; echo $? >\"$tmp/jobs.rc\"", + "node - \"$tmp\" \"$namespace\" \"$cronjob\" \"$expected_schedule\" <<'NODE'", + "const fs = require('node:fs');", + "const [dir, namespace, cronJobName, expectedSchedule] = process.argv.slice(2);", + "function rc(name){ try { return Number(fs.readFileSync(`${dir}/${name}.rc`, 'utf8').trim()); } catch { return 1; } }", + "function json(name){ try { return JSON.parse(fs.readFileSync(`${dir}/${name}.json`, 'utf8')); } catch { return null; } }", + "const cron = json('cronjob');", + "const jobs = Array.isArray(json('jobs')?.items) ? json('jobs').items : [];", + "const present = rc('cronjob') === 0 && !!cron;", + "const schedule = cron?.spec?.schedule || null;", + "const scheduleMatches = present && schedule === expectedSchedule;", + "const suspended = cron?.spec?.suspend === true;", + "const active = Array.isArray(cron?.status?.active) ? cron.status.active.length : 0;", + "const sortedJobs = jobs.slice().sort((a,b)=>String(b?.metadata?.creationTimestamp||'').localeCompare(String(a?.metadata?.creationTimestamp||''))).slice(0,8);", + "let code = null;", + "if (!present) code = 'sentinel-cadence-cronjob-missing';", + "else if (!scheduleMatches) code = 'sentinel-cadence-cronjob-schedule-mismatch';", + "else if (suspended) code = 'sentinel-cadence-cronjob-suspended';", + "const latestJob = sortedJobs[0] || null;", + "console.log(JSON.stringify({ ok: code === null, code, present, namespace, name: cronJobName, schedule, expectedSchedule, scheduleMatches, suspended, lastScheduleTime: cron?.status?.lastScheduleTime || null, lastSuccessfulTime: cron?.status?.lastSuccessfulTime || null, active, jobCount: jobs.length, latestJobs: sortedJobs.map((job)=>({ name: job?.metadata?.name || null, createdAt: job?.metadata?.creationTimestamp || null, active: Number(job?.status?.active || 0), succeeded: Number(job?.status?.succeeded || 0), failed: Number(job?.status?.failed || 0), completionTime: job?.status?.completionTime || null, valuesRedacted:true })), latestJobName: latestJob?.metadata?.name || null, valuesRedacted: true }));", + "NODE", + ].join("\n"); + const result = runCommand(["trans", stringAt(state.controlPlaneNode, "kubeRoute"), "sh", "--", script], repoRoot, { timeoutMs: Math.min(timeoutSeconds, 60) * 1000 }); + const probe = parseJsonObject(result.stdout); + const ok = result.exitCode === 0 && probe?.ok === true; + emitWebProbeSentinelSpan({ + node: state.spec.nodeId, + lane: state.spec.lane, + sentinelId: state.sentinelId, + namespace, + runtime: state.runtime, + cicd: state.cicd, + }, "web_probe_sentinel.cadence.cronjob_observed", { + cronJobName: name, + namespace, + schedule: expectedSchedule, + status: ok ? "ok" : text(probe?.code ?? "unknown"), + jobName: probe?.latestJobName ?? null, + failureKind: probe?.code ?? null, + valuesRedacted: true, + }, ok); + return { + ok, + probe, + result: compactCommand(result), + warning: ok ? null : `cadence CronJob is not ready: ${text(probe?.code ?? "probe-failed")}`, + valuesRedacted: true, + }; +} + function expectedRuntimeImageFromRegistry(state: SentinelCicdState, registry: Record): string | null { const digest = nonEmptyString(record(record(registry).probe).digest); if (digest === null) return null; @@ -3816,6 +3938,7 @@ function renderControlPlaneResult(result: Record): string { const gitops = record(result.gitops); const argo = record(result.argo); const validation = record(result.validation); + const observability = record(result.observability); const observed = record(result.observed); const sourceMirrorSync = record(result.sourceMirrorSync); const publish = record(result.publish); @@ -3841,6 +3964,8 @@ function renderControlPlaneResult(result: Record): string { "", table(["SCENARIO", "MAX_SECONDS", "CI_WAIT", "QVERIFY", "SECOND_PATH"], [[validation.scenarioId, validation.maxSeconds, validation.controlPlaneWaitMaxSeconds ?? "-", validation.quickVerifyMode ?? "-", validation.automaticSecondPath]]), "", + Object.keys(observability).length === 0 ? "OTEL\n-" : table(["ENABLED", "ENDPOINT", "SERVICE", "COVERAGE"], [[observability.enabled, observability.endpointConfigured, observability.serviceName, observability.coverage]]), + "", renderObservedStatus(observed), "", Object.keys(sourceMirrorSync).length === 0 ? "SOURCE_MIRROR_SYNC\n-" : table(["OK", "PHASE", "JOB", "COMMIT", "ELAPSED"], [[sourceMirrorSync.ok, sourceMirrorSync.phase, sourceMirrorSync.jobName, short(record(sourceMirrorSync.payload).mirrorCommit), sourceMirrorSync.elapsedMs ?? "-"]]), @@ -3913,6 +4038,7 @@ function renderObservedStatus(observed: Record): string { observedStatusRow("gitops", observed.gitops), observedStatusRow("argo", observed.argo), observedStatusRow("runtime", observed.runtime), + observedStatusRow("cadence", observed.cadence), ].filter((row) => row !== null); if (rows.length === 0) return "OBSERVED\n-"; return table(["CHECK", "OK", "DETAIL", "EXIT", "TIMED_OUT", "PREVIEW"], rows); @@ -3944,6 +4070,11 @@ function observedDetail(name: string, item: Record): string { const deployment = record(probe.deployment); return `ready=${deployment.readyReplicas ?? "-"} image=${short(deployment.image)}/${short(deployment.expectedImage)}`; } + if (name === "cadence") { + if (item.skipped === true) return `${item.reason ?? "skipped"}`; + const probe = record(item.probe); + return `${probe.code ?? "ok"} schedule=${probe.schedule ?? "-"}/${probe.expectedSchedule ?? "-"} last=${probe.lastScheduleTime ?? "-"} jobs=${probe.jobCount ?? "-"}`; + } return "-"; } diff --git a/scripts/src/hwlab-node-web-sentinel-otel.ts b/scripts/src/hwlab-node-web-sentinel-otel.ts new file mode 100644 index 00000000..36fd774b --- /dev/null +++ b/scripts/src/hwlab-node-web-sentinel-otel.ts @@ -0,0 +1,143 @@ +// SPEC: PJ2026-01060508 Web哨兵 draft-2026-07-01-p15-cadence-otel. +// Responsibility: Best-effort OTLP span emitter for web-probe sentinel scheduler, cadence and quick-verify events. +import { randomBytes } from "node:crypto"; + +export interface SentinelOtelContext { + readonly node: string; + readonly lane: string; + readonly sentinelId: string; + readonly namespace?: string | null; + readonly runtime?: Record; + readonly cicd?: Record; +} + +export function emitWebProbeSentinelSpan(context: SentinelOtelContext, name: string, attributes: Record = {}, ok = true): void { + const config = resolveOtelConfig(context); + if (!config.enabled || config.endpoint === null) return; + const start = BigInt(Date.now()) * 1_000_000n; + const end = start + 1_000_000n; + const traceId = randomHex(16); + const spanId = randomHex(8); + const payload = { + resourceSpans: [{ + resource: { + attributes: otelAttributes({ + "service.name": config.serviceName, + "deployment.environment": context.lane, + "unidesk.node": context.node, + "hwlab.lane": context.lane, + "k8s.namespace.name": context.namespace ?? stringAtNullable(context.runtime, "namespace"), + "unidesk.values_redacted": true, + }), + }, + scopeSpans: [{ + scope: { name: "unidesk.web_probe_sentinel", version: "PJ2026-01060508" }, + spans: [{ + traceId, + spanId, + name, + kind: 1, + startTimeUnixNano: start.toString(), + endTimeUnixNano: end.toString(), + attributes: otelAttributes({ + "unidesk.node": context.node, + "hwlab.lane": context.lane, + "sentinelId": context.sentinelId, + "valuesRedacted": true, + ...attributes, + }), + status: { code: ok ? 1 : 2 }, + }], + }], + }], + }; + void fetch(config.endpoint, { + method: "POST", + headers: { "content-type": "application/json" }, + body: JSON.stringify(payload), + }).catch(() => undefined); +} + +export function webProbeSentinelOtelSummary(context: SentinelOtelContext): Record { + const config = resolveOtelConfig(context); + return { + enabled: config.enabled, + endpointConfigured: config.endpoint !== null, + serviceName: config.serviceName, + coverage: config.enabled && config.endpoint !== null ? "best-effort-otlp-spans" : "instrumentation-gap", + expectedSpans: [ + "web_probe_sentinel.scheduler.heartbeat", + "web_probe_sentinel.cadence.expected", + "web_probe_sentinel.cadence.cronjob_rendered", + "web_probe_sentinel.cadence.cronjob_observed", + "web_probe_sentinel.quick_verify.job_start", + "web_probe_sentinel.quick_verify.job_finish", + "web_probe_sentinel.record_run", + "web_probe_sentinel.scheduler_gap.detected", + ], + valuesRedacted: true, + }; +} + +function resolveOtelConfig(context: SentinelOtelContext): { readonly enabled: boolean; readonly endpoint: string | null; readonly serviceName: string } { + const runtime = context.runtime ?? {}; + const cicd = context.cicd ?? {}; + const enabledFromYaml = booleanAtNullable(runtime, "observability.otel.enabled") + ?? booleanAtNullable(cicd, "observability.otel.enabled"); + const disabledByEnv = /^(1|true)$/iu.test(process.env.OTEL_SDK_DISABLED ?? ""); + const endpoint = stringAtNullable(runtime, "observability.otel.tracesEndpoint") + ?? stringAtNullable(runtime, "observability.otel.endpoint") + ?? stringAtNullable(cicd, "observability.otel.tracesEndpoint") + ?? stringAtNullable(cicd, "observability.otel.endpoint") + ?? nonEmptyString(process.env.OTEL_EXPORTER_OTLP_TRACES_ENDPOINT); + const serviceName = stringAtNullable(runtime, "observability.otel.serviceName") + ?? stringAtNullable(cicd, "observability.otel.serviceName") + ?? nonEmptyString(process.env.OTEL_SERVICE_NAME) + ?? `hwlab-web-probe-sentinel-${context.node.toLowerCase()}`; + return { + enabled: !disabledByEnv && (enabledFromYaml === true || endpoint !== null), + endpoint, + serviceName, + }; +} + +function otelAttributes(values: Record): readonly Record[] { + return Object.entries(values) + .filter(([, value]) => value !== undefined && value !== null) + .map(([key, value]) => ({ key, value: otelValue(value) })); +} + +function otelValue(value: unknown): Record { + if (typeof value === "boolean") return { boolValue: value }; + if (typeof value === "number" && Number.isFinite(value)) { + return Number.isInteger(value) ? { intValue: String(value) } : { doubleValue: value }; + } + return { stringValue: typeof value === "string" ? value : JSON.stringify(value) }; +} + +function randomHex(bytes: number): string { + return randomBytes(bytes).toString("hex"); +} + +function stringAtNullable(value: unknown, path: string): string | null { + const found = valueAtPath(value, path); + return typeof found === "string" && found.length > 0 ? found : null; +} + +function booleanAtNullable(value: unknown, path: string): boolean | null { + const found = valueAtPath(value, path); + return typeof found === "boolean" ? found : null; +} + +function nonEmptyString(value: unknown): string | null { + return typeof value === "string" && value.length > 0 ? value : null; +} + +function valueAtPath(value: unknown, path: string): unknown { + let current: unknown = value; + for (const segment of path.split(".")) { + if (typeof current !== "object" || current === null || Array.isArray(current)) return undefined; + current = (current as Record)[segment]; + } + return current; +} diff --git a/scripts/src/hwlab-node-web-sentinel-p5-observe.ts b/scripts/src/hwlab-node-web-sentinel-p5-observe.ts index 3afd6da2..a82f3218 100644 --- a/scripts/src/hwlab-node-web-sentinel-p5-observe.ts +++ b/scripts/src/hwlab-node-web-sentinel-p5-observe.ts @@ -1,4 +1,5 @@ // SPEC: PJ2026-01060508 Web哨兵 draft-2026-06-27-p11-monitor-web-observability-dashboard. +// SPEC: PJ2026-01060508 Web哨兵 draft-2026-07-01-p15-cadence-otel. // Responsibility: Quick-verify observe orchestration and artifact interpretation for web-probe sentinel P5 validation. import { createHash, randomUUID } from "node:crypto"; import { existsSync, readFileSync } from "node:fs"; @@ -32,6 +33,7 @@ import { text, withWarnings, } from "./hwlab-node-web-sentinel-cicd"; +import { emitWebProbeSentinelSpan } from "./hwlab-node-web-sentinel-otel"; function printQuickVerifyProgress(state: SentinelCicdState, runId: string | null, phase: string, status: string, extra: Record = {}): void { const compactExtra = Object.fromEntries(Object.entries(extra).map(([key, value]) => { @@ -60,12 +62,30 @@ export function runSentinelQuickVerify(state: SentinelCicdState, reason: string, const maxSeconds = numberAt(state.cicd, "targetValidation.maxSeconds"); const scenario = findScenario(state, scenarioId); if (scenario === null) return { ok: false, status: "blocked", reason: "scenario-not-found", scenarioId, valuesRedacted: true }; + const runId = `sentinel-run-${Date.now().toString(36)}-${randomUUID().slice(0, 8)}`; + emitWebProbeSentinelSpan(sentinelOtelContext(state), "web_probe_sentinel.quick_verify.job_start", { + scenarioId, + runId, + cadence: stringAtNullable(scenario, "cadence"), + status: "running", + valuesRedacted: true, + }); const commandSequence = arrayAt(scenario, "commandSequence").map(record); const needsPromptSet = commandSequence.some((item) => stringAt(item, "type") === "sendPrompt" && inlinePromptText(item) === null); const prompts = needsPromptSet ? readPromptSetForScenario(state, scenario) : { ok: true as const, prompts: [], summary: { source: "not-required", promptCount: 0, valuesRedacted: true } }; - if (!prompts.ok) return { ok: false, status: "blocked", reason: "prompt-source-unavailable", promptSource: prompts, valuesRedacted: true }; + if (!prompts.ok) { + emitWebProbeSentinelSpan(sentinelOtelContext(state), "web_probe_sentinel.quick_verify.job_finish", { + scenarioId, + runId, + status: "blocked", + exitCode: 1, + failureKind: "prompt-source-unavailable", + valuesRedacted: true, + }, false); + return { ok: false, status: "blocked", reason: "prompt-source-unavailable", promptSource: prompts, valuesRedacted: true }; + } const accountEnv = quickVerifyAccountEnv(state); if (!accountEnv.ok) { const findings = [{ @@ -78,7 +98,7 @@ export function runSentinelQuickVerify(state: SentinelCicdState, reason: string, }]; return recordQuickVerify(state, { ok: false, - runId: `sentinel-run-${Date.now().toString(36)}-${randomUUID().slice(0, 8)}`, + runId, scenarioId, reason, status: "blocked", @@ -104,7 +124,6 @@ export function runSentinelQuickVerify(state: SentinelCicdState, reason: string, const hardBudgetSeconds = Math.min(timeoutSeconds, Math.max(maxSeconds, numberAt(scenario, "maxRunSeconds"))); const elapsedWarnings = () => targetValidationElapsedWarnings(elapsedMs(), "quick verify confirm-wait", warningBudgetSeconds); const deadline = Date.now() + hardBudgetSeconds * 1000; - const runId = `sentinel-run-${Date.now().toString(36)}-${randomUUID().slice(0, 8)}`; printQuickVerifyProgress(state, runId, "start", "running", { scenarioId, reason, warningBudgetSeconds, hardBudgetSeconds, timeoutSeconds }); const steps: Record[] = []; const startArgs = [ @@ -659,9 +678,29 @@ function recordQuickVerify(state: SentinelCicdState, payload: Record; readonly cicd: Record } { + return { + node: state.spec.nodeId, + lane: state.spec.lane, + sentinelId: state.sentinelId, + namespace: stringAtNullable(state.runtime, "namespace"), + runtime: state.runtime, + cicd: state.cicd, + }; +} + function compactQuickVerifyRecordViews(views: Record): Record { const compacted: Record = {}; for (const [key, value] of Object.entries(views)) { diff --git a/scripts/src/hwlab-node-web-sentinel-service.ts b/scripts/src/hwlab-node-web-sentinel-service.ts index f64d45b1..82d7889c 100644 --- a/scripts/src/hwlab-node-web-sentinel-service.ts +++ b/scripts/src/hwlab-node-web-sentinel-service.ts @@ -4,6 +4,7 @@ // SPEC: PJ2026-01060508 Web哨兵 draft-2026-06-26-p9-multi-web-probe-sentinel. // SPEC: PJ2026-01060508 Web哨兵 draft-2026-06-27-p11-monitor-web-observability-dashboard. // SPEC: PJ2026-01060508 Web哨兵 draft-2026-06-28-p13-1206-multi-runner-boundaries. +// SPEC: PJ2026-01060508 Web哨兵 draft-2026-07-01-p15-cadence-otel. // Responsibility: Persistent HTTP wrapper service for web-probe observe scheduling, index, health, metrics, maintenance, and dashboard. import { Buffer } from "node:buffer"; import { createHash, randomUUID } from "node:crypto"; @@ -14,6 +15,7 @@ import { renderWebProbeSentinelDashboardHtml, webProbeSentinelDashboardAssetResp import { webProbeSentinelConfigPlan, type WebProbeSentinelConfigPlan } from "./hwlab-node-web-sentinel-config"; import type { HwlabRuntimeLaneSpec } from "./hwlab-node-lanes"; import { effectiveWebProbeSentinelPublicExposure, resolveWebProbeSentinel, readConfigRefTarget as readSentinelConfigRefTarget } from "./hwlab-node-web-sentinel-resolver"; +import { emitWebProbeSentinelSpan, webProbeSentinelOtelSummary } from "./hwlab-node-web-sentinel-otel"; const DASHBOARD_CONTRACT_VERSION = "draft-2026-06-27-p11-monitor-web-observability-dashboard"; const DASHBOARD_MAX_TEXT_BYTES = 16_000; @@ -130,6 +132,7 @@ export function createWebProbeSentinelService(options: WebProbeSentinelServiceOp let schedulerLastError: string | null = null; writeMetadata(db, "service.boot", { at: schedulerHeartbeatAt, restoredInterruptedRuns: restored, valuesRedacted: true }); writeMetadata(db, "scheduler.heartbeat", { at: schedulerHeartbeatAt, loop: "boot" }); + emitSchedulerHeartbeatSpan(config, "boot", schedulerHeartbeatAt, true); const service: WebProbeSentinelService = { config, @@ -139,15 +142,21 @@ export function createWebProbeSentinelService(options: WebProbeSentinelServiceOp if (!schedulerEnabled || schedulerTimer !== null) return; schedulerHeartbeatAt = nowIso(); writeMetadata(db, "scheduler.heartbeat", { at: schedulerHeartbeatAt, loop: "started" }); + emitSchedulerHeartbeatSpan(config, "started", schedulerHeartbeatAt, true); schedulerTimer = setInterval(() => { try { schedulerHeartbeatAt = nowIso(); writeMetadata(db, "scheduler.heartbeat", { at: schedulerHeartbeatAt, loop: "tick" }); - writeMetadata(db, "scheduler.summary", schedulerSummary(config, db)); + const summary = schedulerSummary(config, db); + writeMetadata(db, "scheduler.summary", summary); + emitSchedulerHeartbeatSpan(config, "tick", schedulerHeartbeatAt, true); + emitCadenceExpectedSpan(config, summary); + if (summary.rootCause === "planned-run-not-consumed-by-host-cadence") emitSchedulerGapSpan(config, summary); schedulerLastError = null; } catch (error) { schedulerLastError = error instanceof Error ? error.message : String(error); writeMetadata(db, "scheduler.error", { at: nowIso(), message: schedulerLastError }); + emitSchedulerHeartbeatSpan(config, "tick-error", nowIso(), false, schedulerLastError); } }, config.schedulerIntervalMs); }, @@ -249,7 +258,9 @@ export function createWebProbeSentinelService(options: WebProbeSentinelServiceOp return { ok: true, runId, scenarioId, status: "planned", commandPlanSha256: sha256Json(commandPlan), valuesRedacted: true }; }, recordRun(input: Record) { - return recordRunResult(config, db, input); + const result = recordRunResult(config, db, input); + emitRecordRunSpan(config, input, result); + return result; }, report(view: string, runId: string | null) { return reportRunView(config, db, view, runId); @@ -636,6 +647,66 @@ function schedulerSummary(config: WebProbeSentinelServiceConfig, db: Database): }; } +function emitSchedulerHeartbeatSpan(config: WebProbeSentinelServiceConfig, loop: string, at: string, ok: boolean, failureKind: string | null = null): void { + emitWebProbeSentinelSpan(sentinelOtelContext(config), "web_probe_sentinel.scheduler.heartbeat", { + status: ok ? "ok" : "error", + failureKind, + namespace: stringOrNull(config.runtime.namespace), + heartbeatAt: at, + cadence: firstEnabledScenarioCadence(config), + valuesRedacted: true, + }, ok); +} + +function emitCadenceExpectedSpan(config: WebProbeSentinelServiceConfig, summary: Record): void { + emitWebProbeSentinelSpan(sentinelOtelContext(config), "web_probe_sentinel.cadence.expected", { + cadence: firstEnabledScenarioCadence(config), + scenarioId: firstEnabledScenarioId(config), + status: summary.rootCause == null ? "ok" : "stale", + activeRunCount: summary.activeRuns ?? null, + plannedRunCount: summary.plannedRuns ?? null, + valuesRedacted: true, + }, summary.rootCause == null); +} + +function emitSchedulerGapSpan(config: WebProbeSentinelServiceConfig, summary: Record): void { + emitWebProbeSentinelSpan(sentinelOtelContext(config), "web_probe_sentinel.scheduler_gap.detected", { + cadence: firstEnabledScenarioCadence(config), + scenarioId: summary.oldestPlannedRunScenarioId ?? firstEnabledScenarioId(config), + runId: summary.oldestPlannedRunId ?? null, + status: "planned-run-stale", + failureKind: summary.rootCause, + valuesRedacted: true, + }, false); +} + +function emitRecordRunSpan(config: WebProbeSentinelServiceConfig, input: Record, result: Record): void { + emitWebProbeSentinelSpan(sentinelOtelContext(config), "web_probe_sentinel.record_run", { + scenarioId: result.scenarioId ?? input.scenarioId ?? null, + runId: result.runId ?? input.runId ?? null, + observerId: input.observerId ?? null, + status: result.status ?? input.status ?? null, + failureKind: result.ok === true ? null : result.error ?? "record-run-failed", + valuesRedacted: true, + }, result.ok === true); +} + +function sentinelOtelContext(config: WebProbeSentinelServiceConfig): { readonly node: string; readonly lane: string; readonly sentinelId: string; readonly namespace: string | null; readonly runtime: Record; readonly cicd: Record } { + return { + node: config.node, + lane: config.lane, + sentinelId: config.sentinelId, + namespace: stringOrNull(config.runtime.namespace), + runtime: config.runtime, + cicd: config.cicd, + }; +} + +function firstEnabledScenarioCadence(config: WebProbeSentinelServiceConfig): string | null { + const scenario = config.scenarios.find((item) => boolAt(item, "enabled")); + return scenario === undefined ? null : stringOrNull(scenario.cadence); +} + function renderMetrics(config: WebProbeSentinelServiceConfig, db: Database, health: Record, maintenance: MaintenanceState): string { const counts = runCounts(config, db); const heartbeat = record(readMetadata(db, "scheduler.heartbeat")); @@ -740,6 +811,12 @@ function dashboardOverview(config: WebProbeSentinelServiceConfig, db: Database, const severityCounts = globalSeverityCounts(config, db); const latestUpdatedAt = latestRow === null ? null : stringOrNull(latestRow.updated_at); const latestRunAgeSeconds = latestUpdatedAt === null ? null : ageSeconds(latestUpdatedAt); + const heartbeatAgeSeconds = numberOr(record(record(health.checks).scheduler).heartbeatAgeSeconds, -1); + const expectedCadence = firstEnabledScenarioCadence(config); + const expectedCadenceSeconds = durationStringSeconds(expectedCadence); + const staleMultiple = expectedCadenceSeconds === null || latestRunAgeSeconds === null ? null : latestRunAgeSeconds / expectedCadenceSeconds; + const freshnessWarningMultiple = numberAt(config.runtime, "scheduler.freshnessWarningMultiple"); + const scheduler = schedulerSummary(config, db); return { ok: health.ok === true, contractVersion: DASHBOARD_CONTRACT_VERSION, @@ -750,7 +827,7 @@ function dashboardOverview(config: WebProbeSentinelServiceConfig, db: Database, publicOrigin: stringOrNull(config.publicExposure.publicBaseUrl), configReady: config.plan.ok, health, - scheduler: schedulerSummary(config, db), + scheduler, maintenance, latestRun, runCounts: runCounts(config, db), @@ -758,8 +835,30 @@ function dashboardOverview(config: WebProbeSentinelServiceConfig, db: Database, freshness: { latestRunUpdatedAt: latestUpdatedAt, latestRunAgeSeconds, - schedulerHeartbeatAgeSeconds: numberOr(record(record(health.checks).scheduler).heartbeatAgeSeconds, -1), + schedulerHeartbeatAgeSeconds: heartbeatAgeSeconds, + latestAnalyzedReportAgeSeconds: latestRow === null || stringOrNull(latestRow.report_json_sha256) === null ? null : latestRunAgeSeconds, }, + cadence: { + expectedCadence, + expectedCadenceSeconds, + schedulerHeartbeatAgeSeconds: heartbeatAgeSeconds, + latestRunAgeSeconds, + latestAnalyzedReportAgeSeconds: latestRow === null || stringOrNull(latestRow.report_json_sha256) === null ? null : latestRunAgeSeconds, + activeRuns: scheduler.activeRuns ?? null, + plannedRuns: scheduler.plannedRuns ?? null, + nextRun: null, + staleMultiple, + freshnessWarningMultiple, + status: scheduler.rootCause === "planned-run-not-consumed-by-host-cadence" ? "blocker" : staleMultiple !== null && staleMultiple > freshnessWarningMultiple ? "warning" : "fresh", + cronJob: { + observed: false, + status: "control-plane-status-required", + reason: "runner API does not query Kubernetes CronJob objects; use web-probe sentinel control-plane status for CronJob counts, lastScheduleTime and latest Jobs.", + valuesRedacted: true, + }, + valuesRedacted: true, + }, + observability: webProbeSentinelOtelSummary(sentinelOtelContext(config)), targetValidation: { scenarioId: stringOrNull(record(config.cicd.targetValidation).scenarioId), maxSeconds: numberOr(record(config.cicd.targetValidation).maxSeconds, 120),