feat(sentinel): restore JD01 cadence cronjob visibility
This commit is contained in:
@@ -26,6 +26,14 @@ baselines:
|
||||
intervalMs: 600000
|
||||
heartbeatStaleSeconds: 900
|
||||
maxConcurrentRuns: 1
|
||||
freshnessWarningMultiple: 2
|
||||
observability:
|
||||
otel:
|
||||
enabled: true
|
||||
serviceName: hwlab-web-probe-sentinel-${nodeLower}
|
||||
tracesEndpoint: http://otel-collector.platform-infra.svc.cluster.local:4318/v1/traces
|
||||
sampler: parentbased_traceidratio
|
||||
samplerArg: "1"
|
||||
scheduler15m: &scheduler-15m
|
||||
intervalMs: 900000
|
||||
heartbeatStaleSeconds: 900
|
||||
@@ -229,8 +237,15 @@ nodes:
|
||||
maxSeconds: 360
|
||||
serviceUnavailablePolicy: structured-failure
|
||||
cadenceScheduler:
|
||||
enabled: false
|
||||
reason: cicd-health-endpoint-only
|
||||
enabled: true
|
||||
reason: k8s-native-periodic-quick-verify
|
||||
concurrencyPolicy: Forbid
|
||||
startingDeadlineSeconds: 600
|
||||
successfulJobsHistoryLimit: 3
|
||||
failedJobsHistoryLimit: 5
|
||||
activeDeadlineSlackSeconds: 60
|
||||
ttlSecondsAfterFinished: 86400
|
||||
backoffLimit: 0
|
||||
secrets:
|
||||
sources:
|
||||
- <<: *jd01-bootstrap-source
|
||||
|
||||
@@ -28,6 +28,7 @@ sentinel:
|
||||
intervalMs: 600000
|
||||
heartbeatStaleSeconds: 900
|
||||
maxConcurrentRuns: 1
|
||||
freshnessWarningMultiple: 2
|
||||
sqlite:
|
||||
path: /var/lib/web-probe-sentinel-auth-switch/index.sqlite
|
||||
busyTimeoutMs: 2000
|
||||
|
||||
@@ -28,6 +28,7 @@ sentinel:
|
||||
intervalMs: 600000
|
||||
heartbeatStaleSeconds: 900
|
||||
maxConcurrentRuns: 1
|
||||
freshnessWarningMultiple: 2
|
||||
sqlite:
|
||||
path: /var/lib/web-probe-sentinel-dsflash/index.sqlite
|
||||
busyTimeoutMs: 2000
|
||||
|
||||
@@ -28,6 +28,7 @@ sentinel:
|
||||
intervalMs: 600000
|
||||
heartbeatStaleSeconds: 900
|
||||
maxConcurrentRuns: 1
|
||||
freshnessWarningMultiple: 2
|
||||
sqlite:
|
||||
path: /var/lib/web-probe-sentinel/index.sqlite
|
||||
busyTimeoutMs: 2000
|
||||
|
||||
@@ -29,6 +29,7 @@ sentinel:
|
||||
intervalMs: 600000
|
||||
heartbeatStaleSeconds: 900
|
||||
maxConcurrentRuns: 1
|
||||
freshnessWarningMultiple: 2
|
||||
sqlite:
|
||||
path: /var/lib/web-probe-sentinel-fake-echo/index.sqlite
|
||||
busyTimeoutMs: 2000
|
||||
|
||||
@@ -28,6 +28,7 @@ sentinel:
|
||||
intervalMs: 900000
|
||||
heartbeatStaleSeconds: 900
|
||||
maxConcurrentRuns: 1
|
||||
freshnessWarningMultiple: 2
|
||||
sqlite:
|
||||
path: /var/lib/web-probe-sentinel-mdtodo/index.sqlite
|
||||
busyTimeoutMs: 2000
|
||||
|
||||
@@ -24,6 +24,7 @@
|
||||
| 多实例实现引用版本 | draft-2026-06-26-p9-multi-web-probe-sentinel |
|
||||
| Monitor Web 聚合实现引用版本 | draft-2026-06-26-p10-monitor-web-aggregation |
|
||||
| Monitor Web 观察面板治理实现引用版本 | draft-2026-06-27-p11-monitor-web-observability-dashboard; draft-2026-06-27-p12-cadence-scheduler-monitor-web |
|
||||
| Cadence/OTel 稳定性实现引用版本 | draft-2026-07-01-p15-cadence-otel |
|
||||
| 需求规格模板 | [ISO/IEC/IEEE 29148 需求规格模板](../../templates/iso-iec-ieee-29148-requirements-spec-template.md) |
|
||||
| 上级规格 | [PJ2026-010605 运维监控](PJ2026-010605-observability-monitoring.md) |
|
||||
| 关联规格 | [PJ2026-010401 Web工作台](PJ2026-010401-web-workbench.md)、[PJ2026-0104010803 Workbench唯一投影](PJ2026-0104010803-workbench-unique-projection.md)、[PJ2026-010403 API契约](PJ2026-010403-api-contract.md)、[PJ2026-010601 发布流水](PJ2026-010601-controlled-release.md)、[PJ2026-010602 源码同步](PJ2026-010602-source-sync.md)、[PJ2026-010603 YAML运维](PJ2026-010603-yaml-first-ops.md)、[PJ2026-010604 公开入口](PJ2026-010604-public-entry.md)、[PJ2026-01060505 Workbench性能](PJ2026-01060505-workbench-performance.md) |
|
||||
@@ -126,6 +127,7 @@ Web哨兵必须遵循 UniDesk YAML-first ops。目标 node/lane、public origin
|
||||
| PJ2026-0106050811 | Monitor Web 聚合 | 本规格 6.11 | runner/web 职责拆分、单 monitor-web 聚合、Kubernetes discovery、Vue+TS 前端和 public exposure 收敛 | 多实例与账号切换、Dashboard工作台、发布集成 | `monitor.pikapython.com` 统一值守入口 |
|
||||
| PJ2026-0106050812 | Monitor Web 观察面板治理 | 本规格 6.12 | 趋势曲线、运行时间线、固定视口三栏、cadence freshness、Vue CI/CD/env reuse/git mirror | Monitor Web 聚合、Dashboard工作台、发布集成、源码同步 | 可滚动上线和值守的统一观察面板 |
|
||||
| PJ2026-0106050814 | 哨兵 CI/CD 可见性 | 本规格 6.14 | publish 阶段耗时、env reuse、docker cache、超时诊断、git mirror/Argo/runtime 收敛下一步 | 发布集成、源码同步、Monitor Web 观察面板治理 | 小改动滚动上线可诊断、可续跑、可验收 |
|
||||
| PJ2026-0106050815 | Cadence/OTel 稳定性 | 本规格 6.15 | Kubernetes CronJob 周期巡检、状态缺口故障码、monitor-web cadence 可见性和 sentinel OTel span 合同 | Monitor Web 观察面板治理、发布集成、OTel、YAML运维 | JD01/v03 周期巡检恢复和后续防回归 |
|
||||
|
||||
### 5.1 目标架构图
|
||||
|
||||
@@ -567,6 +569,8 @@ P7 dashboard 增强范围内新增或修改的 dashboard API、frontend assets
|
||||
|
||||
P10/P11 monitor-web 范围内新增或修改的 Vue/TypeScript/Vite 前端、typed API client、聚合 API、runner discovery、dashboard verify/screenshot、CI/CD renderer、GitOps/publicExposure helper 和 env reuse 规划代码必须标注 `SPEC: PJ2026-01060508 Web哨兵 draft-2026-06-27-p11-monitor-web-observability-dashboard`。旧 `scripts/assets/web-probe-sentinel-dashboard/dashboard.js` 只能标注迁移前短修或兼容验证用途,不得作为 P11 新观察面板能力的主要承载面。
|
||||
|
||||
P15 cadence/OTel 范围内新增或修改的 CronJob renderer/status probe、runner health/overview、quick verify record path、monitor-web cadence 展示和 OTel emitter 源码文件头部必须标注 `SPEC: PJ2026-01060508 Web哨兵 draft-2026-07-01-p15-cadence-otel`。
|
||||
|
||||
实现文件不得只写 issue 编号、`latest`、`current` 或“按最新方案”作为规格引用。自动生成文件、第三方 vendored 文件、纯 YAML/config、锁文件和无法承载注释头的二进制产物不要求加源码头部,但对应生成器、渲染器、owning YAML 或 CLI 入口必须能追溯到本 SPEC。
|
||||
|
||||
后续 P1-P6 阶段如果改变稳定需求、观察对象、数据流、接口、部署边界或验收口径,应先更新本规格和上级 [PJ2026-010605 运维监控](PJ2026-010605-observability-monitoring.md),再更新执行 issue。
|
||||
@@ -689,6 +693,24 @@ publish Job 未在等待预算内结束时,CLI 必须输出 job 名称、pod
|
||||
|
||||
JD01/v03 `jd01-web-probe-sentinel` 的小改动滚动上线是本阶段验收入口。closeout 必须记录 SPEC P14 引用、source commit、publish job、digest、GitOps revision、git mirror pending/inSync、Argo/runtime alignment、`validate`、远程 dashboard screenshot 和 latest report 证据。若总等待仍超过 120s,closeout 必须记录阶段归因、env reuse/cache 摘要和下一步优化方向;不得通过单纯放宽 120s 预算收口。
|
||||
|
||||
### 6.15 OPS-SENTINEL-REQ-015 Cadence 调度稳定性与 OTel 覆盖
|
||||
|
||||
| 编号 | 短名 | 主责模块 | 关联模块 |
|
||||
| --- | --- | --- | --- |
|
||||
| OPS-SENTINEL-REQ-015 | Cadence/OTel 稳定性 | PJ2026-0106050815 Cadence/OTel 稳定性 | Monitor Web 观察面板治理、发布集成、OTel、YAML运维 |
|
||||
|
||||
本阶段执行 issue 为 [#1372](https://github.com/pikasTech/unidesk/issues/1372),阶段子 issue 为 P0 [#1374](https://github.com/pikasTech/unidesk/issues/1374)、P1 [#1377](https://github.com/pikasTech/unidesk/issues/1377)、P2 [#1375](https://github.com/pikasTech/unidesk/issues/1375)、P3 [#1378](https://github.com/pikasTech/unidesk/issues/1378) 和 P4 [#1376](https://github.com/pikasTech/unidesk/issues/1376)。
|
||||
|
||||
Web 哨兵周期巡检必须由目标 node/lane 的 Kubernetes CronJob/GitOps 受控对象承载。CronJob 的 enabled、scenarioId、cadence 来源、startingDeadlineSeconds、successfulJobsHistoryLimit、failedJobsHistoryLimit、activeDeadlineSlackSeconds、ttlSecondsAfterFinished、backoffLimit、concurrencyPolicy、targetValidation.maxSeconds、sampleInterval、screenshotInterval、maxRunSeconds、retention 和 OTel endpoint/sampling 都必须来自 owning YAML/configRef;代码只能解析、校验和渲染,不得用隐藏默认补阈值、历史保留、deadline、timeout、并发策略或采样策略。
|
||||
|
||||
`web-probe sentinel control-plane status` 必须把 CronJob 作为独立 observed check,而不是只相信 Argo `Synced/Healthy`。当 YAML 启用 cadenceScheduler 但线上缺 CronJob 时,状态必须 blocked,故障码固定为 `sentinel-cadence-cronjob-missing`;schedule 不一致使用 `sentinel-cadence-cronjob-schedule-mismatch`;CronJob suspend 使用 `sentinel-cadence-cronjob-suspended`。状态输出至少展示 CronJob name、namespace、schedule/expectedSchedule、lastScheduleTime、lastSuccessfulTime、active job count、jobCount 和 latest job name。
|
||||
|
||||
`monitor-web` 必须把 cadence freshness 作为一等状态:显示 YAML expected cadence、scheduler heartbeat age、latest run age、latest analyzed report age、active runs、planned runs、stale multiple、CronJob 观察状态和 OTel coverage/gap。runner API 不直接查询 Kubernetes CronJob 时,页面必须显式显示 `control-plane-status-required`,不得让用户误以为 CronJob 已观察通过。
|
||||
|
||||
Web 哨兵必须向平台 OTel 后端发出有界、脱敏的 span 或在状态中显式标记 instrumentation gap。P15 span 名称固定为:`web_probe_sentinel.scheduler.heartbeat`、`web_probe_sentinel.cadence.expected`、`web_probe_sentinel.cadence.cronjob_rendered`、`web_probe_sentinel.cadence.cronjob_observed`、`web_probe_sentinel.quick_verify.job_start`、`web_probe_sentinel.quick_verify.job_finish`、`web_probe_sentinel.record_run`、`web_probe_sentinel.scheduler_gap.detected`。属性至少包含 node、lane、sentinelId、scenarioId、runId、cronJobName、jobName、podName、namespace、cadence、status、exitCode、failureKind、gitopsRevision、sourceCommit、imageDigest 和 valuesRedacted;不存在的属性可以省略但不得打印 Secret、prompt、cookie、provider payload 或完整 stdout/stderr。
|
||||
|
||||
CI/CD rollout 门禁仍只验证配置声明的 `/health` endpoint。web-probe quick verify、Playwright/browser render、dashboard screenshot 和 OTel trace search 都是独立的 post-deploy evidence;不得重新塞回 `trigger-current --confirm --wait` 的同步门禁,也不得引入 Docker daemon/socket 依赖。
|
||||
|
||||
## 7. 过程控制
|
||||
|
||||
Web哨兵架构执行 issue 为 [#883](https://github.com/pikasTech/unidesk/issues/883)。阶段跟踪 issue 为 P0 [#885](https://github.com/pikasTech/unidesk/issues/885)、P1 [#886](https://github.com/pikasTech/unidesk/issues/886)、P2 [#887](https://github.com/pikasTech/unidesk/issues/887)、P3 [#888](https://github.com/pikasTech/unidesk/issues/888)、P4 [#889](https://github.com/pikasTech/unidesk/issues/889)、P5 [#890](https://github.com/pikasTech/unidesk/issues/890) 和 P6 [#891](https://github.com/pikasTech/unidesk/issues/891)。
|
||||
@@ -714,3 +736,5 @@ P12 cadence 调度和 monitor-web 交互修复执行 issue 为 [#1123](https://g
|
||||
P13 D518 多 runner 强边界与 OTel 根因收敛执行 issue 为 [#1206](https://github.com/pikasTech/unidesk/issues/1206)。P13 closeout 必须回写:SPEC P13 引用、[#1208](https://github.com/pikasTech/unidesk/issues/1208)-[#1216](https://github.com/pikasTech/unidesk/issues/1216) 阶段状态、D518 双 sentinel 独立 Deployment/Service/PVC/CronJob/GitOps/Argo/public route 证据、route/API sentinelId 强断言、report/index 不串线证据、dashboard verify/screenshot localPath/SHA、k3s CronJob 调度证据、latest selected run 与 historical trend 状态分层证据、以及 OTel AgentRun namespace/trace gap 是否已解除或拆入后续 issue。
|
||||
|
||||
P14 Web 哨兵 CI/CD 可见性执行 issue 为 [#1285](https://github.com/pikasTech/unidesk/issues/1285)。P14 closeout 必须回写:SPEC P14 引用、source commit、PR/merge commit、JD01/v03 `jd01-web-probe-sentinel` publish job、digest、GitOps revision、git mirror flush 状态、Argo/runtime observed alignment、`validate`、dashboard screenshot、latest report,以及超过 120s 时的结构化阶段归因和可续跑命令。
|
||||
|
||||
P15 Cadence 调度稳定性与 OTel 覆盖执行 issue 为 [#1372](https://github.com/pikasTech/unidesk/issues/1372)。P15 closeout 必须回写:SPEC P15 引用、[#1374](https://github.com/pikasTech/unidesk/issues/1374)-[#1378](https://github.com/pikasTech/unidesk/issues/1378) 阶段状态、JD01/v03 `jd01-web-probe-sentinel` CronJob manifest/GitOps/Argo/runtime observed 证据、`sentinel-cadence-cronjob-missing` 防回归状态、monitor-web cadence/OTel coverage 显示、OTel trace search 或 instrumentation-gap 证据、受控 rollout/publish job、GitOps revision、source commit、dashboard/health 验收,以及 CI/CD 门禁仍只验证 `/health` 的证据。
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
// SPEC: PJ2026-01060508 Web哨兵 draft-2026-06-27-p12-cadence-scheduler-monitor-web.
|
||||
// SPEC: PJ2026-01060508 Web哨兵 draft-2026-07-01-p15-cadence-otel.
|
||||
// Responsibility: Vue monitor-web runtime for sentinel trend, timeline, detail and finding observability.
|
||||
import { createApp, computed, nextTick, onMounted, ref, watch } from "./vendor/vue.esm-browser.prod.js";
|
||||
|
||||
@@ -156,18 +157,28 @@ createApp({
|
||||
return `运行记录 · ${checkScopeRun.value?.id || "未选择"}`;
|
||||
});
|
||||
// Cadence freshness view-model for the monitor-web overview panel.
// Values reported under `overview.cadence` take precedence; the legacy
// `overview.freshness` / `overview.scheduler` fields are used only as fallbacks.
// Each name is declared exactly once: the earlier locally-computed
// `latestAge > intervalSeconds * 2` staleness rule and its alert text were
// superseded by the status-driven variant below and must not coexist with it
// (duplicate `const` declarations are a SyntaxError).
const cadence = computed(() => {
  const apiCadence = overview.value?.cadence || {};
  const intervalMs = Number(overview.value?.scheduler?.intervalMs || 0);
  // -1 marks "age unknown"; the template renders "-" for negative ages.
  const latestAge = Number(apiCadence.latestRunAgeSeconds ?? overview.value?.freshness?.latestRunAgeSeconds ?? -1);
  const heartbeatAge = Number(apiCadence.schedulerHeartbeatAgeSeconds ?? overview.value?.freshness?.schedulerHeartbeatAgeSeconds ?? -1);
  // Prefer the API-reported expected cadence; otherwise derive seconds from scheduler.intervalMs.
  const intervalSeconds = Number(apiCadence.expectedCadenceSeconds || 0) || (intervalMs > 0 ? Math.round(intervalMs / 1000) : 0);
  const status = String(apiCadence.status || "");
  // Staleness is decided by the API status, not by a threshold computed in the browser.
  const stale = status === "warning" || status === "blocker";
  const cronJob = apiCadence.cronJob || {};
  const observability = overview.value?.observability || {};
  return {
    intervalSeconds,
    latestAge,
    heartbeatAge,
    status,
    stale,
    blocker: status === "blocker",
    cronJob,
    observability,
    label: intervalSeconds > 0 ? `${formatDuration(intervalSeconds)} 间隔` : "未配置",
    alert: stale
      ? `最近运行 ${formatDuration(latestAge)} 前;状态 ${status || "warning"},阈值来自 YAML。`
      : "运行新鲜度在 YAML 窗口内",
  };
});
|
||||
const healthChecks = computed(() => {
|
||||
@@ -668,6 +679,14 @@ createApp({
|
||||
<span>调度新鲜度</span>
|
||||
<strong>{{ cadence.latestAge >= 0 ? formatDuration(cadence.latestAge) : "-" }}</strong>
|
||||
</div>
|
||||
<div class="metric" :class="{ warning: cadence.cronJob?.status !== 'ok' }">
|
||||
<span>CronJob</span>
|
||||
<strong>{{ cadence.cronJob?.status || "-" }}</strong>
|
||||
</div>
|
||||
<div class="metric" :class="{ warning: cadence.observability?.coverage === 'instrumentation-gap' }">
|
||||
<span>OTel</span>
|
||||
<strong>{{ cadence.observability?.coverage || "-" }}</strong>
|
||||
</div>
|
||||
<div class="metric">
|
||||
<span>历史错误样本</span>
|
||||
<strong>{{ redCount({ severityCounts: severityTotals }) }}</strong>
|
||||
|
||||
@@ -6,6 +6,7 @@
|
||||
// SPEC: PJ2026-01060508 Web哨兵 draft-2026-06-27-p12-cadence-scheduler-monitor-web.
|
||||
// SPEC: PJ2026-01060508 Web哨兵 draft-2026-06-28-p13-1206-multi-runner-boundaries.
|
||||
// SPEC: PJ2026-01060508 Web哨兵 draft-2026-06-30-p14-sentinel-cicd-visibility.
|
||||
// SPEC: PJ2026-01060508 Web哨兵 draft-2026-07-01-p15-cadence-otel.
|
||||
// Responsibility: YAML-first CI/CD, image, GitOps and Argo command plan for the web-probe sentinel.
|
||||
import { createHash, randomUUID } from "node:crypto";
|
||||
import { existsSync, readFileSync } from "node:fs";
|
||||
@@ -20,6 +21,7 @@ import type { HwlabRuntimeLaneSpec } from "./hwlab-node-lanes";
|
||||
import type { RenderedCliResult } from "./output";
|
||||
import { probeSentinelRuntimeHealthEndpoint, runSentinelDashboard, runSentinelMaintenance, runSentinelReport, runSentinelValidate } from "./hwlab-node-web-sentinel-p5";
|
||||
import { runChildCli, sentinelP5Next } from "./hwlab-node-web-sentinel-p5-observe";
|
||||
import { emitWebProbeSentinelSpan, webProbeSentinelOtelSummary } from "./hwlab-node-web-sentinel-otel";
|
||||
|
||||
export type WebProbeSentinelConfigAction = "plan" | "status";
|
||||
export type WebProbeSentinelImageAction = "status" | "build";
|
||||
@@ -178,6 +180,7 @@ interface SentinelObservedStatus {
|
||||
readonly gitops: Record<string, unknown>;
|
||||
readonly argo: Record<string, unknown>;
|
||||
readonly runtime: Record<string, unknown>;
|
||||
readonly cadence: Record<string, unknown>;
|
||||
readonly wait?: Record<string, unknown>;
|
||||
}
|
||||
|
||||
@@ -215,7 +218,7 @@ export interface ChildCliResult {
|
||||
readonly result: CompactCommandResult & { stdoutTail: string; stderrTail: string };
|
||||
}
|
||||
|
||||
const SPEC_REF = "PJ2026-01060508 Web哨兵 draft-2026-06-30-p14-sentinel-cicd-visibility";
|
||||
const SPEC_REF = "PJ2026-01060508 Web哨兵 draft-2026-07-01-p15-cadence-otel";
|
||||
|
||||
export function runWebProbeSentinelCommand(spec: HwlabRuntimeLaneSpec, options: WebProbeSentinelOptions): RenderedCliResult {
|
||||
if (options.kind === "config") return withWebProbeSentinelConfigRendered(webProbeSentinelConfigPlan(spec, options.action, options.sentinelId));
|
||||
@@ -317,6 +320,14 @@ function runSentinelControlPlane(state: SentinelCicdState, options: Extract<WebP
|
||||
objects: manifestObjectSummary(state.manifests),
|
||||
sha256: state.manifestSha256,
|
||||
},
|
||||
observability: webProbeSentinelOtelSummary({
|
||||
node: state.spec.nodeId,
|
||||
lane: state.spec.lane,
|
||||
sentinelId: state.sentinelId,
|
||||
namespace: stringAt(state.runtime, "namespace"),
|
||||
runtime: state.runtime,
|
||||
cicd: state.cicd,
|
||||
}),
|
||||
observed,
|
||||
warnings: observedWarnings,
|
||||
blocker: null,
|
||||
@@ -924,8 +935,24 @@ function renderSentinelManifests(
|
||||
const servicePort = numberAt(runtime, "servicePort");
|
||||
const pvcStorage = stringAt(runtime, "pvcStorage");
|
||||
const stateRoot = stringAt(runtime, "stateRoot");
|
||||
const sentinelEnv = sentinelContainerEnv(sentinelId, secrets);
|
||||
const sentinelEnv = sentinelContainerEnv(sentinelId, runtime, cicd, secrets);
|
||||
const cadenceJob = sentinelCadenceCronJobPlan(spec, sentinelId, runtime, cicd, scenarios, image.ref, sentinelEnv);
|
||||
if (cadenceJob !== null) {
|
||||
emitWebProbeSentinelSpan({
|
||||
node: spec.nodeId,
|
||||
lane: spec.lane,
|
||||
sentinelId,
|
||||
namespace,
|
||||
runtime,
|
||||
cicd,
|
||||
}, "web_probe_sentinel.cadence.cronjob_rendered", {
|
||||
cronJobName: record(cadenceJob.metadata).name ?? null,
|
||||
namespace,
|
||||
cadence: record(cadenceJob.metadata).annotations === undefined ? null : record(record(cadenceJob.metadata).annotations)["unidesk.ai/cadence"],
|
||||
schedule: record(cadenceJob.spec).schedule ?? null,
|
||||
valuesRedacted: true,
|
||||
});
|
||||
}
|
||||
return [
|
||||
{
|
||||
apiVersion: "v1",
|
||||
@@ -1060,8 +1087,16 @@ function renderSentinelManifests(
|
||||
];
|
||||
}
|
||||
|
||||
function sentinelContainerEnv(sentinelId: string, secrets: Record<string, unknown>): readonly Record<string, unknown>[] {
|
||||
function sentinelContainerEnv(sentinelId: string, runtime: Record<string, unknown>, cicd: Record<string, unknown>, secrets: Record<string, unknown>): readonly Record<string, unknown>[] {
|
||||
const env: Record<string, unknown>[] = [{ name: "UNIDESK_WEB_PROBE_SENTINEL_ID", value: sentinelId }];
|
||||
const otelEnabled = booleanAtNullable(runtime, "observability.otel.enabled") ?? booleanAtNullable(cicd, "observability.otel.enabled") ?? false;
|
||||
const otelEndpoint = stringAtNullable(runtime, "observability.otel.tracesEndpoint")
|
||||
?? stringAtNullable(runtime, "observability.otel.endpoint")
|
||||
?? stringAtNullable(cicd, "observability.otel.tracesEndpoint")
|
||||
?? stringAtNullable(cicd, "observability.otel.endpoint");
|
||||
const otelServiceName = stringAtNullable(runtime, "observability.otel.serviceName") ?? stringAtNullable(cicd, "observability.otel.serviceName");
|
||||
const otelSampler = stringAtNullable(runtime, "observability.otel.sampler") ?? stringAtNullable(cicd, "observability.otel.sampler");
|
||||
const otelSamplerArg = stringAtNullable(runtime, "observability.otel.samplerArg") ?? stringAtNullable(cicd, "observability.otel.samplerArg");
|
||||
const sourcesByPurpose = new Map<string, Record<string, unknown>>();
|
||||
for (const source of arrayAt(secrets, "sources").map(record)) {
|
||||
const purpose = stringAtNullable(source, "purpose");
|
||||
@@ -1074,6 +1109,12 @@ function sentinelContainerEnv(sentinelId: string, secrets: Record<string, unknow
|
||||
used.add(name);
|
||||
env.push(item);
|
||||
};
|
||||
if (otelEnabled) {
|
||||
if (otelEndpoint !== null) pushEnv({ name: "OTEL_EXPORTER_OTLP_TRACES_ENDPOINT", value: otelEndpoint });
|
||||
if (otelServiceName !== null) pushEnv({ name: "OTEL_SERVICE_NAME", value: otelServiceName });
|
||||
if (otelSampler !== null) pushEnv({ name: "OTEL_TRACES_SAMPLER", value: otelSampler });
|
||||
if (otelSamplerArg !== null) pushEnv({ name: "OTEL_TRACES_SAMPLER_ARG", value: otelSamplerArg });
|
||||
}
|
||||
for (const runtimeSecret of arrayAt(secrets, "runtimeSecrets").map(record)) {
|
||||
const secretName = stringAtNullable(runtimeSecret, "name");
|
||||
if (secretName === null) continue;
|
||||
@@ -1102,6 +1143,7 @@ function sentinelCadenceCronJobPlan(
|
||||
imageRef: string,
|
||||
sentinelEnv: readonly Record<string, unknown>[],
|
||||
): Record<string, unknown> | null {
|
||||
const scheduler = record(valueAtPath(cicd, "targetValidation.cadenceScheduler"));
|
||||
const cadenceSchedulerEnabled = booleanAtNullable(cicd, "targetValidation.cadenceScheduler.enabled") === true;
|
||||
if (!cadenceSchedulerEnabled) return null;
|
||||
const scenarioId = stringAtNullable(cicd, "targetValidation.scenarioId");
|
||||
@@ -1114,9 +1156,12 @@ function sentinelCadenceCronJobPlan(
|
||||
const namespace = stringAt(runtime, "namespace");
|
||||
const deploymentName = stringAt(runtime, "deploymentName");
|
||||
const serviceAccountName = stringAt(runtime, "serviceAccountName");
|
||||
const timeoutSeconds = numberAtNullable(cicd, "targetValidation.maxSeconds") ?? numberAtNullable(scenario, "maxRunSeconds") ?? 300;
|
||||
const timeoutSeconds = numberAt(cicd, "targetValidation.maxSeconds");
|
||||
const activeDeadlineSlackSeconds = numberAt(scheduler, "activeDeadlineSlackSeconds");
|
||||
const mainServerHost = stringAtNullable(cicd, "scheduler.mainServerHost");
|
||||
const name = safeKubernetesSegment(`${deploymentName}-quick-verify`, 52);
|
||||
const name = sentinelCadenceCronJobName(deploymentName);
|
||||
const concurrencyPolicy = stringAt(scheduler, "concurrencyPolicy");
|
||||
if (!["Allow", "Forbid", "Replace"].includes(concurrencyPolicy)) throw new Error("targetValidation.cadenceScheduler.concurrencyPolicy must be Allow, Forbid or Replace");
|
||||
const labels = {
|
||||
"app.kubernetes.io/name": name,
|
||||
"app.kubernetes.io/part-of": "hwlab-web-probe-sentinel",
|
||||
@@ -1137,19 +1182,20 @@ function sentinelCadenceCronJobPlan(
|
||||
annotations: {
|
||||
"unidesk.ai/cadence": String(scenario.cadence),
|
||||
"unidesk.ai/target-validation-max-seconds": String(timeoutSeconds),
|
||||
"unidesk.ai/source": "targetValidation.cadenceScheduler",
|
||||
},
|
||||
},
|
||||
spec: {
|
||||
schedule,
|
||||
concurrencyPolicy: "Forbid",
|
||||
successfulJobsHistoryLimit: 3,
|
||||
failedJobsHistoryLimit: 5,
|
||||
startingDeadlineSeconds: Math.max(60, cadenceSeconds),
|
||||
concurrencyPolicy,
|
||||
successfulJobsHistoryLimit: numberAt(scheduler, "successfulJobsHistoryLimit"),
|
||||
failedJobsHistoryLimit: numberAt(scheduler, "failedJobsHistoryLimit"),
|
||||
startingDeadlineSeconds: numberAt(scheduler, "startingDeadlineSeconds"),
|
||||
jobTemplate: {
|
||||
spec: {
|
||||
activeDeadlineSeconds: timeoutSeconds + 60,
|
||||
ttlSecondsAfterFinished: 86400,
|
||||
backoffLimit: 0,
|
||||
activeDeadlineSeconds: timeoutSeconds + activeDeadlineSlackSeconds,
|
||||
ttlSecondsAfterFinished: numberAt(scheduler, "ttlSecondsAfterFinished"),
|
||||
backoffLimit: numberAt(scheduler, "backoffLimit"),
|
||||
template: {
|
||||
metadata: { labels },
|
||||
spec: {
|
||||
@@ -1190,6 +1236,10 @@ function sentinelCadenceCronJobPlan(
|
||||
};
|
||||
}
|
||||
|
||||
/**
 * Derive the cadence CronJob name for a sentinel deployment.
 *
 * Appends the `-quick-verify` suffix to the deployment name and passes the
 * result through `safeKubernetesSegment` with a limit argument of 52
 * (presumably a maximum length — confirm against `safeKubernetesSegment`).
 *
 * @param deploymentName Name of the sentinel Deployment the CronJob belongs to.
 * @returns The value returned by `safeKubernetesSegment` for the suffixed name.
 */
function sentinelCadenceCronJobName(deploymentName: string): string {
  const suffixedName = `${deploymentName}-quick-verify`;
  const cronJobName = safeKubernetesSegment(suffixedName, 52);
  return cronJobName;
}
|
||||
|
||||
function scenarioRows(value: unknown): Record<string, unknown>[] {
|
||||
if (Array.isArray(value)) return value.map(record);
|
||||
if (!isRecord(value)) return [];
|
||||
@@ -1605,6 +1655,7 @@ function sentinelSkippedObservedStatus(reason: string): SentinelObservedStatus {
|
||||
gitops: skipped,
|
||||
argo: skipped,
|
||||
runtime: skipped,
|
||||
cadence: skipped,
|
||||
wait: {
|
||||
polls: 0,
|
||||
elapsedMs: 0,
|
||||
@@ -1633,6 +1684,7 @@ function collectSentinelObservedStatus(state: SentinelCicdState, timeoutSeconds:
|
||||
gitops,
|
||||
argo: probeArgoApplication(state, timeoutSeconds, effectiveExpectation.gitopsRevision),
|
||||
runtime: probeRuntimeObjects(state, timeoutSeconds, effectiveExpectation.runtimeImage),
|
||||
cadence: probeCadenceCronJob(state, timeoutSeconds),
|
||||
};
|
||||
}
|
||||
|
||||
@@ -1668,13 +1720,15 @@ function sentinelObservedReady(value: Record<string, unknown> | SentinelObserved
|
||||
&& gitMirrorReady
|
||||
&& record(observed.gitops).ok === true
|
||||
&& record(observed.argo).ok === true
|
||||
&& record(observed.runtime).ok === true;
|
||||
&& record(observed.runtime).ok === true
|
||||
&& record(observed.cadence).ok === true;
|
||||
}
|
||||
|
||||
/**
 * Collect the warnings carried by an observed control-plane status.
 *
 * Merges the `warning` fields of the `argo` and `cadence` observed checks via
 * `mergeWarnings`. There is a single return statement: an earlier
 * `return mergeWarnings(argo.warning)` placed ahead of the cadence lookup
 * made the cadence warning unreachable.
 *
 * @param value Observed status (or a plain record / null); normalised with `record`.
 * @returns The merged warning list produced by `mergeWarnings`.
 */
function sentinelObservedWarnings(value: Record<string, unknown> | SentinelObservedStatus | null): string[] {
  const observed = record(value);
  const argo = record(observed.argo);
  const cadence = record(observed.cadence);
  return mergeWarnings(argo.warning, cadence.warning);
}
|
||||
|
||||
function probeSourceMirror(state: SentinelCicdState, timeoutSeconds: number): Record<string, unknown> {
|
||||
@@ -1900,6 +1954,74 @@ function probeRuntimeObjects(state: SentinelCicdState, timeoutSeconds: number, e
|
||||
return { ok: result.exitCode === 0 && probe?.ok === true, probe, result: compactCommand(result) };
|
||||
}
|
||||
|
||||
/**
 * Observe the cadence CronJob that the rendered manifests expect to exist.
 *
 * When the manifests contain no CronJob the check is reported as skipped and
 * ok. Otherwise a shell script is executed through `trans <kubeRoute> sh`; it
 * fetches the CronJob and its Jobs with kubectl and lets an inline Node script
 * classify the state into one of the fault codes
 * `sentinel-cadence-cronjob-missing`, `sentinel-cadence-cronjob-schedule-mismatch`
 * or `sentinel-cadence-cronjob-suspended` (null when none applies).
 * A `web_probe_sentinel.cadence.cronjob_observed` span is emitted with the outcome.
 *
 * @param state Sentinel CI/CD state providing manifests, sentinelId, control-plane route and OTel context.
 * @param timeoutSeconds Caller budget in seconds; the command timeout is capped at 60 seconds.
 * @returns Record with `ok`, the parsed `probe` JSON, the compacted command `result`,
 *          a `warning` string when not ok, and `valuesRedacted: true`.
 */
function probeCadenceCronJob(state: SentinelCicdState, timeoutSeconds: number): Record<string, unknown> {
  const expected = state.manifests.find((item) => item.kind === "CronJob") ?? null;
  if (expected === null) {
    return { ok: true, skipped: true, reason: "targetValidation.cadenceScheduler.disabled", valuesRedacted: true };
  }
  const metadata = record(expected.metadata);
  const spec = record(expected.spec);
  const namespace = stringAt(metadata, "namespace");
  const name = stringAt(metadata, "name");
  const expectedSchedule = stringAt(spec, "schedule");
  const script = [
    // kubectl failures are recorded in *.rc files and classified by the Node script, so do not abort on error.
    "set +e",
    `namespace=${shellQuote(namespace)}`,
    `cronjob=${shellQuote(name)}`,
    `sentinel=${shellQuote(state.sentinelId)}`,
    `expected_schedule=${shellQuote(expectedSchedule)}`,
    // Fail fast if no scratch directory can be created; otherwise the redirects below would target "/".
    "tmp=$(mktemp -d) || exit 1",
    // Remove the scratch directory when the script exits so repeated probes do not leak temp directories.
    "trap 'rm -rf \"$tmp\"' EXIT",
    "kubectl -n \"$namespace\" get cronjob \"$cronjob\" -o json >\"$tmp/cronjob.json\" 2>/dev/null; echo $? >\"$tmp/cronjob.rc\"",
    "kubectl -n \"$namespace\" get jobs -l \"unidesk.ai/web-probe-sentinel-id=$sentinel,app.kubernetes.io/component=cadence-scheduler\" -o json >\"$tmp/jobs.json\" 2>/dev/null; echo $? >\"$tmp/jobs.rc\"",
    "node - \"$tmp\" \"$namespace\" \"$cronjob\" \"$expected_schedule\" <<'NODE'",
    "const fs = require('node:fs');",
    "const [dir, namespace, cronJobName, expectedSchedule] = process.argv.slice(2);",
    "function rc(name){ try { return Number(fs.readFileSync(`${dir}/${name}.rc`, 'utf8').trim()); } catch { return 1; } }",
    "function json(name){ try { return JSON.parse(fs.readFileSync(`${dir}/${name}.json`, 'utf8')); } catch { return null; } }",
    "const cron = json('cronjob');",
    "const jobs = Array.isArray(json('jobs')?.items) ? json('jobs').items : [];",
    "const present = rc('cronjob') === 0 && !!cron;",
    "const schedule = cron?.spec?.schedule || null;",
    "const scheduleMatches = present && schedule === expectedSchedule;",
    "const suspended = cron?.spec?.suspend === true;",
    "const active = Array.isArray(cron?.status?.active) ? cron.status.active.length : 0;",
    "const sortedJobs = jobs.slice().sort((a,b)=>String(b?.metadata?.creationTimestamp||'').localeCompare(String(a?.metadata?.creationTimestamp||''))).slice(0,8);",
    "let code = null;",
    "if (!present) code = 'sentinel-cadence-cronjob-missing';",
    "else if (!scheduleMatches) code = 'sentinel-cadence-cronjob-schedule-mismatch';",
    "else if (suspended) code = 'sentinel-cadence-cronjob-suspended';",
    "const latestJob = sortedJobs[0] || null;",
    "console.log(JSON.stringify({ ok: code === null, code, present, namespace, name: cronJobName, schedule, expectedSchedule, scheduleMatches, suspended, lastScheduleTime: cron?.status?.lastScheduleTime || null, lastSuccessfulTime: cron?.status?.lastSuccessfulTime || null, active, jobCount: jobs.length, latestJobs: sortedJobs.map((job)=>({ name: job?.metadata?.name || null, createdAt: job?.metadata?.creationTimestamp || null, active: Number(job?.status?.active || 0), succeeded: Number(job?.status?.succeeded || 0), failed: Number(job?.status?.failed || 0), completionTime: job?.status?.completionTime || null, valuesRedacted:true })), latestJobName: latestJob?.metadata?.name || null, valuesRedacted: true }));",
    "NODE",
  ].join("\n");
  const result = runCommand(["trans", stringAt(state.controlPlaneNode, "kubeRoute"), "sh", "--", script], repoRoot, { timeoutMs: Math.min(timeoutSeconds, 60) * 1000 });
  const probe = parseJsonObject(result.stdout);
  // Both the transport/shell and the classification must succeed for the check to pass.
  const ok = result.exitCode === 0 && probe?.ok === true;
  emitWebProbeSentinelSpan({
    node: state.spec.nodeId,
    lane: state.spec.lane,
    sentinelId: state.sentinelId,
    namespace,
    runtime: state.runtime,
    cicd: state.cicd,
  }, "web_probe_sentinel.cadence.cronjob_observed", {
    cronJobName: name,
    namespace,
    schedule: expectedSchedule,
    status: ok ? "ok" : text(probe?.code ?? "unknown"),
    jobName: probe?.latestJobName ?? null,
    failureKind: probe?.code ?? null,
    valuesRedacted: true,
  }, ok);
  return {
    ok,
    probe,
    result: compactCommand(result),
    warning: ok ? null : `cadence CronJob is not ready: ${text(probe?.code ?? "probe-failed")}`,
    valuesRedacted: true,
  };
}
|
||||
|
||||
function expectedRuntimeImageFromRegistry(state: SentinelCicdState, registry: Record<string, unknown>): string | null {
|
||||
const digest = nonEmptyString(record(record(registry).probe).digest);
|
||||
if (digest === null) return null;
|
||||
@@ -3816,6 +3938,7 @@ function renderControlPlaneResult(result: Record<string, unknown>): string {
|
||||
const gitops = record(result.gitops);
|
||||
const argo = record(result.argo);
|
||||
const validation = record(result.validation);
|
||||
const observability = record(result.observability);
|
||||
const observed = record(result.observed);
|
||||
const sourceMirrorSync = record(result.sourceMirrorSync);
|
||||
const publish = record(result.publish);
|
||||
@@ -3841,6 +3964,8 @@ function renderControlPlaneResult(result: Record<string, unknown>): string {
|
||||
"",
|
||||
table(["SCENARIO", "MAX_SECONDS", "CI_WAIT", "QVERIFY", "SECOND_PATH"], [[validation.scenarioId, validation.maxSeconds, validation.controlPlaneWaitMaxSeconds ?? "-", validation.quickVerifyMode ?? "-", validation.automaticSecondPath]]),
|
||||
"",
|
||||
Object.keys(observability).length === 0 ? "OTEL\n-" : table(["ENABLED", "ENDPOINT", "SERVICE", "COVERAGE"], [[observability.enabled, observability.endpointConfigured, observability.serviceName, observability.coverage]]),
|
||||
"",
|
||||
renderObservedStatus(observed),
|
||||
"",
|
||||
Object.keys(sourceMirrorSync).length === 0 ? "SOURCE_MIRROR_SYNC\n-" : table(["OK", "PHASE", "JOB", "COMMIT", "ELAPSED"], [[sourceMirrorSync.ok, sourceMirrorSync.phase, sourceMirrorSync.jobName, short(record(sourceMirrorSync.payload).mirrorCommit), sourceMirrorSync.elapsedMs ?? "-"]]),
|
||||
@@ -3913,6 +4038,7 @@ function renderObservedStatus(observed: Record<string, unknown>): string {
|
||||
observedStatusRow("gitops", observed.gitops),
|
||||
observedStatusRow("argo", observed.argo),
|
||||
observedStatusRow("runtime", observed.runtime),
|
||||
observedStatusRow("cadence", observed.cadence),
|
||||
].filter((row) => row !== null);
|
||||
if (rows.length === 0) return "OBSERVED\n-";
|
||||
return table(["CHECK", "OK", "DETAIL", "EXIT", "TIMED_OUT", "PREVIEW"], rows);
|
||||
@@ -3944,6 +4070,11 @@ function observedDetail(name: string, item: Record<string, unknown>): string {
|
||||
const deployment = record(probe.deployment);
|
||||
return `ready=${deployment.readyReplicas ?? "-"} image=${short(deployment.image)}/${short(deployment.expectedImage)}`;
|
||||
}
|
||||
if (name === "cadence") {
|
||||
if (item.skipped === true) return `${item.reason ?? "skipped"}`;
|
||||
const probe = record(item.probe);
|
||||
return `${probe.code ?? "ok"} schedule=${probe.schedule ?? "-"}/${probe.expectedSchedule ?? "-"} last=${probe.lastScheduleTime ?? "-"} jobs=${probe.jobCount ?? "-"}`;
|
||||
}
|
||||
return "-";
|
||||
}
|
||||
|
||||
|
||||
@@ -0,0 +1,143 @@
|
||||
// SPEC: PJ2026-01060508 Web哨兵 draft-2026-07-01-p15-cadence-otel.
|
||||
// Responsibility: Best-effort OTLP span emitter for web-probe sentinel scheduler, cadence and quick-verify events.
|
||||
import { randomBytes } from "node:crypto";
|
||||
|
||||
/**
 * Identity and configuration needed to emit or summarise sentinel OTLP spans.
 */
export interface SentinelOtelContext {
  /** Node identifier; sent as `unidesk.node` and lower-cased for the fallback service name. */
  readonly node: string;
  /** Lane name; sent as both `hwlab.lane` and `deployment.environment`. */
  readonly lane: string;
  /** Sentinel identifier; sent as the `sentinelId` span attribute. */
  readonly sentinelId: string;
  /** Kubernetes namespace; when null/absent the emitter falls back to `runtime.namespace`. */
  readonly namespace?: string | null;
  /** Runtime configuration tree; consulted first for `observability.otel.*` settings. */
  readonly runtime?: Record<string, unknown>;
  /** CI/CD configuration tree; consulted after `runtime` for `observability.otel.*` settings. */
  readonly cicd?: Record<string, unknown>;
}
|
||||
|
||||
/**
 * Fire-and-forget emission of a single OTLP/HTTP JSON span describing a sentinel event.
 *
 * The span is skipped when telemetry is disabled or no endpoint is resolved. Emission is
 * strictly best-effort: neither a rejected HTTP request nor a synchronous failure while
 * building or serialising the payload is allowed to reach the caller, so instrumentation
 * can never turn a successful scheduler tick or run record into an error.
 *
 * @param context    Sentinel identity plus the config trees used to resolve OTLP settings.
 * @param name       Span name, e.g. `web_probe_sentinel.scheduler.heartbeat`.
 * @param attributes Extra span attributes; null/undefined values are dropped and these
 *                   entries override the built-in ones when keys collide.
 * @param ok         Maps to span status code 1 (OK) when true, 2 (ERROR) otherwise.
 */
export function emitWebProbeSentinelSpan(context: SentinelOtelContext, name: string, attributes: Record<string, unknown> = {}, ok = true): void {
  try {
    const config = resolveOtelConfig(context);
    if (!config.enabled || config.endpoint === null) return;
    // The event is a point in time; give the span a nominal 1 ms duration.
    const start = BigInt(Date.now()) * 1_000_000n;
    const end = start + 1_000_000n;
    // Every span starts its own trace: 16-byte trace id, 8-byte span id, hex encoded.
    const traceId = randomHex(16);
    const spanId = randomHex(8);
    const payload = {
      resourceSpans: [{
        resource: {
          attributes: otelAttributes({
            "service.name": config.serviceName,
            "deployment.environment": context.lane,
            "unidesk.node": context.node,
            "hwlab.lane": context.lane,
            "k8s.namespace.name": context.namespace ?? stringAtNullable(context.runtime, "namespace"),
            "unidesk.values_redacted": true,
          }),
        },
        scopeSpans: [{
          scope: { name: "unidesk.web_probe_sentinel", version: "PJ2026-01060508" },
          spans: [{
            traceId,
            spanId,
            name,
            kind: 1,
            startTimeUnixNano: start.toString(),
            endTimeUnixNano: end.toString(),
            attributes: otelAttributes({
              "unidesk.node": context.node,
              "hwlab.lane": context.lane,
              "sentinelId": context.sentinelId,
              "valuesRedacted": true,
              ...attributes,
            }),
            status: { code: ok ? 1 : 2 },
          }],
        }],
      }],
    };
    void fetch(config.endpoint, {
      method: "POST",
      headers: { "content-type": "application/json" },
      body: JSON.stringify(payload),
    }).catch(() => undefined);
  } catch {
    // Best-effort: a payload that cannot be built or serialised (for example an attribute
    // holding a circular structure) must not fail the operation being instrumented.
  }
}
|
||||
|
||||
/**
 * Describes the resolved OTLP setup for status/dashboard output without sending anything.
 *
 * `coverage` is "best-effort-otlp-spans" only when telemetry is enabled and an endpoint
 * is resolved; otherwise it is reported as "instrumentation-gap".
 */
export function webProbeSentinelOtelSummary(context: SentinelOtelContext): Record<string, unknown> {
  const { enabled, endpoint, serviceName } = resolveOtelConfig(context);
  const endpointConfigured = endpoint !== null;
  const coverage = enabled && endpointConfigured ? "best-effort-otlp-spans" : "instrumentation-gap";
  // Span names listed here as the expected instrumentation for the sentinel.
  const expectedSpans = [
    "web_probe_sentinel.scheduler.heartbeat",
    "web_probe_sentinel.cadence.expected",
    "web_probe_sentinel.cadence.cronjob_rendered",
    "web_probe_sentinel.cadence.cronjob_observed",
    "web_probe_sentinel.quick_verify.job_start",
    "web_probe_sentinel.quick_verify.job_finish",
    "web_probe_sentinel.record_run",
    "web_probe_sentinel.scheduler_gap.detected",
  ];
  return {
    enabled,
    endpointConfigured,
    serviceName,
    coverage,
    expectedSpans,
    valuesRedacted: true,
  };
}
|
||||
|
||||
/**
 * Resolves the effective OTLP settings from the runtime config, the CI/CD config and the
 * process environment, in that order of precedence.
 *
 * - `endpoint`: first non-empty of `observability.otel.tracesEndpoint` / `.endpoint` in
 *   runtime, then the same keys in cicd, then `OTEL_EXPORTER_OTLP_TRACES_ENDPOINT`.
 * - `serviceName`: `observability.otel.serviceName` from runtime, then cicd, then
 *   `OTEL_SERVICE_NAME`, then `hwlab-web-probe-sentinel-<node lower-cased>`.
 * - `enabled`: forced off when `OTEL_SDK_DISABLED` is `1` or `true` (case-insensitive).
 *   Otherwise an explicit boolean `observability.otel.enabled` (runtime before cicd) wins,
 *   so `enabled: false` switches telemetry off even when an endpoint is inherited; only
 *   when no explicit flag exists does a configured endpoint imply enabled.
 */
function resolveOtelConfig(context: SentinelOtelContext): { readonly enabled: boolean; readonly endpoint: string | null; readonly serviceName: string } {
  const runtime = context.runtime ?? {};
  const cicd = context.cicd ?? {};
  const enabledFromYaml = booleanAtNullable(runtime, "observability.otel.enabled")
    ?? booleanAtNullable(cicd, "observability.otel.enabled");
  const disabledByEnv = /^(1|true)$/iu.test(process.env.OTEL_SDK_DISABLED ?? "");
  const endpoint = stringAtNullable(runtime, "observability.otel.tracesEndpoint")
    ?? stringAtNullable(runtime, "observability.otel.endpoint")
    ?? stringAtNullable(cicd, "observability.otel.tracesEndpoint")
    ?? stringAtNullable(cicd, "observability.otel.endpoint")
    ?? nonEmptyString(process.env.OTEL_EXPORTER_OTLP_TRACES_ENDPOINT);
  const serviceName = stringAtNullable(runtime, "observability.otel.serviceName")
    ?? stringAtNullable(cicd, "observability.otel.serviceName")
    ?? nonEmptyString(process.env.OTEL_SERVICE_NAME)
    ?? `hwlab-web-probe-sentinel-${context.node.toLowerCase()}`;
  // Explicit YAML flag takes priority; endpoint presence is only the implicit default.
  const enabledByConfig = enabledFromYaml ?? (endpoint !== null);
  return {
    enabled: !disabledByEnv && enabledByConfig,
    endpoint,
    serviceName,
  };
}
|
||||
|
||||
/**
 * Converts a plain key/value map into an OTLP attribute list (`{ key, value }` entries),
 * omitting entries whose value is null or undefined.
 */
function otelAttributes(values: Record<string, unknown>): readonly Record<string, unknown>[] {
  const encoded: Record<string, unknown>[] = [];
  for (const [key, raw] of Object.entries(values)) {
    if (raw === undefined || raw === null) continue;
    encoded.push({ key, value: otelValue(raw) });
  }
  return encoded;
}
|
||||
|
||||
/**
 * Encodes one attribute value as an OTLP/JSON `AnyValue` and never throws.
 *
 * - boolean                -> `boolValue`
 * - safe integer number    -> `intValue` (decimal string, as OTLP/JSON encodes int64)
 * - other finite number    -> `doubleValue` (includes integers beyond the safe range, whose
 *                             `String()` form such as "1e+21" is not a valid int64)
 * - NaN / ±Infinity        -> `stringValue` of the textual form
 * - bigint                 -> `intValue` when it fits in a signed 64-bit integer, else `stringValue`
 * - string                 -> `stringValue`
 * - anything else          -> `stringValue` holding its JSON encoding; values JSON cannot
 *                             represent fall back to `String(value)`, and values whose
 *                             serialisation throws become "[unserializable]".
 */
function otelValue(value: unknown): Record<string, unknown> {
  if (typeof value === "boolean") return { boolValue: value };
  if (typeof value === "bigint") {
    const text = value.toString();
    return BigInt.asIntN(64, value) === value ? { intValue: text } : { stringValue: text };
  }
  if (typeof value === "number") {
    if (Number.isSafeInteger(value)) return { intValue: String(value) };
    if (Number.isFinite(value)) return { doubleValue: value };
    return { stringValue: String(value) };
  }
  if (typeof value === "string") return { stringValue: value };
  try {
    // JSON.stringify returns undefined for functions, symbols and undefined itself.
    const encoded = JSON.stringify(value);
    return { stringValue: typeof encoded === "string" ? encoded : String(value) };
  } catch {
    // Circular references or nested BigInt values make JSON.stringify throw.
    return { stringValue: "[unserializable]" };
  }
}
|
||||
|
||||
/**
 * Returns `bytes` cryptographically random bytes rendered as lowercase hex
 * (two characters per byte).
 */
function randomHex(bytes: number): string {
  const entropy = randomBytes(bytes);
  return entropy.toString("hex");
}
|
||||
|
||||
/**
 * Reads a dotted `path` from `value` and returns it only when it is a non-empty string;
 * every other outcome (missing, wrong type, empty) yields null.
 */
function stringAtNullable(value: unknown, path: string): string | null {
  const candidate = valueAtPath(value, path);
  if (typeof candidate !== "string") return null;
  return candidate.length > 0 ? candidate : null;
}
|
||||
|
||||
/**
 * Reads a dotted `path` from `value` and returns it only when it is a real boolean;
 * missing or non-boolean values yield null so callers can chain fallbacks with `??`.
 */
function booleanAtNullable(value: unknown, path: string): boolean | null {
  const candidate = valueAtPath(value, path);
  if (typeof candidate === "boolean") return candidate;
  return null;
}
|
||||
|
||||
/**
 * Passes through a string that has at least one character; anything else becomes null.
 */
function nonEmptyString(value: unknown): string | null {
  if (typeof value !== "string") return null;
  return value === "" ? null : value;
}
|
||||
|
||||
/**
 * Walks `value` along a dot-separated `path` and returns whatever sits at the end.
 *
 * Traversal only descends through plain (non-array, non-null) objects; hitting anything
 * else before the path is exhausted produces undefined.
 */
function valueAtPath(value: unknown, path: string): unknown {
  return path.split(".").reduce<unknown>((node, segment) => {
    const traversable = typeof node === "object" && node !== null && !Array.isArray(node);
    // Once a step fails the accumulator is undefined and every later step fails as well.
    return traversable ? (node as Record<string, unknown>)[segment] : undefined;
  }, value);
}
|
||||
@@ -1,4 +1,5 @@
|
||||
// SPEC: PJ2026-01060508 Web哨兵 draft-2026-06-27-p11-monitor-web-observability-dashboard.
|
||||
// SPEC: PJ2026-01060508 Web哨兵 draft-2026-07-01-p15-cadence-otel.
|
||||
// Responsibility: Quick-verify observe orchestration and artifact interpretation for web-probe sentinel P5 validation.
|
||||
import { createHash, randomUUID } from "node:crypto";
|
||||
import { existsSync, readFileSync } from "node:fs";
|
||||
@@ -32,6 +33,7 @@ import {
|
||||
text,
|
||||
withWarnings,
|
||||
} from "./hwlab-node-web-sentinel-cicd";
|
||||
import { emitWebProbeSentinelSpan } from "./hwlab-node-web-sentinel-otel";
|
||||
|
||||
function printQuickVerifyProgress(state: SentinelCicdState, runId: string | null, phase: string, status: string, extra: Record<string, unknown> = {}): void {
|
||||
const compactExtra = Object.fromEntries(Object.entries(extra).map(([key, value]) => {
|
||||
@@ -60,12 +62,30 @@ export function runSentinelQuickVerify(state: SentinelCicdState, reason: string,
|
||||
const maxSeconds = numberAt(state.cicd, "targetValidation.maxSeconds");
|
||||
const scenario = findScenario(state, scenarioId);
|
||||
if (scenario === null) return { ok: false, status: "blocked", reason: "scenario-not-found", scenarioId, valuesRedacted: true };
|
||||
const runId = `sentinel-run-${Date.now().toString(36)}-${randomUUID().slice(0, 8)}`;
|
||||
emitWebProbeSentinelSpan(sentinelOtelContext(state), "web_probe_sentinel.quick_verify.job_start", {
|
||||
scenarioId,
|
||||
runId,
|
||||
cadence: stringAtNullable(scenario, "cadence"),
|
||||
status: "running",
|
||||
valuesRedacted: true,
|
||||
});
|
||||
const commandSequence = arrayAt(scenario, "commandSequence").map(record);
|
||||
const needsPromptSet = commandSequence.some((item) => stringAt(item, "type") === "sendPrompt" && inlinePromptText(item) === null);
|
||||
const prompts = needsPromptSet
|
||||
? readPromptSetForScenario(state, scenario)
|
||||
: { ok: true as const, prompts: [], summary: { source: "not-required", promptCount: 0, valuesRedacted: true } };
|
||||
if (!prompts.ok) return { ok: false, status: "blocked", reason: "prompt-source-unavailable", promptSource: prompts, valuesRedacted: true };
|
||||
if (!prompts.ok) {
|
||||
emitWebProbeSentinelSpan(sentinelOtelContext(state), "web_probe_sentinel.quick_verify.job_finish", {
|
||||
scenarioId,
|
||||
runId,
|
||||
status: "blocked",
|
||||
exitCode: 1,
|
||||
failureKind: "prompt-source-unavailable",
|
||||
valuesRedacted: true,
|
||||
}, false);
|
||||
return { ok: false, status: "blocked", reason: "prompt-source-unavailable", promptSource: prompts, valuesRedacted: true };
|
||||
}
|
||||
const accountEnv = quickVerifyAccountEnv(state);
|
||||
if (!accountEnv.ok) {
|
||||
const findings = [{
|
||||
@@ -78,7 +98,7 @@ export function runSentinelQuickVerify(state: SentinelCicdState, reason: string,
|
||||
}];
|
||||
return recordQuickVerify(state, {
|
||||
ok: false,
|
||||
runId: `sentinel-run-${Date.now().toString(36)}-${randomUUID().slice(0, 8)}`,
|
||||
runId,
|
||||
scenarioId,
|
||||
reason,
|
||||
status: "blocked",
|
||||
@@ -104,7 +124,6 @@ export function runSentinelQuickVerify(state: SentinelCicdState, reason: string,
|
||||
const hardBudgetSeconds = Math.min(timeoutSeconds, Math.max(maxSeconds, numberAt(scenario, "maxRunSeconds")));
|
||||
const elapsedWarnings = () => targetValidationElapsedWarnings(elapsedMs(), "quick verify confirm-wait", warningBudgetSeconds);
|
||||
const deadline = Date.now() + hardBudgetSeconds * 1000;
|
||||
const runId = `sentinel-run-${Date.now().toString(36)}-${randomUUID().slice(0, 8)}`;
|
||||
printQuickVerifyProgress(state, runId, "start", "running", { scenarioId, reason, warningBudgetSeconds, hardBudgetSeconds, timeoutSeconds });
|
||||
const steps: Record<string, unknown>[] = [];
|
||||
const startArgs = [
|
||||
@@ -659,9 +678,29 @@ function recordQuickVerify(state: SentinelCicdState, payload: Record<string, unk
|
||||
maintenance: payload.reason === "maintenance-stop",
|
||||
valuesRedacted: true,
|
||||
}, 60);
|
||||
emitWebProbeSentinelSpan(sentinelOtelContext(state), "web_probe_sentinel.quick_verify.job_finish", {
|
||||
scenarioId: payload.scenarioId,
|
||||
runId: payload.runId,
|
||||
observerId: payload.observerId,
|
||||
status: payload.status,
|
||||
exitCode: payload.ok === true && recordResult.ok === true ? 0 : 1,
|
||||
failureKind: payload.failure ?? (recordResult.ok === true ? null : "record-run-failed"),
|
||||
valuesRedacted: true,
|
||||
}, payload.ok === true && recordResult.ok === true);
|
||||
return withWarnings({ ...payload, views, recordResult, valuesRedacted: true }, recordResult.ok === true ? [] : ["quick verify completed but sentinel report index record failed; report/dashboard may lag until record payload is reduced or retried."]);
|
||||
}
|
||||
|
||||
/**
 * Projects the quick-verify CI/CD state onto the fields the OTLP span emitter needs:
 * node and lane from `state.spec`, the sentinel id, the `namespace` entry of the runtime
 * config, and the runtime/cicd config trees themselves.
 */
function sentinelOtelContext(state: SentinelCicdState): { readonly node: string; readonly lane: string; readonly sentinelId: string; readonly namespace: string | null; readonly runtime: Record<string, unknown>; readonly cicd: Record<string, unknown> } {
  const { spec, sentinelId, runtime, cicd } = state;
  const namespace = stringAtNullable(runtime, "namespace");
  return {
    node: spec.nodeId,
    lane: spec.lane,
    sentinelId,
    namespace,
    runtime,
    cicd,
  };
}
|
||||
|
||||
function compactQuickVerifyRecordViews(views: Record<string, unknown>): Record<string, unknown> {
|
||||
const compacted: Record<string, unknown> = {};
|
||||
for (const [key, value] of Object.entries(views)) {
|
||||
|
||||
@@ -4,6 +4,7 @@
|
||||
// SPEC: PJ2026-01060508 Web哨兵 draft-2026-06-26-p9-multi-web-probe-sentinel.
|
||||
// SPEC: PJ2026-01060508 Web哨兵 draft-2026-06-27-p11-monitor-web-observability-dashboard.
|
||||
// SPEC: PJ2026-01060508 Web哨兵 draft-2026-06-28-p13-1206-multi-runner-boundaries.
|
||||
// SPEC: PJ2026-01060508 Web哨兵 draft-2026-07-01-p15-cadence-otel.
|
||||
// Responsibility: Persistent HTTP wrapper service for web-probe observe scheduling, index, health, metrics, maintenance, and dashboard.
|
||||
import { Buffer } from "node:buffer";
|
||||
import { createHash, randomUUID } from "node:crypto";
|
||||
@@ -14,6 +15,7 @@ import { renderWebProbeSentinelDashboardHtml, webProbeSentinelDashboardAssetResp
|
||||
import { webProbeSentinelConfigPlan, type WebProbeSentinelConfigPlan } from "./hwlab-node-web-sentinel-config";
|
||||
import type { HwlabRuntimeLaneSpec } from "./hwlab-node-lanes";
|
||||
import { effectiveWebProbeSentinelPublicExposure, resolveWebProbeSentinel, readConfigRefTarget as readSentinelConfigRefTarget } from "./hwlab-node-web-sentinel-resolver";
|
||||
import { emitWebProbeSentinelSpan, webProbeSentinelOtelSummary } from "./hwlab-node-web-sentinel-otel";
|
||||
|
||||
const DASHBOARD_CONTRACT_VERSION = "draft-2026-06-27-p11-monitor-web-observability-dashboard";
|
||||
const DASHBOARD_MAX_TEXT_BYTES = 16_000;
|
||||
@@ -130,6 +132,7 @@ export function createWebProbeSentinelService(options: WebProbeSentinelServiceOp
|
||||
let schedulerLastError: string | null = null;
|
||||
writeMetadata(db, "service.boot", { at: schedulerHeartbeatAt, restoredInterruptedRuns: restored, valuesRedacted: true });
|
||||
writeMetadata(db, "scheduler.heartbeat", { at: schedulerHeartbeatAt, loop: "boot" });
|
||||
emitSchedulerHeartbeatSpan(config, "boot", schedulerHeartbeatAt, true);
|
||||
|
||||
const service: WebProbeSentinelService = {
|
||||
config,
|
||||
@@ -139,15 +142,21 @@ export function createWebProbeSentinelService(options: WebProbeSentinelServiceOp
|
||||
if (!schedulerEnabled || schedulerTimer !== null) return;
|
||||
schedulerHeartbeatAt = nowIso();
|
||||
writeMetadata(db, "scheduler.heartbeat", { at: schedulerHeartbeatAt, loop: "started" });
|
||||
emitSchedulerHeartbeatSpan(config, "started", schedulerHeartbeatAt, true);
|
||||
schedulerTimer = setInterval(() => {
|
||||
try {
|
||||
schedulerHeartbeatAt = nowIso();
|
||||
writeMetadata(db, "scheduler.heartbeat", { at: schedulerHeartbeatAt, loop: "tick" });
|
||||
writeMetadata(db, "scheduler.summary", schedulerSummary(config, db));
|
||||
const summary = schedulerSummary(config, db);
|
||||
writeMetadata(db, "scheduler.summary", summary);
|
||||
emitSchedulerHeartbeatSpan(config, "tick", schedulerHeartbeatAt, true);
|
||||
emitCadenceExpectedSpan(config, summary);
|
||||
if (summary.rootCause === "planned-run-not-consumed-by-host-cadence") emitSchedulerGapSpan(config, summary);
|
||||
schedulerLastError = null;
|
||||
} catch (error) {
|
||||
schedulerLastError = error instanceof Error ? error.message : String(error);
|
||||
writeMetadata(db, "scheduler.error", { at: nowIso(), message: schedulerLastError });
|
||||
emitSchedulerHeartbeatSpan(config, "tick-error", nowIso(), false, schedulerLastError);
|
||||
}
|
||||
}, config.schedulerIntervalMs);
|
||||
},
|
||||
@@ -249,7 +258,9 @@ export function createWebProbeSentinelService(options: WebProbeSentinelServiceOp
|
||||
return { ok: true, runId, scenarioId, status: "planned", commandPlanSha256: sha256Json(commandPlan), valuesRedacted: true };
|
||||
},
|
||||
recordRun(input: Record<string, unknown>) {
|
||||
return recordRunResult(config, db, input);
|
||||
const result = recordRunResult(config, db, input);
|
||||
emitRecordRunSpan(config, input, result);
|
||||
return result;
|
||||
},
|
||||
report(view: string, runId: string | null) {
|
||||
return reportRunView(config, db, view, runId);
|
||||
@@ -636,6 +647,66 @@ function schedulerSummary(config: WebProbeSentinelServiceConfig, db: Database):
|
||||
};
|
||||
}
|
||||
|
||||
/**
 * Emits a `web_probe_sentinel.scheduler.heartbeat` span for one scheduler heartbeat.
 *
 * @param config      Service configuration used to build the OTLP context.
 * @param loop        Heartbeat phase supplied by the caller (call sites pass "boot",
 *                    "started", "tick" and "tick-error"); recorded as the `loop`
 *                    attribute so the phases can be told apart in traces.
 * @param at          Heartbeat timestamp, recorded as `heartbeatAt`.
 * @param ok          Whether the heartbeat succeeded; drives `status` and the span status.
 * @param failureKind Optional failure description; omitted from the span when null.
 */
function emitSchedulerHeartbeatSpan(config: WebProbeSentinelServiceConfig, loop: string, at: string, ok: boolean, failureKind: string | null = null): void {
  emitWebProbeSentinelSpan(sentinelOtelContext(config), "web_probe_sentinel.scheduler.heartbeat", {
    status: ok ? "ok" : "error",
    failureKind,
    namespace: stringOrNull(config.runtime.namespace),
    heartbeatAt: at,
    // Previously accepted but never reported, which made all heartbeat spans look alike.
    loop,
    cadence: firstEnabledScenarioCadence(config),
    valuesRedacted: true,
  }, ok);
}
|
||||
|
||||
/**
 * Emits a `web_probe_sentinel.cadence.expected` span from a scheduler summary.
 *
 * The cadence counts as "ok" while the summary carries no `rootCause` (null or undefined);
 * any root cause marks it "stale" and sets the span status to error.
 */
function emitCadenceExpectedSpan(config: WebProbeSentinelServiceConfig, summary: Record<string, unknown>): void {
  const context = sentinelOtelContext(config);
  const onSchedule = summary.rootCause == null;
  const attributes: Record<string, unknown> = {
    cadence: firstEnabledScenarioCadence(config),
    scenarioId: firstEnabledScenarioId(config),
    status: onSchedule ? "ok" : "stale",
    activeRunCount: summary.activeRuns ?? null,
    plannedRunCount: summary.plannedRuns ?? null,
    valuesRedacted: true,
  };
  emitWebProbeSentinelSpan(context, "web_probe_sentinel.cadence.expected", attributes, onSchedule);
}
|
||||
|
||||
/**
 * Emits a `web_probe_sentinel.scheduler_gap.detected` error span for a stale planned run.
 *
 * Scenario and run ids come from the summary's oldest planned run when present; the
 * scenario id otherwise falls back to the first enabled scenario.
 */
function emitSchedulerGapSpan(config: WebProbeSentinelServiceConfig, summary: Record<string, unknown>): void {
  const context = sentinelOtelContext(config);
  const attributes: Record<string, unknown> = {
    cadence: firstEnabledScenarioCadence(config),
    scenarioId: summary.oldestPlannedRunScenarioId ?? firstEnabledScenarioId(config),
    runId: summary.oldestPlannedRunId ?? null,
    status: "planned-run-stale",
    failureKind: summary.rootCause,
    valuesRedacted: true,
  };
  // A detected gap is always reported with error status.
  emitWebProbeSentinelSpan(context, "web_probe_sentinel.scheduler_gap.detected", attributes, false);
}
|
||||
|
||||
/**
 * Emits a `web_probe_sentinel.record_run` span after a run result has been recorded.
 *
 * Identifiers and status prefer the values in `result` and fall back to `input`. The span
 * is OK only when `result.ok === true`; otherwise `failureKind` carries `result.error`, or
 * "record-run-failed" when no error is present.
 */
function emitRecordRunSpan(config: WebProbeSentinelServiceConfig, input: Record<string, unknown>, result: Record<string, unknown>): void {
  const context = sentinelOtelContext(config);
  const succeeded = result.ok === true;
  const failureKind = succeeded ? null : result.error ?? "record-run-failed";
  const attributes: Record<string, unknown> = {
    scenarioId: result.scenarioId ?? input.scenarioId ?? null,
    runId: result.runId ?? input.runId ?? null,
    observerId: input.observerId ?? null,
    status: result.status ?? input.status ?? null,
    failureKind,
    valuesRedacted: true,
  };
  emitWebProbeSentinelSpan(context, "web_probe_sentinel.record_run", attributes, succeeded);
}
|
||||
|
||||
/**
 * Projects the service configuration onto the fields the OTLP span emitter needs:
 * node, lane, sentinel id, the runtime `namespace` (null when `stringOrNull` rejects it),
 * and the runtime/cicd config trees themselves.
 */
function sentinelOtelContext(config: WebProbeSentinelServiceConfig): { readonly node: string; readonly lane: string; readonly sentinelId: string; readonly namespace: string | null; readonly runtime: Record<string, unknown>; readonly cicd: Record<string, unknown> } {
  const { node, lane, sentinelId, runtime, cicd } = config;
  const namespace = stringOrNull(runtime.namespace);
  return { node, lane, sentinelId, namespace, runtime, cicd };
}
|
||||
|
||||
/**
 * Returns the `cadence` of the first scenario whose `enabled` flag is set, normalised via
 * `stringOrNull`, or null when no scenario is enabled.
 */
function firstEnabledScenarioCadence(config: WebProbeSentinelServiceConfig): string | null {
  for (const candidate of config.scenarios) {
    if (boolAt(candidate, "enabled")) return stringOrNull(candidate.cadence);
  }
  return null;
}
|
||||
|
||||
function renderMetrics(config: WebProbeSentinelServiceConfig, db: Database, health: Record<string, unknown>, maintenance: MaintenanceState): string {
|
||||
const counts = runCounts(config, db);
|
||||
const heartbeat = record(readMetadata(db, "scheduler.heartbeat"));
|
||||
@@ -740,6 +811,12 @@ function dashboardOverview(config: WebProbeSentinelServiceConfig, db: Database,
|
||||
const severityCounts = globalSeverityCounts(config, db);
|
||||
const latestUpdatedAt = latestRow === null ? null : stringOrNull(latestRow.updated_at);
|
||||
const latestRunAgeSeconds = latestUpdatedAt === null ? null : ageSeconds(latestUpdatedAt);
|
||||
const heartbeatAgeSeconds = numberOr(record(record(health.checks).scheduler).heartbeatAgeSeconds, -1);
|
||||
const expectedCadence = firstEnabledScenarioCadence(config);
|
||||
const expectedCadenceSeconds = durationStringSeconds(expectedCadence);
|
||||
const staleMultiple = expectedCadenceSeconds === null || latestRunAgeSeconds === null ? null : latestRunAgeSeconds / expectedCadenceSeconds;
|
||||
const freshnessWarningMultiple = numberAt(config.runtime, "scheduler.freshnessWarningMultiple");
|
||||
const scheduler = schedulerSummary(config, db);
|
||||
return {
|
||||
ok: health.ok === true,
|
||||
contractVersion: DASHBOARD_CONTRACT_VERSION,
|
||||
@@ -750,7 +827,7 @@ function dashboardOverview(config: WebProbeSentinelServiceConfig, db: Database,
|
||||
publicOrigin: stringOrNull(config.publicExposure.publicBaseUrl),
|
||||
configReady: config.plan.ok,
|
||||
health,
|
||||
scheduler: schedulerSummary(config, db),
|
||||
scheduler,
|
||||
maintenance,
|
||||
latestRun,
|
||||
runCounts: runCounts(config, db),
|
||||
@@ -758,8 +835,30 @@ function dashboardOverview(config: WebProbeSentinelServiceConfig, db: Database,
|
||||
freshness: {
|
||||
latestRunUpdatedAt: latestUpdatedAt,
|
||||
latestRunAgeSeconds,
|
||||
schedulerHeartbeatAgeSeconds: numberOr(record(record(health.checks).scheduler).heartbeatAgeSeconds, -1),
|
||||
schedulerHeartbeatAgeSeconds: heartbeatAgeSeconds,
|
||||
latestAnalyzedReportAgeSeconds: latestRow === null || stringOrNull(latestRow.report_json_sha256) === null ? null : latestRunAgeSeconds,
|
||||
},
|
||||
cadence: {
|
||||
expectedCadence,
|
||||
expectedCadenceSeconds,
|
||||
schedulerHeartbeatAgeSeconds: heartbeatAgeSeconds,
|
||||
latestRunAgeSeconds,
|
||||
latestAnalyzedReportAgeSeconds: latestRow === null || stringOrNull(latestRow.report_json_sha256) === null ? null : latestRunAgeSeconds,
|
||||
activeRuns: scheduler.activeRuns ?? null,
|
||||
plannedRuns: scheduler.plannedRuns ?? null,
|
||||
nextRun: null,
|
||||
staleMultiple,
|
||||
freshnessWarningMultiple,
|
||||
status: scheduler.rootCause === "planned-run-not-consumed-by-host-cadence" ? "blocker" : staleMultiple !== null && staleMultiple > freshnessWarningMultiple ? "warning" : "fresh",
|
||||
cronJob: {
|
||||
observed: false,
|
||||
status: "control-plane-status-required",
|
||||
reason: "runner API does not query Kubernetes CronJob objects; use web-probe sentinel control-plane status for CronJob counts, lastScheduleTime and latest Jobs.",
|
||||
valuesRedacted: true,
|
||||
},
|
||||
valuesRedacted: true,
|
||||
},
|
||||
observability: webProbeSentinelOtelSummary(sentinelOtelContext(config)),
|
||||
targetValidation: {
|
||||
scenarioId: stringOrNull(record(config.cicd.targetValidation).scenarioId),
|
||||
maxSeconds: numberOr(record(config.cicd.targetValidation).maxSeconds, 120),
|
||||
|
||||
Reference in New Issue
Block a user