feat(sentinel): restore JD01 cadence cronjob visibility

This commit is contained in:
Codex
2026-07-01 06:43:30 +00:00
parent 622804f95d
commit c7ea34f253
12 changed files with 503 additions and 28 deletions
@@ -1,4 +1,5 @@
// SPEC: PJ2026-01060508 Web哨兵 draft-2026-06-27-p12-cadence-scheduler-monitor-web.
// SPEC: PJ2026-01060508 Web哨兵 draft-2026-07-01-p15-cadence-otel.
// Responsibility: Vue monitor-web runtime for sentinel trend, timeline, detail and finding observability.
import { createApp, computed, nextTick, onMounted, ref, watch } from "./vendor/vue.esm-browser.prod.js";
@@ -156,18 +157,28 @@ createApp({
return `运行记录 · ${checkScopeRun.value?.id || "未选择"}`;
});
const cadence = computed(() => {
const apiCadence = overview.value?.cadence || {};
const intervalMs = Number(overview.value?.scheduler?.intervalMs || 0);
const latestAge = Number(overview.value?.freshness?.latestRunAgeSeconds ?? -1);
const heartbeatAge = Number(overview.value?.freshness?.schedulerHeartbeatAgeSeconds ?? -1);
const intervalSeconds = intervalMs > 0 ? Math.round(intervalMs / 1000) : 0;
const stale = intervalSeconds > 0 && latestAge > intervalSeconds * 2;
const latestAge = Number(apiCadence.latestRunAgeSeconds ?? overview.value?.freshness?.latestRunAgeSeconds ?? -1);
const heartbeatAge = Number(apiCadence.schedulerHeartbeatAgeSeconds ?? overview.value?.freshness?.schedulerHeartbeatAgeSeconds ?? -1);
const intervalSeconds = Number(apiCadence.expectedCadenceSeconds || 0) || (intervalMs > 0 ? Math.round(intervalMs / 1000) : 0);
const status = String(apiCadence.status || "");
const stale = status === "warning" || status === "blocker";
const cronJob = apiCadence.cronJob || {};
const observability = overview.value?.observability || {};
return {
intervalSeconds,
latestAge,
heartbeatAge,
status,
stale,
blocker: status === "blocker",
cronJob,
observability,
label: intervalSeconds > 0 ? `${formatDuration(intervalSeconds)} 间隔` : "未配置",
alert: stale ? `最近运行 ${formatDuration(latestAge)} 前,超过预设间隔 2 倍;按 SPEC 作为非阻塞报警展示。` : "运行新鲜度在预设窗口内",
alert: stale
? `最近运行 ${formatDuration(latestAge)} 前;状态 ${status || "warning"},阈值来自 YAML。`
: "运行新鲜度在 YAML 窗口内",
};
});
const healthChecks = computed(() => {
@@ -668,6 +679,14 @@ createApp({
<span>调度新鲜度</span>
<strong>{{ cadence.latestAge >= 0 ? formatDuration(cadence.latestAge) : "-" }}</strong>
</div>
<div class="metric" :class="{ warning: cadence.cronJob?.status !== 'ok' }">
<span>CronJob</span>
<strong>{{ cadence.cronJob?.status || "-" }}</strong>
</div>
<div class="metric" :class="{ warning: cadence.observability?.coverage === 'instrumentation-gap' }">
<span>OTel</span>
<strong>{{ cadence.observability?.coverage || "-" }}</strong>
</div>
<div class="metric">
<span>历史错误样本</span>
<strong>{{ redCount({ severityCounts: severityTotals }) }}</strong>
+145 -14
View File
@@ -6,6 +6,7 @@
// SPEC: PJ2026-01060508 Web哨兵 draft-2026-06-27-p12-cadence-scheduler-monitor-web.
// SPEC: PJ2026-01060508 Web哨兵 draft-2026-06-28-p13-1206-multi-runner-boundaries.
// SPEC: PJ2026-01060508 Web哨兵 draft-2026-06-30-p14-sentinel-cicd-visibility.
// SPEC: PJ2026-01060508 Web哨兵 draft-2026-07-01-p15-cadence-otel.
// Responsibility: YAML-first CI/CD, image, GitOps and Argo command plan for the web-probe sentinel.
import { createHash, randomUUID } from "node:crypto";
import { existsSync, readFileSync } from "node:fs";
@@ -20,6 +21,7 @@ import type { HwlabRuntimeLaneSpec } from "./hwlab-node-lanes";
import type { RenderedCliResult } from "./output";
import { probeSentinelRuntimeHealthEndpoint, runSentinelDashboard, runSentinelMaintenance, runSentinelReport, runSentinelValidate } from "./hwlab-node-web-sentinel-p5";
import { runChildCli, sentinelP5Next } from "./hwlab-node-web-sentinel-p5-observe";
import { emitWebProbeSentinelSpan, webProbeSentinelOtelSummary } from "./hwlab-node-web-sentinel-otel";
export type WebProbeSentinelConfigAction = "plan" | "status";
export type WebProbeSentinelImageAction = "status" | "build";
@@ -178,6 +180,7 @@ interface SentinelObservedStatus {
readonly gitops: Record<string, unknown>;
readonly argo: Record<string, unknown>;
readonly runtime: Record<string, unknown>;
readonly cadence: Record<string, unknown>;
readonly wait?: Record<string, unknown>;
}
@@ -215,7 +218,7 @@ export interface ChildCliResult {
readonly result: CompactCommandResult & { stdoutTail: string; stderrTail: string };
}
const SPEC_REF = "PJ2026-01060508 Web哨兵 draft-2026-06-30-p14-sentinel-cicd-visibility";
const SPEC_REF = "PJ2026-01060508 Web哨兵 draft-2026-07-01-p15-cadence-otel";
export function runWebProbeSentinelCommand(spec: HwlabRuntimeLaneSpec, options: WebProbeSentinelOptions): RenderedCliResult {
if (options.kind === "config") return withWebProbeSentinelConfigRendered(webProbeSentinelConfigPlan(spec, options.action, options.sentinelId));
@@ -317,6 +320,14 @@ function runSentinelControlPlane(state: SentinelCicdState, options: Extract<WebP
objects: manifestObjectSummary(state.manifests),
sha256: state.manifestSha256,
},
observability: webProbeSentinelOtelSummary({
node: state.spec.nodeId,
lane: state.spec.lane,
sentinelId: state.sentinelId,
namespace: stringAt(state.runtime, "namespace"),
runtime: state.runtime,
cicd: state.cicd,
}),
observed,
warnings: observedWarnings,
blocker: null,
@@ -924,8 +935,24 @@ function renderSentinelManifests(
const servicePort = numberAt(runtime, "servicePort");
const pvcStorage = stringAt(runtime, "pvcStorage");
const stateRoot = stringAt(runtime, "stateRoot");
const sentinelEnv = sentinelContainerEnv(sentinelId, secrets);
const sentinelEnv = sentinelContainerEnv(sentinelId, runtime, cicd, secrets);
const cadenceJob = sentinelCadenceCronJobPlan(spec, sentinelId, runtime, cicd, scenarios, image.ref, sentinelEnv);
if (cadenceJob !== null) {
emitWebProbeSentinelSpan({
node: spec.nodeId,
lane: spec.lane,
sentinelId,
namespace,
runtime,
cicd,
}, "web_probe_sentinel.cadence.cronjob_rendered", {
cronJobName: record(cadenceJob.metadata).name ?? null,
namespace,
cadence: record(cadenceJob.metadata).annotations === undefined ? null : record(record(cadenceJob.metadata).annotations)["unidesk.ai/cadence"],
schedule: record(cadenceJob.spec).schedule ?? null,
valuesRedacted: true,
});
}
return [
{
apiVersion: "v1",
@@ -1060,8 +1087,16 @@ function renderSentinelManifests(
];
}
function sentinelContainerEnv(sentinelId: string, secrets: Record<string, unknown>): readonly Record<string, unknown>[] {
function sentinelContainerEnv(sentinelId: string, runtime: Record<string, unknown>, cicd: Record<string, unknown>, secrets: Record<string, unknown>): readonly Record<string, unknown>[] {
const env: Record<string, unknown>[] = [{ name: "UNIDESK_WEB_PROBE_SENTINEL_ID", value: sentinelId }];
const otelEnabled = booleanAtNullable(runtime, "observability.otel.enabled") ?? booleanAtNullable(cicd, "observability.otel.enabled") ?? false;
const otelEndpoint = stringAtNullable(runtime, "observability.otel.tracesEndpoint")
?? stringAtNullable(runtime, "observability.otel.endpoint")
?? stringAtNullable(cicd, "observability.otel.tracesEndpoint")
?? stringAtNullable(cicd, "observability.otel.endpoint");
const otelServiceName = stringAtNullable(runtime, "observability.otel.serviceName") ?? stringAtNullable(cicd, "observability.otel.serviceName");
const otelSampler = stringAtNullable(runtime, "observability.otel.sampler") ?? stringAtNullable(cicd, "observability.otel.sampler");
const otelSamplerArg = stringAtNullable(runtime, "observability.otel.samplerArg") ?? stringAtNullable(cicd, "observability.otel.samplerArg");
const sourcesByPurpose = new Map<string, Record<string, unknown>>();
for (const source of arrayAt(secrets, "sources").map(record)) {
const purpose = stringAtNullable(source, "purpose");
@@ -1074,6 +1109,12 @@ function sentinelContainerEnv(sentinelId: string, secrets: Record<string, unknow
used.add(name);
env.push(item);
};
if (otelEnabled) {
if (otelEndpoint !== null) pushEnv({ name: "OTEL_EXPORTER_OTLP_TRACES_ENDPOINT", value: otelEndpoint });
if (otelServiceName !== null) pushEnv({ name: "OTEL_SERVICE_NAME", value: otelServiceName });
if (otelSampler !== null) pushEnv({ name: "OTEL_TRACES_SAMPLER", value: otelSampler });
if (otelSamplerArg !== null) pushEnv({ name: "OTEL_TRACES_SAMPLER_ARG", value: otelSamplerArg });
}
for (const runtimeSecret of arrayAt(secrets, "runtimeSecrets").map(record)) {
const secretName = stringAtNullable(runtimeSecret, "name");
if (secretName === null) continue;
@@ -1102,6 +1143,7 @@ function sentinelCadenceCronJobPlan(
imageRef: string,
sentinelEnv: readonly Record<string, unknown>[],
): Record<string, unknown> | null {
const scheduler = record(valueAtPath(cicd, "targetValidation.cadenceScheduler"));
const cadenceSchedulerEnabled = booleanAtNullable(cicd, "targetValidation.cadenceScheduler.enabled") === true;
if (!cadenceSchedulerEnabled) return null;
const scenarioId = stringAtNullable(cicd, "targetValidation.scenarioId");
@@ -1114,9 +1156,12 @@ function sentinelCadenceCronJobPlan(
const namespace = stringAt(runtime, "namespace");
const deploymentName = stringAt(runtime, "deploymentName");
const serviceAccountName = stringAt(runtime, "serviceAccountName");
const timeoutSeconds = numberAtNullable(cicd, "targetValidation.maxSeconds") ?? numberAtNullable(scenario, "maxRunSeconds") ?? 300;
const timeoutSeconds = numberAt(cicd, "targetValidation.maxSeconds");
const activeDeadlineSlackSeconds = numberAt(scheduler, "activeDeadlineSlackSeconds");
const mainServerHost = stringAtNullable(cicd, "scheduler.mainServerHost");
const name = safeKubernetesSegment(`${deploymentName}-quick-verify`, 52);
const name = sentinelCadenceCronJobName(deploymentName);
const concurrencyPolicy = stringAt(scheduler, "concurrencyPolicy");
if (!["Allow", "Forbid", "Replace"].includes(concurrencyPolicy)) throw new Error("targetValidation.cadenceScheduler.concurrencyPolicy must be Allow, Forbid or Replace");
const labels = {
"app.kubernetes.io/name": name,
"app.kubernetes.io/part-of": "hwlab-web-probe-sentinel",
@@ -1137,19 +1182,20 @@ function sentinelCadenceCronJobPlan(
annotations: {
"unidesk.ai/cadence": String(scenario.cadence),
"unidesk.ai/target-validation-max-seconds": String(timeoutSeconds),
"unidesk.ai/source": "targetValidation.cadenceScheduler",
},
},
spec: {
schedule,
concurrencyPolicy: "Forbid",
successfulJobsHistoryLimit: 3,
failedJobsHistoryLimit: 5,
startingDeadlineSeconds: Math.max(60, cadenceSeconds),
concurrencyPolicy,
successfulJobsHistoryLimit: numberAt(scheduler, "successfulJobsHistoryLimit"),
failedJobsHistoryLimit: numberAt(scheduler, "failedJobsHistoryLimit"),
startingDeadlineSeconds: numberAt(scheduler, "startingDeadlineSeconds"),
jobTemplate: {
spec: {
activeDeadlineSeconds: timeoutSeconds + 60,
ttlSecondsAfterFinished: 86400,
backoffLimit: 0,
activeDeadlineSeconds: timeoutSeconds + activeDeadlineSlackSeconds,
ttlSecondsAfterFinished: numberAt(scheduler, "ttlSecondsAfterFinished"),
backoffLimit: numberAt(scheduler, "backoffLimit"),
template: {
metadata: { labels },
spec: {
@@ -1190,6 +1236,10 @@ function sentinelCadenceCronJobPlan(
};
}
/**
 * Derives the cadence CronJob name from the sentinel deployment name.
 *
 * The `-quick-verify` suffix is appended and the result is passed through
 * `safeKubernetesSegment` with a 52-character budget.
 * NOTE(review): 52 presumably mirrors the Kubernetes CronJob name limit — confirm.
 *
 * @param deploymentName Name of the sentinel Deployment the CronJob belongs to.
 * @returns The sanitized CronJob name.
 */
function sentinelCadenceCronJobName(deploymentName: string): string {
  const rawName = `${deploymentName}-quick-verify`;
  return safeKubernetesSegment(rawName, 52);
}
function scenarioRows(value: unknown): Record<string, unknown>[] {
if (Array.isArray(value)) return value.map(record);
if (!isRecord(value)) return [];
@@ -1605,6 +1655,7 @@ function sentinelSkippedObservedStatus(reason: string): SentinelObservedStatus {
gitops: skipped,
argo: skipped,
runtime: skipped,
cadence: skipped,
wait: {
polls: 0,
elapsedMs: 0,
@@ -1633,6 +1684,7 @@ function collectSentinelObservedStatus(state: SentinelCicdState, timeoutSeconds:
gitops,
argo: probeArgoApplication(state, timeoutSeconds, effectiveExpectation.gitopsRevision),
runtime: probeRuntimeObjects(state, timeoutSeconds, effectiveExpectation.runtimeImage),
cadence: probeCadenceCronJob(state, timeoutSeconds),
};
}
@@ -1668,13 +1720,15 @@ function sentinelObservedReady(value: Record<string, unknown> | SentinelObserved
&& gitMirrorReady
&& record(observed.gitops).ok === true
&& record(observed.argo).ok === true
&& record(observed.runtime).ok === true;
&& record(observed.runtime).ok === true
&& record(observed.cadence).ok === true;
}
function sentinelObservedWarnings(value: Record<string, unknown> | SentinelObservedStatus | null): string[] {
const observed = record(value);
const argo = record(observed.argo);
return mergeWarnings(argo.warning);
const cadence = record(observed.cadence);
return mergeWarnings(argo.warning, cadence.warning);
}
function probeSourceMirror(state: SentinelCicdState, timeoutSeconds: number): Record<string, unknown> {
@@ -1900,6 +1954,74 @@ function probeRuntimeObjects(state: SentinelCicdState, timeoutSeconds: number, e
return { ok: result.exitCode === 0 && probe?.ok === true, probe, result: compactCommand(result) };
}
/**
 * Probes the live cluster for the cadence CronJob that the rendered manifests expect.
 *
 * When the manifests contain no CronJob the check is reported as skipped (and ok).
 * Otherwise a shell script is run through the control-plane kube route; it fetches the
 * CronJob and its cadence-scheduler Jobs with kubectl and summarises them as one JSON
 * line (presence, schedule match, suspension, recent jobs). A
 * `web_probe_sentinel.cadence.cronjob_observed` span is emitted with the outcome.
 *
 * @param state Sentinel CI/CD state holding manifests, runtime/cicd config and the kube route.
 * @param timeoutSeconds Caller budget; the probe itself is capped at 60 seconds.
 * @returns Observed-status record with `ok`, the parsed `probe`, the compact command
 *          result and a `warning` string when the CronJob is not ready.
 */
function probeCadenceCronJob(state: SentinelCicdState, timeoutSeconds: number): Record<string, unknown> {
  const expected = state.manifests.find((item) => item.kind === "CronJob") ?? null;
  if (expected === null) {
    return { ok: true, skipped: true, reason: "targetValidation.cadenceScheduler.disabled", valuesRedacted: true };
  }
  const metadata = record(expected.metadata);
  const spec = record(expected.spec);
  const namespace = stringAt(metadata, "namespace");
  const name = stringAt(metadata, "name");
  const expectedSchedule = stringAt(spec, "schedule");
  const script = [
    "set +e",
    `namespace=${shellQuote(namespace)}`,
    `cronjob=${shellQuote(name)}`,
    `sentinel=${shellQuote(state.sentinelId)}`,
    `expected_schedule=${shellQuote(expectedSchedule)}`,
    "tmp=$(mktemp -d)",
    // Remove the scratch directory when the shell exits so repeated probes do not pile up
    // temp dirs on the remote host; an EXIT trap leaves the script's exit status untouched.
    "trap 'rm -rf \"$tmp\"' EXIT",
    "kubectl -n \"$namespace\" get cronjob \"$cronjob\" -o json >\"$tmp/cronjob.json\" 2>/dev/null; echo $? >\"$tmp/cronjob.rc\"",
    "kubectl -n \"$namespace\" get jobs -l \"unidesk.ai/web-probe-sentinel-id=$sentinel,app.kubernetes.io/component=cadence-scheduler\" -o json >\"$tmp/jobs.json\" 2>/dev/null; echo $? >\"$tmp/jobs.rc\"",
    "node - \"$tmp\" \"$namespace\" \"$cronjob\" \"$expected_schedule\" <<'NODE'",
    "const fs = require('node:fs');",
    "const [dir, namespace, cronJobName, expectedSchedule] = process.argv.slice(2);",
    "function rc(name){ try { return Number(fs.readFileSync(`${dir}/${name}.rc`, 'utf8').trim()); } catch { return 1; } }",
    "function json(name){ try { return JSON.parse(fs.readFileSync(`${dir}/${name}.json`, 'utf8')); } catch { return null; } }",
    "const cron = json('cronjob');",
    "const jobs = Array.isArray(json('jobs')?.items) ? json('jobs').items : [];",
    "const present = rc('cronjob') === 0 && !!cron;",
    "const schedule = cron?.spec?.schedule || null;",
    "const scheduleMatches = present && schedule === expectedSchedule;",
    "const suspended = cron?.spec?.suspend === true;",
    "const active = Array.isArray(cron?.status?.active) ? cron.status.active.length : 0;",
    "const sortedJobs = jobs.slice().sort((a,b)=>String(b?.metadata?.creationTimestamp||'').localeCompare(String(a?.metadata?.creationTimestamp||''))).slice(0,8);",
    "let code = null;",
    "if (!present) code = 'sentinel-cadence-cronjob-missing';",
    "else if (!scheduleMatches) code = 'sentinel-cadence-cronjob-schedule-mismatch';",
    "else if (suspended) code = 'sentinel-cadence-cronjob-suspended';",
    "const latestJob = sortedJobs[0] || null;",
    "console.log(JSON.stringify({ ok: code === null, code, present, namespace, name: cronJobName, schedule, expectedSchedule, scheduleMatches, suspended, lastScheduleTime: cron?.status?.lastScheduleTime || null, lastSuccessfulTime: cron?.status?.lastSuccessfulTime || null, active, jobCount: jobs.length, latestJobs: sortedJobs.map((job)=>({ name: job?.metadata?.name || null, createdAt: job?.metadata?.creationTimestamp || null, active: Number(job?.status?.active || 0), succeeded: Number(job?.status?.succeeded || 0), failed: Number(job?.status?.failed || 0), completionTime: job?.status?.completionTime || null, valuesRedacted:true })), latestJobName: latestJob?.metadata?.name || null, valuesRedacted: true }));",
    "NODE",
  ].join("\n");
  // The probe never waits longer than 60 seconds, whatever budget the caller passes.
  const result = runCommand(["trans", stringAt(state.controlPlaneNode, "kubeRoute"), "sh", "--", script], repoRoot, { timeoutMs: Math.min(timeoutSeconds, 60) * 1000 });
  const probe = parseJsonObject(result.stdout);
  // Both the transport and the in-cluster verdict must succeed for the check to pass.
  const ok = result.exitCode === 0 && probe?.ok === true;
  emitWebProbeSentinelSpan({
    node: state.spec.nodeId,
    lane: state.spec.lane,
    sentinelId: state.sentinelId,
    namespace,
    runtime: state.runtime,
    cicd: state.cicd,
  }, "web_probe_sentinel.cadence.cronjob_observed", {
    cronJobName: name,
    namespace,
    schedule: expectedSchedule,
    status: ok ? "ok" : text(probe?.code ?? "unknown"),
    jobName: probe?.latestJobName ?? null,
    failureKind: probe?.code ?? null,
    valuesRedacted: true,
  }, ok);
  return {
    ok,
    probe,
    result: compactCommand(result),
    warning: ok ? null : `cadence CronJob is not ready: ${text(probe?.code ?? "probe-failed")}`,
    valuesRedacted: true,
  };
}
function expectedRuntimeImageFromRegistry(state: SentinelCicdState, registry: Record<string, unknown>): string | null {
const digest = nonEmptyString(record(record(registry).probe).digest);
if (digest === null) return null;
@@ -3816,6 +3938,7 @@ function renderControlPlaneResult(result: Record<string, unknown>): string {
const gitops = record(result.gitops);
const argo = record(result.argo);
const validation = record(result.validation);
const observability = record(result.observability);
const observed = record(result.observed);
const sourceMirrorSync = record(result.sourceMirrorSync);
const publish = record(result.publish);
@@ -3841,6 +3964,8 @@ function renderControlPlaneResult(result: Record<string, unknown>): string {
"",
table(["SCENARIO", "MAX_SECONDS", "CI_WAIT", "QVERIFY", "SECOND_PATH"], [[validation.scenarioId, validation.maxSeconds, validation.controlPlaneWaitMaxSeconds ?? "-", validation.quickVerifyMode ?? "-", validation.automaticSecondPath]]),
"",
Object.keys(observability).length === 0 ? "OTEL\n-" : table(["ENABLED", "ENDPOINT", "SERVICE", "COVERAGE"], [[observability.enabled, observability.endpointConfigured, observability.serviceName, observability.coverage]]),
"",
renderObservedStatus(observed),
"",
Object.keys(sourceMirrorSync).length === 0 ? "SOURCE_MIRROR_SYNC\n-" : table(["OK", "PHASE", "JOB", "COMMIT", "ELAPSED"], [[sourceMirrorSync.ok, sourceMirrorSync.phase, sourceMirrorSync.jobName, short(record(sourceMirrorSync.payload).mirrorCommit), sourceMirrorSync.elapsedMs ?? "-"]]),
@@ -3913,6 +4038,7 @@ function renderObservedStatus(observed: Record<string, unknown>): string {
observedStatusRow("gitops", observed.gitops),
observedStatusRow("argo", observed.argo),
observedStatusRow("runtime", observed.runtime),
observedStatusRow("cadence", observed.cadence),
].filter((row) => row !== null);
if (rows.length === 0) return "OBSERVED\n-";
return table(["CHECK", "OK", "DETAIL", "EXIT", "TIMED_OUT", "PREVIEW"], rows);
@@ -3944,6 +4070,11 @@ function observedDetail(name: string, item: Record<string, unknown>): string {
const deployment = record(probe.deployment);
return `ready=${deployment.readyReplicas ?? "-"} image=${short(deployment.image)}/${short(deployment.expectedImage)}`;
}
if (name === "cadence") {
if (item.skipped === true) return `${item.reason ?? "skipped"}`;
const probe = record(item.probe);
return `${probe.code ?? "ok"} schedule=${probe.schedule ?? "-"}/${probe.expectedSchedule ?? "-"} last=${probe.lastScheduleTime ?? "-"} jobs=${probe.jobCount ?? "-"}`;
}
return "-";
}
+143
View File
@@ -0,0 +1,143 @@
// SPEC: PJ2026-01060508 Web哨兵 draft-2026-07-01-p15-cadence-otel.
// Responsibility: Best-effort OTLP span emitter for web-probe sentinel scheduler, cadence and quick-verify events.
import { randomBytes } from "node:crypto";
/**
 * Identity and configuration inputs used when emitting or summarising sentinel OTLP spans.
 */
export interface SentinelOtelContext {
// Node identifier; sent as `unidesk.node` and lower-cased into the default service name.
readonly node: string;
// Lane name; sent as both `deployment.environment` and `hwlab.lane`.
readonly lane: string;
// Sentinel identifier; sent as the `sentinelId` span attribute.
readonly sentinelId: string;
// Kubernetes namespace for `k8s.namespace.name`; when absent, `runtime.namespace` is used instead.
readonly namespace?: string | null;
// Runtime config; consulted first for `observability.otel.*` settings.
readonly runtime?: Record<string, unknown>;
// CI/CD config; consulted for `observability.otel.*` settings after `runtime`.
readonly cicd?: Record<string, unknown>;
}
/**
 * Emits one best-effort OTLP/JSON span describing a sentinel event.
 *
 * Nothing is sent when OTel is disabled or no endpoint is resolved. The span is posted
 * fire-and-forget: the call returns immediately, failures are swallowed, and the request
 * is aborted after a fixed timeout so an unresponsive collector cannot keep it pending
 * indefinitely. Telemetry problems never propagate to the caller.
 *
 * @param context Node/lane/sentinel identity plus the config used to resolve the endpoint.
 * @param name Span name.
 * @param attributes Extra span attributes; they override the default attributes on key clash.
 * @param ok Whether the span status is reported as OK (`true`) or ERROR (`false`).
 */
export function emitWebProbeSentinelSpan(context: SentinelOtelContext, name: string, attributes: Record<string, unknown> = {}, ok = true): void {
  const config = resolveOtelConfig(context);
  if (!config.enabled || config.endpoint === null) return;
  // Matches the 10 s default export timeout of the OTLP exporter specification.
  const exportTimeoutMs = 10_000;
  try {
    // The span is a point-in-time marker: it starts now and lasts a nominal 1 ms.
    const start = BigInt(Date.now()) * 1_000_000n;
    const end = start + 1_000_000n;
    const traceId = randomHex(16);
    const spanId = randomHex(8);
    const payload = {
      resourceSpans: [{
        resource: {
          attributes: otelAttributes({
            "service.name": config.serviceName,
            "deployment.environment": context.lane,
            "unidesk.node": context.node,
            "hwlab.lane": context.lane,
            "k8s.namespace.name": context.namespace ?? stringAtNullable(context.runtime, "namespace"),
            "unidesk.values_redacted": true,
          }),
        },
        scopeSpans: [{
          scope: { name: "unidesk.web_probe_sentinel", version: "PJ2026-01060508" },
          spans: [{
            traceId,
            spanId,
            name,
            kind: 1, // SPAN_KIND_INTERNAL in the OTLP trace schema.
            startTimeUnixNano: start.toString(),
            endTimeUnixNano: end.toString(),
            attributes: otelAttributes({
              "unidesk.node": context.node,
              "hwlab.lane": context.lane,
              "sentinelId": context.sentinelId,
              "valuesRedacted": true,
              ...attributes,
            }),
            status: { code: ok ? 1 : 2 }, // 1 = OK, 2 = ERROR in the OTLP trace schema.
          }],
        }],
      }],
    };
    const body = JSON.stringify(payload);
    const controller = new AbortController();
    const timer = setTimeout(() => controller.abort(), exportTimeoutMs);
    void fetch(config.endpoint, {
      method: "POST",
      headers: { "content-type": "application/json" },
      body,
      signal: controller.signal,
    })
      .catch(() => undefined)
      .finally(() => clearTimeout(timer));
  } catch {
    // Deliberately ignored: span export is best-effort and must never break the caller.
  }
}
/**
 * Summarises the resolved OTel configuration for status and dashboard output.
 *
 * Coverage is reported as `best-effort-otlp-spans` only when OTel is enabled and an
 * endpoint is resolved; otherwise it is `instrumentation-gap`. The endpoint itself is
 * not included, only whether one is configured.
 *
 * @param context Identity and config used to resolve the OTel settings.
 * @returns A redacted summary record including the list of expected span names.
 */
export function webProbeSentinelOtelSummary(context: SentinelOtelContext): Record<string, unknown> {
  const { enabled, endpoint, serviceName } = resolveOtelConfig(context);
  const endpointConfigured = endpoint !== null;
  const exporting = enabled && endpointConfigured;
  const expectedSpans = [
    "web_probe_sentinel.scheduler.heartbeat",
    "web_probe_sentinel.cadence.expected",
    "web_probe_sentinel.cadence.cronjob_rendered",
    "web_probe_sentinel.cadence.cronjob_observed",
    "web_probe_sentinel.quick_verify.job_start",
    "web_probe_sentinel.quick_verify.job_finish",
    "web_probe_sentinel.record_run",
    "web_probe_sentinel.scheduler_gap.detected",
  ];
  return {
    enabled,
    endpointConfigured,
    serviceName,
    coverage: exporting ? "best-effort-otlp-spans" : "instrumentation-gap",
    expectedSpans,
    valuesRedacted: true,
  };
}
/**
 * Resolves the effective OTel settings from runtime config, CI/CD config and environment.
 *
 * Lookup order for each setting is runtime, then cicd, then the matching `OTEL_*`
 * environment variable. `OTEL_SDK_DISABLED=1|true` always disables emission. An explicit
 * `observability.otel.enabled` flag is honoured in both directions; only when the flag is
 * absent is emission enabled implicitly by the presence of an endpoint.
 *
 * @param context Identity and config sources for the lookup.
 * @returns Whether emission is enabled, the traces endpoint (or null) and the service name.
 */
function resolveOtelConfig(context: SentinelOtelContext): { readonly enabled: boolean; readonly endpoint: string | null; readonly serviceName: string } {
  const runtime = context.runtime ?? {};
  const cicd = context.cicd ?? {};
  const enabledFromYaml = booleanAtNullable(runtime, "observability.otel.enabled")
    ?? booleanAtNullable(cicd, "observability.otel.enabled");
  const disabledByEnv = /^(1|true)$/iu.test(process.env.OTEL_SDK_DISABLED ?? "");
  const endpoint = stringAtNullable(runtime, "observability.otel.tracesEndpoint")
    ?? stringAtNullable(runtime, "observability.otel.endpoint")
    ?? stringAtNullable(cicd, "observability.otel.tracesEndpoint")
    ?? stringAtNullable(cicd, "observability.otel.endpoint")
    ?? nonEmptyString(process.env.OTEL_EXPORTER_OTLP_TRACES_ENDPOINT);
  const serviceName = stringAtNullable(runtime, "observability.otel.serviceName")
    ?? stringAtNullable(cicd, "observability.otel.serviceName")
    ?? nonEmptyString(process.env.OTEL_SERVICE_NAME)
    ?? `hwlab-web-probe-sentinel-${context.node.toLowerCase()}`;
  return {
    // An explicit `enabled: false` must win even when an endpoint is configured;
    // the endpoint only implies "enabled" when the flag is not set at all.
    enabled: !disabledByEnv && (enabledFromYaml ?? (endpoint !== null)),
    endpoint,
    serviceName,
  };
}
/**
 * Converts a plain key/value record into an OTLP attribute list.
 *
 * Entries whose value is `undefined` or `null` are omitted; all other values are
 * encoded with `otelValue`.
 *
 * @param values Attribute names mapped to raw values.
 * @returns `{ key, value }` entries in the record's own enumeration order.
 */
function otelAttributes(values: Record<string, unknown>): readonly Record<string, unknown>[] {
  const encoded: Record<string, unknown>[] = [];
  for (const [key, raw] of Object.entries(values)) {
    if (raw === undefined || raw === null) continue;
    encoded.push({ key, value: otelValue(raw) });
  }
  return encoded;
}
/**
 * Encodes a raw attribute value as an OTLP/JSON `AnyValue` record.
 *
 * Booleans, strings, safe integers and finite doubles map to their native OTLP fields.
 * Everything else is stringified, and encoding never throws: bigint becomes `intValue`,
 * non-finite numbers become their textual form, and values that JSON cannot serialise
 * (circular structures, functions, symbols) fall back to a plain string description.
 *
 * @param value Raw attribute value.
 * @returns A single-field record such as `{ stringValue }` or `{ intValue }`.
 */
function otelValue(value: unknown): Record<string, unknown> {
  if (typeof value === "boolean") return { boolValue: value };
  if (typeof value === "string") return { stringValue: value };
  if (typeof value === "bigint") return { intValue: value.toString() };
  if (typeof value === "number") {
    // NaN/Infinity have no JSON number form, so they are carried as text.
    if (!Number.isFinite(value)) return { stringValue: String(value) };
    // Only safe integers are emitted as intValue; larger magnitudes would stringify
    // in exponent notation, which is not a valid integer literal.
    return Number.isSafeInteger(value) ? { intValue: String(value) } : { doubleValue: value };
  }
  let serialized: string | undefined;
  try {
    // Yields undefined for functions/symbols and throws on circular structures.
    serialized = JSON.stringify(value);
  } catch {
    serialized = undefined;
  }
  if (serialized !== undefined) return { stringValue: serialized };
  // Object.prototype.toString is used for objects because String() throws on
  // objects that lack a usable toString (e.g. null-prototype objects).
  const fallback = typeof value === "object" ? Object.prototype.toString.call(value) : String(value);
  return { stringValue: fallback };
}
/**
 * Generates a random lowercase hex string.
 *
 * @param bytes Number of random bytes to draw; the result has twice as many characters.
 * @returns Hex encoding of the random bytes.
 */
function randomHex(bytes: number): string {
  const entropy = randomBytes(bytes);
  return entropy.toString("hex");
}
/**
 * Reads a non-empty string at a dotted path.
 *
 * @param value Root value to search.
 * @param path Dot-separated property path.
 * @returns The string found at the path, or `null` when it is missing, empty or not a string.
 */
function stringAtNullable(value: unknown, path: string): string | null {
  const candidate = valueAtPath(value, path);
  if (typeof candidate !== "string") return null;
  return candidate.length > 0 ? candidate : null;
}
/**
 * Reads a boolean at a dotted path.
 *
 * @param value Root value to search.
 * @param path Dot-separated property path.
 * @returns The boolean found at the path, or `null` when it is missing or not a boolean.
 */
function booleanAtNullable(value: unknown, path: string): boolean | null {
  const candidate = valueAtPath(value, path);
  if (typeof candidate === "boolean") return candidate;
  return null;
}
/**
 * Narrows an arbitrary value to a non-empty string.
 *
 * @param value Value to inspect.
 * @returns The value itself when it is a string with at least one character, else `null`.
 */
function nonEmptyString(value: unknown): string | null {
  if (typeof value !== "string" || value.length === 0) return null;
  return value;
}
/**
 * Walks a dot-separated path through nested plain objects.
 *
 * Traversal stops with `undefined` as soon as an intermediate value is not a
 * non-null, non-array object. The final value is returned as-is, whatever its type.
 *
 * @param value Root value to search.
 * @param path Dot-separated property path.
 * @returns The value at the path, or `undefined` when the path cannot be followed.
 */
function valueAtPath(value: unknown, path: string): unknown {
  return path.split(".").reduce<unknown>((node, segment) => {
    const traversable = typeof node === "object" && node !== null && !Array.isArray(node);
    return traversable ? (node as Record<string, unknown>)[segment] : undefined;
  }, value);
}
@@ -1,4 +1,5 @@
// SPEC: PJ2026-01060508 Web哨兵 draft-2026-06-27-p11-monitor-web-observability-dashboard.
// SPEC: PJ2026-01060508 Web哨兵 draft-2026-07-01-p15-cadence-otel.
// Responsibility: Quick-verify observe orchestration and artifact interpretation for web-probe sentinel P5 validation.
import { createHash, randomUUID } from "node:crypto";
import { existsSync, readFileSync } from "node:fs";
@@ -32,6 +33,7 @@ import {
text,
withWarnings,
} from "./hwlab-node-web-sentinel-cicd";
import { emitWebProbeSentinelSpan } from "./hwlab-node-web-sentinel-otel";
function printQuickVerifyProgress(state: SentinelCicdState, runId: string | null, phase: string, status: string, extra: Record<string, unknown> = {}): void {
const compactExtra = Object.fromEntries(Object.entries(extra).map(([key, value]) => {
@@ -60,12 +62,30 @@ export function runSentinelQuickVerify(state: SentinelCicdState, reason: string,
const maxSeconds = numberAt(state.cicd, "targetValidation.maxSeconds");
const scenario = findScenario(state, scenarioId);
if (scenario === null) return { ok: false, status: "blocked", reason: "scenario-not-found", scenarioId, valuesRedacted: true };
const runId = `sentinel-run-${Date.now().toString(36)}-${randomUUID().slice(0, 8)}`;
emitWebProbeSentinelSpan(sentinelOtelContext(state), "web_probe_sentinel.quick_verify.job_start", {
scenarioId,
runId,
cadence: stringAtNullable(scenario, "cadence"),
status: "running",
valuesRedacted: true,
});
const commandSequence = arrayAt(scenario, "commandSequence").map(record);
const needsPromptSet = commandSequence.some((item) => stringAt(item, "type") === "sendPrompt" && inlinePromptText(item) === null);
const prompts = needsPromptSet
? readPromptSetForScenario(state, scenario)
: { ok: true as const, prompts: [], summary: { source: "not-required", promptCount: 0, valuesRedacted: true } };
if (!prompts.ok) return { ok: false, status: "blocked", reason: "prompt-source-unavailable", promptSource: prompts, valuesRedacted: true };
if (!prompts.ok) {
emitWebProbeSentinelSpan(sentinelOtelContext(state), "web_probe_sentinel.quick_verify.job_finish", {
scenarioId,
runId,
status: "blocked",
exitCode: 1,
failureKind: "prompt-source-unavailable",
valuesRedacted: true,
}, false);
return { ok: false, status: "blocked", reason: "prompt-source-unavailable", promptSource: prompts, valuesRedacted: true };
}
const accountEnv = quickVerifyAccountEnv(state);
if (!accountEnv.ok) {
const findings = [{
@@ -78,7 +98,7 @@ export function runSentinelQuickVerify(state: SentinelCicdState, reason: string,
}];
return recordQuickVerify(state, {
ok: false,
runId: `sentinel-run-${Date.now().toString(36)}-${randomUUID().slice(0, 8)}`,
runId,
scenarioId,
reason,
status: "blocked",
@@ -104,7 +124,6 @@ export function runSentinelQuickVerify(state: SentinelCicdState, reason: string,
const hardBudgetSeconds = Math.min(timeoutSeconds, Math.max(maxSeconds, numberAt(scenario, "maxRunSeconds")));
const elapsedWarnings = () => targetValidationElapsedWarnings(elapsedMs(), "quick verify confirm-wait", warningBudgetSeconds);
const deadline = Date.now() + hardBudgetSeconds * 1000;
const runId = `sentinel-run-${Date.now().toString(36)}-${randomUUID().slice(0, 8)}`;
printQuickVerifyProgress(state, runId, "start", "running", { scenarioId, reason, warningBudgetSeconds, hardBudgetSeconds, timeoutSeconds });
const steps: Record<string, unknown>[] = [];
const startArgs = [
@@ -659,9 +678,29 @@ function recordQuickVerify(state: SentinelCicdState, payload: Record<string, unk
maintenance: payload.reason === "maintenance-stop",
valuesRedacted: true,
}, 60);
emitWebProbeSentinelSpan(sentinelOtelContext(state), "web_probe_sentinel.quick_verify.job_finish", {
scenarioId: payload.scenarioId,
runId: payload.runId,
observerId: payload.observerId,
status: payload.status,
exitCode: payload.ok === true && recordResult.ok === true ? 0 : 1,
failureKind: payload.failure ?? (recordResult.ok === true ? null : "record-run-failed"),
valuesRedacted: true,
}, payload.ok === true && recordResult.ok === true);
return withWarnings({ ...payload, views, recordResult, valuesRedacted: true }, recordResult.ok === true ? [] : ["quick verify completed but sentinel report index record failed; report/dashboard may lag until record payload is reduced or retried."]);
}
/**
 * Builds the OTel span context for a quick-verify run from the sentinel CI/CD state.
 *
 * @param state Sentinel CI/CD state providing node, lane, sentinel id and config records.
 * @returns Context whose namespace is read from `runtime.namespace` (null when absent).
 */
function sentinelOtelContext(state: SentinelCicdState): { readonly node: string; readonly lane: string; readonly sentinelId: string; readonly namespace: string | null; readonly runtime: Record<string, unknown>; readonly cicd: Record<string, unknown> } {
  const { spec, sentinelId, runtime, cicd } = state;
  const namespace = stringAtNullable(runtime, "namespace");
  return { node: spec.nodeId, lane: spec.lane, sentinelId, namespace, runtime, cicd };
}
function compactQuickVerifyRecordViews(views: Record<string, unknown>): Record<string, unknown> {
const compacted: Record<string, unknown> = {};
for (const [key, value] of Object.entries(views)) {
+103 -4
View File
@@ -4,6 +4,7 @@
// SPEC: PJ2026-01060508 Web哨兵 draft-2026-06-26-p9-multi-web-probe-sentinel.
// SPEC: PJ2026-01060508 Web哨兵 draft-2026-06-27-p11-monitor-web-observability-dashboard.
// SPEC: PJ2026-01060508 Web哨兵 draft-2026-06-28-p13-1206-multi-runner-boundaries.
// SPEC: PJ2026-01060508 Web哨兵 draft-2026-07-01-p15-cadence-otel.
// Responsibility: Persistent HTTP wrapper service for web-probe observe scheduling, index, health, metrics, maintenance, and dashboard.
import { Buffer } from "node:buffer";
import { createHash, randomUUID } from "node:crypto";
@@ -14,6 +15,7 @@ import { renderWebProbeSentinelDashboardHtml, webProbeSentinelDashboardAssetResp
import { webProbeSentinelConfigPlan, type WebProbeSentinelConfigPlan } from "./hwlab-node-web-sentinel-config";
import type { HwlabRuntimeLaneSpec } from "./hwlab-node-lanes";
import { effectiveWebProbeSentinelPublicExposure, resolveWebProbeSentinel, readConfigRefTarget as readSentinelConfigRefTarget } from "./hwlab-node-web-sentinel-resolver";
import { emitWebProbeSentinelSpan, webProbeSentinelOtelSummary } from "./hwlab-node-web-sentinel-otel";
const DASHBOARD_CONTRACT_VERSION = "draft-2026-06-27-p11-monitor-web-observability-dashboard";
const DASHBOARD_MAX_TEXT_BYTES = 16_000;
@@ -130,6 +132,7 @@ export function createWebProbeSentinelService(options: WebProbeSentinelServiceOp
let schedulerLastError: string | null = null;
writeMetadata(db, "service.boot", { at: schedulerHeartbeatAt, restoredInterruptedRuns: restored, valuesRedacted: true });
writeMetadata(db, "scheduler.heartbeat", { at: schedulerHeartbeatAt, loop: "boot" });
emitSchedulerHeartbeatSpan(config, "boot", schedulerHeartbeatAt, true);
const service: WebProbeSentinelService = {
config,
@@ -139,15 +142,21 @@ export function createWebProbeSentinelService(options: WebProbeSentinelServiceOp
if (!schedulerEnabled || schedulerTimer !== null) return;
schedulerHeartbeatAt = nowIso();
writeMetadata(db, "scheduler.heartbeat", { at: schedulerHeartbeatAt, loop: "started" });
emitSchedulerHeartbeatSpan(config, "started", schedulerHeartbeatAt, true);
schedulerTimer = setInterval(() => {
try {
schedulerHeartbeatAt = nowIso();
writeMetadata(db, "scheduler.heartbeat", { at: schedulerHeartbeatAt, loop: "tick" });
writeMetadata(db, "scheduler.summary", schedulerSummary(config, db));
const summary = schedulerSummary(config, db);
writeMetadata(db, "scheduler.summary", summary);
emitSchedulerHeartbeatSpan(config, "tick", schedulerHeartbeatAt, true);
emitCadenceExpectedSpan(config, summary);
if (summary.rootCause === "planned-run-not-consumed-by-host-cadence") emitSchedulerGapSpan(config, summary);
schedulerLastError = null;
} catch (error) {
schedulerLastError = error instanceof Error ? error.message : String(error);
writeMetadata(db, "scheduler.error", { at: nowIso(), message: schedulerLastError });
emitSchedulerHeartbeatSpan(config, "tick-error", nowIso(), false, schedulerLastError);
}
}, config.schedulerIntervalMs);
},
@@ -249,7 +258,9 @@ export function createWebProbeSentinelService(options: WebProbeSentinelServiceOp
return { ok: true, runId, scenarioId, status: "planned", commandPlanSha256: sha256Json(commandPlan), valuesRedacted: true };
},
recordRun(input: Record<string, unknown>) {
return recordRunResult(config, db, input);
const result = recordRunResult(config, db, input);
emitRecordRunSpan(config, input, result);
return result;
},
report(view: string, runId: string | null) {
return reportRunView(config, db, view, runId);
@@ -636,6 +647,66 @@ function schedulerSummary(config: WebProbeSentinelServiceConfig, db: Database):
};
}
/**
 * Emits a `web_probe_sentinel.scheduler.heartbeat` span for one scheduler heartbeat.
 *
 * @param config - Service configuration; supplies the span context, the runtime namespace and the scenario cadence.
 * @param loop - Heartbeat phase label supplied by the caller.
 *   NOTE(review): this value is currently not forwarded into the span attributes — confirm whether that is intended.
 * @param at - Heartbeat timestamp, reported as `heartbeatAt`.
 * @param ok - Whether the heartbeat succeeded; selects the `"ok"` / `"error"` status and is passed on as the final emitter argument.
 * @param failureKind - Optional failure description attached to the span; defaults to `null`.
 */
function emitSchedulerHeartbeatSpan(config: WebProbeSentinelServiceConfig, loop: string, at: string, ok: boolean, failureKind: string | null = null): void {
  const context = sentinelOtelContext(config);
  emitWebProbeSentinelSpan(
    context,
    "web_probe_sentinel.scheduler.heartbeat",
    {
      status: !ok ? "error" : "ok",
      failureKind,
      namespace: stringOrNull(config.runtime.namespace),
      heartbeatAt: at,
      cadence: firstEnabledScenarioCadence(config),
      valuesRedacted: true,
    },
    ok,
  );
}
/**
 * Emits a `web_probe_sentinel.cadence.expected` span describing the expected cadence
 * alongside the run counters taken from a scheduler summary.
 *
 * The span is reported as `"ok"` when `summary.rootCause` is null or undefined,
 * and as `"stale"` otherwise.
 *
 * @param config - Service configuration; supplies the span context and the first enabled scenario's id and cadence.
 * @param summary - Scheduler summary record; `rootCause`, `activeRuns` and `plannedRuns` are read from it.
 */
function emitCadenceExpectedSpan(config: WebProbeSentinelServiceConfig, summary: Record<string, unknown>): void {
  // A missing root cause (null or undefined) is treated as a healthy cadence.
  const healthy = summary.rootCause == null;
  const context = sentinelOtelContext(config);
  emitWebProbeSentinelSpan(
    context,
    "web_probe_sentinel.cadence.expected",
    {
      cadence: firstEnabledScenarioCadence(config),
      scenarioId: firstEnabledScenarioId(config),
      status: healthy ? "ok" : "stale",
      activeRunCount: summary.activeRuns ?? null,
      plannedRunCount: summary.plannedRuns ?? null,
      valuesRedacted: true,
    },
    healthy,
  );
}
/**
 * Emits a `web_probe_sentinel.scheduler_gap.detected` span, always flagged as a failure,
 * with the fixed status `"planned-run-stale"`.
 *
 * @param config - Service configuration; supplies the span context, the cadence and a fallback scenario id.
 * @param summary - Scheduler summary record; `oldestPlannedRunScenarioId`, `oldestPlannedRunId` and `rootCause` are read from it.
 */
function emitSchedulerGapSpan(config: WebProbeSentinelServiceConfig, summary: Record<string, unknown>): void {
  const context = sentinelOtelContext(config);
  emitWebProbeSentinelSpan(
    context,
    "web_probe_sentinel.scheduler_gap.detected",
    {
      cadence: firstEnabledScenarioCadence(config),
      // Prefer the scenario recorded on the summary; otherwise fall back to the first enabled scenario.
      scenarioId: summary.oldestPlannedRunScenarioId ?? firstEnabledScenarioId(config),
      runId: summary.oldestPlannedRunId ?? null,
      status: "planned-run-stale",
      failureKind: summary.rootCause,
      valuesRedacted: true,
    },
    false,
  );
}
/**
 * Emits a `web_probe_sentinel.record_run` span for a record-run request and its result.
 *
 * Identifier and status fields are taken from `result` first, then from `input`, and
 * fall back to `null`. The span counts as successful only when `result.ok` is strictly `true`.
 *
 * @param config - Service configuration; supplies the span context.
 * @param input - The record-run request payload (`scenarioId`, `runId`, `observerId`, `status` are read).
 * @param result - The record-run outcome (`scenarioId`, `runId`, `status`, `ok`, `error` are read).
 */
function emitRecordRunSpan(config: WebProbeSentinelServiceConfig, input: Record<string, unknown>, result: Record<string, unknown>): void {
  const succeeded = result.ok === true;
  const context = sentinelOtelContext(config);
  emitWebProbeSentinelSpan(
    context,
    "web_probe_sentinel.record_run",
    {
      scenarioId: result.scenarioId ?? input.scenarioId ?? null,
      runId: result.runId ?? input.runId ?? null,
      observerId: input.observerId ?? null,
      status: result.status ?? input.status ?? null,
      // On failure, report the result's error, or a generic marker when none is present.
      failureKind: succeeded ? null : result.error ?? "record-run-failed",
      valuesRedacted: true,
    },
    succeeded,
  );
}
/**
 * Builds the context object passed to the sentinel OTel helpers from the service configuration.
 *
 * @param config - Service configuration to project.
 * @returns The node, lane and sentinel id, the runtime namespace normalised through `stringOrNull`,
 *   and the `runtime` and `cicd` records passed through by reference.
 */
function sentinelOtelContext(config: WebProbeSentinelServiceConfig): { readonly node: string; readonly lane: string; readonly sentinelId: string; readonly namespace: string | null; readonly runtime: Record<string, unknown>; readonly cicd: Record<string, unknown> } {
  const { node, lane, sentinelId, runtime, cicd } = config;
  const namespace = stringOrNull(runtime.namespace);
  return { node, lane, sentinelId, namespace, runtime, cicd };
}
/**
 * Returns the cadence of the first scenario for which `boolAt(item, "enabled")` holds.
 *
 * @param config - Service configuration whose `scenarios` list is searched in order.
 * @returns The scenario's `cadence` normalised through `stringOrNull`, or `null` when no scenario matches.
 */
function firstEnabledScenarioCadence(config: WebProbeSentinelServiceConfig): string | null {
  const enabledScenario = config.scenarios.find((candidate) => boolAt(candidate, "enabled"));
  if (enabledScenario === undefined) {
    return null;
  }
  return stringOrNull(enabledScenario.cadence);
}
function renderMetrics(config: WebProbeSentinelServiceConfig, db: Database, health: Record<string, unknown>, maintenance: MaintenanceState): string {
const counts = runCounts(config, db);
const heartbeat = record(readMetadata(db, "scheduler.heartbeat"));
@@ -740,6 +811,12 @@ function dashboardOverview(config: WebProbeSentinelServiceConfig, db: Database,
const severityCounts = globalSeverityCounts(config, db);
const latestUpdatedAt = latestRow === null ? null : stringOrNull(latestRow.updated_at);
const latestRunAgeSeconds = latestUpdatedAt === null ? null : ageSeconds(latestUpdatedAt);
const heartbeatAgeSeconds = numberOr(record(record(health.checks).scheduler).heartbeatAgeSeconds, -1);
const expectedCadence = firstEnabledScenarioCadence(config);
const expectedCadenceSeconds = durationStringSeconds(expectedCadence);
const staleMultiple = expectedCadenceSeconds === null || latestRunAgeSeconds === null ? null : latestRunAgeSeconds / expectedCadenceSeconds;
const freshnessWarningMultiple = numberAt(config.runtime, "scheduler.freshnessWarningMultiple");
const scheduler = schedulerSummary(config, db);
return {
ok: health.ok === true,
contractVersion: DASHBOARD_CONTRACT_VERSION,
@@ -750,7 +827,7 @@ function dashboardOverview(config: WebProbeSentinelServiceConfig, db: Database,
publicOrigin: stringOrNull(config.publicExposure.publicBaseUrl),
configReady: config.plan.ok,
health,
scheduler: schedulerSummary(config, db),
scheduler,
maintenance,
latestRun,
runCounts: runCounts(config, db),
@@ -758,8 +835,30 @@ function dashboardOverview(config: WebProbeSentinelServiceConfig, db: Database,
freshness: {
latestRunUpdatedAt: latestUpdatedAt,
latestRunAgeSeconds,
schedulerHeartbeatAgeSeconds: numberOr(record(record(health.checks).scheduler).heartbeatAgeSeconds, -1),
schedulerHeartbeatAgeSeconds: heartbeatAgeSeconds,
latestAnalyzedReportAgeSeconds: latestRow === null || stringOrNull(latestRow.report_json_sha256) === null ? null : latestRunAgeSeconds,
},
cadence: {
expectedCadence,
expectedCadenceSeconds,
schedulerHeartbeatAgeSeconds: heartbeatAgeSeconds,
latestRunAgeSeconds,
latestAnalyzedReportAgeSeconds: latestRow === null || stringOrNull(latestRow.report_json_sha256) === null ? null : latestRunAgeSeconds,
activeRuns: scheduler.activeRuns ?? null,
plannedRuns: scheduler.plannedRuns ?? null,
nextRun: null,
staleMultiple,
freshnessWarningMultiple,
status: scheduler.rootCause === "planned-run-not-consumed-by-host-cadence" ? "blocker" : staleMultiple !== null && staleMultiple > freshnessWarningMultiple ? "warning" : "fresh",
cronJob: {
observed: false,
status: "control-plane-status-required",
reason: "runner API does not query Kubernetes CronJob objects; use web-probe sentinel control-plane status for CronJob counts, lastScheduleTime and latest Jobs.",
valuesRedacted: true,
},
valuesRedacted: true,
},
observability: webProbeSentinelOtelSummary(sentinelOtelContext(config)),
targetValidation: {
scenarioId: stringOrNull(record(config.cicd.targetValidation).scenarioId),
maxSeconds: numberOr(record(config.cicd.targetValidation).maxSeconds, 120),