diff --git a/config/platform-infra/observability.yaml b/config/platform-infra/observability.yaml new file mode 100644 index 00000000..3319fef1 --- /dev/null +++ b/config/platform-infra/observability.yaml @@ -0,0 +1,122 @@ +version: 1 +kind: platform-infra-observability + +metadata: + id: observability + owner: unidesk + spec: PJ2026-01060501 OTel追踪 draft-2026-06-19-p0 + relatedIssues: + - 489 + +defaults: + targetId: D601 + +images: + collector: + repository: otel/opentelemetry-collector-contrib + tag: 0.130.1 + pullPolicy: IfNotPresent + tempo: + repository: grafana/tempo + tag: 2.8.1 + pullPolicy: IfNotPresent + +targets: + - id: D601 + route: D601:k3s + namespace: platform-infra + role: active + enabled: true + createNamespace: true + +collector: + deploymentName: otel-collector + serviceName: otel-collector + configMapName: otel-collector-config + replicas: 1 + healthPort: 13133 + otlp: + grpcPort: 4317 + httpPort: 4318 + +traceBackend: + type: tempo + deploymentName: tempo + serviceName: tempo + configMapName: tempo-config + replicas: 1 + httpPort: 3200 + otlp: + grpcPort: 4317 + httpPort: 4318 + storage: + mode: emptyDir + retention: 24h + +sampling: + mode: parentbased_traceidratio + ratio: 1 + +instrumentation: + contextPropagation: + - tracecontext + - baggage + serviceConnections: + - serviceName: hwlab-cloud-api + owningRepo: pikasTech/HWLAB + targetNode: D601 + lane: v0.3 + namespace: hwlab-v03 + requiredSpans: + - POST /v1/agent/chat + - durable_admission + - billing_preflight + - agentrun_dispatch + - projection_write + - trace_events_read + - turn_status_read + - serviceName: user-billing + owningRepo: pikasTech/HWLAB + targetNode: D601 + lane: v0.3 + namespace: hwlab-v03 + requiredSpans: + - billing_preflight + - serviceName: agentrun-manager + owningRepo: pikasTech/agentrun + targetNode: D601 + lane: v0.2 + namespace: agentrun-v02 + requiredSpans: + - agentrun_dispatch + - run_created + - command_result + - projection_sync + +resourceAttributes: + 
required: + - service.name + - deployment.environment + - unidesk.node + - hwlab.lane + - k8s.namespace.name + - git.commit + businessCorrelationAttributes: + - traceId + - sessionId + - turnId + - runId + - commandId + +probes: + readinessPath: /ready + traceQueryPathTemplate: /api/traces/{{traceId}} + statusEndpoints: + - name: tempo-ready + service: tempo + portName: http + path: /ready + - name: collector-health + service: otel-collector + portName: health + path: / diff --git a/project-management/PJ2026-01/specs/PJ2026-010605-observability-monitoring.md b/project-management/PJ2026-01/specs/PJ2026-010605-observability-monitoring.md index ba5b4ee6..0498789f 100644 --- a/project-management/PJ2026-01/specs/PJ2026-010605-observability-monitoring.md +++ b/project-management/PJ2026-01/specs/PJ2026-010605-observability-monitoring.md @@ -23,30 +23,32 @@ | 上级规格 | [PJ2026-0106 平台运维](PJ2026-0106-platform-ops.md) | | 规格治理索引 | [规格治理](spec-governance.md) | -本文采用 ISO/IEC/IEEE 29148 需求规格模板的项目裁剪版:正文只保留 HWLAB Prometheus 运维监控的稳定使命、范围、术语、系统边界、内部分工和原子需求。Workbench 用户可感知性能监控的架构图、数据流图、时序图和实现引用要求由 [PJ2026-01060505 Workbench性能](PJ2026-01060505-workbench-performance.md) 细化。 +本文采用 ISO/IEC/IEEE 29148 需求规格模板的项目裁剪版:正文只保留 HWLAB 运维监控的稳定使命、范围、术语、系统边界、内部分工和原子需求。Prometheus 继续承载 metrics、dashboard、alert 和 Workbench 用户可感知性能指标;OpenTelemetry 首期只承载 tracing、span、trace id 关联和跨服务因果链路。Workbench 用户可感知性能监控的架构图、数据流图、时序图和实现引用要求由 [PJ2026-01060505 Workbench性能](PJ2026-01060505-workbench-performance.md) 细化。 ## 2. 
目的和范围 ### 2.1 目的 -运维监控负责通过 Prometheus 和配套状态查询入口采集 HWLAB 运行面 metrics、scrape target、alert rule、服务健康指标和资源指标,使平台运维能够发现服务不可用、入口漂移、资源异常和发布后运行状态退化。 +运维监控负责通过 Prometheus 和 OpenTelemetry 的分工协作采集 HWLAB 运行面 metrics、scrape target、alert rule、服务健康指标、资源指标和关键请求 trace,使平台运维能够发现服务不可用、入口漂移、资源异常、发布后运行状态退化,以及单个 Workbench/AgentRun 请求在 admission、billing、dispatch、projection 或 replay 阶段的失败位置。 -本课题的目标状态是:各业务 L1 提供自己的服务健康和业务状态指标,平台运维统一接入 Prometheus 并输出可查询、可告警、可回溯的运维状态;监控只判断运行面健康和资源状态,不替代业务成功标准。 +本课题的目标状态是:各业务 L1 提供自己的服务健康、业务状态指标和关键链路 trace 语义,平台运维统一接入 Prometheus metrics 面和 OpenTelemetry tracing 面并输出可查询、可告警、可回溯的运维状态;监控只判断运行面健康、资源状态和单次请求因果链,不替代业务成功标准。 -运维监控接入必须遵循 UniDesk YAML-first ops:scrape target、alert rule、recording rule、summary query、target/node/lane、namespace、endpoint、采样/保留/阈值等 UniDesk 自有运维事实必须归属 YAML;受控 CLI 负责读取、校验、渲染、apply/status/summary。运行面对象和 Prometheus 查询结果只作为观测证据,不反推为配置真相。 +运维监控接入必须遵循 UniDesk YAML-first ops:scrape target、alert rule、recording rule、summary query、OTel Collector、trace backend、target/node/lane、namespace、endpoint、采样/保留/阈值等 UniDesk 自有运维事实必须归属 YAML;受控 CLI 负责读取、校验、渲染、apply/status/summary/trace。运行面对象、Prometheus 查询结果和 trace backend 查询结果只作为观测证据,不反推为配置真相。 ### 2.2 范围内 - Prometheus scrape target、metrics endpoint、alert rule 和服务健康指标接入。 +- OpenTelemetry Collector、trace backend、span 语义、trace context 传播和 trace 查询入口。 - Web/API/AgentRun/HWPOD/Harness/用户管理等服务的运行面健康、资源状态、公开入口健康和用户可感知性能观测。 - 发布后 runtime readiness、resource usage、error rate、queue depth、target availability 和 alert 状态摘要。 -- 监控数据的受控查询、低噪声摘要、失败归因和敏感输出约束。 +- 监控和 trace 数据的受控查询、低噪声摘要、失败归因和敏感输出约束。 ### 2.3 范围外 - Agent command、CaseRun、HWPOD operation 或用户账本的业务成功标准由对应 L1 定义。 -- 长 trace、原始日志、测试报告、截图、CaseRun registry 和用户反馈正文不进入本规格。 +- 完整 prompt、assistant 正文、tool 参数、命令输出、原始日志、测试报告、截图、CaseRun registry 和用户反馈正文不进入本规格或默认 trace attribute。 - Prometheus 具体阈值、保留周期、告警路由、采样间隔和容量数值以 YAML/config 为准,不在本规格硬编码。 +- OpenTelemetry Collector image、trace backend image、存储模式、retention、采样率、endpoint 和服务接入关系以 YAML/config 为准,不在业务代码硬编码。 - 监控告警不能代替发布流水、业务验证、CaseRun 
评价或用户反馈分流。 ## 3. 术语表 @@ -54,6 +56,10 @@ | 术语 | 定义 | | --- | --- | | Prometheus 运维监控 | 通过 Prometheus 采集、查询和告警 HWLAB 运行面 metrics 的运维能力。 | +| OpenTelemetry tracing | 通过 OTel Collector、trace backend、span 和 W3C trace context 关联单次请求跨服务因果链路的运维能力。 | +| trace backend | 保存并查询 trace/span 的平台组件,首期可由 Tempo 或 Jaeger 承担,部署事实由 YAML 控制。 | +| span | 一次请求在 admission、billing、AgentRun dispatch、projection、read/replay 等阶段中的一个可观察工作单元。 | +| trace context | W3C `traceparent`/`baggage` 等跨进程传播上下文;业务 `traceId/sessionId/runId/turnId` 只能作为 span attribute,不能替代标准 trace context。 | | 用户可感知性能 | 用户在 Web、CLI 或 API 入口中直接感受到的等待时间、加载时间、首个可读内容出现时间和完整可用状态。 | | YAML-first 运维监控 | Prometheus scrape、rule、summary 和 target 归属先进入 UniDesk YAML,再通过受控 CLI 渲染和验证的运维形态。 | | scrape target | Prometheus 抓取 metrics 的目标服务、path 和 label 集合。 | @@ -67,33 +73,119 @@ | 边界项 | 内容 | | --- | --- | | 外部使用者 | 平台管理员、发布操作人员、值守自动化、需要运行状态的各 L1 owner。 | -| 外部输入 | 服务 metrics endpoint、health/readiness、runtime resource 指标、scrape 配置、alert rule 配置和查询请求。 | -| 受控资源 | Prometheus target、metrics、alert rule、运行面状态摘要、服务健康指标和资源指标。 | -| 外部输出 | target 状态、metrics 查询结果、alert 状态、运行健康摘要、入口健康摘要和 redacted 失败原因。 | -| 用户接口 | Prometheus 查询、受控平台运维 CLI、发布状态摘要和各 L1 health/status 页面或命令。 | -| 系统边界 | 运维监控负责运行面可观察和告警;不定义业务完成标准,不保存长证据,不把监控 green 当作业务通过。 | +| 外部输入 | 服务 metrics endpoint、health/readiness、runtime resource 指标、scrape 配置、alert rule 配置、OTLP trace/span、trace context、trace 查询请求和 metrics 查询请求。 | +| 受控资源 | Prometheus target、metrics、alert rule、OTel Collector、trace backend、运行面状态摘要、服务健康指标、资源指标和 redacted trace 摘要。 | +| 外部输出 | target 状态、metrics 查询结果、alert 状态、trace 查询结果、运行健康摘要、入口健康摘要和 redacted 失败阶段。 | +| 用户接口 | Prometheus 查询、OpenTelemetry trace 查询、受控平台运维 CLI、发布状态摘要和各 L1 health/status 页面或命令。 | +| 系统边界 | 运维监控负责运行面可观察、告警和单次请求因果链;不定义业务完成标准,不保存默认长证据,不把监控 green 或 trace 可查当作业务通过。 | ## 5. 
内部分工与规格索引 | 编号 | 模块或课题 | 规格文档 | 主责边界 | 上游依赖 | 下游支撑 | | --- | --- | --- | --- | --- | --- | -| PJ2026-01060501 | Metrics接入 | 本规格 6.1 | metrics endpoint、scrape target 和 label 口径 | 各 L1 服务健康指标 | Prometheus 查询 | +| PJ2026-01060501 | OTel追踪 | 本规格 6.1 | OTel Collector、trace backend、span 语义、trace context 和受控 trace 查询 | HWLAB v0.3、AgentRun、用户账本、Web工作台 | 平台排障、Prometheus 关联 | | PJ2026-01060502 | 告警规则 | 本规格 6.2 | alert rule、状态分类和配置来源 | Metrics接入、YAML运维 | 平台值守、发布判定 | | PJ2026-01060503 | 运行摘要 | 本规格 6.3 | health、readiness、resource、error rate 和入口健康摘要 | 公开入口、发布流水 | 管理员和各 L1 owner | | PJ2026-01060504 | 边界约束 | 本规格 6.4 | 监控与业务通过、长证据、敏感输出的边界 | 全部 L1 | 用户反馈和排障 | | PJ2026-01060505 | Workbench性能 | [PJ2026-01060505 Workbench性能](PJ2026-01060505-workbench-performance.md) | Web 工作台用户可感知性能、RUM、AgentRun event visible latency 和 Prometheus 指标口径 | Web工作台、Agent编排、API契约 | 平台运维、客户端和性能回归调查 | +| PJ2026-01060506 | Metrics接入 | 本规格 6.6 | metrics endpoint、scrape target 和 label 口径 | 各 L1 服务健康指标 | Prometheus 查询 | ## 6. 原子需求 -### 6.1 OPS-MON-REQ-001 Metrics 接入 +### 6.1 OPS-MON-REQ-001 OTel 追踪 | 编号 | 短名 | 主责模块 | 关联模块 | | --- | --- | --- | --- | -| OPS-MON-REQ-001 | Metrics接入 | PJ2026-01060501 Metrics接入 | [YAML运维](PJ2026-010603-yaml-first-ops.md)、[发布流水](PJ2026-010601-controlled-release.md) | +| OPS-MON-REQ-001 | OTel追踪 | PJ2026-01060501 OTel追踪 | [YAML运维](PJ2026-010603-yaml-first-ops.md)、[Agent编排](PJ2026-0102-agent-orchestration.md)、[客户端](PJ2026-0104-client.md)、[用户管理](PJ2026-0105-user-management.md) | -运维监控应接入 HWLAB 各运行服务的 metrics endpoint 和 scrape target,使服务 live、ready、resource、queue、error、latency 和依赖健康可以被 Prometheus 查询。 +运维监控应在 `platform-infra` 提供 YAML-first OpenTelemetry tracing 能力,使一次 Workbench submit 或等价 API 请求可以沿 Cloud API admission、billing preflight、AgentRun dispatch、projection write、trace events read 和 Web replay 等阶段保留可查询 span。 -各 L1 负责定义自身服务健康指标含义;平台运维负责通过 YAML-first UniDesk ops 统一接入、命名、label 和可查询状态。metrics 缺失应暴露为监控缺口,不得用日志 grep、一次性 curl、手工 `kubectl apply` 或人工截图替代长期监控。 +OpenTelemetry 不替代 
Prometheus。Prometheus 继续承载 metrics、dashboard、alert 和用户可感知性能指标;OTel tracing 只回答单个 `traceId/sessionId/runId/turnId` 跨服务断在哪个阶段。业务 ID 可以作为 span attribute 用于排查,但不能成为 Prometheus label,也不能替代 W3C trace context。 + +#### 6.1.1 目标架构图 + +```mermaid +flowchart LR + W[Workbench Web] --> API[HWLAB cloud-api] + API --> Billing[user-billing] + API --> AR[AgentRun manager] + AR --> Runner[AgentRun runner] + API --> Projection[projection/read model] + API --> Metrics[Prometheus metrics endpoint] + Billing --> Metrics + AR --> Metrics + API -- OTLP spans --> Collector[OTel Collector<br/>platform-infra] + Billing -- OTLP spans --> Collector + AR -- OTLP spans --> Collector + Runner -- OTLP spans --> Collector + Collector --> TraceBackend[Trace backend<br/>Tempo or Jaeger] + Metrics --> Prometheus[Prometheus metrics面] + CLI[UniDesk platform-infra observability CLI] --> Collector + CLI --> TraceBackend + CLI --> Prometheus +``` + +#### 6.1.2 数据流图 + +```mermaid +flowchart TD + Span[应用关键 span] --> Collector[OTel Collector] + Collector --> Backend[trace backend] + Metric[低基数 metrics] --> Prometheus[Prometheus] + Backend --> TraceQuery[CLI trace query] + Prometheus --> MetricQuery[CLI metrics/status query] + TraceQuery --> Diagnose[按 traceId/sessionId/runId 定位失败阶段] + MetricQuery --> Health[运行健康、趋势和告警判断] + Diagnose -.关联.-> Health +``` + +数据流必须保持职责分离:trace backend 保存高基数字段和跨服务因果链;Prometheus 只保存低基数指标、alert 和趋势。CLI 可以把 metrics 摘要与 trace 查询结果放到同一排障报告中,但不能把两套存储合并为一个第二真相。 + +#### 6.1.3 关键时序图 + +```mermaid +sequenceDiagram + participant W as Workbench + participant API as cloud-api + participant B as user-billing + participant AR as AgentRun + participant P as projection + participant OTel as OTel Collector + participant TB as trace backend + + W->>API: POST /v1/agent/chat + API->>OTel: span admission + API->>B: billing_preflight with trace context + B->>OTel: span billing_preflight + API->>AR: dispatch command with trace context + AR->>OTel: span agentrun_dispatch + AR-->>API: runId/commandId + API->>P: projection_write + P->>OTel: span projection_write + W->>API: replay/read trace events + API->>OTel: span trace_events_read + OTel->>TB: export spans +``` + +#### 6.1.4 YAML 与 CLI 归属 + +OTel Collector、trace backend、target route、namespace、image、storage、retention、sampling、endpoint、probe 和应用接入关系必须进入 `config/platform-infra/observability.yaml` 或后续确认的 owning YAML。正式入口必须是 `bun scripts/cli.ts platform-infra observability plan|apply|status|validate|trace` 或等价受控 CLI;`trans :k3s` 只能作为有界诊断底座,不能成为长期 mutate path。 + +首期应用侧手工 span 至少覆盖: + +- `POST /v1/agent/chat` +- `durable_admission` +- `billing_preflight` +- `agentrun_dispatch` +- `projection_write` +- `trace_events_read` +- `turn_status_read` + +统一 resource attributes 至少包含 
`service.name`、`deployment.environment`、`unidesk.node`、`hwlab.lane`、`k8s.namespace.name` 和 `git.commit`。统一传播使用 W3C `traceparent`/`baggage`;`traceId`、`sessionId`、`turnId`、`runId`、`commandId` 只能作为 span attribute。 + +#### 6.1.5 代码引用规则 + +本需求范围内新增或修改的源码文件头部必须标注 `SPEC: PJ2026-01060501 OTel追踪 draft-2026-06-19-p0`,并用一句话说明文件职责。纯 YAML、生成 manifest、锁文件或第三方 CRD 如不能加头部,必须能从 owning YAML、渲染器或 CLI 命令追溯到本 SPEC。 ### 6.2 OPS-MON-REQ-002 告警规则 @@ -135,6 +227,16 @@ Workbench 性能监控只记录低基数指标、阶段耗时、状态分类和脱敏 correlation;不得把 traceId、sessionId、runId、prompt、assistant 正文、tool 参数、stdout/stderr、Secret 或用户个人信息写入 Prometheus label。具体架构、数据流、时序和代码引用规则由 [PJ2026-01060505 Workbench性能](PJ2026-01060505-workbench-performance.md) 定义。 +### 6.6 OPS-MON-REQ-006 Metrics 接入 + +| 编号 | 短名 | 主责模块 | 关联模块 | +| --- | --- | --- | --- | +| OPS-MON-REQ-006 | Metrics接入 | PJ2026-01060506 Metrics接入 | [YAML运维](PJ2026-010603-yaml-first-ops.md)、[发布流水](PJ2026-010601-controlled-release.md) | + +运维监控应接入 HWLAB 各运行服务的 metrics endpoint 和 scrape target,使服务 live、ready、resource、queue、error、latency 和依赖健康可以被 Prometheus 查询。 + +各 L1 负责定义自身服务健康指标含义;平台运维负责通过 YAML-first UniDesk ops 统一接入、命名、label 和可查询状态。metrics 缺失应暴露为监控缺口,不得用日志 grep、一次性 curl、手工 `kubectl apply` 或人工截图替代长期监控。 + ## 7. 过程控制 本规格不单独索引过程 issue;跨 L1 的内测、灰度和阶段活动索引统一保留在 [PJ2026-01 HWLAB 总规格](PJ2026-01-HWLAB.md) 的 `7. 过程控制`。 diff --git a/scripts/src/platform-infra-observability.ts b/scripts/src/platform-infra-observability.ts new file mode 100644 index 00000000..e95e235e --- /dev/null +++ b/scripts/src/platform-infra-observability.ts @@ -0,0 +1,1124 @@ +// SPEC: PJ2026-01060501 OTel追踪 draft-2026-06-19-p0. +// Responsibility: YAML-first platform-infra OpenTelemetry tracing control commands. 
+import { Buffer } from "node:buffer"; +import { readFileSync } from "node:fs"; +import type { UniDeskConfig } from "./config"; +import { rootPath } from "./config"; +import { startJob } from "./jobs"; +import { + compactCapture, + compactUnknown, + createYamlFieldReader, + numberField, + parseJsonOutput, + redactSensitiveUnknown, + shQuote, + capture, +} from "./platform-infra-ops-library"; + +const configFile = rootPath("config", "platform-infra", "observability.yaml"); +const configLabel = "config/platform-infra/observability.yaml"; +const fieldManager = "unidesk-platform-observability"; +const { + asRecord, + objectField, + arrayOfRecords, + stringField, + integerField, + booleanField, + stringArrayField, + numberArrayField, + enumField, + kubernetesNameField, + portField, + apiPathField, +} = createYamlFieldReader(configLabel); + +interface ObservabilityConfig { + version: number; + kind: "platform-infra-observability"; + metadata: { id: string; owner: string; spec: string; relatedIssues: number[] }; + defaults: { targetId: string }; + images: { + collector: ImageSpec; + tempo: ImageSpec; + }; + targets: ObservabilityTarget[]; + collector: { + deploymentName: string; + serviceName: string; + configMapName: string; + replicas: number; + healthPort: number; + otlp: OtlpPorts; + }; + traceBackend: { + type: "tempo"; + deploymentName: string; + serviceName: string; + configMapName: string; + replicas: number; + httpPort: number; + otlp: OtlpPorts; + storage: { mode: "emptyDir"; retention: string }; + }; + sampling: { mode: "parentbased_traceidratio"; ratio: number }; + instrumentation: { + contextPropagation: string[]; + serviceConnections: ServiceConnection[]; + }; + resourceAttributes: { + required: string[]; + businessCorrelationAttributes: string[]; + }; + probes: { + readinessPath: string; + traceQueryPathTemplate: string; + statusEndpoints: StatusEndpoint[]; + }; +} + +interface ImageSpec { + repository: string; + tag: string; + pullPolicy: "Always" | 
"IfNotPresent" | "Never"; +} + +interface ObservabilityTarget { + id: string; + route: string; + namespace: string; + role: "active" | "standby"; + enabled: boolean; + createNamespace: boolean; +} + +interface OtlpPorts { + grpcPort: number; + httpPort: number; +} + +interface ServiceConnection { + serviceName: string; + owningRepo: string; + targetNode: string; + lane: string; + namespace: string; + requiredSpans: string[]; +} + +interface StatusEndpoint { + name: string; + service: string; + portName: string; + path: string; +} + +interface CommonOptions { + targetId: string | null; + full: boolean; + raw: boolean; +} + +interface ApplyOptions extends CommonOptions { + confirm: boolean; + dryRun: boolean; + wait: boolean; +} + +interface TraceOptions extends CommonOptions { + traceId: string | null; +} + +export function observabilityHelp(): Record { + return { + command: "platform-infra observability plan|apply|status|validate|trace", + output: "json", + configTruth: "config/platform-infra/observability.yaml", + spec: "PJ2026-01060501 OTel追踪 draft-2026-06-19-p0", + usage: [ + "bun scripts/cli.ts platform-infra observability plan --target D601", + "bun scripts/cli.ts platform-infra observability apply --target D601 --dry-run", + "bun scripts/cli.ts platform-infra observability apply --target D601 --confirm", + "bun scripts/cli.ts platform-infra observability status --target D601 [--full|--raw]", + "bun scripts/cli.ts platform-infra observability validate --target D601 [--full|--raw]", + "bun scripts/cli.ts platform-infra observability trace --target D601 --trace-id [--full|--raw]", + ], + boundary: "Prometheus remains the metrics source; this command owns only platform-infra OTel Collector, trace backend readiness, and trace lookup.", + }; +} + +export async function runPlatformObservabilityCommand(config: UniDeskConfig, args: string[]): Promise> { + const [action = "plan"] = args; + if (action === "plan") return plan(parseCommonOptions(args.slice(1))); + if 
(action === "apply") return await apply(config, parseApplyOptions(args.slice(1))); + if (action === "status") return await status(config, parseCommonOptions(args.slice(1))); + if (action === "validate") return await validate(config, parseCommonOptions(args.slice(1))); + if (action === "trace") return await trace(config, parseTraceOptions(args.slice(1))); + return { ok: false, error: "unsupported-platform-infra-observability-command", args, help: observabilityHelp() }; +} + +function parseCommonOptions(args: string[]): CommonOptions { + let targetId: string | null = null; + let full = false; + let raw = false; + for (let index = 0; index < args.length; index += 1) { + const arg = args[index]; + if (arg === "--target") { + const value = args[index + 1]; + if (value === undefined || value.startsWith("--")) throw new Error("--target requires a value"); + if (!/^[A-Za-z0-9._-]+$/u.test(value)) throw new Error("--target must be a simple target id"); + targetId = value; + index += 1; + } else if (arg === "--full") { + full = true; + } else if (arg === "--raw") { + raw = true; + full = true; + } else { + throw new Error(`unsupported observability option: ${arg}`); + } + } + return { targetId, full, raw }; +} + +function parseApplyOptions(args: string[]): ApplyOptions { + const commonArgs: string[] = []; + let confirm = false; + let dryRun = false; + let wait = false; + for (let index = 0; index < args.length; index += 1) { + const arg = args[index]; + if (arg === "--confirm") confirm = true; + else if (arg === "--dry-run") dryRun = true; + else if (arg === "--wait") wait = true; + else { + commonArgs.push(arg); + if (arg === "--target") { + commonArgs.push(args[index + 1] ?? 
""); + index += 1; + } + } + } + if (confirm && dryRun) throw new Error("observability apply accepts only one of --confirm or --dry-run"); + return { ...parseCommonOptions(commonArgs), confirm, dryRun: dryRun || !confirm, wait }; +} + +function parseTraceOptions(args: string[]): TraceOptions { + const commonArgs: string[] = []; + let traceId: string | null = null; + for (let index = 0; index < args.length; index += 1) { + const arg = args[index]; + if (arg === "--trace-id") { + const value = args[index + 1]; + if (value === undefined || value.startsWith("--")) throw new Error("--trace-id requires a value"); + if (!/^[A-Za-z0-9._:-]+$/u.test(value)) throw new Error("--trace-id has an unsupported format"); + traceId = value; + index += 1; + } else { + commonArgs.push(arg); + if (arg === "--target") { + commonArgs.push(args[index + 1] ?? ""); + index += 1; + } + } + } + return { ...parseCommonOptions(commonArgs), traceId }; +} + +function readObservabilityConfig(): ObservabilityConfig { + const parsed = Bun.YAML.parse(readFileSync(configFile, "utf8")) as unknown; + const root = asRecord(parsed, configLabel); + const version = integerField(root, "version", ""); + const kind = stringField(root, "kind", ""); + if (kind !== "platform-infra-observability") throw new Error(`${configLabel}.kind must be platform-infra-observability`); + const metadata = objectField(root, "metadata", ""); + const defaults = objectField(root, "defaults", ""); + const images = objectField(root, "images", ""); + const collector = objectField(root, "collector", ""); + const collectorOtlp = objectField(collector, "otlp", "collector"); + const traceBackend = objectField(root, "traceBackend", ""); + const traceBackendOtlp = objectField(traceBackend, "otlp", "traceBackend"); + const traceBackendStorage = objectField(traceBackend, "storage", "traceBackend"); + const sampling = objectField(root, "sampling", ""); + const instrumentation = objectField(root, "instrumentation", ""); + const 
resourceAttributes = objectField(root, "resourceAttributes", ""); + const probes = objectField(root, "probes", ""); + const config: ObservabilityConfig = { + version, + kind, + metadata: { + id: stringField(metadata, "id", "metadata"), + owner: stringField(metadata, "owner", "metadata"), + spec: stringField(metadata, "spec", "metadata"), + relatedIssues: numberArrayField(metadata, "relatedIssues", "metadata"), + }, + defaults: { targetId: stringField(defaults, "targetId", "defaults") }, + images: { + collector: imageSpec(objectField(images, "collector", "images"), "images.collector"), + tempo: imageSpec(objectField(images, "tempo", "images"), "images.tempo"), + }, + targets: arrayOfRecords(root.targets, "targets").map(parseTarget), + collector: { + deploymentName: kubernetesNameField(collector, "deploymentName", "collector"), + serviceName: kubernetesNameField(collector, "serviceName", "collector"), + configMapName: kubernetesNameField(collector, "configMapName", "collector"), + replicas: integerField(collector, "replicas", "collector"), + healthPort: portField(collector, "healthPort", "collector"), + otlp: parseOtlpPorts(collectorOtlp, "collector.otlp"), + }, + traceBackend: { + type: enumField(traceBackend, "type", "traceBackend", ["tempo"] as const), + deploymentName: kubernetesNameField(traceBackend, "deploymentName", "traceBackend"), + serviceName: kubernetesNameField(traceBackend, "serviceName", "traceBackend"), + configMapName: kubernetesNameField(traceBackend, "configMapName", "traceBackend"), + replicas: integerField(traceBackend, "replicas", "traceBackend"), + httpPort: portField(traceBackend, "httpPort", "traceBackend"), + otlp: parseOtlpPorts(traceBackendOtlp, "traceBackend.otlp"), + storage: { + mode: enumField(traceBackendStorage, "mode", "traceBackend.storage", ["emptyDir"] as const), + retention: stringField(traceBackendStorage, "retention", "traceBackend.storage"), + }, + }, + sampling: { + mode: enumField(sampling, "mode", "sampling", 
["parentbased_traceidratio"] as const), + ratio: numberField(sampling, "ratio", "sampling"), + }, + instrumentation: { + contextPropagation: stringArrayField(instrumentation, "contextPropagation", "instrumentation"), + serviceConnections: arrayOfRecords(instrumentation.serviceConnections, "instrumentation.serviceConnections").map(parseServiceConnection), + }, + resourceAttributes: { + required: stringArrayField(resourceAttributes, "required", "resourceAttributes"), + businessCorrelationAttributes: stringArrayField(resourceAttributes, "businessCorrelationAttributes", "resourceAttributes"), + }, + probes: { + readinessPath: apiPathField(probes, "readinessPath", "probes"), + traceQueryPathTemplate: stringField(probes, "traceQueryPathTemplate", "probes"), + statusEndpoints: arrayOfRecords(probes.statusEndpoints, "probes.statusEndpoints").map(parseStatusEndpoint), + }, + }; + if (config.targets.length === 0) throw new Error(`${configLabel}.targets must not be empty`); + assertKnownEnabledTarget(config.targets, config.defaults.targetId, "defaults.targetId"); + if (config.collector.replicas < 0 || config.traceBackend.replicas < 0) throw new Error(`${configLabel} replicas must be >= 0`); + if (config.sampling.ratio < 0 || config.sampling.ratio > 1) throw new Error(`${configLabel}.sampling.ratio must be between 0 and 1`); + if (!config.probes.traceQueryPathTemplate.includes("{{traceId}}")) throw new Error(`${configLabel}.probes.traceQueryPathTemplate must include {{traceId}}`); + return config; +} + +function imageSpec(record: Record, path: string): ImageSpec { + const image = { + repository: stringField(record, "repository", path), + tag: stringField(record, "tag", path), + pullPolicy: enumField(record, "pullPolicy", path, ["Always", "IfNotPresent", "Never"] as const), + }; + if (!/^[A-Za-z0-9._/:@-]+$/u.test(`${image.repository}:${image.tag}`)) throw new Error(`${configLabel}.${path} must render a valid image reference`); + return image; +} + +function parseTarget(record: 
Record, index: number): ObservabilityTarget { + const path = `targets[${index}]`; + return { + id: stringField(record, "id", path), + route: stringField(record, "route", path), + namespace: kubernetesNameField(record, "namespace", path), + role: enumField(record, "role", path, ["active", "standby"] as const), + enabled: booleanField(record, "enabled", path), + createNamespace: booleanField(record, "createNamespace", path), + }; +} + +function parseOtlpPorts(record: Record, path: string): OtlpPorts { + return { + grpcPort: portField(record, "grpcPort", path), + httpPort: portField(record, "httpPort", path), + }; +} + +function parseServiceConnection(record: Record, index: number): ServiceConnection { + const path = `instrumentation.serviceConnections[${index}]`; + return { + serviceName: stringField(record, "serviceName", path), + owningRepo: stringField(record, "owningRepo", path), + targetNode: stringField(record, "targetNode", path), + lane: stringField(record, "lane", path), + namespace: kubernetesNameField(record, "namespace", path), + requiredSpans: stringArrayField(record, "requiredSpans", path), + }; +} + +function parseStatusEndpoint(record: Record, index: number): StatusEndpoint { + const path = `probes.statusEndpoints[${index}]`; + return { + name: stringField(record, "name", path), + service: kubernetesNameField(record, "service", path), + portName: stringField(record, "portName", path), + path: apiPathField(record, "path", path), + }; +} + +function assertKnownEnabledTarget(targets: ObservabilityTarget[], targetId: string, path: string): void { + const target = targets.find((item) => item.id.toLowerCase() === targetId.toLowerCase()); + if (target === undefined) throw new Error(`${configLabel}.${path} references unknown target ${targetId}; known targets: ${targets.map((item) => item.id).join(", ")}`); + if (!target.enabled) throw new Error(`${configLabel}.${path} references disabled target ${target.id}`); +} + +function resolveTarget(observability: 
ObservabilityConfig, targetId: string | null): ObservabilityTarget { + const resolved = targetId ?? observability.defaults.targetId; + const target = observability.targets.find((item) => item.id.toLowerCase() === resolved.toLowerCase()); + if (target === undefined) throw new Error(`unknown observability target ${resolved}; known targets: ${observability.targets.map((item) => item.id).join(", ")}`); + if (!target.enabled) throw new Error(`observability target ${target.id} is disabled in ${configLabel}`); + return target; +} + +function plan(options: CommonOptions): Record { + const observability = readObservabilityConfig(); + const target = resolveTarget(observability, options.targetId); + const yaml = renderManifest(observability, target); + const policy = policyChecks(yaml, target); + return { + ok: policy.every((check) => check.ok), + action: "platform-infra-observability-plan", + mutation: false, + config: configSummary(observability, target), + renderPlan: { + target: targetSummary(target), + objects: manifestObjectSummary(yaml), + otlp: { + collectorGrpcEndpoint: `${observability.collector.serviceName}.${target.namespace}.svc.cluster.local:${observability.collector.otlp.grpcPort}`, + collectorHttpEndpoint: `http://${observability.collector.serviceName}.${target.namespace}.svc.cluster.local:${observability.collector.otlp.httpPort}`, + backendGrpcEndpoint: `${observability.traceBackend.serviceName}.${target.namespace}.svc.cluster.local:${observability.traceBackend.otlp.grpcPort}`, + }, + instrumentation: observability.instrumentation.serviceConnections, + resourceAttributes: observability.resourceAttributes, + }, + policy, + next: { + dryRun: `bun scripts/cli.ts platform-infra observability apply --target ${target.id} --dry-run`, + apply: `bun scripts/cli.ts platform-infra observability apply --target ${target.id} --confirm`, + status: `bun scripts/cli.ts platform-infra observability status --target ${target.id}`, + validate: `bun scripts/cli.ts platform-infra 
observability validate --target ${target.id}`, + trace: `bun scripts/cli.ts platform-infra observability trace --target ${target.id} --trace-id `, + }, + }; +} + +async function apply(config: UniDeskConfig, options: ApplyOptions): Promise> { + const observability = readObservabilityConfig(); + const target = resolveTarget(observability, options.targetId); + const yaml = renderManifest(observability, target); + const policy = policyChecks(yaml, target); + if (!policy.every((check) => check.ok)) return { ok: false, action: "platform-infra-observability-apply", mode: "policy-blocked", policy }; + if (options.confirm && !options.wait) { + const job = startJob( + `platform_infra_observability_apply_${target.id.toLowerCase()}`, + ["bun", "scripts/cli.ts", "platform-infra", "observability", "apply", "--target", target.id, "--confirm", "--wait"], + `Apply ${target.id} platform-infra OTel Collector and trace backend through the controlled UniDesk CLI`, + ); + return { + ok: true, + action: "platform-infra-observability-apply", + mode: "async-job", + mutation: true, + target: targetSummary(target), + job, + statusCommand: `bun scripts/cli.ts job status ${job.id} --tail-bytes 12000`, + next: { + status: `bun scripts/cli.ts job status ${job.id} --tail-bytes 12000`, + rollout: `bun scripts/cli.ts platform-infra observability status --target ${target.id}`, + validate: `bun scripts/cli.ts platform-infra observability validate --target ${target.id}`, + }, + }; + } + const result = await capture(config, target.route, ["sh"], applyScript({ + yaml, + target, + dryRun: options.dryRun, + wait: options.wait, + collectorDeploymentName: observability.collector.deploymentName, + backendDeploymentName: observability.traceBackend.deploymentName, + })); + const parsed = parseJsonOutput(result.stdout); + return { + ok: result.exitCode === 0 && parsed?.ok === true, + action: "platform-infra-observability-apply", + mode: options.dryRun ? 
"dry-run" : "confirmed", + mutation: !options.dryRun, + target: targetSummary(target), + policy, + remote: parsed ?? compactCapture(result, { full: true }), + }; +} + +async function status(config: UniDeskConfig, options: CommonOptions): Promise> { + const observability = readObservabilityConfig(); + const target = resolveTarget(observability, options.targetId); + const result = await capture(config, target.route, ["sh"], statusScript(observability, target, options.full)); + const parsed = parseJsonOutput(result.stdout); + const summary = parsed === null ? null : statusSummary(parsed); + return { + ok: result.exitCode === 0 && summary?.ready === true, + action: "platform-infra-observability-status", + mutation: false, + target: targetSummary(target), + summary, + remote: options.raw ? parsed : compactStatus(parsed, options.full) ?? compactCapture(result, { full: true }), + next: { + plan: `bun scripts/cli.ts platform-infra observability plan --target ${target.id}`, + apply: `bun scripts/cli.ts platform-infra observability apply --target ${target.id} --confirm`, + validate: `bun scripts/cli.ts platform-infra observability validate --target ${target.id}`, + }, + }; +} + +async function validate(config: UniDeskConfig, options: CommonOptions): Promise> { + const observability = readObservabilityConfig(); + const target = resolveTarget(observability, options.targetId); + const result = await capture(config, target.route, ["sh"], statusScript(observability, target, options.full)); + const parsed = parseJsonOutput(result.stdout); + const summary = parsed === null ? null : statusSummary(parsed); + const ready = summary?.ready === true; + return { + ok: result.exitCode === 0 && ready, + action: "platform-infra-observability-validate", + mutation: false, + target: targetSummary(target), + summary, + validation: { + readiness: ready ? "passed" : "failed", + testTrace: "not-generated-by-this-stage", + traceQuery: ready ? 
/**
 * Readiness validation for the tracing stack.
 *
 * Runs the same remote status script as `status` and reports whether the
 * runtime is ready. No test trace is generated at this stage; the result only
 * tells the operator which trace query command becomes usable.
 */
async function validate(config: UniDeskConfig, options: CommonOptions): Promise<Record<string, unknown>> {
  const observability = readObservabilityConfig();
  const target = resolveTarget(observability, options.targetId);
  const result = await capture(config, target.route, ["sh"], statusScript(observability, target, options.full));
  const parsed = parseJsonOutput(result.stdout);
  const summary = parsed === null ? null : statusSummary(parsed);
  const ready = summary?.ready === true;
  // Pick the view first so the capture fallback also applies in --raw mode
  // (`a ? b : c ?? d` binds as `a ? b : (c ?? d)`).
  const remoteView = options.raw ? parsed : compactStatus(parsed, options.full);
  return {
    ok: result.exitCode === 0 && ready,
    action: "platform-infra-observability-validate",
    mutation: false,
    target: targetSummary(target),
    summary,
    validation: {
      readiness: ready ? "passed" : "failed",
      testTrace: "not-generated-by-this-stage",
      traceQuery: ready
        ? `bun scripts/cli.ts platform-infra observability trace --target ${target.id} --trace-id <trace-id>`
        : "blocked-until-runtime-ready",
      metricsBoundary: "Prometheus/RUM remains outside this trace readiness check.",
    },
    remote: remoteView ?? compactCapture(result, { full: true }),
  };
}

/**
 * Query one trace from the trace backend through the Kubernetes service proxy.
 *
 * @throws Error when no (or a blank) trace id was supplied.
 */
async function trace(config: UniDeskConfig, options: TraceOptions): Promise<Record<string, unknown>> {
  const traceId = options.traceId === null ? "" : options.traceId.trim();
  if (traceId === "") throw new Error("observability trace requires --trace-id <trace-id>");
  const observability = readObservabilityConfig();
  const target = resolveTarget(observability, options.targetId);
  // The id is URL-encoded before it is substituted into the configured path template.
  const tracePath = observability.probes.traceQueryPathTemplate.replaceAll("{{traceId}}", encodeURIComponent(traceId));
  const result = await capture(config, target.route, ["sh"], traceScript(observability, target, tracePath));
  const parsed = parseJsonOutput(result.stdout);
  let queryResult: unknown;
  if (parsed === null) {
    // The remote script did not produce JSON: surface the raw capture instead of `null`.
    queryResult = compactCapture(result, { full: true });
  } else {
    const redacted = redactSensitiveUnknown(parsed);
    queryResult = options.raw ? redacted : compactUnknown(redacted);
  }
  return {
    ok: result.exitCode === 0 && parsed?.ok === true,
    action: "platform-infra-observability-trace",
    mutation: false,
    target: targetSummary(target),
    traceId,
    query: {
      backend: observability.traceBackend.type,
      service: observability.traceBackend.serviceName,
      path: tracePath,
    },
    result: queryResult,
  };
}
/**
 * Assemble the multi-document YAML manifest for one target.
 *
 * Document order: optional Namespace, NetworkPolicy, collector ConfigMap /
 * Deployment / Service, then Tempo ConfigMap / Deployment / Service.
 * Blank documents are dropped before joining with `---` separators.
 */
function renderManifest(observability: ObservabilityConfig, target: ObservabilityTarget): string {
  const collectorImage = imageReference(observability.images.collector);
  const tempoImage = imageReference(observability.images.tempo);
  const documents: string[] = [];
  if (target.createNamespace) {
    documents.push(namespaceManifest(target));
  }
  documents.push(
    allowAllNetworkPolicy(target),
    collectorConfigMap(observability, target),
    collectorDeployment(observability, target, collectorImage),
    collectorService(observability, target),
    tempoConfigMap(observability, target),
    tempoDeployment(observability, target, tempoImage),
    tempoService(observability, target),
  );
  return documents.filter((doc) => doc.trim().length > 0).join("\n---\n");
}

/** Namespace document labelled as a unidesk-managed platform-infra namespace. */
function namespaceManifest(target: ObservabilityTarget): string {
  const lines = [
    "apiVersion: v1",
    "kind: Namespace",
    "metadata:",
    `  name: ${target.namespace}`,
    "  labels:",
    "    app.kubernetes.io/part-of: platform-infra",
    "    app.kubernetes.io/managed-by: unidesk",
    `    unidesk.ai/runtime-node: ${target.id}`,
    "", // keeps the trailing newline of the document
  ];
  return lines.join("\n");
}

/**
 * NetworkPolicy named `allow-all` that selects every pod in the target
 * namespace and admits all ingress and egress traffic.
 */
function allowAllNetworkPolicy(target: ObservabilityTarget): string {
  const lines = [
    "apiVersion: networking.k8s.io/v1",
    "kind: NetworkPolicy",
    "metadata:",
    "  name: allow-all",
    `  namespace: ${target.namespace}`,
    "  labels:",
    "    app.kubernetes.io/part-of: platform-infra",
    "    app.kubernetes.io/managed-by: unidesk",
    "spec:",
    "  podSelector: {}",
    "  policyTypes:",
    "    - Ingress",
    "    - Egress",
    "  ingress:",
    "    - {}",
    "  egress:",
    "    - {}",
    "", // keeps the trailing newline of the document
  ];
  return lines.join("\n");
}
/**
 * ConfigMap carrying the OpenTelemetry Collector configuration.
 *
 * The collector receives OTLP over gRPC and HTTP, batches spans and exports
 * them over OTLP/gRPC (TLS verification disabled, in-cluster) to the trace
 * backend service in the same namespace.
 */
function collectorConfigMap(observability: ObservabilityConfig, target: ObservabilityTarget): string {
  const { collector, traceBackend } = observability;
  const config = `receivers:
  otlp:
    protocols:
      grpc:
        endpoint: 0.0.0.0:${collector.otlp.grpcPort}
      http:
        endpoint: 0.0.0.0:${collector.otlp.httpPort}
processors:
  batch: {}
exporters:
  otlp/tempo:
    endpoint: ${traceBackend.serviceName}.${target.namespace}.svc.cluster.local:${traceBackend.otlp.grpcPort}
    tls:
      insecure: true
extensions:
  health_check:
    endpoint: 0.0.0.0:${collector.healthPort}
service:
  extensions: [health_check]
  pipelines:
    traces:
      receivers: [otlp]
      processors: [batch]
      exporters: [otlp/tempo]
`;
  // JSON.stringify yields a valid YAML double-quoted scalar, so a `"` or `\`
  // inside the spec label can no longer break the manifest.
  const specAnnotation = JSON.stringify(observability.metadata.spec);
  return `apiVersion: v1
kind: ConfigMap
metadata:
  name: ${collector.configMapName}
  namespace: ${target.namespace}
  labels:
    app.kubernetes.io/name: ${collector.deploymentName}
    app.kubernetes.io/component: tracing
    app.kubernetes.io/part-of: platform-infra
    app.kubernetes.io/managed-by: unidesk
  annotations:
    unidesk.ai/spec: ${specAnnotation}
data:
  collector.yaml: |
${indent(config, 4)}
`;
}

/**
 * Deployment for the OpenTelemetry Collector.
 *
 * The pod mounts `collector.yaml` from the collector ConfigMap and exposes the
 * OTLP gRPC/HTTP ports plus the health port used by the readiness probe.
 *
 * @param image Fully qualified image reference (`repository:tag`).
 */
function collectorDeployment(observability: ObservabilityConfig, target: ObservabilityTarget, image: string): string {
  const { collector } = observability;
  // Escaped through JSON.stringify so quotes/backslashes in the spec stay valid YAML.
  const specAnnotation = JSON.stringify(observability.metadata.spec);
  return `apiVersion: apps/v1
kind: Deployment
metadata:
  name: ${collector.deploymentName}
  namespace: ${target.namespace}
  labels:
    app.kubernetes.io/name: ${collector.deploymentName}
    app.kubernetes.io/component: tracing
    app.kubernetes.io/part-of: platform-infra
    app.kubernetes.io/managed-by: unidesk
spec:
  replicas: ${collector.replicas}
  selector:
    matchLabels:
      app.kubernetes.io/name: ${collector.deploymentName}
      app.kubernetes.io/component: tracing
  template:
    metadata:
      labels:
        app.kubernetes.io/name: ${collector.deploymentName}
        app.kubernetes.io/component: tracing
        app.kubernetes.io/part-of: platform-infra
      annotations:
        unidesk.ai/spec: ${specAnnotation}
    spec:
      containers:
        - name: collector
          image: ${image}
          imagePullPolicy: ${observability.images.collector.pullPolicy}
          args:
            - --config=/etc/otelcol/collector.yaml
          ports:
            - name: otlp-grpc
              containerPort: ${collector.otlp.grpcPort}
            - name: otlp-http
              containerPort: ${collector.otlp.httpPort}
            - name: health
              containerPort: ${collector.healthPort}
          readinessProbe:
            httpGet:
              path: /
              port: health
          volumeMounts:
            - name: config
              mountPath: /etc/otelcol/collector.yaml
              subPath: collector.yaml
              readOnly: true
      volumes:
        - name: config
          configMap:
            name: ${collector.configMapName}
`;
}
/**
 * ClusterIP Service in front of the collector pods, exposing the OTLP gRPC,
 * OTLP HTTP and health ports by name.
 */
function collectorService(observability: ObservabilityConfig, target: ObservabilityTarget): string {
  const { collector } = observability;
  return `apiVersion: v1
kind: Service
metadata:
  name: ${collector.serviceName}
  namespace: ${target.namespace}
  labels:
    app.kubernetes.io/name: ${collector.deploymentName}
    app.kubernetes.io/component: tracing
    app.kubernetes.io/part-of: platform-infra
    app.kubernetes.io/managed-by: unidesk
spec:
  type: ClusterIP
  selector:
    app.kubernetes.io/name: ${collector.deploymentName}
    app.kubernetes.io/component: tracing
  ports:
    - name: otlp-grpc
      port: ${collector.otlp.grpcPort}
      targetPort: otlp-grpc
    - name: otlp-http
      port: ${collector.otlp.httpPort}
      targetPort: otlp-http
    - name: health
      port: ${collector.healthPort}
      targetPort: health
`;
}

/**
 * ConfigMap carrying the Tempo configuration: HTTP listener, OTLP receivers,
 * block retention from config and local storage under `/var/tempo`.
 */
function tempoConfigMap(observability: ObservabilityConfig, target: ObservabilityTarget): string {
  const { traceBackend } = observability;
  const config = `server:
  http_listen_port: ${traceBackend.httpPort}
distributor:
  receivers:
    otlp:
      protocols:
        grpc:
          endpoint: 0.0.0.0:${traceBackend.otlp.grpcPort}
        http:
          endpoint: 0.0.0.0:${traceBackend.otlp.httpPort}
ingester:
  trace_idle_period: 10s
  max_block_duration: 5m
compactor:
  compaction:
    block_retention: ${traceBackend.storage.retention}
storage:
  trace:
    backend: local
    wal:
      path: /var/tempo/wal
    local:
      path: /var/tempo/traces
`;
  // JSON.stringify yields a valid YAML double-quoted scalar, so a `"` or `\`
  // inside the spec label can no longer break the manifest.
  const specAnnotation = JSON.stringify(observability.metadata.spec);
  return `apiVersion: v1
kind: ConfigMap
metadata:
  name: ${traceBackend.configMapName}
  namespace: ${target.namespace}
  labels:
    app.kubernetes.io/name: ${traceBackend.deploymentName}
    app.kubernetes.io/component: trace-backend
    app.kubernetes.io/part-of: platform-infra
    app.kubernetes.io/managed-by: unidesk
  annotations:
    unidesk.ai/spec: ${specAnnotation}
data:
  tempo.yaml: |
${indent(config, 4)}
`;
}
/**
 * Deployment for the Tempo trace backend.
 *
 * The pod mounts `tempo.yaml` from the Tempo ConfigMap and keeps its data on
 * an `emptyDir` volume mounted at `/var/tempo`.
 *
 * NOTE(review): the volume is always `emptyDir`; `traceBackend.storage.mode`
 * is not consulted here — confirm that is intended if other modes are added.
 *
 * @param image Fully qualified image reference (`repository:tag`).
 */
function tempoDeployment(observability: ObservabilityConfig, target: ObservabilityTarget, image: string): string {
  const { traceBackend } = observability;
  // Escaped through JSON.stringify so quotes/backslashes in the spec stay valid YAML.
  const specAnnotation = JSON.stringify(observability.metadata.spec);
  return `apiVersion: apps/v1
kind: Deployment
metadata:
  name: ${traceBackend.deploymentName}
  namespace: ${target.namespace}
  labels:
    app.kubernetes.io/name: ${traceBackend.deploymentName}
    app.kubernetes.io/component: trace-backend
    app.kubernetes.io/part-of: platform-infra
    app.kubernetes.io/managed-by: unidesk
spec:
  replicas: ${traceBackend.replicas}
  selector:
    matchLabels:
      app.kubernetes.io/name: ${traceBackend.deploymentName}
      app.kubernetes.io/component: trace-backend
  template:
    metadata:
      labels:
        app.kubernetes.io/name: ${traceBackend.deploymentName}
        app.kubernetes.io/component: trace-backend
        app.kubernetes.io/part-of: platform-infra
      annotations:
        unidesk.ai/spec: ${specAnnotation}
    spec:
      containers:
        - name: tempo
          image: ${image}
          imagePullPolicy: ${observability.images.tempo.pullPolicy}
          args:
            - -config.file=/etc/tempo/tempo.yaml
          ports:
            - name: http
              containerPort: ${traceBackend.httpPort}
            - name: otlp-grpc
              containerPort: ${traceBackend.otlp.grpcPort}
            - name: otlp-http
              containerPort: ${traceBackend.otlp.httpPort}
          readinessProbe:
            httpGet:
              path: ${observability.probes.readinessPath}
              port: http
          volumeMounts:
            - name: config
              mountPath: /etc/tempo/tempo.yaml
              subPath: tempo.yaml
              readOnly: true
            - name: data
              mountPath: /var/tempo
      volumes:
        - name: config
          configMap:
            name: ${traceBackend.configMapName}
        - name: data
          emptyDir: {}
`;
}
/**
 * ClusterIP Service in front of the Tempo pods, exposing the query HTTP port
 * and both OTLP ingest ports by name.
 */
function tempoService(observability: ObservabilityConfig, target: ObservabilityTarget): string {
  const backend = observability.traceBackend;
  const lines = [
    "apiVersion: v1",
    "kind: Service",
    "metadata:",
    `  name: ${backend.serviceName}`,
    `  namespace: ${target.namespace}`,
    "  labels:",
    `    app.kubernetes.io/name: ${backend.deploymentName}`,
    "    app.kubernetes.io/component: trace-backend",
    "    app.kubernetes.io/part-of: platform-infra",
    "    app.kubernetes.io/managed-by: unidesk",
    "spec:",
    "  type: ClusterIP",
    "  selector:",
    `    app.kubernetes.io/name: ${backend.deploymentName}`,
    "    app.kubernetes.io/component: trace-backend",
    "  ports:",
    "    - name: http",
    `      port: ${backend.httpPort}`,
    "      targetPort: http",
    "    - name: otlp-grpc",
    `      port: ${backend.otlp.grpcPort}`,
    "      targetPort: otlp-grpc",
    "    - name: otlp-http",
    `      port: ${backend.otlp.httpPort}`,
    "      targetPort: otlp-http",
    "", // keeps the trailing newline of the document
  ];
  return lines.join("\n");
}
/**
 * Build the POSIX shell script that applies the manifest on the target.
 *
 * The manifest travels base64-encoded inside the script, is applied with
 * server-side apply (optionally as a server dry-run) and, when `wait` is set
 * for a real apply, both deployments are awaited with `kubectl rollout status`.
 * An embedded Python reporter prints one JSON document and exits non-zero
 * when the apply or any awaited rollout failed.
 */
function applyScript(params: {
  yaml: string;
  target: ObservabilityTarget;
  dryRun: boolean;
  wait: boolean;
  collectorDeploymentName: string;
  backendDeploymentName: string;
}): string {
  const { target } = params;
  const encoded = Buffer.from(params.yaml, "utf8").toString("base64");
  const dryRunArg = params.dryRun ? "--dry-run=server" : "";
  const namespaceArg = shQuote(target.namespace);
  const rolloutLines = (deployment: string, prefix: string): string[] => [
    `kubectl -n ${namespaceArg} rollout status deployment/${shQuote(deployment)} --timeout=180s >"$tmp/${prefix}-rollout.out" 2>"$tmp/${prefix}-rollout.err"`,
    `${prefix}_rollout_rc=$?`,
  ];
  const wait = params.dryRun || !params.wait
    ? "wait_disposition=skipped"
    : [
      ...rolloutLines(params.collectorDeploymentName, "collector"),
      ...rolloutLines(params.backendDeploymentName, "backend"),
      "wait_disposition=executed",
    ].join("\n");
  // JSON string literals are valid Python string literals, so config values
  // can no longer terminate the literal or inject code into the reporter.
  const targetIdLiteral = JSON.stringify(target.id);
  const namespaceLiteral = JSON.stringify(target.namespace);
  return `
set -u
tmp="$(mktemp -d)"
trap 'rm -rf "$tmp"' EXIT
manifest="$tmp/platform-infra-observability.yaml"
printf '%s' '${encoded}' | base64 -d > "$manifest"
kubectl apply --server-side --field-manager=${shQuote(fieldManager)} ${dryRunArg} -f "$manifest" >"$tmp/apply.out" 2>"$tmp/apply.err"
apply_rc=$?
collector_rollout_rc=0
backend_rollout_rc=0
${wait}
python3 - "$apply_rc" "$collector_rollout_rc" "$backend_rollout_rc" "$wait_disposition" "$tmp/apply.out" "$tmp/apply.err" "$tmp/collector-rollout.out" "$tmp/collector-rollout.err" "$tmp/backend-rollout.out" "$tmp/backend-rollout.err" <<'PY'
import json, os, sys
def text(path, limit=6000):
    try:
        return open(path, encoding="utf-8", errors="replace").read()[-limit:]
    except FileNotFoundError:
        return ""
apply_rc = int(sys.argv[1])
collector_rc = int(sys.argv[2])
backend_rc = int(sys.argv[3])
payload = {
    "ok": apply_rc == 0 and collector_rc == 0 and backend_rc == 0,
    "target": ${targetIdLiteral},
    "namespace": ${namespaceLiteral},
    "dryRun": ${params.dryRun ? "True" : "False"},
    "apply": {"exitCode": apply_rc, "stdout": text(sys.argv[5]), "stderr": text(sys.argv[6])},
    "rollout": {
        "disposition": sys.argv[4],
        "collector": {"exitCode": collector_rc, "stdout": text(sys.argv[7]), "stderr": text(sys.argv[8])},
        "backend": {"exitCode": backend_rc, "stdout": text(sys.argv[9]), "stderr": text(sys.argv[10])},
    },
}
print(json.dumps(payload, ensure_ascii=False, indent=2))
sys.exit(0 if payload["ok"] else 1)
PY
`;
}

/**
 * Build the POSIX shell script that inspects the tracing stack on the target.
 *
 * It captures the namespace, both deployments, both services and the matching
 * pods as JSON, then probes every configured status endpoint through the
 * Kubernetes service proxy. The embedded Python reporter prints one JSON
 * document; `ok` requires the namespace/deployments/services queries and all
 * probes to succeed.
 *
 * @param full Pretty-print the JSON report when true.
 */
function statusScript(observability: ObservabilityConfig, target: ObservabilityTarget, full: boolean): string {
  const endpointsJson = JSON.stringify(observability.probes.statusEndpoints);
  const namespaceArg = shQuote(target.namespace);
  const podSelector = `app.kubernetes.io/name in (${observability.collector.deploymentName},${observability.traceBackend.deploymentName})`;
  // JSON string literals are valid Python string literals (injection-safe).
  const targetIdLiteral = JSON.stringify(target.id);
  const namespaceLiteral = JSON.stringify(target.namespace);
  return `
set -u
tmp="$(mktemp -d)"
trap 'rm -rf "$tmp"' EXIT
capture_json() {
  name="$1"
  shift
  "$@" >"$tmp/$name.json" 2>"$tmp/$name.err"
  echo $? >"$tmp/$name.rc"
}
capture_raw() {
  name="$1"
  shift
  "$@" >"$tmp/$name.out" 2>"$tmp/$name.err"
  echo $? >"$tmp/$name.rc"
}
capture_json namespace kubectl get namespace ${namespaceArg} -o json
capture_json deployments kubectl -n ${namespaceArg} get deployment ${shQuote(observability.collector.deploymentName)} ${shQuote(observability.traceBackend.deploymentName)} -o json
capture_json services kubectl -n ${namespaceArg} get service ${shQuote(observability.collector.serviceName)} ${shQuote(observability.traceBackend.serviceName)} -o json
capture_json pods kubectl -n ${namespaceArg} get pods -l ${shQuote(podSelector)} -o json
python3 - "$tmp" ${shQuote(endpointsJson)} <<'PY'
import json, subprocess, sys
tmp = sys.argv[1]
endpoints = json.loads(sys.argv[2])
namespace = ${namespaceLiteral}
def read(path, binary=False, limit=8000):
    try:
        mode = "rb" if binary else "r"
        with open(path, mode, encoding=None if binary else "utf-8", errors=None if binary else "replace") as fh:
            data = fh.read()
        if binary:
            data = data.decode("utf-8", errors="replace")
        return data[-limit:]
    except FileNotFoundError:
        return ""
def rc(name):
    try:
        return int(read(f"{tmp}/{name}.rc").strip() or "1")
    except ValueError:
        return 1
def parsed_json(name):
    raw = read(f"{tmp}/{name}.json", limit=1000000)
    try:
        return json.loads(raw) if raw else None
    except Exception:
        return None
def compact_section(name):
    return {
        "exitCode": rc(name),
        "stderrTail": read(f"{tmp}/{name}.err", limit=2000),
        "json": parsed_json(name),
    }
probe_results = []
for ep in endpoints:
    path = ep.get("path") or "/"
    if not path.startswith("/"):
        path = "/" + path
    proxy_path = f"/api/v1/namespaces/{namespace}/services/http:{ep['service']}:{ep['portName']}/proxy{path}"
    try:
        proc = subprocess.run(["kubectl", "get", "--raw", proxy_path], text=True, capture_output=True, timeout=20)
        exit_code, out, err = proc.returncode, proc.stdout, proc.stderr
    except subprocess.TimeoutExpired as exc:
        # A hung probe is reported as a failed probe instead of aborting the whole report.
        exit_code, out, err = 124, "", f"probe timed out after {exc.timeout}s"
    probe_results.append({
        "name": ep["name"],
        "service": ep["service"],
        "portName": ep["portName"],
        "path": path,
        "exitCode": exit_code,
        "stdoutTail": out[-1000:],
        "stderrTail": err[-1000:],
        "ok": exit_code == 0,
    })
payload = {
    "ok": rc("namespace") == 0 and rc("deployments") == 0 and rc("services") == 0 and all(item["ok"] for item in probe_results),
    "target": ${targetIdLiteral},
    "namespace": namespace,
    "sections": {
        "namespace": compact_section("namespace"),
        "deployments": compact_section("deployments"),
        "services": compact_section("services"),
        "pods": compact_section("pods"),
    },
    "probes": probe_results,
}
print(json.dumps(payload, ensure_ascii=False, indent=2 if ${full ? "True" : "False"} else None))
sys.exit(0 if payload["ok"] else 1)
PY
`;
}
/**
 * Build the POSIX shell script that fetches one trace through the Kubernetes
 * service proxy of the trace backend (`http` port) and prints a JSON report.
 *
 * The response body is parsed as JSON when possible; otherwise its tail is
 * returned as text.
 *
 * @param tracePath Backend-relative query path (already URL-encoded by the caller).
 */
function traceScript(observability: ObservabilityConfig, target: ObservabilityTarget, tracePath: string): string {
  const proxyPath = `/api/v1/namespaces/${target.namespace}/services/http:${observability.traceBackend.serviceName}:http/proxy${tracePath}`;
  // JSON string literals are valid Python string literals, so paths built from
  // configuration cannot terminate the literal or inject code.
  const tracePathLiteral = JSON.stringify(tracePath);
  const proxyPathLiteral = JSON.stringify(proxyPath);
  return `
set -u
tmp="$(mktemp -d)"
trap 'rm -rf "$tmp"' EXIT
kubectl get --raw ${shQuote(proxyPath)} >"$tmp/trace.out" 2>"$tmp/trace.err"
rc=$?
python3 - "$rc" "$tmp/trace.out" "$tmp/trace.err" <<'PY'
import json, sys
def text(path, limit=12000):
    try:
        return open(path, encoding="utf-8", errors="replace").read()[-limit:]
    except FileNotFoundError:
        return ""
body = text(sys.argv[2], 200000)
try:
    parsed = json.loads(body) if body else None
except Exception:
    parsed = body[-12000:]
payload = {
    "ok": int(sys.argv[1]) == 0,
    "path": ${tracePathLiteral},
    "proxyPath": ${proxyPathLiteral},
    "body": parsed,
    "stderrTail": text(sys.argv[3], 4000),
}
print(json.dumps(payload, ensure_ascii=False, indent=2))
sys.exit(0 if payload["ok"] else 1)
PY
`;
}

/** Operator-facing summary of the effective configuration for one target. */
function configSummary(observability: ObservabilityConfig, target: ObservabilityTarget): Record<string, unknown> {
  const { images } = observability;
  return {
    configPath: configLabel,
    spec: observability.metadata.spec,
    target: targetSummary(target),
    images: {
      collector: imageReference(images.collector),
      traceBackend: imageReference(images.tempo),
    },
    collector: observability.collector,
    traceBackend: observability.traceBackend,
    sampling: observability.sampling,
  };
}

/** Subset of the target fields that is echoed in every command result. */
function targetSummary(target: ObservabilityTarget): Record<string, unknown> {
  const { id, route, namespace, role, createNamespace } = target;
  return { id, route, namespace, role, createNamespace };
}
/**
 * Static policy gate evaluated on the rendered manifest before it is applied.
 *
 * Each entry has a `name`, an `ok` flag and a human-readable `detail`.
 * The allow-all check requires ONE document that is a NetworkPolicy named
 * exactly `allow-all` in exactly the target namespace; independent substring
 * matches over the whole manifest would also accept e.g. `allow-all-notes` in
 * an unrelated document or a namespace that merely starts with the target's.
 */
function policyChecks(yaml: string, target: ObservabilityTarget): Array<Record<string, unknown>> {
  const documents = yaml.split(/^---$/mu);
  const hasLine = (doc: string, expected: string): boolean =>
    doc.split("\n").some((line) => line.trim() === expected);
  const allowAllRendered = documents.some((doc) =>
    hasLine(doc, "kind: NetworkPolicy")
    && hasLine(doc, "name: allow-all")
    && hasLine(doc, `namespace: ${target.namespace}`));
  return [
    { name: "yaml-source-of-truth", ok: true, detail: "All concrete images, routes, namespace, ports, retention and sampling values are read from config/platform-infra/observability.yaml." },
    // Optional quotes are tolerated so `type: "NodePort"` cannot slip through.
    { name: "clusterip-only", ok: !/^\s*type:\s*["']?(NodePort|LoadBalancer)["']?\s*$/mu.test(yaml), detail: "Collector and trace backend stay ClusterIP-only." },
    { name: "no-ingress", ok: !/^\s*kind:\s*["']?Ingress["']?\s*$/mu.test(yaml), detail: "No public ingress is rendered for the first tracing backend." },
    { name: "no-host-network", ok: !/^\s*hostNetwork:\s*["']?true["']?\s*$/mu.test(yaml), detail: "Pods must not use host network." },
    { name: "allow-all-network-policy", ok: allowAllRendered, detail: `NetworkPolicy/allow-all is rendered in ${target.namespace}.` },
  ];
}

/**
 * Condense the remote status report into deployments, services, pods and
 * probe outcomes. `ready` mirrors the remote report's own `ok` flag.
 */
function statusSummary(payload: Record<string, unknown>): Record<string, unknown> {
  const sections = asRecord(payload.sections, "status.sections");
  const deployments = objectList(sectionJson(sections, "deployments"));
  const services = objectList(sectionJson(sections, "services"));
  const pods = objectList(sectionJson(sections, "pods"));
  // Reuse objectList's narrowing so a malformed (null / non-object) probe
  // entry is skipped instead of throwing on property access.
  const probes = objectList({ items: payload.probes });
  return {
    ready: payload.ok === true,
    namespace: payload.namespace,
    deployments: deployments.map((item) => deploymentSummary(item)),
    services: services.map((item) => metadataName(item)),
    pods: pods.map((item) => podSummary(item)),
    probes: probes.map((item) => ({
      name: item.name,
      ok: item.ok === true,
      service: item.service,
      path: item.path,
      // stderr is only interesting for failed probes.
      stderrTail: item.ok === true ? "" : item.stderrTail,
    })),
  };
}

/** Full (redacted) report when `full` is set, otherwise the condensed summary; `null` passes through. */
function compactStatus(payload: Record<string, unknown> | null, full: boolean): Record<string, unknown> | null {
  if (payload === null) return null;
  if (full) return redactSensitiveUnknown(payload) as Record<string, unknown>;
  return statusSummary(payload);
}

/** Return the parsed kubectl JSON stored under `sections[name].json`. */
function sectionJson(sections: Record<string, unknown>, name: string): unknown {
  const section = asRecord(sections[name], `sections.${name}`);
  return section.json;
}

/** Extract the plain-object entries of a Kubernetes list's `items`; anything else yields `[]`. */
function objectList(value: unknown): Record<string, unknown>[] {
  if (typeof value !== "object" || value === null) return [];
  const items = (value as Record<string, unknown>).items;
  if (!Array.isArray(items)) return [];
  return items.filter((item): item is Record<string, unknown> => typeof item === "object" && item !== null && !Array.isArray(item));
}

/** Name, desired/available replica counts and a derived `ready` flag for one Deployment object. */
function deploymentSummary(item: Record<string, unknown>): Record<string, unknown> {
  const spec = item.spec as Record<string, unknown> | undefined;
  const status = item.status as Record<string, unknown> | undefined;
  const replicas = typeof spec?.replicas === "number" ? spec.replicas : null;
  // A missing availableReplicas counts as zero available.
  const available = typeof status?.availableReplicas === "number" ? status.availableReplicas : 0;
  return {
    name: metadataName(item),
    replicas,
    availableReplicas: available,
    ready: replicas !== null && available >= replicas,
  };
}

/** Name and phase of one Pod object (`phase` is `null` when absent). */
function podSummary(item: Record<string, unknown>): Record<string, unknown> {
  const status = item.status as Record<string, unknown> | undefined;
  return {
    name: metadataName(item),
    phase: status?.phase ?? null,
  };
}

/** `metadata.name` of a Kubernetes object, or `null` when it is not a string. */
function metadataName(item: Record<string, unknown>): string | null {
  const metadata = item.metadata as Record<string, unknown> | undefined;
  return typeof metadata?.name === "string" ? metadata.name : null;
}

/**
 * List `{ kind, name }` for every document of a multi-document manifest,
 * using the first `kind:` and `name:` line of each document.
 * Blank documents (empty input, leading/trailing `---`) are skipped.
 */
function manifestObjectSummary(yaml: string): Array<Record<string, string>> {
  return yaml
    .split(/^---$/mu)
    .filter((doc) => doc.trim().length > 0)
    .map((doc) => ({
      kind: doc.match(/^\s*kind:\s*(.+)$/mu)?.[1]?.trim() ?? "unknown",
      name: doc.match(/^\s*name:\s*(.+)$/mu)?.[1]?.trim() ?? "unknown",
    }));
}

/** `repository:tag` reference for an image spec. */
function imageReference(image: ImageSpec): string {
  return `${image.repository}:${image.tag}`;
}

/**
 * Prefix every non-empty line of `value` with `spaces` spaces.
 * Empty lines are left untouched so no whitespace-only lines are produced
 * (inside a YAML block scalar they are equivalent to empty lines).
 */
function indent(value: string, spaces: number): string {
  const prefix = " ".repeat(spaces);
  return value
    .split("\n")
    .map((line) => (line.length > 0 ? `${prefix}${line}` : line))
    .join("\n");
}
Public services use PK01 Caddy+FRP rather than Kubernetes Ingress, NodePort, or LoadBalancer.", + description: "Operate YAML-controlled platform-infra services such as Sub2API, LangBot, n8n, WeChat archive workflows and OpenTelemetry tracing. Public services use PK01 Caddy+FRP rather than Kubernetes Ingress, NodePort, or LoadBalancer.", target, codexPool: { usage: [ @@ -341,6 +347,10 @@ export async function runPlatformInfraCommand(config: UniDeskConfig, args: strin const { runWechatArchiveCommand } = await import("./platform-infra-wechat-archive"); return await runWechatArchiveCommand(config, args.slice(1)); } + if (target === "observability") { + const { runPlatformObservabilityCommand } = await import("./platform-infra-observability"); + return await runPlatformObservabilityCommand(config, args.slice(1)); + } if (target !== "sub2api") return unsupported(args); if (action === "plan" || action === undefined) return plan(parseTargetOptions(args.slice(2))); if (action === "apply") return await apply(config, parseApplyOptions(args.slice(2)));