diff --git a/config/platform-infra/observability.yaml b/config/platform-infra/observability.yaml
new file mode 100644
index 00000000..3319fef1
--- /dev/null
+++ b/config/platform-infra/observability.yaml
@@ -0,0 +1,122 @@
+# Source of truth for the platform-infra OpenTelemetry tracing stack (OTel
+# Collector + trace backend). Read and validated by
+# `bun scripts/cli.ts platform-infra observability plan|apply|status|validate|trace`.
+version: 1
+kind: platform-infra-observability
+
+metadata:
+  id: observability
+  owner: unidesk
+  spec: PJ2026-01060501 OTel追踪 draft-2026-06-19-p0
+  relatedIssues:
+    - 489
+
+defaults:
+  # Must reference an enabled entry under `targets`.
+  targetId: D601
+
+images:
+  collector:
+    repository: otel/opentelemetry-collector-contrib
+    tag: '0.130.1'
+    pullPolicy: IfNotPresent
+  tempo:
+    repository: grafana/tempo
+    tag: '2.8.1'
+    pullPolicy: IfNotPresent
+
+targets:
+  - id: D601
+    route: D601:k3s
+    namespace: platform-infra
+    role: active
+    enabled: true
+    createNamespace: true
+
+collector:
+  deploymentName: otel-collector
+  serviceName: otel-collector
+  configMapName: otel-collector-config
+  replicas: 1
+  healthPort: 13133
+  otlp:
+    grpcPort: 4317
+    httpPort: 4318
+
+traceBackend:
+  # `tempo` is the only backend type the CLI reader accepts.
+  type: tempo
+  deploymentName: tempo
+  serviceName: tempo
+  configMapName: tempo-config
+  replicas: 1
+  httpPort: 3200
+  otlp:
+    grpcPort: 4317
+    httpPort: 4318
+  storage:
+    # `emptyDir` is the only storage mode the CLI reader accepts.
+    mode: emptyDir
+    retention: 24h
+
+sampling:
+  mode: parentbased_traceidratio
+  # Validated by the CLI to lie within [0, 1].
+  ratio: 1
+
+instrumentation:
+  contextPropagation:
+    - tracecontext
+    - baggage
+  serviceConnections:
+    - serviceName: hwlab-cloud-api
+      owningRepo: pikasTech/HWLAB
+      targetNode: D601
+      lane: 'v0.3'
+      namespace: hwlab-v03
+      requiredSpans:
+        - POST /v1/agent/chat
+        - durable_admission
+        - billing_preflight
+        - agentrun_dispatch
+        - projection_write
+        - trace_events_read
+        - turn_status_read
+    - serviceName: user-billing
+      owningRepo: pikasTech/HWLAB
+      targetNode: D601
+      lane: 'v0.3'
+      namespace: hwlab-v03
+      requiredSpans:
+        - billing_preflight
+    - serviceName: agentrun-manager
+      owningRepo: pikasTech/agentrun
+      targetNode: D601
+      lane: 'v0.2'
+      namespace: agentrun-v02
+      requiredSpans:
+        - agentrun_dispatch
+        - run_created
+        - command_result
+        - projection_sync
+
+resourceAttributes:
+  required:
+    - service.name
+    - deployment.environment
+    - unidesk.node
+    - hwlab.lane
+    - k8s.namespace.name
+    - git.commit
+  businessCorrelationAttributes:
+    - traceId
+    - sessionId
+    - turnId
+    - runId
+    - commandId
+
+probes:
+  readinessPath: /ready
+  # Must contain the `{{traceId}}` placeholder; the CLI substitutes the
+  # URL-encoded trace id into it.
+  traceQueryPathTemplate: '/api/traces/{{traceId}}'
+  statusEndpoints:
+    - name: tempo-ready
+      service: tempo
+      portName: http
+      path: /ready
+    - name: collector-health
+      service: otel-collector
+      portName: health
+      path: /
diff --git a/project-management/PJ2026-01/specs/PJ2026-010605-observability-monitoring.md b/project-management/PJ2026-01/specs/PJ2026-010605-observability-monitoring.md
index ba5b4ee6..0498789f 100644
--- a/project-management/PJ2026-01/specs/PJ2026-010605-observability-monitoring.md
+++ b/project-management/PJ2026-01/specs/PJ2026-010605-observability-monitoring.md
@@ -23,30 +23,32 @@
| 上级规格 | [PJ2026-0106 平台运维](PJ2026-0106-platform-ops.md) |
| 规格治理索引 | [规格治理](spec-governance.md) |
-本文采用 ISO/IEC/IEEE 29148 需求规格模板的项目裁剪版:正文只保留 HWLAB Prometheus 运维监控的稳定使命、范围、术语、系统边界、内部分工和原子需求。Workbench 用户可感知性能监控的架构图、数据流图、时序图和实现引用要求由 [PJ2026-01060505 Workbench性能](PJ2026-01060505-workbench-performance.md) 细化。
+本文采用 ISO/IEC/IEEE 29148 需求规格模板的项目裁剪版:正文只保留 HWLAB 运维监控的稳定使命、范围、术语、系统边界、内部分工和原子需求。Prometheus 继续承载 metrics、dashboard、alert 和 Workbench 用户可感知性能指标;OpenTelemetry 首期只承载 tracing、span、trace id 关联和跨服务因果链路。Workbench 用户可感知性能监控的架构图、数据流图、时序图和实现引用要求由 [PJ2026-01060505 Workbench性能](PJ2026-01060505-workbench-performance.md) 细化。
## 2. 目的和范围
### 2.1 目的
-运维监控负责通过 Prometheus 和配套状态查询入口采集 HWLAB 运行面 metrics、scrape target、alert rule、服务健康指标和资源指标,使平台运维能够发现服务不可用、入口漂移、资源异常和发布后运行状态退化。
+运维监控负责通过 Prometheus 和 OpenTelemetry 的分工协作采集 HWLAB 运行面 metrics、scrape target、alert rule、服务健康指标、资源指标和关键请求 trace,使平台运维能够发现服务不可用、入口漂移、资源异常、发布后运行状态退化,以及单个 Workbench/AgentRun 请求在 admission、billing、dispatch、projection 或 replay 阶段的失败位置。
-本课题的目标状态是:各业务 L1 提供自己的服务健康和业务状态指标,平台运维统一接入 Prometheus 并输出可查询、可告警、可回溯的运维状态;监控只判断运行面健康和资源状态,不替代业务成功标准。
+本课题的目标状态是:各业务 L1 提供自己的服务健康、业务状态指标和关键链路 trace 语义,平台运维统一接入 Prometheus metrics 面和 OpenTelemetry tracing 面并输出可查询、可告警、可回溯的运维状态;监控只判断运行面健康、资源状态和单次请求因果链,不替代业务成功标准。
-运维监控接入必须遵循 UniDesk YAML-first ops:scrape target、alert rule、recording rule、summary query、target/node/lane、namespace、endpoint、采样/保留/阈值等 UniDesk 自有运维事实必须归属 YAML;受控 CLI 负责读取、校验、渲染、apply/status/summary。运行面对象和 Prometheus 查询结果只作为观测证据,不反推为配置真相。
+运维监控接入必须遵循 UniDesk YAML-first ops:scrape target、alert rule、recording rule、summary query、OTel Collector、trace backend、target/node/lane、namespace、endpoint、采样/保留/阈值等 UniDesk 自有运维事实必须归属 YAML;受控 CLI 负责读取、校验、渲染、apply/status/summary/trace。运行面对象、Prometheus 查询结果和 trace backend 查询结果只作为观测证据,不反推为配置真相。
### 2.2 范围内
- Prometheus scrape target、metrics endpoint、alert rule 和服务健康指标接入。
+- OpenTelemetry Collector、trace backend、span 语义、trace context 传播和 trace 查询入口。
- Web/API/AgentRun/HWPOD/Harness/用户管理等服务的运行面健康、资源状态、公开入口健康和用户可感知性能观测。
- 发布后 runtime readiness、resource usage、error rate、queue depth、target availability 和 alert 状态摘要。
-- 监控数据的受控查询、低噪声摘要、失败归因和敏感输出约束。
+- 监控和 trace 数据的受控查询、低噪声摘要、失败归因和敏感输出约束。
### 2.3 范围外
- Agent command、CaseRun、HWPOD operation 或用户账本的业务成功标准由对应 L1 定义。
-- 长 trace、原始日志、测试报告、截图、CaseRun registry 和用户反馈正文不进入本规格。
+- 完整 prompt、assistant 正文、tool 参数、命令输出、原始日志、测试报告、截图、CaseRun registry 和用户反馈正文不进入本规格或默认 trace attribute。
- Prometheus 具体阈值、保留周期、告警路由、采样间隔和容量数值以 YAML/config 为准,不在本规格硬编码。
+- OpenTelemetry Collector image、trace backend image、存储模式、retention、采样率、endpoint 和服务接入关系以 YAML/config 为准,不在业务代码硬编码。
- 监控告警不能代替发布流水、业务验证、CaseRun 评价或用户反馈分流。
## 3. 术语表
@@ -54,6 +56,10 @@
| 术语 | 定义 |
| --- | --- |
| Prometheus 运维监控 | 通过 Prometheus 采集、查询和告警 HWLAB 运行面 metrics 的运维能力。 |
+| OpenTelemetry tracing | 通过 OTel Collector、trace backend、span 和 W3C trace context 关联单次请求跨服务因果链路的运维能力。 |
+| trace backend | 保存并查询 trace/span 的平台组件,首期可由 Tempo 或 Jaeger 承担,部署事实由 YAML 控制。 |
+| span | 一次请求在 admission、billing、AgentRun dispatch、projection、read/replay 等阶段中的一个可观察工作单元。 |
+| trace context | W3C `traceparent`/`baggage` 等跨进程传播上下文;业务 `traceId/sessionId/runId/turnId` 只能作为 span attribute,不能替代标准 trace context。 |
| 用户可感知性能 | 用户在 Web、CLI 或 API 入口中直接感受到的等待时间、加载时间、首个可读内容出现时间和完整可用状态。 |
| YAML-first 运维监控 | Prometheus scrape、rule、summary 和 target 归属先进入 UniDesk YAML,再通过受控 CLI 渲染和验证的运维形态。 |
| scrape target | Prometheus 抓取 metrics 的目标服务、path 和 label 集合。 |
@@ -67,33 +73,119 @@
| 边界项 | 内容 |
| --- | --- |
| 外部使用者 | 平台管理员、发布操作人员、值守自动化、需要运行状态的各 L1 owner。 |
-| 外部输入 | 服务 metrics endpoint、health/readiness、runtime resource 指标、scrape 配置、alert rule 配置和查询请求。 |
-| 受控资源 | Prometheus target、metrics、alert rule、运行面状态摘要、服务健康指标和资源指标。 |
-| 外部输出 | target 状态、metrics 查询结果、alert 状态、运行健康摘要、入口健康摘要和 redacted 失败原因。 |
-| 用户接口 | Prometheus 查询、受控平台运维 CLI、发布状态摘要和各 L1 health/status 页面或命令。 |
-| 系统边界 | 运维监控负责运行面可观察和告警;不定义业务完成标准,不保存长证据,不把监控 green 当作业务通过。 |
+| 外部输入 | 服务 metrics endpoint、health/readiness、runtime resource 指标、scrape 配置、alert rule 配置、OTLP trace/span、trace context、trace 查询请求和 metrics 查询请求。 |
+| 受控资源 | Prometheus target、metrics、alert rule、OTel Collector、trace backend、运行面状态摘要、服务健康指标、资源指标和 redacted trace 摘要。 |
+| 外部输出 | target 状态、metrics 查询结果、alert 状态、trace 查询结果、运行健康摘要、入口健康摘要和 redacted 失败阶段。 |
+| 用户接口 | Prometheus 查询、OpenTelemetry trace 查询、受控平台运维 CLI、发布状态摘要和各 L1 health/status 页面或命令。 |
+| 系统边界 | 运维监控负责运行面可观察、告警和单次请求因果链;不定义业务完成标准,不保存默认长证据,不把监控 green 或 trace 可查当作业务通过。 |
## 5. 内部分工与规格索引
| 编号 | 模块或课题 | 规格文档 | 主责边界 | 上游依赖 | 下游支撑 |
| --- | --- | --- | --- | --- | --- |
-| PJ2026-01060501 | Metrics接入 | 本规格 6.1 | metrics endpoint、scrape target 和 label 口径 | 各 L1 服务健康指标 | Prometheus 查询 |
+| PJ2026-01060501 | OTel追踪 | 本规格 6.1 | OTel Collector、trace backend、span 语义、trace context 和受控 trace 查询 | HWLAB v0.3、AgentRun、用户账本、Web工作台 | 平台排障、Prometheus 关联 |
| PJ2026-01060502 | 告警规则 | 本规格 6.2 | alert rule、状态分类和配置来源 | Metrics接入、YAML运维 | 平台值守、发布判定 |
| PJ2026-01060503 | 运行摘要 | 本规格 6.3 | health、readiness、resource、error rate 和入口健康摘要 | 公开入口、发布流水 | 管理员和各 L1 owner |
| PJ2026-01060504 | 边界约束 | 本规格 6.4 | 监控与业务通过、长证据、敏感输出的边界 | 全部 L1 | 用户反馈和排障 |
| PJ2026-01060505 | Workbench性能 | [PJ2026-01060505 Workbench性能](PJ2026-01060505-workbench-performance.md) | Web 工作台用户可感知性能、RUM、AgentRun event visible latency 和 Prometheus 指标口径 | Web工作台、Agent编排、API契约 | 平台运维、客户端和性能回归调查 |
+| PJ2026-01060506 | Metrics接入 | 本规格 6.6 | metrics endpoint、scrape target 和 label 口径 | 各 L1 服务健康指标 | Prometheus 查询 |
## 6. 原子需求
-### 6.1 OPS-MON-REQ-001 Metrics 接入
+### 6.1 OPS-MON-REQ-001 OTel 追踪
| 编号 | 短名 | 主责模块 | 关联模块 |
| --- | --- | --- | --- |
-| OPS-MON-REQ-001 | Metrics接入 | PJ2026-01060501 Metrics接入 | [YAML运维](PJ2026-010603-yaml-first-ops.md)、[发布流水](PJ2026-010601-controlled-release.md) |
+| OPS-MON-REQ-001 | OTel追踪 | PJ2026-01060501 OTel追踪 | [YAML运维](PJ2026-010603-yaml-first-ops.md)、[Agent编排](PJ2026-0102-agent-orchestration.md)、[客户端](PJ2026-0104-client.md)、[用户管理](PJ2026-0105-user-management.md) |
-运维监控应接入 HWLAB 各运行服务的 metrics endpoint 和 scrape target,使服务 live、ready、resource、queue、error、latency 和依赖健康可以被 Prometheus 查询。
+运维监控应在 `platform-infra` 提供 YAML-first OpenTelemetry tracing 能力,使一次 Workbench submit 或等价 API 请求可以沿 Cloud API admission、billing preflight、AgentRun dispatch、projection write、trace events read 和 Web replay 等阶段保留可查询 span。
-各 L1 负责定义自身服务健康指标含义;平台运维负责通过 YAML-first UniDesk ops 统一接入、命名、label 和可查询状态。metrics 缺失应暴露为监控缺口,不得用日志 grep、一次性 curl、手工 `kubectl apply` 或人工截图替代长期监控。
+OpenTelemetry 不替代 Prometheus。Prometheus 继续承载 metrics、dashboard、alert 和用户可感知性能指标;OTel tracing 只回答单个 `traceId/sessionId/runId/turnId` 跨服务断在哪个阶段。业务 ID 可以作为 span attribute 用于排查,但不能成为 Prometheus label,也不能替代 W3C trace context。
+
+#### 6.1.1 目标架构图
+
+```mermaid
+flowchart LR
+ W[Workbench Web] --> API[HWLAB cloud-api]
+ API --> Billing[user-billing]
+ API --> AR[AgentRun manager]
+ AR --> Runner[AgentRun runner]
+ API --> Projection[projection/read model]
+ API --> Metrics[Prometheus metrics endpoint]
+ Billing --> Metrics
+ AR --> Metrics
+ API -- OTLP spans --> Collector[OTel Collector<br/>platform-infra]
+ Billing -- OTLP spans --> Collector
+ AR -- OTLP spans --> Collector
+ Runner -- OTLP spans --> Collector
+ Collector --> TraceBackend[Trace backend<br/>Tempo or Jaeger]
+ Metrics --> Prometheus[Prometheus metrics面]
+ CLI[UniDesk platform-infra observability CLI] --> Collector
+ CLI --> TraceBackend
+ CLI --> Prometheus
+```
+
+#### 6.1.2 数据流图
+
+```mermaid
+flowchart TD
+ Span[应用关键 span] --> Collector[OTel Collector]
+ Collector --> Backend[trace backend]
+ Metric[低基数 metrics] --> Prometheus[Prometheus]
+ Backend --> TraceQuery[CLI trace query]
+ Prometheus --> MetricQuery[CLI metrics/status query]
+ TraceQuery --> Diagnose[按 traceId/sessionId/runId 定位失败阶段]
+ MetricQuery --> Health[运行健康、趋势和告警判断]
+ Diagnose -.关联.-> Health
+```
+
+数据流必须保持职责分离:trace backend 保存高基数字段和跨服务因果链;Prometheus 只保存低基数指标、alert 和趋势。CLI 可以把 metrics 摘要与 trace 查询结果放到同一排障报告中,但不能把两套存储合并为一个第二真相。
+
+#### 6.1.3 关键时序图
+
+```mermaid
+sequenceDiagram
+ participant W as Workbench
+ participant API as cloud-api
+ participant B as user-billing
+ participant AR as AgentRun
+ participant P as projection
+ participant OTel as OTel Collector
+ participant TB as trace backend
+
+ W->>API: POST /v1/agent/chat
+ API->>OTel: span admission
+ API->>B: billing_preflight with trace context
+ B->>OTel: span billing_preflight
+ API->>AR: dispatch command with trace context
+ AR->>OTel: span agentrun_dispatch
+ AR-->>API: runId/commandId
+ API->>P: projection_write
+ P->>OTel: span projection_write
+ W->>API: replay/read trace events
+ API->>OTel: span trace_events_read
+ OTel->>TB: export spans
+```
+
+#### 6.1.4 YAML 与 CLI 归属
+
+OTel Collector、trace backend、target route、namespace、image、storage、retention、sampling、endpoint、probe 和应用接入关系必须进入 `config/platform-infra/observability.yaml` 或后续确认的 owning YAML。正式入口必须是 `bun scripts/cli.ts platform-infra observability plan|apply|status|validate|trace` 或等价受控 CLI;`trans :k3s` 只能作为有界诊断底座,不能成为长期 mutate path。
+
+首期应用侧手工 span 至少覆盖:
+
+- `POST /v1/agent/chat`
+- `durable_admission`
+- `billing_preflight`
+- `agentrun_dispatch`
+- `projection_write`
+- `trace_events_read`
+- `turn_status_read`
+
+统一 resource attributes 至少包含 `service.name`、`deployment.environment`、`unidesk.node`、`hwlab.lane`、`k8s.namespace.name` 和 `git.commit`。统一传播使用 W3C `traceparent`/`baggage`;`traceId`、`sessionId`、`turnId`、`runId`、`commandId` 只能作为 span attribute。
+
+#### 6.1.5 代码引用规则
+
+本需求范围内新增或修改的源码文件头部必须标注 `SPEC: PJ2026-01060501 OTel追踪 draft-2026-06-19-p0`,并用一句话说明文件职责。纯 YAML、生成 manifest、锁文件或第三方 CRD 如不能加头部,必须能从 owning YAML、渲染器或 CLI 命令追溯到本 SPEC。
### 6.2 OPS-MON-REQ-002 告警规则
@@ -135,6 +227,16 @@
Workbench 性能监控只记录低基数指标、阶段耗时、状态分类和脱敏 correlation;不得把 traceId、sessionId、runId、prompt、assistant 正文、tool 参数、stdout/stderr、Secret 或用户个人信息写入 Prometheus label。具体架构、数据流、时序和代码引用规则由 [PJ2026-01060505 Workbench性能](PJ2026-01060505-workbench-performance.md) 定义。
+### 6.6 OPS-MON-REQ-006 Metrics 接入
+
+| 编号 | 短名 | 主责模块 | 关联模块 |
+| --- | --- | --- | --- |
+| OPS-MON-REQ-006 | Metrics接入 | PJ2026-01060506 Metrics接入 | [YAML运维](PJ2026-010603-yaml-first-ops.md)、[发布流水](PJ2026-010601-controlled-release.md) |
+
+运维监控应接入 HWLAB 各运行服务的 metrics endpoint 和 scrape target,使服务 live、ready、resource、queue、error、latency 和依赖健康可以被 Prometheus 查询。
+
+各 L1 负责定义自身服务健康指标含义;平台运维负责通过 YAML-first UniDesk ops 统一接入、命名、label 和可查询状态。metrics 缺失应暴露为监控缺口,不得用日志 grep、一次性 curl、手工 `kubectl apply` 或人工截图替代长期监控。
+
## 7. 过程控制
本规格不单独索引过程 issue;跨 L1 的内测、灰度和阶段活动索引统一保留在 [PJ2026-01 HWLAB 总规格](PJ2026-01-HWLAB.md) 的 `7. 过程控制`。
diff --git a/scripts/src/platform-infra-observability.ts b/scripts/src/platform-infra-observability.ts
new file mode 100644
index 00000000..e95e235e
--- /dev/null
+++ b/scripts/src/platform-infra-observability.ts
@@ -0,0 +1,1124 @@
+// SPEC: PJ2026-01060501 OTel追踪 draft-2026-06-19-p0.
+// Responsibility: YAML-first platform-infra OpenTelemetry tracing control commands.
+import { Buffer } from "node:buffer";
+import { readFileSync } from "node:fs";
+import type { UniDeskConfig } from "./config";
+import { rootPath } from "./config";
+import { startJob } from "./jobs";
+import {
+ compactCapture,
+ compactUnknown,
+ createYamlFieldReader,
+ numberField,
+ parseJsonOutput,
+ redactSensitiveUnknown,
+ shQuote,
+ capture,
+} from "./platform-infra-ops-library";
+
+// Path of the YAML file that readObservabilityConfig reads.
+const configFile = rootPath("config", "platform-infra", "observability.yaml");
+// Repo-relative label used as the prefix of validation error messages.
+const configLabel = "config/platform-infra/observability.yaml";
+// NOTE(review): not referenced in the visible part of this file; presumably the
+// field-manager name passed to kubectl apply by the apply script — confirm.
+const fieldManager = "unidesk-platform-observability";
+// Typed field readers created for this config label; each call below passes the
+// parent record, the key, and the parent path.
+const {
+ asRecord,
+ objectField,
+ arrayOfRecords,
+ stringField,
+ integerField,
+ booleanField,
+ stringArrayField,
+ numberArrayField,
+ enumField,
+ kubernetesNameField,
+ portField,
+ apiPathField,
+} = createYamlFieldReader(configLabel);
+
+/**
+ * Typed view of config/platform-infra/observability.yaml as produced and
+ * validated by readObservabilityConfig.
+ */
+interface ObservabilityConfig {
+ version: number;
+ kind: "platform-infra-observability";
+ metadata: { id: string; owner: string; spec: string; relatedIssues: number[] };
+ // targetId is used by resolveTarget when no --target flag is given.
+ defaults: { targetId: string };
+ images: {
+ collector: ImageSpec;
+ tempo: ImageSpec;
+ };
+ targets: ObservabilityTarget[];
+ collector: {
+ deploymentName: string;
+ serviceName: string;
+ configMapName: string;
+ replicas: number;
+ healthPort: number;
+ otlp: OtlpPorts;
+ };
+ traceBackend: {
+ // "tempo" is the only value the reader accepts.
+ type: "tempo";
+ deploymentName: string;
+ serviceName: string;
+ configMapName: string;
+ replicas: number;
+ httpPort: number;
+ otlp: OtlpPorts;
+ // "emptyDir" is the only storage mode the reader accepts.
+ storage: { mode: "emptyDir"; retention: string };
+ };
+ // ratio is checked by the reader to be within [0, 1].
+ sampling: { mode: "parentbased_traceidratio"; ratio: number };
+ instrumentation: {
+ contextPropagation: string[];
+ serviceConnections: ServiceConnection[];
+ };
+ resourceAttributes: {
+ required: string[];
+ businessCorrelationAttributes: string[];
+ };
+ probes: {
+ readinessPath: string;
+ // Must contain the literal "{{traceId}}" placeholder (checked by the reader).
+ traceQueryPathTemplate: string;
+ statusEndpoints: StatusEndpoint[];
+ };
+}
+
+// Container image parts; imageSpec() checks that `${repository}:${tag}` only
+// uses image-reference characters.
+interface ImageSpec {
+ repository: string;
+ tag: string;
+ pullPolicy: "Always" | "IfNotPresent" | "Never";
+}
+
+// One entry of `targets` in the YAML, produced by parseTarget.
+interface ObservabilityTarget {
+ id: string;
+ // Passed to capture() as the remote route for apply/status/validate/trace.
+ route: string;
+ namespace: string;
+ role: "active" | "standby";
+ // Disabled targets are rejected by resolveTarget and assertKnownEnabledTarget.
+ enabled: boolean;
+ createNamespace: boolean;
+}
+
+// OTLP listener ports (gRPC and HTTP), each read with portField.
+interface OtlpPorts {
+ grpcPort: number;
+ httpPort: number;
+}
+
+// One entry of `instrumentation.serviceConnections`; echoed by plan() under
+// renderPlan.instrumentation.
+interface ServiceConnection {
+ serviceName: string;
+ owningRepo: string;
+ targetNode: string;
+ lane: string;
+ namespace: string;
+ requiredSpans: string[];
+}
+
+// One entry of `probes.statusEndpoints`, produced by parseStatusEndpoint.
+interface StatusEndpoint {
+ name: string;
+ service: string;
+ portName: string;
+ path: string;
+}
+
+// Flags shared by every subcommand, produced by parseCommonOptions.
+interface CommonOptions {
+ // null when --target was not given; resolveTarget then uses defaults.targetId.
+ targetId: string | null;
+ full: boolean;
+ // --raw also sets full to true.
+ raw: boolean;
+}
+
+// Options of the `apply` subcommand, produced by parseApplyOptions.
+interface ApplyOptions extends CommonOptions {
+ confirm: boolean;
+ // True whenever --confirm is absent, i.e. apply defaults to a dry run.
+ dryRun: boolean;
+ wait: boolean;
+}
+
+// Options of the `trace` subcommand, produced by parseTraceOptions.
+interface TraceOptions extends CommonOptions {
+ // null when --trace-id was not given; trace() rejects that case.
+ traceId: string | null;
+}
+
+/**
+ * Builds the static help payload for `platform-infra observability`.
+ *
+ * @returns A JSON-serializable record holding the command synopsis, the YAML
+ *   config path, the owning spec id, one usage example per subcommand and the
+ *   scope boundary against Prometheus.
+ */
+export function observabilityHelp(): Record<string, unknown> {
+  return {
+    command: "platform-infra observability plan|apply|status|validate|trace",
+    output: "json",
+    configTruth: "config/platform-infra/observability.yaml",
+    spec: "PJ2026-01060501 OTel追踪 draft-2026-06-19-p0",
+    usage: [
+      "bun scripts/cli.ts platform-infra observability plan --target D601",
+      "bun scripts/cli.ts platform-infra observability apply --target D601 --dry-run",
+      "bun scripts/cli.ts platform-infra observability apply --target D601 --confirm",
+      "bun scripts/cli.ts platform-infra observability status --target D601 [--full|--raw]",
+      "bun scripts/cli.ts platform-infra observability validate --target D601 [--full|--raw]",
+      // --trace-id takes a mandatory value, so the usage line shows its placeholder.
+      "bun scripts/cli.ts platform-infra observability trace --target D601 --trace-id <trace-id> [--full|--raw]",
+    ],
+    boundary: "Prometheus remains the metrics source; this command owns only platform-infra OTel Collector, trace backend readiness, and trace lookup.",
+  };
+}
+
+/**
+ * Entry point of `platform-infra observability`: dispatches on the first
+ * argument and forwards the remaining arguments to the matching option parser.
+ *
+ * @param config UniDesk configuration handed through to the remote subcommands.
+ * @param args Arguments after `observability`; the action defaults to "plan"
+ *   when the list is empty.
+ * @returns The subcommand's JSON result, or an `ok: false` record carrying the
+ *   help payload when the action is not supported.
+ * @throws Error when an option parser or the config reader rejects its input.
+ */
+export async function runPlatformObservabilityCommand(config: UniDeskConfig, args: string[]): Promise<Record<string, unknown>> {
+  const [action = "plan"] = args;
+  const rest = args.slice(1);
+  if (action === "plan") return plan(parseCommonOptions(rest));
+  if (action === "apply") return await apply(config, parseApplyOptions(rest));
+  if (action === "status") return await status(config, parseCommonOptions(rest));
+  if (action === "validate") return await validate(config, parseCommonOptions(rest));
+  if (action === "trace") return await trace(config, parseTraceOptions(rest));
+  return { ok: false, error: "unsupported-platform-infra-observability-command", args, help: observabilityHelp() };
+}
+
+/**
+ * Parses the flags shared by all subcommands: `--target <id>`, `--full`, `--raw`.
+ *
+ * `--raw` also switches `full` on. Any other token is rejected.
+ *
+ * @param args Tokens to parse.
+ * @returns The parsed flags; `targetId` stays null when `--target` is absent.
+ * @throws Error when `--target` has no value, when its value is not a simple id,
+ *   or when an unknown option is seen.
+ */
+function parseCommonOptions(args: string[]): CommonOptions {
+  const parsed: CommonOptions = { targetId: null, full: false, raw: false };
+  let cursor = 0;
+  while (cursor < args.length) {
+    const flag = args[cursor];
+    cursor += 1;
+    switch (flag) {
+      case "--target": {
+        // The value is the token right after the flag; another flag does not count.
+        const candidate = args[cursor];
+        if (candidate === undefined || candidate.startsWith("--")) throw new Error("--target requires a value");
+        if (!/^[A-Za-z0-9._-]+$/u.test(candidate)) throw new Error("--target must be a simple target id");
+        parsed.targetId = candidate;
+        cursor += 1;
+        break;
+      }
+      case "--full":
+        parsed.full = true;
+        break;
+      case "--raw":
+        parsed.raw = true;
+        parsed.full = true;
+        break;
+      default:
+        throw new Error(`unsupported observability option: ${flag}`);
+    }
+  }
+  return parsed;
+}
+
+/**
+ * Parses the options of `apply`: `--confirm`, `--dry-run`, `--wait`, plus the
+ * shared flags, which are forwarded to parseCommonOptions.
+ *
+ * Apply is a dry run unless `--confirm` is given.
+ *
+ * @param args Tokens after the `apply` action.
+ * @returns The parsed apply options.
+ * @throws Error when both `--confirm` and `--dry-run` are given, or when
+ *   parseCommonOptions rejects a forwarded token.
+ */
+function parseApplyOptions(args: string[]): ApplyOptions {
+  const commonArgs: string[] = [];
+  let confirm = false;
+  let dryRun = false;
+  let wait = false;
+  for (let index = 0; index < args.length; index += 1) {
+    const arg = args[index];
+    if (arg === "--confirm") confirm = true;
+    else if (arg === "--dry-run") dryRun = true;
+    else if (arg === "--wait") wait = true;
+    else {
+      commonArgs.push(arg);
+      // Keep the value paired with --target so it is never read as an apply flag.
+      // A trailing --target is forwarded without a padded "" value so that
+      // parseCommonOptions reports "--target requires a value".
+      const value = args[index + 1];
+      if (arg === "--target" && value !== undefined) {
+        commonArgs.push(value);
+        index += 1;
+      }
+    }
+  }
+  if (confirm && dryRun) throw new Error("observability apply accepts only one of --confirm or --dry-run");
+  return { ...parseCommonOptions(commonArgs), confirm, dryRun: dryRun || !confirm, wait };
+}
+
+/**
+ * Parses the options of `trace`: `--trace-id <id>` plus the shared flags, which
+ * are forwarded to parseCommonOptions.
+ *
+ * @param args Tokens after the `trace` action.
+ * @returns The parsed options; `traceId` stays null when `--trace-id` is absent.
+ * @throws Error when `--trace-id` has no value or an unsupported format, or when
+ *   parseCommonOptions rejects a forwarded token.
+ */
+function parseTraceOptions(args: string[]): TraceOptions {
+  const commonArgs: string[] = [];
+  let traceId: string | null = null;
+  for (let index = 0; index < args.length; index += 1) {
+    const arg = args[index];
+    if (arg === "--trace-id") {
+      const value = args[index + 1];
+      if (value === undefined || value.startsWith("--")) throw new Error("--trace-id requires a value");
+      if (!/^[A-Za-z0-9._:-]+$/u.test(value)) throw new Error("--trace-id has an unsupported format");
+      traceId = value;
+      index += 1;
+    } else {
+      commonArgs.push(arg);
+      // Keep the value paired with --target so it is never read as --trace-id.
+      // A trailing --target is forwarded without a padded "" value so that
+      // parseCommonOptions reports "--target requires a value".
+      const value = args[index + 1];
+      if (arg === "--target" && value !== undefined) {
+        commonArgs.push(value);
+        index += 1;
+      }
+    }
+  }
+  return { ...parseCommonOptions(commonArgs), traceId };
+}
+
+/**
+ * Reads config/platform-infra/observability.yaml, parses it with Bun.YAML and
+ * converts it into a typed ObservabilityConfig through the field readers.
+ *
+ * After the field-level reads it additionally checks that `targets` is not
+ * empty, that `defaults.targetId` names a known and enabled target, that both
+ * replica counts are >= 0, that `sampling.ratio` lies in [0, 1], and that
+ * `probes.traceQueryPathTemplate` contains `{{traceId}}`.
+ *
+ * NOTE(review): `version` is read as an integer but its value is not checked.
+ *
+ * @returns The validated configuration.
+ * @throws Error when `kind` is not "platform-infra-observability" or any of the
+ *   checks above fails; field readers presumably throw on malformed fields too.
+ */
+function readObservabilityConfig(): ObservabilityConfig {
+ const parsed = Bun.YAML.parse(readFileSync(configFile, "utf8")) as unknown;
+ const root = asRecord(parsed, configLabel);
+ const version = integerField(root, "version", "");
+ const kind = stringField(root, "kind", "");
+ if (kind !== "platform-infra-observability") throw new Error(`${configLabel}.kind must be platform-infra-observability`);
+ // Resolve every nested section up front; the third argument is the parent path.
+ const metadata = objectField(root, "metadata", "");
+ const defaults = objectField(root, "defaults", "");
+ const images = objectField(root, "images", "");
+ const collector = objectField(root, "collector", "");
+ const collectorOtlp = objectField(collector, "otlp", "collector");
+ const traceBackend = objectField(root, "traceBackend", "");
+ const traceBackendOtlp = objectField(traceBackend, "otlp", "traceBackend");
+ const traceBackendStorage = objectField(traceBackend, "storage", "traceBackend");
+ const sampling = objectField(root, "sampling", "");
+ const instrumentation = objectField(root, "instrumentation", "");
+ const resourceAttributes = objectField(root, "resourceAttributes", "");
+ const probes = objectField(root, "probes", "");
+ const config: ObservabilityConfig = {
+ version,
+ kind,
+ metadata: {
+ id: stringField(metadata, "id", "metadata"),
+ owner: stringField(metadata, "owner", "metadata"),
+ spec: stringField(metadata, "spec", "metadata"),
+ relatedIssues: numberArrayField(metadata, "relatedIssues", "metadata"),
+ },
+ defaults: { targetId: stringField(defaults, "targetId", "defaults") },
+ images: {
+ collector: imageSpec(objectField(images, "collector", "images"), "images.collector"),
+ tempo: imageSpec(objectField(images, "tempo", "images"), "images.tempo"),
+ },
+ targets: arrayOfRecords(root.targets, "targets").map(parseTarget),
+ collector: {
+ deploymentName: kubernetesNameField(collector, "deploymentName", "collector"),
+ serviceName: kubernetesNameField(collector, "serviceName", "collector"),
+ configMapName: kubernetesNameField(collector, "configMapName", "collector"),
+ replicas: integerField(collector, "replicas", "collector"),
+ healthPort: portField(collector, "healthPort", "collector"),
+ otlp: parseOtlpPorts(collectorOtlp, "collector.otlp"),
+ },
+ traceBackend: {
+ type: enumField(traceBackend, "type", "traceBackend", ["tempo"] as const),
+ deploymentName: kubernetesNameField(traceBackend, "deploymentName", "traceBackend"),
+ serviceName: kubernetesNameField(traceBackend, "serviceName", "traceBackend"),
+ configMapName: kubernetesNameField(traceBackend, "configMapName", "traceBackend"),
+ replicas: integerField(traceBackend, "replicas", "traceBackend"),
+ httpPort: portField(traceBackend, "httpPort", "traceBackend"),
+ otlp: parseOtlpPorts(traceBackendOtlp, "traceBackend.otlp"),
+ storage: {
+ mode: enumField(traceBackendStorage, "mode", "traceBackend.storage", ["emptyDir"] as const),
+ retention: stringField(traceBackendStorage, "retention", "traceBackend.storage"),
+ },
+ },
+ sampling: {
+ mode: enumField(sampling, "mode", "sampling", ["parentbased_traceidratio"] as const),
+ ratio: numberField(sampling, "ratio", "sampling"),
+ },
+ instrumentation: {
+ contextPropagation: stringArrayField(instrumentation, "contextPropagation", "instrumentation"),
+ serviceConnections: arrayOfRecords(instrumentation.serviceConnections, "instrumentation.serviceConnections").map(parseServiceConnection),
+ },
+ resourceAttributes: {
+ required: stringArrayField(resourceAttributes, "required", "resourceAttributes"),
+ businessCorrelationAttributes: stringArrayField(resourceAttributes, "businessCorrelationAttributes", "resourceAttributes"),
+ },
+ probes: {
+ readinessPath: apiPathField(probes, "readinessPath", "probes"),
+ // Read as a plain string (not apiPathField); the placeholder is checked below.
+ traceQueryPathTemplate: stringField(probes, "traceQueryPathTemplate", "probes"),
+ statusEndpoints: arrayOfRecords(probes.statusEndpoints, "probes.statusEndpoints").map(parseStatusEndpoint),
+ },
+ };
+ // Cross-field checks that the individual field readers do not cover.
+ if (config.targets.length === 0) throw new Error(`${configLabel}.targets must not be empty`);
+ assertKnownEnabledTarget(config.targets, config.defaults.targetId, "defaults.targetId");
+ if (config.collector.replicas < 0 || config.traceBackend.replicas < 0) throw new Error(`${configLabel} replicas must be >= 0`);
+ if (config.sampling.ratio < 0 || config.sampling.ratio > 1) throw new Error(`${configLabel}.sampling.ratio must be between 0 and 1`);
+ if (!config.probes.traceQueryPathTemplate.includes("{{traceId}}")) throw new Error(`${configLabel}.probes.traceQueryPathTemplate must include {{traceId}}`);
+ return config;
+}
+
+/**
+ * Reads one image entry (`repository`, `tag`, `pullPolicy`) from the YAML.
+ *
+ * @param record The parsed YAML mapping of the image entry.
+ * @param path Dotted path of the entry, used in error messages.
+ * @returns The typed image spec.
+ * @throws Error when `${repository}:${tag}` contains characters outside
+ *   `[A-Za-z0-9._/:@-]`.
+ */
+function imageSpec(record: Record<string, unknown>, path: string): ImageSpec {
+  const image = {
+    repository: stringField(record, "repository", path),
+    tag: stringField(record, "tag", path),
+    pullPolicy: enumField(record, "pullPolicy", path, ["Always", "IfNotPresent", "Never"] as const),
+  };
+  // Character-set check on the rendered reference.
+  const reference = `${image.repository}:${image.tag}`;
+  if (!/^[A-Za-z0-9._/:@-]+$/u.test(reference)) throw new Error(`${configLabel}.${path} must render a valid image reference`);
+  return image;
+}
+
+/**
+ * Converts one `targets[]` entry into an ObservabilityTarget.
+ *
+ * @param record The parsed YAML mapping of the entry.
+ * @param index Position in `targets`, used to build the error path.
+ * @returns The typed target.
+ */
+function parseTarget(record: Record<string, unknown>, index: number): ObservabilityTarget {
+  const path = `targets[${index}]`;
+  return {
+    id: stringField(record, "id", path),
+    route: stringField(record, "route", path),
+    namespace: kubernetesNameField(record, "namespace", path),
+    role: enumField(record, "role", path, ["active", "standby"] as const),
+    enabled: booleanField(record, "enabled", path),
+    createNamespace: booleanField(record, "createNamespace", path),
+  };
+}
+
+/**
+ * Reads the `grpcPort`/`httpPort` pair of an `otlp` section.
+ *
+ * @param record The parsed YAML mapping of the `otlp` section.
+ * @param path Dotted path of the section, used in error messages.
+ * @returns The two ports as read by portField.
+ */
+function parseOtlpPorts(record: Record<string, unknown>, path: string): OtlpPorts {
+  return {
+    grpcPort: portField(record, "grpcPort", path),
+    httpPort: portField(record, "httpPort", path),
+  };
+}
+
+/**
+ * Converts one `instrumentation.serviceConnections[]` entry into a
+ * ServiceConnection.
+ *
+ * @param record The parsed YAML mapping of the entry.
+ * @param index Position in the list, used to build the error path.
+ * @returns The typed service connection.
+ */
+function parseServiceConnection(record: Record<string, unknown>, index: number): ServiceConnection {
+  const path = `instrumentation.serviceConnections[${index}]`;
+  return {
+    serviceName: stringField(record, "serviceName", path),
+    owningRepo: stringField(record, "owningRepo", path),
+    targetNode: stringField(record, "targetNode", path),
+    lane: stringField(record, "lane", path),
+    namespace: kubernetesNameField(record, "namespace", path),
+    requiredSpans: stringArrayField(record, "requiredSpans", path),
+  };
+}
+
+/**
+ * Converts one `probes.statusEndpoints[]` entry into a StatusEndpoint.
+ *
+ * @param record The parsed YAML mapping of the entry.
+ * @param index Position in the list, used to build the error path.
+ * @returns The typed status endpoint.
+ */
+function parseStatusEndpoint(record: Record<string, unknown>, index: number): StatusEndpoint {
+  const path = `probes.statusEndpoints[${index}]`;
+  return {
+    name: stringField(record, "name", path),
+    service: kubernetesNameField(record, "service", path),
+    portName: stringField(record, "portName", path),
+    path: apiPathField(record, "path", path),
+  };
+}
+
+/**
+ * Ensures a target id read from the config names a known, enabled target.
+ * Ids are compared case-insensitively.
+ *
+ * @param targets All targets declared in the config.
+ * @param targetId The id to look up.
+ * @param path Config path of the referencing field, used in error messages.
+ * @throws Error when the id is unknown or the matching target is disabled.
+ */
+function assertKnownEnabledTarget(targets: ObservabilityTarget[], targetId: string, path: string): void {
+  const wanted = targetId.toLowerCase();
+  const match = targets.find((candidate) => candidate.id.toLowerCase() === wanted);
+  if (match === undefined) {
+    const known = targets.map((candidate) => candidate.id).join(", ");
+    throw new Error(`${configLabel}.${path} references unknown target ${targetId}; known targets: ${known}`);
+  }
+  if (match.enabled) return;
+  throw new Error(`${configLabel}.${path} references disabled target ${match.id}`);
+}
+
+/**
+ * Picks the target a subcommand operates on.
+ *
+ * @param observability The validated config.
+ * @param targetId Id from `--target`, or null to use `defaults.targetId`.
+ * @returns The matching target (ids compared case-insensitively).
+ * @throws Error when the id is unknown or the matching target is disabled.
+ */
+function resolveTarget(observability: ObservabilityConfig, targetId: string | null): ObservabilityTarget {
+  const resolved = targetId ?? observability.defaults.targetId;
+  const wanted = resolved.toLowerCase();
+  const match = observability.targets.find((candidate) => candidate.id.toLowerCase() === wanted);
+  if (match === undefined) {
+    const known = observability.targets.map((candidate) => candidate.id).join(", ");
+    throw new Error(`unknown observability target ${resolved}; known targets: ${known}`);
+  }
+  if (match.enabled) return match;
+  throw new Error(`observability target ${match.id} is disabled in ${configLabel}`);
+}
+
+/**
+ * Renders the manifest for the resolved target and reports what would be
+ * applied, without contacting the target.
+ *
+ * @param options Shared flags; only `targetId` is read here.
+ * @returns A record with the config summary, the render plan (objects, in-cluster
+ *   OTLP endpoints, instrumentation and resource attributes), the policy check
+ *   results and follow-up commands. `ok` is true only when every policy check
+ *   passes.
+ * @throws Error when the config is invalid or the target cannot be resolved.
+ */
+function plan(options: CommonOptions): Record<string, unknown> {
+  const observability = readObservabilityConfig();
+  const target = resolveTarget(observability, options.targetId);
+  const yaml = renderManifest(observability, target);
+  const policy = policyChecks(yaml, target);
+  return {
+    ok: policy.every((check) => check.ok),
+    action: "platform-infra-observability-plan",
+    mutation: false,
+    config: configSummary(observability, target),
+    renderPlan: {
+      target: targetSummary(target),
+      objects: manifestObjectSummary(yaml),
+      otlp: {
+        collectorGrpcEndpoint: `${observability.collector.serviceName}.${target.namespace}.svc.cluster.local:${observability.collector.otlp.grpcPort}`,
+        collectorHttpEndpoint: `http://${observability.collector.serviceName}.${target.namespace}.svc.cluster.local:${observability.collector.otlp.httpPort}`,
+        backendGrpcEndpoint: `${observability.traceBackend.serviceName}.${target.namespace}.svc.cluster.local:${observability.traceBackend.otlp.grpcPort}`,
+      },
+      instrumentation: observability.instrumentation.serviceConnections,
+      resourceAttributes: observability.resourceAttributes,
+    },
+    policy,
+    next: {
+      dryRun: `bun scripts/cli.ts platform-infra observability apply --target ${target.id} --dry-run`,
+      apply: `bun scripts/cli.ts platform-infra observability apply --target ${target.id} --confirm`,
+      status: `bun scripts/cli.ts platform-infra observability status --target ${target.id}`,
+      validate: `bun scripts/cli.ts platform-infra observability validate --target ${target.id}`,
+      // --trace-id takes a mandatory value, so the hint shows its placeholder.
+      trace: `bun scripts/cli.ts platform-infra observability trace --target ${target.id} --trace-id <trace-id>`,
+    },
+  };
+}
+
+/**
+ * Applies the rendered manifest to the resolved target, or dry-runs it.
+ *
+ * Three modes are visible here:
+ * - policy-blocked: a policy check failed; nothing is sent to the target.
+ * - async-job: `--confirm` without `--wait` starts a job that re-invokes this
+ *   command with `--confirm --wait` and returns the job handle immediately.
+ * - dry-run / confirmed: the apply script is run on the target route and its
+ *   JSON output is returned.
+ *
+ * @param config UniDesk configuration passed to capture().
+ * @param options Parsed apply options.
+ * @returns A JSON-serializable result record; `ok` requires exit code 0 and a
+ *   remote payload with `ok === true` in the synchronous modes.
+ * @throws Error when the config is invalid or the target cannot be resolved.
+ */
+async function apply(config: UniDeskConfig, options: ApplyOptions): Promise<Record<string, unknown>> {
+  const observability = readObservabilityConfig();
+  const target = resolveTarget(observability, options.targetId);
+  const yaml = renderManifest(observability, target);
+  const policy = policyChecks(yaml, target);
+  if (!policy.every((check) => check.ok)) return { ok: false, action: "platform-infra-observability-apply", mode: "policy-blocked", policy };
+  if (options.confirm && !options.wait) {
+    // Hand the confirmed apply to a background job that runs it with --wait.
+    const job = startJob(
+      `platform_infra_observability_apply_${target.id.toLowerCase()}`,
+      ["bun", "scripts/cli.ts", "platform-infra", "observability", "apply", "--target", target.id, "--confirm", "--wait"],
+      `Apply ${target.id} platform-infra OTel Collector and trace backend through the controlled UniDesk CLI`,
+    );
+    return {
+      ok: true,
+      action: "platform-infra-observability-apply",
+      mode: "async-job",
+      mutation: true,
+      target: targetSummary(target),
+      job,
+      statusCommand: `bun scripts/cli.ts job status ${job.id} --tail-bytes 12000`,
+      next: {
+        status: `bun scripts/cli.ts job status ${job.id} --tail-bytes 12000`,
+        rollout: `bun scripts/cli.ts platform-infra observability status --target ${target.id}`,
+        validate: `bun scripts/cli.ts platform-infra observability validate --target ${target.id}`,
+      },
+    };
+  }
+  const result = await capture(config, target.route, ["sh"], applyScript({
+    yaml,
+    target,
+    dryRun: options.dryRun,
+    wait: options.wait,
+    collectorDeploymentName: observability.collector.deploymentName,
+    backendDeploymentName: observability.traceBackend.deploymentName,
+  }));
+  const parsed = parseJsonOutput(result.stdout);
+  return {
+    ok: result.exitCode === 0 && parsed?.ok === true,
+    action: "platform-infra-observability-apply",
+    mode: options.dryRun ? "dry-run" : "confirmed",
+    mutation: !options.dryRun,
+    target: targetSummary(target),
+    policy,
+    // Fall back to the captured output when stdout is not JSON.
+    remote: parsed ?? compactCapture(result, { full: true }),
+  };
+}
+
+/**
+ * Runs the status script on the resolved target and summarizes readiness.
+ *
+ * @param config UniDesk configuration passed to capture().
+ * @param options Shared flags; `full` is forwarded to the status script and
+ *   `raw` returns the parsed remote payload without compaction.
+ * @returns A result record; `ok` requires exit code 0 and `summary.ready === true`.
+ * @throws Error when the config is invalid or the target cannot be resolved.
+ */
+async function status(config: UniDeskConfig, options: CommonOptions): Promise<Record<string, unknown>> {
+  const observability = readObservabilityConfig();
+  const target = resolveTarget(observability, options.targetId);
+  const result = await capture(config, target.route, ["sh"], statusScript(observability, target, options.full));
+  const parsed = parseJsonOutput(result.stdout);
+  const summary = parsed === null ? null : statusSummary(parsed);
+  // Parenthesized on purpose: `??` binds tighter than `?:`, so without the
+  // parentheses --raw would yield null (and hide the captured output) whenever
+  // the remote output is not JSON.
+  const remote = (options.raw ? parsed : compactStatus(parsed, options.full)) ?? compactCapture(result, { full: true });
+  return {
+    ok: result.exitCode === 0 && summary?.ready === true,
+    action: "platform-infra-observability-status",
+    mutation: false,
+    target: targetSummary(target),
+    summary,
+    remote,
+    next: {
+      plan: `bun scripts/cli.ts platform-infra observability plan --target ${target.id}`,
+      apply: `bun scripts/cli.ts platform-infra observability apply --target ${target.id} --confirm`,
+      validate: `bun scripts/cli.ts platform-infra observability validate --target ${target.id}`,
+    },
+  };
+}
+
+async function validate(config: UniDeskConfig, options: CommonOptions): Promise> { // Read-only readiness gate: runs statusScript on the target route and reports a validation verdict.
+  const observability = readObservabilityConfig();
+  const target = resolveTarget(observability, options.targetId);
+  const result = await capture(config, target.route, ["sh"], statusScript(observability, target, options.full));
+  const parsed = parseJsonOutput(result.stdout); // null is handled below as "no parsed payload"
+  const summary = parsed === null ? null : statusSummary(parsed);
+  const ready = summary?.ready === true;
+  return {
+    ok: result.exitCode === 0 && ready,
+    action: "platform-infra-observability-validate",
+    mutation: false,
+    target: targetSummary(target),
+    summary,
+    validation: {
+      readiness: ready ? "passed" : "failed",
+      testTrace: "not-generated-by-this-stage",
+      traceQuery: ready ? `bun scripts/cli.ts platform-infra observability trace --target ${target.id} --trace-id <trace-id>` : "blocked-until-runtime-ready", // the placeholder marks the value the operator must supply
+      metricsBoundary: "Prometheus/RUM remains outside this trace readiness check.",
+    },
+    remote: (options.raw ? parsed : compactStatus(parsed, options.full)) ?? compactCapture(result, { full: true }), // `??` binds tighter than `?:`; the parentheses let raw mode fall back to the captured output too
+  };
+}
+
+async function trace(config: UniDeskConfig, options: TraceOptions): Promise> { // Read-only: fetch one trace by ID by running traceScript on the target route.
+  if (options.traceId === null || options.traceId.trim() === "") throw new Error("observability trace requires --trace-id <trace-id>"); // a blank ID is rejected like a missing one
+  const observability = readObservabilityConfig();
+  const target = resolveTarget(observability, options.targetId);
+  const tracePath = observability.probes.traceQueryPathTemplate.replaceAll("{{traceId}}", encodeURIComponent(options.traceId)); // the ID is URL-encoded before it is spliced into the path
+  const result = await capture(config, target.route, ["sh"], traceScript(observability, target, tracePath));
+  const parsed = parseJsonOutput(result.stdout);
+  return {
+    ok: result.exitCode === 0 && parsed?.ok === true, // the remote exit code and the remote payload must both report success
+    action: "platform-infra-observability-trace",
+    mutation: false,
+    target: targetSummary(target),
+    traceId: options.traceId,
+    query: {
+      backend: observability.traceBackend.type,
+      service: observability.traceBackend.serviceName,
+      path: tracePath,
+    },
+    result: options.raw ? redactSensitiveUnknown(parsed) : compactUnknown(redactSensitiveUnknown(parsed)), // redaction is applied in both modes; raw only skips compaction
+  };
+}
+
+function renderManifest(observability: ObservabilityConfig, target: ObservabilityTarget): string { // Assemble every rendered object into one multi-document YAML string.
+  const docs: string[] = [target.createNamespace ? namespaceManifest(target) : ""]; // the Namespace object is opt-in per target
+  docs.push(
+    allowAllNetworkPolicy(target),
+    collectorConfigMap(observability, target),
+    collectorDeployment(observability, target, imageReference(observability.images.collector)),
+    collectorService(observability, target),
+    tempoConfigMap(observability, target),
+    tempoDeployment(observability, target, imageReference(observability.images.tempo)),
+    tempoService(observability, target),
+  );
+  const nonBlank = docs.filter((doc) => doc.trim().length > 0); // drop documents that rendered to whitespace only
+  return nonBlank.join("\n---\n");
+}
+
+function namespaceManifest(target: ObservabilityTarget): string { // Render the Namespace object for target.namespace, labelled with the target id as runtime node.
+ return `apiVersion: v1
+kind: Namespace
+metadata:
+ name: ${target.namespace}
+ labels:
+ app.kubernetes.io/part-of: platform-infra
+ app.kubernetes.io/managed-by: unidesk
+ unidesk.ai/runtime-node: ${target.id}
+`; // the template text is YAML, so its indentation is significant
+}
+
+function allowAllNetworkPolicy(target: ObservabilityTarget): string { // Render a NetworkPolicy named allow-all in target.namespace.
+ return `apiVersion: networking.k8s.io/v1
+kind: NetworkPolicy
+metadata:
+ name: allow-all
+ namespace: ${target.namespace}
+ labels:
+ app.kubernetes.io/part-of: platform-infra
+ app.kubernetes.io/managed-by: unidesk
+spec:
+ podSelector: {}
+ policyTypes:
+ - Ingress
+ - Egress
+ ingress:
+ - {}
+ egress:
+ - {}
+`; // empty podSelector plus one empty ingress rule and one empty egress rule: every pod, all traffic both ways
+}
+
+function collectorConfigMap(observability: ObservabilityConfig, target: ObservabilityTarget): string { // Render the ConfigMap that carries the collector configuration under the collector.yaml key.
+ const config = `receivers:
+ otlp:
+ protocols:
+ grpc:
+ endpoint: 0.0.0.0:${observability.collector.otlp.grpcPort}
+ http:
+ endpoint: 0.0.0.0:${observability.collector.otlp.httpPort}
+processors:
+ batch: {}
+exporters:
+ otlp/tempo:
+ endpoint: ${observability.traceBackend.serviceName}.${target.namespace}.svc.cluster.local:${observability.traceBackend.otlp.grpcPort}
+ tls:
+ insecure: true
+extensions:
+ health_check:
+ endpoint: 0.0.0.0:${observability.collector.healthPort}
+service:
+ extensions: [health_check]
+ pipelines:
+ traces:
+ receivers: [otlp]
+ processors: [batch]
+ exporters: [otlp/tempo]
+`; // traces pipeline: OTLP gRPC/HTTP receivers -> batch processor -> OTLP export to the trace backend Service with tls.insecure: true
+ return `apiVersion: v1
+kind: ConfigMap
+metadata:
+ name: ${observability.collector.configMapName}
+ namespace: ${target.namespace}
+ labels:
+ app.kubernetes.io/name: ${observability.collector.deploymentName}
+ app.kubernetes.io/component: tracing
+ app.kubernetes.io/part-of: platform-infra
+ app.kubernetes.io/managed-by: unidesk
+ annotations:
+ unidesk.ai/spec: "${observability.metadata.spec}"
+data:
+ collector.yaml: |
+${indent(config, 4)}
+`; // NOTE(review): metadata.spec is spliced into a double-quoted YAML scalar unescaped — assumes it contains no quote or backslash
+}
+
+function collectorDeployment(observability: ObservabilityConfig, target: ObservabilityTarget, image: string): string { // Render the collector Deployment: one container from `image`, config file from the collector ConfigMap, readiness via HTTP GET / on the health port.
+ return `apiVersion: apps/v1
+kind: Deployment
+metadata:
+ name: ${observability.collector.deploymentName}
+ namespace: ${target.namespace}
+ labels:
+ app.kubernetes.io/name: ${observability.collector.deploymentName}
+ app.kubernetes.io/component: tracing
+ app.kubernetes.io/part-of: platform-infra
+ app.kubernetes.io/managed-by: unidesk
+spec:
+ replicas: ${observability.collector.replicas}
+ selector:
+ matchLabels:
+ app.kubernetes.io/name: ${observability.collector.deploymentName}
+ app.kubernetes.io/component: tracing
+ template:
+ metadata:
+ labels:
+ app.kubernetes.io/name: ${observability.collector.deploymentName}
+ app.kubernetes.io/component: tracing
+ app.kubernetes.io/part-of: platform-infra
+ annotations:
+ unidesk.ai/spec: "${observability.metadata.spec}"
+ spec:
+ containers:
+ - name: collector
+ image: ${image}
+ imagePullPolicy: ${observability.images.collector.pullPolicy}
+ args:
+ - --config=/etc/otelcol/collector.yaml
+ ports:
+ - name: otlp-grpc
+ containerPort: ${observability.collector.otlp.grpcPort}
+ - name: otlp-http
+ containerPort: ${observability.collector.otlp.httpPort}
+ - name: health
+ containerPort: ${observability.collector.healthPort}
+ readinessProbe:
+ httpGet:
+ path: /
+ port: health
+ volumeMounts:
+ - name: config
+ mountPath: /etc/otelcol/collector.yaml
+ subPath: collector.yaml
+ readOnly: true
+ volumes:
+ - name: config
+ configMap:
+ name: ${observability.collector.configMapName}
+`; // NOTE(review): the config is mounted via subPath and the pod template carries no config hash, so a ConfigMap-only change presumably does not restart or refresh running pods — confirm
+}
+
+function collectorService(observability: ObservabilityConfig, target: ObservabilityTarget): string { // Render the ClusterIP Service that exposes the collector's otlp-grpc, otlp-http and health ports.
+ return `apiVersion: v1
+kind: Service
+metadata:
+ name: ${observability.collector.serviceName}
+ namespace: ${target.namespace}
+ labels:
+ app.kubernetes.io/name: ${observability.collector.deploymentName}
+ app.kubernetes.io/component: tracing
+ app.kubernetes.io/part-of: platform-infra
+ app.kubernetes.io/managed-by: unidesk
+spec:
+ type: ClusterIP
+ selector:
+ app.kubernetes.io/name: ${observability.collector.deploymentName}
+ app.kubernetes.io/component: tracing
+ ports:
+ - name: otlp-grpc
+ port: ${observability.collector.otlp.grpcPort}
+ targetPort: otlp-grpc
+ - name: otlp-http
+ port: ${observability.collector.otlp.httpPort}
+ targetPort: otlp-http
+ - name: health
+ port: ${observability.collector.healthPort}
+ targetPort: health
+`; // the selector uses the same name/component labels as the collector Deployment's pod template; targetPorts refer to container port names
+}
+
+function tempoConfigMap(observability: ObservabilityConfig, _target: ObservabilityTarget): string { // Render the ConfigMap that carries tempo.yaml. NOTE(review): _target is read below despite its underscore prefix.
+ const config = `server:
+ http_listen_port: ${observability.traceBackend.httpPort}
+distributor:
+ receivers:
+ otlp:
+ protocols:
+ grpc:
+ endpoint: 0.0.0.0:${observability.traceBackend.otlp.grpcPort}
+ http:
+ endpoint: 0.0.0.0:${observability.traceBackend.otlp.httpPort}
+ingester:
+ trace_idle_period: 10s
+ max_block_duration: 5m
+compactor:
+ compaction:
+ block_retention: ${observability.traceBackend.storage.retention}
+storage:
+ trace:
+ backend: local
+ wal:
+ path: /var/tempo/wal
+ local:
+ path: /var/tempo/traces
+`; // ports and block retention come from traceBackend settings; the ingester timings and the /var/tempo paths are fixed in this template
+ return `apiVersion: v1
+kind: ConfigMap
+metadata:
+ name: ${observability.traceBackend.configMapName}
+ namespace: ${_target.namespace}
+ labels:
+ app.kubernetes.io/name: ${observability.traceBackend.deploymentName}
+ app.kubernetes.io/component: trace-backend
+ app.kubernetes.io/part-of: platform-infra
+ app.kubernetes.io/managed-by: unidesk
+ annotations:
+ unidesk.ai/spec: "${observability.metadata.spec}"
+data:
+ tempo.yaml: |
+${indent(config, 4)}
+`; // indent(config, 4) places the Tempo config inside the tempo.yaml block scalar
+}
+
+function tempoDeployment(observability: ObservabilityConfig, target: ObservabilityTarget, image: string): string { // Render the trace backend Deployment: one container from `image`, started with -config.file=/etc/tempo/tempo.yaml, readiness via HTTP GET probes.readinessPath on the http port.
+ return `apiVersion: apps/v1
+kind: Deployment
+metadata:
+ name: ${observability.traceBackend.deploymentName}
+ namespace: ${target.namespace}
+ labels:
+ app.kubernetes.io/name: ${observability.traceBackend.deploymentName}
+ app.kubernetes.io/component: trace-backend
+ app.kubernetes.io/part-of: platform-infra
+ app.kubernetes.io/managed-by: unidesk
+spec:
+ replicas: ${observability.traceBackend.replicas}
+ selector:
+ matchLabels:
+ app.kubernetes.io/name: ${observability.traceBackend.deploymentName}
+ app.kubernetes.io/component: trace-backend
+ template:
+ metadata:
+ labels:
+ app.kubernetes.io/name: ${observability.traceBackend.deploymentName}
+ app.kubernetes.io/component: trace-backend
+ app.kubernetes.io/part-of: platform-infra
+ annotations:
+ unidesk.ai/spec: "${observability.metadata.spec}"
+ spec:
+ containers:
+ - name: tempo
+ image: ${image}
+ imagePullPolicy: ${observability.images.tempo.pullPolicy}
+ args:
+ - -config.file=/etc/tempo/tempo.yaml
+ ports:
+ - name: http
+ containerPort: ${observability.traceBackend.httpPort}
+ - name: otlp-grpc
+ containerPort: ${observability.traceBackend.otlp.grpcPort}
+ - name: otlp-http
+ containerPort: ${observability.traceBackend.otlp.httpPort}
+ readinessProbe:
+ httpGet:
+ path: ${observability.probes.readinessPath}
+ port: http
+ volumeMounts:
+ - name: config
+ mountPath: /etc/tempo/tempo.yaml
+ subPath: tempo.yaml
+ readOnly: true
+ - name: data
+ mountPath: /var/tempo
+ volumes:
+ - name: config
+ configMap:
+ name: ${observability.traceBackend.configMapName}
+ - name: data
+ emptyDir: {}
+`; // NOTE(review): the data volume is a hard-coded emptyDir (traceBackend.storage.mode is not read in this block), so stored traces presumably go away with the pod — confirm this is intended
+}
+
+function tempoService(observability: ObservabilityConfig, target: ObservabilityTarget): string { // Render the ClusterIP Service that exposes the trace backend's http, otlp-grpc and otlp-http ports.
+ return `apiVersion: v1
+kind: Service
+metadata:
+ name: ${observability.traceBackend.serviceName}
+ namespace: ${target.namespace}
+ labels:
+ app.kubernetes.io/name: ${observability.traceBackend.deploymentName}
+ app.kubernetes.io/component: trace-backend
+ app.kubernetes.io/part-of: platform-infra
+ app.kubernetes.io/managed-by: unidesk
+spec:
+ type: ClusterIP
+ selector:
+ app.kubernetes.io/name: ${observability.traceBackend.deploymentName}
+ app.kubernetes.io/component: trace-backend
+ ports:
+ - name: http
+ port: ${observability.traceBackend.httpPort}
+ targetPort: http
+ - name: otlp-grpc
+ port: ${observability.traceBackend.otlp.grpcPort}
+ targetPort: otlp-grpc
+ - name: otlp-http
+ port: ${observability.traceBackend.otlp.httpPort}
+ targetPort: otlp-http
+`; // the selector uses the same name/component labels as the trace backend Deployment's pod template; targetPorts refer to container port names
+}
+
+function applyScript(params: { // Build the sh script that applies the manifest with kubectl and prints a JSON report of the outcome.
+ yaml: string; // rendered multi-document manifest to apply
+ target: ObservabilityTarget;
+ dryRun: boolean; // adds --dry-run=server and skips the rollout waits
+ wait: boolean; // when true (and not a dry run) wait for both rollouts
+ collectorDeploymentName: string;
+ backendDeploymentName: string;
+}): string {
+ const encoded = Buffer.from(params.yaml, "utf8").toString("base64"); // base64 lets the manifest travel inside a single-quoted printf argument
+ const dryRunArg = params.dryRun ? "--dry-run=server" : "";
+ const wait = params.dryRun || !params.wait
+ ? "wait_disposition=skipped"
+ : [
+ `kubectl -n ${shQuote(params.target.namespace)} rollout status deployment/${shQuote(params.collectorDeploymentName)} --timeout=180s >"$tmp/collector-rollout.out" 2>"$tmp/collector-rollout.err"`,
+ "collector_rollout_rc=$?",
+ `kubectl -n ${shQuote(params.target.namespace)} rollout status deployment/${shQuote(params.backendDeploymentName)} --timeout=180s >"$tmp/backend-rollout.out" 2>"$tmp/backend-rollout.err"`,
+ "backend_rollout_rc=$?",
+ "wait_disposition=executed",
+ ].join("\n"); // NOTE(review): this fragment runs unconditionally after the apply, so both rollout waits still execute when apply_rc is non-zero
+ return `
+set -u
+tmp="$(mktemp -d)"
+trap 'rm -rf "$tmp"' EXIT
+manifest="$tmp/platform-infra-observability.yaml"
+printf '%s' '${encoded}' | base64 -d > "$manifest"
+kubectl apply --server-side --field-manager=${fieldManager} ${dryRunArg} -f "$manifest" >"$tmp/apply.out" 2>"$tmp/apply.err"
+apply_rc=$?
+collector_rollout_rc=0
+backend_rollout_rc=0
+${wait}
+python3 - "$apply_rc" "$collector_rollout_rc" "$backend_rollout_rc" "$wait_disposition" "$tmp/apply.out" "$tmp/apply.err" "$tmp/collector-rollout.out" "$tmp/collector-rollout.err" "$tmp/backend-rollout.out" "$tmp/backend-rollout.err" <<'PY'
+import json, os, sys
+def text(path, limit=6000):
+ try:
+ return open(path, encoding="utf-8", errors="replace").read()[-limit:]
+ except FileNotFoundError:
+ return ""
+apply_rc = int(sys.argv[1])
+collector_rc = int(sys.argv[2])
+backend_rc = int(sys.argv[3])
+payload = {
+ "ok": apply_rc == 0 and collector_rc == 0 and backend_rc == 0,
+ "target": "${params.target.id}",
+ "namespace": "${params.target.namespace}",
+ "dryRun": ${params.dryRun ? "True" : "False"},
+ "apply": {"exitCode": apply_rc, "stdout": text(sys.argv[5]), "stderr": text(sys.argv[6])},
+ "rollout": {
+ "disposition": sys.argv[4],
+ "collector": {"exitCode": collector_rc, "stdout": text(sys.argv[7]), "stderr": text(sys.argv[8])},
+ "backend": {"exitCode": backend_rc, "stdout": text(sys.argv[9]), "stderr": text(sys.argv[10])},
+ },
+}
+print(json.dumps(payload, ensure_ascii=False, indent=2))
+sys.exit(0 if payload["ok"] else 1)
+PY
+`; // the script exits 0 only when the apply and both rollout exit codes are 0; captured output is reported as its last 6000 characters
+}
+
+function statusScript(observability: ObservabilityConfig, target: ObservabilityTarget, full: boolean): string { // Build the sh script that captures kubectl state for the stack, probes each configured endpoint through the service proxy, and prints one JSON payload.
+ const endpointsJson = JSON.stringify(observability.probes.statusEndpoints); // handed to the embedded Python as a single-quoted argv string
+ return `
+set -u
+tmp="$(mktemp -d)"
+trap 'rm -rf "$tmp"' EXIT
+capture_json() {
+ name="$1"
+ shift
+ "$@" >"$tmp/$name.json" 2>"$tmp/$name.err"
+ echo $? >"$tmp/$name.rc"
+}
+capture_raw() {
+ name="$1"
+ shift
+ "$@" >"$tmp/$name.out" 2>"$tmp/$name.err"
+ echo $? >"$tmp/$name.rc"
+}
+capture_json namespace kubectl get namespace ${shQuote(target.namespace)} -o json
+capture_json deployments kubectl -n ${shQuote(target.namespace)} get deployment ${shQuote(observability.collector.deploymentName)} ${shQuote(observability.traceBackend.deploymentName)} -o json
+capture_json services kubectl -n ${shQuote(target.namespace)} get service ${shQuote(observability.collector.serviceName)} ${shQuote(observability.traceBackend.serviceName)} -o json
+capture_json pods kubectl -n ${shQuote(target.namespace)} get pods -l ${shQuote(`app.kubernetes.io/name in (${observability.collector.deploymentName},${observability.traceBackend.deploymentName})`)} -o json
+python3 - "$tmp" '${endpointsJson.replaceAll("'", "'\"'\"'")}' <<'PY'
+import json, subprocess, sys
+tmp = sys.argv[1]
+endpoints = json.loads(sys.argv[2])
+namespace = "${target.namespace}"
+def read(path, binary=False, limit=8000):
+ try:
+ mode = "rb" if binary else "r"
+ with open(path, mode, encoding=None if binary else "utf-8", errors=None if binary else "replace") as fh:
+ data = fh.read()
+ if binary:
+ data = data.decode("utf-8", errors="replace")
+ return data[-limit:]
+ except FileNotFoundError:
+ return ""
+def rc(name):
+ try:
+ return int(read(f"{tmp}/{name}.rc").strip() or "1")
+ except ValueError:
+ return 1
+def parsed_json(name):
+ raw = read(f"{tmp}/{name}.json", limit=1000000)
+ try:
+ return json.loads(raw) if raw else None
+ except Exception:
+ return None
+def compact_section(name):
+ return {
+ "exitCode": rc(name),
+ "stderrTail": read(f"{tmp}/{name}.err", limit=2000),
+ "json": parsed_json(name),
+ }
+probe_results = []
+for ep in endpoints:
+ path = ep.get("path") or "/"
+ if not path.startswith("/"):
+ path = "/" + path
+ proxy_path = f"/api/v1/namespaces/{namespace}/services/http:{ep['service']}:{ep['portName']}/proxy{path}"
+ proc = subprocess.run(["kubectl", "get", "--raw", proxy_path], text=True, capture_output=True, timeout=20)
+ probe_results.append({
+ "name": ep["name"],
+ "service": ep["service"],
+ "portName": ep["portName"],
+ "path": path,
+ "exitCode": proc.returncode,
+ "stdoutTail": proc.stdout[-1000:],
+ "stderrTail": proc.stderr[-1000:],
+ "ok": proc.returncode == 0,
+ })
+payload = {
+ "ok": rc("namespace") == 0 and rc("deployments") == 0 and rc("services") == 0 and all(item["ok"] for item in probe_results),
+ "target": "${target.id}",
+ "namespace": namespace,
+ "sections": {
+ "namespace": compact_section("namespace"),
+ "deployments": compact_section("deployments"),
+ "services": compact_section("services"),
+ "pods": compact_section("pods"),
+ },
+ "probes": probe_results,
+}
+print(json.dumps(payload, ensure_ascii=False, indent=2 if ${full ? "True" : "False"} else None))
+sys.exit(0 if payload["ok"] else 1)
+PY
+`; // NOTE(review): subprocess.run(..., timeout=20) has no TimeoutExpired handler, so a hung probe presumably aborts before any JSON is printed — confirm. The pods section is reported but does not feed "ok".
+}
+
+function traceScript(observability: ObservabilityConfig, target: ObservabilityTarget, tracePath: string): string { // Build the sh script that fetches tracePath from the trace backend Service and prints a JSON report.
+ const proxyPath = `/api/v1/namespaces/${target.namespace}/services/http:${observability.traceBackend.serviceName}:http/proxy${tracePath}`; // service-proxy path to the backend port named "http", requested below with kubectl get --raw
+ return `
+set -u
+tmp="$(mktemp -d)"
+trap 'rm -rf "$tmp"' EXIT
+kubectl get --raw ${shQuote(proxyPath)} >"$tmp/trace.out" 2>"$tmp/trace.err"
+rc=$?
+python3 - "$rc" "$tmp/trace.out" "$tmp/trace.err" <<'PY'
+import json, sys
+def text(path, limit=12000):
+ try:
+ return open(path, encoding="utf-8", errors="replace").read()[-limit:]
+ except FileNotFoundError:
+ return ""
+body = text(sys.argv[2], 200000)
+try:
+ parsed = json.loads(body) if body else None
+except Exception:
+ parsed = body[-12000:]
+payload = {
+ "ok": int(sys.argv[1]) == 0,
+ "path": "${tracePath}",
+ "proxyPath": "${proxyPath}",
+ "body": parsed,
+ "stderrTail": text(sys.argv[3], 4000),
+}
+print(json.dumps(payload, ensure_ascii=False, indent=2))
+sys.exit(0 if payload["ok"] else 1)
+PY
+`; // body is JSON when it parses, otherwise its last 12000 characters as text. NOTE(review): tracePath and proxyPath are spliced into Python string literals unescaped — assumes neither contains a quote or backslash.
+}
+
+function configSummary(observability: ObservabilityConfig, target: ObservabilityTarget): Record { // Summarise the effective configuration for one target as a plain record.
+ return {
+ configPath: configLabel, // configLabel is defined outside this block
+ spec: observability.metadata.spec,
+ target: targetSummary(target),
+ images: {
+ collector: imageReference(observability.images.collector), // images are reported as repository:tag strings
+ traceBackend: imageReference(observability.images.tempo),
+ },
+ collector: observability.collector, // the collector, traceBackend and sampling sections are passed through as-is
+ traceBackend: observability.traceBackend,
+ sampling: observability.sampling,
+ };
+}
+
+function targetSummary(target: ObservabilityTarget): Record { // Pick the target fields that are echoed back in command results.
+ return {
+ id: target.id,
+ route: target.route,
+ namespace: target.namespace,
+ role: target.role,
+ createNamespace: target.createNamespace,
+ };
+}
+
+function policyChecks(yaml: string, target: ObservabilityTarget): Array> { // Check the rendered manifest text against the exposure rules; each entry is { name, ok, detail }.
+ return [
+ { name: "yaml-source-of-truth", ok: true, detail: "All concrete images, routes, namespace, ports, retention and sampling values are read from config/platform-infra/observability.yaml." }, // informational entry: ok is hard-coded true
+ { name: "clusterip-only", ok: !/^\s*type:\s*(NodePort|LoadBalancer)\s*$/mu.test(yaml), detail: "Collector and trace backend stay ClusterIP-only." }, // fails on any line that is exactly type: NodePort or type: LoadBalancer
+ { name: "no-ingress", ok: !/^\s*kind:\s*Ingress\s*$/mu.test(yaml), detail: "No public ingress is rendered for the first tracing backend." }, // fails on any kind: Ingress line
+ { name: "no-host-network", ok: !/^\s*hostNetwork:\s*true\s*$/mu.test(yaml), detail: "Pods must not use host network." }, // fails on any hostNetwork: true line
+ { name: "allow-all-network-policy", ok: yaml.includes("kind: NetworkPolicy") && yaml.includes("name: allow-all") && yaml.includes(`namespace: ${target.namespace}`), detail: `NetworkPolicy/allow-all is rendered in ${target.namespace}.` }, // three independent substring checks over the whole text, not tied to one YAML document
+ ];
+}
+
+function statusSummary(payload: Record): Record { // Reduce the remote status payload to deployment, service, pod and probe summaries.
+ const sections = asRecord(payload.sections, "status.sections"); // presumably throws when sections is not an object — TODO confirm against asRecord
+ const deployments = objectList(sectionJson(sections, "deployments"));
+ const services = objectList(sectionJson(sections, "services"));
+ const pods = objectList(sectionJson(sections, "pods"));
+ const probes = Array.isArray(payload.probes) ? payload.probes as Array> : []; // a missing or non-array probes value is treated as no probes
+ const readyDeployments = deployments.map((item) => deploymentSummary(item));
+ return {
+ ready: payload.ok === true, // mirrors the remote "ok" only; the per-deployment ready flags below are reported but not folded in
+ namespace: payload.namespace,
+ deployments: readyDeployments,
+ services: services.map((item) => metadataName(item)),
+ pods: pods.map((item) => podSummary(item)),
+ probes: probes.map((item) => ({
+ name: item.name,
+ ok: item.ok === true,
+ service: item.service,
+ path: item.path,
+ stderrTail: item.ok === true ? "" : item.stderrTail, // stderr is surfaced only for failed probes
+ })),
+ };
+}
+
+function compactStatus(payload: Record | null, full: boolean): Record | null { // Choose the status view to return: null passthrough, redacted full payload, or summary.
+ if (payload === null) return null; // nothing parsed, so nothing to compact
+ if (full) return redactSensitiveUnknown(payload) as Record; // full mode keeps the whole payload after redaction
+ return statusSummary(payload);
+}
+
+function sectionJson(sections: Record, name: string): unknown { // Return the "json" member of the named status section.
+ const section = asRecord(sections[name], `sections.${name}`); // the second argument labels the value for asRecord; presumably used in its error message — TODO confirm
+ return section.json;
+}
+
+function objectList(value: unknown): Record[] { // Return the plain-object entries of value.items; any other shape yields [].
+ if (typeof value !== "object" || value === null) return [];
+ const items = (value as Record).items;
+ if (!Array.isArray(items)) return [];
+ return items.filter((item): item is Record => typeof item === "object" && item !== null && !Array.isArray(item)); // primitives, nulls and nested arrays are dropped
+}
+
+function deploymentSummary(item: Record): Record { // Summarise one Deployment object as name, desired replicas, available replicas and a ready flag.
+ const spec = item.spec as Record | undefined;
+ const status = item.status as Record | undefined;
+ const replicas = typeof spec?.replicas === "number" ? spec.replicas : null; // null when spec.replicas is absent or not a number
+ const available = typeof status?.availableReplicas === "number" ? status.availableReplicas : 0; // a missing availableReplicas counts as 0
+ return {
+ name: metadataName(item),
+ replicas,
+ availableReplicas: available,
+ ready: replicas !== null && available >= replicas, // an unknown desired count is never reported as ready
+ };
+}
+
+function podSummary(item: Record): Record { // Summarise one Pod object as its name and status.phase.
+ const status = item.status as Record | undefined;
+ return {
+ name: metadataName(item),
+ phase: status?.phase ?? null, // null when the pod carries no status or phase
+ };
+}
+
+function metadataName(item: Record): string | null { // Return item.metadata.name when it is a string, otherwise null.
+ const metadata = item.metadata as Record | undefined;
+ return typeof metadata?.name === "string" ? metadata.name : null;
+}
+
+function manifestObjectSummary(yaml: string): Array> { // List { kind, name } for each document of a multi-document YAML string, using regexes rather than a YAML parser.
+ const docs = yaml.split(/^---$/mu); // documents are separated by a line that is exactly ---
+ return docs.map((doc) => {
+ const kind = doc.match(/^\s*kind:\s*(.+)$/mu)?.[1]?.trim() ?? "unknown"; // first kind: line in the document
+ const name = doc.match(/^\s*name:\s*(.+)$/mu)?.[1]?.trim() ?? "unknown"; // first name: line in the document, at any nesting depth
+ return { kind, name };
+ });
+}
+
+function imageReference(image: ImageSpec): string { // Join repository and tag into a "repository:tag" image reference.
+  return image.repository + ":" + image.tag;
+}
+
+function indent(value: string, spaces: number): string { // Indent every non-empty line of `value` by `spaces` spaces; line breaks are preserved.
+  const prefix = " ".repeat(spaces);
+  return value.split("\n").map((line) => (line === "" ? line : `${prefix}${line}`)).join("\n"); // empty lines stay empty so the rendered block scalars carry no whitespace-only lines
+}
diff --git a/scripts/src/platform-infra.ts b/scripts/src/platform-infra.ts
index 8c8eee2e..edc8a5b2 100644
--- a/scripts/src/platform-infra.ts
+++ b/scripts/src/platform-infra.ts
@@ -262,7 +262,7 @@ interface ManagedResourceCleanupPlan {
export function platformInfraHelp(): unknown {
const target = sub2ApiHelpTargetSummary();
return {
- command: "platform-infra sub2api|langbot|n8n|wechat-archive ...",
+ command: "platform-infra sub2api|langbot|n8n|wechat-archive|observability ...",
output: "json",
usage: [
"bun scripts/cli.ts platform-infra sub2api plan [--target G14|D601]",
@@ -297,8 +297,14 @@ export function platformInfraHelp(): unknown {
"bun scripts/cli.ts platform-infra wechat-archive collector-image-build --confirm",
"bun scripts/cli.ts platform-infra wechat-archive collector-apply --confirm",
"bun scripts/cli.ts platform-infra wechat-archive collector-status --full",
+ "bun scripts/cli.ts platform-infra observability plan --target D601",
+ "bun scripts/cli.ts platform-infra observability apply --target D601 --dry-run",
+ "bun scripts/cli.ts platform-infra observability apply --target D601 --confirm",
+ "bun scripts/cli.ts platform-infra observability status --target D601",
+ "bun scripts/cli.ts platform-infra observability validate --target D601",
+ "bun scripts/cli.ts platform-infra observability trace --target D601 --trace-id ",
],
- description: "Operate YAML-controlled platform-infra services such as Sub2API, LangBot, n8n and WeChat archive workflows. Public services use PK01 Caddy+FRP rather than Kubernetes Ingress, NodePort, or LoadBalancer.",
+ description: "Operate YAML-controlled platform-infra services such as Sub2API, LangBot, n8n, WeChat archive workflows and OpenTelemetry tracing. Public services use PK01 Caddy+FRP rather than Kubernetes Ingress, NodePort, or LoadBalancer.",
target,
codexPool: {
usage: [
@@ -341,6 +347,10 @@ export async function runPlatformInfraCommand(config: UniDeskConfig, args: strin
const { runWechatArchiveCommand } = await import("./platform-infra-wechat-archive");
return await runWechatArchiveCommand(config, args.slice(1));
}
+ if (target === "observability") {
+ const { runPlatformObservabilityCommand } = await import("./platform-infra-observability");
+ return await runPlatformObservabilityCommand(config, args.slice(1));
+ }
if (target !== "sub2api") return unsupported(args);
if (action === "plan" || action === undefined) return plan(parseTargetOptions(args.slice(2)));
if (action === "apply") return await apply(config, parseApplyOptions(args.slice(2)));