diff --git a/AGENTS.md b/AGENTS.md index b1fc26ee..06413e5d 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -30,10 +30,10 @@ UniDesk 是一个以主 server 为统一入口的分布式工作平台;本文 - `bun`:TypeScript 运行时固定使用 Bun,组件入口和 CLI 都直接运行 `.ts` 文件,约束见 `docs/reference/config.md`。 - `docker-compose.yml`:主 server 统一编排 core、frontend、database、本机 provider gateway 和 Todo Note 后端,且只公开 frontend/provider ingress,服务拓扑见 `docs/reference/deployment.md`。 -- `src/components/frontend`:前端源码固定使用 TypeScript + React,`app.tsx` 只做 shell/router,Todo Note、FindJob、Pipeline、MET Nonlinear 等业务页必须拆到独立 TSX 模块,并采用高信息密度工业控制台设计,界面规则见 `docs/reference/frontend.md`。 +- `src/components/frontend`:前端源码固定使用 TypeScript + React,`app.tsx` 只做 shell/router,资源监控含曲线和进程资源排序表,Todo Note、FindJob、Pipeline、MET Nonlinear 等业务页必须拆到独立 TSX 模块,并采用高信息密度工业控制台设计,界面规则见 `docs/reference/frontend.md`。 - `src/components/provider-gateway`:当前主 server `74.48.78.17` 也作为 provider gateway 接入 UniDesk,外部节点通过 `ws://74.48.78.17:18082/ws/provider` 接入,必须以 `restart: always` 部署 always-enabled 远程升级、sleep-and-validate 回滚保护和 Host SSH / WSL SSH 透传并完成自测,部署与 Playwright 公网前端验证方法见 `docs/reference/provider-gateway.md`。 - `microservices`:主 server 本地开发边界固定为只开发 UniDesk frontend;非 UniDesk 核心业务后端、Dockerfile、GPU/训练调试必须在目标计算节点通过 SSH 透传完成,Todo Note 这类明确写入主 server 的例外需单独登记,规则见 `docs/reference/microservices.md`。 -- `docs/reference/e2e.md`:交付前必须执行的自测门禁、Playwright 登录与 JSON 展示断言、数据库命名卷持久化要求。 +- `docs/reference/e2e.md`:交付前必须执行的自测门禁、Playwright 登录、资源监控进程排序、JSON 展示断言和数据库命名卷持久化要求。 ## Architecture Docs diff --git a/TEST.md b/TEST.md index 6942270b..c59ec447 100644 --- a/TEST.md +++ b/TEST.md @@ -54,11 +54,11 @@ ## T13 资源节点任务管理器曲线 -阅读 `AGENTS.md`(本项目 `AGENTS.md` 同时承担 `SKILL.md` 对 `scripts/cli.ts` 的解释职责),然后用 cli 手动测试以下内容:运行 `bun scripts/cli.ts e2e run`,确认 `provider:system-status` 和 `frontend:system-monitor-visible` passed;再用浏览器登录 frontend,进入左侧 `资源节点` 和顶部 `资源监控` 子标签,确认可以像 Windows 任务管理器一样看到 CPU、Memory、Disk 当前用量和历史曲线,Memory 明确显示为不含 Linux page cache / buffer 的实际内存占用,并能执行 `Provider Gateway 升级` 的 `预检升级`。 +阅读 `AGENTS.md`(本项目 `AGENTS.md` 同时承担 `SKILL.md` 对 `scripts/cli.ts` 的解释职责),然后用 cli 手动测试以下内容:运行 `bun scripts/cli.ts e2e run`,确认 `provider:system-status`、`provider:process-resource-status`、`frontend:system-monitor-visible` 和 `frontend:process-resource-sorting` passed;再用浏览器登录 frontend,进入左侧 `资源节点` 和顶部 `资源监控` 子标签,确认可以像 Windows 任务管理器一样看到 CPU、Memory、Disk 当前用量和历史曲线,Memory 明确显示为不含 Linux page cache / buffer 的实际内存占用;确认 `进程资源占用` 表默认按内存 RSS 降序,能点击 CPU、PID、用户、磁盘 I/O 等表头切换排序,且只通过 `查看原始JSON` 查看完整进程快照;最后确认能执行 `Provider Gateway 升级` 的 `预检升级`。 ## T14 Provider Gateway 远程升级 -阅读 `AGENTS.md`(本项目 `AGENTS.md` 同时承担 `SKILL.md` 对 `scripts/cli.ts` 的解释职责),然后用 cli 手动测试以下内容:如果本次变更修改了 `src/components/provider-gateway` 代码或行为,先确认 `src/components/provider-gateway/package.json` 的 `version` 已递增;运行 `bun scripts/cli.ts debug dispatch main-server provider.upgrade`,随后查看任务历史或 `bun scripts/cli.ts debug health`,确认 `provider.upgrade` 通过真实 WebSocket 下发并以 `mode: plan` 成功返回升级计划且计划中包含 `providerId`、`providerName`、`providerGatewayVersion`、`targetProviderGatewayVersion`、`policy: "always-enabled"`、`--no-deps`、`--force-recreate`、`oldGatewaySleepMs`、`promoteOnlyAfterCandidateValidation`、`candidateRestartPolicyAfterPromotion: "always"` 和 `candidateUsesOldContainerEnvironment`;对明确要升级或重建 `provider-gateway` 容器的计算节点,必须再运行 `bun scripts/cli.ts debug dispatch provider.upgrade --mode schedule --wait-ms 15000`,确认任务成功、result 包含 updater 容器信息、候选 gateway 验证后节点重新上线,`providerGatewayVersion` 已上报目标新版本,且最终 provider-gateway 容器 Docker restart policy 是 `always`。在非主 server 的计算节点上,必须使用 `bun scripts/cli.ts --main-server-ip 74.48.78.17 debug dispatch provider.upgrade --mode schedule --wait-ms 15000` 做同一验证,证明该节点能通过公网 frontend remote CLI 自测自动升级,且不需要指定 `--main-server-key`。正式执行计算节点 `provider-gateway` 重建/升级只能通过前端 `资源监控` 的 `执行升级` 或等价的 `provider.upgrade mode=schedule` 显式调度完成,不能通过 `bun scripts/cli.ts ssh ` 或 Host SSH 维护桥同步执行自重建命令,也不能通过 `PROVIDER_UPGRADE_ENABLED` 或等价开关禁用远程升级。 +阅读 `AGENTS.md`(本项目 `AGENTS.md` 同时承担 `SKILL.md` 对 `scripts/cli.ts` 的解释职责),然后用 cli 手动测试以下内容:如果本次变更修改了 `src/components/provider-gateway` 代码或行为,先确认 `src/components/provider-gateway/package.json` 的 `version` 已递增;运行 `bun scripts/cli.ts debug dispatch main-server provider.upgrade`,随后查看任务历史或 `bun scripts/cli.ts debug health`,确认 `provider.upgrade` 通过真实 WebSocket 下发并以 `mode: plan` 成功返回升级计划且计划中包含 `providerId`、`providerName`、`providerGatewayVersion`、`targetProviderGatewayVersion`、`policy: "always-enabled"`、`--no-deps`、`--force-recreate`、`oldGatewaySleepMs`、`promoteOnlyAfterCandidateValidation`、`candidateRestartPolicyAfterPromotion: "always"`、`candidateUsesOldContainerEnvironment` 和 `candidateUsesHostPidNamespace`;对明确要升级或重建 `provider-gateway` 容器的计算节点,必须再运行 `bun scripts/cli.ts debug dispatch provider.upgrade --mode schedule --wait-ms 15000`,确认任务成功、result 包含 updater 容器信息、候选 gateway 验证后节点重新上线,`providerGatewayVersion` 已上报目标新版本,且最终 provider-gateway 容器 Docker restart policy 是 `always` 并使用宿主 PID namespace。在非主 server 的计算节点上,必须使用 `bun scripts/cli.ts --main-server-ip 74.48.78.17 debug dispatch provider.upgrade --mode schedule --wait-ms 15000` 做同一验证,证明该节点能通过公网 frontend remote CLI 自测自动升级,且不需要指定 `--main-server-key`。正式执行计算节点 `provider-gateway` 重建/升级只能通过前端 `资源监控` 的 `执行升级` 或等价的 `provider.upgrade mode=schedule` 显式调度完成,不能通过 `bun scripts/cli.ts ssh ` 或 Host SSH 维护桥同步执行自重建命令,也不能通过 `PROVIDER_UPGRADE_ENABLED` 或等价开关禁用远程升级。 ## T15 待处理任务可追溯 diff --git a/docker-compose.yml b/docker-compose.yml index a09b2656..06629de1 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -126,6 +126,7 @@ services: dockerfile: src/components/provider-gateway/Dockerfile container_name: unidesk-provider-gateway-main restart: always + pid: "host" depends_on: - backend-core environment: diff --git a/docs/reference/deployment.md b/docs/reference/deployment.md index 9978d48a..964dbd82 100644 --- a/docs/reference/deployment.md +++ b/docs/reference/deployment.md @@ -7,7 +7,7 @@ - `database` 使用 `postgres:16-alpine`,数据保存到 named volume `unidesk_pgdata_10gb`,初始化 SQL 位于 `src/components/database/init/`。 - `backend-core` 是无状态核心服务,提供 Docker 内网 REST API、provider ingress WebSocket、任务调度入口和数据库访问层。 - `frontend` 是唯一公开 Web 控制台,提供登录、从 TSX 转译出的 React 应用资产和到 backend-core 的同源代理。 -- `provider-gateway` 是当前主 server 的本机计算节点代理,通过 WebSocket 主动连到 provider ingress,挂载 `/var/run/docker.sock` 作为自动任务执行主路径,并周期性上报系统资源指标与 Docker daemon 状态;维护用 Host SSH / WSL SSH 私钥目录只读挂载到 `/run/host-ssh`,不得作为自动任务调度主路径。 +- `provider-gateway` 是当前主 server 的本机计算节点代理,通过 WebSocket 主动连到 provider ingress,挂载 `/var/run/docker.sock` 作为自动任务执行主路径,使用 `pid: "host"` 读取节点级进程资源,并周期性上报系统资源指标、进程占用与 Docker daemon 状态;维护用 Host SSH / WSL SSH 私钥目录只读挂载到 `/run/host-ssh`,不得作为自动任务调度主路径。 - `todo-note` 是主 server 承载的 Todo Note 纯后端 microservice,容器名 `todo-note-backend`,只在 Compose 内网暴露 `4211/tcp`,使用主 PostgreSQL 存储迁移后的 Todo Note 数据。 ## Public Exposure Boundary diff --git a/docs/reference/e2e.md b/docs/reference/e2e.md index 1eae0b52..283392f1 100644 --- a/docs/reference/e2e.md +++ b/docs/reference/e2e.md @@ -14,11 +14,11 @@ UniDesk delivery is not complete until the public frontend, public provider ingr - Public exposure: Docker port summary must show only frontend and provider ingress host mappings; public core、public database and known private microservice ports such as FindJob `3254`, MET Nonlinear `3288` and Todo Note `4211` probes must fail. - Core API: `docker exec unidesk-backend-core` calls internal `GET /api/overview`, which must report `dbReady: true`, `pgdata.volumeName=unidesk_pgdata_10gb`, a positive PostgreSQL database byte count, and at least one online node. -- Provider self-connection: internal `GET /api/nodes` must contain `main-server` with `status: online`, `labels.providerGatewayVersion` equal to `src/components/provider-gateway/package.json` and `labels.providerGatewayUpgradePolicy: "always-enabled"`; internal `GET /api/nodes/system-status` must contain CPU/memory/disk samples; internal `GET /api/nodes/docker-status` must contain a Docker snapshot for `main-server`; public provider ingress `/health` must return ok. +- Provider self-connection: internal `GET /api/nodes` must contain `main-server` with `status: online`, `labels.providerGatewayVersion` equal to `src/components/provider-gateway/package.json` and `labels.providerGatewayUpgradePolicy: "always-enabled"`; internal `GET /api/nodes/system-status` must contain CPU/memory/disk samples plus a non-empty process resource list sorted by memory by default; internal `GET /api/nodes/docker-status` must contain a Docker snapshot for `main-server`; public provider ingress `/health` must return ok. - Provider remote control: internal `/api/dispatch` must successfully complete a real `provider.upgrade` task in `mode: "plan"` so the upgrade path is validated without recreating the running gateway during E2E. - Microservices: internal `/api/microservices` must include `todo-note` on `main-server` plus `findjob`, `pipeline` and `met-nonlinear` on `D601` with `public=false`; `/api/microservices/todo-note/health` must report `storage=postgres`, `/api/microservices/todo-note/proxy/api/instances` must expose the migrated Todo Note lists, and a temporary Todo Note list create/add/toggle/undo/delete cycle must succeed through the real provider-gateway proxy; `/api/microservices/findjob/health` and `/api/microservices/findjob/proxy/api/summary` must succeed through the real provider-gateway proxy; `/api/microservices/findjob/proxy/api/jobs?__unideskArrayLimit=jobs:5` must return a bounded preview with `_unidesk.arrayLimits` metadata; `/api/microservices/pipeline/health` and `/api/microservices/pipeline/proxy/api/snapshot?__unideskArrayLimit=registry.components:8,runs:3` must return Pipeline health, registry and run previews; `/api/microservices/met-nonlinear/health`, `/api/microservices/met-nonlinear/proxy/api/queue`, `/api/microservices/met-nonlinear/proxy/api/projects?root=projects&limit=500`, `/api/microservices/met-nonlinear/proxy/api/projects?root=ex_projects&limit=500`, `/api/microservices/met-nonlinear/proxy/api/projects/config?path=` and `/api/microservices/met-nonlinear/proxy/api/images` must return the D601 TS backend health, queue/GPU policy, full project tree inputs, structured project detail and ready `met-nonlinear-ml:tf26` image status. - Database: the command writes an `unidesk_e2e_markers` row through `docker exec unidesk-database psql`, confirms provider state is stored in PostgreSQL, and checks Todo Note rows exist in `todo_note_instances` using the same named volume. -- Frontend: Playwright must open the public frontend URL derived from `network.publicHost`, not localhost or a Docker-internal URL; it logs in with the configured account, waits for `核心在线`, asserts that `main-server` and `Main Server Provider` are visible, verifies desktop sidebar collapse and `PGDATA` overview metric, clicks `查看原始JSON` to verify Provider data from the frontend, confirms no raw JSON is visible before that click, opens task history to verify duration and failure diagnostics, opens resource nodes `资源监控` to verify CPU/Memory/Disk curves and provider upgrade precheck dispatch, opens `Docker 状态`, switches to `main-server`, and verifies the Docker Desktop-style container view including the database named volume `unidesk_pgdata_10gb`, opens `网关版本` and verifies the provider-gateway version, SSH 透传可用性、远程更新可用性 plus structured automatic update records for `provider.upgrade`, then opens `微服务 / 服务目录`、`微服务 / Todo Note`、`微服务 / FindJob`、`微服务 / Pipeline` and `微服务 / MET Nonlinear` to verify 主 server Todo Note、D601、仓库引用、私有后端映射、Todo Note 迁移清单和树形任务、FindJob 指标和岗位预览、Pipeline 组件矩阵、React Flow 控制图和最近运行、MET Nonlinear 项目库/Fork/待启动队列/当前队列/已完成/失败诊断/GPU/镜像都通过 React 控件展示。Task history and provider upgrade records must not display a real sub-second duration as `0s`; MET Nonlinear running rows must show an ETA derived from backend progress or from `startedAt` plus epoch progress, and queue/completed rows must show training speed as `epoch/h`. +- Frontend: Playwright must open the public frontend URL derived from `network.publicHost`, not localhost or a Docker-internal URL; it logs in with the configured account, waits for `核心在线`, asserts that `main-server` and `Main Server Provider` are visible, verifies desktop sidebar collapse and `PGDATA` overview metric, clicks `查看原始JSON` to verify Provider data from the frontend, confirms no raw JSON is visible before that click, opens task history to verify duration and failure diagnostics, opens resource nodes `资源监控` to verify CPU/Memory/Disk curves, the structured process resource table, default memory-desc sorting, sortable CPU column and provider upgrade precheck dispatch, opens `Docker 状态`, switches to `main-server`, and verifies the Docker Desktop-style container view including the database named volume `unidesk_pgdata_10gb`, opens `网关版本` and verifies the provider-gateway version, SSH 透传可用性、远程更新可用性 plus structured automatic update records for `provider.upgrade`, then opens `微服务 / 服务目录`、`微服务 / Todo Note`、`微服务 / FindJob`、`微服务 / Pipeline` and `微服务 / MET Nonlinear` to verify 主 server Todo Note、D601、仓库引用、私有后端映射、Todo Note 迁移清单和树形任务、FindJob 指标和岗位预览、Pipeline 组件矩阵、React Flow 控制图和最近运行、MET Nonlinear 项目库/Fork/待启动队列/当前队列/已完成/失败诊断/GPU/镜像都通过 React 控件展示。Task history and provider upgrade records must not display a real sub-second duration as `0s`; MET Nonlinear running rows must show an ETA derived from backend progress or from `startedAt` plus epoch progress, and queue/completed rows must show training speed as `epoch/h`. - Microservice frontend assertions must wait for real backend data, not only the page skeleton. For Todo Note this means the page must show the migrated lists `CONSTAR`、`大论文`、`找工作`、`小论文`、`事务`, support creating a temporary list and task through the frontend, and delete that temporary list afterwards. The temporary list must be selected again by its unique generated name before deletion so E2E never deletes a migrated source list by accident. For FindJob this means the page must show a numeric `岗位总量`, `HEALTH OK`, and a non-empty `PREVIEW` count such as `40/1463 PREVIEW`; for Pipeline this means the page must show `Pipeline v2 工作台`, `Health OK`, a numeric component count, a non-empty React Flow control graph, `控制图`, and `最近运行`; for MET Nonlinear this means the page must show `MET Nonlinear 训练编排`, `Health OK`, `Fork Project`, `加入待启动队列`, `启动队列`, `当前队列`, 最大并发设置、task queue and GPU/image panels, and must not show the removed hard-coded `创建10个10轮任务` frontend entry. The MET Nonlinear project library must render `projects/` and `ex_projects/` as a true path tree with folder Project counts; clicking a project row must open a structured detail panel containing `config.json`, `data/ 训练状态`, `模型参数`, `指标` and a parameter count such as `Total Params`; clicking a completed/current/failed job row must open a structured job detail and both the row and detail must show `epoch/h`. Full MET Nonlinear acceptance is driven by public frontend controls: choose a visible source Project, set batch size, epochs and max concurrency in inputs, fork into `projects/unidesk_forks/`, stage the selected forks, start the queue, and verify completed rows plus automatic `metnl-train-*` container removal; loading placeholders like `--` or empty states are not sufficient for E2E success. ## Frontend JSON Rule diff --git a/docs/reference/frontend.md b/docs/reference/frontend.md index 62780365..2e3f9dbc 100644 --- a/docs/reference/frontend.md +++ b/docs/reference/frontend.md @@ -22,7 +22,7 @@ frontend 应用源码必须使用 TypeScript + React,禁止在 `src/components ## Resource Node Monitor View -资源节点模块必须提供 `资源监控` 子标签,用类似 Windows 任务管理器的性能页展示每个 provider 节点的 CPU、内存和硬盘用量历史曲线。该页面应包含节点切换、当前用量摘要、CPU/Memory/Disk 三条曲线、采样说明和 `Provider Gateway 升级` 控制区;曲线数据来自 backend-core 的 `/api/nodes/system-status`,不得在页面默认展示原始 JSON。内存曲线必须使用实际内存口径,不把 Linux page cache / buffer 计入占用。 +资源节点模块必须提供 `资源监控` 子标签,用类似 Windows 任务管理器的性能页展示每个 provider 节点的 CPU、内存和硬盘用量历史曲线。该页面应包含节点切换、当前用量摘要、CPU/Memory/Disk 三条曲线、进程资源占用表、采样说明和 `Provider Gateway 升级` 控制区;曲线和进程表数据来自 backend-core 的 `/api/nodes/system-status`,不得在页面默认展示原始 JSON。内存曲线必须使用实际内存口径,不把 Linux page cache / buffer 计入占用。进程资源占用表必须用 React 表格控件展示进程名、命令摘要、PID、用户、状态、CPU、内存、RSS、磁盘 I/O、线程和运行时长,并支持按列排序;默认排序必须是内存 RSS 降序,表头要能明确显示当前排序方向。完整进程快照只能通过 `查看原始JSON` 显式查看。 ## Resource Node Docker View diff --git a/docs/reference/provider-gateway.md b/docs/reference/provider-gateway.md index f3dea40c..d559c2fe 100644 --- a/docs/reference/provider-gateway.md +++ b/docs/reference/provider-gateway.md @@ -20,7 +20,7 @@ Provider Gateway 是计算节点侧容器。它只主动连出到主 server 暴 当前主 server 公网 IP 是 `74.48.78.17`,`config.json` 中的 `network.publicHost` 必须保持为该地址;公网 frontend 入口是 `http://74.48.78.17:18081/`,provider gateway 对外接入入口是 `ws://74.48.78.17:18082/ws/provider`,provider ingress 健康检查是 `http://74.48.78.17:18082/health`。主 server 本机 provider 由根目录 `docker-compose.yml` 的 `provider-gateway` 服务启动,容器内使用 Docker 内网地址 `ws://backend-core:8081/ws/provider` 自接入;外部计算节点部署 provider-gateway 时必须改用公网 provider ingress URL,并复用 `config.json` / `.state/docker-compose.env` 中的 provider token、心跳间隔和重连参数。 -计算节点部署 provider-gateway 的最小方法是:准备可运行 `unidesk_provider-gateway` 镜像的 Docker 环境,为节点分配唯一 `PROVIDER_ID` 与可读 `PROVIDER_NAME`,设置 `PROVIDER_SERVER_URL=ws://74.48.78.17:18082/ws/provider`、`PROVIDER_TOKEN`、`PROVIDER_LABELS_JSON`、`HEARTBEAT_INTERVAL_MS`、`RECONNECT_BASE_MS` 和 `RECONNECT_MAX_MS`,并挂载 `/var/run/docker.sock:/var/run/docker.sock` 作为 Docker 状态采集、任务执行和远程升级的唯一自动化通道。所有长期接入节点都必须配置 `PROVIDER_UPGRADE_*` 环境变量,把节点上的 UniDesk 仓库只读挂载到 `PROVIDER_UPGRADE_WORKSPACE_PATH`,并确保升级命令只重建 `provider-gateway` service,不影响 database、backend-core、frontend。provider-gateway 容器必须使用 Docker restart policy `always`,Compose 写法是 `restart: always`,`docker run` 写法是 `--restart always`。provider-gateway 部署必须同时交付 Host SSH / WSL SSH 透传维护桥;WSL 节点应设置 `HOST_SSH_HOST=host.docker.internal`、`HOST_SSH_PORT=22`、`HOST_SSH_USER=`、`HOST_SSH_KEY=/run/host-ssh/id_ed25519`、`HOST_REMOTE_CWD=/home/`,并把只含维护私钥的宿主目录只读挂载到 `/run/host-ssh`。 +计算节点部署 provider-gateway 的最小方法是:准备可运行 `unidesk_provider-gateway` 镜像的 Docker 环境,为节点分配唯一 `PROVIDER_ID` 与可读 `PROVIDER_NAME`,设置 `PROVIDER_SERVER_URL=ws://74.48.78.17:18082/ws/provider`、`PROVIDER_TOKEN`、`PROVIDER_LABELS_JSON`、`HEARTBEAT_INTERVAL_MS`、`RECONNECT_BASE_MS` 和 `RECONNECT_MAX_MS`,并挂载 `/var/run/docker.sock:/var/run/docker.sock` 作为 Docker 状态采集、任务执行和远程升级的唯一自动化通道。为了让 `资源监控` 能看到节点级进程占用,provider-gateway 容器还必须运行在宿主 PID namespace:Compose 写法是 `pid: "host"`,`docker run` 写法是 `--pid host`;缺少该配置时只能看到 provider 容器命名空间内的进程,不能视为完整节点资源监控。所有长期接入节点都必须配置 `PROVIDER_UPGRADE_*` 环境变量,把节点上的 UniDesk 仓库只读挂载到 `PROVIDER_UPGRADE_WORKSPACE_PATH`,并确保升级命令只重建 `provider-gateway` service,不影响 database、backend-core、frontend。provider-gateway 容器必须使用 Docker restart policy `always`,Compose 写法是 `restart: always`,`docker run` 写法是 `--restart always`。provider-gateway 部署必须同时交付 Host SSH / WSL SSH 透传维护桥;WSL 节点应设置 `HOST_SSH_HOST=host.docker.internal`、`HOST_SSH_PORT=22`、`HOST_SSH_USER=`、`HOST_SSH_KEY=/run/host-ssh/id_ed25519`、`HOST_REMOTE_CWD=/home/`,并把只含维护私钥的宿主目录只读挂载到 `/run/host-ssh`。 ## Mandatory SSH Passthrough Bundle @@ -38,7 +38,7 @@ WSL 节点应优先使用 WSL 内部原生 Docker Engine 和 `/var/run/docker.so WSL provider 的最小环境文件应放在节点本地私有路径,例如 `/home/ubuntu/unidesk/.state/provider-.env`,并由 `docker run --env-file` 读取。`PROVIDER_LABELS_JSON` 在 Docker env-file 中可以写成单行 JSON;如果临时用 shell `source` 方式调试,必须对整段 JSON 加引号,否则 shell 会按 `{}` 和逗号拆分导致 JSON 解析失败。WSL 节点建议至少包含这些 labels:`host`、`role=wsl-provider`、`wsl=true`、`distro`、`docker=true`;运行时 provider-gateway 会自动追加 `runtime`、`dockerSocketPresent` 和 `gatewayUptimeSeconds`。`.state/provider-.env`、`logs/provider-/` 和容器日志属于节点本地运行态,必须保持在 `.gitignore` 覆盖范围内,不能提交 provider token、登录态或运行日志。 -长期运行推荐用 systemd 管理 provider-gateway 容器,而不是只在交互 shell 中运行 Bun 进程。systemd unit 的稳定形态是:`ExecStartPre=-docker rm -f unidesk-provider-gateway-` 清理同名旧容器,`ExecStart=docker run --restart always --name unidesk-provider-gateway- --env-file ... -v /var/run/docker.sock:/var/run/docker.sock -v /home/ubuntu/unidesk:/workspace:ro -v /home/ubuntu/unidesk/logs/provider-:/var/log/unidesk -v :/run/host-ssh:ro unidesk_provider-gateway:`,`ExecStop=docker stop unidesk-provider-gateway-`,并设置 `Restart=always`。临时部署也必须使用 `docker run -d --restart always`,并保证容器名、env 文件、日志目录、SSH 私钥只读挂载和镜像 tag 都带上节点 ID,便于 frontend、Docker 状态、SSH 透传和本地排障互相对应。`provider.upgrade` 是长期接入节点的必备能力,provider-gateway 不提供 `PROVIDER_UPGRADE_ENABLED` 或等价禁用开关;如果节点缺少升级环境变量或 SSH 透传环境变量,必须修正节点部署,而不是在服务端接受只能预检、不能升级或不能维护透传的半成品状态。 +长期运行推荐用 systemd 管理 provider-gateway 容器,而不是只在交互 shell 中运行 Bun 进程。systemd unit 的稳定形态是:`ExecStartPre=-docker rm -f unidesk-provider-gateway-` 清理同名旧容器,`ExecStart=docker run --restart always --pid host --name unidesk-provider-gateway- --env-file ... -v /var/run/docker.sock:/var/run/docker.sock -v /home/ubuntu/unidesk:/workspace:ro -v /home/ubuntu/unidesk/logs/provider-:/var/log/unidesk -v :/run/host-ssh:ro unidesk_provider-gateway:`,`ExecStop=docker stop unidesk-provider-gateway-`,并设置 `Restart=always`。临时部署也必须使用 `docker run -d --restart always --pid host`,并保证容器名、env 文件、日志目录、SSH 私钥只读挂载和镜像 tag 都带上节点 ID,便于 frontend、Docker 状态、SSH 透传、进程资源表和本地排障互相对应。`provider.upgrade` 是长期接入节点的必备能力,provider-gateway 不提供 `PROVIDER_UPGRADE_ENABLED` 或等价禁用开关;如果节点缺少升级环境变量或 SSH 透传环境变量,必须修正节点部署,而不是在服务端接受只能预检、不能升级或不能维护透传的半成品状态。 WSL 本身会在没有前台进程时被 Windows 回收;如果该节点要作为长期在线算力,必须通过 Windows 启动项、计划任务或后台 `wsl.exe -d -u root -- bash -lc "systemctl start docker unidesk-provider-gateway-.service; exec sleep infinity"` 这类 keepalive 进程保持发行版运行。仅启用 WSL 内 systemd service 不等价于 Windows 层面的常驻守护。 @@ -98,7 +98,7 @@ provider-gateway 连接成功后必须周期性上报 Docker daemon 状态,数 ## System Status Telemetry -provider-gateway 连接成功后必须周期性上报节点 CPU、内存和硬盘用量。采集来源是节点本地 `/proc/stat`、`/proc/loadavg`、`/proc/meminfo` 与 `df -PB1`,backend-core 将最新快照保存到 `unidesk_node_system_status`,并将历史采样保存到 `unidesk_node_metric_samples` 供 frontend 绘制任务管理器风格曲线。内存使用量采用实际占用口径:`MemTotal - MemFree - Buffers - Cached - SReclaimable + Shmem`,也就是不把 Linux page cache / buffer 计入占用;上报中同时保留 `cacheBytes` 便于排查。该链路仍然由 provider 主动上报,主 server 不反向探测计算节点。 +provider-gateway 连接成功后必须周期性上报节点 CPU、内存、硬盘和进程资源占用。整体采集来源是节点本地 `/proc/stat`、`/proc/loadavg`、`/proc/meminfo` 与 `df -PB1`,进程表来源是 `/proc/[pid]/stat`、`/proc/[pid]/status`、`/proc/[pid]/cmdline` 和 `/proc/[pid]/io`;backend-core 将最新快照保存到 `unidesk_node_system_status`,并将历史采样保存到 `unidesk_node_metric_samples` 供 frontend 绘制任务管理器风格曲线。内存使用量采用实际占用口径:`MemTotal - MemFree - Buffers - Cached - SReclaimable + Shmem`,也就是不把 Linux page cache / buffer 计入占用;上报中同时保留 `cacheBytes` 便于排查。进程表的 `rssBytes` 是实际常驻内存,默认按 `rssBytes` 降序截取前 120 个进程;`cpuPercent` 使用相邻采样 CPU tick 差值,首个采样用进程生命周期平均值兜底;磁盘 I/O 速率使用相邻 `/proc/[pid]/io` 的 `read_bytes/write_bytes` 差值。该链路仍然由 provider 主动上报,主 server 不反向探测计算节点。 ## Remote Provider Upgrade @@ -106,7 +106,7 @@ backend-core 可以通过真实 WebSocket 调度向在线 provider 下发 `provi 远程升级策略固定为 always-enabled:只要 provider-gateway 在线并声明 `provider.upgrade`,`mode: "schedule"` 就必须真正调度升级容器,不允许被 `PROVIDER_UPGRADE_ENABLED=false`、前端隐藏按钮或服务端特殊名单禁用。升级能力的安全边界不是开关,而是显式 `PROVIDER_UPGRADE_*` 配置、Docker socket 权限、只读仓库挂载、固定 Compose service 和 `--no-deps` 约束。升级计划中必须展示 `policy: "always-enabled"`、updater 容器名、runner image、workspace、Compose project/service、env file、compose file 和实际 `docker run` 命令,方便前端任务历史与 CLI debug 直接诊断。 -`mode: "schedule"` 的成功返回只代表 updater 已被调度,最终升级成败由候选 gateway 自验证决定。updater 必须先按 Compose 构建新镜像,再用旧容器的 `Config.Env` 生成候选 env-file,并复用旧容器的 Docker socket、日志目录、SSH 私钥只读挂载、Compose 网络和 `extra_hosts`;候选容器启动时 restart policy 必须先是 `no`,验证通过后才能改成 `always` 并删除旧容器。升级计划的 `replacementStrategy` 必须包含 `oldGatewaySleepMs`、`validationTimeoutMs`、`promoteOnlyAfterCandidateValidation`、`candidateRestartPolicyAfterPromotion: "always"`、`candidateUsesOldContainerEnvironment`、`candidateUsesOldContainerMounts`、`candidateUsesOldContainerNetworks` 和 `candidateUsesOldContainerExtraHosts`,并且必须在 plan 中显示指定 Provider 的当前/目标 gateway 版本号,便于前端和 CLI 判断这不是旧的先删旧容器再 up 的危险流程。 +`mode: "schedule"` 的成功返回只代表 updater 已被调度,最终升级成败由候选 gateway 自验证决定。updater 必须先按 Compose 构建新镜像,再用旧容器的 `Config.Env` 生成候选 env-file,并复用旧容器的 Docker socket、日志目录、SSH 私钥只读挂载、Compose 网络和 `extra_hosts`;候选容器启动时 restart policy 必须先是 `no`,并显式使用 `--pid host` 保持节点级进程资源采集,验证通过后才能改成 `always` 并删除旧容器。升级计划的 `replacementStrategy` 必须包含 `oldGatewaySleepMs`、`validationTimeoutMs`、`promoteOnlyAfterCandidateValidation`、`candidateRestartPolicyAfterPromotion: "always"`、`candidateUsesOldContainerEnvironment`、`candidateUsesOldContainerMounts`、`candidateUsesOldContainerNetworks`、`candidateUsesOldContainerExtraHosts` 和 `candidateUsesHostPidNamespace`,并且必须在 plan 中显示指定 Provider 的当前/目标 gateway 版本号,便于前端和 CLI 判断这不是旧的先删旧容器再 up 的危险流程。 自动更新记录的权威来源是 backend-core 保存的 `provider.upgrade` 任务历史,而不是 provider-gateway 容器日志文件。frontend 必须按 Provider 聚合这些任务,并把状态、模式、task id、来源、耗时、策略、updater 容器摘要、失败原因和更新时间渲染为表格或卡片;完整 task/result JSON 只能由操作员点击 `查看原始JSON` 后查看。 @@ -116,7 +116,7 @@ backend-core 可以通过真实 WebSocket 调度向在线 provider 下发 `provi 手动升级只用于把旧节点 bootstrap 到支持 always-enabled 远程升级的版本;bootstrap 完成后,常规重建/升级必须回到 `provider.upgrade mode=schedule`,不得再用 SSH 透传同步重建 `provider-gateway`。节点侧维护步骤是:进入节点本地 UniDesk 仓库,执行 `git pull --ff-only` 获取主 server 已推送版本;确认 `.state/provider-.env` 中存在 `PROVIDER_SERVER_URL=ws://74.48.78.17:18082/ws/provider`、`PROVIDER_ID=`、`PROVIDER_NAME=`、`PROVIDER_TOKEN`、`PROVIDER_LABELS_JSON`、`PROVIDER_UPGRADE_HOST_PROJECT_ROOT=/home/ubuntu/unidesk`、`PROVIDER_UPGRADE_WORKSPACE_PATH=/workspace`、`PROVIDER_UPGRADE_COMPOSE_FILE`、`PROVIDER_UPGRADE_ENV_FILE`、`PROVIDER_UPGRADE_COMPOSE_PROJECT`、`PROVIDER_UPGRADE_SERVICE=provider-gateway`、`PROVIDER_UPGRADE_RUNNER_IMAGE=unidesk_provider-gateway:`、`DOCKER_SOCKET_PATH=/var/run/docker.sock`、`MONITOR_DISK_PATH=/`、心跳和重连参数。旧 env 文件中如果还残留 `PROVIDER_UPGRADE_ENABLED`,新版 provider-gateway 会忽略它;长期文档和新部署不得再依赖这个键。 -如果节点已有专用 Compose,优先用节点本地 Compose 手动重建一次:`docker compose --env-file .state/provider-.env -f -p up -d --no-deps --build --force-recreate provider-gateway`。这条命令必须在节点本地终端、节点自有 Web terminal、系统计划任务或 detached shell 中执行;不得通过正在被重建的 UniDesk provider-gateway 自己提供的 SSH 透传同步执行,否则旧 provider 容器停止时会切断 SSH client,可能导致重建中断在旧容器已停、新容器未起的状态。若只能通过 UniDesk 触达该节点,必须使用 `provider.upgrade mode=schedule` 的 detached updater,或先用节点本地 `nohup`/systemd 启动一个不依赖当前 provider 容器生命周期的重建脚本。老版 `docker-compose` 可能在重建已存在容器时因为 `ContainerConfig` 兼容问题失败;此时只能移除目标 provider-gateway 容器后重新 `up -d --no-deps provider-gateway`,不得执行 `down -v`、`docker volume rm` 或任何会影响 database 命名卷的命令。如果节点当前只有 `docker run` 部署,则先构建镜像 `docker build -f src/components/provider-gateway/Dockerfile -t unidesk_provider-gateway: .`,再以固定容器名重建:挂载 `/var/run/docker.sock:/var/run/docker.sock`、`/home/ubuntu/unidesk:/workspace:ro`、节点日志目录到 `/var/log/unidesk`,如需 WSL SSH 维护桥还要把只读私钥目录挂载到 `/run/host-ssh`,并使用同一个 `.state/provider-.env` 启动。无论 Compose 还是 `docker run`,容器名和镜像 tag 都必须带 Provider ID,便于 Docker 状态页、任务历史和节点本地排障互相对应。 +如果节点已有专用 Compose,优先用节点本地 Compose 手动重建一次:`docker compose --env-file .state/provider-.env -f -p up -d --no-deps --build --force-recreate provider-gateway`。这条命令必须在节点本地终端、节点自有 Web terminal、系统计划任务或 detached shell 中执行;不得通过正在被重建的 UniDesk provider-gateway 自己提供的 SSH 透传同步执行,否则旧 provider 容器停止时会切断 SSH client,可能导致重建中断在旧容器已停、新容器未起的状态。若只能通过 UniDesk 触达该节点,必须使用 `provider.upgrade mode=schedule` 的 detached updater,或先用节点本地 `nohup`/systemd 启动一个不依赖当前 provider 容器生命周期的重建脚本。老版 `docker-compose` 可能在重建已存在容器时因为 `ContainerConfig` 兼容问题失败;此时只能移除目标 provider-gateway 容器后重新 `up -d --no-deps provider-gateway`,不得执行 `down -v`、`docker volume rm` 或任何会影响 database 命名卷的命令。如果节点当前只有 `docker run` 部署,则先构建镜像 `docker build -f src/components/provider-gateway/Dockerfile -t unidesk_provider-gateway: .`,再以固定容器名重建:使用 `--restart always --pid host`,挂载 `/var/run/docker.sock:/var/run/docker.sock`、`/home/ubuntu/unidesk:/workspace:ro`、节点日志目录到 `/var/log/unidesk`,如需 WSL SSH 维护桥还要把只读私钥目录挂载到 `/run/host-ssh`,并使用同一个 `.state/provider-.env` 启动。无论 Compose 还是 `docker run`,容器名和镜像 tag 都必须带 Provider ID,便于 Docker 状态页、进程资源表、任务历史和节点本地排障互相对应。 手动升级完成后的判定标准固定为主 server 可观测结果,而不是节点容器 `running`:访问公网 frontend `http://74.48.78.17:18081/`,确认该 Provider 在线;随后在任意装有本仓库且 `config.json` 含正确 frontend 登录凭据的计算节点上执行 `bun scripts/cli.ts --main-server-ip 74.48.78.17 debug dispatch provider.upgrade --mode schedule --wait-ms 15000`,确认任务 `succeeded` 且 result 包含 updater 容器信息;最后再次查看 frontend 或执行 `bun scripts/cli.ts --main-server-ip 74.48.78.17 debug health`,确认节点重连、指标恢复、labels 中 `host.ssh` 能力存在。每个 provider-gateway 手动升级后都必须用 remote CLI 再执行 `bun scripts/cli.ts --main-server-ip 74.48.78.17 debug dispatch host.ssh --wait-ms 15000` 和 `bun scripts/cli.ts --main-server-ip 74.48.78.17 ssh hostname`,验证维护桥没有在升级后丢失;该 remote CLI 默认走公网 frontend,不需要指定 `--main-server-key`。 diff --git a/scripts/src/e2e.ts b/scripts/src/e2e.ts index 0051f652..02b57128 100644 --- a/scripts/src/e2e.ts +++ b/scripts/src/e2e.ts @@ -277,8 +277,10 @@ async function serviceChecks(config: UniDeskConfig, urls: PublicUrls, checks: E2 const nodeList = (coreNodes as { body?: { nodes?: Array<{ providerId?: string; status?: string; labels?: Record }> } }).body?.nodes ?? []; const mainNode = nodeList.find((node) => node.providerId === config.providerGateway.id); const expectedGatewayVersion = providerGatewayPackageVersion(); - const systemStatuses = (systemStatus as { body?: { systemStatuses?: Array<{ providerId?: string; current?: { cpu?: { percent?: number }; memory?: { percent?: number; mode?: string; cacheBytes?: number }; disk?: { percent?: number } }; history?: unknown[] }> } }).body?.systemStatuses ?? []; + const systemStatuses = (systemStatus as { body?: { systemStatuses?: Array<{ providerId?: string; current?: { cpu?: { percent?: number }; memory?: { percent?: number; mode?: string; cacheBytes?: number }; disk?: { percent?: number }; processes?: Array<{ pid?: number; rssBytes?: number; cpuPercent?: number; command?: string }>; processSummary?: { defaultSort?: string; visible?: number; total?: number } }; history?: unknown[] }> } }).body?.systemStatuses ?? []; const mainSystem = systemStatuses.find((item) => item.providerId === config.providerGateway.id); + const mainProcesses = mainSystem?.current?.processes ?? []; + const processMemoryDescending = mainProcesses.length < 2 || mainProcesses.every((row, index, rows) => index === 0 || Number(rows[index - 1]?.rssBytes ?? 0) >= Number(row.rssBytes ?? 0)); const dockerStatuses = (dockerStatus as { body?: { dockerStatuses?: Array<{ providerId?: string; dockerStatus?: { counts?: { containers?: number }; containers?: unknown[] } }> } }).body?.dockerStatuses ?? []; const mainDocker = dockerStatuses.find((item) => item.providerId === config.providerGateway.id); addCheck(checks, "core:internal-overview", (coreOverview as { ok?: boolean }).ok === true && overviewBody?.ok === true && overviewBody.dbReady === true && (overviewBody.onlineNodeCount ?? 0) >= 1, coreOverview); @@ -286,6 +288,7 @@ async function serviceChecks(config: UniDeskConfig, urls: PublicUrls, checks: E2 addCheck(checks, "provider:self-node-online", nodeList.some((node) => node.providerId === config.providerGateway.id && node.status === "online"), coreNodes); addCheck(checks, "provider:gateway-version-label", mainNode?.labels?.providerGatewayVersion === expectedGatewayVersion && mainNode?.labels?.providerGatewayUpgradePolicy === "always-enabled", { providerId: config.providerGateway.id, expectedGatewayVersion, labels: mainNode?.labels ?? null }); addCheck(checks, "provider:system-status", (systemStatus as { ok?: boolean }).ok === true && mainSystem?.current !== undefined && Number.isFinite(mainSystem.current.cpu?.percent) && Number.isFinite(mainSystem.current.memory?.percent) && mainSystem.current.memory?.mode === "actual_without_cache" && Number.isFinite(mainSystem.current.memory?.cacheBytes) && Number.isFinite(mainSystem.current.disk?.percent) && (mainSystem.history?.length ?? 0) > 0, systemStatusCheckDetail(systemStatus, config.providerGateway.id)); + addCheck(checks, "provider:process-resource-status", mainProcesses.length > 0 && mainSystem?.current?.processSummary?.defaultSort === "memory_desc" && processMemoryDescending && mainProcesses.some((row) => Number.isFinite(row.pid) && Number.isFinite(row.rssBytes) && Number.isFinite(row.cpuPercent) && typeof row.command === "string"), { providerId: config.providerGateway.id, processSummary: mainSystem?.current?.processSummary, sample: mainProcesses.slice(0, 5) }); addCheck(checks, "provider:docker-status", (dockerStatus as { ok?: boolean }).ok === true && mainDocker?.dockerStatus !== undefined && ((mainDocker.dockerStatus.counts?.containers ?? 0) > 0 || (mainDocker.dockerStatus.containers?.length ?? 0) > 0), dockerStatusCheckDetail(dockerStatus, config.providerGateway.id)); const microserviceList = (microservices as { body?: { microservices?: Array<{ id?: string; providerId?: string; backend?: { public?: boolean }; runtime?: { providerStatus?: string; container?: { name?: string; state?: string } } }> } }).body?.microservices ?? []; const findjob = microserviceList.find((service) => service.id === "findjob"); @@ -484,11 +487,21 @@ async function frontendCheck(config: UniDeskConfig, urls: PublicUrls, checks: E2 await page.waitForSelector('[data-testid="metric-chart-cpu"]', { timeout: 10000 }); await page.waitForSelector('[data-testid="metric-chart-memory"]', { timeout: 10000 }); await page.waitForSelector('[data-testid="metric-chart-disk"]', { timeout: 10000 }); + await page.waitForSelector('[data-testid="process-resource-table"]', { timeout: 10000 }); await page.waitForFunction(() => { const text = document.body.innerText.toLowerCase(); - return text.includes("任务管理器视图") && text.includes("cpu") && text.includes("memory") && text.includes("disk") && text.includes("不含缓存"); + return text.includes("任务管理器视图") && text.includes("cpu") && text.includes("memory") && text.includes("disk") && text.includes("不含缓存") && text.includes("进程资源占用"); }, undefined, { timeout: 10000 }); const monitorText = await page.locator('[data-testid="node-monitor-page"]').innerText({ timeout: 5000 }); + const processTableText = await page.locator('[data-testid="process-resource-table"]').innerText({ timeout: 5000 }); + const processMemoryValues = await page.locator('[data-testid="process-resource-table"] tbody tr').evaluateAll((rows) => rows.map((row) => Number((row as HTMLElement).dataset.memoryBytes || "0"))); + const processDefaultMemoryDescending = processMemoryValues.length > 0 && processMemoryValues.every((value, index, rows) => index === 0 || rows[index - 1] >= value); + const processMemorySortAria = await page.getByTestId("process-sort-memory").evaluate((element) => element.closest("th")?.getAttribute("aria-sort") || ""); + await page.getByTestId("process-sort-cpu").click(); + await page.waitForFunction(() => document.querySelector('[data-testid="process-sort-cpu"]')?.closest("th")?.getAttribute("aria-sort") === "descending", undefined, { timeout: 5000 }); + const processCpuValues = await page.locator('[data-testid="process-resource-table"] tbody tr').evaluateAll((rows) => rows.map((row) => Number((row as HTMLElement).dataset.cpuPercent || "0"))); + const processCpuDescending = processCpuValues.length > 0 && processCpuValues.every((value, index, rows) => index === 0 || rows[index - 1] >= value); + const processCpuSortAria = await page.getByTestId("process-sort-cpu").evaluate((element) => element.closest("th")?.getAttribute("aria-sort") || ""); await page.getByTestId("upgrade-plan-button").click(); await page.waitForFunction(() => document.body.innerText.includes("预检升级 已下发"), undefined, { timeout: 10000 }); const upgradeControlText = await page.locator('[data-testid="provider-upgrade-control"]').innerText({ timeout: 5000 }); @@ -649,7 +662,8 @@ async function frontendCheck(config: UniDeskConfig, urls: PublicUrls, checks: E2 addCheck(checks, "frontend:task-history-diagnostics", taskHistoryText.includes("任务耗时") && taskHistoryText.includes("诊断信息") && taskHistoryText.includes("失败原因") && taskHistoryText.includes("e2e forced failure for diagnostics"), { taskHistoryPreview: taskHistoryText.slice(0, 900) }); addCheck(checks, "frontend:no-naked-json-before-click", rawBlocksBefore === 0 && !nakedJsonText, { rawBlocksBefore, nakedJsonText }); addCheck(checks, "frontend:raw-json-explicit-button", rawText.includes('"providerId"') && rawText.includes(config.providerGateway.id), { rawTextPreview: rawText.slice(0, 400) }); - addCheck(checks, "frontend:system-monitor-visible", monitorText.includes("任务管理器视图") && monitorText.includes("CPU") && monitorText.includes("Memory") && monitorText.includes("Disk") && monitorText.includes("不含缓存"), { monitorTextPreview: monitorText.slice(0, 800) }); + addCheck(checks, "frontend:system-monitor-visible", monitorText.includes("任务管理器视图") && monitorText.includes("CPU") && monitorText.includes("Memory") && monitorText.includes("Disk") && monitorText.includes("不含缓存") && monitorText.includes("进程资源占用"), { monitorTextPreview: monitorText.slice(0, 1000) }); + addCheck(checks, "frontend:process-resource-sorting", processTableText.includes("进程") && processTableText.includes("PID") && processTableText.includes("CPU") && processTableText.includes("内存") && processTableText.includes("磁盘 I/O") && processMemorySortAria === "descending" && processDefaultMemoryDescending && processCpuSortAria === "descending" && processCpuDescending, { processMemorySortAria, processCpuSortAria, processMemoryValues: processMemoryValues.slice(0, 12), processCpuValues: processCpuValues.slice(0, 12), processTablePreview: processTableText.slice(0, 1000) }); addCheck(checks, "frontend:upgrade-plan-dispatch", upgradeControlText.includes("预检升级 已下发") && upgradeControlText.includes("指定 Provider") && upgradeControlText.includes(`v${providerGatewayPackageVersion()}`), { providerId: config.providerGateway.id, upgradeControlPreview: upgradeControlText.slice(0, 500) }); addCheck(checks, "frontend:docker-status-visible", dockerText.toLowerCase().includes("docker desktop 视图") && dockerText.toLowerCase().includes("containers") && dockerText.includes("unidesk_pgdata_10gb") && (dockerText.includes("unidesk-frontend") || dockerText.includes("unidesk-backend-core")), { dockerTextPreview: dockerText.slice(0, 800) }); addCheck(checks, "frontend:gateway-version-records-visible", gatewayTextLower.includes("provider gateway 版本") && gatewayText.includes("自动更新记录") && gatewayTextLower.includes("gateway 版本") && gatewayText.includes(config.providerGateway.id) && gatewayText.includes(`v${providerGatewayPackageVersion()}`) && gatewayTextLower.includes("provider.upgrade"), { gatewayTextPreview: gatewayText.slice(0, 900) }); diff --git a/src/components/frontend/public/style.css b/src/components/frontend/public/style.css index 0401e3cc..aaeb20dc 100644 --- a/src/components/frontend/public/style.css +++ b/src/components/frontend/public/style.css @@ -718,6 +718,126 @@ h2 { font-size: 14px; text-transform: uppercase; letter-spacing: 0.08em; } margin-top: 8px; } +.process-resource-panel { + margin-top: 8px; + border: 1px solid var(--line-soft); + background: + linear-gradient(180deg, rgba(255,255,255,0.025), transparent 28%), + #0b141b; +} +.process-resource-head { + display: flex; + justify-content: space-between; + gap: 10px; + align-items: center; + padding: 8px 9px; + border-bottom: 1px solid var(--line-soft); +} +.process-resource-head h3 { + margin: 0; + font-size: 15px; + letter-spacing: 0.06em; +} +.process-resource-actions { + display: flex; + flex-wrap: wrap; + justify-content: flex-end; + gap: 5px; + align-items: center; +} +.process-table-wrap { + max-height: 420px; + overflow: auto; +} +.process-resource-table { + min-width: 1120px; +} +.process-resource-table th, +.process-resource-table td { + padding: 6px 8px; +} +.process-sort-button { + display: inline-flex; + align-items: center; + gap: 4px; + padding: 0; + border: 0; + color: inherit; + background: transparent; + text-transform: inherit; + letter-spacing: inherit; +} +.process-sort-button span { + color: var(--faint); + font-size: 10px; +} +.process-sort-button.active span { + color: var(--accent-2); +} +.process-name-cell { + display: grid; + gap: 2px; + min-width: 260px; + max-width: 420px; +} +.process-name-cell strong { + color: #e3eef0; +} +.process-command { + color: var(--muted); + font-size: 11px; + white-space: nowrap; + overflow: hidden; + text-overflow: ellipsis; +} +.process-state { + display: inline-flex; + min-width: 22px; + justify-content: center; + padding: 1px 5px; + border: 1px solid var(--line-soft); + color: var(--muted); + background: rgba(255,255,255,0.025); + font-family: "Cascadia Mono", "IBM Plex Mono", "Liberation Mono", monospace; +} +.process-meter { + position: relative; + min-width: 104px; + height: 20px; + overflow: hidden; + border: 1px solid var(--line-soft); + background: rgba(255,255,255,0.025); +} +.process-meter span { + position: absolute; + inset: 0 auto 0 0; + background: linear-gradient(90deg, rgba(78, 183, 168, 0.58), rgba(78, 183, 168, 0.12)); +} +.process-meter.cpu span { + background: linear-gradient(90deg, rgba(215, 161, 58, 0.64), rgba(215, 161, 58, 0.12)); +} +.process-meter b { + position: relative; + z-index: 1; + display: block; + padding: 2px 6px; + color: #eaf3f2; + font-family: "Cascadia Mono", "IBM Plex Mono", "Liberation Mono", monospace; + font-size: 11px; +} +.process-io-cell { + display: grid; + gap: 2px; + min-width: 146px; +} +.process-io-cell strong { + color: #d7e3e7; +} +.process-io-cell span { + color: var(--muted); + font-size: 11px; +} + .monitor-side-stack { display: grid; gap: 10px; @@ -1833,6 +1953,16 @@ input:focus, select:focus, textarea:focus { border-color: var(--accent-2); } } .metric-grid, .policy-grid, .security-board, .dispatch-form, .docker-metrics, .monitor-chart-grid, .monitor-summary-grid, .gateway-record-grid, .met-detail-kv { grid-template-columns: 1fr; } .compact-row, .heartbeat-row, .log-row, .endpoint-list article, .volume-route, .findjob-hero, .pipeline-hero { grid-template-columns: 1fr; align-items: start; } + .process-resource-head { + align-items: stretch; + flex-direction: column; + } + .process-resource-actions { + justify-content: flex-start; + } + .process-table-wrap { + max-height: 360px; + } .met-tree-header, .met-tree-row { grid-template-columns: minmax(220px, 1fr) 72px 62px 76px 96px; min-width: 560px; diff --git a/src/components/frontend/src/app.tsx b/src/components/frontend/src/app.tsx index c64eed9f..825b4a0e 100644 --- a/src/components/frontend/src/app.tsx +++ b/src/components/frontend/src/app.tsx @@ -104,6 +104,17 @@ function fmtPercent(value: any): string { return Number.isFinite(number) ? `${Math.max(0, Math.min(100, number)).toFixed(1)}%` : "--"; } +function fmtLoosePercent(value: any): string { + const number = Number(value); + return Number.isFinite(number) ? `${Math.max(0, number).toFixed(1)}%` : "--"; +} + +function fmtBytesRate(value: any): string { + const bytes = Number(value); + if (!Number.isFinite(bytes) || bytes <= 0) return "0 B/s"; + return `${fmtBytes(bytes)}/s`; +} + function asNumber(value: any, fallback = 0): number { const number = Number(value); return Number.isFinite(number) ? number : fallback; @@ -719,6 +730,7 @@ function NodeMonitorPage({ nodes, systemStatuses, tasks, onRaw, refresh }: AnyRe h(MetricCard, { label: "硬盘已用", value: fmtBytes(disk.usedBytes), hint: fmtPercent(disk.percent) }), h(MetricCard, { label: "更新时间", value: fmtDate(active.systemUpdatedAt || current.collectedAt), hint: active.providerId }), ), + h(ProcessResourceTable, { current, onRaw }), ), ), h("div", { className: "monitor-side-stack" }, @@ -729,6 +741,7 @@ function NodeMonitorPage({ nodes, systemStatuses, tasks, onRaw, refresh }: AnyRe h("article", null, h("b", null, "CPU"), h("span", null, "从 /proc/stat 计算相邻采样差值,首个采样用 load/cores 近似")), h("article", null, h("b", null, "Memory"), h("span", null, "实际内存 = MemTotal - MemFree - Buffers - Cached - SReclaimable + Shmem,不把 page cache / buffer 计入占用")), h("article", null, h("b", null, "Disk"), h("span", null, "使用 df -PB1 对配置路径采样,默认监控根文件系统")), + h("article", null, h("b", null, "Process"), h("span", null, "从 /proc/[pid] 采集进程 CPU、实际内存 RSS、线程数和磁盘 I/O 速率;表格默认按内存占用降序")), ), ), ), @@ -736,6 +749,119 @@ function NodeMonitorPage({ nodes, systemStatuses, tasks, onRaw, refresh }: AnyRe ); } +type ProcessSortKey = "memory" | "cpu" | "disk" | "pid" | "name" | "user" | "threads" | "runtime"; + +function processSortValue(row: AnyRecord, key: ProcessSortKey): string | number { + if (key === "memory") return asNumber(row.rssBytes); + if (key === "cpu") return asNumber(row.cpuPercent); + if (key === "disk") return asNumber(row.readBytesPerSecond) + asNumber(row.writeBytesPerSecond); + if (key === "pid") return asNumber(row.pid); + if (key === "threads") return asNumber(row.threads); + if (key === "runtime") return asNumber(row.elapsedSeconds); + if (key === "user") return String(row.user || ""); + return String(row.name || row.command || ""); +} + +function ProcessMeter({ value, label, tone }: AnyRecord) { + const width = Math.max(1, Math.min(100, asNumber(value))); + return h("div", { className: `process-meter ${tone || ""}` }, + h("span", { style: { width: `${width}%` } }), + h("b", null, label), + ); +} + +function ProcessResourceTable({ current, onRaw }: AnyRecord) { + const [sort, setSort] = useState({ key: "memory", direction: "desc" }); + const processSummary = current?.processSummary && typeof current.processSummary === "object" ? current.processSummary : {}; + const processes = Array.isArray(current?.processes) ? current.processes : []; + const rows = useMemo(() => { + const direction = sort.direction === "asc" ? 1 : -1; + return [...processes].sort((left: AnyRecord, right: AnyRecord) => { + const a = processSortValue(left, sort.key); + const b = processSortValue(right, sort.key); + if (typeof a === "string" || typeof b === "string") return String(a).localeCompare(String(b), "zh-CN") * direction; + return (a - b) * direction || asNumber(left.pid) - asNumber(right.pid); + }); + }, [processes, sort.key, sort.direction]); + + const sortHeader = (label: string, key: ProcessSortKey) => { + const active = sort.key === key; + const ariaSort = active ? (sort.direction === "asc" ? "ascending" : "descending") : "none"; + return h("th", { "aria-sort": ariaSort }, + h("button", { + type: "button", + className: `process-sort-button ${active ? "active" : ""}`, + "data-testid": `process-sort-${key}`, + onClick: () => setSort((previous: AnyRecord) => ({ + key, + direction: previous.key === key && previous.direction === "desc" ? "asc" : "desc", + })), + }, label, h("span", null, active ? (sort.direction === "desc" ? "↓" : "↑") : "↕")), + ); + }; + + return h("section", { className: "process-resource-panel", "data-testid": "process-resource-panel" }, + h("div", { className: "process-resource-head" }, + h("div", null, + h("p", { className: "panel-eyebrow" }, "Windows Resource Monitor Style"), + h("h3", null, "进程资源占用"), + ), + h("div", { className: "process-resource-actions" }, + h("span", { className: "data-chip" }, "默认按内存排序"), + h("span", { className: "data-chip" }, `${asNumber(processSummary.visible, rows.length)} / ${asNumber(processSummary.total, rows.length)} 进程`), + h(RawButton, { title: "Process Resource Snapshot", data: { processSummary, processes }, onOpen: onRaw, testId: "raw-process-resources" }), + ), + ), + rows.length === 0 ? h(EmptyState, { title: "暂无进程资源数据", text: "等待 provider-gateway 上报 /proc/[pid] 采样;旧版 provider 需要先升级到支持进程资源表的版本" }) : + h("div", { className: "process-table-wrap" }, + h("table", { className: "process-resource-table", "data-testid": "process-resource-table" }, + h("thead", null, h("tr", null, + sortHeader("进程", "name"), + sortHeader("PID", "pid"), + sortHeader("用户", "user"), + h("th", null, "状态"), + sortHeader("CPU", "cpu"), + sortHeader("内存", "memory"), + h("th", null, "RSS"), + sortHeader("磁盘 I/O", "disk"), + sortHeader("线程", "threads"), + sortHeader("运行时长", "runtime"), + )), + h("tbody", null, rows.map((row: AnyRecord) => { + const diskRate = asNumber(row.readBytesPerSecond) + asNumber(row.writeBytesPerSecond); + return h("tr", { + key: `${row.pid}-${row.startedAt}`, + "data-testid": `process-row-${safeId(row.pid)}`, + "data-memory-bytes": String(asNumber(row.rssBytes)), + "data-cpu-percent": String(asNumber(row.cpuPercent)), + "data-disk-bps": String(diskRate), + "data-pid": String(asNumber(row.pid)), + }, + h("td", null, + h("div", { className: "process-name-cell" }, + h("strong", null, row.name || "--"), + h("span", { className: "process-command" }, row.command || "--"), + ), + ), + h("td", null, h("code", null, row.pid || "--")), + h("td", null, row.user || `uid:${row.uid ?? "--"}`), + h("td", null, h("span", { className: `process-state state-${safeId(row.state || "unknown")}` }, row.state || "?")), + h("td", null, h(ProcessMeter, { value: row.cpuPercent, label: fmtLoosePercent(row.cpuPercent), tone: "cpu" })), + h("td", null, h(ProcessMeter, { value: row.memoryPercent, label: fmtPercent(row.memoryPercent), tone: "memory" })), + h("td", null, fmtBytes(row.rssBytes)), + h("td", null, h("div", { className: "process-io-cell" }, + h("strong", null, fmtBytesRate(diskRate)), + h("span", null, `R ${fmtBytesRate(row.readBytesPerSecond)} / W ${fmtBytesRate(row.writeBytesPerSecond)}`), + )), + h("td", null, row.threads || 0), + h("td", null, fmtDuration(asNumber(row.elapsedSeconds))), + ); + })), + ), + ), + ); +} + function MetricChart({ title, metricKey, current, points, detail, tone, testId }: AnyRecord) { const values = points.map((point: any) => Math.max(0, Math.min(100, asNumber(point[metricKey])))); const chartValues = values.length > 1 ? values : [values[0] || 0, values[0] || 0]; diff --git a/src/components/provider-gateway/package.json b/src/components/provider-gateway/package.json index ab16cdd0..d1e9c6c8 100644 --- a/src/components/provider-gateway/package.json +++ b/src/components/provider-gateway/package.json @@ -1,6 +1,6 @@ { "name": "@unidesk/provider-gateway", - "version": "0.2.5", + "version": "0.2.6", "private": true, "type": "module", "scripts": { diff --git a/src/components/provider-gateway/src/index.ts b/src/components/provider-gateway/src/index.ts index 15be5d6c..ac8112fd 100644 --- a/src/components/provider-gateway/src/index.ts +++ b/src/components/provider-gateway/src/index.ts @@ -1,4 +1,4 @@ -import { appendFileSync, existsSync, mkdirSync, readFileSync } from "node:fs"; +import { appendFileSync, existsSync, mkdirSync, readFileSync, readdirSync } from "node:fs"; import { dirname } from "node:path"; import { type CoreDispatchMessage, @@ -13,6 +13,7 @@ import { type DockerStatusSnapshot, type DockerVolumeSummary, type JsonValue, + type ProcessResourceSummary, type ProviderLabels, type ProviderTaskStatusMessage, type SystemStatusSnapshot, @@ -56,6 +57,10 @@ let systemStatusTimer: ReturnType | null = null; let dockerStatusRunning = false; let systemStatusRunning = false; let previousCpuSample: { idle: number; total: number } | null = null; +let previousProcessSamples = new Map(); +let passwdCache: Map | null = null; +let clockTicksCache: number | null = null; +let pageSizeCache: number | null = null; let reconnectAttempt = 0; let stopping = false; let upgradeSleepUntil = 0; @@ -261,6 +266,7 @@ async function sendSystemStatus(): Promise { cpuPercent: status.cpu.percent, memoryPercent: status.memory.percent, diskPercent: status.disk.percent, + processCount: status.processes?.length ?? 0, }); } catch (error) { logger("error", "system_status_failed", { error: error instanceof Error ? error.message : String(error) }); @@ -384,6 +390,11 @@ function clampPercent(value: number): number { return Math.max(0, Math.min(100, Number(value.toFixed(1)))); } +function roundMetric(value: number, digits = 1): number { + if (!Number.isFinite(value)) return 0; + return Math.max(0, Number(value.toFixed(digits))); +} + function readCpuSample(): { idle: number; total: number; cores: number } { const stat = readFileSync("/proc/stat", "utf8"); const lines = stat.split("\n"); @@ -415,6 +426,211 @@ function readMemInfo(): Record { return result; } +async function readGetconfNumber(name: string, fallback: number): Promise { + const result = await runProcessCommand("getconf", [name], 1000); + const parsed = Number(result.stdout.trim()); + return result.ok && Number.isFinite(parsed) && parsed > 0 ? parsed : fallback; +} + +async function systemClockTicks(): Promise { + if (clockTicksCache !== null) return clockTicksCache; + clockTicksCache = await readGetconfNumber("CLK_TCK", 100); + return clockTicksCache; +} + +async function systemPageSize(): Promise { + if (pageSizeCache !== null) return pageSizeCache; + pageSizeCache = await readGetconfNumber("PAGESIZE", 4096); + return pageSizeCache; +} + +function usernameForUid(uid: number): string { + if (passwdCache === null) { + passwdCache = new Map(); + try { + for (const line of readFileSync("/etc/passwd", "utf8").split("\n")) { + const parts = line.split(":"); + const parsedUid = Number(parts[2]); + if (parts[0] && Number.isFinite(parsedUid)) passwdCache.set(parsedUid, parts[0]); + } + } catch { + // Provider containers may not have the host passwd database; fall back to uid labels. + } + } + return passwdCache.get(uid) ?? `uid:${uid}`; +} + +function readProcessStatus(pid: number): { uid: number; threads: number; name: string } { + const status = readFileSync(`/proc/${pid}/status`, "utf8"); + let uid = -1; + let threads = 0; + let name = ""; + for (const line of status.split("\n")) { + if (line.startsWith("Uid:")) { + const parsed = Number(line.trim().split(/\s+/)[1]); + if (Number.isFinite(parsed)) uid = parsed; + } else if (line.startsWith("Threads:")) { + const parsed = Number(line.trim().split(/\s+/)[1]); + if (Number.isFinite(parsed)) threads = parsed; + } else if (line.startsWith("Name:")) { + name = line.slice("Name:".length).trim(); + } + } + return { uid, threads, name }; +} + +function readProcessCommand(pid: number, fallback: string): string { + try { + const raw = readFileSync(`/proc/${pid}/cmdline`, "utf8"); + const command = raw.split("\0").filter(Boolean).join(" ").trim(); + return (command || fallback).slice(0, 500); + } catch { + return fallback; + } +} + +function readProcessIo(pid: number): { readBytes: number; writeBytes: number } { + try { + const result = { readBytes: 0, writeBytes: 0 }; + for (const line of readFileSync(`/proc/${pid}/io`, "utf8").split("\n")) { + const match = line.match(/^(read_bytes|write_bytes):\s+(\d+)/); + if (!match) continue; + const parsed = Number(match[2]); + if (!Number.isFinite(parsed)) continue; + if (match[1] === "read_bytes") result.readBytes = parsed; + if (match[1] === "write_bytes") result.writeBytes = parsed; + } + return result; + } catch { + return { readBytes: 0, writeBytes: 0 }; + } +} + +function readUptimeSeconds(): number { + const uptime = Number(readFileSync("/proc/uptime", "utf8").trim().split(/\s+/)[0]); + return Number.isFinite(uptime) ? uptime : 0; +} + +function parseProcessStat(raw: string): { + pid: number; + name: string; + state: string; + ppid: number; + totalTicks: number; + startTicks: number; + vmsBytes: number; + rssPages: number; + threads: number; +} | null { + const open = raw.indexOf("("); + const close = raw.lastIndexOf(")"); + if (open < 0 || close <= open) return null; + const pid = Number(raw.slice(0, open).trim()); + const name = raw.slice(open + 1, close); + const fields = raw.slice(close + 1).trim().split(/\s+/); + if (!Number.isFinite(pid) || fields.length < 22) return null; + const value = (index: number): number => { + const parsed = Number(fields[index]); + return Number.isFinite(parsed) ? parsed : 0; + }; + return { + pid, + name, + state: fields[0] || "?", + ppid: value(1), + totalTicks: value(11) + value(12), + threads: value(17), + startTicks: value(19), + vmsBytes: value(20), + rssPages: value(21), + }; +} + +async function collectProcessResources(totalMemoryBytes: number, cpuCores: number): Promise<{ + processes: ProcessResourceSummary[]; + summary: Record; +}> { + const [clockTicks, pageSize] = await Promise.all([systemClockTicks(), systemPageSize()]); + const uptimeSeconds = readUptimeSeconds(); + const sampledAtMs = Date.now(); + const rows: ProcessResourceSummary[] = []; + const seen = new Set(); + let skipped = 0; + const hasPreviousProcessSample = previousProcessSamples.size > 0; + + for (const entry of readdirSync("/proc")) { + if (!/^\d+$/.test(entry)) continue; + const pid = Number(entry); + try { + const stat = parseProcessStat(readFileSync(`/proc/${entry}/stat`, "utf8")); + if (stat === null) { + skipped += 1; + continue; + } + const status = readProcessStatus(pid); + const io = readProcessIo(pid); + const previous = previousProcessSamples.get(pid); + const elapsedSampleSeconds = previous ? Math.max(0.001, (sampledAtMs - previous.sampledAtMs) / 1000) : 0; + const elapsedProcessSeconds = Math.max(0, uptimeSeconds - stat.startTicks / clockTicks); + const cpuPercent = previous + ? ((Math.max(0, stat.totalTicks - previous.totalTicks) / clockTicks) / elapsedSampleSeconds) * 100 + : elapsedProcessSeconds > 0 + ? ((stat.totalTicks / clockTicks) / elapsedProcessSeconds) * 100 + : 0; + const ioSeconds = previous ? elapsedSampleSeconds : 0; + const readBytesPerSecond = previous && ioSeconds > 0 ? Math.max(0, io.readBytes - previous.readBytes) / ioSeconds : 0; + const writeBytesPerSecond = previous && ioSeconds > 0 ? Math.max(0, io.writeBytes - previous.writeBytes) / ioSeconds : 0; + const rssBytes = Math.max(0, stat.rssPages * pageSize); + const name = status.name || stat.name; + const uid = status.uid >= 0 ? status.uid : 0; + rows.push({ + pid: stat.pid, + ppid: stat.ppid, + uid, + user: usernameForUid(uid), + name, + command: readProcessCommand(pid, name), + state: stat.state, + cpuPercent: roundMetric(Math.min(cpuPercent, Math.max(1, cpuCores) * 100)), + memoryPercent: totalMemoryBytes > 0 ? clampPercent((rssBytes / totalMemoryBytes) * 100) : 0, + rssBytes, + vmsBytes: Math.max(0, stat.vmsBytes), + threads: status.threads || stat.threads, + readBytes: io.readBytes, + writeBytes: io.writeBytes, + readBytesPerSecond: roundMetric(readBytesPerSecond, 0), + writeBytesPerSecond: roundMetric(writeBytesPerSecond, 0), + elapsedSeconds: roundMetric(elapsedProcessSeconds, 0), + startedAt: new Date(Date.now() - elapsedProcessSeconds * 1000).toISOString(), + }); + previousProcessSamples.set(pid, { + totalTicks: stat.totalTicks, + sampledAtMs, + readBytes: io.readBytes, + writeBytes: io.writeBytes, + }); + seen.add(pid); + } catch { + skipped += 1; + } + } + + previousProcessSamples = new Map([...previousProcessSamples].filter(([pid]) => seen.has(pid))); + rows.sort((a, b) => b.rssBytes - a.rssBytes || b.cpuPercent - a.cpuPercent || a.pid - b.pid); + return { + processes: rows.slice(0, 120), + summary: { + total: rows.length, + visible: Math.min(rows.length, 120), + skipped, + defaultSort: "memory_desc", + scope: "provider_pid_namespace", + cpuPercentMode: hasPreviousProcessSample ? "delta_ticks_per_sample" : "lifetime_average_first_sample", + diskIoMode: "proc_pid_io_delta_bytes_per_second", + }, + }; +} + async function readDiskUsage(path: string): Promise<{ mount: string; totalBytes: number; usedBytes: number; availableBytes: number; percent: number }> { const result = await runProcessCommand("df", ["-PB1", path], 3000); if (!result.ok) throw new Error(`df failed with exit ${result.exitCode}: ${result.stderr}`); @@ -485,6 +701,18 @@ async function collectSystemStatus(): Promise { errors.push({ source: "proc.meminfo", error: error instanceof Error ? error.message : String(error) }); } + let processes: ProcessResourceSummary[] = []; + let processSummary: Record = { total: 0, visible: 0, defaultSort: "memory_desc" }; + try { + const totalMemoryBytes = typeof memory.totalBytes === "number" ? memory.totalBytes : 0; + const cpuCores = typeof cpu.cores === "number" ? cpu.cores : 1; + const collected = await collectProcessResources(totalMemoryBytes, cpuCores); + processes = collected.processes; + processSummary = collected.summary; + } catch (error) { + errors.push({ source: "proc.processes", error: error instanceof Error ? error.message : String(error) }); + } + let disk: Record = { path: config.monitorDiskPath, mount: "", totalBytes: 0, usedBytes: 0, availableBytes: 0, percent: 0 }; try { disk = { path: config.monitorDiskPath, ...(await readDiskUsage(config.monitorDiskPath)) }; @@ -492,7 +720,7 @@ async function collectSystemStatus(): Promise { errors.push({ source: "df", path: config.monitorDiskPath, error: error instanceof Error ? error.message : String(error) }); } - return { ok: errors.length === 0, collectedAt, cpu, memory, disk, errors }; + return { ok: errors.length === 0, collectedAt, cpu, memory, disk, processes, processSummary, errors }; } async function collectDockerStatus(): Promise { @@ -975,6 +1203,7 @@ function upgradePlan(taskId: string): Record { "docker run -d", `--name ${shellQuote(candidateName)}`, "--restart no", + "--pid host", "$network_arg", `--env-file "$candidate_env_file"`, `--label ${shellQuote(`com.docker.compose.project=${config.upgradeComposeProject}`)}`, @@ -1053,6 +1282,7 @@ function upgradePlan(taskId: string): Record { candidateUsesOldContainerNetworks: true, candidateUsesOldContainerExtraHosts: true, candidateUsesOldContainerEnvironment: true, + candidateUsesHostPidNamespace: true, removeScope: { projectLabel: config.upgradeComposeProject, serviceLabel: config.upgradeService, diff --git a/src/components/shared/src/index.ts b/src/components/shared/src/index.ts index 54fbcd07..58f5f8f7 100644 --- a/src/components/shared/src/index.ts +++ b/src/components/shared/src/index.ts @@ -18,12 +18,35 @@ export interface ProviderHeartbeatMessage { at: string; } +export interface ProcessResourceSummary { + pid: number; + ppid: number; + uid: number; + user: string; + name: string; + command: string; + state: string; + cpuPercent: number; + memoryPercent: number; + rssBytes: number; + vmsBytes: number; + threads: number; + readBytes: number; + writeBytes: number; + readBytesPerSecond: number; + writeBytesPerSecond: number; + elapsedSeconds: number; + startedAt: string; +} + export interface SystemStatusSnapshot { ok: boolean; collectedAt: string; cpu: Record; memory: Record; disk: Record; + processes?: ProcessResourceSummary[]; + processSummary?: Record; errors: JsonValue[]; }