fix shared memory fallback accounting

This commit is contained in:
Codex
2026-05-17 16:54:21 +00:00
parent 236c5c38f6
commit 3a2f86df9e
6 changed files with 29 additions and 10 deletions
+1 -1
View File
@@ -33,7 +33,7 @@ Typical targeted commands:
- Public exposure: Docker port summary must not show core REST, Code Queue NodePort, or Code Queue public host mappings; frontend and provider ingress are the only browser/provider public entries. PostgreSQL `15432` and OA Event Flow `4255` may be host-mapped only for controlled Code Queue nodes and must be protected by the `DOCKER-USER` source restrictions generated from `network.restrictedHostAccess`; E2E treats either an unreachable generic probe or a verified restricted rule as passing. Known private user-service ports such as FindJob `3254`, MET Nonlinear `3288`, Todo Note `4211`, legacy Code Queue host ports and File Browser provider port `4251` probes must fail.
- Core API: `docker exec unidesk-backend-core` calls internal `GET /api/overview`, which must report `dbReady: true`, `pgdata.volumeName=unidesk_pgdata_10gb`, a positive PostgreSQL database byte count, and at least one online node; internal `GET /api/performance` must report component request statistics, internal operation statistics, PGDATA usage and Code Queue PostgreSQL storage metadata.
- Provider self-connection: internal `GET /api/nodes` must contain `main-server` with `status: online`, `labels.providerGatewayVersion` equal to `src/components/provider-gateway/package.json`, `labels.providerGatewayUpgradePolicy: "always-enabled"`, `labels.providerGatewayRestartPolicyOk: true`, `labels.providerGatewayPidModeOk: true`, and `labels.providerGatewayRuntimeGuardOk: true`; internal `GET /api/nodes/system-status` must contain CPU/memory/disk samples plus a non-empty process resource list sorted by `memoryBytes` by default, where `memoryBytes` should use PSS when `/proc/[pid]/smaps_rollup` is available and must retain `rssBytes` for diagnostics; internal `GET /api/nodes/docker-status` must contain a Docker snapshot for `main-server`; every running `provider-gateway` container visible in Docker snapshots must report `restartPolicy: "always"` and `pidMode: "host"`; public provider ingress `/health` must return ok.
- Provider self-connection: internal `GET /api/nodes` must contain `main-server` with `status: online`, `labels.providerGatewayVersion` equal to `src/components/provider-gateway/package.json`, `labels.providerGatewayUpgradePolicy: "always-enabled"`, `labels.providerGatewayRestartPolicyOk: true`, `labels.providerGatewayPidModeOk: true`, and `labels.providerGatewayRuntimeGuardOk: true`; internal `GET /api/nodes/system-status` must contain CPU/memory/disk samples plus a non-empty process resource list sorted by `memoryBytes` by default, where `memoryBytes` should use PSS when `/proc/[pid]/smaps_rollup` is available, otherwise `rssBytes - statm.shared` before raw RSS, and must retain `rssBytes` for diagnostics; internal `GET /api/nodes/docker-status` must contain a Docker snapshot for `main-server`; every running `provider-gateway` container visible in Docker snapshots must report `restartPolicy: "always"` and `pidMode: "host"`; public provider ingress `/health` must return ok.
- Provider remote control: internal `/api/dispatch` must successfully complete a real `provider.upgrade` task in `mode: "plan"` so the upgrade path is validated without recreating the running gateway during E2E.
- User services: internal `/api/microservices` must include `todo-note` and `oa-event-flow` on `main-server`, canonical `filebrowser` on `D518`, plus `k3sctl-adapter`, `code-queue`, `findjob`, `pipeline`, `met-nonlinear`, `claudeqq` and `filebrowser-d601` on `D601` with `public=false`; `/api/microservices/todo-note/health` must report `storage=postgres`, `/api/microservices/todo-note/proxy/api/instances` must expose the migrated Todo Note lists, and a temporary Todo Note list create/add/toggle/undo/delete cycle must succeed through the real provider-gateway proxy; `/api/microservices/oa-event-flow/health`, `/api/microservices/oa-event-flow/proxy/api/diagnostics`, `/api/microservices/oa-event-flow/proxy/api/events`, `/api/microservices/oa-event-flow/proxy/api/events?tags=service:pipeline` and `/api/microservices/oa-event-flow/proxy/api/stats/trace` must prove the independent OA event table, Pipeline bridge and stats center are reachable through UniDesk proxy; `/api/microservices/k3sctl-adapter/health` and `/api/microservices/k3sctl-adapter/proxy/api/control-plane` must expose the D601 `unidesk-k3s` control plane, `kubeApiProxy.mode=kubernetes-api-service-proxy`, D601 active Code Queue instance `servingHealthy=true`, `presentNodeIds` containing `D601`, `missingNodeIds=[]`, `status=healthy`, and `noFallback=true`; `/api/microservices/code-queue/health` must return the active Code Queue backend summary with default model `gpt-5.5`, `egressProxy.connected=true`, and `/api/microservices/code-queue/proxy/api/tasks/overview` must return queue state through backend-core -> k3sctl-adapter -> Kubernetes API service proxy -> k3s/k8s Service, not through a `serviceId=code-queue` provider-gateway direct task or `/api/code-queue-direct`; `/api/microservices/filebrowser/health`, `/api/microservices/filebrowser-d601/health` and `/api/microservices/filebrowser/proxy/` must prove File Browser health and WebUI access through UniDesk proxy; `/api/microservices/findjob/health` and
`/api/microservices/findjob/proxy/api/summary` must succeed through the real provider-gateway proxy; `/api/microservices/findjob/proxy/api/jobs?__unideskArrayLimit=jobs:5` must return a bounded preview with `_unidesk.arrayLimits` metadata; `/api/microservices/pipeline/health`, `/api/microservices/pipeline/proxy/api/snapshot?__unideskArrayLimit=registry.components:8,runs:3` and `/api/microservices/pipeline/proxy/api/oa-event-flow/diagnostics` must return Pipeline health, registry/run previews and OA event-flow evidence; `/api/microservices/met-nonlinear/health`, `/api/microservices/met-nonlinear/proxy/api/queue`, `/api/microservices/met-nonlinear/proxy/api/projects?root=projects&limit=500`, `/api/microservices/met-nonlinear/proxy/api/projects?root=ex_projects&limit=500`, `/api/microservices/met-nonlinear/proxy/api/projects/config?path=<projectPath>` and `/api/microservices/met-nonlinear/proxy/api/images` must return the D601 TS backend health, queue/GPU policy, full project tree inputs, structured project detail and ready `met-nonlinear-ml:tf26` image status.
- ClaudeQQ availability: `/api/microservices/claudeqq/health` must only pass when `ready=true`, NapCat HTTP and WebSocket are connected, and `napcat.loginState=logged_in`; `/api/microservices/claudeqq/proxy/api/napcat/login` must show the same logged-in account state and `/api/microservices/claudeqq/proxy/api/events/recent` must prove the backend can read the persistent event cache. A QR-code-only or not-logged-in NapCat state must be treated as unhealthy.
+1 -1
View File
@@ -122,7 +122,7 @@ provider-gateway 连接成功后必须周期性上报 Docker daemon 状态,数
## System Status Telemetry
provider-gateway 连接成功后必须周期性上报节点 CPU、内存、硬盘和进程资源占用。整体采集来源是节点本地 `/proc/stat`、`/proc/loadavg`、`/proc/meminfo`、`df -PB1`,进程表来源是 `/proc/[pid]/stat`、`/proc/[pid]/status`、`/proc/[pid]/cmdline`、`/proc/[pid]/io` 和可用时的 `/proc/[pid]/smaps_rollup`。backend-core 将最新快照保存到 `unidesk_node_system_status`,并将历史采样保存到 `unidesk_node_metric_samples` 供 frontend 绘制任务管理器风格曲线。内存使用量采用实际占用口径:`MemTotal - MemFree - Buffers - Cached - SReclaimable + Shmem`,也就是不把 Linux page cache / buffer 计入占用;上报中同时保留 `cacheBytes` 便于排查。进程表的 `memoryBytes` 优先使用 `smaps_rollup` 中的 PSS,避免 PostgreSQL shared buffer 等共享页在多个进程之间重复计入,同时保留 `rssBytes`、`privateBytes`、`sharedBytes` 便于排查;如果目标内核或权限不支持 `smaps_rollup`,则按进程回退 `rssBytes`。默认按 `memoryBytes` 降序截取前 120 个进程;`cpuPercent` 使用相邻采样 CPU tick 差值,首个采样用进程生命周期平均值兜底;磁盘 I/O 速率使用相邻 `/proc/[pid]/io` 的 `read_bytes/write_bytes` 差值。该链路仍然由 provider 主动上报,主 server 不反向探测计算节点。
provider-gateway 连接成功后必须周期性上报节点 CPU、内存、硬盘和进程资源占用。整体采集来源是节点本地 `/proc/stat`、`/proc/loadavg`、`/proc/meminfo`、`df -PB1`,进程表来源是 `/proc/[pid]/stat`、`/proc/[pid]/status`、`/proc/[pid]/cmdline`、`/proc/[pid]/io`、可用时的 `/proc/[pid]/smaps_rollup` 和 fallback 用的 `/proc/[pid]/statm`。backend-core 将最新快照保存到 `unidesk_node_system_status`,并将历史采样保存到 `unidesk_node_metric_samples` 供 frontend 绘制任务管理器风格曲线。内存使用量采用实际占用口径:`MemTotal - MemFree - Buffers - Cached - SReclaimable + Shmem`,也就是不把 Linux page cache / buffer 计入占用;上报中同时保留 `cacheBytes` 便于排查。进程表的 `memoryBytes` 优先使用 `smaps_rollup` 中的 PSS,避免 PostgreSQL shared buffer 等共享页在多个进程之间重复计入,同时保留 `rssBytes`、`privateBytes`、`sharedBytes` 便于排查;如果目标内核或权限不支持 `smaps_rollup`,则回退 `rssBytes - statm.shared`,仍避免把共享页全部计入单个进程,只有 `statm` 也不可读时才退回原始 RSS。默认按 `memoryBytes` 降序截取前 120 个进程;`cpuPercent` 使用相邻采样 CPU tick 差值,首个采样用进程生命周期平均值兜底;磁盘 I/O 速率使用相邻 `/proc/[pid]/io` 的 `read_bytes/write_bytes` 差值。该链路仍然由 provider 主动上报,主 server 不反向探测计算节点。
## Remote Provider Upgrade
+1 -1
View File
@@ -30,7 +30,7 @@
},
"components/provider-gateway": {
"name": "@unidesk/provider-gateway",
"version": "0.2.23",
"version": "0.2.24",
},
"components/shared": {
"name": "@unidesk/shared",
+5 -1
View File
@@ -884,7 +884,11 @@ function ProcessResourceTable({ current, onRaw }: AnyRecord) {
const processSummary = current?.processSummary && typeof current.processSummary === "object" ? current.processSummary : {};
const processes = Array.isArray(current?.processes) ? current.processes : [];
const memoryMode = String(processSummary.memoryMode || "");
const memoryModeLabel = memoryMode === "pss_smaps_rollup" ? "PSS" : "RSS fallback";
const memoryModeLabel = memoryMode.includes("pss_smaps_rollup")
? "PSS"
: memoryMode === "rss_minus_shared_fallback"
? "RSS-shared"
: "RSS fallback";
const rows = useMemo(() => {
const direction = sort.direction === "asc" ? 1 : -1;
return [...processes].sort((left: AnyRecord, right: AnyRecord) => {
+1 -1
View File
@@ -1,6 +1,6 @@
{
"name": "@unidesk/provider-gateway",
"version": "0.2.23",
"version": "0.2.24",
"private": true,
"type": "module",
"scripts": {
+20 -5
View File
@@ -914,7 +914,7 @@ function readProcessIo(pid: number): { readBytes: number; writeBytes: number } {
}
}
function readProcessMemory(pid: number, rssBytes: number): {
function readProcessMemory(pid: number, rssBytes: number, pageSize: number): {
memoryBytes: number;
memoryMode: string;
pssBytes?: number;
@@ -945,7 +945,20 @@ function readProcessMemory(pid: number, rssBytes: number): {
swapPssBytes,
};
} catch {
return { memoryBytes: rssBytes, memoryMode: "rss_fallback" };
try {
const fields = readFileSync(`/proc/${pid}/statm`, "utf8").trim().split(/\s+/);
const sharedPages = Number(fields[2]);
const sharedBytes = Number.isFinite(sharedPages) ? Math.max(0, sharedPages * pageSize) : 0;
const privateBytes = Math.max(0, rssBytes - sharedBytes);
return {
memoryBytes: privateBytes,
memoryMode: "rss_minus_shared_fallback",
privateBytes,
sharedBytes,
};
} catch {
return { memoryBytes: rssBytes, memoryMode: "rss_fallback" };
}
}
}
@@ -1024,7 +1037,7 @@ async function collectProcessResources(totalMemoryBytes: number, cpuCores: numbe
const readBytesPerSecond = previous && ioSeconds > 0 ? Math.max(0, io.readBytes - previous.readBytes) / ioSeconds : 0;
const writeBytesPerSecond = previous && ioSeconds > 0 ? Math.max(0, io.writeBytes - previous.writeBytes) / ioSeconds : 0;
const rssBytes = Math.max(0, stat.rssPages * pageSize);
const processMemory = readProcessMemory(pid, rssBytes);
const processMemory = readProcessMemory(pid, rssBytes, pageSize);
const name = status.name || stat.name;
const uid = status.uid >= 0 ? status.uid : 0;
rows.push({
@@ -1068,6 +1081,7 @@ async function collectProcessResources(totalMemoryBytes: number, cpuCores: numbe
previousProcessSamples = new Map([...previousProcessSamples].filter(([pid]) => seen.has(pid)));
rows.sort((a, b) => b.memoryBytes - a.memoryBytes || b.cpuPercent - a.cpuPercent || a.pid - b.pid);
const pssRows = rows.filter((row) => row.memoryMode === "pss_smaps_rollup").length;
const rssMinusSharedRows = rows.filter((row) => row.memoryMode === "rss_minus_shared_fallback").length;
return {
processes: rows.slice(0, 120),
summary: {
@@ -1076,9 +1090,10 @@ async function collectProcessResources(totalMemoryBytes: number, cpuCores: numbe
skipped,
defaultSort: "memory_desc",
scope: "provider_pid_namespace",
memoryMode: pssRows > 0 ? "pss_smaps_rollup" : "rss_fallback",
memoryMode: pssRows > 0 ? "pss_smaps_rollup_with_fallback" : rssMinusSharedRows > 0 ? "rss_minus_shared_fallback" : "rss_fallback",
pssRows,
rssFallbackRows: rows.length - pssRows,
rssMinusSharedRows,
rssFallbackRows: rows.length - pssRows - rssMinusSharedRows,
cpuPercentMode: hasPreviousProcessSample ? "delta_ticks_per_sample" : "lifetime_average_first_sample",
diskIoMode: "proc_pid_io_delta_bytes_per_second",
},