From 05c9e9a7deee97832ed3f308bc7b2f82f8ef4980 Mon Sep 17 00:00:00 2001 From: Codex Date: Thu, 11 Jun 2026 16:16:13 +0000 Subject: [PATCH] feat: add confirmed server cleanup execution --- .agents/skills/unidesk-ops/SKILL.md | 6 +- AGENTS.md | 4 +- docs/reference/cli.md | 4 +- docs/reference/deployment.md | 2 +- docs/reference/gc.md | 2 + scripts/src/gc.ts | 114 +++++++++++++-- scripts/src/server-cleanup.ts | 207 +++++++++++++++++++++++++++- 7 files changed, 315 insertions(+), 24 deletions(-) diff --git a/.agents/skills/unidesk-ops/SKILL.md b/.agents/skills/unidesk-ops/SKILL.md index c346f5b4..1e362a43 100644 --- a/.agents/skills/unidesk-ops/SKILL.md +++ b/.agents/skills/unidesk-ops/SKILL.md @@ -59,9 +59,10 @@ bun scripts/cli.ts server logs ```bash bun scripts/cli.ts server cleanup plan [--min-age-hours 24] [--limit N] +bun scripts/cli.ts server cleanup run --confirm [--min-age-hours 24] [--limit N] ``` -只生成 dry-run 计划,不执行删除。保守白名单:保留 running/stopped 容器镜像、deploy.json/CI.json commit-pinned artifact、Compose stable image。禁止 `docker system prune`、`docker volume rm`、`docker compose down -v`。 +`plan` 只生成 dry-run 计划;`run --confirm` 只删除同一 classifier 选出的 stale Docker images。保守白名单:保留 running/stopped 容器镜像、deploy.json/CI.json commit-pinned artifact、Compose stable image。禁止 `docker system prune`、`docker image prune`、`docker volume rm`、`docker compose down -v` 和数据库清理。高风险候选必须额外显式 `--include-high-risk` 才会执行。 --- @@ -82,12 +83,13 @@ bun scripts/cli.ts gc remote [--target-use-percent N] [--dry-run|-- ```bash bun scripts/cli.ts gc plan --target-use-percent 69 \ --include-tool-caches \ + --include-stale-tmp \ --include-vscode-stale-servers \ --include-vscode-stale-extensions \ --include-baidu-staging ``` -`--target-use-percent` 按 `df` 显示口径估算 shortfall。工具缓存、VS Code 历史 server/extension 版本、Baidu staging 旧 PGDATA tarball 均默认不启用;必须显式 include 后才进入候选,且执行时仍受 allowlist 路径断言保护。默认 GC 不触碰 PGDATA、Docker volumes/images、Codex sessions/auth state 或 Baidu staging 根目录。 +`--target-use-percent` 按 `df` 显示口径估算 shortfall。工具缓存、`/tmp` 非 allowlist 直接子项、VS Code 历史 server/extension 版本、Baidu staging 旧 PGDATA tarball 均默认不启用;必须显式 include 后才进入候选,且执行时仍受路径断言保护。stale `/tmp` 扫描按 `--limit` 有界枚举候选,避免为了估算全量临时目录而长时间无输出。默认 GC 不触碰 PGDATA、Docker volumes/images、Codex sessions/auth state 或 Baidu staging 根目录。 --- diff --git a/AGENTS.md b/AGENTS.md index 051b26f3..f26c6350 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -222,8 +222,8 @@ UniDesk 是一个以主 server 为统一入口的分布式工作平台;本文 - `bun scripts/cli.ts server status`:查询固定端口、swap 摘要、容器状态、健康检查和访问 URL,包含生产 frontend、dev frontend proxy 和 provider ingress,判定标准见 `docs/reference/deployment.md` 与 `docs/reference/dev-environment.md`。 - `bun scripts/cli.ts server swap status|ensure [--path /swapfile] [--size 2GiB] [--dry-run]`:以 JSON 查看或幂等创建主 server swapfile,`ensure` 输出 before/after、动作、持久化状态和 degraded/failed 详情,规则见 `docs/reference/deployment.md`。 - `bun scripts/cli.ts server logs [--tail-bytes N]`:分页返回文件日志与 Docker 日志尾部并带截断元数据,日志规则见 `docs/reference/observability.md`。 -- `bun scripts/cli.ts server cleanup plan [--min-age-hours N] [--limit N]`:只读/干跑生成主 server Docker 镜像清理计划,默认只列出至少 24 小时前创建的非保护镜像,输出 active/protected images、stale candidates、预计释放空间、风险等级和必须人工确认的 `docker image rm` 命令;禁止默认删除、禁止 prune、禁止触碰 database volume、registry storage 或 Baidu Netdisk 状态。 -- `bun scripts/cli.ts gc plan|run|db-trace|policy|remote`:主 server 或受控 provider 磁盘高水位一次性缓解和低风险防膨胀入口,覆盖日志、journald、Docker BuildKit cache、allowlisted `/tmp` 诊断目录、受限 core dump、显式 trace 遥测留存和 systemd 定时策略;规则见 `docs/reference/gc.md`。 +- `bun scripts/cli.ts server cleanup plan|run --confirm [--min-age-hours N] [--limit N]`:生成主 server Docker 镜像清理 dry-run 计划,并在显式确认后只删除同一 classifier 选出的 stale images;禁止 prune、禁止触碰 database volume、registry storage 或 Baidu Netdisk 状态,规则见 `docs/reference/cli.md` 和 `docs/reference/deployment.md`。 +- `bun scripts/cli.ts gc plan|run|db-trace|policy|remote`:主 server 或受控 provider 磁盘高水位一次性缓解和低风险防膨胀入口,覆盖日志、journald、Docker BuildKit cache、allowlisted `/tmp` 诊断目录、显式 opt-in stale `/tmp` 直接子项、受限 core dump、显式 trace 遥测留存和 systemd 定时策略;规则见 `docs/reference/gc.md`。 - `bun scripts/cli.ts server rebuild `:以 build-first、Compose lock、no-deps force-recreate 和 post-up validation 的异步 job 重建主 server Compose 内单个服务;对 database、File Browser、Code Queue 执行面、k3sctl-adapter 或未知对象返回结构化 `unsupported-server-rebuild`,规则见 `docs/reference/deployment.md` 与 `docs/reference/cicd-standardization.md`。 - `bun scripts/cli.ts provider attach [--master-server URL] [--up] [--force]` / `bun scripts/cli.ts provider triage [--observed-error text] [--observed-scope scope] [--microservice id ...] [--full|--raw]`:前者在新增计算节点上生成两项配置的 provider-gateway 挂载包;后者是只读多信号健康裁决入口,默认低噪声输出 `decision`、`healthyScopes`、`failedScopes`、`retryable` 和异常信号摘要,用来把单路径 `provider is not online`、SSH 超时、registry 失败或 proxy 失败归类为 `retryable-transient`、`service-degraded` 或 `global-offline`,完整 evidence 需显式 `--full|--raw`,规则见 `docs/reference/provider-gateway.md` 和 `docs/reference/code-queue-supervision.md`。 - `trans [operation args...]` / `tran [operation args...]`:通过 provider-gateway 的 Host SSH / WSL SSH 维护桥进入 provider、host workspace、Windows cmd route、k3s 控制面或 pod workspace,并提供带 SHA-256 校验的 `upload`/`download` 文件传输;主 server 人工/Codex 分布式操作必须优先用本机 `trans` wrapper,`tran` 只作为兼容入口,细则见 `docs/reference/cli.md`、`docs/reference/windows-passthrough.md` 和 `docs/reference/provider-gateway.md`。 diff --git a/docs/reference/cli.md b/docs/reference/cli.md index 17e8a7bf..328efdc1 100644 --- a/docs/reference/cli.md +++ b/docs/reference/cli.md @@ -32,8 +32,8 @@ CI/CD、GitOps、rollout、artifact 发布、PR 合并后的 runtime lane 滚动 - `server status` 查询公开端口、受限宿主端口、内部端口、主机 swap 摘要、Compose 容器、core/frontend/dev-frontend/provider/database 健康检查和访问 URL;D601 Code Queue 使用的 PostgreSQL/OA Event Flow host mapping 必须出现在受限宿主端口而不是无条件公开入口中。低内存主 server 上 `swap.warning` 非空时,先执行 `server swap status` 或 `server swap ensure`。 - `server swap status|ensure [--path /swapfile] [--size 2GiB] [--dry-run]` 是主 server swap 管理入口。`status` 仅读 `/proc/meminfo`、`/proc/swaps` 和 `/etc/fstab` 并返回 JSON;`ensure` 在已有任何 active swap 时只报告 no-op,在无 active swap 时创建固定 swapfile、`chmod 600`、`mkswap`、`swapon` 并尽量写入 `/etc/fstab`。输出必须包含 `before`、`after`、total memory、active swap、持久化状态、关键动作和错误详情;若 swap 已启用但 fstab 写入失败,状态为 `degraded`,调用者需按返回的 detail 修复持久化。 - `server logs` 返回 `logs/` 文件日志和 Docker 容器日志的尾部,默认限制输出大小,避免日志爆炸。实现必须只读取文件末尾字节,不得为了 tail 先把巨大日志完整读入 CLI 内存。 -- `server cleanup plan [--min-age-hours N] [--limit N]` 只生成主 server Docker 镜像清理 dry-run 计划,不执行删除;默认 `--min-age-hours 24`,避免把刚发布或刚验证的镜像列为 stale。输出必须包含 `dryRun=true`、`mutation=false`、`policy.deletionExecuted=false`、active containers/images、受保护镜像、candidate stale images、估算释放空间、风险等级、`commandsToReview` 和人工审批清单。计划必须保守白名单:保留 running containers 使用的 image ID,保留 stopped containers 引用的 image ID 直到人工先复核容器,保留 `deploy.json`/`CI.json` 当前 commit-pinned artifact、Compose stable image、上游 digest pin 和 provider-gateway runner image;`protectedStorage` 必须显式列出 PostgreSQL named volume、Baidu Netdisk `.state`、D601 registry storage 和 Docker volumes/host data policy。该入口禁止生成或执行 `docker system prune`、`docker image prune`、`docker builder prune`、`docker volume rm`、`docker compose down -v`、数据库清理或 host data `rm` 命令;未来若增加真实删除,必须另设显式审批参数并先复核 dry-run 输出。 -- `gc plan|run --confirm|db-trace|policy|remote` 是主 server 和受控 provider 的磁盘高水位一次性缓解与长期防膨胀入口。`plan` 只读输出候选、风险、估算收益和保护对象;`run` 必须显式 `--confirm`;`gc remote ...` 通过 UniDesk SSH 透传执行远端 GC,`--target-use-percent N` 会在 `summary.target` 中报告目标水位所需释放量、候选估算、预计水位、缺口和 safe-stop 决策。G14/HWLAB registry retention、受限 core dump、保护对象、safe-stop 线和长期收益表的权威规则见 `docs/reference/gc.md`。 +- `server cleanup plan|run --confirm [--min-age-hours N] [--limit N]` 是主 server Docker 镜像高水位治理入口。`plan` 生成 dry-run 计划,不执行删除;`run --confirm` 只删除同一 classifier 选出的 stale Docker images,高风险候选必须额外 `--include-high-risk` 才会执行。默认 `--min-age-hours 24`,避免把刚发布或刚验证的镜像列为 stale。输出必须包含 active containers/images、受保护镜像、candidate stale images、估算释放空间、风险等级、执行/跳过结果和人工审批线索。计划必须保守白名单:保留 running containers 使用的 image ID,保留 stopped containers 引用的 image ID 直到人工先复核容器,保留 `deploy.json`/`CI.json` 当前 commit-pinned artifact、Compose stable image、上游 digest pin 和 provider-gateway runner image;`protectedStorage` 必须显式列出 PostgreSQL named volume、Baidu Netdisk `.state`、D601 registry storage 和 Docker volumes/host data policy。该入口禁止 `docker system prune`、`docker image prune`、`docker builder prune`、`docker volume rm`、`docker compose down -v`、数据库清理或 host data `rm` 命令。 +- `gc plan|run --confirm|db-trace|policy|remote` 是主 server 和受控 provider 的磁盘高水位一次性缓解与长期防膨胀入口。`plan` 只读输出候选、风险、估算收益和保护对象;`run` 必须显式 `--confirm`;`gc remote ...` 通过 UniDesk SSH 透传执行远端 GC,`--target-use-percent N` 会在 `summary.target` 中报告目标水位所需释放量、候选估算、预计水位、缺口和 safe-stop 决策。默认只包含 allowlisted `/tmp` 诊断目录;非 allowlist stale `/tmp` 直接子项必须显式 `--include-stale-tmp`,并只允许删除 `/tmp` 一级子项且避开系统 socket/session 前缀。G14/HWLAB registry retention、受限 core dump、保护对象、safe-stop 线和长期收益表的权威规则见 `docs/reference/gc.md`。 - `server rebuild ` 创建异步 job,先构建目标服务镜像,随后在 `.state/locks/server-compose.lock` 串行保护下用 `--no-deps --force-recreate` 替换目标 service 并等待容器 `healthy/running`;该命令用于替代手工删除容器的兜底流程,其中 `dev-frontend-proxy` 只更新主 server dev 入口薄代理,`todo-note`、`code-queue-mgr`、`project-manager`、`baidu-netdisk` 和 `oa-event-flow` 只重建主 server 承载的对应后端,不会重建或删除 database 命名卷。D601 Code Queue 执行面不由 `server rebuild` 管理;Rust backend-core 常规迭代不得用该命令在 master server 编译,只有明确的 backend-core 主 server 上线例外可以按限流、异步轮询和 health 证据执行,规则见 `docs/reference/dev-environment.md`。 - `provider attach [--master-server URL] [--up] [--force]` 在新计算节点生成两项配置的 provider-gateway 挂载包:`.state/provider-.env` 默认只包含 `UNIDESK_MASTER_SERVER` 与 `PROVIDER_ID`,`provider-.yml` 固定 Docker socket、`pid: "host"`、`restart: always`、只读 `/workspace` 和 SSH 维护私钥挂载;`--up` 会立即执行生成的 `docker compose up -d --build`。`provider triage [--observed-error text] [--observed-scope scope] [--microservice id ...] [--full|--raw]` 是只读多信号健康裁决入口,会把单路径 `provider is not online`、SSH 超时、registry 失败和 service proxy 失败归类成 `runner-local-observation-gap`、`service-degraded`、`provider-degraded` 或 `global-blocker`。默认输出只返回裁决、scope、失败/降级/未知信号和有界 evidence 摘要,完整 evidence 必须显式加 `--full` 或 `--raw`;推荐交叉验证命令仍包含 `debug health`、`debug dispatch host.ssh --wait-ms 15000`、`trans argv true`、`artifact-registry health --provider-id `、`microservice health k3sctl-adapter`、`microservice health code-queue` 和 `codex tasks --view supervisor --limit 20`。 - `trans [operation args...]` / `tran [operation args...]` 通过 backend-core 内网 WebSocket broker 和 provider-gateway 的 Host SSH / WSL SSH 维护桥连接目标节点;`route` 基础形态是 provider id,例如 `D601` 或 `G14`,也可以扩展为纯定位路径 `provider:plane[:namespace:resource[:container]]`,例如 `D601:win`、`D601:win/c/test`、`G14:k3s`、`D601:k3s` 或 `G14:k3s::`。WSL provider 的 Windows plane 固定使用 `win`,不得使用 `win32`;Windows operation 必须显式区分:`ps` 执行 Windows PowerShell heredoc 或一行 PowerShell 命令,`cmd` 执行 cmd.exe/batch,`skills` 发现 Windows skill 目录。需要 Windows cwd 时用 `trans D601:win/c/test ps` 或 `trans D601:win/c/test cmd cd`,CLI 自动设置 UTF-8/Python 编码默认值;`cmd` 额外设置 `chcp 65001`。非交互远端命令优先使用 `trans argv ...`;需要 POSIX shell 脚本、管道、变量或循环时优先使用 quoted heredoc 单步传输,例如 `trans G14 script <<'SCRIPT'`、`trans G14:k3s script <<'SCRIPT'` 或 `trans G14:k3s:: script <<'SCRIPT'`,把脚本走 stdin。`script` 只表示 host/k3s POSIX shell,不表示 Windows PowerShell;Windows PowerShell 必须写 `trans :win ps <<'PS'`。`script -- '<单个字符串>'` 是无需 stdin 的远端 POSIX shell one-liner,例如 `trans G14:/root/hwlab script -- 'cd /root/hwlab && git status --short --branch'`;`script -- <多个 argv>` 才是 direct argv,适合 `trans D601:/path script -- sed -n '1,20p' file` 这类带短横线的单进程命令。顶层 remote option parser 必须保留命令已经开始后的 `--`,不得把它吞成全局选项结束符。需要远端改文本文件时默认优先使用 ` apply-patch < patch.diff`;需要可靠传输非文本或整文件时使用 ` upload ` 和 ` download `,CLI 会按字节数与 SHA-256 自动校验并在 provider-gateway stdin/argv 限制下切换客户端分块策略;需要旧 helper 时显式使用 `:k3s:: apply-patch-v1` 或 ` apply-patch-v1`。ssh-like 命令遇到 timeout/kex/255 类失败时,CLI 会在 stderr 追加一行 `UNIDESK_SSH_HINT` JSON,提示 stdin script/argv 重试和 provider triage 交叉验证。 diff --git a/docs/reference/deployment.md b/docs/reference/deployment.md index 7481cb92..8e948b8a 100644 --- a/docs/reference/deployment.md +++ b/docs/reference/deployment.md @@ -52,7 +52,7 @@ MiniMax-M3 配置必须保持 profile/provider 级隔离。当前 `mxcx` 的稳 swap 管理不能被强塞进所有热路径。`server start/status` 可以暴露 warning 或摘要,但不会自动创建 swap;需要变更主机 swap 时必须显式运行 `server swap ensure`,并用返回的 `before`/`after` 和 `fstab.persisted` 作为验收记录。 -根分区 Docker 镜像高水位治理必须先走 `bun scripts/cli.ts server cleanup plan` 的只读 dry-run。该计划只针对 Docker image inventory:默认只把创建时间超过 24 小时且不在保护集里的镜像列为 stale,输出 active containers/images、protected images、candidate stale images、风险、估算释放空间和人工复核命令,但不删除、不 prune、不改容器、不碰 volume。候选必须从白名单保护集中排除:running container image ID、stopped container 引用 image ID、Compose stable image、`deploy.json`/`CI.json` 当前 commit artifact、上游 digest pin 和 provider-gateway runner image。计划还必须显式保护 PostgreSQL named volume、Baidu Netdisk `.state`/staging、D601 registry storage 和所有 Docker volume/host data 目录。任何真实清理必须作为未来显式授权操作实现,且不得用 `docker system prune`、`docker image prune`、`docker builder prune` 或数据库清理替代 dry-run 审批;数据库清理前必须先确认可用备份。 +根分区 Docker 镜像高水位治理必须先走 `bun scripts/cli.ts server cleanup plan` 的 dry-run,再用 `bun scripts/cli.ts server cleanup run --confirm` 执行同一 classifier 选出的 stale image 删除。该入口只针对 Docker image inventory:默认只把创建时间超过 24 小时且不在保护集里的镜像列为 stale,输出 active containers/images、protected images、candidate stale images、风险、估算释放空间、人工复核命令和执行结果;高风险候选必须额外显式 `--include-high-risk` 才会执行。候选必须从白名单保护集中排除:running container image ID、stopped container 引用 image ID、Compose stable image、`deploy.json`/`CI.json` 当前 commit artifact、上游 digest pin 和 provider-gateway runner image。计划还必须显式保护 PostgreSQL named volume、Baidu Netdisk `.state`/staging、D601 registry storage 和所有 Docker volume/host data 目录。该入口不得用 `docker system prune`、`docker image prune`、`docker builder prune`、volume 清理或数据库清理替代 dry-run 审批;数据库清理前必须先确认可用备份。 ## Start And Stop diff --git a/docs/reference/gc.md b/docs/reference/gc.md index ced600dd..a8efb0a7 100644 --- a/docs/reference/gc.md +++ b/docs/reference/gc.md @@ -12,6 +12,8 @@ UniDesk 的磁盘治理入口是 `bun scripts/cli.ts gc ...`。该入口用于 所有成功和失败输出都必须是 JSON。`plan` 必须标记 `dryRun=true`、`mutation=false`;`run` 必须要求 `--confirm` 并报告 `diskBefore`、`diskAfter`、`summary`、`results` 和 `protected`。远端 GC 可用 `--target-use-percent N` 显式表达目标根盘水位;`summary.target` 必须给出目标所需释放量、候选估算、预计水位、缺口和 `safeStop` 决策,避免靠人工心算判断是否应该继续扩大清理范围。 +默认 `/tmp` GC 只包含 allowlisted 诊断目录和已知低风险路径。非 allowlist 的 stale `/tmp` 一级子项必须显式 `--include-stale-tmp` 才能进入候选;扫描按 `--limit` 有界枚举候选,执行时仍只允许删除 `/tmp` 直接子项,并避开 X11/ICE/font socket、systemd private、tmux、ssh、vscode 等系统/session 前缀。该入口不能递归扩大成通用 `/tmp` 清空器,也不能为了估算全量临时目录而长时间阻塞。 + ## Protected Data 默认 GC 不得删除或 prune 以下对象: diff --git a/scripts/src/gc.ts b/scripts/src/gc.ts index 3c9db0f6..ea624404 100644 --- a/scripts/src/gc.ts +++ b/scripts/src/gc.ts @@ -1,5 +1,5 @@ import { spawnSync } from "node:child_process"; -import { closeSync, existsSync, ftruncateSync, lstatSync, mkdirSync, openSync, readdirSync, readSync, rmSync, statSync, unlinkSync, writeFileSync, writeSync } from "node:fs"; +import { closeSync, existsSync, ftruncateSync, lstatSync, mkdirSync, opendirSync, openSync, readdirSync, readSync, rmSync, statSync, unlinkSync, writeFileSync, writeSync } from "node:fs"; import { basename, join, resolve } from "node:path"; import { type UniDeskConfig, repoRoot, rootPath } from "./config"; @@ -13,6 +13,7 @@ type GcItemKind = | "journal-vacuum" | "docker-build-cache-prune" | "tmp-path-delete" + | "stale-tmp-path-delete" | "browser-cache-delete" | "tool-cache-delete" | "vscode-server-delete" @@ -33,6 +34,7 @@ interface GcOptions { buildCacheAll: boolean; tmp: boolean; tmpMinAgeHours: number; + staleTmp: boolean; browserCache: boolean; toolCaches: boolean; vscodeStaleServers: boolean; @@ -169,6 +171,7 @@ const DEFAULT_OPTIONS: GcOptions = { buildCacheAll: false, tmp: true, tmpMinAgeHours: 24, + staleTmp: false, browserCache: false, toolCaches: false, vscodeStaleServers: false, @@ -225,6 +228,17 @@ const TMP_EXACT_PROTECT = new Set([ "/tmp/tmux-0", ]); +const STALE_TMP_PROTECTED_PREFIXES = [ + ".X", + ".ICE", + ".font-unix", + ".Test-unix", + "systemd-private-", + "tmux-", + "ssh-", + "vscode-", +]; + const TOOL_CACHE_ALLOWLIST = [ { id: "npm-cacache", @@ -281,6 +295,9 @@ const TOOL_CACHE_ALLOWLIST = [ const VSCODE_SERVER_ROOT = "/root/.vscode-server/cli/servers"; const VSCODE_EXTENSION_ROOT = "/root/.vscode-server/extensions"; const BAIDU_STAGING_RELATIVE_ROOT = [".state", "baidu-netdisk", "staging"]; +const DEFAULT_PATH_SIZE_TIMEOUT_MS = 5_000; +const STALE_TMP_PATH_SIZE_TIMEOUT_MS = 1_500; +const STALE_TMP_MAX_CANDIDATES = 1_000; export async function runGcCommand(config: UniDeskConfig, args: string[]): Promise { const [action = "plan", ...rest] = args; @@ -374,6 +391,10 @@ export function gcPlan(config: UniDeskConfig, options: GcOptions = DEFAULT_OPTIO if (options.tmp) { candidates.push(...collectTmpCandidates(options, observedAt)); } + if (options.staleTmp) { + const alreadySelected = new Set(candidates.map((candidate) => candidate.path).filter((path): path is string => path !== undefined)); + candidates.push(...collectStaleTmpCandidates(options, observedAt, alreadySelected)); + } if (options.browserCache) { const item = collectBrowserCacheCandidate(); if (item !== null) candidates.push(item); @@ -459,7 +480,7 @@ export function gcPlan(config: UniDeskConfig, options: GcOptions = DEFAULT_OPTIO notes: [ "gc run only executes listed one-time cleanup actions after --confirm.", options.full ? "Full candidate output requested." : `Default output is capped to ${options.limit} candidates; use --full or --limit N for broader disclosure.`, - "Tool caches, stale VS Code server versions and stale VS Code extension versions are opt-in and require explicit include flags.", + "Tool caches, stale /tmp direct children, stale VS Code server versions and stale VS Code extension versions are opt-in and require explicit include flags.", "Baidu Netdisk staging cleanup is opt-in and only selects old PGDATA backup tarballs under server-data/unidesk-pg-data.", "Database event retention is diagnostic-only in this command; cleanups for oa_events require a backup and a separate schema/retention change.", "Docker image cleanup stays under server cleanup plan; gc does not run docker system prune or docker image prune.", @@ -540,6 +561,10 @@ function parseGcOptions(args: string[]): GcOptions { options.buildCacheAll = true; } else if (arg === "--tmp-min-age-hours") { options.tmpMinAgeHours = parseNonNegativeNumber(arg, args[++index]); + } else if (arg === "--include-stale-tmp") { + options.staleTmp = true; + } else if (arg === "--no-stale-tmp") { + options.staleTmp = false; } else if (arg === "--include-browser-cache") { options.browserCache = true; } else if (arg === "--no-browser-cache") { @@ -705,6 +730,7 @@ function publicOptions(options: GcOptions): Record { buildCacheAll: options.buildCacheAll, tmp: options.tmp, tmpMinAgeHours: options.tmpMinAgeHours, + staleTmp: options.staleTmp, browserCache: options.browserCache, toolCaches: options.toolCaches, vscodeStaleServers: options.vscodeStaleServers, @@ -834,7 +860,7 @@ function collectTmpCandidates(options: GcOptions, observedAt: string): GcCandida continue; } if (stat.mtimeMs >= cutoffMs) continue; - const sizeBytes = safePathSize(path); + const sizeBytes = safePathSize(path, STALE_TMP_PATH_SIZE_TIMEOUT_MS); if (sizeBytes <= 0) continue; result.push({ id: `tmp:${path}`, @@ -850,6 +876,52 @@ function collectTmpCandidates(options: GcOptions, observedAt: string): GcCandida return result.sort((left, right) => right.estimatedReclaimBytes - left.estimatedReclaimBytes); } +function collectStaleTmpCandidates(options: GcOptions, observedAt: string, alreadySelected: Set): GcCandidate[] { + const root = "/tmp"; + if (!existsSync(root)) return []; + const cutoffMs = new Date(observedAt).getTime() - options.tmpMinAgeHours * 60 * 60 * 1000; + const candidateLimit = Math.min(options.limit, STALE_TMP_MAX_CANDIDATES); + const result: GcCandidate[] = []; + const dir = opendirSync(root); + try { + let entry; + while ((entry = dir.readSync()) !== null && result.length < candidateLimit) { + const name = entry.name; + const path = join(root, name); + if (alreadySelected.has(path)) continue; + if (TMP_EXACT_PROTECT.has(path)) continue; + if (isStaleTmpProtectedName(name)) continue; + if (!entry.isDirectory() && !entry.isFile() && !entry.isSymbolicLink()) continue; + let stat; + try { + stat = lstatSync(path); + } catch { + continue; + } + if (stat.mtimeMs >= cutoffMs) continue; + const sizeBytes = safePathSize(path, STALE_TMP_PATH_SIZE_TIMEOUT_MS); + if (sizeBytes <= 0) continue; + result.push({ + id: `stale-tmp:${path}`, + kind: "stale-tmp-path-delete", + risk: "medium", + description: `Delete one bounded direct /tmp child older than ${options.tmpMinAgeHours} hours`, + path, + sizeBytes, + estimatedReclaimBytes: sizeBytes, + action: { op: "rm-recursive", allowlist: "tmp-direct-stale", minAgeHours: options.tmpMinAgeHours, boundedByLimit: candidateLimit }, + }); + } + } finally { + dir.closeSync(); + } + return result.sort((left, right) => right.estimatedReclaimBytes - left.estimatedReclaimBytes); +} + +function isStaleTmpProtectedName(name: string): boolean { + return STALE_TMP_PROTECTED_PREFIXES.some((prefix) => name.startsWith(prefix)); +} + function collectBrowserCacheCandidate(): GcCandidate | null { const path = rootPath(".state", "playwright-browsers"); if (!existsSync(path)) return null; @@ -1361,6 +1433,12 @@ function executeCandidate(candidate: GcCandidate, options: GcOptions): { reclaim rmSync(candidate.path, { recursive: true, force: true }); return { reclaimedBytes: before }; } + if (candidate.kind === "stale-tmp-path-delete" && candidate.path !== undefined) { + assertStaleTmpCandidatePath(candidate.path); + const before = safePathSize(candidate.path); + rmSync(candidate.path, { recursive: true, force: true }); + return { reclaimedBytes: before }; + } if (candidate.kind === "browser-cache-delete" && candidate.path !== undefined) { const expected = rootPath(".state", "playwright-browsers"); if (resolve(candidate.path) !== resolve(expected)) throw new Error(`refusing to remove unexpected browser cache path: ${candidate.path}`); @@ -1447,6 +1525,16 @@ function assertTmpCandidatePath(path: string): void { } } +function assertStaleTmpCandidatePath(path: string): void { + const resolved = resolve(path); + if (!resolved.startsWith("/tmp/")) throw new Error(`refusing to remove non-/tmp path: ${path}`); + if (TMP_EXACT_PROTECT.has(resolved)) throw new Error(`refusing to remove protected tmp path: ${path}`); + const relativePath = resolved.slice("/tmp/".length); + if (relativePath.length === 0 || relativePath.includes("/")) throw new Error(`refusing to remove nested tmp path: ${path}`); + const name = basename(resolved); + if (isStaleTmpProtectedName(name)) throw new Error(`refusing to remove protected stale tmp path: ${path}`); +} + function assertToolCacheCandidatePath(path: string): void { const resolved = resolve(path); const allowed = TOOL_CACHE_ALLOWLIST.some((item) => resolve(item.path) === resolved); @@ -1560,19 +1648,23 @@ function collectFiles(root: string): Array<{ path: string; sizeBytes: number; mt return result; } -function safePathSize(path: string): number { +function safePathSize(path: string, timeoutMs = DEFAULT_PATH_SIZE_TIMEOUT_MS): number { + return pathSizeFromDu(path, timeoutMs) ?? 0; +} + +function pathSizeFromDu(path: string, timeoutMs: number): number | null { try { const stat = lstatSync(path); if (stat.isFile() || stat.isSymbolicLink()) return stat.size; - if (!stat.isDirectory()) return 0; - let total = 0; - for (const entry of readdirSync(path)) { - total += safePathSize(join(path, entry)); - } - return total; + if (!stat.isDirectory()) return null; } catch { - return 0; + return null; } + const result = command(["du", "-sb", "--one-file-system", "--", path], timeoutMs); + if (result.exitCode !== 0 || result.timedOut) return null; + const rawSize = result.stdout.trim().split(/\s+/u)[0] ?? ""; + const sizeBytes = Number(rawSize); + return Number.isFinite(sizeBytes) && sizeBytes >= 0 ? sizeBytes : null; } function safeFileSize(path: string): number { diff --git a/scripts/src/server-cleanup.ts b/scripts/src/server-cleanup.ts index 25f77d66..6fcf28fa 100644 --- a/scripts/src/server-cleanup.ts +++ b/scripts/src/server-cleanup.ts @@ -56,6 +56,8 @@ export interface DockerCleanupInventory { export interface ServerCleanupPlanOptions { minAgeHours: number; limit: number; + confirm: boolean; + includeHighRisk: boolean; } export interface CleanupImageSummary { @@ -92,6 +94,15 @@ export interface CleanupCommandReview { reviewChecklist: string[]; } +interface DiskSnapshot { + filesystem: string; + sizeBytes: number; + usedBytes: number; + availableBytes: number; + usePercent: number; + mount: string; +} + export interface ServerCleanupPlan { ok: boolean; dryRun: true; @@ -107,7 +118,7 @@ export interface ServerCleanupPlan { dockerVolumesTouched: false; dataDirectoriesTouched: false; databaseCleanupIncluded: false; - liveCleanupImplemented: false; + liveCleanupImplemented: boolean; note: string; }; inventory: { @@ -147,6 +158,49 @@ export interface ServerCleanupPlan { prohibitedCommands: string[]; } +interface ServerCleanupRun { + ok: boolean; + dryRun: false; + mutation: true; + action: "server cleanup run"; + scope: "docker-images-only"; + observedAt: string; + options: ServerCleanupPlanOptions; + diskBefore: DiskSnapshot | null; + diskAfter: DiskSnapshot | null; + summary: { + plannedCandidateCount: number; + attemptedCount: number; + succeededCount: number; + failedCount: number; + skippedHighRiskCount: number; + estimatedReclaimBytes: number; + actualDiskReclaimBytes: number | null; + }; + results: Array<{ + imageId: string; + shortId: string; + repoTags: string[]; + repoDigests: string[]; + risk: Exclude; + estimatedReclaimBytes: number; + command: string[]; + status: "succeeded" | "failed" | "skipped"; + reason?: string; + exitCode?: number | null; + stdoutTail?: string; + stderrTail?: string; + }>; + policy: { + deletionExecuted: true; + dockerPruneUsed: false; + dockerVolumesTouched: false; + dataDirectoriesTouched: false; + databaseCleanupIncluded: false; + highRiskRequiresIncludeFlag: true; + }; +} + interface DeployServiceCommit { environment: string; serviceId: string; @@ -156,6 +210,8 @@ interface DeployServiceCommit { const defaultOptions: ServerCleanupPlanOptions = { minAgeHours: 24, limit: 200, + confirm: false, + includeHighRisk: false, }; export function parseServerCleanupOptions(args: string[]): ServerCleanupPlanOptions { @@ -174,8 +230,16 @@ export function parseServerCleanupOptions(args: string[]): ServerCleanupPlanOpti if (!Number.isInteger(value) || value <= 0) throw new Error("--limit must be a positive integer"); options.limit = Math.min(value, 1000); index += 1; + } else if (arg === "--confirm") { + options.confirm = true; + } else if (arg === "--dry-run") { + options.confirm = false; + } else if (arg === "--include-high-risk") { + options.includeHighRisk = true; + } else if (arg === "--no-high-risk") { + options.includeHighRisk = false; } else { - throw new Error(`unknown server cleanup plan option: ${arg}`); + throw new Error(`unknown server cleanup option: ${arg}`); } } return options; @@ -186,14 +250,31 @@ export async function runServerCleanupCommand(config: UniDeskConfig, args: strin if (action === "plan" || action === "dry-run") { return serverCleanupPlan(config, parseServerCleanupOptions(rest)); } + if (action === "run") { + const options = parseServerCleanupOptions(rest); + if (!options.confirm) { + return { + ok: false, + error: "server-cleanup-run-requires-confirm", + dryRun: true, + mutation: false, + next: { + plan: `bun scripts/cli.ts server cleanup plan --min-age-hours ${options.minAgeHours} --limit ${options.limit}`, + confirm: `bun scripts/cli.ts server cleanup run --confirm --min-age-hours ${options.minAgeHours} --limit ${options.limit}`, + }, + policy: "server cleanup run removes only listed stale Docker images; high-risk images require --include-high-risk.", + }; + } + return serverCleanupRun(config, options); + } return { ok: false, error: "unsupported-server-cleanup-action", action, - supportedActions: ["plan"], + supportedActions: ["plan", "run"], dryRunOnly: true, mutation: false, - policy: "This task implements only server cleanup plan. Real image deletion is intentionally not implemented and must require a future explicit approval parameter.", + policy: "server cleanup run requires --confirm and removes only stale Docker images selected by the same plan classifier.", }; } @@ -307,8 +388,8 @@ export function buildDockerCleanupPlan(inventory: DockerCleanupInventory, option dockerVolumesTouched: false, dataDirectoriesTouched: false, databaseCleanupIncluded: false, - liveCleanupImplemented: false, - note: "This command only inventories Docker images and builds a dry-run review plan. It never runs docker image rm, docker prune, docker volume rm, rm, or database cleanup.", + liveCleanupImplemented: true, + note: "This command inventories Docker images and builds a dry-run review plan. Confirmed execution is available through server cleanup run --confirm; it never runs docker prune, docker volume rm, rm, or database cleanup.", }, inventory: { dockerAvailable: inventory.collection.dockerAvailable, @@ -369,6 +450,91 @@ export function buildDockerCleanupPlan(inventory: DockerCleanupInventory, option }; } +export function serverCleanupRun(config: UniDeskConfig, options: ServerCleanupPlanOptions = defaultOptions): ServerCleanupRun { + const diskBefore = rootDiskSnapshot(); + const plan = serverCleanupPlan(config, options); + const selected = plan.candidateStaleImages; + const results: ServerCleanupRun["results"] = []; + + for (const image of selected) { + const command = image.commandsToReview[0] ?? imageRemoveCommand({ + id: image.id, + repoTags: image.repoTags, + repoDigests: image.repoDigests, + sizeBytes: image.sizeBytes, + createdAt: image.createdAt, + labels: {}, + }); + if (image.risk === "high" && !options.includeHighRisk) { + results.push({ + imageId: image.id, + shortId: image.shortId, + repoTags: image.repoTags, + repoDigests: image.repoDigests, + risk: image.risk, + estimatedReclaimBytes: image.sizeBytes, + command, + status: "skipped", + reason: "high-risk-requires-include-high-risk", + }); + continue; + } + const remove = runCommand(command, repoRoot, { timeoutMs: 60_000 }); + const presentAfterRemove = remove.exitCode === 0 ? false : dockerImagePresent(image.id); + const succeeded = remove.exitCode === 0 || presentAfterRemove === false; + results.push({ + imageId: image.id, + shortId: image.shortId, + repoTags: image.repoTags, + repoDigests: image.repoDigests, + risk: image.risk, + estimatedReclaimBytes: image.sizeBytes, + command, + status: succeeded ? "succeeded" : "failed", + reason: remove.exitCode !== 0 && presentAfterRemove === false ? "image-absent-after-remove" : undefined, + exitCode: remove.exitCode, + stdoutTail: tailText(remove.stdout, 4000), + stderrTail: tailText(remove.stderr, 4000), + }); + } + + const diskAfter = rootDiskSnapshot(); + const failedCount = results.filter((item) => item.status === "failed").length; + const succeededCount = results.filter((item) => item.status === "succeeded").length; + const skippedHighRiskCount = results.filter((item) => item.status === "skipped").length; + const attempted = results.filter((item) => item.status !== "skipped"); + const actualDiskReclaimBytes = diskBefore !== null && diskAfter !== null ? diskAfter.availableBytes - diskBefore.availableBytes : null; + return { + ok: plan.ok && failedCount === 0, + dryRun: false, + mutation: true, + action: "server cleanup run", + scope: "docker-images-only", + observedAt: new Date().toISOString(), + options, + diskBefore, + diskAfter, + summary: { + plannedCandidateCount: selected.length, + attemptedCount: attempted.length, + succeededCount, + failedCount, + skippedHighRiskCount, + estimatedReclaimBytes: attempted.reduce((sum, item) => sum + item.estimatedReclaimBytes, 0), + actualDiskReclaimBytes, + }, + results, + policy: { + deletionExecuted: true, + dockerPruneUsed: false, + dockerVolumesTouched: false, + dataDirectoriesTouched: false, + databaseCleanupIncluded: false, + highRiskRequiresIncludeFlag: true, + }, + }; +} + function collectDockerCleanupInventory(config: UniDeskConfig): DockerCleanupInventory { const observedAt = new Date().toISOString(); const desired = collectDesiredImagePolicy(config); @@ -829,6 +995,35 @@ function commandError(command: string[], message: string, exitCode: number | nul return { command, message, exitCode, stderrTail: stderr.slice(-1200) }; } +function dockerImagePresent(imageId: string): boolean | null { + const inspect = runCommand(["docker", "image", "inspect", imageId], repoRoot, { timeoutMs: 15_000 }); + if (inspect.exitCode === 0) return true; + const text = `${inspect.stdout}\n${inspect.stderr}`; + if (/no such image|no such object/i.test(text)) return false; + return null; +} + +function rootDiskSnapshot(): DiskSnapshot | null { + const result = runCommand(["df", "-B1", "-P", "/"], repoRoot, { timeoutMs: 5000 }); + if (result.exitCode !== 0) return null; + const line = result.stdout.trim().split(/\r?\n/u)[1]; + if (!line) return null; + const parts = line.trim().split(/\s+/u); + if (parts.length < 6) return null; + return { + filesystem: parts[0] ?? "", + sizeBytes: Number(parts[1]), + usedBytes: Number(parts[2]), + availableBytes: Number(parts[3]), + usePercent: Number((parts[4] ?? "0").replace("%", "")), + mount: parts[5] ?? "/", + }; +} + +function tailText(value: string, maxChars: number): string { + return value.length <= maxChars ? value : value.slice(-maxChars); +} + function shortContainerId(id: string): string { return id.slice(0, 12); }