gc: retain vpn diagnostic pcaps

This commit is contained in:
Codex
2026-06-11 17:11:46 +00:00
parent 35c18784a2
commit 79b4c668c0
3 changed files with 116 additions and 6 deletions
+5 -2
View File
@@ -86,10 +86,13 @@ bun scripts/cli.ts gc plan --target-use-percent 69 \
--include-stale-tmp \
--include-vscode-stale-servers \
--include-vscode-stale-extensions \
--include-baidu-staging
--include-baidu-staging \
--include-vpn-diagnostic-logs
```
`--target-use-percent` 按 `df` 显示口径估算 shortfall。工具缓存、`/tmp` 非 allowlist 直接子项、VS Code 历史 server/extension 版本、Baidu staging 旧 PGDATA tarball 均默认不启用;必须显式 include 后才进入候选,且执行时仍受路径断言保护。stale `/tmp` 扫描按 `--limit` 有界枚举候选,避免为了估算全量临时目录而长时间无输出。默认 GC 不触碰 PGDATA、Docker volumes/images、Codex sessions/auth state 和 Baidu staging 根目录。
`--target-use-percent` 按 `df` 显示口径估算 shortfall。工具缓存、`/tmp` 非 allowlist 直接子项、VS Code 历史 server/extension 版本、Baidu staging 旧 PGDATA tarball、VPN 诊断 ring pcap 均默认不启用;必须显式 include 后才进入候选,且执行时仍受路径断言保护。stale `/tmp` 扫描按 `--limit` 有界枚举候选,避免为了估算全量临时目录而长时间无输出。VPN 诊断日志只选择 `/root/vpn-server/logs/hy2-udp-ring-*.pcap` 和 `hy2-monitor-ring-*.pcap` 中超过 `--vpn-diagnostic-log-keep-hours` 的普通文件,执行前检查 active fd;不删除 evidence JSONL。默认 GC 不触碰 PGDATA、Docker volumes/images、Codex sessions/auth state、Baidu staging 根目录或 VPN 日志根目录。
`gc policy install` 的每日 timer 会自动执行 24 小时 VPN 诊断 pcap retention,用于限制长期 tcpdump ring 文件增长;手动 `gc plan/run` 仍必须显式 `--include-vpn-diagnostic-logs` 才会列出或删除这些 pcap。
---
+4 -1
View File
@@ -6,7 +6,7 @@ UniDesk 的磁盘治理入口是 `bun scripts/cli.ts gc ...`。该入口用于
- `gc plan`:只读生成主 server 清理候选、估算收益、风险等级、保护对象和数据库诊断摘要。
- `gc run --confirm`:只执行当前 plan 可见候选页,默认不执行分页隐藏候选;用 `--limit`、`--result-limit`、`--full|--raw` 控制披露和执行范围。
- `gc policy plan|install`:渲染或安装低风险长期策略,例如 journald cap 和每日 allowlisted 文件/tmp 清理 timer。
- `gc policy plan|install`:渲染或安装低风险长期策略,例如 journald cap、每日 allowlisted 文件/tmp 清理 timer 和 24 小时 VPN 诊断 pcap retention。
- `gc db-trace plan|run --confirm --before-date YYYY-MM-DD --vacuum-full`:显式 trace 遥测留存入口;涉及数据库重写时按维护窗口处理。
- `gc remote <providerId> plan|run --confirm|status --job-id <id>`:通过 UniDesk SSH 透传在 provider host 上执行受控 GC。远端长任务必须使用异步 job 和 `status` 短查询,不应让单次 SSH 等待完整 registry GC 或其他长清理。
@@ -14,6 +14,8 @@ UniDesk 的磁盘治理入口是 `bun scripts/cli.ts gc ...`。该入口用于
默认 `/tmp` GC 只包含 allowlisted 诊断目录和已知低风险路径。非 allowlist 的 stale `/tmp` 一级子项必须显式 `--include-stale-tmp` 才能进入候选;扫描按 `--limit` 有界枚举候选,执行时仍只允许删除 `/tmp` 直接子项,并避开 X11/ICE/font socket、systemd private、tmux、ssh、vscode 等系统/session 前缀。该入口不能递归扩大成通用 `/tmp` 清空器,也不能为了估算全量临时目录而长时间阻塞。
主 server VPN 诊断日志默认不清理。`/root/vpn-server/logs` 中由长期 `tcpdump -G` 产生的 `hy2-udp-ring-*.pcap` 和 `hy2-monitor-ring-*.pcap` 可通过显式 `--include-vpn-diagnostic-logs` 进入候选,默认只选择超过 `--vpn-diagnostic-log-keep-hours 24` 的普通 pcap 文件。执行前必须重新校验路径、文件名、非 symlink/regular file,并用 active-file 检查确认没有进程仍打开该文件。`hy2-server-evidence.jsonl`、stdout/stderr log、最新 pcap 和整个日志根目录始终作为 protected 输出,不得被这个入口删除或截断。
## Protected Data
默认 GC 不得删除或 prune 以下对象:
@@ -23,6 +25,7 @@ UniDesk 的磁盘治理入口是 `bun scripts/cli.ts gc ...`。该入口用于
| PostgreSQL PGDATA | 数据库权威状态,必须走备份、留存或迁移流程 |
| Docker image/container/volume | 运行面和发布真相可能依赖旧镜像或 volume |
| Baidu Netdisk staging/backups | 备份链路状态和可重建缓存边界需单独判定 |
| VPN diagnostic evidence logs | `/root/vpn-server/logs/hy2-server-evidence.jsonl` 等 active evidence 流用于网络排障,不随 pcap retention 删除 |
| D601 registry storage | artifact registry retention 需使用专门入口 |
| `/var/lib/rancher/k3s`、`/var/lib/rancher/k3s/storage` | k3s 控制面、containerd 状态和 local-path PVC 数据 |
| `/var/lib/kubelet`、`/var/lib/containerd` | kubelet/runtime 状态和可能被 workload 复用的 image cache |
+107 -3
View File
@@ -18,7 +18,8 @@ type GcItemKind =
| "tool-cache-delete"
| "vscode-server-delete"
| "vscode-extension-delete"
| "baidu-staging-file-delete";
| "baidu-staging-file-delete"
| "vpn-diagnostic-pcap-delete";
interface GcOptions {
fileLogs: boolean;
@@ -43,6 +44,8 @@ interface GcOptions {
vscodeKeepExtensionVersions: number;
baiduStaging: boolean;
baiduStagingKeepDays: number;
vpnDiagnosticLogs: boolean;
vpnDiagnosticLogKeepHours: number;
dbSummary: boolean;
limit: number;
resultLimit: number;
@@ -180,6 +183,8 @@ const DEFAULT_OPTIONS: GcOptions = {
vscodeKeepExtensionVersions: 1,
baiduStaging: false,
baiduStagingKeepDays: 10,
vpnDiagnosticLogs: false,
vpnDiagnosticLogKeepHours: 24,
dbSummary: true,
limit: 50,
resultLimit: 50,
@@ -295,6 +300,8 @@ const TOOL_CACHE_ALLOWLIST = [
const VSCODE_SERVER_ROOT = "/root/.vscode-server/cli/servers";
const VSCODE_EXTENSION_ROOT = "/root/.vscode-server/extensions";
const BAIDU_STAGING_RELATIVE_ROOT = [".state", "baidu-netdisk", "staging"];
// Directory whose direct children are scanned for VPN diagnostic ring pcap candidates.
// The directory itself is reported as a protected item and is never a removal target.
const VPN_DIAGNOSTIC_LOG_ROOT = "/root/vpn-server/logs";
// Allowlisted file names: `hy2-udp-ring-` or `hy2-monitor-ring-` followed by exactly
// 14 digits and the `.pcap` extension (the digits are presumably a capture timestamp —
// TODO confirm against the tcpdump invocation). Anything else is never selected.
const VPN_DIAGNOSTIC_RING_PCAP_PATTERN = /^hy2-(?:udp|monitor)-ring-\d{14}\.pcap$/u;
const DEFAULT_PATH_SIZE_TIMEOUT_MS = 5_000;
const STALE_TMP_PATH_SIZE_TIMEOUT_MS = 1_500;
const STALE_TMP_MAX_CANDIDATES = 1_000;
@@ -446,6 +453,10 @@ export function gcPlan(config: UniDeskConfig, options: GcOptions = DEFAULT_OPTIO
if (options.baiduStaging) {
candidates.push(...collectBaiduStagingCandidates(options, observedAt));
}
if (options.vpnDiagnosticLogs) {
candidates.push(...collectVpnDiagnosticPcapCandidates(options, observedAt));
}
protectedItems.push(...collectProtectedVpnDiagnosticLogs(options));
protectedItems.push(...collectProtectedStorage(config, options));
const databaseSummary = options.dbSummary ? collectDatabaseSummary() : { skipped: true, reason: "disabled-by-option" };
@@ -482,6 +493,7 @@ export function gcPlan(config: UniDeskConfig, options: GcOptions = DEFAULT_OPTIO
options.full ? "Full candidate output requested." : `Default output is capped to ${options.limit} candidates; use --full or --limit N for broader disclosure.`,
"Tool caches, stale /tmp direct children, stale VS Code server versions and stale VS Code extension versions are opt-in and require explicit include flags.",
"Baidu Netdisk staging cleanup is opt-in and only selects old PGDATA backup tarballs under server-data/unidesk-pg-data.",
"VPN diagnostic pcap cleanup is opt-in and only selects stale hy2 ring pcap files; active pcap files and evidence JSONL are protected.",
"Database event retention is diagnostic-only in this command; cleanups for oa_events require a backup and a separate schema/retention change.",
"Docker image cleanup stays under server cleanup plan; gc does not run docker system prune or docker image prune.",
],
@@ -597,6 +609,12 @@ function parseGcOptions(args: string[]): GcOptions {
options.baiduStaging = false;
} else if (arg === "--baidu-staging-keep-days") {
options.baiduStagingKeepDays = parsePositiveIntegerOption(arg, args[++index], 3650);
} else if (arg === "--include-vpn-diagnostic-logs") {
options.vpnDiagnosticLogs = true;
} else if (arg === "--no-vpn-diagnostic-logs") {
options.vpnDiagnosticLogs = false;
} else if (arg === "--vpn-diagnostic-log-keep-hours") {
options.vpnDiagnosticLogKeepHours = parsePositiveIntegerOption(arg, args[++index], 365 * 24);
} else if (arg === "--no-file-logs" || arg === "--no-logs") {
options.fileLogs = false;
} else if (arg === "--no-docker-logs") {
@@ -739,6 +757,8 @@ function publicOptions(options: GcOptions): Record<string, unknown> {
vscodeKeepExtensionVersions: options.vscodeKeepExtensionVersions,
baiduStaging: options.baiduStaging,
baiduStagingKeepDays: options.baiduStagingKeepDays,
vpnDiagnosticLogs: options.vpnDiagnosticLogs,
vpnDiagnosticLogKeepHours: options.vpnDiagnosticLogKeepHours,
dbSummary: options.dbSummary,
limit: options.limit,
resultLimit: options.resultLimit,
@@ -1081,6 +1101,62 @@ function collectBaiduStagingCandidates(options: GcOptions, observedAt: string):
return result.sort((left, right) => right.estimatedReclaimBytes - left.estimatedReclaimBytes);
}
/**
 * Collects stale VPN diagnostic ring pcap files as opt-in deletion candidates.
 *
 * Only direct children of `VPN_DIAGNOSTIC_LOG_ROOT` whose names match
 * `VPN_DIAGNOSTIC_RING_PCAP_PATTERN` are considered. A file becomes a candidate when it is
 * a non-empty regular file, its mtime is older than the keep window, and it is not the
 * newest capture of its ring family (`hy2-udp-ring-` / `hy2-monitor-ring-`).
 *
 * The function fails closed: if the cutoff cannot be computed (unparseable `observedAt`
 * or a non-finite keep window) no candidates are returned, because comparisons against
 * `NaN` would otherwise classify every ring file as stale.
 *
 * @param options - GC options; `vpnDiagnosticLogKeepHours` defines the retention window.
 * @param observedAt - Observation timestamp string, parsed with `new Date(...)`.
 * @returns Candidates sorted by estimated reclaim size, largest first.
 */
function collectVpnDiagnosticPcapCandidates(options: GcOptions, observedAt: string): GcCandidate[] {
  if (!existsSync(VPN_DIAGNOSTIC_LOG_ROOT)) return [];

  const keepHours = options.vpnDiagnosticLogKeepHours;
  const cutoffMs = new Date(observedAt).getTime() - keepHours * 60 * 60 * 1000;
  // `mtimeMs >= NaN` is always false, which would mark every file as stale; refuse instead.
  if (!Number.isFinite(cutoffMs)) return [];

  type RingFile = { name: string; path: string; family: string; sizeBytes: number; mtimeMs: number };
  // The allowlist pattern guarantees names end in 14 digits plus ".pcap"; stripping that
  // suffix leaves the ring family prefix.
  const ringSuffixLength = 14 + ".pcap".length;

  const ringFiles: RingFile[] = [];
  for (const entry of readdirSync(VPN_DIAGNOSTIC_LOG_ROOT, { withFileTypes: true })) {
    if (!entry.isFile() || !VPN_DIAGNOSTIC_RING_PCAP_PATTERN.test(entry.name)) continue;
    const path = join(VPN_DIAGNOSTIC_LOG_ROOT, entry.name);
    let stat;
    try {
      stat = lstatSync(path);
    } catch {
      // The file vanished or became unreadable between readdir and lstat; skip it.
      continue;
    }
    if (!stat.isFile() || stat.isSymbolicLink()) continue;
    ringFiles.push({
      name: entry.name,
      path,
      family: entry.name.slice(0, entry.name.length - ringSuffixLength),
      sizeBytes: stat.size,
      mtimeMs: stat.mtimeMs,
    });
  }

  // The newest capture of each ring family is never offered for deletion, even when the
  // capture process has been stopped for longer than the keep window.
  const newestByFamily = new Map<string, RingFile>();
  for (const file of ringFiles) {
    const newest = newestByFamily.get(file.family);
    const isNewer =
      newest === undefined ||
      file.mtimeMs > newest.mtimeMs ||
      (file.mtimeMs === newest.mtimeMs && file.name > newest.name);
    if (isNewer) newestByFamily.set(file.family, file);
  }

  const result: GcCandidate[] = [];
  for (const file of ringFiles) {
    if (newestByFamily.get(file.family) === file) continue;
    if (file.mtimeMs >= cutoffMs || file.sizeBytes <= 0) continue;
    result.push({
      id: `vpn-diagnostic-pcap:${file.name}`,
      kind: "vpn-diagnostic-pcap-delete",
      risk: "medium",
      description: `Delete stale VPN diagnostic ring pcap older than ${options.vpnDiagnosticLogKeepHours} hours`,
      path: file.path,
      sizeBytes: file.sizeBytes,
      estimatedReclaimBytes: file.sizeBytes,
      action: { op: "unlink", allowlist: "vpn-diagnostic-ring-pcap", keepHours: options.vpnDiagnosticLogKeepHours, activeCheck: "fuser-before-delete" },
    });
  }
  return result.sort((left, right) => right.estimatedReclaimBytes - left.estimatedReclaimBytes);
}
/**
 * Builds the protected-item entries for the VPN diagnostic log directory.
 *
 * Emits, in order: the log root (only when its measured size is positive) and the
 * evidence JSONL file (only when it exists). The root entry's `kind` and `reason` depend
 * on whether pcap cleanup was opted in via `options.vpnDiagnosticLogs`.
 *
 * @param options - GC options; only `vpnDiagnosticLogs` is read.
 * @returns Protected items, or an empty array when the log root does not exist.
 */
function collectProtectedVpnDiagnosticLogs(options: GcOptions): ProtectedGcItem[] {
  if (!existsSync(VPN_DIAGNOSTIC_LOG_ROOT)) return [];

  const protectedItems: ProtectedGcItem[] = [];
  const pcapCleanupEnabled = options.vpnDiagnosticLogs;

  const rootBytes = safePathSize(VPN_DIAGNOSTIC_LOG_ROOT);
  if (rootBytes > 0) {
    const rootKind = pcapCleanupEnabled ? "vpn-diagnostic-log-root" : "vpn-diagnostic-log";
    const rootReason = pcapCleanupEnabled
      ? "VPN diagnostic log root is protected; only stale hy2 ring pcap files are candidate files."
      : "VPN diagnostic logs are not removed by default; rerun with --include-vpn-diagnostic-logs to remove only stale hy2 ring pcap files.";
    protectedItems.push({
      kind: rootKind,
      risk: "blocked",
      ref: VPN_DIAGNOSTIC_LOG_ROOT,
      sizeBytes: rootBytes,
      reason: rootReason,
    });
  }

  const evidenceFile = join(VPN_DIAGNOSTIC_LOG_ROOT, "hy2-server-evidence.jsonl");
  if (!existsSync(evidenceFile)) return protectedItems;

  protectedItems.push({
    kind: "vpn-diagnostic-evidence-log",
    risk: "blocked",
    ref: evidenceFile,
    sizeBytes: safeFileSize(evidenceFile),
    reason: "Evidence JSONL is an active diagnostic stream and is not removed by gc pcap retention.",
  });
  return protectedItems;
}
function collectProtectedStorage(config: UniDeskConfig, options: GcOptions): ProtectedGcItem[] {
const result: ProtectedGcItem[] = [
{
@@ -1272,7 +1348,7 @@ function gcPolicyPlan(options: GcPolicyOptions): unknown {
policy: {
safeScope: [
"systemd journal is capped at 512MiB",
"daily timer runs file-log, Docker json logs, 24h BuildKit cache and allowlisted /tmp gc",
"daily timer runs file-log, Docker json logs, 24h BuildKit cache, allowlisted /tmp gc and 24h VPN diagnostic pcap retention",
"timer does not touch PostgreSQL PGDATA, Docker images, Docker volumes, tool caches, VS Code servers/extensions or Baidu Netdisk staging",
"timer output is redirected under .state/gc and capped by gc --result-limit",
],
@@ -1324,7 +1400,7 @@ function gcPolicyInstall(options: GcPolicyOptions): unknown {
function gcPolicyFiles(): Record<string, { path: string; content: string }> {
const gcStateDir = rootPath(".state", "gc");
const bunPath = bunExecutablePath();
const gcScript = `cd ${shellQuote(repoRoot)} && mkdir -p ${shellQuote(gcStateDir)} && ${shellQuote(bunPath)} scripts/cli.ts gc run --confirm --no-db-summary --no-journal --build-cache-until 24h --limit 5000 --result-limit 25 > ${shellQuote(join(gcStateDir, "last-run.json"))} 2> ${shellQuote(join(gcStateDir, "last-run.stderr"))}`;
const gcScript = `cd ${shellQuote(repoRoot)} && mkdir -p ${shellQuote(gcStateDir)} && ${shellQuote(bunPath)} scripts/cli.ts gc run --confirm --no-db-summary --no-journal --build-cache-until 24h --include-vpn-diagnostic-logs --vpn-diagnostic-log-keep-hours 24 --limit 5000 --result-limit 25 > ${shellQuote(join(gcStateDir, "last-run.json"))} 2> ${shellQuote(join(gcStateDir, "last-run.stderr"))}`;
return {
journald: {
path: "/etc/systemd/journald.conf.d/unidesk-gc.conf",
@@ -1470,6 +1546,13 @@ function executeCandidate(candidate: GcCandidate, options: GcOptions): { reclaim
unlinkSync(candidate.path);
return { reclaimedBytes: before };
}
if (candidate.kind === "vpn-diagnostic-pcap-delete" && candidate.path !== undefined) {
assertVpnDiagnosticPcapCandidatePath(candidate.path);
assertPathNotOpen(candidate.path);
const before = safeFileSize(candidate.path);
unlinkSync(candidate.path);
return { reclaimedBytes: before };
}
if (candidate.kind === "journal-vacuum") {
const result = command(["journalctl", `--vacuum-size=${options.journalTargetBytes}`], 30000);
if (result.exitCode !== 0) throw new Error(result.stderr.trim() || "journalctl vacuum failed");
@@ -1568,6 +1651,27 @@ function assertBaiduStagingCandidatePath(path: string): void {
}
}
/**
 * Re-validates a VPN diagnostic pcap path immediately before deletion.
 *
 * The checks run in this order and the first failure throws:
 * 1. the resolved path lies under the resolved log root,
 * 2. it is a direct child of the root (no nested segments),
 * 3. its base name matches the ring pcap allowlist pattern,
 * 4. `lstat` reports a regular file that is not a symbolic link.
 *
 * @param path - Candidate path to validate.
 * @throws Error when any check fails; `lstatSync` errors (e.g. missing file) propagate.
 */
function assertVpnDiagnosticPcapCandidatePath(path: string): void {
  const target = resolve(path);
  const logRoot = resolve(VPN_DIAGNOSTIC_LOG_ROOT);
  const rootPrefix = `${logRoot}/`;

  if (!target.startsWith(rootPrefix)) {
    throw new Error(`refusing to remove VPN diagnostic pcap outside log root: ${path}`);
  }

  const remainder = target.slice(rootPrefix.length);
  if (remainder.includes("/")) {
    throw new Error(`refusing to remove nested VPN diagnostic path: ${path}`);
  }

  const fileName = basename(target);
  if (!VPN_DIAGNOSTIC_RING_PCAP_PATTERN.test(fileName)) {
    throw new Error(`refusing to remove unexpected VPN diagnostic pcap name: ${path}`);
  }

  // lstat (not stat) so a symlink is inspected itself rather than followed.
  const info = lstatSync(target);
  const isPlainFile = info.isFile() && !info.isSymbolicLink();
  if (!isPlainFile) {
    throw new Error(`refusing to remove non-regular VPN diagnostic pcap: ${path}`);
  }
}
/**
 * Refuses deletion when `fuser` indicates the file may still be in use.
 *
 * Runs `fuser -- <path>` with a 1000 ms timeout and interprets the result:
 * exit code 0 means a process has the file open; a timeout or any exit code other than
 * 1 is treated as a failed check. Only exit code 1 lets the caller proceed.
 *
 * @param path - File to probe.
 * @throws Error when the file is open, the probe timed out, or the probe failed.
 */
function assertPathNotOpen(path: string): void {
  const probe = command(["fuser", "--", path], 1000);

  if (probe.exitCode === 0) {
    throw new Error(`refusing to remove active file still open by a process: ${path}`);
  }
  if (probe.timedOut) {
    throw new Error(`refusing to remove file because active-file check timed out: ${path}`);
  }
  if (probe.exitCode === 1) return;

  // Prefer the tool's own diagnostics; fall back to the path when it printed nothing.
  const diagnostics = (probe.stderr || probe.stdout).trim();
  throw new Error(`refusing to remove file because active-file check failed: ${diagnostics || path}`);
}
function summarizeCandidates(candidates: GcCandidate[], returnedCandidates: GcCandidate[], diskBefore: DiskSnapshot | null, options: GcOptions): GcPlan["summary"] {
const byKind: GcPlan["summary"]["byKind"] = {};
let estimatedReclaimBytes = 0;