gc: retain vpn diagnostic pcaps
This commit is contained in:
@@ -86,10 +86,13 @@ bun scripts/cli.ts gc plan --target-use-percent 69 \
|
||||
--include-stale-tmp \
|
||||
--include-vscode-stale-servers \
|
||||
--include-vscode-stale-extensions \
|
||||
--include-baidu-staging
|
||||
--include-baidu-staging \
|
||||
--include-vpn-diagnostic-logs
|
||||
```
|
||||
|
||||
`--target-use-percent` 按 `df` 显示口径估算 shortfall。工具缓存、`/tmp` 非 allowlist 直接子项、VS Code 历史 server/extension 版本、Baidu staging 旧 PGDATA tarball 均默认不启用;必须显式 include 后才进入候选,且执行时仍受路径断言保护。stale `/tmp` 扫描按 `--limit` 有界枚举候选,避免为了估算全量临时目录而长时间无输出。默认 GC 不触碰 PGDATA、Docker volumes/images、Codex sessions/auth state 或 Baidu staging 根目录。
|
||||
`--target-use-percent` 按 `df` 显示口径估算 shortfall。工具缓存、`/tmp` 非 allowlist 直接子项、VS Code 历史 server/extension 版本、Baidu staging 旧 PGDATA tarball、VPN 诊断 ring pcap 均默认不启用;必须显式 include 后才进入候选,且执行时仍受路径断言保护。stale `/tmp` 扫描按 `--limit` 有界枚举候选,避免为了估算全量临时目录而长时间无输出。VPN 诊断日志只选择 `/root/vpn-server/logs/hy2-udp-ring-*.pcap` 和 `hy2-monitor-ring-*.pcap` 中超过 `--vpn-diagnostic-log-keep-hours` 的普通文件,执行前检查 active fd;不删除 evidence JSONL。默认 GC 不触碰 PGDATA、Docker volumes/images、Codex sessions/auth state、Baidu staging 根目录或 VPN 日志根目录。
|
||||
|
||||
`gc policy install` 的每日 timer 会自动执行 24 小时 VPN 诊断 pcap retention,用于限制长期 tcpdump ring 文件增长;手动 `gc plan/run` 仍必须显式 `--include-vpn-diagnostic-logs` 才会列出或删除这些 pcap。
|
||||
|
||||
---
|
||||
|
||||
|
||||
@@ -6,7 +6,7 @@ UniDesk 的磁盘治理入口是 `bun scripts/cli.ts gc ...`。该入口用于
|
||||
|
||||
- `gc plan`:只读生成主 server 清理候选、估算收益、风险等级、保护对象和数据库诊断摘要。
|
||||
- `gc run --confirm`:只执行当前 plan 可见候选页,默认不执行分页隐藏候选;用 `--limit`、`--result-limit`、`--full|--raw` 控制披露和执行范围。
|
||||
- `gc policy plan|install`:渲染或安装低风险长期策略,例如 journald cap 和每日 allowlisted 文件/tmp 清理 timer。
|
||||
- `gc policy plan|install`:渲染或安装低风险长期策略,例如 journald cap、每日 allowlisted 文件/tmp 清理 timer 和 24 小时 VPN 诊断 pcap retention。
|
||||
- `gc db-trace plan|run --confirm --before-date YYYY-MM-DD --vacuum-full`:显式 trace 遥测留存入口;涉及数据库重写时按维护窗口处理。
|
||||
- `gc remote <providerId> plan|run --confirm|status --job-id <id>`:通过 UniDesk SSH 透传在 provider host 上执行受控 GC。远端长任务必须使用异步 job 和 `status` 短查询,不应让单次 SSH 等待完整 registry GC 或其他长清理。
|
||||
|
||||
@@ -14,6 +14,8 @@ UniDesk 的磁盘治理入口是 `bun scripts/cli.ts gc ...`。该入口用于
|
||||
|
||||
默认 `/tmp` GC 只包含 allowlisted 诊断目录和已知低风险路径。非 allowlist 的 stale `/tmp` 一级子项必须显式 `--include-stale-tmp` 才能进入候选;扫描按 `--limit` 有界枚举候选,执行时仍只允许删除 `/tmp` 直接子项,并避开 X11/ICE/font socket、systemd private、tmux、ssh、vscode 等系统/session 前缀。该入口不能递归扩大成通用 `/tmp` 清空器,也不能为了估算全量临时目录而长时间阻塞。
|
||||
|
||||
主 server VPN 诊断日志默认不清理。`/root/vpn-server/logs` 中由长期 `tcpdump -G` 产生的 `hy2-udp-ring-*.pcap` 和 `hy2-monitor-ring-*.pcap` 可通过显式 `--include-vpn-diagnostic-logs` 进入候选,默认只选择超过 `--vpn-diagnostic-log-keep-hours 24` 的普通 pcap 文件。执行前必须重新校验路径、文件名、非 symlink/regular file,并用 active-file 检查确认没有进程仍打开该文件。`hy2-server-evidence.jsonl`、stdout/stderr log、最新 pcap 和整个日志根目录始终作为 protected 输出,不得被这个入口删除或截断。
|
||||
|
||||
## Protected Data
|
||||
|
||||
默认 GC 不得删除或 prune 以下对象:
|
||||
@@ -23,6 +25,7 @@ UniDesk 的磁盘治理入口是 `bun scripts/cli.ts gc ...`。该入口用于
|
||||
| PostgreSQL PGDATA | 数据库权威状态,必须走备份、留存或迁移流程 |
|
||||
| Docker image/container/volume | 运行面和发布真相可能依赖旧镜像或 volume |
|
||||
| Baidu Netdisk staging/backups | 备份链路状态和可重建缓存边界需单独判定 |
|
||||
| VPN diagnostic evidence logs | `/root/vpn-server/logs/hy2-server-evidence.jsonl` 等 active evidence 流用于网络排障,不随 pcap retention 删除 |
|
||||
| D601 registry storage | artifact registry retention 需使用专门入口 |
|
||||
| `/var/lib/rancher/k3s` 与 `/var/lib/rancher/k3s/storage` | k3s 控制面、containerd 状态和 local-path PVC 数据 |
|
||||
| `/var/lib/kubelet`、`/var/lib/containerd` | kubelet/runtime 状态和可能被 workload 复用的 image cache |
|
||||
|
||||
+107
-3
@@ -18,7 +18,8 @@ type GcItemKind =
|
||||
| "tool-cache-delete"
|
||||
| "vscode-server-delete"
|
||||
| "vscode-extension-delete"
|
||||
| "baidu-staging-file-delete";
|
||||
| "baidu-staging-file-delete"
|
||||
| "vpn-diagnostic-pcap-delete";
|
||||
|
||||
interface GcOptions {
|
||||
fileLogs: boolean;
|
||||
@@ -43,6 +44,8 @@ interface GcOptions {
|
||||
vscodeKeepExtensionVersions: number;
|
||||
baiduStaging: boolean;
|
||||
baiduStagingKeepDays: number;
|
||||
vpnDiagnosticLogs: boolean;
|
||||
vpnDiagnosticLogKeepHours: number;
|
||||
dbSummary: boolean;
|
||||
limit: number;
|
||||
resultLimit: number;
|
||||
@@ -180,6 +183,8 @@ const DEFAULT_OPTIONS: GcOptions = {
|
||||
vscodeKeepExtensionVersions: 1,
|
||||
baiduStaging: false,
|
||||
baiduStagingKeepDays: 10,
|
||||
vpnDiagnosticLogs: false,
|
||||
vpnDiagnosticLogKeepHours: 24,
|
||||
dbSummary: true,
|
||||
limit: 50,
|
||||
resultLimit: 50,
|
||||
@@ -295,6 +300,8 @@ const TOOL_CACHE_ALLOWLIST = [
|
||||
const VSCODE_SERVER_ROOT = "/root/.vscode-server/cli/servers";
|
||||
const VSCODE_EXTENSION_ROOT = "/root/.vscode-server/extensions";
|
||||
const BAIDU_STAGING_RELATIVE_ROOT = [".state", "baidu-netdisk", "staging"];
|
||||
const VPN_DIAGNOSTIC_LOG_ROOT = "/root/vpn-server/logs";
|
||||
const VPN_DIAGNOSTIC_RING_PCAP_PATTERN = /^hy2-(?:udp|monitor)-ring-\d{14}\.pcap$/u;
|
||||
const DEFAULT_PATH_SIZE_TIMEOUT_MS = 5_000;
|
||||
const STALE_TMP_PATH_SIZE_TIMEOUT_MS = 1_500;
|
||||
const STALE_TMP_MAX_CANDIDATES = 1_000;
|
||||
@@ -446,6 +453,10 @@ export function gcPlan(config: UniDeskConfig, options: GcOptions = DEFAULT_OPTIO
|
||||
if (options.baiduStaging) {
|
||||
candidates.push(...collectBaiduStagingCandidates(options, observedAt));
|
||||
}
|
||||
if (options.vpnDiagnosticLogs) {
|
||||
candidates.push(...collectVpnDiagnosticPcapCandidates(options, observedAt));
|
||||
}
|
||||
protectedItems.push(...collectProtectedVpnDiagnosticLogs(options));
|
||||
|
||||
protectedItems.push(...collectProtectedStorage(config, options));
|
||||
const databaseSummary = options.dbSummary ? collectDatabaseSummary() : { skipped: true, reason: "disabled-by-option" };
|
||||
@@ -482,6 +493,7 @@ export function gcPlan(config: UniDeskConfig, options: GcOptions = DEFAULT_OPTIO
|
||||
options.full ? "Full candidate output requested." : `Default output is capped to ${options.limit} candidates; use --full or --limit N for broader disclosure.`,
|
||||
"Tool caches, stale /tmp direct children, stale VS Code server versions and stale VS Code extension versions are opt-in and require explicit include flags.",
|
||||
"Baidu Netdisk staging cleanup is opt-in and only selects old PGDATA backup tarballs under server-data/unidesk-pg-data.",
|
||||
"VPN diagnostic pcap cleanup is opt-in and only selects stale hy2 ring pcap files; active pcap files and evidence JSONL are protected.",
|
||||
"Database event retention is diagnostic-only in this command; cleanups for oa_events require a backup and a separate schema/retention change.",
|
||||
"Docker image cleanup stays under server cleanup plan; gc does not run docker system prune or docker image prune.",
|
||||
],
|
||||
@@ -597,6 +609,12 @@ function parseGcOptions(args: string[]): GcOptions {
|
||||
options.baiduStaging = false;
|
||||
} else if (arg === "--baidu-staging-keep-days") {
|
||||
options.baiduStagingKeepDays = parsePositiveIntegerOption(arg, args[++index], 3650);
|
||||
} else if (arg === "--include-vpn-diagnostic-logs") {
|
||||
options.vpnDiagnosticLogs = true;
|
||||
} else if (arg === "--no-vpn-diagnostic-logs") {
|
||||
options.vpnDiagnosticLogs = false;
|
||||
} else if (arg === "--vpn-diagnostic-log-keep-hours") {
|
||||
options.vpnDiagnosticLogKeepHours = parsePositiveIntegerOption(arg, args[++index], 365 * 24);
|
||||
} else if (arg === "--no-file-logs" || arg === "--no-logs") {
|
||||
options.fileLogs = false;
|
||||
} else if (arg === "--no-docker-logs") {
|
||||
@@ -739,6 +757,8 @@ function publicOptions(options: GcOptions): Record<string, unknown> {
|
||||
vscodeKeepExtensionVersions: options.vscodeKeepExtensionVersions,
|
||||
baiduStaging: options.baiduStaging,
|
||||
baiduStagingKeepDays: options.baiduStagingKeepDays,
|
||||
vpnDiagnosticLogs: options.vpnDiagnosticLogs,
|
||||
vpnDiagnosticLogKeepHours: options.vpnDiagnosticLogKeepHours,
|
||||
dbSummary: options.dbSummary,
|
||||
limit: options.limit,
|
||||
resultLimit: options.resultLimit,
|
||||
@@ -1081,6 +1101,62 @@ function collectBaiduStagingCandidates(options: GcOptions, observedAt: string):
|
||||
return result.sort((left, right) => right.estimatedReclaimBytes - left.estimatedReclaimBytes);
|
||||
}
|
||||
|
||||
/**
 * Collects stale VPN diagnostic ring pcap files as deletion candidates.
 *
 * Only direct children of the VPN diagnostic log root whose names match the
 * ring pcap pattern are considered. A file becomes a candidate when it is a
 * non-empty regular file (not a symlink) whose mtime is older than
 * `options.vpnDiagnosticLogKeepHours` relative to `observedAt`.
 *
 * The newest file of each ring (highest fixed-width timestamp in the name) is
 * never offered, even when it is older than the cutoff: it is presumably the
 * file the capture process is still writing, and an idle or stopped capture
 * must not cause the latest pcap to be planned for removal.
 *
 * @param options    GC options; `vpnDiagnosticLogKeepHours` defines the retention window.
 * @param observedAt Timestamp string of the plan observation, parsed with `Date`.
 * @returns Candidates sorted by estimated reclaimable bytes, largest first.
 */
function collectVpnDiagnosticPcapCandidates(options: GcOptions, observedAt: string): GcCandidate[] {
  if (!existsSync(VPN_DIAGNOSTIC_LOG_ROOT)) return [];
  const cutoffMs = new Date(observedAt).getTime() - options.vpnDiagnosticLogKeepHours * 60 * 60 * 1000;
  // An unparsable timestamp yields NaN; `mtimeMs >= NaN` is always false, which
  // would turn every ring pcap into a candidate. Refuse to plan in that case.
  if (!Number.isFinite(cutoffMs)) return [];

  const ringPcapNames = readdirSync(VPN_DIAGNOSTIC_LOG_ROOT, { withFileTypes: true })
    .filter((entry) => entry.isFile() && VPN_DIAGNOSTIC_RING_PCAP_PATTERN.test(entry.name))
    .map((entry) => entry.name);

  // Names within one ring share a prefix and end in a 14-digit timestamp, so a
  // plain string comparison orders them chronologically.
  const timestampSuffix = /\d{14}\.pcap$/u;
  const newestNameByRing = new Map<string, string>();
  for (const name of ringPcapNames) {
    const ring = name.replace(timestampSuffix, "");
    const newest = newestNameByRing.get(ring);
    if (newest === undefined || name > newest) newestNameByRing.set(ring, name);
  }
  const latestNames = new Set(newestNameByRing.values());

  const result: GcCandidate[] = [];
  for (const name of ringPcapNames) {
    if (latestNames.has(name)) continue;
    const path = join(VPN_DIAGNOSTIC_LOG_ROOT, name);
    let stat;
    try {
      stat = lstatSync(path);
    } catch {
      // The file disappeared (or became unreadable) between listing and stat; skip it.
      continue;
    }
    if (!stat.isFile() || stat.isSymbolicLink() || stat.mtimeMs >= cutoffMs || stat.size <= 0) continue;
    result.push({
      id: `vpn-diagnostic-pcap:${name}`,
      kind: "vpn-diagnostic-pcap-delete",
      risk: "medium",
      description: `Delete stale VPN diagnostic ring pcap older than ${options.vpnDiagnosticLogKeepHours} hours`,
      path,
      sizeBytes: stat.size,
      estimatedReclaimBytes: stat.size,
      action: { op: "unlink", allowlist: "vpn-diagnostic-ring-pcap", keepHours: options.vpnDiagnosticLogKeepHours, activeCheck: "fuser-before-delete" },
    });
  }
  return result.sort((left, right) => right.estimatedReclaimBytes - left.estimatedReclaimBytes);
}
|
||||
|
||||
/**
 * Lists the VPN diagnostic log objects that gc reports as protected.
 *
 * When the log root exists, up to two entries are produced: the log root
 * itself (only when its measured size is positive) and the evidence JSONL file
 * (only when it exists). The kind and reason of the root entry depend on
 * whether VPN diagnostic pcap cleanup was requested via `options.vpnDiagnosticLogs`.
 *
 * @param options GC options; only `vpnDiagnosticLogs` is read.
 * @returns Protected items, or an empty array when the log root is absent.
 */
function collectProtectedVpnDiagnosticLogs(options: GcOptions): ProtectedGcItem[] {
  const protectedEntries: ProtectedGcItem[] = [];
  if (existsSync(VPN_DIAGNOSTIC_LOG_ROOT)) {
    const rootBytes = safePathSize(VPN_DIAGNOSTIC_LOG_ROOT);
    if (rootBytes > 0) {
      if (options.vpnDiagnosticLogs) {
        // Cleanup was requested: the root stays protected, only stale ring pcaps are candidates.
        protectedEntries.push({
          kind: "vpn-diagnostic-log-root",
          risk: "blocked",
          ref: VPN_DIAGNOSTIC_LOG_ROOT,
          sizeBytes: rootBytes,
          reason: "VPN diagnostic log root is protected; only stale hy2 ring pcap files are candidate files.",
        });
      } else {
        // Cleanup was not requested: point the operator at the opt-in flag.
        protectedEntries.push({
          kind: "vpn-diagnostic-log",
          risk: "blocked",
          ref: VPN_DIAGNOSTIC_LOG_ROOT,
          sizeBytes: rootBytes,
          reason: "VPN diagnostic logs are not removed by default; rerun with --include-vpn-diagnostic-logs to remove only stale hy2 ring pcap files.",
        });
      }
    }

    const evidenceFile = join(VPN_DIAGNOSTIC_LOG_ROOT, "hy2-server-evidence.jsonl");
    if (existsSync(evidenceFile)) {
      protectedEntries.push({
        kind: "vpn-diagnostic-evidence-log",
        risk: "blocked",
        ref: evidenceFile,
        sizeBytes: safeFileSize(evidenceFile),
        reason: "Evidence JSONL is an active diagnostic stream and is not removed by gc pcap retention.",
      });
    }
  }
  return protectedEntries;
}
|
||||
|
||||
function collectProtectedStorage(config: UniDeskConfig, options: GcOptions): ProtectedGcItem[] {
|
||||
const result: ProtectedGcItem[] = [
|
||||
{
|
||||
@@ -1272,7 +1348,7 @@ function gcPolicyPlan(options: GcPolicyOptions): unknown {
|
||||
policy: {
|
||||
safeScope: [
|
||||
"systemd journal is capped at 512MiB",
|
||||
"daily timer runs file-log, Docker json logs, 24h BuildKit cache and allowlisted /tmp gc",
|
||||
"daily timer runs file-log, Docker json logs, 24h BuildKit cache, allowlisted /tmp gc and 24h VPN diagnostic pcap retention",
|
||||
"timer does not touch PostgreSQL PGDATA, Docker images, Docker volumes, tool caches, VS Code servers/extensions or Baidu Netdisk staging",
|
||||
"timer output is redirected under .state/gc and capped by gc --result-limit",
|
||||
],
|
||||
@@ -1324,7 +1400,7 @@ function gcPolicyInstall(options: GcPolicyOptions): unknown {
|
||||
function gcPolicyFiles(): Record<string, { path: string; content: string }> {
|
||||
const gcStateDir = rootPath(".state", "gc");
|
||||
const bunPath = bunExecutablePath();
|
||||
const gcScript = `cd ${shellQuote(repoRoot)} && mkdir -p ${shellQuote(gcStateDir)} && ${shellQuote(bunPath)} scripts/cli.ts gc run --confirm --no-db-summary --no-journal --build-cache-until 24h --limit 5000 --result-limit 25 > ${shellQuote(join(gcStateDir, "last-run.json"))} 2> ${shellQuote(join(gcStateDir, "last-run.stderr"))}`;
|
||||
const gcScript = `cd ${shellQuote(repoRoot)} && mkdir -p ${shellQuote(gcStateDir)} && ${shellQuote(bunPath)} scripts/cli.ts gc run --confirm --no-db-summary --no-journal --build-cache-until 24h --include-vpn-diagnostic-logs --vpn-diagnostic-log-keep-hours 24 --limit 5000 --result-limit 25 > ${shellQuote(join(gcStateDir, "last-run.json"))} 2> ${shellQuote(join(gcStateDir, "last-run.stderr"))}`;
|
||||
return {
|
||||
journald: {
|
||||
path: "/etc/systemd/journald.conf.d/unidesk-gc.conf",
|
||||
@@ -1470,6 +1546,13 @@ function executeCandidate(candidate: GcCandidate, options: GcOptions): { reclaim
|
||||
unlinkSync(candidate.path);
|
||||
return { reclaimedBytes: before };
|
||||
}
|
||||
if (candidate.kind === "vpn-diagnostic-pcap-delete" && candidate.path !== undefined) {
|
||||
assertVpnDiagnosticPcapCandidatePath(candidate.path);
|
||||
assertPathNotOpen(candidate.path);
|
||||
const before = safeFileSize(candidate.path);
|
||||
unlinkSync(candidate.path);
|
||||
return { reclaimedBytes: before };
|
||||
}
|
||||
if (candidate.kind === "journal-vacuum") {
|
||||
const result = command(["journalctl", `--vacuum-size=${options.journalTargetBytes}`], 30000);
|
||||
if (result.exitCode !== 0) throw new Error(result.stderr.trim() || "journalctl vacuum failed");
|
||||
@@ -1568,6 +1651,27 @@ function assertBaiduStagingCandidatePath(path: string): void {
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Guards deletion of a VPN diagnostic ring pcap by re-validating its path.
 *
 * The path must resolve to a direct child of the VPN diagnostic log root, its
 * basename must match the ring pcap naming pattern, and it must currently be a
 * regular file rather than a symlink or other node type.
 *
 * @param path Candidate path taken from the gc plan.
 * @throws Error when any of the checks fails; `lstatSync` errors (for example a
 *         missing file) propagate unchanged.
 */
function assertVpnDiagnosticPcapCandidatePath(path: string): void {
  const logRoot = resolve(VPN_DIAGNOSTIC_LOG_ROOT);
  const target = resolve(path);
  const rootPrefix = `${logRoot}/`;

  if (!target.startsWith(rootPrefix)) {
    throw new Error(`refusing to remove VPN diagnostic pcap outside log root: ${path}`);
  }

  // Anything left after the root prefix must be a bare file name, not a nested path.
  const remainder = target.slice(rootPrefix.length);
  if (remainder.includes("/")) {
    throw new Error(`refusing to remove nested VPN diagnostic path: ${path}`);
  }

  const fileName = basename(target);
  if (!VPN_DIAGNOSTIC_RING_PCAP_PATTERN.test(fileName)) {
    throw new Error(`refusing to remove unexpected VPN diagnostic pcap name: ${path}`);
  }

  // lstat (not stat) so a symlink is inspected itself rather than its target.
  const info = lstatSync(target);
  const isPlainFile = info.isFile() && !info.isSymbolicLink();
  if (!isPlainFile) {
    throw new Error(`refusing to remove non-regular VPN diagnostic pcap: ${path}`);
  }
}
|
||||
|
||||
/**
 * Refuses deletion of a file that may still be held open by a process.
 *
 * Runs `fuser -- <path>` with a 1000 ms limit and interprets the result:
 * exit code 0 is treated as "file is open", exit code 1 as "file is not open",
 * and a timeout or any other exit code as an inconclusive check. Only the
 * "not open" outcome lets the caller proceed.
 *
 * @param path File about to be removed.
 * @throws Error when the file is open, the check timed out, or the check failed.
 */
function assertPathNotOpen(path: string): void {
  const probe = command(["fuser", "--", path], 1000);
  const { exitCode, timedOut } = probe;

  if (exitCode === 0) {
    throw new Error(`refusing to remove active file still open by a process: ${path}`);
  }
  if (timedOut) {
    throw new Error(`refusing to remove file because active-file check timed out: ${path}`);
  }
  // Exit code 1 is the only outcome accepted as "no process has the file open".
  if (exitCode === 1) return;

  const detail = (probe.stderr || probe.stdout).trim();
  throw new Error(`refusing to remove file because active-file check failed: ${detail || path}`);
}
|
||||
|
||||
function summarizeCandidates(candidates: GcCandidate[], returnedCandidates: GcCandidate[], diskBefore: DiskSnapshot | null, options: GcOptions): GcPlan["summary"] {
|
||||
const byKind: GcPlan["summary"]["byKind"] = {};
|
||||
let estimatedReclaimBytes = 0;
|
||||
|
||||
Reference in New Issue
Block a user