diff --git a/scripts/src/debug.ts b/scripts/src/debug.ts
index f5900b79..4e12cb2d 100644
--- a/scripts/src/debug.ts
+++ b/scripts/src/debug.ts
@@ -142,6 +142,107 @@ function stringValue(value: unknown): string | null {
   return typeof value === "string" ? value : null;
 }
 
+/**
+ * Coerce a loosely typed metric into a finite number, or null when it is absent.
+ *
+ * Only numbers and non-blank numeric strings are accepted: Number() maps null, "",
+ * [] and false to 0, which would report a missing metric as a real zero reading
+ * (for example a missing swap total as "swap-disabled").
+ */
+function numberValue(value: unknown): number | null {
+  if (typeof value === "number") return Number.isFinite(value) ? value : null;
+  if (typeof value !== "string" || value.trim() === "") return null;
+  const parsed = Number(value);
+  return Number.isFinite(parsed) ? parsed : null;
+}
+
+/** Round to two decimal places; null passes through unchanged. */
+function rounded(value: number | null): number | null {
+  return value === null ? null : Math.round(value * 100) / 100;
+}
+
+/**
+ * Summarize host pressure (CPU load, memory, swap, disk) for one provider from the
+ * core system-status endpoint.
+ *
+ * Resolves to a degraded object (ok: false plus degradedReason) when the provider
+ * has no system-status entry or the entry has no current snapshot.
+ */
+async function providerHostPressureSummary(providerId: string): Promise<unknown> {
+  // Awaited like the coreInternalFetch call in debugSshPool: if it returns a promise,
+  // reading .body from the un-awaited value would never find the provider entry.
+  const response = await coreInternalFetch("/api/nodes/system-status?limit=24");
+  const body = recordValue(recordValue(response).body);
+  const systemStatuses = arrayValue(body.systemStatuses);
+  const item = systemStatuses
+    .map((entry) => recordValue(entry))
+    .find((entry) => entry.providerId === providerId) ?? null;
+  if (item === null) {
+    return {
+      ok: false,
+      providerId,
+      degradedReason: "system-status-not-found",
+      systemStatusFetch: response,
+      classification: "host-pressure-snapshot-unavailable",
+    };
+  }
+  const current = recordValue(item.current);
+  if (Object.keys(current).length === 0) {
+    return {
+      ok: false,
+      providerId,
+      degradedReason: "system-status-current-missing",
+      updatedAt: item.updatedAt ?? null,
+      nodeStatus: item.nodeStatus ?? null,
+      classification: "host-pressure-snapshot-unavailable",
+    };
+  }
+  const cpu = recordValue(current.cpu);
+  const memory = recordValue(current.memory);
+  const disk = recordValue(current.disk);
+  const cores = numberValue(cpu.cores);
+  const load1 = numberValue(cpu.load1);
+  const load5 = numberValue(cpu.load5);
+  const load15 = numberValue(cpu.load15);
+  const load1PerCore = cores !== null && cores > 0 && load1 !== null ? load1 / cores : null;
+  const memoryPercent = numberValue(memory.percent);
+  const memoryAvailableBytes = numberValue(memory.availableBytes);
+  const swapTotalBytes = numberValue(memory.swapTotalBytes);
+  const diskPercent = numberValue(disk.percent);
+  const signals: string[] = [];
+  if (load1PerCore !== null && load1PerCore >= 4) signals.push("load1-per-core>=4");
+  if (memoryPercent !== null && memoryPercent >= 90) signals.push("memory-percent>=90");
+  if (memoryAvailableBytes !== null && memoryAvailableBytes < 512 * 1024 * 1024) signals.push("memory-available<512MiB");
+  // swapTotalBytes is null when the field is missing, so only a reported 0 counts.
+  if (swapTotalBytes === 0) signals.push("swap-disabled");
+  if (diskPercent !== null && diskPercent >= 90) signals.push("disk-percent>=90");
+  return {
+    ok: current.ok ?? null,
+    providerId,
+    updatedAt: item.updatedAt ?? null,
+    collectedAt: current.collectedAt ?? null,
+    cpu: {
+      cores,
+      load1,
+      load5,
+      load15,
+      load1PerCore: rounded(load1PerCore),
+    },
+    memory: {
+      percent: memoryPercent,
+      availableBytes: memoryAvailableBytes,
+      swapTotalBytes,
+    },
+    disk: {
+      percent: diskPercent,
+      availableBytes: numberValue(disk.availableBytes),
+      mount: disk.mount ?? null,
+    },
+    signals,
+    classification: signals.length > 0 ? "host-pressure-signals-present" : "host-pressure-signals-not-observed",
+  };
+}
+
 export async function debugSshPool(_config: UniDeskConfig, providerId: string): Promise<unknown> {
   const nodesResponse = await coreInternalFetch("/api/nodes");
   const body = recordValue(recordValue(nodesResponse).body);
@@ -185,6 +286,7 @@ export async function debugSshPool(_config: UniDeskConfig, providerId: string):
       updatedAt: node.updatedAt ?? null,
     },
     pool,
+    hostPressure: await providerHostPressureSummary(providerId),
     classification: ok
       ? "ssh-tcp-pool-ready"
       : pool.transport !== "tcp-pool"
@@ -195,7 +297,9 @@ export async function debugSshPool(_config: UniDeskConfig, providerId: string):
     next: {
       smoke: `trans ${providerId} argv true`,
       fullHealth: "bun scripts/cli.ts debug health",
+      gcDegradedSnapshot: `bun scripts/cli.ts gc remote ${providerId} snapshot --no-save`,
     },
+    note: "Pool labels are provider-reported channel state; use the smoke command or gc degraded output to verify execution path.",
   };
 }
 
diff --git a/scripts/src/gc-remote.ts b/scripts/src/gc-remote.ts
index 60ab8f90..672689a4 100644
--- a/scripts/src/gc-remote.ts
+++ b/scripts/src/gc-remote.ts
@@ -2,7 +2,7 @@ import { Buffer } from "node:buffer";
 import { existsSync, readFileSync } from "node:fs";
 
 import { type UniDeskConfig, rootPath } from "./config";
-import { runSshCommandCapture } from "./ssh";
+import { classifySshTcpPoolFailure, runSshCommandCapture, type SshCaptureResult } from "./ssh";
 
 type RemoteGcAction = "plan" | "snapshot" | "trend" | "run" | "status" | "policy-plan" | "policy-install";
 
@@ -254,14 +254,22 @@ async function runRemoteGc(config: UniDeskConfig, providerId: string, action: Re
   const scriptConfig = Buffer.from(JSON.stringify({ providerId, action, options, remoteTarget }), "utf8").toString("base64");
   const result = await runSshCommandCapture(config, providerId, ["py"], remoteGcPython(scriptConfig));
   if (result.exitCode !== 0) {
+    const degraded = remoteGcDegradedFailure(providerId, action, result);
     return {
       ok: false,
       error: "gc-remote-command-failed",
       providerId,
       action: `gc remote ${action}`,
       exitCode: result.exitCode,
+      degradedReason: degraded.degradedReason,
+      transport: degraded.transport,
+      safeCandidateCount: null,
+      runAllowed: false,
+      mutation: false,
+      degraded,
       stdoutTail: result.stdout.slice(-4000),
       stderrTail: result.stderr.slice(-4000),
+      next: degraded.next,
     };
   }
   try {
@@ -279,6 +287,49 @@ async function runRemoteGc(config: UniDeskConfig, providerId: string, action: Re
   }
 }
 
+/**
+ * Build the degraded-failure report for a remote GC command that exited non-zero.
+ *
+ * The reason is picked in priority order: SSH TCP pool failure kind, provider
+ * offline, SSH runtime timeout, then a generic remote command failure.
+ */
+function remoteGcDegradedFailure(providerId: string, action: RemoteGcAction, result: SshCaptureResult): Record<string, unknown> {
+  const text = `${result.stderr}\n${result.stdout}`;
+  const failureKind = classifySshTcpPoolFailure(text);
+  const timeout = result.exitCode === 124 || text.includes("ssh-runtime-timeout");
+  const providerOffline = text.includes("provider is not online");
+  const degradedReason = failureKind ?? (providerOffline ? "provider-offline" : timeout ? "ssh-runtime-timeout" : "remote-command-failed");
+  return {
+    ok: false,
+    degraded: true,
+    providerId,
+    action: `gc remote ${action}`,
+    degradedReason,
+    transport: {
+      sshTcpPoolFailureKind: failureKind,
+      providerOffline,
+      sshRuntimeTimeout: timeout,
+      exitCode: result.exitCode,
+    },
+    safeCandidateCount: null,
+    runAllowed: false,
+    mutation: false,
+    summary: failureKind !== null
+      ? `remote GC could not acquire a provider data channel: ${failureKind}`
+      : providerOffline
+        ? `provider ${providerId} is offline from the controlled CLI transport view`
+        : timeout
+          ? "remote GC did not complete before the SSH runtime timeout"
+          : "remote GC command failed before producing a valid plan",
+    next: {
+      sshPool: `bun scripts/cli.ts debug ssh-pool ${providerId}`,
+      fullHealth: "bun scripts/cli.ts debug health",
+      smoke: `trans ${providerId} argv true`,
+      retryPlan: `bun scripts/cli.ts gc remote ${providerId} plan --no-snapshot-save`,
+    },
+  };
+}
+
 function remoteGcPython(configBase64: string): string {
   return String.raw`
 import base64