fix: expose JD01 transport degraded snapshots

This commit is contained in:
Codex
2026-07-04 20:38:10 +00:00
parent f35fde8f65
commit de984e06f7
2 changed files with 128 additions and 1 deletions
+82
View File
@@ -142,6 +142,85 @@ function stringValue(value: unknown): string | null {
return typeof value === "string" ? value : null;
}
/**
 * Coerce a loosely-typed value to a finite number.
 *
 * Only finite numbers and non-blank numeric strings are accepted. Every other
 * input yields `null` — including `null`, `""`, whitespace, booleans and
 * arrays, which a bare `Number(value)` would silently turn into 0 or 1. This
 * keeps a missing metric from being reported as a genuine zero reading.
 *
 * @param value - Untrusted value, typically a field of a parsed JSON payload.
 * @returns The finite numeric value, or `null` when none can be derived.
 */
function numberValue(value: unknown): number | null {
  if (typeof value === "number") {
    return Number.isFinite(value) ? value : null;
  }
  // Blank strings are rejected explicitly because Number("") and Number("  ") are 0.
  if (typeof value === "string" && value.trim() !== "") {
    const parsed = Number(value);
    return Number.isFinite(parsed) ? parsed : null;
  }
  return null;
}
/**
 * Round a number to two decimal places.
 *
 * @param value - Number to round, or `null`.
 * @returns The value rounded to hundredths; `null` is passed through unchanged.
 */
function rounded(value: number | null): number | null {
  if (value === null) {
    return null;
  }
  const hundredths = Math.round(value * 100);
  return hundredths / 100;
}
/**
 * Summarise host resource pressure for one provider.
 *
 * Reads the system-status list from the core API, picks the entry whose
 * `providerId` matches, and derives a set of pressure signals from that
 * entry's `current.cpu`, `current.memory` and `current.disk` sections.
 *
 * @param providerId - Provider whose system-status entry should be summarised.
 * @returns A degraded object (`ok: false` plus `degradedReason`) when no entry
 *   matches or the entry has an empty `current` record; otherwise the
 *   extracted metrics, the list of triggered signals and a classification.
 */
function providerHostPressureSummary(providerId: string): unknown {
  // NOTE(review): debugSshPool awaits coreInternalFetch. If that helper returns
  // a Promise, this un-awaited call would never see a body and would always
  // report "system-status-not-found" — confirm it is safe to call synchronously.
  const fetched = coreInternalFetch("/api/nodes/system-status?limit=24");
  const payload = recordValue(recordValue(fetched).body);
  const status = arrayValue(payload.systemStatuses)
    .map((entry) => recordValue(entry))
    .find((entry) => entry.providerId === providerId);

  // No entry for this provider: hand back the raw fetch result for diagnosis.
  if (status === undefined) {
    return {
      ok: false,
      degradedReason: "system-status-not-found",
      systemStatusFetch: fetched,
    };
  }

  const updatedAt = status.updatedAt ?? null;
  const snapshot = recordValue(status.current);

  // The entry exists but carries no current sample.
  if (Object.keys(snapshot).length === 0) {
    return {
      ok: false,
      providerId,
      degradedReason: "system-status-current-missing",
      updatedAt,
      nodeStatus: status.nodeStatus ?? null,
      classification: "host-pressure-snapshot-unavailable",
    };
  }

  const cpu = recordValue(snapshot.cpu);
  const memory = recordValue(snapshot.memory);
  const disk = recordValue(snapshot.disk);

  const cores = numberValue(cpu.cores);
  const load1 = numberValue(cpu.load1);
  // Per-core load is only meaningful with a positive core count.
  let load1PerCore: number | null = null;
  if (cores !== null && cores > 0 && load1 !== null) {
    load1PerCore = load1 / cores;
  }

  const memoryPercent = numberValue(memory.percent);
  const memoryAvailableBytes = numberValue(memory.availableBytes);
  const swapTotalBytes = numberValue(memory.swapTotalBytes);
  const diskPercent = numberValue(disk.percent);

  // Each row pairs a threshold check with the label emitted when it trips;
  // row order determines the order of the resulting signals.
  const lowMemoryFloorBytes = 512 * 1024 * 1024;
  const checks: Array<[boolean, string]> = [
    [load1PerCore !== null && load1PerCore >= 4, "load1-per-core>=4"],
    [memoryPercent !== null && memoryPercent >= 90, "memory-percent>=90"],
    [memoryAvailableBytes !== null && memoryAvailableBytes < lowMemoryFloorBytes, "memory-available<512MiB"],
    [swapTotalBytes === 0, "swap-disabled"],
    [diskPercent !== null && diskPercent >= 90, "disk-percent>=90"],
  ];
  const signals: string[] = checks.filter(([tripped]) => tripped).map(([, label]) => label);

  return {
    ok: snapshot.ok ?? null,
    providerId,
    updatedAt,
    collectedAt: snapshot.collectedAt ?? null,
    cpu: {
      cores,
      load1,
      load5: numberValue(cpu.load5),
      load15: numberValue(cpu.load15),
      load1PerCore: rounded(load1PerCore),
    },
    memory: {
      percent: memoryPercent,
      availableBytes: memoryAvailableBytes,
      swapTotalBytes,
    },
    disk: {
      percent: diskPercent,
      availableBytes: numberValue(disk.availableBytes),
      mount: disk.mount ?? null,
    },
    signals,
    classification: signals.length === 0 ? "host-pressure-signals-not-observed" : "host-pressure-signals-present",
  };
}
export async function debugSshPool(_config: UniDeskConfig, providerId: string): Promise<unknown> {
const nodesResponse = await coreInternalFetch("/api/nodes");
const body = recordValue(recordValue(nodesResponse).body);
@@ -185,6 +264,7 @@ export async function debugSshPool(_config: UniDeskConfig, providerId: string):
updatedAt: node.updatedAt ?? null,
},
pool,
hostPressure: providerHostPressureSummary(providerId),
classification: ok
? "ssh-tcp-pool-ready"
: pool.transport !== "tcp-pool"
@@ -195,7 +275,9 @@ export async function debugSshPool(_config: UniDeskConfig, providerId: string):
next: {
smoke: `trans ${providerId} argv true`,
fullHealth: "bun scripts/cli.ts debug health",
gcDegradedSnapshot: `bun scripts/cli.ts gc remote ${providerId} snapshot --no-save`,
},
note: "Pool labels are provider-reported channel state; use the smoke command or gc degraded output to verify execution path.",
};
}
+46 -1
View File
@@ -2,7 +2,7 @@ import { Buffer } from "node:buffer";
import { existsSync, readFileSync } from "node:fs";
import { type UniDeskConfig, rootPath } from "./config";
import { runSshCommandCapture } from "./ssh";
import { classifySshTcpPoolFailure, runSshCommandCapture, type SshCaptureResult } from "./ssh";
// Action names for remote GC; results render them as `gc remote <action>`.
type RemoteGcAction = "plan" | "snapshot" | "trend" | "run" | "status" | "policy-plan" | "policy-install";
@@ -254,14 +254,22 @@ async function runRemoteGc(config: UniDeskConfig, providerId: string, action: Re
const scriptConfig = Buffer.from(JSON.stringify({ providerId, action, options, remoteTarget }), "utf8").toString("base64");
const result = await runSshCommandCapture(config, providerId, ["py"], remoteGcPython(scriptConfig));
if (result.exitCode !== 0) {
const degraded = remoteGcDegradedFailure(providerId, action, result);
return {
ok: false,
error: "gc-remote-command-failed",
providerId,
action: `gc remote ${action}`,
exitCode: result.exitCode,
degradedReason: degraded.degradedReason,
transport: degraded.transport,
safeCandidateCount: null,
runAllowed: false,
mutation: false,
degraded,
stdoutTail: result.stdout.slice(-4000),
stderrTail: result.stderr.slice(-4000),
next: degraded.next,
};
}
try {
@@ -279,6 +287,43 @@ async function runRemoteGc(config: UniDeskConfig, providerId: string, action: Re
}
}
/**
 * Describe a failed remote GC invocation as a degraded, non-mutating result.
 *
 * The captured stderr and stdout are scanned for transport problems: an SSH
 * TCP pool failure (via `classifySshTcpPoolFailure`), a "provider is not
 * online" message, or a timeout (exit code 124 or an "ssh-runtime-timeout"
 * marker). The pool failure kind wins when present, then offline, then timeout.
 *
 * @param providerId - Provider the GC command was sent to.
 * @param action - Remote GC action that was attempted.
 * @param result - Captured exit code and output of the failed command.
 * @returns A record with the degraded reason, transport details, a
 *   human-readable summary and suggested follow-up commands.
 */
function remoteGcDegradedFailure(providerId: string, action: RemoteGcAction, result: SshCaptureResult): Record<string, unknown> {
  const combinedOutput = `${result.stderr}\n${result.stdout}`;
  const poolFailure = classifySshTcpPoolFailure(combinedOutput);
  const timedOut = result.exitCode === 124 || combinedOutput.includes("ssh-runtime-timeout");
  const offline =
    combinedOutput.includes(`provider is not online: ${providerId}`) ||
    combinedOutput.includes("provider is not online");

  // Reason used only when no pool failure kind was classified.
  let fallbackReason: string;
  if (offline) {
    fallbackReason = "provider-offline";
  } else if (timedOut) {
    fallbackReason = "ssh-runtime-timeout";
  } else {
    fallbackReason = "remote-command-failed";
  }
  const degradedReason = poolFailure ?? fallbackReason;

  let summary: string;
  if (poolFailure !== null) {
    summary = `remote GC could not acquire a provider data channel: ${poolFailure}`;
  } else if (offline) {
    summary = `provider ${providerId} is offline from the controlled CLI transport view`;
  } else if (timedOut) {
    summary = "remote GC did not complete before the SSH runtime timeout";
  } else {
    summary = "remote GC command failed before producing a valid plan";
  }

  const transport = {
    sshTcpPoolFailureKind: poolFailure,
    providerOffline: offline,
    sshRuntimeTimeout: timedOut,
    exitCode: result.exitCode,
  };
  const next = {
    sshPool: `bun scripts/cli.ts debug ssh-pool ${providerId}`,
    fullHealth: "bun scripts/cli.ts debug health",
    smoke: `trans ${providerId} argv true`,
    retryPlan: `bun scripts/cli.ts gc remote ${providerId} plan --no-snapshot-save`,
  };

  return {
    ok: false,
    degraded: true,
    providerId,
    action: `gc remote ${action}`,
    degradedReason,
    transport,
    safeCandidateCount: null,
    runAllowed: false,
    mutation: false,
    summary,
    next,
  };
}
function remoteGcPython(configBase64: string): string {
return String.raw`
import base64