fix: expose JD01 transport degraded snapshots
This commit is contained in:
@@ -142,6 +142,85 @@ function stringValue(value: unknown): string | null {
|
||||
return typeof value === "string" ? value : null;
|
||||
}
|
||||
|
||||
/**
 * Converts a loosely-typed value into a finite number, or `null` when the
 * value does not actually carry a number.
 *
 * Accepted inputs:
 * - finite `number` values (returned unchanged)
 * - `bigint` values (converted with `Number`)
 * - non-blank strings that `Number` parses to a finite value
 *
 * Everything else yields `null`. In particular `null`, `""`, whitespace-only
 * strings, booleans, arrays and objects are rejected: a bare `Number(value)`
 * would coerce several of those to `0`, making an absent metric look like a
 * real zero reading.
 *
 * @param value Untrusted value, e.g. a field taken from a parsed JSON body.
 * @returns The finite numeric value, or `null` if none could be derived.
 */
function numberValue(value: unknown): number | null {
  if (typeof value === "number") {
    return Number.isFinite(value) ? value : null;
  }
  if (typeof value === "bigint") {
    const converted = Number(value);
    return Number.isFinite(converted) ? converted : null;
  }
  if (typeof value === "string") {
    const trimmed = value.trim();
    // Number("") is 0, so blank strings must be rejected explicitly.
    if (trimmed === "") return null;
    const parsed = Number(trimmed);
    return Number.isFinite(parsed) ? parsed : null;
  }
  return null;
}
|
||||
|
||||
/**
 * Rounds a number to two decimal places.
 *
 * @param value Number to round, or `null`.
 * @returns `null` when given `null`; otherwise the value rounded to hundredths.
 */
function rounded(value: number | null): number | null {
  if (value === null) {
    return null;
  }
  const hundredths = Math.round(value * 100);
  return hundredths / 100;
}
|
||||
|
||||
/**
 * Summarises host-pressure readings (CPU load, memory, disk) for one provider
 * using the core `/api/nodes/system-status` endpoint.
 *
 * Three result shapes are possible:
 * - `degradedReason: "system-status-not-found"` when no returned status entry
 *   has a matching `providerId`; the raw fetch result is attached for debugging.
 * - `degradedReason: "system-status-current-missing"` when the matching entry
 *   has no fields under `current`.
 * - Otherwise the parsed cpu/memory/disk readings, the list of threshold
 *   `signals` that fired, and a `classification` derived from that list.
 *
 * @param providerId Provider whose system status entry should be summarised.
 * @returns A plain object in one of the shapes above.
 */
function providerHostPressureSummary(providerId: string): unknown {
  // NOTE(review): debugSshPool awaits coreInternalFetch, but this call does not.
  // If coreInternalFetch returns a Promise, `response` is never resolved here and
  // the lookup below will always miss. Confirm whether it is synchronous.
  const response = coreInternalFetch("/api/nodes/system-status?limit=24");
  const body = recordValue(recordValue(response).body);
  const systemStatuses = arrayValue(body.systemStatuses);
  const item = systemStatuses
    .map((entry) => recordValue(entry))
    .find((entry) => entry.providerId === providerId) ?? null;
  if (item === null) {
    return {
      ok: false,
      // Included so every result shape identifies the provider it describes.
      providerId,
      degradedReason: "system-status-not-found",
      systemStatusFetch: response,
    };
  }

  const current = recordValue(item.current);
  // Presumably recordValue yields an empty object for a missing or non-object
  // `current`; verify against its definition.
  if (Object.keys(current).length === 0) {
    return {
      ok: false,
      providerId,
      degradedReason: "system-status-current-missing",
      updatedAt: item.updatedAt ?? null,
      nodeStatus: item.nodeStatus ?? null,
      classification: "host-pressure-snapshot-unavailable",
    };
  }

  const cpu = recordValue(current.cpu);
  const memory = recordValue(current.memory);
  const disk = recordValue(current.disk);

  const cores = numberValue(cpu.cores);
  const load1 = numberValue(cpu.load1);
  const load5 = numberValue(cpu.load5);
  const load15 = numberValue(cpu.load15);
  // Only meaningful with a positive core count; avoids dividing by zero.
  const load1PerCore = cores !== null && cores > 0 && load1 !== null ? load1 / cores : null;

  const memoryPercent = numberValue(memory.percent);
  const memoryAvailableBytes = numberValue(memory.availableBytes);
  const swapTotalBytes = numberValue(memory.swapTotalBytes);
  const diskPercent = numberValue(disk.percent);

  // Each threshold that fires contributes one label; the labels are part of the output.
  const signals: string[] = [];
  if (load1PerCore !== null && load1PerCore >= 4) signals.push("load1-per-core>=4");
  if (memoryPercent !== null && memoryPercent >= 90) signals.push("memory-percent>=90");
  if (memoryAvailableBytes !== null && memoryAvailableBytes < 512 * 1024 * 1024) signals.push("memory-available<512MiB");
  if (swapTotalBytes === 0) signals.push("swap-disabled");
  if (diskPercent !== null && diskPercent >= 90) signals.push("disk-percent>=90");

  return {
    ok: current.ok ?? null,
    providerId,
    updatedAt: item.updatedAt ?? null,
    collectedAt: current.collectedAt ?? null,
    cpu: {
      cores,
      load1,
      load5,
      load15,
      load1PerCore: rounded(load1PerCore),
    },
    memory: {
      percent: memoryPercent,
      availableBytes: memoryAvailableBytes,
      swapTotalBytes,
    },
    disk: {
      percent: diskPercent,
      availableBytes: numberValue(disk.availableBytes),
      mount: disk.mount ?? null,
    },
    signals,
    classification: signals.length > 0 ? "host-pressure-signals-present" : "host-pressure-signals-not-observed",
  };
}
|
||||
|
||||
export async function debugSshPool(_config: UniDeskConfig, providerId: string): Promise<unknown> {
|
||||
const nodesResponse = await coreInternalFetch("/api/nodes");
|
||||
const body = recordValue(recordValue(nodesResponse).body);
|
||||
@@ -185,6 +264,7 @@ export async function debugSshPool(_config: UniDeskConfig, providerId: string):
|
||||
updatedAt: node.updatedAt ?? null,
|
||||
},
|
||||
pool,
|
||||
hostPressure: providerHostPressureSummary(providerId),
|
||||
classification: ok
|
||||
? "ssh-tcp-pool-ready"
|
||||
: pool.transport !== "tcp-pool"
|
||||
@@ -195,7 +275,9 @@ export async function debugSshPool(_config: UniDeskConfig, providerId: string):
|
||||
next: {
|
||||
smoke: `trans ${providerId} argv true`,
|
||||
fullHealth: "bun scripts/cli.ts debug health",
|
||||
gcDegradedSnapshot: `bun scripts/cli.ts gc remote ${providerId} snapshot --no-save`,
|
||||
},
|
||||
note: "Pool labels are provider-reported channel state; use the smoke command or gc degraded output to verify execution path.",
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
@@ -2,7 +2,7 @@ import { Buffer } from "node:buffer";
|
||||
import { existsSync, readFileSync } from "node:fs";
|
||||
|
||||
import { type UniDeskConfig, rootPath } from "./config";
|
||||
import { runSshCommandCapture } from "./ssh";
|
||||
import { classifySshTcpPoolFailure, runSshCommandCapture, type SshCaptureResult } from "./ssh";
|
||||
|
||||
type RemoteGcAction = "plan" | "snapshot" | "trend" | "run" | "status" | "policy-plan" | "policy-install";
|
||||
|
||||
@@ -254,14 +254,22 @@ async function runRemoteGc(config: UniDeskConfig, providerId: string, action: Re
|
||||
const scriptConfig = Buffer.from(JSON.stringify({ providerId, action, options, remoteTarget }), "utf8").toString("base64");
|
||||
const result = await runSshCommandCapture(config, providerId, ["py"], remoteGcPython(scriptConfig));
|
||||
if (result.exitCode !== 0) {
|
||||
const degraded = remoteGcDegradedFailure(providerId, action, result);
|
||||
return {
|
||||
ok: false,
|
||||
error: "gc-remote-command-failed",
|
||||
providerId,
|
||||
action: `gc remote ${action}`,
|
||||
exitCode: result.exitCode,
|
||||
degradedReason: degraded.degradedReason,
|
||||
transport: degraded.transport,
|
||||
safeCandidateCount: null,
|
||||
runAllowed: false,
|
||||
mutation: false,
|
||||
degraded,
|
||||
stdoutTail: result.stdout.slice(-4000),
|
||||
stderrTail: result.stderr.slice(-4000),
|
||||
next: degraded.next,
|
||||
};
|
||||
}
|
||||
try {
|
||||
@@ -279,6 +287,43 @@ async function runRemoteGc(config: UniDeskConfig, providerId: string, action: Re
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Builds the degraded-failure report for a remote GC command that exited
 * with a non-zero code.
 *
 * The combined stderr/stdout text is inspected for an SSH TCP pool failure
 * (via `classifySshTcpPoolFailure`), a "provider is not online" message, and
 * a timeout (exit code 124 or the "ssh-runtime-timeout" marker). The reason
 * is chosen in that order of precedence, falling back to
 * "remote-command-failed".
 *
 * @param providerId Provider the GC command was sent to.
 * @param action Remote GC action that was attempted.
 * @param result Captured exit code and output of the SSH command.
 * @returns A report with the reason, transport details, a human-readable
 *   summary, and suggested follow-up commands; it never permits a run or mutation.
 */
function remoteGcDegradedFailure(providerId: string, action: RemoteGcAction, result: SshCaptureResult): Record<string, unknown> {
  const combinedOutput = `${result.stderr}\n${result.stdout}`;
  const poolFailure = classifySshTcpPoolFailure(combinedOutput);
  const timedOut = result.exitCode === 124 || combinedOutput.includes("ssh-runtime-timeout");
  // The provider-specific message contains this generic phrase, so one check covers both.
  const offline = combinedOutput.includes("provider is not online");

  let fallbackReason = "remote-command-failed";
  if (offline) {
    fallbackReason = "provider-offline";
  } else if (timedOut) {
    fallbackReason = "ssh-runtime-timeout";
  }
  const degradedReason = poolFailure ?? fallbackReason;

  let summary: string;
  if (poolFailure !== null) {
    summary = `remote GC could not acquire a provider data channel: ${poolFailure}`;
  } else if (offline) {
    summary = `provider ${providerId} is offline from the controlled CLI transport view`;
  } else if (timedOut) {
    summary = "remote GC did not complete before the SSH runtime timeout";
  } else {
    summary = "remote GC command failed before producing a valid plan";
  }

  const transport = {
    sshTcpPoolFailureKind: poolFailure,
    providerOffline: offline,
    sshRuntimeTimeout: timedOut,
    exitCode: result.exitCode,
  };

  // Follow-up commands an operator can run to diagnose or retry.
  const next = {
    sshPool: `bun scripts/cli.ts debug ssh-pool ${providerId}`,
    fullHealth: "bun scripts/cli.ts debug health",
    smoke: `trans ${providerId} argv true`,
    retryPlan: `bun scripts/cli.ts gc remote ${providerId} plan --no-snapshot-save`,
  };

  return {
    ok: false,
    degraded: true,
    providerId,
    action: `gc remote ${action}`,
    degradedReason,
    transport,
    safeCandidateCount: null,
    runAllowed: false,
    mutation: false,
    summary,
    next,
  };
}
|
||||
|
||||
function remoteGcPython(configBase64: string): string {
|
||||
return String.raw`
|
||||
import base64
|
||||
|
||||
Reference in New Issue
Block a user