Merge pull request #1546 from pikasTech/fix/1544-jd01-transport-visibility

fix: expose JD01 transport degraded snapshots
This commit is contained in:
Lyon
2026-07-05 04:57:10 +08:00
committed by GitHub
4 changed files with 3089 additions and 2947 deletions
+94 -4
View File
@@ -142,6 +142,85 @@ function stringValue(value: unknown): string | null {
return typeof value === "string" ? value : null;
}
/**
 * Reads a loosely-typed value as a finite number.
 *
 * Only real numbers, bigints and non-blank numeric strings are accepted.
 * `null`, `""`, whitespace, booleans, arrays and objects return `null` rather
 * than being coerced by `Number()` (which would turn several of them into `0`
 * and make an absent metric look like a genuine zero reading).
 *
 * @param value - Untrusted value, typically a field of a parsed JSON payload.
 * @returns The finite numeric value, or `null` when the input is absent or not numeric.
 */
function numberValue(value: unknown): number | null {
  if (typeof value === "number") {
    return Number.isFinite(value) ? value : null;
  }
  if (typeof value === "bigint") {
    return Number(value);
  }
  // Number("") and Number("   ") are 0, so blank strings must be rejected explicitly.
  if (typeof value === "string" && value.trim() !== "") {
    const parsed = Number(value);
    return Number.isFinite(parsed) ? parsed : null;
  }
  return null;
}
/**
 * Rounds a number to two decimal places, passing `null` through untouched.
 *
 * @param value - Number to round, or `null` when no reading is available.
 * @returns `value` rounded to the nearest hundredth, or `null`.
 */
function rounded(value: number | null): number | null {
  if (value === null) {
    return null;
  }
  const hundredths = Math.round(value * 100);
  return hundredths / 100;
}
/**
 * Builds a host-pressure summary for one provider from the core system-status feed.
 *
 * Fetches `/api/nodes/system-status?limit=24`, picks the entry whose `providerId`
 * matches, and derives CPU, memory and disk figures plus the list of threshold
 * signals that fired.
 *
 * @param providerId - Provider whose system-status entry is summarized.
 * @returns A plain object. When no entry matches, or the entry carries no
 *   `current` snapshot, `ok` is `false` and `degradedReason` names the cause.
 *   Otherwise the object holds the metrics, the fired `signals` and a
 *   `classification` string.
 */
function providerHostPressureSummary(providerId: string): unknown {
  // NOTE(review): debugSshPool awaits coreInternalFetch, this call does not. If
  // coreInternalFetch returns a Promise, `body` below is always empty and every
  // call ends in "system-status-not-found" — confirm it is synchronous.
  const response = coreInternalFetch("/api/nodes/system-status?limit=24");
  const body = recordValue(recordValue(response).body);
  const systemStatuses = arrayValue(body.systemStatuses);
  const item = systemStatuses
    .map((entry) => recordValue(entry))
    .find((entry) => entry.providerId === providerId) ?? null;

  if (item === null) {
    return {
      ok: false,
      // Included so this result can be attributed like every other return shape.
      providerId,
      degradedReason: "system-status-not-found",
      systemStatusFetch: response,
    };
  }

  const current = recordValue(item.current);
  if (Object.keys(current).length === 0) {
    return {
      ok: false,
      providerId,
      degradedReason: "system-status-current-missing",
      updatedAt: item.updatedAt ?? null,
      nodeStatus: item.nodeStatus ?? null,
      classification: "host-pressure-snapshot-unavailable",
    };
  }

  const cpu = recordValue(current.cpu);
  const memory = recordValue(current.memory);
  const disk = recordValue(current.disk);

  const cores = numberValue(cpu.cores);
  const load1 = numberValue(cpu.load1);
  const load5 = numberValue(cpu.load5);
  const load15 = numberValue(cpu.load15);
  // Only meaningful with a positive core count; otherwise the ratio is left unknown.
  const load1PerCore = cores !== null && cores > 0 && load1 !== null ? load1 / cores : null;

  const memoryPercent = numberValue(memory.percent);
  const memoryAvailableBytes = numberValue(memory.availableBytes);
  const swapTotalBytes = numberValue(memory.swapTotalBytes);
  const diskPercent = numberValue(disk.percent);

  // Each signal is the literal threshold that fired; unknown (null) metrics never fire.
  const signals: string[] = [];
  if (load1PerCore !== null && load1PerCore >= 4) signals.push("load1-per-core>=4");
  if (memoryPercent !== null && memoryPercent >= 90) signals.push("memory-percent>=90");
  if (memoryAvailableBytes !== null && memoryAvailableBytes < 512 * 1024 * 1024) signals.push("memory-available<512MiB");
  if (swapTotalBytes === 0) signals.push("swap-disabled");
  if (diskPercent !== null && diskPercent >= 90) signals.push("disk-percent>=90");

  return {
    ok: current.ok ?? null,
    providerId,
    updatedAt: item.updatedAt ?? null,
    collectedAt: current.collectedAt ?? null,
    cpu: {
      cores,
      load1,
      load5,
      load15,
      load1PerCore: rounded(load1PerCore),
    },
    memory: {
      percent: memoryPercent,
      availableBytes: memoryAvailableBytes,
      swapTotalBytes,
    },
    disk: {
      percent: diskPercent,
      availableBytes: numberValue(disk.availableBytes),
      mount: disk.mount ?? null,
    },
    signals,
    classification: signals.length > 0 ? "host-pressure-signals-present" : "host-pressure-signals-not-observed",
  };
}
export async function debugSshPool(_config: UniDeskConfig, providerId: string): Promise<unknown> {
const nodesResponse = await coreInternalFetch("/api/nodes");
const body = recordValue(recordValue(nodesResponse).body);
@@ -159,6 +238,7 @@ export async function debugSshPool(_config: UniDeskConfig, providerId: string):
};
}
const labels = recordValue(node.labels);
const nodeStatus = stringValue(node.status);
const pool = {
transport: stringValue(labels.providerGatewaySshDataTransport),
host: stringValue(labels.providerGatewaySshDataHost),
@@ -173,19 +253,27 @@ export async function debugSshPool(_config: UniDeskConfig, providerId: string):
const ready = Number(pool.ready ?? 0);
const claimed = Number(pool.claimed ?? 0);
const desired = Number(pool.desired ?? 0);
const ok = pool.transport === "tcp-pool" && Number.isFinite(ready) && ready > 0;
const providerOnline = nodeStatus === "online";
const poolLabelReady = pool.transport === "tcp-pool" && Number.isFinite(ready) && ready > 0;
const executionPathReady = providerOnline && poolLabelReady;
return {
ok,
ok: executionPathReady,
providerId,
providerOnline,
poolLabelReady,
executionPathReady,
node: {
providerId: node.providerId,
name: node.name,
status: node.status,
status: nodeStatus,
lastHeartbeat: node.lastHeartbeat ?? null,
updatedAt: node.updatedAt ?? null,
},
pool,
classification: ok
hostPressure: providerHostPressureSummary(providerId),
classification: !providerOnline
? (nodeStatus === "offline" ? "provider-offline" : "provider-not-online")
: poolLabelReady
? "ssh-tcp-pool-ready"
: pool.transport !== "tcp-pool"
? "provider-gateway-upgrade-required"
@@ -195,7 +283,9 @@ export async function debugSshPool(_config: UniDeskConfig, providerId: string):
next: {
smoke: `trans ${providerId} argv true`,
fullHealth: "bun scripts/cli.ts debug health",
gcDegradedSnapshot: `bun scripts/cli.ts gc remote ${providerId} snapshot --no-save`,
},
note: "Pool labels are provider-reported channel state; use the smoke command or gc degraded output to verify execution path.",
};
}
+38
View File
@@ -0,0 +1,38 @@
import { classifySshTcpPoolFailure, type SshCaptureResult } from "./ssh";
/**
 * Turns a failed remote GC SSH capture into a structured "degraded" report.
 *
 * The combined stderr/stdout text is classified in priority order: an SSH TCP
 * pool failure kind (from `classifySshTcpPoolFailure`), then a "provider is not
 * online" message, then a timeout (exit code 124 or the `ssh-runtime-timeout`
 * marker), and finally a generic command failure.
 *
 * @param providerId - Provider the GC command was aimed at.
 * @param action - GC sub-action, reported as `gc remote <action>`.
 * @param result - Captured exit code, stdout and stderr of the failed command.
 * @returns A report with `ok: false`, `degraded: true`, the chosen
 *   `degradedReason`, transport flags, a human-readable `summary` and
 *   follow-up commands under `next`.
 */
export function remoteGcDegradedFailure(providerId: string, action: string, result: SshCaptureResult): Record<string, unknown> {
  const text = `${result.stderr}\n${result.stdout}`;
  const failureKind = classifySshTcpPoolFailure(text);
  const timeout = result.exitCode === 124 || text.includes("ssh-runtime-timeout");
  // The provider-specific message always contains this generic phrase, so one check covers both.
  const providerOffline = text.includes("provider is not online");

  // Reason used only when no pool failure kind was recognized.
  let fallbackReason = "remote-command-failed";
  if (providerOffline) {
    fallbackReason = "provider-offline";
  } else if (timeout) {
    fallbackReason = "ssh-runtime-timeout";
  }
  const degradedReason = failureKind ?? fallbackReason;

  let summary: string;
  if (failureKind !== null) {
    summary = `remote GC could not acquire a provider data channel: ${failureKind}`;
  } else if (providerOffline) {
    summary = `provider ${providerId} is offline from the controlled CLI transport view`;
  } else if (timeout) {
    summary = "remote GC did not complete before the SSH runtime timeout";
  } else {
    summary = "remote GC command failed before producing a valid plan";
  }

  const transport = {
    sshTcpPoolFailureKind: failureKind,
    providerOffline,
    sshRuntimeTimeout: timeout,
    exitCode: result.exitCode,
  };
  const next = {
    sshPool: `bun scripts/cli.ts debug ssh-pool ${providerId}`,
    fullHealth: "bun scripts/cli.ts debug health",
    smoke: `trans ${providerId} argv true`,
    retryPlan: `bun scripts/cli.ts gc remote ${providerId} plan --no-snapshot-save`,
  };

  return {
    ok: false,
    degraded: true,
    providerId,
    action: `gc remote ${action}`,
    degradedReason,
    transport,
    safeCandidateCount: null,
    runAllowed: false,
    mutation: false,
    summary,
    next,
  };
}
File diff suppressed because it is too large Load Diff
+15 -2942
View File
File diff suppressed because it is too large Load Diff