Merge pull request #1546 from pikasTech/fix/1544-jd01-transport-visibility
fix: expose JD01 transport degraded snapshots
This commit is contained in:
+94
-4
@@ -142,6 +142,85 @@ function stringValue(value: unknown): string | null {
|
||||
return typeof value === "string" ? value : null;
|
||||
}
|
||||
|
||||
/**
 * Coerces a loosely typed value to a finite number.
 *
 * Only numbers, bigints and non-blank strings are treated as numeric input.
 * Everything else (`null`, `undefined`, booleans, arrays, plain objects) yields
 * `null` instead of going through `Number()`, because `Number(null)`,
 * `Number("")` and `Number([])` all evaluate to `0`, which would make a missing
 * reading indistinguishable from a genuine zero.
 *
 * @param value - Untyped input, typically a field read from a parsed payload.
 * @returns The finite numeric value, or `null` when the input is absent,
 *   non-numeric, `NaN` or infinite.
 */
function numberValue(value: unknown): number | null {
  if (typeof value === "string") {
    // Blank strings would otherwise coerce to 0.
    if (value.trim() === "") return null;
  } else if (typeof value !== "number" && typeof value !== "bigint") {
    return null;
  }
  const parsed = Number(value);
  return Number.isFinite(parsed) ? parsed : null;
}
|
||||
|
||||
/**
 * Rounds a number to two decimal places.
 *
 * @param value - The number to round, or `null`.
 * @returns `null` when given `null`; otherwise the value rounded via
 *   `Math.round(value * 100) / 100`.
 */
function rounded(value: number | null): number | null {
  if (value === null) {
    return null;
  }
  return Math.round(value * 100) / 100;
}
|
||||
|
||||
/**
 * Builds a host-pressure summary for one provider from the node system-status listing.
 *
 * Requests `/api/nodes/system-status?limit=24`, selects the entry whose
 * `providerId` matches, and reports the CPU, memory and disk readings found in
 * that entry's `current` snapshot along with the threshold signals they trip.
 *
 * Two degraded shapes are returned with `ok: false`:
 * - `system-status-not-found` when no entry matches (the raw fetch result is attached);
 * - `system-status-current-missing` when the entry has no `current` fields.
 *
 * NOTE(review): `coreInternalFetch` is awaited in `debugSshPool` but not here.
 * If it returns a Promise, `body` would never be readable and this function
 * would always report `system-status-not-found` — confirm it is synchronous.
 * NOTE(review): the listing is capped by `limit=24`; presumably a provider
 * outside that window is reported as not found — verify against the endpoint.
 *
 * @param providerId - Provider whose system status should be summarised.
 * @returns A plain object describing the snapshot or the degraded reason.
 */
function providerHostPressureSummary(providerId: string): unknown {
  const fetched = coreInternalFetch("/api/nodes/system-status?limit=24");
  const payload = recordValue(recordValue(fetched).body);
  const candidates = arrayValue(payload.systemStatuses).map((entry) => recordValue(entry));
  const match = candidates.find((candidate) => candidate.providerId === providerId);

  if (match == null) {
    return {
      ok: false,
      degradedReason: "system-status-not-found",
      systemStatusFetch: fetched,
    };
  }

  const snapshot = recordValue(match.current);
  if (Object.keys(snapshot).length === 0) {
    return {
      ok: false,
      providerId,
      degradedReason: "system-status-current-missing",
      updatedAt: match.updatedAt ?? null,
      nodeStatus: match.nodeStatus ?? null,
      classification: "host-pressure-snapshot-unavailable",
    };
  }

  const cpuInfo = recordValue(snapshot.cpu);
  const memoryInfo = recordValue(snapshot.memory);
  const diskInfo = recordValue(snapshot.disk);

  const cores = numberValue(cpuInfo.cores);
  const load1 = numberValue(cpuInfo.load1);
  // Per-core load is only meaningful with a positive core count and a load reading.
  const load1PerCore = load1 !== null && cores !== null && cores > 0 ? load1 / cores : null;

  const memoryPercent = numberValue(memoryInfo.percent);
  const memoryAvailableBytes = numberValue(memoryInfo.availableBytes);
  const swapTotalBytes = numberValue(memoryInfo.swapTotalBytes);
  const diskPercent = numberValue(diskInfo.percent);

  // Each pair is (threshold tripped?, label); order here is the order reported.
  const signalChecks: Array<[boolean, string]> = [
    [load1PerCore !== null && load1PerCore >= 4, "load1-per-core>=4"],
    [memoryPercent !== null && memoryPercent >= 90, "memory-percent>=90"],
    [memoryAvailableBytes !== null && memoryAvailableBytes < 512 * 1024 * 1024, "memory-available<512MiB"],
    [swapTotalBytes === 0, "swap-disabled"],
    [diskPercent !== null && diskPercent >= 90, "disk-percent>=90"],
  ];
  const signals: string[] = signalChecks.filter(([tripped]) => tripped).map(([, label]) => label);

  let classification = "host-pressure-signals-not-observed";
  if (signals.length > 0) {
    classification = "host-pressure-signals-present";
  }

  return {
    ok: snapshot.ok ?? null,
    providerId,
    updatedAt: match.updatedAt ?? null,
    collectedAt: snapshot.collectedAt ?? null,
    cpu: {
      cores,
      load1,
      load5: numberValue(cpuInfo.load5),
      load15: numberValue(cpuInfo.load15),
      load1PerCore: rounded(load1PerCore),
    },
    memory: {
      percent: memoryPercent,
      availableBytes: memoryAvailableBytes,
      swapTotalBytes,
    },
    disk: {
      percent: diskPercent,
      availableBytes: numberValue(diskInfo.availableBytes),
      mount: diskInfo.mount ?? null,
    },
    signals,
    classification,
  };
}
|
||||
|
||||
export async function debugSshPool(_config: UniDeskConfig, providerId: string): Promise<unknown> {
|
||||
const nodesResponse = await coreInternalFetch("/api/nodes");
|
||||
const body = recordValue(recordValue(nodesResponse).body);
|
||||
@@ -159,6 +238,7 @@ export async function debugSshPool(_config: UniDeskConfig, providerId: string):
|
||||
};
|
||||
}
|
||||
const labels = recordValue(node.labels);
|
||||
const nodeStatus = stringValue(node.status);
|
||||
const pool = {
|
||||
transport: stringValue(labels.providerGatewaySshDataTransport),
|
||||
host: stringValue(labels.providerGatewaySshDataHost),
|
||||
@@ -173,19 +253,27 @@ export async function debugSshPool(_config: UniDeskConfig, providerId: string):
|
||||
const ready = Number(pool.ready ?? 0);
|
||||
const claimed = Number(pool.claimed ?? 0);
|
||||
const desired = Number(pool.desired ?? 0);
|
||||
const ok = pool.transport === "tcp-pool" && Number.isFinite(ready) && ready > 0;
|
||||
const providerOnline = nodeStatus === "online";
|
||||
const poolLabelReady = pool.transport === "tcp-pool" && Number.isFinite(ready) && ready > 0;
|
||||
const executionPathReady = providerOnline && poolLabelReady;
|
||||
return {
|
||||
ok,
|
||||
ok: executionPathReady,
|
||||
providerId,
|
||||
providerOnline,
|
||||
poolLabelReady,
|
||||
executionPathReady,
|
||||
node: {
|
||||
providerId: node.providerId,
|
||||
name: node.name,
|
||||
status: node.status,
|
||||
status: nodeStatus,
|
||||
lastHeartbeat: node.lastHeartbeat ?? null,
|
||||
updatedAt: node.updatedAt ?? null,
|
||||
},
|
||||
pool,
|
||||
classification: ok
|
||||
hostPressure: providerHostPressureSummary(providerId),
|
||||
classification: !providerOnline
|
||||
? (nodeStatus === "offline" ? "provider-offline" : "provider-not-online")
|
||||
: poolLabelReady
|
||||
? "ssh-tcp-pool-ready"
|
||||
: pool.transport !== "tcp-pool"
|
||||
? "provider-gateway-upgrade-required"
|
||||
@@ -195,7 +283,9 @@ export async function debugSshPool(_config: UniDeskConfig, providerId: string):
|
||||
next: {
|
||||
smoke: `trans ${providerId} argv true`,
|
||||
fullHealth: "bun scripts/cli.ts debug health",
|
||||
gcDegradedSnapshot: `bun scripts/cli.ts gc remote ${providerId} snapshot --no-save`,
|
||||
},
|
||||
note: "Pool labels are provider-reported channel state; use the smoke command or gc degraded output to verify execution path.",
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
@@ -0,0 +1,38 @@
|
||||
import { classifySshTcpPoolFailure, type SshCaptureResult } from "./ssh";
|
||||
|
||||
/**
 * Builds the degraded-result payload reported when a `gc remote <action>` command fails.
 *
 * The combined stderr/stdout text is classified in priority order:
 * 1. an SSH TCP pool failure kind from `classifySshTcpPoolFailure`;
 * 2. provider offline (output contains "provider is not online");
 * 3. SSH runtime timeout (exit code 124 or output contains "ssh-runtime-timeout");
 * 4. otherwise a generic remote command failure.
 *
 * The payload always carries `ok: false`, `degraded: true`, `runAllowed: false`
 * and `mutation: false`, plus follow-up CLI commands under `next`.
 *
 * @param providerId - Provider the remote GC command targeted.
 * @param action - GC sub-action, echoed back as `gc remote ${action}`.
 * @param result - Captured SSH result; `stderr`, `stdout` and `exitCode` are read.
 * @returns The degraded-failure payload.
 */
export function remoteGcDegradedFailure(providerId: string, action: string, result: SshCaptureResult): Record<string, unknown> {
  const combinedOutput = `${result.stderr}\n${result.stdout}`;
  const failureKind = classifySshTcpPoolFailure(combinedOutput);
  const sshRuntimeTimeout = result.exitCode === 124 || combinedOutput.includes("ssh-runtime-timeout");
  // The provider-specific message `provider is not online: <id>` contains this
  // generic phrase, so a single substring check covers both forms.
  const providerOffline = combinedOutput.includes("provider is not online");

  let fallbackReason = "remote-command-failed";
  if (providerOffline) {
    fallbackReason = "provider-offline";
  } else if (sshRuntimeTimeout) {
    fallbackReason = "ssh-runtime-timeout";
  }
  const degradedReason = failureKind ?? fallbackReason;

  let summary: string;
  if (failureKind !== null) {
    summary = `remote GC could not acquire a provider data channel: ${failureKind}`;
  } else if (providerOffline) {
    summary = `provider ${providerId} is offline from the controlled CLI transport view`;
  } else if (sshRuntimeTimeout) {
    summary = "remote GC did not complete before the SSH runtime timeout";
  } else {
    summary = "remote GC command failed before producing a valid plan";
  }

  const transport = {
    sshTcpPoolFailureKind: failureKind,
    providerOffline,
    sshRuntimeTimeout,
    exitCode: result.exitCode,
  };

  const next = {
    sshPool: `bun scripts/cli.ts debug ssh-pool ${providerId}`,
    fullHealth: "bun scripts/cli.ts debug health",
    smoke: `trans ${providerId} argv true`,
    retryPlan: `bun scripts/cli.ts gc remote ${providerId} plan --no-snapshot-save`,
  };

  return {
    ok: false,
    degraded: true,
    providerId,
    action: `gc remote ${action}`,
    degradedReason,
    transport,
    safeCandidateCount: null,
    runAllowed: false,
    mutation: false,
    summary,
    next,
  };
}
|
||||
File diff suppressed because it is too large
Load Diff
+15
-2942
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user