fix ssh tcp pool transient diagnostics

This commit is contained in:
Codex
2026-06-13 17:19:08 +00:00
parent 1ec60de8bb
commit 2cf767b635
7 changed files with 285 additions and 5 deletions
+8
View File
@@ -153,6 +153,14 @@ bun scripts/cli.ts debug health
在输出或 frontend 原始 JSON 里看这些 labels：`providerGatewaySshDataTransport=tcp-pool`、`providerGatewaySshDataPoolReady`、`providerGatewaySshDataPoolClaimed`、`providerGatewaySshDataPoolDesired`、`providerGatewaySshDataPoolLastError`。
单个 provider 的低噪声池状态:
```bash
bun scripts/cli.ts debug ssh-pool D601
```
如果 `trans`/`tran` stderr 出现 `UNIDESK_SSH_TCP_POOL_HINT`,优先按其中的 `failureKind` 处理:`provider-data-channel-closed` 表示会话中途 data channel 断开,`provider-data-channel-missing` 表示 core/provider 对 channel 状态不一致,`provider-data-pool-exhausted` 表示没有空闲 channel。对幂等受控操作先查 `debug ssh-pool <provider>`,再重试原受控 CLI;不要把这类 hint 单独定性为远端 runtime 配置失败。
快速验证 D601 维护桥:
```bash
+10 -1
View File
@@ -1,5 +1,5 @@
import { readConfig } from "./src/config";
import { debugDispatch, debugHealth, debugTask, isDebugDispatchCommand, type DebugDispatchCommand } from "./src/debug";
import { debugDispatch, debugHealth, debugSshPool, debugTask, isDebugDispatchCommand, type DebugDispatchCommand } from "./src/debug";
import { isRebuildableService, rebuildService, restartService, stackLogs, stackStatus, startStack, stopStack, unsupportedRebuildService, unsupportedRestartService } from "./src/docker";
import { emitError, emitJson, emitText, isRenderedCliResult } from "./src/output";
import { cancelJob, jobWithTail, listJobs, listJobsSummary, readJob, runJob } from "./src/jobs";
@@ -542,6 +542,15 @@ async function main(): Promise<void> {
emitJson(commandName, await debugHealth(config));
return;
}
if (sub === "ssh-pool") {
const providerId = third ?? "";
if (providerId.length === 0) throw new Error("debug ssh-pool requires providerId");
const result = await debugSshPool(config, providerId);
const ok = (result as { ok?: unknown }).ok !== false;
emitJson(commandName, result, ok);
if (!ok) process.exitCode = 1;
return;
}
if (sub === "dispatch") {
const providerId = isDebugDispatchCommand(third) ? config.providerGateway.id : third ?? config.providerGateway.id;
const commandArg = isDebugDispatchCommand(third) ? third : fourth;
+69
View File
@@ -130,6 +130,75 @@ export async function debugHealth(config: UniDeskConfig): Promise<unknown> {
};
}
/**
 * Narrows an arbitrary value to a plain record.
 *
 * @param value - Any value, typically a fragment of a parsed JSON response.
 * @returns `value` itself when it is a non-null, non-array object; otherwise a fresh empty object.
 */
function recordValue(value: unknown): Record<string, unknown> {
  if (value === null || typeof value !== "object" || Array.isArray(value)) {
    return {};
  }
  return value as Record<string, unknown>;
}
/**
 * Narrows an arbitrary value to an array.
 *
 * @param value - Any value, typically a fragment of a parsed JSON response.
 * @returns `value` itself when it is an array; otherwise a fresh empty array.
 */
function arrayValue(value: unknown): unknown[] {
  if (!Array.isArray(value)) {
    return [];
  }
  return value;
}
/**
 * Narrows an arbitrary value to a string.
 *
 * @param value - Any value, typically a label read from a parsed JSON response.
 * @returns `value` when it is a string (including the empty string); otherwise `null`.
 */
function stringValue(value: unknown): string | null {
  if (typeof value !== "string") {
    return null;
  }
  return value;
}
/**
 * Reports the SSH tcp data-pool labels of a single provider node.
 *
 * Fetches `/api/nodes` through `coreInternalFetch`, picks the node whose
 * `providerId` matches, and summarises its `providerGatewaySshData*` labels.
 *
 * @param _config - Unused by this function; kept for signature parity with callers.
 * @param providerId - Provider id to look up in the node list.
 * @returns A JSON-serialisable report. `ok` is true only when the transport label is
 *   `"tcp-pool"` and the ready count is a finite number greater than zero. When the
 *   provider is absent from the node list, the report carries
 *   `degradedReason: "provider-not-found"` together with the raw nodes response.
 */
export async function debugSshPool(_config: UniDeskConfig, providerId: string): Promise<unknown> {
  const nodesResponse = await coreInternalFetch("/api/nodes");
  const responseBody = recordValue(recordValue(nodesResponse).body);
  const candidates = arrayValue(responseBody.nodes).map((entry) => recordValue(entry));
  const node = candidates.find((entry) => entry.providerId === providerId);

  if (node === undefined) {
    return {
      ok: false,
      providerId,
      degradedReason: "provider-not-found",
      nodesFetch: nodesResponse,
      next: { fullHealth: "bun scripts/cli.ts debug health" },
    };
  }

  const labels = recordValue(node.labels);
  // Missing labels are reported as explicit nulls so the output shape stays stable.
  const labelOrNull = (key: string): unknown => labels[key] ?? null;
  const pool = {
    transport: stringValue(labels.providerGatewaySshDataTransport),
    host: stringValue(labels.providerGatewaySshDataHost),
    port: labelOrNull("providerGatewaySshDataPort"),
    desired: labelOrNull("providerGatewaySshDataPoolDesired"),
    total: labelOrNull("providerGatewaySshDataPoolTotal"),
    ready: labelOrNull("providerGatewaySshDataPoolReady"),
    claimed: labelOrNull("providerGatewaySshDataPoolClaimed"),
    connecting: labelOrNull("providerGatewaySshDataPoolConnecting"),
    lastError: labelOrNull("providerGatewaySshDataPoolLastError"),
  };

  // Absent counters count as zero for the readiness/exhaustion decisions below.
  const readyCount = Number(pool.ready ?? 0);
  const claimedCount = Number(pool.claimed ?? 0);
  const desiredCount = Number(pool.desired ?? 0);
  const usesTcpPool = pool.transport === "tcp-pool";
  const ok = usesTcpPool && Number.isFinite(readyCount) && readyCount > 0;

  let classification: string;
  if (ok) {
    classification = "ssh-tcp-pool-ready";
  } else if (!usesTcpPool) {
    classification = "provider-gateway-upgrade-required";
  } else if (desiredCount > 0 && claimedCount >= desiredCount) {
    classification = "provider-data-pool-exhausted";
  } else {
    classification = "provider-data-pool-not-ready";
  }

  return {
    ok,
    providerId,
    node: {
      providerId: node.providerId,
      name: node.name,
      status: node.status,
      lastHeartbeat: node.lastHeartbeat ?? null,
      updatedAt: node.updatedAt ?? null,
    },
    pool,
    classification,
    next: {
      smoke: `trans ${providerId} argv true`,
      fullHealth: "bun scripts/cli.ts debug health",
    },
  };
}
async function waitForTask(taskId: string, timeoutMs: number): Promise<unknown> {
const started = Date.now();
let latest: unknown = null;
+4 -2
View File
@@ -88,6 +88,7 @@ export function rootHelp(): unknown {
{ command: "job status <jobId|latest> [--tail-bytes N]", description: "Show job state with a structured progress summary and bounded stdout/stderr tails." },
{ command: "job cancel <jobId>", description: "Cancel a queued/running async job through the .state/jobs control entry and keep a terminal canceled record." },
{ command: "debug health", description: "Probe internal core, nodes, system/Docker status, frontend, provider ingress, and public boundary." },
{ command: "debug ssh-pool <providerId>", description: "Show bounded host.ssh.tcp-pool labels for one provider, including ready/claimed/desired/lastError." },
{ command: "debug dispatch [providerId] [docker.ps|provider.upgrade|host.ssh|microservice.http|echo] [--wait-ms N]", description: "Submit a real internal-core dispatch request for CLI debugging." },
{ command: "debug task <taskId|latest>", description: "Read a dispatched task record from internal core for CLI debugging." },
{ command: "network perf [--service code-queue --path /api/tasks/overview?limit=30 --count N --concurrency N --label before|after]", description: "Benchmark frontend -> backend-core -> provider/adapter user-service networking and report latency/proxy-mode distributions." },
@@ -492,14 +493,15 @@ function jobHelp(): unknown {
function debugHelp(): unknown {
return {
command: "debug health|dispatch|task",
command: "debug health|ssh-pool|dispatch|task",
output: "json",
usage: [
"bun scripts/cli.ts debug health",
"bun scripts/cli.ts debug ssh-pool <providerId>",
"bun scripts/cli.ts debug dispatch [providerId] [docker.ps|provider.upgrade|host.ssh|microservice.http|echo] [--wait-ms N]",
"bun scripts/cli.ts debug task <taskId|latest>",
],
description: "Debug the real core/provider/dispatch paths; do not use these as formal TEST.md acceptance steps.",
description: "Debug the real core/provider/dispatch paths. ssh-pool returns bounded host.ssh.tcp-pool labels for one provider; do not use debug commands as formal TEST.md acceptance steps.",
};
}
+41 -2
View File
@@ -4,6 +4,7 @@ import { join } from "node:path";
import { repoRoot, rootPath, type Config } from "./config";
import { runCommand, type CommandResult } from "./command";
import { startJob } from "./jobs";
import { classifySshTcpPoolFailure } from "./ssh";
import { runHwlabG14Command } from "./hwlab-g14";
import { HWLAB_NODE_CONTROL_PLANE_CONFIG_PATH, hwlabNodeControlPlaneInfraHelp, runHwlabNodeControlPlaneInfra } from "./hwlab-node-control-plane";
import { hwlabRuntimeLaneConfigPath, hwlabRuntimeLaneSpec, hwlabRuntimeLaneSpecForNode, isHwlabRuntimeLane, type HwlabRuntimeLane, type HwlabRuntimeLaneSpec, type HwlabRuntimePublicExposureSpec } from "./hwlab-node-lanes";
@@ -411,6 +412,25 @@ function compactRuntimeCommand(result: CommandResult): Record<string, unknown> {
};
}
/**
 * Builds SSH tcp-pool diagnostics for a failed controlled command.
 *
 * @param spec - Runtime lane spec; its `nodeId`, `nodeKubeRoute` and `lane` are echoed into the result.
 * @param result - Finished command whose stderr and stdout are scanned for tcp-pool failure text.
 * @returns `null` when the command succeeded or its output is not recognised by
 *   `classifySshTcpPoolFailure`; otherwise a diagnostics record with follow-up commands.
 */
function sshTcpPoolDiagnosticsFromCommand(spec: HwlabRuntimeLaneSpec, result: CommandResult): Record<string, unknown> | null {
  if (isCommandSuccess(result)) {
    return null;
  }

  const combinedOutput = `${result.stderr}\n${result.stdout}`;
  const failureKind = classifySshTcpPoolFailure(combinedOutput);
  if (failureKind === null) {
    return null;
  }

  const providerId = spec.nodeId;
  const controlPlaneRetry = (action: string): string =>
    `bun scripts/cli.ts hwlab nodes control-plane ${action} --node ${providerId} --lane ${spec.lane} --confirm`;

  return {
    classification: "ssh-tcp-pool-transient",
    failureKind,
    providerId,
    route: spec.nodeKubeRoute,
    message: "SSH tcp data pool failed while running the controlled command; treat this as transport/data-pool transient until provider labels and retry say otherwise.",
    next: {
      poolStatus: `bun scripts/cli.ts debug ssh-pool ${providerId}`,
      retrySmoke: `trans ${providerId} argv true`,
      retryApply: controlPlaneRetry("apply"),
      retryTriggerCurrent: controlPlaneRetry("trigger-current"),
    },
  };
}
function nodeRuntimeUnsupportedAction(scoped: ReturnType<typeof parseNodeScopedDelegatedOptions>): Record<string, unknown> {
return {
ok: false,
@@ -779,6 +799,7 @@ function nodeRuntimeApply(scoped: ReturnType<typeof parseNodeScopedDelegatedOpti
const cleanup = render.location === "local"
? cleanupLocalNodeRuntimeRenderDir(spec, render)
: cleanupNodeRuntimeRenderDir(spec, render.renderDir);
const sshTcpPoolDiagnostics = sshTcpPoolDiagnosticsFromCommand(spec, apply);
return {
ok: isCommandSuccess(apply),
command: `hwlab nodes control-plane apply --node ${scoped.node} --lane ${scoped.lane}`,
@@ -795,10 +816,14 @@ function nodeRuntimeApply(scoped: ReturnType<typeof parseNodeScopedDelegatedOpti
render: compactRuntimeCommand(render.result),
apply: compactRuntimeCommand(apply),
cleanupRenderDir: compactRuntimeCommand(cleanup),
diagnostics: sshTcpPoolDiagnostics,
degradedReason: isCommandSuccess(apply) ? undefined : "node-runtime-control-plane-apply-failed",
next: scoped.dryRun
? { apply: `bun scripts/cli.ts hwlab nodes control-plane apply --node ${scoped.node} --lane ${scoped.lane} --confirm` }
: { triggerCurrent: `bun scripts/cli.ts hwlab nodes control-plane trigger-current --node ${scoped.node} --lane ${scoped.lane} --confirm` },
: {
triggerCurrent: `bun scripts/cli.ts hwlab nodes control-plane trigger-current --node ${scoped.node} --lane ${scoped.lane} --confirm`,
...(sshTcpPoolDiagnostics === null ? {} : { sshPoolStatus: `bun scripts/cli.ts debug ssh-pool ${scoped.node}` }),
},
};
}
@@ -1049,7 +1074,16 @@ function nodeRuntimeTriggerCurrent(scoped: ReturnType<typeof parseNodeScopedDele
printNodeRuntimeTriggerProgress(spec, { stage: "control-plane-refresh", status: "started", sourceCommit, pipelineRun });
const refresh = nodeRuntimeApply({ ...scoped, action: "apply", dryRun: false });
if (refresh.ok !== true) {
printNodeRuntimeTriggerProgress(spec, { stage: "control-plane-refresh", status: "failed", sourceCommit, pipelineRun });
const diagnostics = record(refresh.diagnostics);
printNodeRuntimeTriggerProgress(spec, {
stage: "control-plane-refresh",
status: "failed",
sourceCommit,
pipelineRun,
...(diagnostics.classification === "ssh-tcp-pool-transient"
? { reason: "ssh-tcp-pool-transient", failureKind: diagnostics.failureKind ?? null }
: {}),
});
return {
ok: false,
command: `hwlab nodes control-plane trigger-current --node ${scoped.node} --lane ${scoped.lane}`,
@@ -1059,7 +1093,12 @@ function nodeRuntimeTriggerCurrent(scoped: ReturnType<typeof parseNodeScopedDele
sourceCommit,
pipelineRun,
refresh,
diagnostics: Object.keys(diagnostics).length > 0 ? diagnostics : null,
degradedReason: "node-runtime-control-plane-apply-before-trigger-failed",
next: {
retryTriggerCurrent: `bun scripts/cli.ts hwlab nodes control-plane trigger-current --node ${scoped.node} --lane ${scoped.lane} --confirm`,
...(diagnostics.classification === "ssh-tcp-pool-transient" ? { sshPoolStatus: `bun scripts/cli.ts debug ssh-pool ${scoped.node}` } : {}),
},
};
}
printNodeRuntimeTriggerProgress(spec, { stage: "control-plane-refresh", status: "succeeded", sourceCommit, pipelineRun });
+45
View File
@@ -38,6 +38,7 @@ export interface JobProgressSummary {
eventsObserved: number;
slow: boolean;
warnings: string[];
diagnostics?: Record<string, unknown> | null;
timings: Record<string, number>;
summary: string;
nextCommand: string | null;
@@ -437,6 +438,7 @@ function summarizeRuntimeLaneTriggerJobProgress(job: JobRecord, stdoutTail: stri
stageElapsedSeconds,
lastEventAgeSeconds,
});
const tcpPoolDiagnostics = job.status === "succeeded" ? null : sshTcpPoolDiagnosticsFromJobText(`${stderrTail}\n${stdoutTail}`);
const slow = warnings.length > 0;
return {
kind: "hwlab-runtime-lane-trigger",
@@ -452,6 +454,7 @@ function summarizeRuntimeLaneTriggerJobProgress(job: JobRecord, stdoutTail: stri
eventsObserved: events.length,
slow,
warnings,
diagnostics: tcpPoolDiagnostics,
timings,
summary: [
job.status,
@@ -462,6 +465,7 @@ function summarizeRuntimeLaneTriggerJobProgress(job: JobRecord, stdoutTail: stri
elapsedSeconds !== null ? `elapsed=${elapsedSeconds}s` : null,
stageElapsedSeconds !== null && job.status === "running" ? `stageElapsed=${stageElapsedSeconds}s` : null,
lastEventAgeSeconds !== null && job.status === "running" ? `lastEventAge=${lastEventAgeSeconds}s` : null,
tcpPoolDiagnostics !== null ? `sshPool=${String(tcpPoolDiagnostics.failureKind ?? "transient")}` : null,
slow ? "visibility-warning" : null,
].filter(Boolean).join(" "),
nextCommand: pipelineRun
@@ -472,6 +476,47 @@ function summarizeRuntimeLaneTriggerJobProgress(job: JobRecord, stdoutTail: stri
};
}
/**
 * Extracts SSH tcp-pool diagnostics from captured job output.
 *
 * Sources are tried in this order:
 *  1. the last `UNIDESK_SSH_TCP_POOL_HINT <json>` line, returned as-is;
 *  2. the last `UNIDESK_SSH_ERROR <json>` line whose `failureKind` starts with `provider-data-`;
 *  3. known plain-text bridge messages, matched case-insensitively.
 *
 * @param text - Combined stderr/stdout tail of the job.
 * @returns A diagnostics record, or `null` when nothing tcp-pool related is found.
 */
function sshTcpPoolDiagnosticsFromJobText(text: string): Record<string, unknown> | null {
  const structuredHint = lastJsonLinePayload(text, "UNIDESK_SSH_TCP_POOL_HINT");
  if (structuredHint !== null) {
    return structuredHint;
  }

  const bridgeError = lastJsonLinePayload(text, "UNIDESK_SSH_ERROR");
  const bridgeFailureKind = bridgeError?.failureKind;
  if (bridgeError !== null && typeof bridgeFailureKind === "string" && bridgeFailureKind.startsWith("provider-data-")) {
    return {
      code: "ssh-tcp-pool-transient",
      failureKind: bridgeFailureKind,
      providerId: bridgeError.providerId ?? null,
      dataChannelId: bridgeError.dataChannelId ?? null,
      dataPool: bridgeError.dataPool ?? null,
      diagnostics: { fullHealth: "bun scripts/cli.ts debug health" },
    };
  }

  // Plain-text fallbacks, checked in priority order; the first matching pattern wins.
  const messageFallbacks: ReadonlyArray<readonly [RegExp, string]> = [
    [/ssh tcp data channel closed/iu, "provider-data-channel-closed"],
    [/requested ssh tcp data channel is not ready|ssh tcp data channel is not available/iu, "provider-data-channel-missing"],
    [/provider ssh tcp data pool has no idle channel/iu, "provider-data-pool-exhausted"],
  ];
  for (const [pattern, failureKind] of messageFallbacks) {
    if (pattern.test(text)) {
      return {
        code: "ssh-tcp-pool-transient",
        failureKind,
        diagnostics: { fullHealth: "bun scripts/cli.ts debug health" },
      };
    }
  }
  return null;
}
/**
 * Finds the most recent `<prefix> <json>` line in `text` and returns its JSON object payload.
 *
 * Lines are scanned from the end. A matching line whose payload is not valid JSON
 * (for example a line cut off mid-write or by tail truncation) or is not a plain
 * object is skipped, so an earlier well-formed payload is still found.
 *
 * @param text - Multi-line text; both `\n` and `\r\n` line endings are accepted.
 * @param prefix - Marker that must start the trimmed line, followed by a single space.
 * @returns The parsed object of the last usable matching line, or `null` when there is none.
 */
function lastJsonLinePayload(text: string, prefix: string): Record<string, unknown> | null {
  const marker = `${prefix} `;
  const lines = text.split(/\r?\n/u);
  for (let index = lines.length - 1; index >= 0; index -= 1) {
    const line = lines[index]?.trim() ?? "";
    if (!line.startsWith(marker)) continue;
    let parsed: unknown;
    try {
      parsed = JSON.parse(line.slice(marker.length));
    } catch {
      // A malformed payload must not hide older valid ones; keep scanning backwards.
      continue;
    }
    if (typeof parsed === "object" && parsed !== null && !Array.isArray(parsed)) {
      return parsed as Record<string, unknown>;
    }
  }
  return null;
}
function genericJobProgress(job: JobRecord, stderrTailOverride?: string): JobProgressSummary {
const nowMs = Date.now();
const stderrTail = stderrTailOverride ?? tailFile(job.stderrFile, 96_000);
+108
View File
@@ -130,6 +130,30 @@ export interface SshStdoutTruncationHint {
note: string;
}
/**
 * Failure kinds that `classifySshTcpPoolFailure` can recognise in SSH output
 * and that are reported as SSH tcp data-pool failures.
 */
export type SshTcpPoolFailureKind =
// Matched from "ssh tcp data channel closed" text.
| "provider-data-channel-closed"
// Matched from "requested ssh tcp data channel is not ready" / "... is not available" text.
| "provider-data-channel-missing"
// Matched from "provider ssh tcp data pool has no idle channel" text.
| "provider-data-pool-exhausted";
/**
 * Structured hint describing an SSH tcp data-pool failure.
 *
 * `sshTcpPoolHint` builds it only for a non-zero exit code whose stderr text is
 * recognised by `classifySshTcpPoolFailure`; `formatSshTcpPoolHint` serialises it
 * as a single `UNIDESK_SSH_TCP_POOL_HINT <json>` line.
 */
export interface SshTcpPoolHint {
// Fixed discriminator for this hint type.
code: "ssh-tcp-pool-transient";
level: "warning";
// Provider id as returned by `safeProviderId` for the invocation.
providerId: string;
// Raw route string of the invocation (`invocation.route.raw`).
route: string;
transport: "backend-core-broker" | "frontend-websocket";
invocationKind: SshInvocationKind;
failureKind: SshTcpPoolFailureKind;
// Exit code of the failed operation; never 0, because no hint is built for success.
exitCode: number;
// Human-readable description of the failure.
message: string;
// Suggested operator action.
action: string;
// Smoke-test command to rerun, `trans <providerId> argv true`.
retry: string;
// CLI commands for further inspection.
diagnostics: {
// `bun scripts/cli.ts debug ssh-pool <providerId>`
poolStatus: string;
// `bun scripts/cli.ts debug health`
fullHealth: string;
};
note: string;
}
const argvQuotedSshSubcommands = new Set(["git", "rg", "grep", "sed", "nl", "stat", "du", "ls", "cat", "head", "tail", "wc", "pwd"]);
const nativeK3sKubeconfig = "/etc/rancher/k3s/k3s.yaml";
const windowsBridgeCwd = "/mnt/c/Windows";
@@ -2489,6 +2513,59 @@ export function formatSshStdoutTruncationHint(hint: SshStdoutTruncationHint): st
return `UNIDESK_SSH_STDOUT_TRUNCATED ${JSON.stringify(hint)}\n`;
}
/**
 * Classifies SSH output as one of the known tcp data-pool failure kinds.
 *
 * Matching is case-insensitive substring search. Each kind is recognised either by
 * its plain-text bridge message or by the serialized `"failureKind":"<kind>"` marker
 * that appears in `UNIDESK_SSH_ERROR` JSON lines. Kinds are checked in a fixed
 * priority order (closed, missing, exhausted) and the first match wins.
 *
 * @param text - stderr and/or stdout text of the failed operation.
 * @returns The detected failure kind, or `null` when the text shows no tcp-pool failure.
 */
export function classifySshTcpPoolFailure(text: string): SshTcpPoolFailureKind | null {
  const normalized = text.toLowerCase();
  // Needles are lower-case because they are compared against the lower-cased text.
  const signatures: ReadonlyArray<readonly [SshTcpPoolFailureKind, readonly string[]]> = [
    [
      "provider-data-channel-closed",
      [
        "ssh tcp data channel closed",
        // Previously only the other two kinds were detected via their JSON marker.
        '"failurekind":"provider-data-channel-closed"',
      ],
    ],
    [
      "provider-data-channel-missing",
      [
        "requested ssh tcp data channel is not ready",
        "ssh tcp data channel is not available",
        '"failurekind":"provider-data-channel-missing"',
      ],
    ],
    [
      "provider-data-pool-exhausted",
      [
        "provider ssh tcp data pool has no idle channel",
        '"failurekind":"provider-data-pool-exhausted"',
      ],
    ],
  ];
  for (const [failureKind, needles] of signatures) {
    if (needles.some((needle) => normalized.includes(needle))) return failureKind;
  }
  return null;
}
/**
 * Builds a tcp data-pool hint for a failed SSH operation.
 *
 * @param options.invocation - Parsed invocation; supplies provider id, raw route and invocation kind.
 * @param options.transport - Transport value copied into the hint.
 * @param options.exitCode - Exit code of the operation; `0` never yields a hint.
 * @param options.stderrText - Text scanned by `classifySshTcpPoolFailure`.
 * @returns The hint, or `null` when the operation succeeded or the text is not a tcp-pool failure.
 */
export function sshTcpPoolHint(options: {
  invocation: ParsedSshInvocation;
  transport: SshTcpPoolHint["transport"];
  exitCode: number;
  stderrText: string;
}): SshTcpPoolHint | null {
  const { invocation, transport, exitCode, stderrText } = options;
  if (exitCode === 0) {
    return null;
  }
  const failureKind = classifySshTcpPoolFailure(stderrText);
  if (failureKind === null) {
    return null;
  }

  const providerId = safeProviderId(invocation.providerId);
  const hint: SshTcpPoolHint = {
    code: "ssh-tcp-pool-transient",
    level: "warning",
    providerId,
    route: invocation.route.raw,
    transport,
    invocationKind: invocation.parsed.invocationKind,
    failureKind,
    exitCode,
    message: "host.ssh.tcp-pool data channel failed during an SSH operation; classify this as transport/data-pool transient until pool labels and a retry prove otherwise.",
    action: "Inspect providerGatewaySshData* labels, then rerun the same idempotent command through the controlled CLI. Do not treat this alone as HWLAB runtime configuration failure.",
    retry: `trans ${providerId} argv true`,
    diagnostics: {
      poolStatus: `bun scripts/cli.ts debug ssh-pool ${providerId}`,
      fullHealth: "bun scripts/cli.ts debug health",
    },
    note: "Hint is written to stderr and intentionally does not echo the original remote command.",
  };
  return hint;
}
/**
 * Serialises a tcp-pool hint as one marker line.
 *
 * @param hint - Hint to serialise, or `null` when there is nothing to report.
 * @returns `UNIDESK_SSH_TCP_POOL_HINT <json>` followed by a newline, or the empty string for `null`.
 */
export function formatSshTcpPoolHint(hint: SshTcpPoolHint | null): string {
  if (hint === null) {
    return "";
  }
  return "UNIDESK_SSH_TCP_POOL_HINT " + JSON.stringify(hint) + "\n";
}
function sshStdoutDumpPath(invocation: ParsedSshInvocation): string {
mkdirSync(sshStdoutDumpDir, { recursive: true, mode: 0o700 });
const timestamp = new Date().toISOString().replace(/[:.]/gu, "-");
@@ -2677,6 +2754,16 @@ ws.addEventListener("message", (event) => {
clearTimeout(openTimer);
clearRuntimeTimer();
process.stderr.write(String(message.message || "ssh bridge error") + "\n");
if (message.failureKind || message.providerId || message.dataChannelId || message.dataPool) {
process.stderr.write("UNIDESK_SSH_ERROR " + JSON.stringify({
failureKind: message.failureKind || null,
providerId: message.providerId || null,
dataChannelId: message.dataChannelId || null,
dataPool: message.dataPool || null,
transport: message.transport || null,
controlFallback: message.controlFallback === true
}) + "\n");
}
exitCode = 255;
ws.close();
return;
@@ -2915,6 +3002,12 @@ async function runSshCaptureRemoteCommand(config: UniDeskConfig, invocation: Par
startedAtMs,
}));
if (timingHint) stderr += timingHint;
stderr += formatSshTcpPoolHint(sshTcpPoolHint({
invocation,
transport: "backend-core-broker",
exitCode,
stderrText: stderr,
}));
resolve({ exitCode, stdout, stderr });
};
const runtimeTimer = setTimeout(() => {
@@ -3066,6 +3159,12 @@ async function runSshStreamRemoteCommand(
startedAtMs,
}));
if (timingHint) stderr += timingHint;
stderr += formatSshTcpPoolHint(sshTcpPoolHint({
invocation: streamInvocation,
transport: "backend-core-broker",
exitCode: finalCode,
stderrText: stderr,
}));
resolve({ exitCode: finalCode, stdout, stderr });
});
};
@@ -3358,6 +3457,15 @@ export async function runSsh(config: UniDeskConfig, providerId: string, args: st
restore();
const hint = timedOut ? null : sshFailureHint(invocation.providerId, parsed, exitCode, stderrTail);
if (hint !== null) process.stderr.write(formatSshFailureHint(hint));
if (!timedOut) {
const tcpPoolHint = formatSshTcpPoolHint(sshTcpPoolHint({
invocation,
transport: "backend-core-broker",
exitCode,
stderrText: stderrTail,
}));
if (tcpPoolHint) process.stderr.write(tcpPoolHint);
}
const timingHint = formatSshRuntimeTimingHint(sshRuntimeTimingHint({
invocation,
transport: "backend-core-broker",