diff --git a/.agents/skills/unidesk-trans/SKILL.md b/.agents/skills/unidesk-trans/SKILL.md index ee52d098..8fa8c80e 100644 --- a/.agents/skills/unidesk-trans/SKILL.md +++ b/.agents/skills/unidesk-trans/SKILL.md @@ -153,6 +153,14 @@ bun scripts/cli.ts debug health 在输出或 frontend 原始 JSON 里看这些 labels:`providerGatewaySshDataTransport=tcp-pool`、`providerGatewaySshDataPoolReady`、`providerGatewaySshDataPoolClaimed`、`providerGatewaySshDataPoolDesired`、`providerGatewaySshDataPoolLastError`。 +单个 provider 的低噪声池状态: + +```bash +bun scripts/cli.ts debug ssh-pool D601 +``` + +如果 `trans`/`tran` stderr 出现 `UNIDESK_SSH_TCP_POOL_HINT`,优先按其中的 `failureKind` 处理:`provider-data-channel-closed` 表示会话中途 data channel 断开,`provider-data-channel-missing` 表示 core/provider 对 channel 状态不一致,`provider-data-pool-exhausted` 表示没有空闲 channel。对幂等受控操作先查 `debug ssh-pool `,再重试原受控 CLI;不要把这类 hint 单独定性为远端 runtime 配置失败。 + 快速验证 D601 维护桥: ```bash diff --git a/scripts/cli.ts b/scripts/cli.ts index 3ec1ba5e..c5899456 100644 --- a/scripts/cli.ts +++ b/scripts/cli.ts @@ -1,5 +1,5 @@ import { readConfig } from "./src/config"; -import { debugDispatch, debugHealth, debugTask, isDebugDispatchCommand, type DebugDispatchCommand } from "./src/debug"; +import { debugDispatch, debugHealth, debugSshPool, debugTask, isDebugDispatchCommand, type DebugDispatchCommand } from "./src/debug"; import { isRebuildableService, rebuildService, restartService, stackLogs, stackStatus, startStack, stopStack, unsupportedRebuildService, unsupportedRestartService } from "./src/docker"; import { emitError, emitJson, emitText, isRenderedCliResult } from "./src/output"; import { cancelJob, jobWithTail, listJobs, listJobsSummary, readJob, runJob } from "./src/jobs"; @@ -542,6 +542,15 @@ async function main(): Promise { emitJson(commandName, await debugHealth(config)); return; } + if (sub === "ssh-pool") { + const providerId = third ?? ""; + if (providerId.length === 0) throw new Error("debug ssh-pool requires providerId"); + const result = await debugSshPool(config, providerId); + const ok = (result as { ok?: unknown }).ok !== false; + emitJson(commandName, result, ok); + if (!ok) process.exitCode = 1; + return; + } if (sub === "dispatch") { const providerId = isDebugDispatchCommand(third) ? config.providerGateway.id : third ?? config.providerGateway.id; const commandArg = isDebugDispatchCommand(third) ? third : fourth; diff --git a/scripts/src/debug.ts b/scripts/src/debug.ts index f055234e..3d57409b 100644 --- a/scripts/src/debug.ts +++ b/scripts/src/debug.ts @@ -130,6 +130,75 @@ export async function debugHealth(config: UniDeskConfig): Promise { }; } +function recordValue(value: unknown): Record { + return typeof value === "object" && value !== null && !Array.isArray(value) ? value as Record : {}; +} + +function arrayValue(value: unknown): unknown[] { + return Array.isArray(value) ? value : []; +} + +function stringValue(value: unknown): string | null { + return typeof value === "string" ? value : null; +} + +export async function debugSshPool(_config: UniDeskConfig, providerId: string): Promise { + const nodesResponse = await coreInternalFetch("/api/nodes"); + const body = recordValue(recordValue(nodesResponse).body); + const nodes = arrayValue(body.nodes); + const node = nodes + .map((item) => recordValue(item)) + .find((item) => item.providerId === providerId) ?? null; + if (node === null) { + return { + ok: false, + providerId, + degradedReason: "provider-not-found", + nodesFetch: nodesResponse, + next: { fullHealth: "bun scripts/cli.ts debug health" }, + }; + } + const labels = recordValue(node.labels); + const pool = { + transport: stringValue(labels.providerGatewaySshDataTransport), + host: stringValue(labels.providerGatewaySshDataHost), + port: labels.providerGatewaySshDataPort ?? null, + desired: labels.providerGatewaySshDataPoolDesired ?? null, + total: labels.providerGatewaySshDataPoolTotal ?? null, + ready: labels.providerGatewaySshDataPoolReady ?? null, + claimed: labels.providerGatewaySshDataPoolClaimed ?? null, + connecting: labels.providerGatewaySshDataPoolConnecting ?? null, + lastError: labels.providerGatewaySshDataPoolLastError ?? null, + }; + const ready = Number(pool.ready ?? 0); + const claimed = Number(pool.claimed ?? 0); + const desired = Number(pool.desired ?? 0); + const ok = pool.transport === "tcp-pool" && Number.isFinite(ready) && ready > 0; + return { + ok, + providerId, + node: { + providerId: node.providerId, + name: node.name, + status: node.status, + lastHeartbeat: node.lastHeartbeat ?? null, + updatedAt: node.updatedAt ?? null, + }, + pool, + classification: ok + ? "ssh-tcp-pool-ready" + : pool.transport !== "tcp-pool" + ? "provider-gateway-upgrade-required" + : desired > 0 && claimed >= desired + ? "provider-data-pool-exhausted" + : "provider-data-pool-not-ready", + next: { + smoke: `trans ${providerId} argv true`, + fullHealth: "bun scripts/cli.ts debug health", + }, + }; +} + async function waitForTask(taskId: string, timeoutMs: number): Promise { const started = Date.now(); let latest: unknown = null; diff --git a/scripts/src/help.ts b/scripts/src/help.ts index b92ada5f..06e88f25 100644 --- a/scripts/src/help.ts +++ b/scripts/src/help.ts @@ -88,6 +88,7 @@ export function rootHelp(): unknown { { command: "job status [--tail-bytes N]", description: "Show job state with a structured progress summary and bounded stdout/stderr tails." }, { command: "job cancel ", description: "Cancel a queued/running async job through the .state/jobs control entry and keep a terminal canceled record." }, { command: "debug health", description: "Probe internal core, nodes, system/Docker status, frontend, provider ingress, and public boundary." }, + { command: "debug ssh-pool ", description: "Show bounded host.ssh.tcp-pool labels for one provider, including ready/claimed/desired/lastError." }, { command: "debug dispatch [providerId] [docker.ps|provider.upgrade|host.ssh|microservice.http|echo] [--wait-ms N]", description: "Submit a real internal-core dispatch request for CLI debugging." }, { command: "debug task ", description: "Read a dispatched task record from internal core for CLI debugging." }, { command: "network perf [--service code-queue --path /api/tasks/overview?limit=30 --count N --concurrency N --label before|after]", description: "Benchmark frontend -> backend-core -> provider/adapter user-service networking and report latency/proxy-mode distributions." }, @@ -492,14 +493,15 @@ function jobHelp(): unknown { function debugHelp(): unknown { return { - command: "debug health|dispatch|task", + command: "debug health|ssh-pool|dispatch|task", output: "json", usage: [ "bun scripts/cli.ts debug health", + "bun scripts/cli.ts debug ssh-pool ", "bun scripts/cli.ts debug dispatch [providerId] [docker.ps|provider.upgrade|host.ssh|microservice.http|echo] [--wait-ms N]", "bun scripts/cli.ts debug task ", ], - description: "Debug the real core/provider/dispatch paths; do not use these as formal TEST.md acceptance steps.", + description: "Debug the real core/provider/dispatch paths. ssh-pool returns bounded host.ssh.tcp-pool labels for one provider; do not use debug commands as formal TEST.md acceptance steps.", }; } diff --git a/scripts/src/hwlab-node.ts b/scripts/src/hwlab-node.ts index c641a5bf..d9b464dc 100644 --- a/scripts/src/hwlab-node.ts +++ b/scripts/src/hwlab-node.ts @@ -4,6 +4,7 @@ import { join } from "node:path"; import { repoRoot, rootPath, type Config } from "./config"; import { runCommand, type CommandResult } from "./command"; import { startJob } from "./jobs"; +import { classifySshTcpPoolFailure } from "./ssh"; import { runHwlabG14Command } from "./hwlab-g14"; import { HWLAB_NODE_CONTROL_PLANE_CONFIG_PATH, hwlabNodeControlPlaneInfraHelp, runHwlabNodeControlPlaneInfra } from "./hwlab-node-control-plane"; import { hwlabRuntimeLaneConfigPath, hwlabRuntimeLaneSpec, hwlabRuntimeLaneSpecForNode, isHwlabRuntimeLane, type HwlabRuntimeLane, type HwlabRuntimeLaneSpec, type HwlabRuntimePublicExposureSpec } from "./hwlab-node-lanes"; @@ -411,6 +412,25 @@ function compactRuntimeCommand(result: CommandResult): Record { }; } +function sshTcpPoolDiagnosticsFromCommand(spec: HwlabRuntimeLaneSpec, result: CommandResult): Record | null { + if (isCommandSuccess(result)) return null; + const failureKind = classifySshTcpPoolFailure(`${result.stderr}\n${result.stdout}`); + if (failureKind === null) return null; + return { + classification: "ssh-tcp-pool-transient", + failureKind, + providerId: spec.nodeId, + route: spec.nodeKubeRoute, + message: "SSH tcp data pool failed while running the controlled command; treat this as transport/data-pool transient until provider labels and retry say otherwise.", + next: { + poolStatus: `bun scripts/cli.ts debug ssh-pool ${spec.nodeId}`, + retrySmoke: `trans ${spec.nodeId} argv true`, + retryApply: `bun scripts/cli.ts hwlab nodes control-plane apply --node ${spec.nodeId} --lane ${spec.lane} --confirm`, + retryTriggerCurrent: `bun scripts/cli.ts hwlab nodes control-plane trigger-current --node ${spec.nodeId} --lane ${spec.lane} --confirm`, + }, + }; +} + function nodeRuntimeUnsupportedAction(scoped: ReturnType): Record { return { ok: false, @@ -779,6 +799,7 @@ function nodeRuntimeApply(scoped: ReturnType 0 ? diagnostics : null, degradedReason: "node-runtime-control-plane-apply-before-trigger-failed", + next: { + retryTriggerCurrent: `bun scripts/cli.ts hwlab nodes control-plane trigger-current --node ${scoped.node} --lane ${scoped.lane} --confirm`, + ...(diagnostics.classification === "ssh-tcp-pool-transient" ? { sshPoolStatus: `bun scripts/cli.ts debug ssh-pool ${scoped.node}` } : {}), + }, }; } printNodeRuntimeTriggerProgress(spec, { stage: "control-plane-refresh", status: "succeeded", sourceCommit, pipelineRun }); diff --git a/scripts/src/jobs.ts b/scripts/src/jobs.ts index d916cf93..84dc3df0 100644 --- a/scripts/src/jobs.ts +++ b/scripts/src/jobs.ts @@ -38,6 +38,7 @@ export interface JobProgressSummary { eventsObserved: number; slow: boolean; warnings: string[]; + diagnostics?: Record | null; timings: Record; summary: string; nextCommand: string | null; @@ -437,6 +438,7 @@ function summarizeRuntimeLaneTriggerJobProgress(job: JobRecord, stdoutTail: stri stageElapsedSeconds, lastEventAgeSeconds, }); + const tcpPoolDiagnostics = job.status === "succeeded" ? null : sshTcpPoolDiagnosticsFromJobText(`${stderrTail}\n${stdoutTail}`); const slow = warnings.length > 0; return { kind: "hwlab-runtime-lane-trigger", @@ -452,6 +454,7 @@ function summarizeRuntimeLaneTriggerJobProgress(job: JobRecord, stdoutTail: stri eventsObserved: events.length, slow, warnings, + diagnostics: tcpPoolDiagnostics, timings, summary: [ job.status, @@ -462,6 +465,7 @@ function summarizeRuntimeLaneTriggerJobProgress(job: JobRecord, stdoutTail: stri elapsedSeconds !== null ? `elapsed=${elapsedSeconds}s` : null, stageElapsedSeconds !== null && job.status === "running" ? `stageElapsed=${stageElapsedSeconds}s` : null, lastEventAgeSeconds !== null && job.status === "running" ? `lastEventAge=${lastEventAgeSeconds}s` : null, + tcpPoolDiagnostics !== null ? `sshPool=${String(tcpPoolDiagnostics.failureKind ?? "transient")}` : null, slow ? "visibility-warning" : null, ].filter(Boolean).join(" "), nextCommand: pipelineRun @@ -472,6 +476,47 @@ function summarizeRuntimeLaneTriggerJobProgress(job: JobRecord, stdoutTail: stri }; } +function sshTcpPoolDiagnosticsFromJobText(text: string): Record | null { + const hint = lastJsonLinePayload(text, "UNIDESK_SSH_TCP_POOL_HINT"); + if (hint !== null) return hint; + const error = lastJsonLinePayload(text, "UNIDESK_SSH_ERROR"); + if (error !== null && typeof error.failureKind === "string" && error.failureKind.startsWith("provider-data-")) { + return { + code: "ssh-tcp-pool-transient", + failureKind: error.failureKind, + providerId: error.providerId ?? null, + dataChannelId: error.dataChannelId ?? null, + dataPool: error.dataPool ?? null, + diagnostics: { fullHealth: "bun scripts/cli.ts debug health" }, + }; + } + if (/ssh tcp data channel closed/iu.test(text)) { + return { code: "ssh-tcp-pool-transient", failureKind: "provider-data-channel-closed", diagnostics: { fullHealth: "bun scripts/cli.ts debug health" } }; + } + if (/requested ssh tcp data channel is not ready|ssh tcp data channel is not available/iu.test(text)) { + return { code: "ssh-tcp-pool-transient", failureKind: "provider-data-channel-missing", diagnostics: { fullHealth: "bun scripts/cli.ts debug health" } }; + } + if (/provider ssh tcp data pool has no idle channel/iu.test(text)) { + return { code: "ssh-tcp-pool-transient", failureKind: "provider-data-pool-exhausted", diagnostics: { fullHealth: "bun scripts/cli.ts debug health" } }; + } + return null; +} + +function lastJsonLinePayload(text: string, prefix: string): Record | null { + const lines = text.split(/\r?\n/u); + for (let index = lines.length - 1; index >= 0; index -= 1) { + const line = lines[index]?.trim() ?? ""; + if (!line.startsWith(`${prefix} `)) continue; + try { + const parsed = JSON.parse(line.slice(prefix.length + 1)) as unknown; + if (typeof parsed === "object" && parsed !== null && !Array.isArray(parsed)) return parsed as Record; + } catch { + return null; + } + } + return null; +} + function genericJobProgress(job: JobRecord, stderrTailOverride?: string): JobProgressSummary { const nowMs = Date.now(); const stderrTail = stderrTailOverride ?? tailFile(job.stderrFile, 96_000); diff --git a/scripts/src/ssh.ts b/scripts/src/ssh.ts index a56bfe51..b2554f29 100644 --- a/scripts/src/ssh.ts +++ b/scripts/src/ssh.ts @@ -130,6 +130,30 @@ export interface SshStdoutTruncationHint { note: string; } +export type SshTcpPoolFailureKind = + | "provider-data-channel-closed" + | "provider-data-channel-missing" + | "provider-data-pool-exhausted"; + +export interface SshTcpPoolHint { + code: "ssh-tcp-pool-transient"; + level: "warning"; + providerId: string; + route: string; + transport: "backend-core-broker" | "frontend-websocket"; + invocationKind: SshInvocationKind; + failureKind: SshTcpPoolFailureKind; + exitCode: number; + message: string; + action: string; + retry: string; + diagnostics: { + poolStatus: string; + fullHealth: string; + }; + note: string; +} + const argvQuotedSshSubcommands = new Set(["git", "rg", "grep", "sed", "nl", "stat", "du", "ls", "cat", "head", "tail", "wc", "pwd"]); const nativeK3sKubeconfig = "/etc/rancher/k3s/k3s.yaml"; const windowsBridgeCwd = "/mnt/c/Windows"; @@ -2489,6 +2513,59 @@ export function formatSshStdoutTruncationHint(hint: SshStdoutTruncationHint): st return `UNIDESK_SSH_STDOUT_TRUNCATED ${JSON.stringify(hint)}\n`; } +export function classifySshTcpPoolFailure(text: string): SshTcpPoolFailureKind | null { + const normalized = text.toLowerCase(); + if (normalized.includes("ssh tcp data channel closed")) return "provider-data-channel-closed"; + if ( + normalized.includes("requested ssh tcp data channel is not ready") + || normalized.includes("ssh tcp data channel is not available") + || normalized.includes('"failurekind":"provider-data-channel-missing"') + ) { + return "provider-data-channel-missing"; + } + if ( + normalized.includes("provider ssh tcp data pool has no idle channel") + || normalized.includes('"failurekind":"provider-data-pool-exhausted"') + ) { + return "provider-data-pool-exhausted"; + } + return null; +} + +export function sshTcpPoolHint(options: { + invocation: ParsedSshInvocation; + transport: SshTcpPoolHint["transport"]; + exitCode: number; + stderrText: string; +}): SshTcpPoolHint | null { + if (options.exitCode === 0) return null; + const failureKind = classifySshTcpPoolFailure(options.stderrText); + if (failureKind === null) return null; + const providerId = safeProviderId(options.invocation.providerId); + return { + code: "ssh-tcp-pool-transient", + level: "warning", + providerId, + route: options.invocation.route.raw, + transport: options.transport, + invocationKind: options.invocation.parsed.invocationKind, + failureKind, + exitCode: options.exitCode, + message: "host.ssh.tcp-pool data channel failed during an SSH operation; classify this as transport/data-pool transient until pool labels and a retry prove otherwise.", + action: "Inspect providerGatewaySshData* labels, then rerun the same idempotent command through the controlled CLI. Do not treat this alone as HWLAB runtime configuration failure.", + retry: `trans ${providerId} argv true`, + diagnostics: { + poolStatus: `bun scripts/cli.ts debug ssh-pool ${providerId}`, + fullHealth: "bun scripts/cli.ts debug health", + }, + note: "Hint is written to stderr and intentionally does not echo the original remote command.", + }; +} + +export function formatSshTcpPoolHint(hint: SshTcpPoolHint | null): string { + return hint === null ? "" : `UNIDESK_SSH_TCP_POOL_HINT ${JSON.stringify(hint)}\n`; +} + function sshStdoutDumpPath(invocation: ParsedSshInvocation): string { mkdirSync(sshStdoutDumpDir, { recursive: true, mode: 0o700 }); const timestamp = new Date().toISOString().replace(/[:.]/gu, "-"); @@ -2677,6 +2754,16 @@ ws.addEventListener("message", (event) => { clearTimeout(openTimer); clearRuntimeTimer(); process.stderr.write(String(message.message || "ssh bridge error") + "\n"); + if (message.failureKind || message.providerId || message.dataChannelId || message.dataPool) { + process.stderr.write("UNIDESK_SSH_ERROR " + JSON.stringify({ + failureKind: message.failureKind || null, + providerId: message.providerId || null, + dataChannelId: message.dataChannelId || null, + dataPool: message.dataPool || null, + transport: message.transport || null, + controlFallback: message.controlFallback === true + }) + "\n"); + } exitCode = 255; ws.close(); return; @@ -2915,6 +3002,12 @@ async function runSshCaptureRemoteCommand(config: UniDeskConfig, invocation: Par startedAtMs, })); if (timingHint) stderr += timingHint; + stderr += formatSshTcpPoolHint(sshTcpPoolHint({ + invocation, + transport: "backend-core-broker", + exitCode, + stderrText: stderr, + })); resolve({ exitCode, stdout, stderr }); }; const runtimeTimer = setTimeout(() => { @@ -3066,6 +3159,12 @@ async function runSshStreamRemoteCommand( startedAtMs, })); if (timingHint) stderr += timingHint; + stderr += formatSshTcpPoolHint(sshTcpPoolHint({ + invocation: streamInvocation, + transport: "backend-core-broker", + exitCode: finalCode, + stderrText: stderr, + })); resolve({ exitCode: finalCode, stdout, stderr }); }); }; @@ -3358,6 +3457,15 @@ export async function runSsh(config: UniDeskConfig, providerId: string, args: st restore(); const hint = timedOut ? null : sshFailureHint(invocation.providerId, parsed, exitCode, stderrTail); if (hint !== null) process.stderr.write(formatSshFailureHint(hint)); + if (!timedOut) { + const tcpPoolHint = formatSshTcpPoolHint(sshTcpPoolHint({ + invocation, + transport: "backend-core-broker", + exitCode, + stderrText: stderrTail, + })); + if (tcpPoolHint) process.stderr.write(tcpPoolHint); + } const timingHint = formatSshRuntimeTimingHint(sshRuntimeTimingHint({ invocation, transport: "backend-core-broker",