fix: suppress routine ssh timing noise

This commit is contained in:
Codex
2026-05-25 16:24:00 +00:00
parent fcc3ce125a
commit 5cca0546a0
5 changed files with 13 additions and 9 deletions
+1 -1
View File
@@ -133,7 +133,7 @@ core 只允许声明了 `host.ssh` capability 的 provider 使用 `ssh` 透传
本地 broker 默认等待 provider SSH 会话打开 60000ms,以便在目标节点同时有较多 microservice.http 任务时仍能建立维护会话;需要诊断慢连接时可用 `UNIDESK_SSH_OPEN_TIMEOUT_MS=<ms>` 临时调大,但最小有效值固定为 15000ms,避免把真实离线误判为长时间阻塞。
ssh-like 远端命令如果出现 `kex_exchange_identification`、`Connection closed by remote host`、provider session timeout 或 exit code 255,CLI 会在原始 stderr 后追加一行 `UNIDESK_SSH_HINT { ... }`。该 JSON 不回显原始远端命令,只包含 `code=ssh-like-command-friction`、`trigger`、`try`、`triage`;`try` 固定指向 stdin script 形态,避免把一次 ssh-like 解析/握手摩擦误读成 D601 SSH 整体不可用。每次 `ssh`/`tran` 运行结束还会在 stderr 追加一行 `UNIDESK_SSH_TIMING { ... }`,包含 `elapsedMs`、`elapsedSeconds`、`transport`、`invocationKind`、`exitCode`;耗时超过默认 10000ms 时 `level=warning`,提示优先排查 provider/session 延迟、远端命令自身耗时、helper bootstrap 或 `tran`/`apply-patch` 工具层回归。阈值可用 `UNIDESK_SSH_SLOW_WARNING_MS=<ms>` 临时调节,提示同样不回显原始远端命令。
ssh-like 远端命令如果出现 `kex_exchange_identification`、`Connection closed by remote host`、provider session timeout 或 exit code 255,CLI 会在原始 stderr 后追加一行 `UNIDESK_SSH_HINT { ... }`。该 JSON 不回显原始远端命令,只包含 `code=ssh-like-command-friction`、`trigger`、`try`、`triage`;`try` 固定指向 stdin script 形态,避免把一次 ssh-like 解析/握手摩擦误读成 D601 SSH 整体不可用。`ssh`/`tran` 只有在运行耗时超过默认 10000ms 时才会在 stderr 追加一行 `UNIDESK_SSH_TIMING { ... }`,且 `level=warning`;正常短调用不输出 timing 噪声。warning 包含 `elapsedMs`、`elapsedSeconds`、`transport`、`invocationKind`、`exitCode`,提示优先排查 provider/session 延迟、远端命令自身耗时、helper bootstrap 或 `tran`/`apply-patch` 工具层回归。阈值可用 `UNIDESK_SSH_SLOW_WARNING_MS=<ms>` 临时调节,提示同样不回显原始远端命令。
`ssh <providerId>` 只在当前 operation 需要 helper 时才注入 `/tmp/unidesk-ssh-tools`,普通 `argv`、`script`、`kubectl`、`logs` 等路径不得传输无关工具源码。`apply-patch` 只注入 `apply_patch`,`glob` 只注入 `glob`,`skills`/`skill discover` 只注入 `skill-discover`。`apply_patch` 接受标准 `*** Begin Patch` / `*** End Patch` patch 格式,便于通过 SSH 透传编辑远端仓库文件;远端存在 `perl` 时必须走快速精确匹配路径,避免大文件 hunk 被 sh 模式匹配拖成几十秒,缺少 `perl` 时才退回 sh-only 实现。`glob`、`skill-discover` 需要远端 `python3`。注入工具只写 `/tmp/unidesk-ssh-tools`,不修改目标仓库。
+1 -1
View File
@@ -178,7 +178,7 @@ export function sshHelp(): unknown {
"Do not put operation names in any colon route segment, including nested k3s namespace/workload/container segments.",
"Do not use post-provider shorthand such as `ssh G14 k3s ...`; write `ssh G14:k3s ...` so location and operation stay separated.",
"If an ssh-like remote command fails with timeout/kex/exit-255 friction, stderr includes one low-noise UNIDESK_SSH_HINT JSON line with the argv retry command.",
"Every ssh/tran runtime writes one UNIDESK_SSH_TIMING JSON line to stderr with elapsedMs/elapsedSeconds; operations over 10s are marked level=warning and should be checked for provider latency, remote command cost, helper bootstrap, or tran/apply-patch optimization before repeating high-frequency work.",
"Only slow ssh/tran runtime writes UNIDESK_SSH_TIMING JSON to stderr; operations over 10s are marked level=warning and should be checked for provider latency, remote command cost, helper bootstrap, or tran/apply-patch optimization before repeating high-frequency work. Routine short calls do not emit timing noise.",
"The local tran wrapper serializes non-interactive calls per provider/plane before opening provider SSH sessions, so parallel Codex file reads do not stampede the provider session allocator; set UNIDESK_TRAN_SESSION_LOCK=0 only for explicit diagnostics.",
"Use -- before a remote command that intentionally starts with a dash.",
],
+3 -2
View File
@@ -968,12 +968,13 @@ async function runRemoteSshWebSocket(
restore();
const hint = sshFailureHint(invocation.providerId, parsed, code, "");
if (hint !== null) process.stderr.write(formatSshFailureHint(hint));
process.stderr.write(formatSshRuntimeTimingHint(sshRuntimeTimingHint({
const timingHint = formatSshRuntimeTimingHint(sshRuntimeTimingHint({
invocation,
transport: "frontend-websocket",
exitCode: code,
startedAtMs,
})));
}));
if (timingHint) process.stderr.write(timingHint);
resolve(code);
};
const onStdinData = (chunk: Buffer): void => {
+4 -2
View File
@@ -1626,6 +1626,7 @@ export function sshRuntimeTimingHint(options: {
}
/**
 * Serializes a runtime timing hint into the single line written to stderr.
 *
 * @param hint - Timing hint to render; only its `slow` flag decides whether
 *   anything is produced, and the whole object is embedded as JSON.
 * @returns `UNIDESK_SSH_TIMING <json>` terminated by a newline when the hint is
 *   marked slow, otherwise an empty string so callers can skip the write.
 */
export function formatSshRuntimeTimingHint(hint: SshRuntimeTimingHint): string {
  return hint.slow ? `UNIDESK_SSH_TIMING ${JSON.stringify(hint)}\n` : "";
}
@@ -1836,12 +1837,13 @@ export async function runSsh(config: UniDeskConfig, providerId: string, args: st
restore();
const hint = sshFailureHint(invocation.providerId, parsed, exitCode, stderrTail);
if (hint !== null) process.stderr.write(formatSshFailureHint(hint));
process.stderr.write(formatSshRuntimeTimingHint(sshRuntimeTimingHint({
const timingHint = formatSshRuntimeTimingHint(sshRuntimeTimingHint({
invocation,
transport: "backend-core-broker",
exitCode,
startedAtMs,
})));
}));
if (timingHint) process.stderr.write(timingHint);
resolve(exitCode);
};
child.on("error", (error) => {
+4 -3
View File
@@ -338,8 +338,9 @@ export function runSshArgvGuidanceContract(): JsonRecord {
finishedAtMs: 5200,
thresholdMs: 10_000,
});
assertCondition(timingInfo.level === "info" && timingInfo.slow === false, "short ssh operation should emit an info timing hint", timingInfo);
assertCondition(timingInfo.level === "info" && timingInfo.slow === false, "short ssh operation should stay below the timing warning threshold", timingInfo);
assertCondition(timingInfo.elapsedMs === 4200 && timingInfo.elapsedSeconds === 4.2, "timing hint must include elapsed ms and seconds", timingInfo);
assertCondition(formatSshRuntimeTimingHint(timingInfo) === "", "short ssh operation must not write routine timing noise to stderr", timingInfo);
const slowTiming = sshRuntimeTimingHint({
invocation: parseSshInvocation("D601", ["apply-patch"]),
transport: "frontend-websocket",
@@ -371,7 +372,7 @@ export function runSshArgvGuidanceContract(): JsonRecord {
assertCondition(helpText.includes("apply-patch [--allow-loose]") && helpText.includes("low-context update hunks"), "ssh help must document apply-patch loose-context guard", helpText);
assertCondition(helpText.includes("ssh D601:k3s:hwlab-dev:hwlab-cloud-api script <<'SCRIPT'"), "ssh help must document k3s script operation", helpText);
assertCondition(helpText.includes("UNIDESK_SSH_HINT"), "ssh help must document structured failure hint", helpText);
assertCondition(helpText.includes("UNIDESK_SSH_TIMING") && helpText.includes("10s"), "ssh help must document runtime timing hints", helpText);
assertCondition(helpText.includes("UNIDESK_SSH_TIMING") && helpText.includes("10s") && helpText.includes("Routine short calls do not emit timing noise"), "ssh help must document slow-only runtime timing hints", helpText);
assertCondition(helpText.includes("UNIDESK_TRAN_SESSION_LOCK=0") && helpText.includes("provider session allocator"), "ssh help must document tran provider session serialization", helpText);
const crossChecks = providerTriageRecommendedCrossChecks("D601");
@@ -438,7 +439,7 @@ export function runSshArgvGuidanceContract(): JsonRecord {
"k3s route stays location-only while operations fix native kubeconfig and assemble kubectl exec as argv",
"top-level remote option parsing preserves command-local -- separators for script -- sed -n style commands",
"ssh-like timeout/kex failures emit one structured argv retry hint",
"ssh runtime emits one structured timing hint on stderr and marks operations over 10 seconds as warnings",
"ssh runtime emits structured timing only for slow operations over 10 seconds",
"help text documents stdin script passthrough and UNIDESK_SSH_HINT",
"provider triage recommendedCrossChecks keeps ssh D601 argv true",
"remote frontend ssh uses the same structured route parser for host, k3s and pod argv routes",