From d3d542fbd36594db564556cf919b039bb5ef2cb2 Mon Sep 17 00:00:00 2001 From: Lyon <88232613+pikasTech@users.noreply.github.com> Date: Fri, 26 Jun 2026 12:51:58 +0800 Subject: [PATCH] fix: improve egress and job diagnostics (#969) Co-authored-by: Codex --- docs/reference/cli.md | 8 +- docs/reference/observability.md | 2 +- docs/reference/provider-gateway.md | 2 +- .../PJ2026-010605-observability-monitoring.md | 76 ++++++++++ scripts/cli.ts | 53 +++++-- scripts/src/docker.ts | 12 +- scripts/src/help.ts | 11 +- scripts/src/jobs.ts | 46 ++++++ src/components/backend-core/src/egress_tcp.rs | 133 +++++++++++++++++- .../provider-gateway/src/egress-proxy.ts | 73 ++++++++-- 10 files changed, 375 insertions(+), 41 deletions(-) diff --git a/docs/reference/cli.md b/docs/reference/cli.md index 7ec1b68f..86bb36d8 100644 --- a/docs/reference/cli.md +++ b/docs/reference/cli.md @@ -53,8 +53,8 @@ PipelineRun 失败或长时间未完成时,先按定点 `control-plane status - `--main-server-ip ` 默认通过公网 frontend 登录态调用主 server 的同源 API 代理,不要求计算节点持有主 server SSH key;显式提供 `--main-server-key` 或 `--main-server-transport ssh` 时才使用旧 SSH 传输。远程 frontend 传输下的 `ssh ...` 必须复用同一套结构化 route parser,支持 `D601`、`G14`、host workspace、`D601:win`、`D601:win/c/test`、`D601:k3s` 和 `D601:k3s::` 这类定位路径;它不向调用容器下发 provider token,也不要求调用容器能解析 backend-core 内网 DNS。 - `config show` 读取并校验根目录 `config.json`,不从环境变量、默认值或隐藏文件静默补配置。 - `check` 默认只执行轻量配置校验、Bun 版本检查和 Bun Transpiler 语法解析(覆盖 CLI 入口、主要 `scripts/` 模块和核心组件入口,不做类型推导)。除非用户明确要求,CLI 改动不运行单元测试、合同测试或新增测试脚本;默认最多做语法检查和必要的帮助/命令形态人工确认。关键文件存在性、`scripts/` TypeScript 类型检查、`src/components/` TypeScript 类型检查、Docker Compose config、日志轮转策略扫描和 D601 recovery guardrails 默认不启用,分别通过 `--files`、`--scripts-typecheck`、`--components`、`--compose`、`--logs`、`--recovery-guardrails` 开启,或用 `--full` 一次性开启。`--scripts-typecheck` 只跑 scripts TypeScript 类型检查,不触发测试脚本或 GitHub issue/PR live API check。长命令项必须在 stderr 输出 `unidesk.check.progress` JSON lines,stdout 保持最终 JSON 结果,避免 post-task 或人工运行时长时间无可见进度。`typescript:scripts` 固定通过 `bun --bun tsc -p scripts/tsconfig.json --noEmit --pretty false` 执行,默认 `--scripts-typecheck-timeout-ms 120000`,可按目标运行面显式调小或调大但 CLI 会封顶;`--check-heartbeat-ms` 控制运行中心跳间隔,默认 `15000`。所有命令项的最终 item detail 必须包含 `durationMs`、`timeoutMs`、`heartbeatMs`、`exitCode`、`signal`、`timedOut`、stdout/stderr byte count、truncation flag 和有界 tail;超时必须返回 `timedOut=true`,不得只留下被外层命令杀死的空输出。不要把 `bun --check scripts/cli.ts` 当作低噪声 CLI 自检入口;它可能执行根 CLI help 并触发长 help dump。CLI 入口级自检使用 `bun scripts/cli.ts check`,单文件语法确认只针对具体模块文件运行。`check recovery-guardrails` 是同一诊断的低噪声直接入口,报告 malformed `/proc/mounts`、kubelet validation risk、stale CRI sandbox count、Code Queue worktree/symlink、Code Queue/MDTODO hostPath 和 `ContainerCreating` 分类;它不得重启 k3s、删除 CRI sandbox、修改 hostPath、deploy/rollout 或 prune/reset。`--rust` 只允许在 D601 CI/dev execution 中配合 `UNIDESK_D601_RUST_CHECK=1` 使用,长期规则见 `docs/reference/dev-environment.md` 和 `docs/reference/devops-hygiene.md`。 -- `server start` 创建异步 job,在后台执行 Docker 构建和启动;命令本身只负责返回 job id、日志路径和启动命令。 -- `server stop` 创建异步 job,在后台停止固定 Compose project 中的全部 UniDesk 服务。 +- `server start` 创建异步 job,在后台执行 Docker 构建和启动;命令默认只返回低噪声 async job 摘要、stdout/stderr 路径和 `job status` 后续命令,完整 JSON 只能通过 `--full`/`--raw` 显式展开。 +- `server stop` 创建异步 job,在后台停止固定 Compose project 中的全部 UniDesk 服务;默认输出同样是 async job 摘要。 - `server status` 查询公开端口、受限宿主端口、内部端口、主机 swap 摘要、Compose 容器、core/frontend/dev-frontend/provider/database 健康检查和访问 URL;D601 Code Queue 使用的 PostgreSQL/OA Event Flow host mapping 必须出现在受限宿主端口而不是无条件公开入口中。低内存主 server 上 `swap.warning` 非空时,先执行 `server swap status` 或 `server swap ensure`。 - `server swap status|ensure [--path /swapfile] [--size 2GiB] [--dry-run]` 是主 server swap 管理入口。`status` 仅读 `/proc/meminfo`、`/proc/swaps` 和 `/etc/fstab` 并返回 JSON;`ensure` 在已有任何 active swap 时只报告 no-op,在无 active swap 时创建固定 swapfile、`chmod 600`、`mkswap`、`swapon` 并尽量写入 `/etc/fstab`。输出必须包含 `before`、`after`、total memory、active swap、持久化状态、关键动作和错误详情;若 swap 已启用但 fstab 写入失败,状态为 `degraded`,调用者需按返回的 detail 修复持久化。 - `server logs` 返回 `logs/` 文件日志和 Docker 容器日志的尾部,默认限制输出大小,避免日志爆炸。实现必须只读取文件末尾字节,不得为了 tail 先把巨大日志完整读入 CLI 内存。 @@ -143,7 +143,7 @@ PipelineRun 失败或长时间未完成时,先按定点 `control-plane status - `codex interrupt|cancel ` 通过 Code Queue 私有代理请求中断;running/judging 任务会请求 D601 当前 agent run 停止,queued/retry_wait 任务的取消也必须保持与 WebUI 相同代理路径,返回有界 task 摘要和后续查询命令。任何需要接触 active run 的动作仍属于 D601 执行面。 - 旧 Code Queue 多队列 lane 现在是归档视图:`codex queues [--full|--all] [--limit N] [--page N|--offset N]` 只读展示历史 queue 摘要、activity、commanderConcurrency、counts 和 execution diagnostics。`queue create`、`queue merge`、`move` 等旧队列写入口冻结并返回 `legacy-code-queue-frozen`;AgentRun 新任务的排队、派发和取消必须使用 `agentrun create|apply|get|cancel`。 - 所有旧 `codex` 历史查询、已读和残留 interrupt/cancel 命令必须走与 WebUI 相同的 backend-core 私有代理路径 `/api/microservices/code-queue/proxy/...`。旧 submit/steer/resume/queue mutation/move/workdir mutation 不得绕过冻结;若需要新任务或新 session 控制,使用 AgentRun 资源原语。 -- `job list [--limit N] [--include-command]` 与 `job status [--tail-bytes N]` 查询 `.state/jobs/` 文件系统状态,是异步命令的可观测入口。`job list` 默认只返回最新 50 条摘要,并为已知异步工作流返回轻量 `progress.summary` 与后续查询命令;`job status` 默认返回结构化 `progress`、stdout/stderr 末尾 12000 字节、`tailPolicy` 与完整日志路径。已知工作流应从有界日志尾部抽取阶段、关键对象名和下一步命令,避免为了判断当前阶段而手工打开完整 stdout/stderr。`hwlab_g14_v02_trigger_current` 的 progress 必须暴露 trigger 阶段、source commit 和 PipelineRun;`hwlab_g14_v02_pr_monitor` 的 progress 必须暴露 preflight、merge、source-head、cd-trigger、cd-status、git-mirror-flush 和 pr-comment 阶段,以及 PR、source commit、PipelineRun、targetValidation/pendingFlush 摘要;`agentrun_vNN_trigger_current` 的 progress 必须识别 YAML lane 的 source-bootstrap、image-build、gitops-publish、git-mirror 阶段,暴露 source commit、PipelineRun、stage diagnostics、timing 和 `agentrun control-plane status --node --lane --pipeline-run `;`hwlab_g14_git_mirror_sync|flush` 与 `agentrun_v01_git_mirror_sync|flush` 的 progress 必须暴露 sync/flush 状态、Job 名、pendingFlush 与 fetch/push/total/SSH timing,并给出对应 repo 的 mirror status 命令。 +- `job|jobs list [--limit N] [--include-command]` 与 `job status|get|read [--tail-bytes N]` 查询 `.state/jobs/` 文件系统状态,是异步命令的可观测入口。`jobs get/read` 是 `job status` 的兼容别名,不应引导调用者直接读取 `.state/jobs` 文件。`job list` 默认只返回最新 50 条摘要,并为已知异步工作流返回轻量 `progress.summary` 与后续查询命令;`job status` 默认返回结构化 `progress`、stdout/stderr 末尾 12000 字节、`tailPolicy` 与完整日志路径。已知工作流应从有界日志尾部抽取阶段、关键对象名和下一步命令,避免为了判断当前阶段而手工打开完整 stdout/stderr。`hwlab_g14_v02_trigger_current` 的 progress 必须暴露 trigger 阶段、source commit 和 PipelineRun;`hwlab_g14_v02_pr_monitor` 的 progress 必须暴露 preflight、merge、source-head、cd-trigger、cd-status、git-mirror-flush 和 pr-comment 阶段,以及 PR、source commit、PipelineRun、targetValidation/pendingFlush 摘要;`agentrun_vNN_trigger_current` 的 progress 必须识别 YAML lane 的 source-bootstrap、image-build、gitops-publish、git-mirror 阶段,暴露 source commit、PipelineRun、stage diagnostics、timing 和 `agentrun control-plane status --node --lane --pipeline-run `;`hwlab_g14_git_mirror_sync|flush` 与 `agentrun_v01_git_mirror_sync|flush` 的 progress 必须暴露 sync/flush 状态、Job 名、pendingFlush 与 fetch/push/total/SSH timing,并给出对应 repo 的 mirror status 命令。 - `debug health`、`debug ssh-pool `、`debug dispatch` 与 `debug task` 走真实内部 core、WebSocket、数据库、provider、系统指标、Docker 状态和 Host SSH 维护桥流程,只用于开发调试,不写入 `TEST.md` 的正式验收步骤;`debug ssh-pool` 只裁剪单个 provider 的 `providerGatewaySshData*` labels,用于低噪声判断 tcp-pool 是否 ready、claimed、exhausted 或有 lastError。 - `e2e run [--only pattern[,pattern...]] [--skip pattern[,pattern...]]` 使用 publicHost 派生的公开 production frontend/dev frontend/provider ingress URL,并通过 Docker 内网验证 core API、PostgreSQL、provider self-connection、系统指标曲线、Docker 状态快照、provider.upgrade 预检和 Playwright 前端页面,是交付前的自动化 E2E 门禁;CLI 默认输出 check 状态摘要,完整诊断写入 `resultPath`,日常迭代应优先用 `--only` / `--skip` 跑最小必要集合。 @@ -153,7 +153,7 @@ UniDesk/HWLAB Web 开发、Playwright wrapper、`trans playwright`、HWL ## Async Job State -长时操作采用 Fire-and-Forget 模式:CLI 创建 `.state/jobs/{jobId}.json`,后台进程执行真实命令,并将 stdout、stderr 分别写入 `.state/jobs/{jobId}.stdout.log` 与 `.state/jobs/{jobId}.stderr.log`。调用者通过 `bun scripts/cli.ts job status ` 查询进度和尾部输出。 +长时操作采用 Fire-and-Forget 模式:CLI 创建 `.state/jobs/{jobId}.json`,后台进程执行真实命令,并将 stdout、stderr 分别写入 `.state/jobs/{jobId}.stdout.log` 与 `.state/jobs/{jobId}.stderr.log`。调用者通过 `bun scripts/cli.ts job status ` 或兼容别名 `bun scripts/cli.ts jobs get ` 查询进度和尾部输出。 异步 job 的返回值只表示控制动作已经排入后台执行,不表示目标运行面对象已经创建或收敛。所有带 `statusCommand` 的返回都必须先用 `job status ` 查看 `progress.stage`、`progress.stageStatus`、关键对象名和 `nextCommand`;只有 progress 已进入对应创建/完成阶段后,才进入更重的运行面 status。对于 `hwlab g14 control-plane trigger-current --lane v02 --confirm`,`progress.pipelineRun` 在 refresh 或 mirror pre-sync 阶段可能只是预期 PipelineRun 名称;在 `progress.stage=create-pipelinerun` 且 `progress.pipelineCreated=true` 前,`control-plane status --pipeline-run ` 返回 not found 只能说明 PipelineRun 尚未创建,不能当作 CI/CD 失败。对于 `git-mirror sync|flush --confirm`,先看 job progress 和 timing 摘要,再用对应 `git-mirror status` 确认 `pendingFlush`、local/github refs 和 `githubInSync`;node-scoped `flush` 的 progress 若出现 `partialSuccess=push-succeeded-fetch-failed`,先看是否同时有 `partialSuccessRecovered=true` 和 `postPushRecovery`,未恢复时再按 `nextCommand` 执行同 node/lane 的 `sync --confirm --wait` 刷新 mirror-stage cache。 diff --git a/docs/reference/observability.md b/docs/reference/observability.md index 0de197b1..3626d789 100644 --- a/docs/reference/observability.md +++ b/docs/reference/observability.md @@ -25,7 +25,7 @@ Web/Workbench trace、Web 哨兵和 `web-probe observe` 的人工判定入口以 ## CLI Logs -异步 job 的 stdout 和 stderr 位于 `.state/jobs/`。`job list` 默认只返回最新 50 条摘要,并为已知异步工作流返回轻量 `progress.summary`;`job status` 会返回结构化 `progress` 与有限尾部,避免输出爆炸,同时保留完整日志文件路径便于继续排查。实现必须只读取日志尾部字节,不得先把完整 job 日志读入 CLI 内存;长时命令的阶段、关键对象名和下一步查询命令应优先沉淀到 `progress`,不能要求调用者先阅读完整日志才能知道是否卡在提交、构建、发布或观测阶段。 +异步 job 的 stdout 和 stderr 位于 `.state/jobs/`。`job|jobs list` 默认只返回最新 50 条摘要,并为已知异步工作流返回轻量 `progress.summary`;`job status ` 与兼容别名 `jobs get/read ` 会返回结构化 `progress` 与有限尾部,避免输出爆炸,同时保留完整日志文件路径便于继续排查。实现必须只读取日志尾部字节,不得先把完整 job 日志读入 CLI 内存;长时命令的阶段、关键对象名和下一步查询命令应优先沉淀到 `progress`,不能要求调用者先阅读完整日志才能知道是否卡在提交、构建、发布或观测阶段。 ## Service Logs diff --git a/docs/reference/provider-gateway.md b/docs/reference/provider-gateway.md index edf6ac27..23332c4f 100644 --- a/docs/reference/provider-gateway.md +++ b/docs/reference/provider-gateway.md @@ -122,7 +122,7 @@ provider-gateway 可以提供 egress HTTP CONNECT 代理,用于让 Code Queue egress proxy 的长期边界是“统一 provider 通道,不引入第二控制面”。backend-core 只接受在线 provider socket 上的 `egress_tcp_*` 消息,并在该 socket 关闭时销毁全部对应 TCP relay;provider-gateway 只维护本地 HTTP proxy 与 WebSocket 消息映射,不保存业务状态,不参与任务调度、统计或节点注册以外的控制面。执行容器、用户服务、Pipeline runner 和 provider-side deploy build 不允许直接连接 backend-core provider ingress,也不允许携带 provider token 自行注册;需要出网时只能连接同节点 provider-gateway 的私有 proxy endpoint。当前 k3s/k8s Code Queue 通过 `d601-provider-egress-proxy` 或 `g14-provider-egress-proxy` Kubernetes Service 连接同节点 provider-gateway egress endpoint,这是 Pod 内的出网入口,不是业务 HTTP 代理入口,也不能替代 Kubernetes API service proxy。部署构建同样不得新建 SSH SOCKS、公网 master proxy 或未登记的宿主全局代理;构建脚本默认只能把 provider-gateway WS egress 作为短生命周期环境变量和 Docker build-arg 注入,并配合目标节点本地 BuildKit/image cache 避免重复下载大依赖层。G14 的节点本地 VPN proxy 是已登记的基础设施 bootstrap 例外,只允许按 `docs/reference/g14.md` 用于 G14 host-side 镜像构建、cache prewarm 或恢复下载;k3s runtime Pod 仍必须使用 `g14-provider-egress-proxy` 和 `g14-tcp-egress-gateway`。 -egress tunnel 必须有生命周期边界:provider-gateway 发出 `egress_tcp_open` 后如果主 server 未在 `openTimeoutMs` 内返回 `egress_tcp_opened` 或 close,必须主动关闭本地 client 并向 core 发送 `egress_tcp_close`;provider-gateway 与 backend-core 都必须对长时间无数据的 relay 执行 idle 清理,避免 provider WebSocket 抖动、TCP connect 卡住或上游未关闭时留下 stale tunnel。排障时如果 `activeTunnels` 持续增长、`pendingTunnels` 非零或 `oldestTunnelAgeMs` 明显超过业务请求耗时,应先看 provider-gateway 与 backend-core egress 清理日志,再判断 Code Queue、PostgreSQL 或 OA Event Flow 本身是否慢。 +egress tunnel 必须有生命周期边界:provider-gateway 发出 `egress_tcp_open` 后如果主 server 未在 `openTimeoutMs` 内返回 `egress_tcp_opened` 或 close,必须主动关闭本地 client 并向 core 发送 `egress_tcp_close`;provider-gateway 与 backend-core 都必须对长时间无数据的 relay 执行 idle 清理,避免 provider WebSocket 抖动、TCP connect 卡住或上游未关闭时留下 stale tunnel。生命周期日志必须覆盖 provider proxy open request/opened/remote close/local close、backend-core open requested/connect start/connected/connect failed/connect timeout/read-write failed,并包含 `providerId`、`connectionId`、目标 host/port、method、ageMs、pendingBytes 和低基数 `failureKind`;不得记录 payload、DSN 密码、Authorization、Secret 或 token。排障时如果 `activeTunnels` 持续增长、`pendingTunnels` 非零或 `oldestTunnelAgeMs` 明显超过业务请求耗时,应先看 provider-gateway 与 backend-core egress 清理日志,再判断 Code Queue、PostgreSQL 或 OA Event Flow 本身是否慢。 故障语义必须显式,不允许静默 fallback。provider-gateway 到 backend-core 的 WebSocket 未连接时,本地 proxy 必须返回 503;执行容器不能自动绕过到 D601 本地直连公网、外部公共代理或主 server 公网 HTTP 端口。`NO_PROXY` 只用于 PostgreSQL、OA Event Flow、ClaudeQQ、frontend/backend-core 内网代理、provider-gateway health 等明确内网链路,不能把 GitHub、npm registry 等外部目标加入绕过列表。`hyueapi.com` 与 MiniMax judge 上游 `api.minimaxi.com` 是明确的模型 API 例外:前者会拒绝 provider-gateway egress proxy 出口,后者在 judge 高频短请求上容易受 provider egress 抖动影响导致任务误重试;Code Queue 必须用 `CODE_QUEUE_EGRESS_PROXY_NO_PROXY` / `NO_PROXY` 将 `hyueapi.com,.hyueapi.com,api.minimaxi.com,.minimaxi.com` 配成直连,其它模型 API 仍不得默认绕过 proxy。验收必须同时证明 provider-gateway labels、业务服务 `/health` 和执行容器内 `curl -I https://...` 都走同一 proxy path,hyueapi/MiniMax 例外则以 Code Queue `/health.egressProxy.noProxy`、MiniMax judge 探测和目标任务成功完成作为证据。 diff --git a/project-management/PJ2026-01/specs/PJ2026-010605-observability-monitoring.md b/project-management/PJ2026-01/specs/PJ2026-010605-observability-monitoring.md index 5d93d1f4..11e954e9 100644 --- a/project-management/PJ2026-01/specs/PJ2026-010605-observability-monitoring.md +++ b/project-management/PJ2026-01/specs/PJ2026-010605-observability-monitoring.md @@ -19,6 +19,7 @@ | 短名 | 运维监控 | | 层级 | L2 课题 | | 状态 | 已生效 | +| 实现引用版本 | draft-2026-06-26-p8-egress-job-friction | | 需求规格模板 | [ISO/IEC/IEEE 29148 需求规格模板](../../templates/iso-iec-ieee-29148-requirements-spec-template.md) | | 上级规格 | [PJ2026-0106 平台运维](PJ2026-0106-platform-ops.md) | | 规格治理索引 | [规格治理](spec-governance.md) | @@ -42,6 +43,7 @@ - Web/API/AgentRun/HWPOD/Harness/用户管理等服务的运行面健康、资源状态、公开入口健康和用户可感知性能观测。 - 发布后 runtime readiness、resource usage、error rate、queue depth、target availability 和 alert 状态摘要。 - AgentRun rolling recovery 与 cancel lifecycle 的 active runner、stale lease、cancel request、cancel delivery、terminal report retry、reconciler backlog、projection lag、late-write fencing 和 Job TTL cleanup 观测。 +- provider-gateway egress TCP tunnel、backend-core egress bridge、受控 relay、异步 job 和 server lifecycle 命令的阶段化可见性。 - 监控和 trace 数据的受控查询、低噪声摘要、失败归因和敏感输出约束。 ### 2.3 范围外 @@ -62,6 +64,8 @@ | Trace Explorer URL 模板 | 由 YAML 声明的 trace backend 打开链接模板,例如包含 `{trace_id}` 占位符的 Tempo、Jaeger 或 Grafana Explore URL;前端只替换合法 OTel trace id,不拼接其他用户输入。 | | span | 一次请求在 admission、billing、AgentRun dispatch、projection、read/replay 等阶段中的一个可观察工作单元。 | | trace context | W3C `traceparent`/`baggage` 等跨进程传播上下文;业务 `traceId/sessionId/runId/turnId` 只能作为 span attribute,不能替代标准 trace context。 | +| provider egress TCP tunnel | target node 上 provider-gateway egress proxy 通过 backend-core WebSocket 控制通道请求 master 侧打开 TCP 连接,并在两端转发字节流的出站链路。 | +| async job 摘要 | `server start|stop|rebuild|restart` 等长操作创建 `.state/jobs` 后默认返回的低噪声身份、状态、日志路径和后续查询命令,不默认展开完整命令或 stdout/stderr。 | | 用户可感知性能 | 用户在 Web、CLI 或 API 入口中直接感受到的等待时间、加载时间、首个可读内容出现时间和完整可用状态。 | | YAML-first 运维监控 | Prometheus scrape、rule、summary 和 target 归属先进入 UniDesk YAML,再通过受控 CLI 渲染和验证的运维形态。 | | rolling recovery signal | 用于判断 manager/cloud-api rolling 后同一 run/command/runnerJob 是否恢复控制权的低基数 metrics、span attribute 和 CLI 摘要字段。 | @@ -94,6 +98,7 @@ | PJ2026-01060506 | Metrics接入 | 本规格 6.6 | metrics endpoint、scrape target 和 label 口径 | 各 L1 服务健康指标 | Prometheus 查询 | | PJ2026-01060507 | Rolling恢复观测 | 本规格 6.7 | active runner、reconciler backlog、terminal retry、projection lag 和不可恢复 blocker 的查询口径 | AgentRun核心、HWLAB接入、Workbench唯一投影、发布Lane | rolling 发布判定、故障收口 | | PJ2026-01060508 | Web哨兵 | [PJ2026-01060508 Web哨兵](PJ2026-01060508-web-probe-sentinel.md) | `web-probe observe` 的 YAML-first 生产哨兵、持续 canary、报告视图、dashboard 分析工作台和发布恢复验证 | Web工作台、Workbench唯一投影、YAML运维、公开入口、发布流水 | 平台值守、CI/CD targetValidation、用户入口恢复判定 | +| PJ2026-01060509 | 出站诊断 | 本规格 6.9 | provider egress TCP tunnel、受控 relay、async job/server lifecycle 的阶段化日志、低噪声摘要和诊断入口 | provider-gateway、backend-core、YAML运维、发布流水 | D601/G14 runtime 出站排障、OpenFGA/Postgres/OA Event Flow 链路验收 | ## 6. 原子需求 @@ -274,6 +279,77 @@ Web 哨兵的首条生产 canary 应覆盖 `workbench-dsflash-go-tool-call-10x` 发布流水可调用 Web 哨兵 maintenance start/stop。maintenance start 暂停告警但继续采样;maintenance stop 触发同一 observe CLI quick verify 和 analyze。CI/CD targetValidation 对 HWLAB Web 恢复的判断必须包含 quick verify、analysis report SHA、finding 摘要、public origin、scenario id 和 observer/run id;Argo green 但哨兵 red/timeout 时不得判定发布恢复成功。 +### 6.9 OPS-MON-REQ-009 Provider 出站与 Job 可见性 + +| 编号 | 短名 | 主责模块 | 关联模块 | +| --- | --- | --- | --- | +| OPS-MON-REQ-009 | 出站诊断 | PJ2026-01060509 出站诊断 | [YAML运维](PJ2026-010603-yaml-first-ops.md)、[公开入口](PJ2026-010604-public-entry.md)、[发布流水](PJ2026-010601-controlled-release.md) | + +运维监控应为 provider-gateway egress TCP tunnel、backend-core egress bridge、受控 relay 和 CLI async job 提供阶段化可见性,使 D601/G14 等节点的 PostgreSQL、OpenFGA、OA Event Flow、Git mirror 或外部下载问题可以被定位到 provider 本地 proxy、backend-core open、master 侧 TCP connect、relay ACL、目标服务或调用方读写阶段。 + +provider egress tunnel 生命周期至少应记录:provider proxy 收到 client 请求、发出 `egress_tcp_open`、core 确认 opened、core connect failed/timeout、remote close、本地 open timeout、client close、pending buffer 超限和 idle close。日志和受控摘要必须包含 `providerId`、`connectionId`、目标 `host`/`port`、`method`、`ageMs`、`pendingBytes` 和低基数 failureKind;不得记录 TCP payload、PostgreSQL DSN 密码、HTTP Authorization、Secret、token 或完整业务正文。 + +`server start|stop|rebuild|restart` 等创建 `.state/jobs` 的命令默认必须返回 async job 摘要,而不是完整 JSON payload、完整 command、完整 runtime env 或 stdout/stderr。摘要至少包含 job id、目标 service、当前状态、stdout/stderr 路径、`job status` 查询命令和必要的 `server logs`/`server status` 后续命令。完整结构只能通过 `--full`/`--raw` 或 `job status ` 渐进展开。`job` 与常见自然输入 `jobs`、`status|get|read` 应指向同一受控状态入口,避免调用者因为别名缺失退回手工文件读取。 + +#### 6.9.1 目标架构图 + +```mermaid +flowchart LR + Pod[目标 runtime Pod] --> Proxy[provider-gateway egress proxy] + Proxy -- egress_tcp_open/data/close --> Core[backend-core provider WS] + Core --> Relay[受控 relay 或目标 TCP 服务] + Relay --> Target[Postgres / OA Event Flow / external endpoint] + Core --> Logs[backend-core logs] + Proxy --> ProviderLogs[provider-gateway logs] + CLI[UniDesk CLI job/status/triage] --> Logs + CLI --> ProviderLogs +``` + +#### 6.9.2 数据流图 + +```mermaid +flowchart TD + Request[CONNECT 或 HTTP proxy 请求] --> LocalEvent[provider proxy stage event] + LocalEvent --> OpenMsg[egress_tcp_open] + OpenMsg --> CoreEvent[backend-core open/connect stage event] + CoreEvent --> Outcome{connect outcome} + Outcome -->|opened| DataFlow[egress_tcp_data 双向转发] + Outcome -->|failed/timeout| CloseMsg[egress_tcp_close with failureKind] + DataFlow --> CloseMsg + CloseMsg --> Summary[provider triage / job status / logs 摘要] +``` + +#### 6.9.3 关键时序图 + +```mermaid +sequenceDiagram + participant C as Runtime client + participant P as provider egress proxy + participant B as backend-core + participant R as Relay/Target + participant CLI as UniDesk CLI + + C->>P: CONNECT host:port + P->>P: log egress_proxy_tunnel_open_request + P->>B: egress_tcp_open(providerId, connectionId, host, port) + B->>B: log egress_tcp_open_requested/connect_start + B->>R: TCP connect + alt connect ok + B->>P: egress_tcp_opened + P->>P: log egress_proxy_tunnel_opened + P-->>C: 200 Connection Established + else connect failed or timeout + B->>P: egress_tcp_close(error, failureKind) + P->>P: log remote close + P-->>C: connection close + end + CLI->>CLI: job status / logs / triage show bounded stage summary +``` + +#### 6.9.4 代码引用规则 + +本需求范围内新增或修改的源码文件头部必须标注 `SPEC: PJ2026-01060509 出站诊断 draft-2026-06-26-p8-egress-job-friction`。纯配置、生成 manifest 或无法承载注释头的文件必须能从 owning CLI、渲染器或运行日志追溯到本 SPEC。 + ## 7. 过程控制 本规格不单独索引过程 issue;跨 L1 的内测、灰度和阶段活动索引统一保留在 [PJ2026-01 HWLAB 总规格](PJ2026-01-HWLAB.md) 的 `7. 过程控制`。 diff --git a/scripts/cli.ts b/scripts/cli.ts index b00fd020..1ad7eaac 100644 --- a/scripts/cli.ts +++ b/scripts/cli.ts @@ -1,8 +1,10 @@ +// SPEC: PJ2026-01060509 出站诊断 draft-2026-06-26-p8-egress-job-friction. +// UniDesk CLI dispatcher with bounded server lifecycle and job drill-down output. import { readConfig } from "./src/config"; import { debugDispatch, debugHealth, debugSshPool, debugTask, isDebugDispatchCommand, type DebugDispatchCommand } from "./src/debug"; import { isRebuildableService, isRestartableService, rebuildService, restartService, stackLogs, stackStatus, startStack, stopStack, unsupportedRebuildService, unsupportedRestartService } from "./src/docker"; import { emitError, emitJson, emitText, isRenderedCliResult } from "./src/output"; -import { cancelJob, jobWithTail, listJobs, listJobsSummary, readJob, renderJobStatusSummary, runJob } from "./src/jobs"; +import { cancelJob, jobWithTail, listJobs, listJobsSummary, readJob, renderJobLaunchSummary, renderJobStatusSummary, runJob } from "./src/jobs"; import { checkHelp, parseCheckOptions, runChecks, runRecoveryGuardrailsCheck } from "./src/check"; import { runSsh } from "./src/ssh"; import { autoRemoteCiPublishUserServiceDryRunPlan, extractRemoteCliOptions, runRemoteCli } from "./src/remote"; @@ -193,6 +195,23 @@ function latestJobId(): string { return jobs[0].id; } +function wantsFullDisclosure(): boolean { + return args.includes("--full") || args.includes("--raw"); +} + +function emitServerLifecycleResult(result: unknown, ok = true): void { + if (!wantsFullDisclosure()) { + const rendered = renderJobLaunchSummary(commandName, result); + if (rendered !== null) { + emitText(rendered.renderedText, rendered.command || commandName); + if (!rendered.ok) process.exitCode = 1; + return; + } + } + emitJson(commandName, result, ok); + if (!ok) process.exitCode = 1; +} + async function main(): Promise { if (remoteOptions.host !== null) { process.exitCode = await runRemoteCli(remoteOptions, readConfig()); @@ -444,12 +463,11 @@ async function main(): Promise { if (sub === "start") { const result = startStack(config); const ok = (result as { ok?: unknown }).ok !== false; - emitJson(commandName, result, ok); - if (!ok) process.exitCode = 1; + emitServerLifecycleResult(result, ok); return; } if (sub === "stop") { - emitJson(commandName, stopStack(config)); + emitServerLifecycleResult(stopStack(config)); return; } if (sub === "status") { @@ -484,7 +502,7 @@ async function main(): Promise { process.exitCode = 1; return; } - emitJson(commandName, rebuildService(config, third)); + emitServerLifecycleResult(rebuildService(config, third)); return; } if (sub === "restart") { @@ -494,7 +512,7 @@ async function main(): Promise { process.exitCode = 1; return; } - emitJson(commandName, restartService(config, third)); + emitServerLifecycleResult(restartService(config, third)); return; } } @@ -564,22 +582,35 @@ async function main(): Promise { return; } - if (top === "job") { - if (sub === "list") { + if (top === "job" || top === "jobs") { + const jobSub = sub === "get" || sub === "read" ? "status" : sub; + if (jobSub === "list" || jobSub === undefined || isHelpToken(jobSub)) { + if (jobSub === undefined || isHelpToken(jobSub)) { + emitJson(commandName, { + command: "job|jobs list|status|get|read|cancel", + aliases: ["jobs list", "jobs get ", "jobs read "], + usage: [ + "bun scripts/cli.ts job list [--limit N] [--include-command]", + "bun scripts/cli.ts job status [--tail-bytes N] [--full|--raw]", + "bun scripts/cli.ts jobs get ", + ], + }); + return; + } emitJson(commandName, listJobsSummary({ limit: boundedNumberOption("--limit", 50, 500), includeCommand: args.includes("--include-command") })); return; } - if (sub === "status") { + if (jobSub === "status") { const id = third === "latest" || third === undefined ? latestJobId() : third; const job = jobWithTail(readJob(id), boundedNumberOption("--tail-bytes", 12000, 500_000)); - if (args.includes("--full") || args.includes("--raw")) { + if (wantsFullDisclosure()) { emitJson(commandName, { job }); return; } emitText(renderJobStatusSummary(job).renderedText, commandName); return; } - if (sub === "cancel") { + if (jobSub === "cancel") { if (!third) throw new Error("job cancel requires job id"); emitJson(commandName, cancelJob(third)); return; diff --git a/scripts/src/docker.ts b/scripts/src/docker.ts index 9dfeb1c7..19bcc5c5 100644 --- a/scripts/src/docker.ts +++ b/scripts/src/docker.ts @@ -1,3 +1,5 @@ +// SPEC: PJ2026-01060509 出站诊断 draft-2026-06-26-p8-egress-job-friction. +// Main-server Docker lifecycle helpers with bounded status probes and job summaries. import { chmodSync, existsSync, mkdirSync, readFileSync, readdirSync, statSync, writeFileSync } from "node:fs"; import { basename, dirname, join, resolve } from "node:path"; import { commandOk, runCommand, tailFile } from "./command"; @@ -608,7 +610,7 @@ export function dockerContainers(config: UniDeskConfig): ContainerStatus[] { "label=com.docker.compose.oneoff=False", "--format", "{{json .}}", - ], repoRoot); + ], repoRoot, { timeoutMs: 5000 }); if (result.exitCode !== 0 || result.stdout.trim().length === 0) return []; return result.stdout .split("\n") @@ -643,9 +645,9 @@ function dockerExecJson(container: string, path: string): unknown { "export url", "bun -e 'const url=process.env.url; fetch(url).then(async r=>{const text=await r.text(); console.log(JSON.stringify({ok:r.ok,status:r.status,body:text?JSON.parse(text):null})); process.exit(r.ok?0:1);}).catch(e=>{console.error(e); process.exit(1)})'", ].join("\n"); - const result = runCommand(["docker", "exec", container, "sh", "-lc", script], repoRoot); + const result = runCommand(["docker", "exec", container, "sh", "-lc", script], repoRoot, { timeoutMs: 5000 }); if (result.exitCode !== 0) { - return { ok: false, exitCode: result.exitCode, stdout: result.stdout.slice(-1200), stderr: result.stderr.slice(-1200) }; + return { ok: false, exitCode: result.exitCode, timedOut: result.timedOut, signal: result.signal, stdout: result.stdout.slice(-1200), stderr: result.stderr.slice(-1200) }; } try { return JSON.parse(result.stdout.trim()) as unknown; @@ -655,8 +657,8 @@ function dockerExecJson(container: string, path: string): unknown { } function dockerExec(config: UniDeskConfig, container: string, command: string[]): unknown { - const result = runCommand(["docker", "exec", container, ...command], repoRoot); - return { ok: result.exitCode === 0, exitCode: result.exitCode, stdout: result.stdout.slice(-1200), stderr: result.stderr.slice(-1200) }; + const result = runCommand(["docker", "exec", container, ...command], repoRoot, { timeoutMs: 5000 }); + return { ok: result.exitCode === 0, exitCode: result.exitCode, timedOut: result.timedOut, signal: result.signal, stdout: result.stdout.slice(-1200), stderr: result.stderr.slice(-1200) }; } export async function stackStatus(config: UniDeskConfig): Promise { diff --git a/scripts/src/help.ts b/scripts/src/help.ts index 36577439..f0aeb380 100644 --- a/scripts/src/help.ts +++ b/scripts/src/help.ts @@ -1,3 +1,5 @@ +// SPEC: PJ2026-01060509 出站诊断 draft-2026-06-26-p8-egress-job-friction. +// Static CLI help for job aliases and server lifecycle progressive disclosure. import { ghHelp, ghScopedHelp } from "./gh"; import { authBrokerHelp } from "./auth-broker"; import { platformDbHelp } from "./platform-db"; @@ -86,8 +88,8 @@ export function rootHelp(): unknown { { command: "codex steer-confirm --steer-id [--raw]", description: "Read-only lookup for a steerId in task trace so deliveryUnconfirmed can be resolved without resending the corrective prompt." }, { command: "codex interrupt|cancel ", description: "Request interrupt for a running Code Queue task, or cancel a queued/retry_wait task, through the same private proxy." }, { command: "codex queues [--full|--all] [--limit N] [--page N|--offset N]", description: "Read legacy Code Queue archive summaries. Legacy queue create/merge and move are frozen; use agentrun create/apply/get/cancel for new work." }, - { command: "job list [--limit N] [--include-command]", description: "List async jobs from .state/jobs with a bounded default page and progress summaries." }, - { command: "job status [--tail-bytes N]", description: "Show job state with a structured progress summary and bounded stdout/stderr tails." }, + { command: "job|jobs list [--limit N] [--include-command]", description: "List async jobs from .state/jobs with a bounded default page and progress summaries." }, + { command: "job status|get|read [--tail-bytes N]", description: "Show job state with a structured progress summary and bounded stdout/stderr tails." }, { command: "job cancel ", description: "Cancel a queued/running async job through the .state/jobs control entry and keep a terminal canceled record." }, { command: "debug health", description: "Probe internal core, nodes, system/Docker status, frontend, provider ingress, and public boundary." }, { command: "debug ssh-pool ", description: "Show bounded host.ssh.tcp-pool labels for one provider, including ready/claimed/desired/lastError." }, @@ -503,14 +505,15 @@ function codexHelp(): unknown { function jobHelp(): unknown { return { - command: "job list|status|cancel", + command: "job|jobs list|status|get|read|cancel", output: "json", usage: [ "bun scripts/cli.ts job list [--limit N] [--include-command]", "bun scripts/cli.ts job status [--tail-bytes N]", + "bun scripts/cli.ts jobs get [--tail-bytes N]", "bun scripts/cli.ts job cancel ", ], - description: "Inspect or cancel fire-and-forget job state from .state/jobs with structured progress summaries and bounded log tails.", + description: "Inspect or cancel fire-and-forget job state from .state/jobs with structured progress summaries and bounded log tails. `jobs get/read` are compatibility aliases for `job status`; server lifecycle commands return this drill-down by default and reserve full JSON for --full/--raw.", }; } diff --git a/scripts/src/jobs.ts b/scripts/src/jobs.ts index fd6f8d21..8614968e 100644 --- a/scripts/src/jobs.ts +++ b/scripts/src/jobs.ts @@ -1,3 +1,5 @@ +// SPEC: PJ2026-01060509 出站诊断 draft-2026-06-26-p8-egress-job-friction. +// Async job state, summaries, and lifecycle drill-down rendering for UniDesk CLI. import { spawn, spawnSync } from "node:child_process"; import { existsSync, mkdirSync, readFileSync, readdirSync, statSync, writeFileSync } from "node:fs"; import { join } from "node:path"; @@ -270,6 +272,50 @@ export function renderJobStatusSummary(job: ReturnType): Ren }; } +export function renderJobLaunchSummary(command: string, result: unknown): RenderedCliResult | null { + if (typeof result !== "object" || result === null || !("job" in result)) return null; + const record = result as Record; + const job = record.job as Partial | undefined; + if (typeof job !== "object" || job === null || typeof job.id !== "string") return null; + const service = typeof record.service === "string" ? record.service : "-"; + const runtimeEnv = typeof record.runtimeEnv === "object" && record.runtimeEnv !== null ? record.runtimeEnv as Record : {}; + const stdoutFile = typeof job.stdoutFile === "string" ? job.stdoutFile : "-"; + const stderrFile = typeof job.stderrFile === "string" ? job.stderrFile : "-"; + const statusCommand = `bun scripts/cli.ts job status ${job.id} --tail-bytes 12000`; + const fullCommand = command.includes("--full") || command.includes("--raw") ? null : `bun scripts/cli.ts ${command} --full`; + const lines = [ + "ASYNC_JOB", + jobStatusTable( + ["JOB", "NAME", "STATUS", "SERVICE", "RUNNER", "CREATED"], + [[job.id, job.name ?? "-", job.status ?? "-", service, job.runner ?? "-", job.createdAt ?? "-"]], + ), + "", + jobStatusTable( + ["STREAM", "PATH"], + [ + ["stdout", stdoutFile], + ["stderr", stderrFile], + ["runtimeEnv", typeof runtimeEnv.envFile === "string" ? runtimeEnv.envFile : "-"], + ["logDir", typeof runtimeEnv.logDir === "string" ? runtimeEnv.logDir : "-"], + ], + ), + "", + "NEXT", + ` status: ${statusCommand}`, + " latest: bun scripts/cli.ts job status latest --tail-bytes 12000", + " list: bun scripts/cli.ts job list --limit 20", + " logs: bun scripts/cli.ts server logs --tail-bytes 3000", + " server: bun scripts/cli.ts server status", + fullCommand === null ? null : ` full: ${fullCommand}`, + ].filter((line): line is string => line !== null); + return { + ok: job.status !== "failed", + command, + contentType: "text/plain", + renderedText: `${lines.join("\n")}\n`, + }; +} + function summarizeJobProgress(job: JobRecord, maxBytes = 96_000, tails?: { stdoutTail: string; stderrTail: string }): JobProgressSummary { const nowMs = Date.now(); const knownWorkflow = job.name === "hwlab_g14_v02_trigger_current"; diff --git a/src/components/backend-core/src/egress_tcp.rs b/src/components/backend-core/src/egress_tcp.rs index e23a49c4..b0fd41f6 100644 --- a/src/components/backend-core/src/egress_tcp.rs +++ b/src/components/backend-core/src/egress_tcp.rs @@ -1,3 +1,5 @@ +// SPEC: PJ2026-01060509 出站诊断 draft-2026-06-26-p8-egress-job-friction. +// Backend-core side of provider egress TCP bridge with stage logs. use std::sync::Arc; use std::time::Duration; @@ -43,6 +45,23 @@ fn send_egress_close(provider: &Arc, connection_id: &str, er let _ = provider.sender.send(Message::Text(message.to_string())); } +fn egress_failure_kind(error: &str) -> &'static str { + let normalized = error.to_ascii_lowercase(); + if normalized.contains("connect timeout") { + "connect-timeout" + } else if normalized.contains("timed out") || normalized.contains("timeout") { + "timeout" + } else if normalized.contains("invalid") { + "invalid-target" + } else if normalized.contains("refused") { + "connect-refused" + } else if normalized.contains("unreachable") { + "connect-unreachable" + } else { + "connect-failed" + } +} + pub async fn handle_egress_tcp_open( state: &Arc, provider: Arc, @@ -70,10 +89,42 @@ pub async fn handle_egress_tcp_open( || provider_id.is_empty() || connection_id.is_empty() { + state.log( + "warn", + "egress_tcp_open_rejected", + Some(json!({ + "providerId": provider_id.as_str(), + "connectionId": connection_id.as_str(), + "targetHost": host.as_str(), + "targetPort": port, + "failureKind": "invalid-target" + })), + ); send_egress_close(&provider, &connection_id, Some("invalid egress target")); return Ok(()); } + state.log( + "info", + "egress_tcp_open_requested", + Some(json!({ + "providerId": provider_id.as_str(), + "connectionId": connection_id.as_str(), + "targetHost": host.as_str(), + "targetPort": port + })), + ); close_egress_tcp_connection(state, &provider_id, &connection_id, None).await; + state.log( + "info", + "egress_tcp_connect_start", + Some(json!({ + "providerId": provider_id.as_str(), + "connectionId": connection_id.as_str(), + "targetHost": host.as_str(), + "targetPort": port, + "timeoutMs": 15_000 + })), + ); let stream = match tokio::time::timeout( Duration::from_secs(15), TcpStream::connect((host.as_str(), port as u16)), @@ -82,10 +133,35 @@ pub async fn handle_egress_tcp_open( { Ok(Ok(stream)) => stream, Ok(Err(error)) => { - send_egress_close(&provider, &connection_id, Some(&error.to_string())); + let error_text = error.to_string(); + state.log( + "warn", + "egress_tcp_connect_failed", + Some(json!({ + "providerId": provider_id.as_str(), + "connectionId": connection_id.as_str(), + "targetHost": host.as_str(), + "targetPort": port, + "failureKind": egress_failure_kind(&error_text), + "error": error_text.as_str() + })), + ); + send_egress_close(&provider, &connection_id, Some(&error_text)); return Ok(()); } Err(_) => { + state.log( + "warn", + "egress_tcp_connect_timeout", + Some(json!({ + "providerId": provider_id.as_str(), + "connectionId": connection_id.as_str(), + "targetHost": host.as_str(), + "targetPort": port, + "failureKind": "connect-timeout", + "timeoutMs": 15_000 + })), + ); send_egress_close( &provider, &connection_id, @@ -108,6 +184,16 @@ pub async fn handle_egress_tcp_open( let _ = provider.sender.send(Message::Text( json!({ "type": "egress_tcp_opened", "connectionId": connection_id }).to_string(), )); + state.log( + "info", + "egress_tcp_connected", + Some(json!({ + "providerId": provider_id.as_str(), + "connectionId": connection_id.as_str(), + "targetHost": host.as_str(), + "targetPort": port + })), + ); tokio::spawn({ let state = state.clone(); @@ -125,7 +211,18 @@ pub async fn handle_egress_tcp_open( let _ = provider.sender.send(Message::Text(json!({ "type": "egress_tcp_data", "connectionId": connection_id, "data": data, "encoding": "base64" }).to_string())); } Err(error) => { - send_egress_close(&provider, &connection_id, Some(&error.to_string())); + let error_text = error.to_string(); + state.log( + "warn", + "egress_tcp_read_failed", + Some(json!({ + "providerId": provider_id.as_str(), + "connectionId": connection_id.as_str(), + "failureKind": "read-failed", + "error": error_text.as_str() + })), + ); + send_egress_close(&provider, &connection_id, Some(&error_text)); break; } } @@ -144,7 +241,17 @@ pub async fn handle_egress_tcp_open( let connection_id = connection_id.clone(); async move { while let Some(data) = rx.recv().await { - if writer.write_all(&data).await.is_err() { + if let Err(error) = writer.write_all(&data).await { + state.log( + "warn", + "egress_tcp_write_failed", + Some(json!({ + "providerId": provider_id.as_str(), + "connectionId": connection_id.as_str(), + "failureKind": "write-failed", + "error": error.to_string() + })), + ); break; } } @@ -204,13 +311,27 @@ pub async fn close_egress_tcp_connection( state: &Arc, provider_id: &str, connection_id: &str, - _error: Option<&str>, + error: Option<&str>, ) { - state + let removed = state .active_egress_tcp .lock() .await - .remove(&egress_tcp_key(provider_id, connection_id)); + .remove(&egress_tcp_key(provider_id, connection_id)) + .is_some(); + if removed || error.is_some() { + state.log( + if error.is_some() { "warn" } else { "info" }, + "egress_tcp_connection_closed", + Some(json!({ + "providerId": provider_id, + "connectionId": connection_id, + "removed": removed, + "failureKind": error.map(egress_failure_kind).unwrap_or("normal-close"), + "error": error.unwrap_or("") + })), + ); + } } pub async fn close_egress_tcp_connections_for_provider(state: &Arc, provider_id: &str) { diff --git a/src/components/provider-gateway/src/egress-proxy.ts b/src/components/provider-gateway/src/egress-proxy.ts index 848ff07d..209d2b67 100644 --- a/src/components/provider-gateway/src/egress-proxy.ts +++ b/src/components/provider-gateway/src/egress-proxy.ts @@ -1,3 +1,5 @@ +// SPEC: PJ2026-01060509 出站诊断 draft-2026-06-26-p8-egress-job-friction. +// Provider-side egress TCP proxy with lifecycle logs for tunnel diagnosis. import { createServer, type Server, type Socket } from "node:net"; import type { CoreEgressTcpCloseMessage, @@ -33,6 +35,8 @@ interface Tunnel { closed: boolean; createdAt: number; lastActivityAt: number; + targetHost: string; + targetPort: number; openTimer: ReturnType | null; idleTimer: ReturnType | null; } @@ -116,6 +120,18 @@ function proxyUrlFor(host: string, port: number): string { return `http://${host}:${port}`; } +function egressFailureKind(error: string | undefined): string { + if (error === undefined || error.length === 0) return "normal-close"; + const normalized = error.toLowerCase(); + if (normalized.includes("open timeout")) return "open-timeout"; + if (normalized.includes("idle timeout")) return "idle-timeout"; + if (normalized.includes("pending buffer")) return "pending-buffer-exceeded"; + if (normalized.includes("not connected")) return "core-channel-not-connected"; + if (normalized.includes("timeout")) return "timeout"; + if (normalized.includes("closed")) return "closed"; + return "remote-or-socket-error"; +} + const tunnelOpenTimeoutMs = 15_000; const tunnelIdleTimeoutMs = 600_000; const maxPendingBytes = 4 * 1024 * 1024; @@ -163,14 +179,19 @@ export function startProviderEgressProxy(options: ProviderEgressProxyOptions): P tunnel.pendingBytes = 0; if (!tunnel.client.destroyed) tunnel.client.destroy(); if (notifyCore) send({ type: "egress_tcp_close", providerId: options.providerId, connectionId: tunnel.id, at: nowIso() }); - if (error !== undefined) { - options.logger("warn", "egress_proxy_tunnel_closed", { - connectionId: tunnel.id, - opened: tunnel.opened, - ageMs: Date.now() - tunnel.createdAt, - error, - }); - } + const event = { + providerId: options.providerId, + connectionId: tunnel.id, + method: tunnel.method, + targetHost: tunnel.targetHost, + targetPort: tunnel.targetPort, + opened: tunnel.opened, + ageMs: Date.now() - tunnel.createdAt, + pendingBytes: tunnel.pendingBytes, + failureKind: egressFailureKind(error), + ...(error === undefined ? {} : { error }), + }; + options.logger(error === undefined ? "info" : "warn", "egress_proxy_tunnel_closed", event); }; const closeTunnel = (id: string, error?: string): void => { @@ -206,6 +227,15 @@ export function startProviderEgressProxy(options: ProviderEgressProxyOptions): P tunnel.openTimer = null; } refreshTunnelIdle(tunnel); + options.logger("info", "egress_proxy_tunnel_opened", { + providerId: options.providerId, + connectionId: tunnel.id, + method: tunnel.method, + targetHost: tunnel.targetHost, + targetPort: tunnel.targetPort, + ageMs: Date.now() - tunnel.createdAt, + pendingBytes: tunnel.pendingBytes, + }); if (tunnel.method === "CONNECT") { tunnel.client.write("HTTP/1.1 200 Connection Established\r\nProxy-Agent: UniDesk-ProviderGateway\r\n\r\n"); } @@ -226,7 +256,14 @@ export function startProviderEgressProxy(options: ProviderEgressProxyOptions): P destroyTunnel(tunnel, false, message.error); } if (message.error !== undefined && message.error.length > 0) { - options.logger("warn", "egress_proxy_remote_close", { connectionId: message.connectionId, error: message.error }); + options.logger("warn", "egress_proxy_remote_close", { + providerId: options.providerId, + connectionId: message.connectionId, + targetHost: tunnel?.targetHost ?? null, + targetPort: tunnel?.targetPort ?? null, + failureKind: egressFailureKind(message.error), + error: message.error, + }); } return true; } @@ -295,10 +332,20 @@ export function startProviderEgressProxy(options: ProviderEgressProxyOptions): P closed: false, createdAt, lastActivityAt: createdAt, + targetHost: target.host, + targetPort: target.port, openTimer: null, idleTimer: null, }; tunnels.set(id, tunnel); + options.logger("info", "egress_proxy_tunnel_open_request", { + providerId: options.providerId, + connectionId: id, + method: parsed.method, + targetHost: target.host, + targetPort: target.port, + firstPayloadBytes: firstPayload === null ? 0 : firstPayload.byteLength, + }); client.on("data", (nextChunk) => { const nextBuffer = Buffer.isBuffer(nextChunk) ? nextChunk : Buffer.from(nextChunk); if (!tunnel.opened) { @@ -315,6 +362,14 @@ export function startProviderEgressProxy(options: ProviderEgressProxyOptions): P tunnels.delete(id); tunnel.closed = true; clearTunnelTimers(tunnel); + options.logger("warn", "egress_proxy_tunnel_open_not_sent", { + providerId: options.providerId, + connectionId: id, + method: parsed.method, + targetHost: target.host, + targetPort: target.port, + failureKind: "core-channel-not-connected", + }); fail("503 Service Unavailable", "provider-gateway core channel is not connected\n"); return; }