From 6017b4a117f62fb036293acafa32a38618d0a016 Mon Sep 17 00:00:00 2001 From: Codex Date: Fri, 26 Jun 2026 19:05:08 +0000 Subject: [PATCH] perf(egress-proxy): hostNetwork benchmark acceptance + single-active benchmark guard for #1077 Document the per-target egressProxy.hostNetwork YAML option and the real-deps-500m production-ready throughput acceptance profile in docs/reference/platform-infra.md, and add a single-active guard to the k3s-build-benchmark start path so a new run deletes any prior target+profile Job before applying the new one (preventing concurrent runs from making status report the wrong Job, observed during #1077 regression on D518). Basis: merges PR #1065 (real k3s dependency proxy benchmark + hostNetwork egress proxy) as the production-ready scaffold; validated on D601 and D518 with real-deps-500m (both succeeded, apk=1231/npm=850/go=693/real_deps=2774 MiB, proxyserver cumulative 2.4 GiB / 10 GiB). --- docs/reference/platform-infra.md | 4 ++++ .../src/platform-infra-k3s-build-benchmark.ts | 17 +++++++++++++++-- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/docs/reference/platform-infra.md b/docs/reference/platform-infra.md index 849d1271..009feaed 100644 --- a/docs/reference/platform-infra.md +++ b/docs/reference/platform-infra.md @@ -167,6 +167,10 @@ When target-level `egressProxy.enabled=true`, the D601 target renders an in-clus `platform-infra egress-proxy traffic --target --sample-seconds ` is the proxyserver-side observation entry. It reads the sing-box Clash API through the proxy Pod loopback, reports current per-client rate plus bounded-window cumulative bytes, and includes proxy process cumulative bytes when sing-box reports them. Use this together with k3s CI/CD build benchmarks when proving proxy acceleration or diagnosing whether a workload actually traverses the proxy; client-side timings alone are not enough evidence. 
+The egress proxy Deployment may opt into `hostNetwork: true` per target via `config/platform-infra/sub2api.yaml` `targets[].egressProxy.hostNetwork`. When enabled, the manifest renders `hostNetwork: true`, `dnsPolicy: ClusterFirstWithHostNet`, and a RollingUpdate strategy of `maxSurge=0`/`maxUnavailable=1` so the sing-box client bypasses the pod overlay and connects to the master upstream directly from the node network; this is the durable fix for a target whose pod-overlay path to the upstream is the throughput bottleneck. It is a per-target YAML decision, not a D601-only default: a target whose pod overlay is already fast enough must keep `hostNetwork: false`, and the `no-host-network` policy check only permits `hostNetwork: true` on the single YAML-declared egress proxy Deployment for a target whose `egressProxy.hostNetwork=true`. Do not generalize one target's hostNetwork experiment to other nodes, and do not leave a one-off `kubectl patch` as the final state; promote or demote hostNetwork only by editing the target YAML and running `platform-infra sub2api apply --target <target>`. + +`platform-infra egress-proxy k3s-build-benchmark --targets <targets> --profile real-deps-500m` is the production-ready egress proxy throughput acceptance entry. The `real-deps-500m` profile in `config/platform-infra/egress-proxy-benchmarks.yaml` is the only acceptance profile: it renders one Job per target whose kubelet/containerd pulls remote `alpine`, `node` and `golang` images with `imagePullPolicy: Always`, then runs Pod-internal `apk add`, `npm install` and `go mod download` stages through the YAML-declared proxy env. Acceptance requires `STATE=succeeded`, `REAL_DEPS >= 500 MiB` (the profile's `realDeps.minProxyMiB`), image-pull plus apk/npm/go evidence, and proxyserver-observed cumulative traffic above the profile minimum. Cloudflare synthetic downloads and curl-only probes are bypass diagnostics, never acceptance evidence. 
Status/logs/traffic are short-polled; a started benchmark is fire-and-forget and must be `cleanup`-ed when it stalls or after acceptance to release k3s resources. D601 and D518 must both pass the same profile: a single node passing does not close a cross-node proxy issue, and an optimization on one target must not regress the other. + `platform-infra sub2api validate --target D601 --full` must prove the proxy Deployment/Service is ready and that an app pod can complete the YAML-declared health probe through the proxy. This target-level injection does not by itself bind manually created Sub2API accounts to that proxy; account tests and account-specific upstream transports still need a YAML-declared `manualAccounts.protected[].proxyBinding` when the account must avoid direct egress. Proxy credentials, subscription contents, and generated proxy configs are Secret material and must not be printed. If a direct D601-to-upstream TLS/SNI path is reset, do not leave a one-off plain HTTP CONNECT or JS proxy as the durable fix; use a mature encrypted proxy source, currently master `shadowsocks-rust` plus D601 `sing-box`, through YAML/compose. Adding, removing, exposing, validating, and configuring local Codex consumers are daily operations covered by `$unidesk-sub2api`. The development rule is that ordinary pool membership changes stay YAML-only and do not add code or CI/CD. Code changes are only appropriate when UniDesk needs to render or validate a Sub2API capability that already exists upstream, such as account-level WebSocket mode or per-account upstream User-Agent. If Sub2API itself does not support a desired behavior, do not magic-patch it through UniDesk scripts, Kubernetes hotfixes, local forks, or hidden compatibility paths; either leave the behavior unsupported or pursue it upstream as an explicit Sub2API feature. 
diff --git a/scripts/src/platform-infra-k3s-build-benchmark.ts b/scripts/src/platform-infra-k3s-build-benchmark.ts index b7ff593e..c4530ea0 100644 --- a/scripts/src/platform-infra-k3s-build-benchmark.ts +++ b/scripts/src/platform-infra-k3s-build-benchmark.ts @@ -241,7 +241,10 @@ function startBenchmarks(plans: readonly TargetPlan[], options: K3sBuildBenchmar function startRowDetail(row: TargetPlan & { started: boolean; state: string; jobName: string; runId: string; result: unknown }): string { if (!row.ok) return `${row.blocker}: ${row.detail}`; - if (row.started) return "status/logs/traffic"; + if (row.started) { + const replaced = record(row.result).replaced; + return replaced === undefined || replaced === 0 ? "status/logs/traffic" : `status/logs/traffic replaced=${replaced}`; + } const result = record(row.result); const stderr = text(result.stderrPreview, ""); const stdout = text(result.stdoutPreview, ""); @@ -807,8 +810,18 @@ tmp="$(mktemp -d)" trap 'rm -rf "$tmp"' EXIT manifest="$tmp/k3s-build-benchmark.yaml" printf '%s' '${encoded}' | base64 -d > "$manifest" +selector="app.kubernetes.io/name=${BENCHMARK_APP},unidesk.ai/benchmark-profile=${profile.id},unidesk.ai/runtime-node=${target.id.toLowerCase()}" +# Single-active guard: delete any prior benchmark Jobs for this target+profile before applying the new +# one, so a new run cannot race with a stale/slow prior run and make status report the wrong Job. +# The benchmark is fire-and-forget and uses emptyDir, so deleting the prior Job is the safe way to keep +# exactly one active run per target. 
+replaced="$(kubectl -n ${shQuote(target.namespace)} get jobs -l "$selector" --no-headers 2>/dev/null | wc -l | tr -d ' ')" +[ -z "$replaced" ] && replaced=0 +if [ "$replaced" != "0" ]; then + kubectl -n ${shQuote(target.namespace)} delete jobs -l "$selector" --ignore-not-found >/dev/null 2>&1 +fi kubectl apply -f "$manifest" >/dev/null -printf '{"ok":true,"jobName":"%s","namespace":"%s","target":"%s","runId":"%s","profile":"%s"}\\n' ${shQuote(jobName)} ${shQuote(target.namespace)} ${shQuote(target.id)} ${shQuote(runId)} ${shQuote(profile.id)} +printf '{"ok":true,"jobName":"%s","namespace":"%s","target":"%s","runId":"%s","profile":"%s","replaced":%s}\\n' ${shQuote(jobName)} ${shQuote(target.namespace)} ${shQuote(target.id)} ${shQuote(runId)} ${shQuote(profile.id)} "$replaced" `; }