From 6017b4a117f62fb036293acafa32a38618d0a016 Mon Sep 17 00:00:00 2001
From: Codex <ops@pikapython.com>
Date: Fri, 26 Jun 2026 19:05:08 +0000
Subject: [PATCH] perf(egress-proxy): hostNetwork benchmark acceptance +
 single-active benchmark guard for #1077

Document the per-target egressProxy.hostNetwork YAML option and the
real-deps-500m production-ready throughput acceptance profile in
docs/reference/platform-infra.md, and add a single-active guard to the
k3s-build-benchmark start path so a new run deletes any prior target+profile
Job before applying the new one (preventing concurrent runs from making
status report the wrong Job, observed during #1077 regression on D518).

Basis: merges PR #1065 (real k3s dependency proxy benchmark + hostNetwork
egress proxy) as the production-ready scaffold; validated on D601 and D518
with real-deps-500m (both succeeded, apk=1231/npm=850/go=693/real_deps=2774
MiB, proxyserver cumulative 2.4 GiB / 10 GiB).
---
 docs/reference/platform-infra.md                |  4 ++++
 .../src/platform-infra-k3s-build-benchmark.ts   | 17 +++++++++++++++--
 2 files changed, 19 insertions(+), 2 deletions(-)
diff --git a/docs/reference/platform-infra.md b/docs/reference/platform-infra.md
index 849d1271..009feaed 100644
--- a/docs/reference/platform-infra.md
+++ b/docs/reference/platform-infra.md
@@ -167,6 +167,10 @@ When target-level `egressProxy.enabled=true`, the D601 target renders an in-clus
 
 `platform-infra egress-proxy traffic --target <id> --sample-seconds <n>` is the proxyserver-side observation entry. It reads the sing-box Clash API through the proxy Pod loopback, reports current per-client rate plus bounded-window cumulative bytes, and includes proxy process cumulative bytes when sing-box reports them. Use this together with k3s CI/CD build benchmarks when proving proxy acceleration or diagnosing whether a workload actually traverses the proxy; client-side timings alone are not enough evidence.
 
+The egress proxy Deployment may opt into `hostNetwork: true` per target via `config/platform-infra/sub2api.yaml` `targets[].egressProxy.hostNetwork`. When enabled, the manifest renders `hostNetwork: true`, `dnsPolicy: ClusterFirstWithHostNet`, and a RollingUpdate strategy of `maxSurge=0`/`maxUnavailable=1` so the sing-box client bypasses the pod overlay and connects to the master upstream directly from the node network; this is the durable fix for a target whose pod-overlay path to the upstream is the throughput bottleneck. It is a per-target YAML decision, not a D601-only default: a target whose pod overlay is already fast enough must keep `hostNetwork: false`, and the `no-host-network` policy check only permits `hostNetwork: true` on the single YAML-declared egress proxy Deployment for a target whose `egressProxy.hostNetwork=true`. Do not generalize one target's hostNetwork experiment to other nodes, and do not leave a one-off `kubectl patch` as the final state; promote or demote hostNetwork only by editing the target YAML and running `platform-infra sub2api apply --target <id>`.
+
+`platform-infra egress-proxy k3s-build-benchmark --targets <ids> --profile real-deps-500m` is the production-ready egress proxy throughput acceptance entry. The `real-deps-500m` profile in `config/platform-infra/egress-proxy-benchmarks.yaml` is the only acceptance profile: it renders one Job per target whose kubelet/containerd pulls remote `alpine`, `node` and `golang` images with `imagePullPolicy: Always`, then runs Pod-internal `apk add`, `npm install` and `go mod download` stages through the YAML-declared proxy env. Acceptance requires `STATE=succeeded`, `REAL_DEPS >= 500 MiB` (the profile's `realDeps.minProxyMiB`), image-pull plus apk/npm/go evidence, and proxyserver-observed cumulative traffic above the profile minimum. Cloudflare synthetic downloads and curl-only probes are bypass diagnostics, never acceptance evidence. Status/logs/traffic are short-polled; a started benchmark is fire-and-forget and must be cleaned up via `cleanup` when it stalls or after acceptance to release k3s resources. D601 and D518 must both pass the same profile: a single node passing does not close a cross-node proxy issue, and an optimization on one target must not regress the other.
+
 `platform-infra sub2api validate --target D601 --full` must prove the proxy Deployment/Service is ready and that an app pod can complete the YAML-declared health probe through the proxy. This target-level injection does not by itself bind manually created Sub2API accounts to that proxy; account tests and account-specific upstream transports still need a YAML-declared `manualAccounts.protected[].proxyBinding` when the account must avoid direct egress. Proxy credentials, subscription contents, and generated proxy configs are Secret material and must not be printed. If a direct D601-to-upstream TLS/SNI path is reset, do not leave a one-off plain HTTP CONNECT or JS proxy as the durable fix; use a mature encrypted proxy source, currently master `shadowsocks-rust` plus D601 `sing-box`, through YAML/compose.
 
 Adding, removing, exposing, validating, and configuring local Codex consumers are daily operations covered by `$unidesk-sub2api`. The development rule is that ordinary pool membership changes stay YAML-only and do not add code or CI/CD. Code changes are only appropriate when UniDesk needs to render or validate a Sub2API capability that already exists upstream, such as account-level WebSocket mode or per-account upstream User-Agent. If Sub2API itself does not support a desired behavior, do not magic-patch it through UniDesk scripts, Kubernetes hotfixes, local forks, or hidden compatibility paths; either leave the behavior unsupported or pursue it upstream as an explicit Sub2API feature.
diff --git a/scripts/src/platform-infra-k3s-build-benchmark.ts b/scripts/src/platform-infra-k3s-build-benchmark.ts
index b7ff593e..c4530ea0 100644
--- a/scripts/src/platform-infra-k3s-build-benchmark.ts
+++ b/scripts/src/platform-infra-k3s-build-benchmark.ts
@@ -241,7 +241,10 @@ function startBenchmarks(plans: readonly TargetPlan[], options: K3sBuildBenchmar
 
 function startRowDetail(row: TargetPlan & { started: boolean; state: string; jobName: string; runId: string; result: unknown }): string {
   if (!row.ok) return `${row.blocker}: ${row.detail}`;
-  if (row.started) return "status/logs/traffic";
+  if (row.started) {
+    const replaced = record(row.result).replaced;
+    return replaced === undefined || replaced === 0 ? "status/logs/traffic" : `status/logs/traffic replaced=${replaced}`;
+  }
   const result = record(row.result);
   const stderr = text(result.stderrPreview, "");
   const stdout = text(result.stdoutPreview, "");
@@ -807,8 +810,18 @@ tmp="$(mktemp -d)"
 trap 'rm -rf "$tmp"' EXIT
 manifest="$tmp/k3s-build-benchmark.yaml"
 printf '%s' '${encoded}' | base64 -d > "$manifest"
+selector="app.kubernetes.io/name=${BENCHMARK_APP},unidesk.ai/benchmark-profile=${profile.id},unidesk.ai/runtime-node=${target.id.toLowerCase()}"
+# Single-active guard: delete any prior benchmark Jobs for this target+profile before applying the new
+# one, so a new run cannot race with a stale/slow prior run and make status report the wrong Job.
+# The benchmark is fire-and-forget and uses emptyDir, so deleting the prior Job is the safe way to keep
+# exactly one active run per target.
+replaced="$(kubectl -n ${shQuote(target.namespace)} get jobs -l "$selector" --no-headers 2>/dev/null | wc -l | tr -d ' ')"
+[ -z "$replaced" ] && replaced=0
+if [ "$replaced" != "0" ]; then
+  kubectl -n ${shQuote(target.namespace)} delete jobs -l "$selector" --ignore-not-found >/dev/null 2>&1
+fi
 kubectl apply -f "$manifest" >/dev/null
-printf '{"ok":true,"jobName":"%s","namespace":"%s","target":"%s","runId":"%s","profile":"%s"}\\n' ${shQuote(jobName)} ${shQuote(target.namespace)} ${shQuote(target.id)} ${shQuote(runId)} ${shQuote(profile.id)}
+printf '{"ok":true,"jobName":"%s","namespace":"%s","target":"%s","runId":"%s","profile":"%s","replaced":%s}\\n' ${shQuote(jobName)} ${shQuote(target.namespace)} ${shQuote(target.id)} ${shQuote(runId)} ${shQuote(profile.id)} "$replaced"
 `;
 }