fix: decouple k3s benchmark bootstrap image

2026-06-26 13:35:04 +00:00
parent 0547a6e95d
commit 4489e34c7f
3 changed files with 169 additions and 46 deletions
@@ -11,8 +11,12 @@ profiles:
    enabled: true
    workload: k3s-build
    description: Generic k3s build benchmark with no registry/source mirror and at least 600 MiB output payload.
-    image: docker.io/library/debian:bookworm
-    imagePullPolicy: Always
+    image: docker.io/library/python:3.12-alpine
+    imagePullPolicy: IfNotPresent
+    targetOverrides:
+      D601:
+        image: 127.0.0.1:5000/platform-infra/sub2api-account-sentinel:python-3.12-alpine-openai-2.41.1
+        imagePullPolicy: IfNotPresent
    payloadMiB: 600
    timeoutSeconds: 3600
    ttlSecondsAfterFinished: 3600
@@ -21,14 +25,7 @@ profiles:
      npmRegistry: https://registry.npmjs.org/
      pipIndexUrl: https://pypi.org/simple
      registryMirror: forbidden
-    aptPackages:
-      - build-essential
-      - ca-certificates
-      - curl
-      - git
-      - make
-      - pkg-config
-      - xz-utils
+    aptPackages: []
    dependencyDownload:
      enabled: true
      url: https://speed.cloudflare.com/__down?bytes=67108864
@@ -6,12 +6,16 @@ This SPEC replaces the earlier HWLAB full-CI dependency for pikasTech/unidesk#10

 The first profile is `no-mirror-600m` in `config/platform-infra/egress-proxy-benchmarks.yaml`. It runs a fresh k3s Job per node, uses an emptyDir workspace, forbids benchmark reuse, keeps registry/source mirror settings out of the workload, and produces at least 600 MiB of build output. The profile also downloads an explicit no-mirror dependency payload so proxyserver traffic can be observed during the run.

+The benchmark separates bootstrap image availability from the measured workload. A profile may define target-scoped `targetOverrides.<target>.image` and `targetOverrides.<target>.imagePullPolicy` for a node that cannot pull the default bootstrap image directly, because the image is pulled before the pod's proxy variables exist. Any field an override omits falls back to the profile-level value. Override images must be generic platform-infra images, not HWLAB application images, and the measured workload must still follow the same no-mirror download rule and the 500 MiB minimum output rule.
+
 ## Architecture

 `platform-infra egress-proxy k3s-build-benchmark` is the coordinator. It reads platform-infra targets from `config/platform-infra/sub2api.yaml`, reads workload profiles from `config/platform-infra/egress-proxy-benchmarks.yaml`, renders one Kubernetes Job per target, and uses `trans <target.route> sh -- ...` as the short control path.

 The workload Job runs in the target platform-infra namespace and receives proxy environment variables that point at the YAML-declared `sub2api-egress-proxy` service. The existing `platform-infra egress-proxy traffic` sampler remains the proxyserver-side observability source. Benchmark status may include a bounded traffic sample, but raw proxy credentials and Secret values are never printed.

+The portable workload path uses Python from the bootstrap image to download the declared dependency payload and generate the build artifact. It does not rewrite apt, apk, npm, pip, or registry sources. Debian/apt bootstrap images remain supported for future profiles, but apt is not required for the default cross-node benchmark.
+
 ## Data Model

 Stable benchmark records contain:
@@ -20,6 +24,7 @@ Stable benchmark records contain:
 - `profile`: benchmark profile id, initially `no-mirror-600m`.
 - `runId` and `jobName`: k3s Job identity.
 - `image`, `payloadMiB`, `dependencyDownload.expectedMiB`, `noMirror`: profile facts from YAML.
+- `targetOverrides`: optional target-specific bootstrap image facts; these are not measured dependency mirrors.
 - `state`: `pending`, `running`, `succeeded`, `failed`, or `missing`.
 - `startedAt`, `completedAt`, `durationSeconds`: Job timing.
 - `outputMiB`, `downloadMiB`: workload evidence parsed from Job logs.
@@ -39,7 +44,7 @@ Stable benchmark records contain:

 This benchmark is not a HWLAB application CI result and must not be reported as PipelineRun timing. It measures a generic k3s build workload under the node-local platform-infra egress proxy. It is valid for cross-node proxy path and build egress performance comparison because every target runs the same YAML profile and the same Job template.

-The workload must stay no-mirror: do not rewrite apt sources to regional mirrors, do not use npm mirror registries, do not configure Docker registry mirrors, and do not reuse a previous Job, PVC, or dependency cache. The configured profile must keep output payload at or above 500 MiB.
+The workload must stay no-mirror: do not rewrite apt/apk sources to regional mirrors, do not use npm mirror registries, do not configure Docker registry mirrors, and do not reuse a previous Job, PVC, or dependency cache. The configured profile must keep output payload at or above 500 MiB.

 ## Acceptance

@@ -13,6 +13,7 @@ const BENCHMARK_CONFIG_PATH = "config/platform-infra/egress-proxy-benchmarks.yam
 const BENCHMARK_APP = "unidesk-k3s-build-benchmark";

 type K3sBuildAction = "start" | "status" | "logs";
+/** Image pull policy values accepted for the benchmark container (`Always`, `IfNotPresent`, or `Never`). */
+type ImagePullPolicy = "Always" | "IfNotPresent" | "Never";

 interface K3sBuildBenchmarkOptions {
  action: K3sBuildAction;
@@ -31,7 +32,8 @@ interface K3sBuildBenchmarkProfile {
  workload: "k3s-build";
  description: string;
  image: string;
-  imagePullPolicy: "Always" | "IfNotPresent" | "Never";
+  imagePullPolicy: ImagePullPolicy;
+  targetOverrides: Record<string, K3sBuildBenchmarkTargetOverride>;
  payloadMiB: number;
  timeoutSeconds: number;
  ttlSecondsAfterFinished: number;
@@ -50,6 +52,11 @@ interface K3sBuildBenchmarkProfile {
  };
 }

+/**
+ * Optional per-target replacement for a profile's bootstrap image settings.
+ * A field left unset falls back to the profile-level value when the effective
+ * image is resolved.
+ */
+interface K3sBuildBenchmarkTargetOverride {
+  /** Image reference used for this target instead of the profile `image`. */
+  image?: string;
+  /** Pull policy used for this target instead of the profile `imagePullPolicy`. */
+  imagePullPolicy?: ImagePullPolicy;
+}
+
 interface K3sBuildBenchmarkConfig {
  version: number;
  kind: string;
@@ -270,6 +277,8 @@ function renderDryRun(plans: readonly TargetPlan[], options: K3sBuildBenchmarkOp
    plan.target?.route ?? "-",
    plan.target?.namespace ?? "-",
    plan.target?.egressProxy?.serviceName ?? "-",
+    plan.target !== undefined && plan.profile !== undefined ? effectiveImage(plan.target, plan.profile).image : "-",
+    plan.target !== undefined && plan.profile !== undefined ? effectiveImage(plan.target, plan.profile).imagePullPolicy : "-",
    plan.profile === undefined ? "-" : `${plan.profile.payloadMiB}MiB`,
    plan.profile === undefined ? "-" : `${plan.profile.dependencyDownload.expectedMiB}MiB`,
    plan.detail ?? "no-mirror emptyDir unique-job",
@@ -281,7 +290,7 @@ function renderDryRun(plans: readonly TargetPlan[], options: K3sBuildBenchmarkOp
    renderedText: [
      "PLATFORM-INFRA K3S BUILD BENCHMARK DRY-RUN",
      "",
-      ...table(["TARGET", "PROFILE", "STATUS", "ROUTE", "NAMESPACE", "PROXY", "PAYLOAD", "DOWNLOAD", "DETAIL"], rows),
+      ...table(["TARGET", "PROFILE", "STATUS", "ROUTE", "NAMESPACE", "PROXY", "IMAGE", "PULL", "PAYLOAD", "DOWNLOAD", "DETAIL"], rows),
      "",
      "NEXT",
      `  bun scripts/cli.ts platform-infra egress-proxy k3s-build-benchmark --targets ${plans.map((plan) => plan.targetId).join(",")} --profile ${options.profile} --confirm`,
@@ -296,6 +305,7 @@ function benchmarkJobManifest(target: Sub2ApiTargetConfig, profile: K3sBuildBenc
  if (proxy === null) throw new Error(`target ${target.id} has no egressProxy`);
  const proxyUrl = `http://${proxy.serviceName}.${target.namespace}.svc.cluster.local:${proxy.listenPort}`;
  const noProxy = proxy.noProxy.join(",");
+  const image = effectiveImage(target, profile);
  return {
    apiVersion: "batch/v1",
    kind: "Job",
@@ -307,6 +317,8 @@ function benchmarkJobManifest(target: Sub2ApiTargetConfig, profile: K3sBuildBenc
        "unidesk.ai/no-mirror": JSON.stringify(profile.noMirror),
        "unidesk.ai/payload-mib": String(profile.payloadMiB),
        "unidesk.ai/dependency-download-mib": String(profile.dependencyDownload.expectedMiB),
+        "unidesk.ai/bootstrap-image": image.image,
+        "unidesk.ai/bootstrap-image-pull-policy": image.imagePullPolicy,
      },
    },
    spec: {
@@ -319,8 +331,8 @@ function benchmarkJobManifest(target: Sub2ApiTargetConfig, profile: K3sBuildBenc
          restartPolicy: "Never",
          containers: [{
            name: "build",
-            image: profile.image,
-            imagePullPolicy: profile.imagePullPolicy,
+            image: image.image,
+            imagePullPolicy: image.imagePullPolicy,
            command: ["/bin/sh", "-lc"],
            args: [workloadScript(profile)],
            env: [
@@ -357,20 +369,23 @@ function workloadScript(profile: K3sBuildBenchmarkProfile): string {
  return `set -eu
 started_epoch="$(date +%s)"
 started_at="$(date -u +%Y-%m-%dT%H:%M:%SZ)"
+export BENCHMARK_STARTED_EPOCH="$started_epoch"
+export BENCHMARK_STARTED_AT="$started_at"
 work=/work/k3s-build-benchmark
 download_dir="$work/download"
 build_dir="$work/build"
 output_dir="$work/output"
 mkdir -p "$download_dir" "$build_dir" "$output_dir"
 printf 'UNIDESK_K3S_BUILD_BENCHMARK_EVENT target=%s profile=%s run=%s payloadMiB=%s expectedDownloadMiB=%s noMirror=true\\n' "$BENCHMARK_TARGET" "$BENCHMARK_PROFILE" "$BENCHMARK_RUN_ID" "$PAYLOAD_MIB" "$DOWNLOAD_EXPECTED_MIB"
-if grep -R -E 'npmmirror|daocloud|aliyun|tuna|ustc' /etc/apt/sources.list /etc/apt/sources.list.d >/tmp/mirror-check.out 2>/dev/null; then
-  cat /tmp/mirror-check.out >&2
-  echo "unexpected apt mirror in base image" >&2
-  exit 42
-fi
-apt-get -o Acquire::http::No-Cache=true -o Acquire::https::No-Cache=true update
-apt-get -o Acquire::http::No-Cache=true -o Acquire::https::No-Cache=true install -y --no-install-recommends $APT_PACKAGES
-cat > "$build_dir/bench.c" <<'C'
+if command -v apt-get >/dev/null 2>&1 && [ -n "$APT_PACKAGES" ]; then
+  if grep -R -E 'npmmirror|daocloud|aliyun|tuna|ustc' /etc/apt/sources.list /etc/apt/sources.list.d >/tmp/mirror-check.out 2>/dev/null; then
+    cat /tmp/mirror-check.out >&2
+    echo "unexpected apt mirror in base image" >&2
+    exit 42
+  fi
+  apt-get -o Acquire::http::No-Cache=true -o Acquire::https::No-Cache=true update
+  apt-get -o Acquire::http::No-Cache=true -o Acquire::https::No-Cache=true install -y --no-install-recommends $APT_PACKAGES
+  cat > "$build_dir/bench.c" <<'C'
 #include <stdint.h>
 #include <stdio.h>
 int main(void) {
@@ -383,31 +398,112 @@ int main(void) {
  return 0;
 }
 C
-cc -O2 "$build_dir/bench.c" -o "$build_dir/bench"
-"$build_dir/bench" > "$output_dir/compile-result.txt"
-if [ "${profile.dependencyDownload.enabled ? "1" : "0"}" = "1" ]; then
-  i=1
-  while [ "$i" -le "$DOWNLOAD_CHUNKS" ]; do
-    curl -fL --retry 2 --connect-timeout 15 --max-time 240 "$DOWNLOAD_URL" -o "$download_dir/chunk-$i.bin"
-    i=$((i + 1))
-  done
+  cc -O2 "$build_dir/bench.c" -o "$build_dir/bench"
+  "$build_dir/bench" > "$output_dir/compile-result.txt"
 fi
-download_mib="$(du -sm "$download_dir" | awk '{print $1}')"
-rm -rf "$download_dir"
-dd if=/dev/zero of="$output_dir/payload.bin" bs=1M count="$PAYLOAD_MIB" status=none
-sha256sum "$output_dir/payload.bin" > "$output_dir/payload.sha256"
-output_mib="$(du -sm "$output_dir" | awk '{print $1}')"
-if [ "$output_mib" -lt 500 ]; then
-  echo "payload-too-small outputMiB=$output_mib" >&2
-  exit 43
+pybin=""
+if command -v python3 >/dev/null 2>&1; then pybin=python3; elif command -v python >/dev/null 2>&1; then pybin=python; fi
+if [ -z "$pybin" ]; then
+  echo "python-runtime-missing" >&2
+  exit 44
 fi
-completed_epoch="$(date +%s)"
-completed_at="$(date -u +%Y-%m-%dT%H:%M:%SZ)"
-duration_seconds=$((completed_epoch - started_epoch))
-printf 'UNIDESK_K3S_BUILD_BENCHMARK_RESULT {"ok":true,"target":"%s","profile":"%s","runId":"%s","startedAt":"%s","completedAt":"%s","durationSeconds":%s,"payloadMiB":%s,"downloadMiB":%s,"downloadExpectedMiB":%s,"outputMiB":%s,"noMirror":true,"aptMirror":"system-default","npmRegistry":"%s","pipIndexUrl":"%s"}\\n' "$BENCHMARK_TARGET" "$BENCHMARK_PROFILE" "$BENCHMARK_RUN_ID" "$started_at" "$completed_at" "$duration_seconds" "$PAYLOAD_MIB" "$download_mib" "$DOWNLOAD_EXPECTED_MIB" "$output_mib" "$NPM_CONFIG_REGISTRY" "$PIP_INDEX_URL"
+"$pybin" - <<'PY'
+import hashlib
+import json
+import math
+import os
+import pathlib
+import shutil
+import sys
+import time
+import urllib.request
+
+started_epoch = int(os.environ.get("BENCHMARK_STARTED_EPOCH", "0") or "0") or int(time.time())
+started_at = os.environ.get("BENCHMARK_STARTED_AT") or time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime(started_epoch))
+payload_mib = int(os.environ["PAYLOAD_MIB"])
+download_chunks = int(os.environ["DOWNLOAD_CHUNKS"])
+download_expected_mib = int(os.environ["DOWNLOAD_EXPECTED_MIB"])
+download_url = os.environ["DOWNLOAD_URL"]
+download_enabled = "${profile.dependencyDownload.enabled ? "1" : "0"}" == "1"
+download_dir = pathlib.Path("/work/k3s-build-benchmark/download")
+build_dir = pathlib.Path("/work/k3s-build-benchmark/build")
+output_dir = pathlib.Path("/work/k3s-build-benchmark/output")
+download_dir.mkdir(parents=True, exist_ok=True)
+build_dir.mkdir(parents=True, exist_ok=True)
+output_dir.mkdir(parents=True, exist_ok=True)
+
+def mib_from_bytes(value):
+    return int(math.ceil(value / 1048576))
+
+download_bytes = 0
+if download_enabled:
+    for index in range(1, download_chunks + 1):
+        destination = download_dir / f"chunk-{index}.bin"
+        request = urllib.request.Request(download_url, headers={"User-Agent": "curl/8.5.0", "Accept": "*/*"})
+        with urllib.request.urlopen(request, timeout=240) as response, destination.open("wb") as handle:
+            while True:
+                block = response.read(1024 * 1024)
+                if not block:
+                    break
+                handle.write(block)
+                download_bytes += len(block)
+        print(f"UNIDESK_K3S_BUILD_BENCHMARK_DOWNLOAD chunk={index} bytes={destination.stat().st_size}", flush=True)
+
+source_file = build_dir / "portable_build_source.py"
+source_file.write_text("VALUE = 'unidesk-k3s-build-benchmark'\\n", encoding="utf-8")
+__import__("py_compile").compile(str(source_file), cfile=str(build_dir / "portable_build_source.pyc"), doraise=True)
+
+payload = output_dir / "payload.bin"
+digest = hashlib.sha256()
+seed = hashlib.sha256(f"{os.environ['BENCHMARK_TARGET']}:{os.environ['BENCHMARK_RUN_ID']}".encode("utf-8")).digest()
+block = (seed * ((1024 * 1024 // len(seed)) + 1))[:1024 * 1024]
+with payload.open("wb") as handle:
+    for index in range(payload_mib):
+        digest.update(block)
+        handle.write(block)
+        if index % 64 == 0:
+            handle.flush()
+
+(output_dir / "payload.sha256").write_text(digest.hexdigest() + "  payload.bin\\n", encoding="utf-8")
+shutil.rmtree(download_dir, ignore_errors=True)
+output_bytes = sum(path.stat().st_size for path in output_dir.rglob("*") if path.is_file())
+output_mib = mib_from_bytes(output_bytes)
+download_mib = mib_from_bytes(download_bytes)
+if output_mib < 500:
+    print(f"payload-too-small outputMiB={output_mib}", file=sys.stderr)
+    sys.exit(43)
+completed_epoch = int(time.time())
+result = {
+    "ok": True,
+    "target": os.environ["BENCHMARK_TARGET"],
+    "profile": os.environ["BENCHMARK_PROFILE"],
+    "runId": os.environ["BENCHMARK_RUN_ID"],
+    "startedAt": started_at,
+    "completedAt": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime(completed_epoch)),
+    "durationSeconds": completed_epoch - started_epoch,
+    "payloadMiB": payload_mib,
+    "downloadMiB": download_mib,
+    "downloadExpectedMiB": download_expected_mib,
+    "outputMiB": output_mib,
+    "noMirror": True,
+    "packageManager": "none" if not os.environ.get("APT_PACKAGES") else "apt",
+    "aptMirror": "not-used" if not os.environ.get("APT_PACKAGES") else "system-default",
+    "npmRegistry": os.environ["NPM_CONFIG_REGISTRY"],
+    "pipIndexUrl": os.environ["PIP_INDEX_URL"],
+}
+print("UNIDESK_K3S_BUILD_BENCHMARK_RESULT " + json.dumps(result, sort_keys=True), flush=True)
+PY
 `;
 }

+/**
+ * Resolves the bootstrap image and pull policy used for one target.
+ *
+ * A `targetOverrides` entry whose key matches the target id wins over the
+ * profile defaults, field by field. Keys are tried in the order exact id,
+ * lower-cased id, upper-cased id, and finally any key that equals the id
+ * case-insensitively, so a mixed-case id such as `Node-A` still matches a
+ * `node-a` key.
+ *
+ * @param target - Target whose `id` selects the override entry.
+ * @param profile - Profile providing the default image settings and the overrides.
+ * @returns The effective image reference and pull policy for the target.
+ */
+function effectiveImage(target: Sub2ApiTargetConfig, profile: K3sBuildBenchmarkProfile): { image: string; imagePullPolicy: ImagePullPolicy } {
+  const overrides = profile.targetOverrides;
+  // Only own, non-nullish entries count: indexing a plain object can otherwise
+  // resolve inherited members (e.g. a target id of "constructor") and stop the
+  // fallback chain before a real override key is consulted.
+  const usable = (key: string): boolean =>
+    Object.prototype.hasOwnProperty.call(overrides, key) && overrides[key] != null;
+  const wanted = target.id.toLowerCase();
+  const key = [target.id, wanted, target.id.toUpperCase()].find(usable)
+    ?? Object.keys(overrides).find((candidate) => candidate.toLowerCase() === wanted && usable(candidate));
+  const override = key === undefined ? undefined : overrides[key];
+  return {
+    image: override?.image ?? profile.image,
+    imagePullPolicy: override?.imagePullPolicy ?? profile.imagePullPolicy,
+  };
+}
+
 function benchmarkLabels(target: Sub2ApiTargetConfig, profile: K3sBuildBenchmarkProfile, runId: string): Record<string, string> {
  return {
    "app.kubernetes.io/name": BENCHMARK_APP,
@@ -464,6 +560,14 @@ pods_result = kubectl(["get", "pods", "-l", "job-name=" + job_name, "-o", "json"
 pods = json.loads(pods_result.stdout or "{}").get("items", []) if pods_result.returncode == 0 else []
 pods.sort(key=lambda item: item.get("metadata", {}).get("creationTimestamp", ""))
 pod_name = pods[-1].get("metadata", {}).get("name") if pods else None
+waiting_reasons = []
+if pods:
+    for container_status in (pods[-1].get("status", {}) or {}).get("containerStatuses", []) or []:
+        waiting = ((container_status.get("state") or {}).get("waiting") or {})
+        if waiting:
+            reason = waiting.get("reason") or "waiting"
+            message = waiting.get("message") or ""
+            waiting_reasons.append((reason + " " + message).strip())
 logs = ""
 if pod_name:
    logs_result = kubectl(["logs", pod_name, "--tail", str(tail_lines)])
@@ -493,14 +597,14 @@ elif active:
 else:
    state = "pending"
 failure_family = "none" if state == "succeeded" else ("in-progress" if state in ("running", "pending") else "unknown")
-tail_text = (full_logs or logs)[-4000:]
+tail_text = "\\n".join(waiting_reasons + [(full_logs or logs)[-4000:]])
 if state == "missing":
    failure_family = "job-missing"
 elif "ImagePullBackOff" in tail_text or "ErrImagePull" in tail_text:
    failure_family = "image-pull"
 elif "apt-get" in tail_text and ("Failed" in tail_text or "Unable to" in tail_text):
    failure_family = "apt-download"
-elif "curl:" in tail_text:
+elif "curl:" in tail_text or "urllib.error.HTTPError" in tail_text or "urllib.error.URLError" in tail_text:
    failure_family = "dependency-download"
 elif "payload-too-small" in tail_text:
    failure_family = "payload-too-small"
@@ -516,7 +620,7 @@ payload = {
    "completedAt": status.get("completionTime") or (match or {}).get("completedAt"),
    "result": match,
    "failureFamily": failure_family,
-    "logTail": logs[-4000:],
+    "logTail": "\\n".join(waiting_reasons + [logs])[-4000:],
 }
 print(json.dumps(payload, ensure_ascii=False))
 PY
@@ -644,6 +748,7 @@ function profileSpec(id: string, raw: Record<string, unknown>): K3sBuildBenchmar
    description: stringField(raw, "description", `profiles.${id}`),
    image: stringField(raw, "image", `profiles.${id}`),
    imagePullPolicy,
+    targetOverrides: targetOverridesField(raw, `profiles.${id}`),
    payloadMiB: integerField(raw, "payloadMiB", `profiles.${id}`),
    timeoutSeconds: integerField(raw, "timeoutSeconds", `profiles.${id}`),
    ttlSecondsAfterFinished: integerField(raw, "ttlSecondsAfterFinished", `profiles.${id}`),
@@ -663,6 +768,22 @@ function profileSpec(id: string, raw: Record<string, unknown>): K3sBuildBenchmar
  };
 }

+/**
+ * Parses the optional `targetOverrides` mapping of a benchmark profile.
+ *
+ * @param raw - Raw profile record read from the benchmark configuration.
+ * @param path - Configuration path of the profile, used as the prefix of error messages.
+ * @returns Overrides keyed by target id; an empty object when the profile declares none.
+ * @throws Error when an override declares an `imagePullPolicy` other than Always, IfNotPresent, or Never.
+ */
+function targetOverridesField(raw: Record<string, unknown>, path: string): Record<string, K3sBuildBenchmarkTargetOverride> {
+  if (raw.targetOverrides === undefined) return {};
+  const declared = asRecord(raw.targetOverrides, `${path}.targetOverrides`);
+  const pairs: Array<[string, K3sBuildBenchmarkTargetOverride]> = [];
+  for (const [targetId, value] of Object.entries(declared)) {
+    const entryPath = `${path}.targetOverrides.${targetId}`;
+    const fields = asRecord(value, entryPath);
+    const entry: K3sBuildBenchmarkTargetOverride = {};
+    if (fields.image !== undefined) {
+      entry.image = stringField(fields, "image", entryPath);
+    }
+    if (fields.imagePullPolicy !== undefined) {
+      const policy = stringField(fields, "imagePullPolicy", entryPath);
+      switch (policy) {
+        case "Always":
+        case "IfNotPresent":
+        case "Never":
+          entry.imagePullPolicy = policy;
+          break;
+        default:
+          throw new Error(`${entryPath}.imagePullPolicy must be Always, IfNotPresent, or Never`);
+      }
+    }
+    pairs.push([targetId, entry]);
+  }
+  // Object.fromEntries defines own data properties, so unusual keys such as
+  // "__proto__" are stored as ordinary entries.
+  return Object.fromEntries(pairs);
+}
+
 function runTrans(route: string, script: string, timeoutSeconds: number): CommandResult {
  return runCommand(["/root/.local/bin/trans", route, "sh", "--", script], rootPath(), { timeoutMs: timeoutSeconds * 1000 });
 }