diff --git a/config/hwlab-node-control-plane.yaml b/config/hwlab-node-control-plane.yaml index 7ebd70fd..749773cd 100644 --- a/config/hwlab-node-control-plane.yaml +++ b/config/hwlab-node-control-plane.yaml @@ -4,6 +4,7 @@ metadata: owner: unidesk relatedIssues: - 290 + - 491 - 1119 imagePolicy: requireReproducibleBuildSource: true @@ -16,6 +17,41 @@ nodes: D601: route: D601 kubeRoute: D601:k3s + k3s: + serviceName: k3s + dropInPath: /etc/systemd/system/k3s.service.d/20-unidesk-node-config.conf + nodeStatusName: d601 + execStartPre: + - - -/usr/bin/umount + - /Docker/host + serverArgs: + - server + - --disable + - traefik + - --disable + - servicelb + - --disable + - metrics-server + - --node-name + - D601 + - --node-label + - unidesk.ai/node-id=D601 + - --node-label + - unidesk.ai/provider-id=D601 + - --tls-san + - 127.0.0.1 + - --tls-san + - host.docker.internal + - --write-kubeconfig-mode + - "644" + - --kubelet-arg + - image-gc-high-threshold=95 + - --kubelet-arg + - image-gc-low-threshold=90 + - --kubelet-arg + - max-pods=500 + kubelet: + maxPods: 500 registry: endpoint: 127.0.0.1:5000 egressProxy: diff --git a/docs/reference/cli.md b/docs/reference/cli.md index 4a281b19..94584e26 100644 --- a/docs/reference/cli.md +++ b/docs/reference/cli.md @@ -24,7 +24,7 @@ G14/D601 v03 的 bootstrap admin password 是 HWLAB runtime Secret 生命周期 `hwlab nodes web-probe run|script --node --lane ` 是 HWLAB Cloud Web 线上 DOM/Playwright 验收的受控入口;CLI 负责从 YAML 解析 workspace、public URL 和 bootstrap admin sourceRef,并只输出 redacted 凭据状态、artifact path/hash、readiness、`probe.summary` 和失败分类。`run` 使用 repo-owned 标准 DOM probe;`script` 不运行默认探针,必须通过 stdin heredoc 或 `--script-file ` 提供调用者脚本。`run --message ...` 未显式设置 trace 参数时会做轻量 trace 采样,`script` helper 可用 `recordStep` / `safeFetchJson` / `fetchApiMatrix` 保留失败前的结构化 partial evidence,完整 redacted 报告通过 `reportPath`/`reportSha256` 展开。具体 Web 开发、fake-server Playwright、fixture 脱敏、`web-probe script` helper、截图和 Workbench/Performance 判定口径统一见 `$unidesk-webdev`,本 CLI 参考不再维护第二套操作面。 -`hwlab nodes control-plane infra plan|status|apply --node D601 --lane v03` 是 D601 HWLAB v03 节点本地 CI/CD 与 git-mirror 前置控制面的 YAML 驱动入口,配置真相源是 `config/hwlab-node-control-plane.yaml`。`plan` 只读展示 YAML target 和将渲染的 control-plane 对象;`status` 只读观察 D601 Tekton、CI namespace、git-mirror、Argo、node-local registry 和 tools image readiness;`apply --dry-run` 只输出 manifest 摘要;`apply --confirm` 只收敛 D601 control-plane bootstrap 对象,不触发 HWLAB runtime rollout,不创建 PK01 DB,也不修改 Caddy/FRP。tools image 的 node-local registry 地址只能作为输出 artifact;输入 base image 必须由 YAML 声明为公开 registry 来源,缺少 output image 时应在 `status.next.blockers` 中体现,而不是把现有 node-local image 当成输入基础镜像。 +`hwlab nodes control-plane infra plan|status|apply --node D601 --lane v03` 是 D601 HWLAB v03 节点本地 k3s、CI/CD 与 git-mirror 前置控制面的 YAML 驱动入口,配置真相源是 `config/hwlab-node-control-plane.yaml`。`plan` 只读展示 YAML target、host k3s node config 摘要和将渲染的 control-plane 对象;`status` 只读观察 k3s systemd drop-in 与 node `capacity/allocatable.pods`、D601 Tekton、CI namespace、git-mirror、Argo、node-local registry 和 tools image readiness;`apply --dry-run` 只输出 manifest 与 host config 摘要;`apply --confirm` 按 YAML 收敛 D601 host k3s drop-in 和 control-plane bootstrap 对象,只有 host k3s 配置或 live pod capacity 未收敛时才重启 k3s,不触发 HWLAB runtime rollout,不创建 PK01 DB,也不修改 Caddy/FRP。D601 host 侧 k3s pre-start 修正也必须写成 YAML `execStartPre` argv,不做手工 systemd 热改;当 kube API 已不可用时,`apply` 可用同一 YAML 渲染出的 host 脚本经 node-local tools image/Docker fallback 恢复 systemd drop-in,输出仍只给对象名、SHA、exit code 和摘要。k3s pod capacity 等可调数值只以 YAML 为准,长期参考不复制具体数值;tools image 的 node-local registry 地址只能作为输出 artifact,输入 base image 必须由 YAML 声明为公开 registry 来源,缺少 output image 时应在 `status.next.blockers` 中体现,而不是把现有 node-local image 当成输入基础镜像。 `hwlab nodes git-mirror status|sync|flush --node --lane ` 是 node-scoped runtime lane 的 Git mirror 维护入口。`status` 的 `githubSource` / `githubGitops` 来自本地 mirror cache 的 `refs/mirror-stage/...`,不是实时 GitHub API;输出中的 `refSources.githubFieldsAreMirrorStageCache=true` 和 `refSources.cacheRefresh` 给出这一来源和刷新命令。`sync --confirm --wait` 的 k3s Job 遇到 GitHub SSH transient 时,应通过目标 workspace fallback 拉取 GitHub source/gitops 并写回 node-local mirror,输出只披露 commit、mirror write URL 和 fallback 状态。`flush --confirm --wait` 如果已经把 GitOps ref push 到 GitHub,但 post-push fetch/recheck 因 transient SSH 失败而无法刷新 mirror-stage,会标记 `partialSuccess=push-succeeded-fetch-failed`;CLI 应自动执行一次受控 sync 刷新 mirror-stage,若恢复后 `pendingFlush=false` 且 `githubInSync=true`,结果应为 `ok=true` 并输出 `partialSuccessRecovered` / `postPushRecovery`,否则才保留 `degradedReason=node-runtime-git-mirror-flush-post-push-fetch-failed` 和下一步 `sync --confirm --wait`。不要把这种 partial success 解读为需要连续盲目 flush。`hwlab nodes control-plane trigger-current --node --lane --confirm --wait` 会在 source sync 后自动执行必要的 pre-flush,在 PipelineRun terminal 后自动执行必要的 post-flush;progress 事件必须显式输出 `git-mirror-pre-flush` / `git-mirror-post-flush` 的 executed/skipped、jobName、local/github source、local/github GitOps、`pendingFlush` 和 `githubInSync`,且已恢复的 partial success 不能让顶层 trigger-current false-fail。`control-plane status` 仍是只读入口,只暴露 compact `gitMirror` 摘要和下一步 flush 命令,不隐式执行写操作。 diff --git a/scripts/src/help.ts b/scripts/src/help.ts index da61e6f7..79293e24 100644 --- a/scripts/src/help.ts +++ b/scripts/src/help.ts @@ -59,7 +59,7 @@ export function rootHelp(): unknown { { command: "gh preflight|auth|issue|pr", description: "Run safe GitHub issue and PR CRUD/lifecycle operations through REST with body-file update replace/append, issue/comment apply_patch body patching, comment delete, token diagnostics, PR closeout preflight, hard delete unsupported, and guarded PR merge." }, { command: "git github-push-fallback [--repo owner/name] [--branch branch] [--host-name host-or-ip] [--confirm]", description: "Plan or execute a one-shot GitHub push through ssh.github.com:443 without editing remotes; use only for reviewed DNS/port-22 push fallback." }, { command: "commander contract|plan --dry-run|smoke --dry-run|approval request --dry-run", description: "Host Codex commander skeleton contract, no-daemon smoke plan, and dry-run approval preview without live bridges or message sends." }, - { command: "hwlab nodes control-plane|git-mirror|secret|test-accounts|web-probe --node --lane ", description: "Manage HWLAB node/lane runtime prerequisites, including D601 YAML-declared infra/tools-image/Argo bootstrap, redacted test-account preparation, Web DOM probe credential injection, and G14 v0.3+ runtime lanes, with the node identity passed as data." }, + { command: "hwlab nodes control-plane|git-mirror|secret|test-accounts|web-probe --node --lane ", description: "Manage HWLAB node/lane runtime prerequisites, including D601 YAML-declared k3s infra/tools-image/Argo bootstrap, redacted test-account preparation, Web DOM probe credential injection, and G14 v0.3+ runtime lanes, with the node identity passed as data." }, { command: "hwlab g14 monitor-prs | hwlab g14 control-plane status|apply|trigger-current|runtime-migration|cleanup-runs|cleanup-released-pvs | hwlab g14 git-mirror status|apply|sync|flush | hwlab g14 tools-image status|build", description: "Start the legacy G14 PR monitor, run bounded v0.2 Tekton/Argo control-plane, manual PipelineRun trigger, runtime migration, CI workspace retention, manual devops-infra git mirror/relay maintenance, or fixed HWLAB CI tools image actions; long confirmed trigger/sync/flush actions return async jobs by default." }, { command: "agentrun get|describe|events|logs|result|ack|cancel|dispatch|create|apply|send|control-plane|git-mirror", description: "Use AgentRun v0.1 resource primitives with low-noise human output by default; session follow-up uses send only and the server decides internal steer vs turn." }, { command: "platform-infra sub2api|langbot|n8n|wechat-archive ...", description: "Deploy platform-infra services such as Sub2API, LangBot and n8n, manage YAML-controlled public FRP/Caddy exposure and WeChat archive workflows, and inspect status/logs without printing secrets." }, diff --git a/scripts/src/hwlab-node-control-plane.ts b/scripts/src/hwlab-node-control-plane.ts index a35690d2..f83979c6 100644 --- a/scripts/src/hwlab-node-control-plane.ts +++ b/scripts/src/hwlab-node-control-plane.ts @@ -50,10 +50,20 @@ interface ControlPlaneNodeSpec { id: string; route: string; kubeRoute: string; + k3s: ControlPlaneK3sNodeSpec | null; registry: { endpoint: string }; egressProxy: ControlPlaneEgressProxySpec | null; } +interface ControlPlaneK3sNodeSpec { + serviceName: string; + dropInPath: string; + nodeStatusName: string; + execStartPre: readonly (readonly string[])[]; + serverArgs: readonly string[]; + kubelet: { maxPods: number }; +} + interface DockerfileInlineSpec { filename: string; lines: readonly string[]; @@ -180,7 +190,7 @@ export function hwlabNodeControlPlaneInfraHelp(): Record { ok: true, command: "hwlab nodes control-plane infra", configPath: HWLAB_NODE_CONTROL_PLANE_CONFIG_PATH, - description: "Plan/status/apply YAML-controlled HWLAB node-local CI/CD and git-mirror control-plane prerequisites. Cross-node PK01/Caddy/FRP/runtime rollout remains explicit semi-automatic CLI work.", + description: "Plan/status/apply YAML-controlled HWLAB node-local k3s, CI/CD and git-mirror control-plane prerequisites. Cross-node PK01/Caddy/FRP/runtime rollout remains explicit semi-automatic CLI work.", usage: [ "bun scripts/cli.ts hwlab nodes control-plane infra plan --node D601 --lane v03", "bun scripts/cli.ts hwlab nodes control-plane infra status --node D601 --lane v03", @@ -210,6 +220,7 @@ function infraPlan(_config: ControlPlaneConfig, node: ControlPlaneNodeSpec, targ mutation: false, target: planSummary(node, target), expected: expectedSummary(node, target), + hostConfig: k3sNodeConfigPlan(node), imagePolicy: _config.imagePolicy, g14Consistency: { laneVocabulary: ["sourceBranch", "gitopsBranch", "catalogPath", "runtime.path", "runtime.namespace", "tekton.pipeline", "pipelineRunPrefix", "argo.application"], @@ -229,7 +240,7 @@ function infraPlan(_config: ControlPlaneConfig, node: ControlPlaneNodeSpec, targ } function infraStatus(_config: ControlPlaneConfig, node: ControlPlaneNodeSpec, target: ControlPlaneTargetSpec, options: InfraOptions): Record { - const script = statusScript(target, node.registry.endpoint, target.tekton.toolsImage.output); + const script = statusScript(node, target); const result = runTransK3s(node.kubeRoute, script, options.timeoutSeconds); const parsed = parseRemoteJson(result.stdout); const status = typeof parsed === "object" && parsed !== null ? parsed as Record : { parseError: "remote status did not return a JSON object", stdoutPreview: result.stdout.slice(0, 1000) }; @@ -240,7 +251,13 @@ function infraStatus(_config: ControlPlaneConfig, node: ControlPlaneNodeSpec, ta const tekton = record(components.tekton); const ciNamespace = record(components.ciNamespace); const registry = record(components.registry); + const k3sNodeConfig = record(components.k3sNodeConfig); + const k3sNodeConfigReady = node.k3s === null + || (boolField(k3sNodeConfig, "dropInMatches") + && numberValue(k3sNodeConfig.liveCapacityPods) === node.k3s.kubelet.maxPods + && numberValue(k3sNodeConfig.liveAllocatablePods) === node.k3s.kubelet.maxPods); const ok = result.exitCode === 0 + && k3sNodeConfigReady && boolField(tekton, "installed") && boolField(ciNamespace, "exists") && boolField(gitMirror, "namespaceExists") @@ -267,6 +284,7 @@ function infraStatus(_config: ControlPlaneConfig, node: ControlPlaneNodeSpec, ta status, readiness: { ok, + k3sNodeConfigReady, tektonInstalled: boolField(tekton, "installed"), ciNamespaceExists: boolField(ciNamespace, "exists"), gitMirrorNamespaceExists: boolField(gitMirror, "namespaceExists"), @@ -286,7 +304,7 @@ function infraStatus(_config: ControlPlaneConfig, node: ControlPlaneNodeSpec, ta toolsImageReady: boolField(registry, "toolsImageReady"), }, result: compactCommandResult(result), - next: ok ? { runtimePreparation: `bun scripts/cli.ts hwlab nodes control-plane plan --node ${node.id} --lane ${target.lane}` } : statusNext(node, target, registry, gitMirror, argo, ciNamespace), + next: ok ? { runtimePreparation: `bun scripts/cli.ts hwlab nodes control-plane plan --node ${node.id} --lane ${target.lane}` } : statusNext(node, target, registry, gitMirror, argo, ciNamespace, k3sNodeConfig), }; } @@ -306,6 +324,7 @@ function infraApply(_config: ControlPlaneConfig, node: ControlPlaneNodeSpec, tar mode: "dry-run", mutation: false, expected: expectedSummary(node, target), + hostConfig: k3sNodeConfigPlan(node), preflight: { registryReady: imageStatus.registryReady, toolsImageReady: imageStatus.toolsImageReady, @@ -318,7 +337,7 @@ function infraApply(_config: ControlPlaneConfig, node: ControlPlaneNodeSpec, tar next: applyNext(node, target, imageStatus), }; } - const script = applyScript(yaml); + const script = applyScript(yaml, node, target); const result = runTransK3s(node.kubeRoute, script, options.timeoutSeconds); const parsed = parseRemoteJson(result.stdout); return { @@ -430,7 +449,7 @@ function toolsImageBuild(node: ControlPlaneNodeSpec, target: ControlPlaneTargetS } function argoCommandStatus(node: ControlPlaneNodeSpec, target: ControlPlaneTargetSpec, options: ArgoOptions): Record { - const result = runTransK3s(node.kubeRoute, statusScript(target, node.registry.endpoint, target.tekton.toolsImage.output), options.timeoutSeconds); + const result = runTransK3s(node.kubeRoute, statusScript(node, target), options.timeoutSeconds); const parsed = parseRemoteJson(result.stdout); const status = typeof parsed === "object" && parsed !== null ? parsed as Record : {}; const argo = record(record(status.components).argo); @@ -733,15 +752,67 @@ function isNodeLocalImage(image: string): boolean { function nodeSpec(id: string, raw: Record): ControlPlaneNodeSpec { const registry = asRecord(raw.registry, `nodes.${id}.registry`); const egressProxy = raw.egressProxy === undefined ? null : egressProxySpec(asRecord(raw.egressProxy, `nodes.${id}.egressProxy`), `nodes.${id}.egressProxy`); + const k3s = raw.k3s === undefined ? null : k3sNodeSpec(asRecord(raw.k3s, `nodes.${id}.k3s`), `nodes.${id}.k3s`); return { id, route: stringField(raw, "route", `nodes.${id}`), kubeRoute: stringField(raw, "kubeRoute", `nodes.${id}`), + k3s, registry: { endpoint: stringField(registry, "endpoint", `nodes.${id}.registry`) }, egressProxy, }; } +function k3sNodeSpec(raw: Record, path: string): ControlPlaneK3sNodeSpec { + const kubelet = asRecord(raw.kubelet, `${path}.kubelet`); + const serviceName = stringField(raw, "serviceName", path); + if (!/^[A-Za-z0-9_.@-]+$/u.test(serviceName)) throw new Error(`${path}.serviceName has an unsupported systemd unit name`); + const dropInPath = stringField(raw, "dropInPath", path); + if (!dropInPath.startsWith("/etc/systemd/system/") || !dropInPath.endsWith(".conf") || dropInPath.includes("..")) { + throw new Error(`${path}.dropInPath must be an absolute /etc/systemd/system/*.conf path`); + } + const nodeStatusName = stringField(raw, "nodeStatusName", path); + if (!/^[A-Za-z0-9_.-]+$/u.test(nodeStatusName)) throw new Error(`${path}.nodeStatusName has an unsupported Kubernetes node name`); + const execStartPre = execStartPreField(raw.execStartPre, `${path}.execStartPre`); + const serverArgs = stringArrayField(raw, "serverArgs", path); + if (serverArgs.length === 0 || serverArgs[0] !== "server") throw new Error(`${path}.serverArgs must start with k3s server`); + for (const [index, arg] of serverArgs.entries()) { + if (arg.includes("\n") || arg.includes("\r") || arg.length === 0) throw new Error(`${path}.serverArgs[${index}] must be a single non-empty argv token`); + } + const maxPods = positiveConfigIntegerField(kubelet, "maxPods", `${path}.kubelet`); + const expectedMaxPodsArg = `max-pods=${maxPods}`; + let hasExpectedMaxPodsArg = false; + for (let index = 0; index < serverArgs.length - 1; index += 1) { + if (serverArgs[index] === "--kubelet-arg" && serverArgs[index + 1] === expectedMaxPodsArg) hasExpectedMaxPodsArg = true; + } + if (!hasExpectedMaxPodsArg) throw new Error(`${path}.serverArgs must include --kubelet-arg ${expectedMaxPodsArg}`); + return { + serviceName, + dropInPath, + nodeStatusName, + execStartPre, + serverArgs, + kubelet: { maxPods }, + }; +} + +function execStartPreField(raw: unknown, path: string): readonly (readonly string[])[] { + if (raw === undefined) return []; + if (!Array.isArray(raw)) throw new Error(`${path} must be an array of argv arrays`); + return raw.map((item, index) => { + if (!Array.isArray(item)) throw new Error(`${path}[${index}] must be an argv array`); + const command = item.map((value, tokenIndex) => { + if (typeof value !== "string") throw new Error(`${path}[${index}][${tokenIndex}] must be a string`); + if (value.length === 0 || value.includes("\n") || value.includes("\r")) throw new Error(`${path}[${index}][${tokenIndex}] must be a single non-empty argv token`); + return value; + }); + if (command.length === 0) throw new Error(`${path}[${index}] must not be empty`); + const executable = command[0].startsWith("-") ? command[0].slice(1) : command[0]; + if (!executable.startsWith("/") || executable.includes("..")) throw new Error(`${path}[${index}][0] must be an absolute executable path, optionally prefixed with -`); + return command; + }); +} + function egressProxySpec(raw: Record, path: string): ControlPlaneEgressProxySpec { const mode = stringField(raw, "mode", path); if (mode !== "k8s-service-cluster-ip") throw new Error(`${path}.mode must be k8s-service-cluster-ip`); @@ -1298,6 +1369,7 @@ function planSummary(node: ControlPlaneNodeSpec, target: ControlPlaneTargetSpec) enabled: target.enabled, ciNamespace: target.ciNamespace, runtimeNamespace: target.runtimeNamespace, + k3sNodeConfig: k3sNodeConfigPlan(node), registry: node.registry.endpoint, egressProxy: node.egressProxy, sourceBranch: target.source.branch, @@ -1328,6 +1400,7 @@ function expectedSummary(node: ControlPlaneNodeSpec, target: ControlPlaneTargetS runtimePath: target.gitops.path, runtimeNamespace: target.runtimeNamespace, namespace: target.ciNamespace, + k3sNodeConfig: k3sNodeConfigPlan(node), gitMirror: { namespace: target.gitMirror.namespace, readUrl: target.gitMirror.readUrl, @@ -1366,10 +1439,43 @@ function expectedSummary(node: ControlPlaneNodeSpec, target: ControlPlaneTargetS }; } -function statusScript(target: ControlPlaneTargetSpec, registryEndpoint: string, toolsImage: string): string { +function k3sNodeConfigPlan(node: ControlPlaneNodeSpec): Record { + if (node.k3s === null) return { managed: false }; + const dropIn = k3sDropInContent(node.k3s); + return { + managed: true, + serviceName: node.k3s.serviceName, + dropInPath: node.k3s.dropInPath, + nodeStatusName: node.k3s.nodeStatusName, + desiredMaxPods: node.k3s.kubelet.maxPods, + dropInSha256: sha256Short(dropIn), + execStartPreCount: node.k3s.execStartPre.length, + serverArgCount: node.k3s.serverArgs.length, + }; +} + +function k3sDropInContent(spec: ControlPlaneK3sNodeSpec): string { + return [ + "# Managed by UniDesk. Source: config/hwlab-node-control-plane.yaml nodes..k3s", + "[Service]", + ...spec.execStartPre.map((command) => `ExecStartPre=${command.map(systemdExecArg).join(" ")}`), + "ExecStart=", + `ExecStart=${["/usr/local/bin/k3s", ...spec.serverArgs].map(systemdExecArg).join(" ")}`, + "", + ].join("\n"); +} + +function systemdExecArg(value: string): string { + if (/^[A-Za-z0-9_@%+=:,./-]+$/u.test(value)) return value; + return `"${value.replaceAll("\\", "\\\\").replaceAll("\"", "\\\"").replaceAll("$", "\\$").replaceAll("`", "\\`")}"`; +} + +function statusScript(nodeSpec: ControlPlaneNodeSpec, target: ControlPlaneTargetSpec): string { const requiredCrds = shellJsonArray(target.argo.install.requiredCrds); const argoDeployments = shellJsonArray(target.argo.install.expectedDeployments); const argoStatefulSets = shellJsonArray(target.argo.install.expectedStatefulSets); + const k3s = nodeSpec.k3s; + const k3sDropIn = k3s === null ? "" : k3sDropInContent(k3s); return ` set +e node=${shQuote(target.node)} @@ -1388,11 +1494,17 @@ service_account=${shQuote(target.tekton.serviceAccountName)} argo_ns=${shQuote(target.argo.namespace)} argo_project=${shQuote(target.argo.projectName)} argo_app=${shQuote(target.argo.applicationName)} -registry=${shQuote(registryEndpoint)} -tools_image=${shQuote(toolsImage)} +registry=${shQuote(nodeSpec.registry.endpoint)} +tools_image=${shQuote(target.tekton.toolsImage.output)} required_crds_json=${shQuote(requiredCrds)} argo_deployments_json=${shQuote(argoDeployments)} argo_statefulsets_json=${shQuote(argoStatefulSets)} +k3s_managed=${k3s === null ? "false" : "true"} +k3s_service=${shQuote(k3s?.serviceName ?? "")} +k3s_dropin=${shQuote(k3s?.dropInPath ?? "")} +k3s_node=${shQuote(k3s?.nodeStatusName ?? "")} +k3s_desired_max_pods=${shQuote(String(k3s?.kubelet.maxPods ?? ""))} +k3s_expected_sha=${shQuote(k3s === null ? "" : sha256Short(k3sDropIn))} exists_ns() { kubectl get ns "$1" >/dev/null 2>&1 && printf true || printf false; } exists_res() { kubectl -n "$1" get "$2" "$3" >/dev/null 2>&1 && printf true || printf false; } deploy_ready() { desired=$(kubectl -n "$1" get deploy "$2" -o 'jsonpath={.spec.replicas}' 2>/dev/null || true); ready=$(kubectl -n "$1" get deploy "$2" -o 'jsonpath={.status.readyReplicas}' 2>/dev/null || true); [ -n "$desired" ] && [ "$desired" -gt 0 ] 2>/dev/null && [ "\${ready:-0}" = "$desired" ] && printf true || printf false; } @@ -1407,6 +1519,64 @@ tools_image_ready=false if [ "$tools_repo" != "$tools_repo_tag" ] && command -v curl >/dev/null 2>&1; then curl -fsS --max-time 5 "http://$registry/v2/$tools_repo/manifests/$tools_tag" >/tmp/hwlab-tools-image.out 2>/tmp/hwlab-tools-image.err && tools_image_ready=true; fi cache_host_path_ready=false if [ -n "$cache_host_path" ] && kubectl -n "$gitmirror_ns" exec deploy/"$read_deploy" -- sh -lc 'test -d /cache' >/dev/null 2>&1; then cache_host_path_ready=true; fi +k3s_fragment=$(python3 - "$k3s_managed" "$k3s_service" "$k3s_dropin" "$k3s_node" "$k3s_desired_max_pods" "$k3s_expected_sha" <<'PY' +import hashlib, json, re, subprocess, sys +managed = sys.argv[1] == "true" +service, dropin, node_name, desired_raw, expected_sha = sys.argv[2:7] +def run(args): + return subprocess.run(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) +def to_int(value): + try: + return int(value) + except Exception: + return None +if not managed: + print(json.dumps({"managed": False, "ready": True})) + raise SystemExit(0) +desired = to_int(desired_raw) +node_json = run(["kubectl", "get", "node", node_name, "-o", "json"]) +capacity = None +allocatable = None +node_ready = False +if node_json.returncode == 0: + data = json.loads(node_json.stdout) + capacity = to_int(data.get("status", {}).get("capacity", {}).get("pods")) + allocatable = to_int(data.get("status", {}).get("allocatable", {}).get("pods")) + for condition in data.get("status", {}).get("conditions", []): + if condition.get("type") == "Ready": + node_ready = condition.get("status") == "True" +unit = run(["systemctl", "cat", service]) +unit_text = unit.stdout if unit.returncode == 0 else "" +dropin_read = run(["cat", dropin]) +dropin_exists = dropin_read.returncode == 0 +dropin_text = dropin_read.stdout if dropin_exists else "" +dropin_sha = "sha256:" + hashlib.sha256(dropin_text.encode()).hexdigest() if dropin_exists else None +matches = re.findall(r"max-pods=([0-9]+)", unit_text + "\\n" + dropin_text) +configured = to_int(matches[-1]) if matches else None +dropin_matches = dropin_sha == expected_sha +ready = dropin_matches and capacity == desired and allocatable == desired +source = "managed-dropin" if dropin_matches else ("systemd-or-config" if configured is not None else "kubelet-default") +print(json.dumps({ + "managed": True, + "ready": ready, + "serviceName": service, + "dropInPath": dropin, + "dropInExists": dropin_exists, + "dropInSha256": dropin_sha, + "expectedDropInSha256": expected_sha, + "dropInMatches": dropin_matches, + "configuredMaxPods": configured, + "desiredMaxPods": desired, + "liveNodeName": node_name, + "liveCapacityPods": capacity, + "liveAllocatablePods": allocatable, + "nodeReady": node_ready, + "restartRequired": not ready, + "source": source, + "unitReadable": unit.returncode == 0, +})) +PY +) python3 - "$required_crds_json" "$argo_deployments_json" "$argo_statefulsets_json" <<'PY' >/tmp/hwlab-node-status-fragments.json import json, subprocess, sys required_crds=json.loads(sys.argv[1]) @@ -1432,27 +1602,274 @@ print(json.dumps({"crds": crds, "deployments": deploy, "statefulSets": sts, "crd PY argo_fragment=$(cat /tmp/hwlab-node-status-fragments.json 2>/dev/null || printf '{}') cat </dev/null 2>&1 && printf true || printf false),"controllerReady":$(deploy_ready tekton-pipelines tekton-pipelines-controller),"webhookReady":$(deploy_ready tekton-pipelines tekton-pipelines-webhook)},"ciNamespace":{"name":"$ci_ns","exists":$(exists_ns "$ci_ns"),"serviceAccountExists":$(exists_res "$ci_ns" serviceaccount "$service_account"),"pipelineExists":$(exists_res "$ci_ns" pipeline "$pipeline")},"gitMirror":{"namespace":"$gitmirror_ns","namespaceExists":$(exists_ns "$gitmirror_ns"),"readDeploymentReady":$(deploy_ready "$gitmirror_ns" "$read_deploy"),"writeDeploymentReady":$(deploy_ready "$gitmirror_ns" "$write_deploy"),"readServiceExists":$(exists_res "$gitmirror_ns" service "$read_svc"),"writeServiceExists":$(exists_res "$gitmirror_ns" service "$write_svc"),"readEndpointsReady":$(endpoint_ready "$gitmirror_ns" "$read_svc"),"writeEndpointsReady":$(endpoint_ready "$gitmirror_ns" "$write_svc"),"cachePvcExists":$(exists_res "$gitmirror_ns" pvc "$cache_pvc"),"cacheHostPath":"$cache_host_path","cacheHostPathReady":$cache_host_path_ready,"summary":{"localSource":null,"githubSource":null,"localGitops":null,"githubGitops":null,"pendingFlush":null,"flushNeeded":null,"githubInSync":null}},"argo":{"namespace":"$argo_ns","namespaceExists":$(exists_ns "$argo_ns"),"installed":$(kubectl get crd applications.argoproj.io appprojects.argoproj.io >/dev/null 2>&1 && printf true || printf false),"projectExists":$(kubectl -n "$argo_ns" get appproject "$argo_project" >/dev/null 2>&1 && printf true || printf false),"applicationExists":$(kubectl -n "$argo_ns" get application "$argo_app" >/dev/null 2>&1 && printf true || printf false),"install":$argo_fragment},"registry":{"endpoint":"$registry","ready":$registry_ready,"toolsImage":"$tools_image","toolsImageReady":$tools_image_ready},"runtimeNamespace":{"name":"$runtime_ns","exists":$(exists_ns "$runtime_ns")}}} +{"observedAt":"$(date -u +%Y-%m-%dT%H:%M:%SZ)","node":"$node","lane":"$lane","components":{"k3sNodeConfig":$k3s_fragment,"tekton":{"installed":$(kubectl get crd pipelines.tekton.dev pipelineruns.tekton.dev >/dev/null 2>&1 && printf true || printf false),"controllerReady":$(deploy_ready tekton-pipelines tekton-pipelines-controller),"webhookReady":$(deploy_ready tekton-pipelines tekton-pipelines-webhook)},"ciNamespace":{"name":"$ci_ns","exists":$(exists_ns "$ci_ns"),"serviceAccountExists":$(exists_res "$ci_ns" serviceaccount "$service_account"),"pipelineExists":$(exists_res "$ci_ns" pipeline "$pipeline")},"gitMirror":{"namespace":"$gitmirror_ns","namespaceExists":$(exists_ns "$gitmirror_ns"),"readDeploymentReady":$(deploy_ready "$gitmirror_ns" "$read_deploy"),"writeDeploymentReady":$(deploy_ready "$gitmirror_ns" "$write_deploy"),"readServiceExists":$(exists_res "$gitmirror_ns" service "$read_svc"),"writeServiceExists":$(exists_res "$gitmirror_ns" service "$write_svc"),"readEndpointsReady":$(endpoint_ready "$gitmirror_ns" "$read_svc"),"writeEndpointsReady":$(endpoint_ready "$gitmirror_ns" "$write_svc"),"cachePvcExists":$(exists_res "$gitmirror_ns" pvc "$cache_pvc"),"cacheHostPath":"$cache_host_path","cacheHostPathReady":$cache_host_path_ready,"summary":{"localSource":null,"githubSource":null,"localGitops":null,"githubGitops":null,"pendingFlush":null,"flushNeeded":null,"githubInSync":null}},"argo":{"namespace":"$argo_ns","namespaceExists":$(exists_ns "$argo_ns"),"installed":$(kubectl get crd applications.argoproj.io appprojects.argoproj.io >/dev/null 2>&1 && printf true || printf false),"projectExists":$(kubectl -n "$argo_ns" get appproject "$argo_project" >/dev/null 2>&1 && printf true || printf false),"applicationExists":$(kubectl -n "$argo_ns" get application "$argo_app" >/dev/null 2>&1 && printf true || printf false),"install":$argo_fragment},"registry":{"endpoint":"$registry","ready":$registry_ready,"toolsImage":"$tools_image","toolsImageReady":$tools_image_ready},"runtimeNamespace":{"name":"$runtime_ns","exists":$(exists_ns "$runtime_ns")}}} JSON `; } -function applyScript(yaml: string): string { +function applyScript(yaml: string, node: ControlPlaneNodeSpec, target: ControlPlaneTargetSpec): string { const encoded = Buffer.from(yaml, "utf8").toString("base64"); return ` set +e manifest=$(mktemp /tmp/hwlab-node-infra.XXXXXX.yaml) printf %s ${shQuote(encoded)} | base64 -d >"$manifest" kubectl apply --server-side --field-manager=unidesk-hwlab-node-control-plane -f "$manifest" >/tmp/hwlab-node-infra-apply.out 2>/tmp/hwlab-node-infra-apply.err -rc=$? -python3 - "$rc" <<'PY' +kubectl_rc=$? +${k3sApplyScriptFragment(node.k3s, target)} +python3 - "$kubectl_rc" "$k3s_report_file" <<'PY' import json, pathlib, sys +k3s_report = {} +try: + k3s_report = json.loads(pathlib.Path(sys.argv[2]).read_text(errors='replace')) +except Exception as exc: + k3s_report = {"managed": None, "ok": False, "parseError": str(exc)} out=pathlib.Path('/tmp/hwlab-node-infra-apply.out').read_text(errors='replace') if pathlib.Path('/tmp/hwlab-node-infra-apply.out').exists() else '' err=pathlib.Path('/tmp/hwlab-node-infra-apply.err').read_text(errors='replace') if pathlib.Path('/tmp/hwlab-node-infra-apply.err').exists() else '' -print(json.dumps({'applyExitCode': int(sys.argv[1]), 'stdoutPreview': out[-2000:], 'stderrPreview': err[-2000:], 'runtimeRolloutTriggered': False, 'pk01Touched': False}, ensure_ascii=False)) +print(json.dumps({'k3sNodeConfig': k3s_report, 'kubernetesApply': {'applyExitCode': int(sys.argv[1]), 'stdoutPreview': out[-2000:], 'stderrPreview': err[-2000:], 'runtimeRolloutTriggered': False, 'pk01Touched': False}}, ensure_ascii=False)) PY rm -f "$manifest" -exit "$rc" +if [ "$kubectl_rc" != 0 ]; then exit "$kubectl_rc"; fi +exit "$k3s_rc" +`; +} + +function k3sApplyScriptFragment(spec: ControlPlaneK3sNodeSpec | null, target: ControlPlaneTargetSpec): string { + if (spec === null) { + return ` +k3s_report_file=$(mktemp /tmp/hwlab-node-k3s.XXXXXX.json) +printf '{"managed":false,"ok":true,"mutation":false}\\n' >"$k3s_report_file" +k3s_rc=0 +`; + } + const content = k3sDropInContent(spec); + const encoded = Buffer.from(content, "utf8").toString("base64"); + return ` +k3s_report_file=$(mktemp /tmp/hwlab-node-k3s.XXXXXX.json) +k3s_service=${shQuote(spec.serviceName)} +k3s_dropin=${shQuote(spec.dropInPath)} +k3s_node=${shQuote(spec.nodeStatusName)} +k3s_namespace=${shQuote(target.ciNamespace)} +k3s_image=${shQuote(target.tekton.toolsImage.output)} +k3s_desired_max_pods=${shQuote(String(spec.kubelet.maxPods))} +k3s_expected_sha=${shQuote(sha256Short(content))} +k3s_before_capacity=$(kubectl get node "$k3s_node" -o 'jsonpath={.status.capacity.pods}' 2>/dev/null || true) +k3s_before_allocatable=$(kubectl get node "$k3s_node" -o 'jsonpath={.status.allocatable.pods}' 2>/dev/null || true) +capacity_restart=false +if [ "$k3s_before_capacity" != "$k3s_desired_max_pods" ] || [ "$k3s_before_allocatable" != "$k3s_desired_max_pods" ]; then capacity_restart=true; fi +k3s_current_dropin_sha= +if [ -f "$k3s_dropin" ]; then k3s_current_dropin_sha=$(sha256sum "$k3s_dropin" | awk '{print "sha256:"$1}'); fi +if [ "$k3s_current_dropin_sha" = "$k3s_expected_sha" ] && [ "$capacity_restart" = false ]; then + python3 - "$k3s_current_dropin_sha" "$k3s_expected_sha" "$k3s_service" "$k3s_dropin" "$k3s_node" "$k3s_desired_max_pods" "$k3s_before_capacity" "$k3s_before_allocatable" <<'PY' >"$k3s_report_file" +import json, sys +dropin_sha, expected_sha, service, dropin, node_name, desired, before_capacity, before_allocatable = sys.argv[1:9] +print(json.dumps({ + "managed": True, + "ok": True, + "mutation": False, + "applyMode": "noop", + "completionPending": False, + "serviceName": service, + "dropInPath": dropin, + "dropInSha256": dropin_sha, + "expectedDropInSha256": expected_sha, + "dropInMatches": dropin_sha == expected_sha, + "nodeName": node_name, + "desiredMaxPods": int(desired), + "beforeCapacityPods": int(before_capacity) if before_capacity.isdigit() else None, + "beforeAllocatablePods": int(before_allocatable) if before_allocatable.isdigit() else None, +}, ensure_ascii=False)) +PY + k3s_rc=0 +else +k3s_job="hwlab-node-k3s-config-$(date +%s)" +k3s_job_manifest=$(mktemp /tmp/hwlab-node-k3s-job.XXXXXX.json) +k3s_host_script=$(mktemp /tmp/hwlab-node-k3s-host.XXXXXX.sh) +k3s_job_apply_stdout=/tmp/hwlab-node-k3s-job-apply.out +k3s_job_apply_stderr=/tmp/hwlab-node-k3s-job-apply.err +k3s_docker_stdout=/tmp/hwlab-node-k3s-docker.out +k3s_docker_stderr=/tmp/hwlab-node-k3s-docker.err +k3s_host_report="/tmp/$k3s_job-report.json" +rm -f "$k3s_host_report" +python3 - "$k3s_job_manifest" "$k3s_host_script" "$k3s_job" "$k3s_namespace" "$k3s_image" "$k3s_dropin" ${shQuote(encoded)} "$k3s_service" "$k3s_desired_max_pods" "$k3s_expected_sha" "$capacity_restart" "$k3s_host_report" <<'PY' +import json, os, shlex, sys +manifest_path, host_script_path, job, namespace, image, dropin, encoded, service, desired, expected_sha, capacity_restart, report_path = sys.argv[1:13] +script = f"""#!/bin/sh +set -eu +expected=/tmp/unidesk-k3s-dropin.conf +printf %s {shlex.quote(encoded)} | base64 -d > "$expected" +host_dropin=/host{shlex.quote(dropin)} +host_report=/host{shlex.quote(report_path)} +mkdir -p "$(dirname "$host_dropin")" +before_sha= +if [ -f "$host_dropin" ]; then before_sha=$(sha256sum "$host_dropin" | awk '{{print "sha256:"$1}}'); fi +changed=false +if ! cmp -s "$expected" "$host_dropin" 2>/dev/null; then + cp "$expected" "$host_dropin" + chown 0:0 "$host_dropin" 2>/dev/null || true + chmod 0644 "$host_dropin" + changed=true +fi +nsenter_path=$(command -v nsenter || true) +host_systemctl() {{ + if command -v chroot >/dev/null 2>&1 && [ -x /host/usr/bin/systemctl ]; then + chroot /host /usr/bin/systemctl "$@" + return $? + fi + if [ -n "$nsenter_path" ]; then + "$nsenter_path" -t 1 -m -u -i -n -p -- /usr/bin/systemctl "$@" + return $? + fi + return 127 +}} +daemon_reload_rc=0 +restart_rc=0 +restarted=false +if command -v chroot >/dev/null 2>&1 || [ -n "$nsenter_path" ]; then + host_systemctl daemon-reload || daemon_reload_rc=$? + if [ "$changed" = true ] || [ {shlex.quote(capacity_restart)} = true ]; then + restarted=true + host_systemctl restart {shlex.quote(service)} || restart_rc=$? + fi +else + daemon_reload_rc=127 + restart_rc=127 +fi +after_sha= +if [ -f "$host_dropin" ]; then after_sha=$(sha256sum "$host_dropin" | awk '{{print "sha256:"$1}}'); fi +service_active=unknown +if command -v chroot >/dev/null 2>&1 || [ -n "$nsenter_path" ]; then service_active=$(host_systemctl is-active {shlex.quote(service)} 2>/dev/null || true); fi +python3 - "$changed" "$restarted" "$daemon_reload_rc" "$restart_rc" "$before_sha" "$after_sha" "$service_active" "$nsenter_path" <<'REPORT' >"$host_report" +import json, sys +changed, restarted = sys.argv[1] == "true", sys.argv[2] == "true" +daemon_reload_rc, restart_rc = int(sys.argv[3] or "0"), int(sys.argv[4] or "0") +print(json.dumps({{ + "jobChanged": changed, + "jobRestarted": restarted, + "daemonReloadExitCode": daemon_reload_rc, + "restartExitCode": restart_rc, + "beforeDropInSha256": sys.argv[5] or None, + "dropInSha256": sys.argv[6] or None, + "expectedDropInSha256": {json.dumps(expected_sha)}, + "dropInMatches": sys.argv[6] == {json.dumps(expected_sha)}, + "serviceActiveText": sys.argv[7] or None, + "nsenterPresent": bool(sys.argv[8]), +}})) +REPORT +chmod 0644 "$host_report" 2>/dev/null || true +cat "$host_report" +""" +with open(host_script_path, "w", encoding="utf-8") as handle: + handle.write(script) +os.chmod(host_script_path, 0o755) +manifest = { + "apiVersion": "batch/v1", + "kind": "Job", + "metadata": {"name": job, "namespace": namespace, "labels": {"app.kubernetes.io/part-of": "hwlab-node-control-plane", "unidesk.ai/operation": "k3s-node-config"}}, + "spec": { + "backoffLimit": 0, + "ttlSecondsAfterFinished": 300, + "template": { + "metadata": {"labels": {"app.kubernetes.io/part-of": "hwlab-node-control-plane", "unidesk.ai/operation": "k3s-node-config"}}, + "spec": { + "restartPolicy": "Never", + "hostPID": True, + "hostNetwork": True, + "containers": [{ + "name": "apply-k3s-node-config", + "image": image, + "imagePullPolicy": "IfNotPresent", + "securityContext": {"privileged": True}, + "command": ["/bin/sh", "-lc", script], + "volumeMounts": [{"name": "host-root", "mountPath": "/host"}], + }], + "volumes": [{"name": "host-root", "hostPath": {"path": "/", "type": "Directory"}}], + }, + }, + }, +} +with open(manifest_path, "w", encoding="utf-8") as handle: + json.dump(manifest, handle) +PY +k3s_render_rc=$? +if [ "$k3s_render_rc" != 0 ]; then + python3 - "$k3s_render_rc" "$k3s_expected_sha" "$k3s_service" "$k3s_dropin" "$k3s_node" "$k3s_desired_max_pods" <<'PY' >"$k3s_report_file" +import json, sys +render_rc = int(sys.argv[1] or "1") +expected_sha, service, dropin, node_name, desired = sys.argv[2:7] +print(json.dumps({ + "managed": True, + "ok": False, + "mutation": False, + "renderExitCode": render_rc, + "serviceName": service, + "dropInPath": dropin, + "expectedDropInSha256": expected_sha, + "nodeName": node_name, + "desiredMaxPods": int(desired), +}, ensure_ascii=False)) +PY + k3s_rc=$k3s_render_rc +else +kubectl apply -f "$k3s_job_manifest" >"$k3s_job_apply_stdout" 2>"$k3s_job_apply_stderr" +k3s_job_apply_rc=$? +k3s_apply_mode=kubernetes-job +k3s_docker_rc=127 +if [ "$k3s_job_apply_rc" != 0 ] && command -v docker >/dev/null 2>&1; then + k3s_apply_mode=docker-host-fallback + docker run --rm --privileged --pid=host --network=host -v /:/host --entrypoint /bin/sh "$k3s_image" "/host$k3s_host_script" >"$k3s_docker_stdout" 2>"$k3s_docker_stderr" + k3s_docker_rc=$? +fi +k3s_submit_rc=$k3s_job_apply_rc +if [ "$k3s_job_apply_rc" != 0 ] && [ "$k3s_docker_rc" = 0 ]; then k3s_submit_rc=0; fi +python3 - "$k3s_submit_rc" "$k3s_job_apply_rc" "$k3s_docker_rc" "$k3s_apply_mode" "$k3s_before_capacity" "$k3s_before_allocatable" "$k3s_expected_sha" "$k3s_service" "$k3s_dropin" "$k3s_node" "$k3s_desired_max_pods" "$k3s_job" "$k3s_namespace" "$k3s_host_report" "$k3s_job_apply_stdout" "$k3s_job_apply_stderr" "$k3s_docker_stdout" "$k3s_docker_stderr" <<'PY' >"$k3s_report_file" +import json, pathlib, sys +submit_rc, job_apply_rc, docker_rc = [int(value or "0") for value in sys.argv[1:4]] +apply_mode = sys.argv[4] +before_capacity, before_allocatable = sys.argv[5:7] +expected_sha, service, dropin, node_name, desired, job_name, namespace, host_report = sys.argv[7:15] +def read(path): + return pathlib.Path(path).read_text(errors='replace') if pathlib.Path(path).exists() else '' +try: + host_report_data = json.loads(read(host_report) or "{}") +except Exception: + host_report_data = {} +apply_ok = submit_rc == 0 +print(json.dumps({ + "managed": True, + "ok": apply_ok, + "mutation": apply_ok, + "completionPending": apply_ok and apply_mode == "kubernetes-job", + "applyMode": apply_mode, + "jobName": job_name, + "namespace": namespace, + "jobApplyExitCode": job_apply_rc, + "dockerFallbackExitCode": docker_rc, + "serviceName": service, + "dropInPath": dropin, + "dropInSha256": host_report_data.get("dropInSha256"), + "expectedDropInSha256": expected_sha, + "dropInMatches": host_report_data.get("dropInSha256") == expected_sha if host_report_data else None, + "daemonReloadExitCode": host_report_data.get("daemonReloadExitCode"), + "restartExitCode": host_report_data.get("restartExitCode"), + "serviceActive": host_report_data.get("serviceActiveText") == "active" if host_report_data else None, + "nodeName": node_name, + "desiredMaxPods": int(desired), + "beforeCapacityPods": int(before_capacity) if before_capacity.isdigit() else None, + "beforeAllocatablePods": int(before_allocatable) if before_allocatable.isdigit() else None, + "hostReportPath": host_report, + "statusCommand": f"bun scripts/cli.ts hwlab nodes control-plane infra status --node {node_name.upper()} --lane ${target.lane}", + "jobCompletionCommand": f"kubectl -n {namespace} wait --for=condition=complete job/{job_name} --timeout=120s", + "jobLogsCommand": f"kubectl -n {namespace} logs job/{job_name} --tail=120", + "jobApplyStdoutPreview": read(sys.argv[15])[-1000:], + "jobApplyStderrPreview": read(sys.argv[16])[-1000:], + "dockerStdoutPreview": read(sys.argv[17])[-1000:], + "dockerStderrPreview": read(sys.argv[18])[-1000:], +}, ensure_ascii=False)) +PY +k3s_rc=$k3s_submit_rc +fi +rm -f "$k3s_job_manifest" "$k3s_host_script" +fi `; } @@ -1499,6 +1916,7 @@ function statusNext( gitMirror: Record, argo: Record, ciNamespace: Record, + k3sNodeConfig: Record, ): Record { const bootstrapMissing = !boolField(ciNamespace, "exists") || !boolField(gitMirror, "namespaceExists") @@ -1506,6 +1924,7 @@ function statusNext( || !boolField(gitMirror, "writeServiceExists") || (!boolField(gitMirror, "cachePvcExists") && !boolField(gitMirror, "cacheHostPathReady")); const blockers: string[] = []; + if (node.k3s !== null && !boolField(k3sNodeConfig, "ready")) blockers.push("k3s-node-config-not-applied"); if (!boolField(registry, "ready")) blockers.push("node-local-registry-not-ready"); if (!boolField(registry, "toolsImageReady")) blockers.push("tools-image-missing"); if (bootstrapMissing) blockers.push("control-plane-bootstrap-missing"); @@ -1530,6 +1949,9 @@ function statusNext( if (!boolField(argo, "installed")) { next.installArgo = "准备受控 D601 Argo CD 安装入口后再进入 runtime rollout。"; } + if (node.k3s !== null && !boolField(k3sNodeConfig, "ready")) { + next.applyK3sNodeConfig = `bun scripts/cli.ts hwlab nodes control-plane infra apply --node ${node.id} --lane ${target.lane} --confirm`; + } if (bootstrapMissing) next.applyBootstrap = `bun scripts/cli.ts hwlab nodes control-plane infra apply --node ${node.id} --lane ${target.lane} --confirm`; else next.reapplyBootstrap = `bun scripts/cli.ts hwlab nodes control-plane infra apply --node ${node.id} --lane ${target.lane} --confirm`; return next; @@ -1900,6 +2322,10 @@ function boolField(obj: Record, key: string): boolean { return obj[key] === true; } +function numberValue(value: unknown): number | null { + return typeof value === "number" && Number.isFinite(value) ? value : null; +} + function requiredOption(args: string[], name: string): string { const index = args.indexOf(name); if (index === -1) throw new Error(`${name} is required`);