fix: manage D601 k3s pod capacity via YAML
This commit is contained in:
@@ -4,6 +4,7 @@ metadata:
|
||||
owner: unidesk
|
||||
relatedIssues:
|
||||
- 290
|
||||
- 491
|
||||
- 1119
|
||||
imagePolicy:
|
||||
requireReproducibleBuildSource: true
|
||||
@@ -16,6 +17,41 @@ nodes:
|
||||
D601:
|
||||
route: D601
|
||||
kubeRoute: D601:k3s
|
||||
k3s:
|
||||
serviceName: k3s
|
||||
dropInPath: /etc/systemd/system/k3s.service.d/20-unidesk-node-config.conf
|
||||
nodeStatusName: d601
|
||||
execStartPre:
|
||||
- - -/usr/bin/umount
|
||||
- /Docker/host
|
||||
serverArgs:
|
||||
- server
|
||||
- --disable
|
||||
- traefik
|
||||
- --disable
|
||||
- servicelb
|
||||
- --disable
|
||||
- metrics-server
|
||||
- --node-name
|
||||
- D601
|
||||
- --node-label
|
||||
- unidesk.ai/node-id=D601
|
||||
- --node-label
|
||||
- unidesk.ai/provider-id=D601
|
||||
- --tls-san
|
||||
- 127.0.0.1
|
||||
- --tls-san
|
||||
- host.docker.internal
|
||||
- --write-kubeconfig-mode
|
||||
- "644"
|
||||
- --kubelet-arg
|
||||
- image-gc-high-threshold=95
|
||||
- --kubelet-arg
|
||||
- image-gc-low-threshold=90
|
||||
- --kubelet-arg
|
||||
- max-pods=500
|
||||
kubelet:
|
||||
maxPods: 500
|
||||
registry:
|
||||
endpoint: 127.0.0.1:5000
|
||||
egressProxy:
|
||||
|
||||
@@ -24,7 +24,7 @@ G14/D601 v03 的 bootstrap admin password 是 HWLAB runtime Secret 生命周期
|
||||
|
||||
`hwlab nodes web-probe run|script --node <node> --lane <lane>` 是 HWLAB Cloud Web 线上 DOM/Playwright 验收的受控入口;CLI 负责从 YAML 解析 workspace、public URL 和 bootstrap admin sourceRef,并只输出 redacted 凭据状态、artifact path/hash、readiness、`probe.summary` 和失败分类。`run` 使用 repo-owned 标准 DOM probe;`script` 不运行默认探针,必须通过 stdin heredoc 或 `--script-file <path>` 提供调用者脚本。`run --message ...` 未显式设置 trace 参数时会做轻量 trace 采样,`script` helper 可用 `recordStep` / `safeFetchJson` / `fetchApiMatrix` 保留失败前的结构化 partial evidence,完整 redacted 报告通过 `reportPath`/`reportSha256` 展开。具体 Web 开发、fake-server Playwright、fixture 脱敏、`web-probe script` helper、截图和 Workbench/Performance 判定口径统一见 `$unidesk-webdev`,本 CLI 参考不再维护第二套操作面。
|
||||
|
||||
`hwlab nodes control-plane infra plan|status|apply --node D601 --lane v03` 是 D601 HWLAB v03 节点本地 CI/CD 与 git-mirror 前置控制面的 YAML 驱动入口,配置真相源是 `config/hwlab-node-control-plane.yaml`。`plan` 只读展示 YAML target 和将渲染的 control-plane 对象;`status` 只读观察 D601 Tekton、CI namespace、git-mirror、Argo、node-local registry 和 tools image readiness;`apply --dry-run` 只输出 manifest 摘要;`apply --confirm` 只收敛 D601 control-plane bootstrap 对象,不触发 HWLAB runtime rollout,不创建 PK01 DB,也不修改 Caddy/FRP。tools image 的 node-local registry 地址只能作为输出 artifact;输入 base image 必须由 YAML 声明为公开 registry 来源,缺少 output image 时应在 `status.next.blockers` 中体现,而不是把现有 node-local image 当成输入基础镜像。
|
||||
`hwlab nodes control-plane infra plan|status|apply --node D601 --lane v03` 是 D601 HWLAB v03 节点本地 k3s、CI/CD 与 git-mirror 前置控制面的 YAML 驱动入口,配置真相源是 `config/hwlab-node-control-plane.yaml`。`plan` 只读展示 YAML target、host k3s node config 摘要和将渲染的 control-plane 对象;`status` 只读观察 k3s systemd drop-in 与 node `capacity/allocatable.pods`、D601 Tekton、CI namespace、git-mirror、Argo、node-local registry 和 tools image readiness;`apply --dry-run` 只输出 manifest 与 host config 摘要;`apply --confirm` 按 YAML 收敛 D601 host k3s drop-in 和 control-plane bootstrap 对象,只有 host k3s 配置或 live pod capacity 未收敛时才重启 k3s,不触发 HWLAB runtime rollout,不创建 PK01 DB,也不修改 Caddy/FRP。D601 host 侧 k3s pre-start 修正也必须写成 YAML `execStartPre` argv,不做手工 systemd 热改;当 kube API 已不可用时,`apply` 可用同一 YAML 渲染出的 host 脚本经 node-local tools image/Docker fallback 恢复 systemd drop-in,输出仍只给对象名、SHA、exit code 和摘要。k3s pod capacity 等可调数值只以 YAML 为准,长期参考不复制具体数值;tools image 的 node-local registry 地址只能作为输出 artifact,输入 base image 必须由 YAML 声明为公开 registry 来源,缺少 output image 时应在 `status.next.blockers` 中体现,而不是把现有 node-local image 当成输入基础镜像。
|
||||
|
||||
`hwlab nodes git-mirror status|sync|flush --node <node> --lane <lane>` 是 node-scoped runtime lane 的 Git mirror 维护入口。`status` 的 `githubSource` / `githubGitops` 来自本地 mirror cache 的 `refs/mirror-stage/...`,不是实时 GitHub API;输出中的 `refSources.githubFieldsAreMirrorStageCache=true` 和 `refSources.cacheRefresh` 给出这一来源和刷新命令。`sync --confirm --wait` 的 k3s Job 遇到 GitHub SSH transient 时,应通过目标 workspace fallback 拉取 GitHub source/gitops 并写回 node-local mirror,输出只披露 commit、mirror write URL 和 fallback 状态。`flush --confirm --wait` 如果已经把 GitOps ref push 到 GitHub,但 post-push fetch/recheck 因 transient SSH 失败而无法刷新 mirror-stage,会标记 `partialSuccess=push-succeeded-fetch-failed`;CLI 应自动执行一次受控 sync 刷新 mirror-stage,若恢复后 `pendingFlush=false` 且 `githubInSync=true`,结果应为 `ok=true` 并输出 `partialSuccessRecovered` / `postPushRecovery`,否则才保留 `degradedReason=node-runtime-git-mirror-flush-post-push-fetch-failed` 和下一步 `sync --confirm --wait`。不要把这种 partial success 解读为需要连续盲目 flush。`hwlab nodes control-plane trigger-current --node <node> --lane <lane> --confirm --wait` 会在 source sync 后自动执行必要的 pre-flush,在 PipelineRun terminal 后自动执行必要的 post-flush;progress 事件必须显式输出 `git-mirror-pre-flush` / `git-mirror-post-flush` 的 executed/skipped、jobName、local/github source、local/github GitOps、`pendingFlush` 和 `githubInSync`,且已恢复的 partial success 不能让顶层 trigger-current false-fail。`control-plane status` 仍是只读入口,只暴露 compact `gitMirror` 摘要和下一步 flush 命令,不隐式执行写操作。
|
||||
|
||||
|
||||
+1
-1
@@ -59,7 +59,7 @@ export function rootHelp(): unknown {
|
||||
{ command: "gh preflight|auth|issue|pr", description: "Run safe GitHub issue and PR CRUD/lifecycle operations through REST with body-file update replace/append, issue/comment apply_patch body patching, comment delete, token diagnostics, PR closeout preflight, hard delete unsupported, and guarded PR merge." },
|
||||
{ command: "git github-push-fallback [--repo owner/name] [--branch branch] [--host-name host-or-ip] [--confirm]", description: "Plan or execute a one-shot GitHub push through ssh.github.com:443 without editing remotes; use only for reviewed DNS/port-22 push fallback." },
|
||||
{ command: "commander contract|plan --dry-run|smoke --dry-run|approval request --dry-run", description: "Host Codex commander skeleton contract, no-daemon smoke plan, and dry-run approval preview without live bridges or message sends." },
|
||||
{ command: "hwlab nodes control-plane|git-mirror|secret|test-accounts|web-probe --node <node> --lane <lane>", description: "Manage HWLAB node/lane runtime prerequisites, including D601 YAML-declared infra/tools-image/Argo bootstrap, redacted test-account preparation, Web DOM probe credential injection, and G14 v0.3+ runtime lanes, with the node identity passed as data." },
|
||||
{ command: "hwlab nodes control-plane|git-mirror|secret|test-accounts|web-probe --node <node> --lane <lane>", description: "Manage HWLAB node/lane runtime prerequisites, including D601 YAML-declared k3s infra/tools-image/Argo bootstrap, redacted test-account preparation, Web DOM probe credential injection, and G14 v0.3+ runtime lanes, with the node identity passed as data." },
|
||||
{ command: "hwlab g14 monitor-prs | hwlab g14 control-plane status|apply|trigger-current|runtime-migration|cleanup-runs|cleanup-released-pvs | hwlab g14 git-mirror status|apply|sync|flush | hwlab g14 tools-image status|build", description: "Start the legacy G14 PR monitor, run bounded v0.2 Tekton/Argo control-plane, manual PipelineRun trigger, runtime migration, CI workspace retention, manual devops-infra git mirror/relay maintenance, or fixed HWLAB CI tools image actions; long confirmed trigger/sync/flush actions return async jobs by default." },
|
||||
{ command: "agentrun get|describe|events|logs|result|ack|cancel|dispatch|create|apply|send|control-plane|git-mirror", description: "Use AgentRun v0.1 resource primitives with low-noise human output by default; session follow-up uses send only and the server decides internal steer vs turn." },
|
||||
{ command: "platform-infra sub2api|langbot|n8n|wechat-archive ...", description: "Deploy platform-infra services such as Sub2API, LangBot and n8n, manage YAML-controlled public FRP/Caddy exposure and WeChat archive workflows, and inspect status/logs without printing secrets." },
|
||||
|
||||
@@ -50,10 +50,20 @@ interface ControlPlaneNodeSpec {
|
||||
id: string;
|
||||
route: string;
|
||||
kubeRoute: string;
|
||||
k3s: ControlPlaneK3sNodeSpec | null;
|
||||
registry: { endpoint: string };
|
||||
egressProxy: ControlPlaneEgressProxySpec | null;
|
||||
}
|
||||
|
||||
/** Host-side k3s settings declared for a single node in the control-plane YAML. */
interface ControlPlaneK3sNodeSpec {
  /** Name of the systemd unit that runs k3s on the host. */
  serviceName: string;
  /** Path of the systemd drop-in file that carries the managed settings. */
  dropInPath: string;
  /** Node name used when the live Kubernetes node object is queried. */
  nodeStatusName: string;
  /** Pre-start commands; each inner array is the argv of one `ExecStartPre=` line. */
  execStartPre: ReadonlyArray<ReadonlyArray<string>>;
  /** Arguments rendered after the k3s binary on the `ExecStart=` line. */
  serverArgs: ReadonlyArray<string>;
  /** Kubelet settings; `maxPods` is the desired pod capacity for the node. */
  kubelet: { maxPods: number };
}
|
||||
|
||||
interface DockerfileInlineSpec {
|
||||
filename: string;
|
||||
lines: readonly string[];
|
||||
@@ -180,7 +190,7 @@ export function hwlabNodeControlPlaneInfraHelp(): Record<string, unknown> {
|
||||
ok: true,
|
||||
command: "hwlab nodes control-plane infra",
|
||||
configPath: HWLAB_NODE_CONTROL_PLANE_CONFIG_PATH,
|
||||
description: "Plan/status/apply YAML-controlled HWLAB node-local CI/CD and git-mirror control-plane prerequisites. Cross-node PK01/Caddy/FRP/runtime rollout remains explicit semi-automatic CLI work.",
|
||||
description: "Plan/status/apply YAML-controlled HWLAB node-local k3s, CI/CD and git-mirror control-plane prerequisites. Cross-node PK01/Caddy/FRP/runtime rollout remains explicit semi-automatic CLI work.",
|
||||
usage: [
|
||||
"bun scripts/cli.ts hwlab nodes control-plane infra plan --node D601 --lane v03",
|
||||
"bun scripts/cli.ts hwlab nodes control-plane infra status --node D601 --lane v03",
|
||||
@@ -210,6 +220,7 @@ function infraPlan(_config: ControlPlaneConfig, node: ControlPlaneNodeSpec, targ
|
||||
mutation: false,
|
||||
target: planSummary(node, target),
|
||||
expected: expectedSummary(node, target),
|
||||
hostConfig: k3sNodeConfigPlan(node),
|
||||
imagePolicy: _config.imagePolicy,
|
||||
g14Consistency: {
|
||||
laneVocabulary: ["sourceBranch", "gitopsBranch", "catalogPath", "runtime.path", "runtime.namespace", "tekton.pipeline", "pipelineRunPrefix", "argo.application"],
|
||||
@@ -229,7 +240,7 @@ function infraPlan(_config: ControlPlaneConfig, node: ControlPlaneNodeSpec, targ
|
||||
}
|
||||
|
||||
function infraStatus(_config: ControlPlaneConfig, node: ControlPlaneNodeSpec, target: ControlPlaneTargetSpec, options: InfraOptions): Record<string, unknown> {
|
||||
const script = statusScript(target, node.registry.endpoint, target.tekton.toolsImage.output);
|
||||
const script = statusScript(node, target);
|
||||
const result = runTransK3s(node.kubeRoute, script, options.timeoutSeconds);
|
||||
const parsed = parseRemoteJson(result.stdout);
|
||||
const status = typeof parsed === "object" && parsed !== null ? parsed as Record<string, unknown> : { parseError: "remote status did not return a JSON object", stdoutPreview: result.stdout.slice(0, 1000) };
|
||||
@@ -240,7 +251,13 @@ function infraStatus(_config: ControlPlaneConfig, node: ControlPlaneNodeSpec, ta
|
||||
const tekton = record(components.tekton);
|
||||
const ciNamespace = record(components.ciNamespace);
|
||||
const registry = record(components.registry);
|
||||
const k3sNodeConfig = record(components.k3sNodeConfig);
|
||||
const k3sNodeConfigReady = node.k3s === null
|
||||
|| (boolField(k3sNodeConfig, "dropInMatches")
|
||||
&& numberValue(k3sNodeConfig.liveCapacityPods) === node.k3s.kubelet.maxPods
|
||||
&& numberValue(k3sNodeConfig.liveAllocatablePods) === node.k3s.kubelet.maxPods);
|
||||
const ok = result.exitCode === 0
|
||||
&& k3sNodeConfigReady
|
||||
&& boolField(tekton, "installed")
|
||||
&& boolField(ciNamespace, "exists")
|
||||
&& boolField(gitMirror, "namespaceExists")
|
||||
@@ -267,6 +284,7 @@ function infraStatus(_config: ControlPlaneConfig, node: ControlPlaneNodeSpec, ta
|
||||
status,
|
||||
readiness: {
|
||||
ok,
|
||||
k3sNodeConfigReady,
|
||||
tektonInstalled: boolField(tekton, "installed"),
|
||||
ciNamespaceExists: boolField(ciNamespace, "exists"),
|
||||
gitMirrorNamespaceExists: boolField(gitMirror, "namespaceExists"),
|
||||
@@ -286,7 +304,7 @@ function infraStatus(_config: ControlPlaneConfig, node: ControlPlaneNodeSpec, ta
|
||||
toolsImageReady: boolField(registry, "toolsImageReady"),
|
||||
},
|
||||
result: compactCommandResult(result),
|
||||
next: ok ? { runtimePreparation: `bun scripts/cli.ts hwlab nodes control-plane plan --node ${node.id} --lane ${target.lane}` } : statusNext(node, target, registry, gitMirror, argo, ciNamespace),
|
||||
next: ok ? { runtimePreparation: `bun scripts/cli.ts hwlab nodes control-plane plan --node ${node.id} --lane ${target.lane}` } : statusNext(node, target, registry, gitMirror, argo, ciNamespace, k3sNodeConfig),
|
||||
};
|
||||
}
|
||||
|
||||
@@ -306,6 +324,7 @@ function infraApply(_config: ControlPlaneConfig, node: ControlPlaneNodeSpec, tar
|
||||
mode: "dry-run",
|
||||
mutation: false,
|
||||
expected: expectedSummary(node, target),
|
||||
hostConfig: k3sNodeConfigPlan(node),
|
||||
preflight: {
|
||||
registryReady: imageStatus.registryReady,
|
||||
toolsImageReady: imageStatus.toolsImageReady,
|
||||
@@ -318,7 +337,7 @@ function infraApply(_config: ControlPlaneConfig, node: ControlPlaneNodeSpec, tar
|
||||
next: applyNext(node, target, imageStatus),
|
||||
};
|
||||
}
|
||||
const script = applyScript(yaml);
|
||||
const script = applyScript(yaml, node, target);
|
||||
const result = runTransK3s(node.kubeRoute, script, options.timeoutSeconds);
|
||||
const parsed = parseRemoteJson(result.stdout);
|
||||
return {
|
||||
@@ -430,7 +449,7 @@ function toolsImageBuild(node: ControlPlaneNodeSpec, target: ControlPlaneTargetS
|
||||
}
|
||||
|
||||
function argoCommandStatus(node: ControlPlaneNodeSpec, target: ControlPlaneTargetSpec, options: ArgoOptions): Record<string, unknown> {
|
||||
const result = runTransK3s(node.kubeRoute, statusScript(target, node.registry.endpoint, target.tekton.toolsImage.output), options.timeoutSeconds);
|
||||
const result = runTransK3s(node.kubeRoute, statusScript(node, target), options.timeoutSeconds);
|
||||
const parsed = parseRemoteJson(result.stdout);
|
||||
const status = typeof parsed === "object" && parsed !== null ? parsed as Record<string, unknown> : {};
|
||||
const argo = record(record(status.components).argo);
|
||||
@@ -733,15 +752,67 @@ function isNodeLocalImage(image: string): boolean {
|
||||
/**
 * Converts the raw YAML mapping of one node into a typed `ControlPlaneNodeSpec`.
 *
 * `registry` is mandatory. `egressProxy` and `k3s` are optional: when the key is
 * absent (`undefined`) the corresponding field is `null`.
 *
 * @param id  Node identifier; also used to build the `nodes.<id>` path in error messages.
 * @param raw Raw mapping read from the configuration for this node.
 * @returns The parsed node spec.
 */
function nodeSpec(id: string, raw: Record<string, unknown>): ControlPlaneNodeSpec {
  const nodePath = `nodes.${id}`;
  const registryPath = `${nodePath}.registry`;
  const egressProxyPath = `${nodePath}.egressProxy`;
  const k3sPath = `${nodePath}.k3s`;

  const registryRaw = asRecord(raw.registry, registryPath);

  let egressProxy: ControlPlaneEgressProxySpec | null = null;
  if (raw.egressProxy !== undefined) {
    egressProxy = egressProxySpec(asRecord(raw.egressProxy, egressProxyPath), egressProxyPath);
  }

  let k3s: ControlPlaneK3sNodeSpec | null = null;
  if (raw.k3s !== undefined) {
    k3s = k3sNodeSpec(asRecord(raw.k3s, k3sPath), k3sPath);
  }

  // Scalar fields are read last, in declaration order, so the first reported
  // validation error stays the same as before.
  const route = stringField(raw, "route", nodePath);
  const kubeRoute = stringField(raw, "kubeRoute", nodePath);
  const endpoint = stringField(registryRaw, "endpoint", registryPath);

  return { id, route, kubeRoute, k3s, registry: { endpoint }, egressProxy };
}
|
||||
|
||||
/**
 * Parses and validates the `k3s` section of a node.
 *
 * Validation performed here:
 * - `serviceName` and `nodeStatusName` are restricted to a conservative character set;
 * - `dropInPath` must be a `.conf` file below `/etc/systemd/system/` without `..`;
 * - `serverArgs` must start with `server` and every entry must be a single,
 *   non-empty, newline-free argv token;
 * - `serverArgs` must contain the pair `--kubelet-arg max-pods=<kubelet.maxPods>`,
 *   and no kubelet arg (paired or `--kubelet-arg=` form) may set a different
 *   `max-pods` value.
 *
 * @param raw  Raw `k3s` mapping.
 * @param path Configuration path used as prefix in error messages.
 * @returns The validated k3s node spec.
 * @throws Error when any of the rules above is violated.
 */
function k3sNodeSpec(raw: Record<string, unknown>, path: string): ControlPlaneK3sNodeSpec {
  const kubelet = asRecord(raw.kubelet, `${path}.kubelet`);
  const serviceName = stringField(raw, "serviceName", path);
  if (!/^[A-Za-z0-9_.@-]+$/u.test(serviceName)) throw new Error(`${path}.serviceName has an unsupported systemd unit name`);
  const dropInPath = stringField(raw, "dropInPath", path);
  if (!dropInPath.startsWith("/etc/systemd/system/") || !dropInPath.endsWith(".conf") || dropInPath.includes("..")) {
    throw new Error(`${path}.dropInPath must be an absolute /etc/systemd/system/*.conf path`);
  }
  const nodeStatusName = stringField(raw, "nodeStatusName", path);
  if (!/^[A-Za-z0-9_.-]+$/u.test(nodeStatusName)) throw new Error(`${path}.nodeStatusName has an unsupported Kubernetes node name`);
  const execStartPre = execStartPreField(raw.execStartPre, `${path}.execStartPre`);
  const serverArgs = stringArrayField(raw, "serverArgs", path);
  if (serverArgs.length === 0 || serverArgs[0] !== "server") throw new Error(`${path}.serverArgs must start with k3s server`);
  for (const [index, arg] of serverArgs.entries()) {
    if (arg.includes("\n") || arg.includes("\r") || arg.length === 0) throw new Error(`${path}.serverArgs[${index}] must be a single non-empty argv token`);
  }
  const maxPods = positiveConfigIntegerField(kubelet, "maxPods", `${path}.kubelet`);
  const expectedMaxPodsArg = `max-pods=${maxPods}`;
  const inlineKubeletArgPrefix = "--kubelet-arg=";
  let hasExpectedMaxPodsArg = false;
  for (const [index, arg] of serverArgs.entries()) {
    const isInline = arg.startsWith(inlineKubeletArgPrefix);
    const isPaired = !isInline && index > 0 && serverArgs[index - 1] === "--kubelet-arg";
    if (!isInline && !isPaired) continue;
    const kubeletArg = isInline ? arg.slice(inlineKubeletArgPrefix.length) : arg;
    if (!kubeletArg.startsWith("max-pods=")) continue;
    // A second max-pods value that differs from kubelet.maxPods would make the
    // declared capacity and the rendered command line disagree, so status could
    // never converge on the YAML value.
    if (kubeletArg !== expectedMaxPodsArg) {
      throw new Error(`${path}.serverArgs[${index}] sets kubelet ${kubeletArg}, which conflicts with ${path}.kubelet.maxPods=${maxPods}`);
    }
    // Only the two-token form satisfies the "must include" rule, as before.
    if (isPaired) hasExpectedMaxPodsArg = true;
  }
  if (!hasExpectedMaxPodsArg) throw new Error(`${path}.serverArgs must include --kubelet-arg ${expectedMaxPodsArg}`);
  return {
    serviceName,
    dropInPath,
    nodeStatusName,
    execStartPre,
    serverArgs,
    kubelet: { maxPods },
  };
}
|
||||
|
||||
/**
 * Validates the optional `execStartPre` list: an array of argv arrays.
 *
 * Each token must be a non-empty string without CR/LF. The first token of each
 * command must be an absolute path without `..`, optionally prefixed with `-`.
 *
 * @param raw  Raw value of the field; `undefined` yields an empty list.
 * @param path Configuration path used as prefix in error messages.
 * @returns The validated commands, unchanged in content and order.
 * @throws Error describing the first offending element.
 */
function execStartPreField(raw: unknown, path: string): readonly (readonly string[])[] {
  if (raw === undefined) return [];
  if (!Array.isArray(raw)) throw new Error(`${path} must be an array of argv arrays`);

  // Checks one argv token and returns it unchanged.
  const parseToken = (value: unknown, tokenPath: string): string => {
    if (typeof value !== "string") throw new Error(`${tokenPath} must be a string`);
    const isSingleToken = value.length > 0 && !value.includes("\n") && !value.includes("\r");
    if (!isSingleToken) throw new Error(`${tokenPath} must be a single non-empty argv token`);
    return value;
  };

  // Checks one command (argv array) and returns its tokens.
  const parseCommand = (item: unknown, commandPath: string): string[] => {
    if (!Array.isArray(item)) throw new Error(`${commandPath} must be an argv array`);
    const argv = item.map((value: unknown, tokenIndex: number) => parseToken(value, `${commandPath}[${tokenIndex}]`));
    if (argv.length === 0) throw new Error(`${commandPath} must not be empty`);
    const first = argv[0];
    // A leading "-" is allowed in front of the executable path.
    const binary = first.startsWith("-") ? first.slice(1) : first;
    const isAbsoluteAndPlain = binary.startsWith("/") && !binary.includes("..");
    if (!isAbsoluteAndPlain) throw new Error(`${commandPath}[0] must be an absolute executable path, optionally prefixed with -`);
    return argv;
  };

  return raw.map((item: unknown, index: number) => parseCommand(item, `${path}[${index}]`));
}
|
||||
|
||||
function egressProxySpec(raw: Record<string, unknown>, path: string): ControlPlaneEgressProxySpec {
|
||||
const mode = stringField(raw, "mode", path);
|
||||
if (mode !== "k8s-service-cluster-ip") throw new Error(`${path}.mode must be k8s-service-cluster-ip`);
|
||||
@@ -1298,6 +1369,7 @@ function planSummary(node: ControlPlaneNodeSpec, target: ControlPlaneTargetSpec)
|
||||
enabled: target.enabled,
|
||||
ciNamespace: target.ciNamespace,
|
||||
runtimeNamespace: target.runtimeNamespace,
|
||||
k3sNodeConfig: k3sNodeConfigPlan(node),
|
||||
registry: node.registry.endpoint,
|
||||
egressProxy: node.egressProxy,
|
||||
sourceBranch: target.source.branch,
|
||||
@@ -1328,6 +1400,7 @@ function expectedSummary(node: ControlPlaneNodeSpec, target: ControlPlaneTargetS
|
||||
runtimePath: target.gitops.path,
|
||||
runtimeNamespace: target.runtimeNamespace,
|
||||
namespace: target.ciNamespace,
|
||||
k3sNodeConfig: k3sNodeConfigPlan(node),
|
||||
gitMirror: {
|
||||
namespace: target.gitMirror.namespace,
|
||||
readUrl: target.gitMirror.readUrl,
|
||||
@@ -1366,10 +1439,43 @@ function expectedSummary(node: ControlPlaneNodeSpec, target: ControlPlaneTargetS
|
||||
};
|
||||
}
|
||||
|
||||
function statusScript(target: ControlPlaneTargetSpec, registryEndpoint: string, toolsImage: string): string {
|
||||
/**
 * Summarises the host k3s configuration that would be managed for a node.
 *
 * Returns `{ managed: false }` when the node declares no `k3s` section.
 * Otherwise the summary carries the unit name, drop-in path, node name, desired
 * pod capacity, the `sha256Short` digest of the rendered drop-in and the number
 * of pre-start commands and server arguments. The drop-in text itself is not
 * included.
 *
 * @param node Parsed node spec.
 * @returns A plain summary object for plan/dry-run output.
 */
function k3sNodeConfigPlan(node: ControlPlaneNodeSpec): Record<string, unknown> {
  const { k3s } = node;
  if (k3s === null) return { managed: false };

  const { serviceName, dropInPath, nodeStatusName, kubelet, execStartPre, serverArgs } = k3s;
  const renderedDropIn = k3sDropInContent(k3s);
  return {
    managed: true,
    serviceName,
    dropInPath,
    nodeStatusName,
    desiredMaxPods: kubelet.maxPods,
    dropInSha256: sha256Short(renderedDropIn),
    execStartPreCount: execStartPre.length,
    serverArgCount: serverArgs.length,
  };
}
|
||||
|
||||
/**
 * Renders the text of the managed systemd drop-in for k3s.
 *
 * Layout: a header comment, the `[Service]` section, one `ExecStartPre=` line
 * per configured pre-start command, an empty `ExecStart=` assignment, and the
 * `ExecStart=` line running `/usr/local/bin/k3s` with the configured server
 * arguments. The text ends with a single trailing newline.
 *
 * @param spec Validated k3s node spec.
 * @returns The complete drop-in file content.
 */
function k3sDropInContent(spec: ControlPlaneK3sNodeSpec): string {
  // Joins an argv into one command line, quoting each token as needed.
  const renderCommand = (argv: readonly string[]): string => argv.map(systemdExecArg).join(" ");

  const lines: string[] = [
    "# Managed by UniDesk. Source: config/hwlab-node-control-plane.yaml nodes.<node>.k3s",
    "[Service]",
  ];
  for (const command of spec.execStartPre) {
    lines.push(`ExecStartPre=${renderCommand(command)}`);
  }
  // The empty assignment precedes the real one so the drop-in replaces the
  // unit's ExecStart instead of adding a second command.
  lines.push("ExecStart=");
  lines.push(`ExecStart=${renderCommand(["/usr/local/bin/k3s", ...spec.serverArgs])}`);

  return `${lines.join("\n")}\n`;
}
|
||||
|
||||
/**
 * Formats one argv token for a systemd `Exec*=` command line.
 *
 * Tokens made only of characters with no special meaning to systemd are
 * returned as-is. Anything else is wrapped in double quotes with systemd's own
 * escaping rules applied:
 * - `\` and `"` are backslash-escaped (C-style escapes inside quotes);
 * - `%` is doubled, because a single `%` starts a unit specifier;
 * - `$` is doubled, because a single `$` starts environment variable expansion.
 *
 * Backticks are left untouched: they have no special meaning in unit files and
 * `` \` `` is not an escape sequence systemd knows.
 *
 * @param value A single argv token (callers reject empty tokens and newlines).
 * @returns The token, quoted and escaped if necessary.
 */
function systemdExecArg(value: string): string {
  // "%" is deliberately not in the safe set so it always goes through escaping.
  if (/^[A-Za-z0-9_@+=:,./-]+$/u.test(value)) return value;
  const escaped = value
    .replace(/\\/g, "\\\\")
    .replace(/"/g, "\\\"")
    .replace(/%/g, "%%")
    // Function form avoids "$" replacement-pattern handling in String.replace.
    .replace(/\$/g, () => "$$");
  return `"${escaped}"`;
}
|
||||
|
||||
function statusScript(nodeSpec: ControlPlaneNodeSpec, target: ControlPlaneTargetSpec): string {
|
||||
const requiredCrds = shellJsonArray(target.argo.install.requiredCrds);
|
||||
const argoDeployments = shellJsonArray(target.argo.install.expectedDeployments);
|
||||
const argoStatefulSets = shellJsonArray(target.argo.install.expectedStatefulSets);
|
||||
const k3s = nodeSpec.k3s;
|
||||
const k3sDropIn = k3s === null ? "" : k3sDropInContent(k3s);
|
||||
return `
|
||||
set +e
|
||||
node=${shQuote(target.node)}
|
||||
@@ -1388,11 +1494,17 @@ service_account=${shQuote(target.tekton.serviceAccountName)}
|
||||
argo_ns=${shQuote(target.argo.namespace)}
|
||||
argo_project=${shQuote(target.argo.projectName)}
|
||||
argo_app=${shQuote(target.argo.applicationName)}
|
||||
registry=${shQuote(registryEndpoint)}
|
||||
tools_image=${shQuote(toolsImage)}
|
||||
registry=${shQuote(nodeSpec.registry.endpoint)}
|
||||
tools_image=${shQuote(target.tekton.toolsImage.output)}
|
||||
required_crds_json=${shQuote(requiredCrds)}
|
||||
argo_deployments_json=${shQuote(argoDeployments)}
|
||||
argo_statefulsets_json=${shQuote(argoStatefulSets)}
|
||||
k3s_managed=${k3s === null ? "false" : "true"}
|
||||
k3s_service=${shQuote(k3s?.serviceName ?? "")}
|
||||
k3s_dropin=${shQuote(k3s?.dropInPath ?? "")}
|
||||
k3s_node=${shQuote(k3s?.nodeStatusName ?? "")}
|
||||
k3s_desired_max_pods=${shQuote(String(k3s?.kubelet.maxPods ?? ""))}
|
||||
k3s_expected_sha=${shQuote(k3s === null ? "" : sha256Short(k3sDropIn))}
|
||||
exists_ns() { kubectl get ns "$1" >/dev/null 2>&1 && printf true || printf false; }
|
||||
exists_res() { kubectl -n "$1" get "$2" "$3" >/dev/null 2>&1 && printf true || printf false; }
|
||||
deploy_ready() { desired=$(kubectl -n "$1" get deploy "$2" -o 'jsonpath={.spec.replicas}' 2>/dev/null || true); ready=$(kubectl -n "$1" get deploy "$2" -o 'jsonpath={.status.readyReplicas}' 2>/dev/null || true); [ -n "$desired" ] && [ "$desired" -gt 0 ] 2>/dev/null && [ "\${ready:-0}" = "$desired" ] && printf true || printf false; }
|
||||
@@ -1407,6 +1519,64 @@ tools_image_ready=false
|
||||
if [ "$tools_repo" != "$tools_repo_tag" ] && command -v curl >/dev/null 2>&1; then curl -fsS --max-time 5 "http://$registry/v2/$tools_repo/manifests/$tools_tag" >/tmp/hwlab-tools-image.out 2>/tmp/hwlab-tools-image.err && tools_image_ready=true; fi
|
||||
cache_host_path_ready=false
|
||||
if [ -n "$cache_host_path" ] && kubectl -n "$gitmirror_ns" exec deploy/"$read_deploy" -- sh -lc 'test -d /cache' >/dev/null 2>&1; then cache_host_path_ready=true; fi
|
||||
k3s_fragment=$(python3 - "$k3s_managed" "$k3s_service" "$k3s_dropin" "$k3s_node" "$k3s_desired_max_pods" "$k3s_expected_sha" <<'PY'
|
||||
import hashlib, json, re, subprocess, sys
|
||||
managed = sys.argv[1] == "true"
|
||||
service, dropin, node_name, desired_raw, expected_sha = sys.argv[2:7]
|
||||
def run(args):
|
||||
return subprocess.run(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
|
||||
def to_int(value):
|
||||
try:
|
||||
return int(value)
|
||||
except Exception:
|
||||
return None
|
||||
if not managed:
|
||||
print(json.dumps({"managed": False, "ready": True}))
|
||||
raise SystemExit(0)
|
||||
desired = to_int(desired_raw)
|
||||
node_json = run(["kubectl", "get", "node", node_name, "-o", "json"])
|
||||
capacity = None
|
||||
allocatable = None
|
||||
node_ready = False
|
||||
if node_json.returncode == 0:
|
||||
data = json.loads(node_json.stdout)
|
||||
capacity = to_int(data.get("status", {}).get("capacity", {}).get("pods"))
|
||||
allocatable = to_int(data.get("status", {}).get("allocatable", {}).get("pods"))
|
||||
for condition in data.get("status", {}).get("conditions", []):
|
||||
if condition.get("type") == "Ready":
|
||||
node_ready = condition.get("status") == "True"
|
||||
unit = run(["systemctl", "cat", service])
|
||||
unit_text = unit.stdout if unit.returncode == 0 else ""
|
||||
dropin_read = run(["cat", dropin])
|
||||
dropin_exists = dropin_read.returncode == 0
|
||||
dropin_text = dropin_read.stdout if dropin_exists else ""
|
||||
dropin_sha = "sha256:" + hashlib.sha256(dropin_text.encode()).hexdigest() if dropin_exists else None
|
||||
matches = re.findall(r"max-pods=([0-9]+)", unit_text + "\\n" + dropin_text)
|
||||
configured = to_int(matches[-1]) if matches else None
|
||||
dropin_matches = dropin_sha == expected_sha
|
||||
ready = dropin_matches and capacity == desired and allocatable == desired
|
||||
source = "managed-dropin" if dropin_matches else ("systemd-or-config" if configured is not None else "kubelet-default")
|
||||
print(json.dumps({
|
||||
"managed": True,
|
||||
"ready": ready,
|
||||
"serviceName": service,
|
||||
"dropInPath": dropin,
|
||||
"dropInExists": dropin_exists,
|
||||
"dropInSha256": dropin_sha,
|
||||
"expectedDropInSha256": expected_sha,
|
||||
"dropInMatches": dropin_matches,
|
||||
"configuredMaxPods": configured,
|
||||
"desiredMaxPods": desired,
|
||||
"liveNodeName": node_name,
|
||||
"liveCapacityPods": capacity,
|
||||
"liveAllocatablePods": allocatable,
|
||||
"nodeReady": node_ready,
|
||||
"restartRequired": not ready,
|
||||
"source": source,
|
||||
"unitReadable": unit.returncode == 0,
|
||||
}))
|
||||
PY
|
||||
)
|
||||
python3 - "$required_crds_json" "$argo_deployments_json" "$argo_statefulsets_json" <<'PY' >/tmp/hwlab-node-status-fragments.json
|
||||
import json, subprocess, sys
|
||||
required_crds=json.loads(sys.argv[1])
|
||||
@@ -1432,27 +1602,274 @@ print(json.dumps({"crds": crds, "deployments": deploy, "statefulSets": sts, "crd
|
||||
PY
|
||||
argo_fragment=$(cat /tmp/hwlab-node-status-fragments.json 2>/dev/null || printf '{}')
|
||||
cat <<JSON
|
||||
{"observedAt":"$(date -u +%Y-%m-%dT%H:%M:%SZ)","node":"$node","lane":"$lane","components":{"tekton":{"installed":$(kubectl get crd pipelines.tekton.dev pipelineruns.tekton.dev >/dev/null 2>&1 && printf true || printf false),"controllerReady":$(deploy_ready tekton-pipelines tekton-pipelines-controller),"webhookReady":$(deploy_ready tekton-pipelines tekton-pipelines-webhook)},"ciNamespace":{"name":"$ci_ns","exists":$(exists_ns "$ci_ns"),"serviceAccountExists":$(exists_res "$ci_ns" serviceaccount "$service_account"),"pipelineExists":$(exists_res "$ci_ns" pipeline "$pipeline")},"gitMirror":{"namespace":"$gitmirror_ns","namespaceExists":$(exists_ns "$gitmirror_ns"),"readDeploymentReady":$(deploy_ready "$gitmirror_ns" "$read_deploy"),"writeDeploymentReady":$(deploy_ready "$gitmirror_ns" "$write_deploy"),"readServiceExists":$(exists_res "$gitmirror_ns" service "$read_svc"),"writeServiceExists":$(exists_res "$gitmirror_ns" service "$write_svc"),"readEndpointsReady":$(endpoint_ready "$gitmirror_ns" "$read_svc"),"writeEndpointsReady":$(endpoint_ready "$gitmirror_ns" "$write_svc"),"cachePvcExists":$(exists_res "$gitmirror_ns" pvc "$cache_pvc"),"cacheHostPath":"$cache_host_path","cacheHostPathReady":$cache_host_path_ready,"summary":{"localSource":null,"githubSource":null,"localGitops":null,"githubGitops":null,"pendingFlush":null,"flushNeeded":null,"githubInSync":null}},"argo":{"namespace":"$argo_ns","namespaceExists":$(exists_ns "$argo_ns"),"installed":$(kubectl get crd applications.argoproj.io appprojects.argoproj.io >/dev/null 2>&1 && printf true || printf false),"projectExists":$(kubectl -n "$argo_ns" get appproject "$argo_project" >/dev/null 2>&1 && printf true || printf false),"applicationExists":$(kubectl -n "$argo_ns" get application "$argo_app" >/dev/null 2>&1 && printf true || printf 
false),"install":$argo_fragment},"registry":{"endpoint":"$registry","ready":$registry_ready,"toolsImage":"$tools_image","toolsImageReady":$tools_image_ready},"runtimeNamespace":{"name":"$runtime_ns","exists":$(exists_ns "$runtime_ns")}}}
|
||||
{"observedAt":"$(date -u +%Y-%m-%dT%H:%M:%SZ)","node":"$node","lane":"$lane","components":{"k3sNodeConfig":$k3s_fragment,"tekton":{"installed":$(kubectl get crd pipelines.tekton.dev pipelineruns.tekton.dev >/dev/null 2>&1 && printf true || printf false),"controllerReady":$(deploy_ready tekton-pipelines tekton-pipelines-controller),"webhookReady":$(deploy_ready tekton-pipelines tekton-pipelines-webhook)},"ciNamespace":{"name":"$ci_ns","exists":$(exists_ns "$ci_ns"),"serviceAccountExists":$(exists_res "$ci_ns" serviceaccount "$service_account"),"pipelineExists":$(exists_res "$ci_ns" pipeline "$pipeline")},"gitMirror":{"namespace":"$gitmirror_ns","namespaceExists":$(exists_ns "$gitmirror_ns"),"readDeploymentReady":$(deploy_ready "$gitmirror_ns" "$read_deploy"),"writeDeploymentReady":$(deploy_ready "$gitmirror_ns" "$write_deploy"),"readServiceExists":$(exists_res "$gitmirror_ns" service "$read_svc"),"writeServiceExists":$(exists_res "$gitmirror_ns" service "$write_svc"),"readEndpointsReady":$(endpoint_ready "$gitmirror_ns" "$read_svc"),"writeEndpointsReady":$(endpoint_ready "$gitmirror_ns" "$write_svc"),"cachePvcExists":$(exists_res "$gitmirror_ns" pvc "$cache_pvc"),"cacheHostPath":"$cache_host_path","cacheHostPathReady":$cache_host_path_ready,"summary":{"localSource":null,"githubSource":null,"localGitops":null,"githubGitops":null,"pendingFlush":null,"flushNeeded":null,"githubInSync":null}},"argo":{"namespace":"$argo_ns","namespaceExists":$(exists_ns "$argo_ns"),"installed":$(kubectl get crd applications.argoproj.io appprojects.argoproj.io >/dev/null 2>&1 && printf true || printf false),"projectExists":$(kubectl -n "$argo_ns" get appproject "$argo_project" >/dev/null 2>&1 && printf true || printf false),"applicationExists":$(kubectl -n "$argo_ns" get application "$argo_app" >/dev/null 2>&1 && printf true || printf 
false),"install":$argo_fragment},"registry":{"endpoint":"$registry","ready":$registry_ready,"toolsImage":"$tools_image","toolsImageReady":$tools_image_ready},"runtimeNamespace":{"name":"$runtime_ns","exists":$(exists_ns "$runtime_ns")}}}
|
||||
JSON
|
||||
`;
|
||||
}
|
||||
|
||||
/**
 * Builds the remote shell script that applies the rendered manifest and the
 * host k3s configuration, then prints one JSON report on stdout.
 *
 * Steps performed by the generated script:
 * 1. decode the base64-embedded manifest into a temp file and run
 *    `kubectl apply --server-side` on it, capturing stdout/stderr in /tmp;
 * 2. run the k3s fragment from `k3sApplyScriptFragment`, which is expected to
 *    set `$k3s_report_file` and `$k3s_rc`;
 * 3. print `{"k3sNodeConfig": ..., "kubernetesApply": ...}` with the last
 *    2000 characters of the kubectl output;
 * 4. remove both temp files and exit with kubectl's exit code when it is
 *    non-zero, otherwise with `$k3s_rc`.
 *
 * @param yaml   Rendered multi-document manifest to apply.
 * @param node   Node spec; its `k3s` section drives the host-config fragment.
 * @param target Target lane spec passed through to the k3s fragment.
 * @returns The shell script text.
 */
function applyScript(yaml: string, node: ControlPlaneNodeSpec, target: ControlPlaneTargetSpec): string {
  // Base64 keeps the manifest intact regardless of quoting in the remote shell.
  const encoded = Buffer.from(yaml, "utf8").toString("base64");
  return `
set +e
manifest=$(mktemp /tmp/hwlab-node-infra.XXXXXX.yaml)
printf %s ${shQuote(encoded)} | base64 -d >"$manifest"
kubectl apply --server-side --field-manager=unidesk-hwlab-node-control-plane -f "$manifest" >/tmp/hwlab-node-infra-apply.out 2>/tmp/hwlab-node-infra-apply.err
kubectl_rc=$?
${k3sApplyScriptFragment(node.k3s, target)}
python3 - "$kubectl_rc" "$k3s_report_file" <<'PY'
import json, pathlib, sys
k3s_report = {}
try:
    k3s_report = json.loads(pathlib.Path(sys.argv[2]).read_text(errors='replace'))
except Exception as exc:
    k3s_report = {"managed": None, "ok": False, "parseError": str(exc)}
out=pathlib.Path('/tmp/hwlab-node-infra-apply.out').read_text(errors='replace') if pathlib.Path('/tmp/hwlab-node-infra-apply.out').exists() else ''
err=pathlib.Path('/tmp/hwlab-node-infra-apply.err').read_text(errors='replace') if pathlib.Path('/tmp/hwlab-node-infra-apply.err').exists() else ''
print(json.dumps({'k3sNodeConfig': k3s_report, 'kubernetesApply': {'applyExitCode': int(sys.argv[1]), 'stdoutPreview': out[-2000:], 'stderrPreview': err[-2000:], 'runtimeRolloutTriggered': False, 'pk01Touched': False}}, ensure_ascii=False))
PY
rm -f "$manifest" "$k3s_report_file"
if [ "$kubectl_rc" != 0 ]; then exit "$kubectl_rc"; fi
exit "$k3s_rc"
`;
}
|
||||
|
||||
/**
 * Builds the shell fragment that reconciles the k3s systemd drop-in on a node.
 *
 * The fragment always leaves two shell variables behind for the caller:
 * `$k3s_report_file` (path of a JSON report) and `$k3s_rc` (exit status).
 *
 * Behaviour of the generated fragment:
 *  - `spec === null`: writes an "unmanaged" report and sets `k3s_rc=0`.
 *  - Drop-in hash on disk already equals the expected hash AND the node's
 *    reported pod capacity/allocatable equal the desired max pods: writes a
 *    "noop" report, no mutation.
 *  - Otherwise: renders a host script plus a privileged Kubernetes Job manifest
 *    that runs it, and submits the Job with `kubectl apply`. If the submission
 *    fails and `docker` is available, the same host script is run through a
 *    privileged `docker run` instead. The Job is only submitted, not awaited,
 *    so the report marks `completionPending` in that mode.
 *
 * The docker output files live at fixed /tmp paths and are only written on the
 * fallback path, so they are removed up front; otherwise a successful Job
 * submission would report previews left over from an earlier run. The lane is
 * handed to the report generator as an argument instead of being spliced into
 * the Python source, so unusual characters cannot break the embedded script.
 *
 * @param spec   k3s settings for the node, or null when k3s is not managed.
 * @param target Target spec; `ciNamespace`, `tekton.toolsImage.output` and
 *               `lane` are read.
 * @returns Shell text meant to be embedded in a larger script.
 */
function k3sApplyScriptFragment(spec: ControlPlaneK3sNodeSpec | null, target: ControlPlaneTargetSpec): string {
  if (spec === null) {
    return `
k3s_report_file=$(mktemp /tmp/hwlab-node-k3s.XXXXXX.json)
printf '{"managed":false,"ok":true,"mutation":false}\\n' >"$k3s_report_file"
k3s_rc=0
`;
  }
  const content = k3sDropInContent(spec);
  const encoded = Buffer.from(content, "utf8").toString("base64");
  return `
k3s_report_file=$(mktemp /tmp/hwlab-node-k3s.XXXXXX.json)
k3s_service=${shQuote(spec.serviceName)}
k3s_dropin=${shQuote(spec.dropInPath)}
k3s_node=${shQuote(spec.nodeStatusName)}
k3s_namespace=${shQuote(target.ciNamespace)}
k3s_image=${shQuote(target.tekton.toolsImage.output)}
k3s_lane=${shQuote(String(target.lane))}
k3s_desired_max_pods=${shQuote(String(spec.kubelet.maxPods))}
k3s_expected_sha=${shQuote(sha256Short(content))}
k3s_before_capacity=$(kubectl get node "$k3s_node" -o 'jsonpath={.status.capacity.pods}' 2>/dev/null || true)
k3s_before_allocatable=$(kubectl get node "$k3s_node" -o 'jsonpath={.status.allocatable.pods}' 2>/dev/null || true)
capacity_restart=false
if [ "$k3s_before_capacity" != "$k3s_desired_max_pods" ] || [ "$k3s_before_allocatable" != "$k3s_desired_max_pods" ]; then capacity_restart=true; fi
k3s_current_dropin_sha=
if [ -f "$k3s_dropin" ]; then k3s_current_dropin_sha=$(sha256sum "$k3s_dropin" | awk '{print "sha256:"$1}'); fi
if [ "$k3s_current_dropin_sha" = "$k3s_expected_sha" ] && [ "$capacity_restart" = false ]; then
  python3 - "$k3s_current_dropin_sha" "$k3s_expected_sha" "$k3s_service" "$k3s_dropin" "$k3s_node" "$k3s_desired_max_pods" "$k3s_before_capacity" "$k3s_before_allocatable" <<'PY' >"$k3s_report_file"
import json, sys
dropin_sha, expected_sha, service, dropin, node_name, desired, before_capacity, before_allocatable = sys.argv[1:9]
print(json.dumps({
    "managed": True,
    "ok": True,
    "mutation": False,
    "applyMode": "noop",
    "completionPending": False,
    "serviceName": service,
    "dropInPath": dropin,
    "dropInSha256": dropin_sha,
    "expectedDropInSha256": expected_sha,
    "dropInMatches": dropin_sha == expected_sha,
    "nodeName": node_name,
    "desiredMaxPods": int(desired),
    "beforeCapacityPods": int(before_capacity) if before_capacity.isdigit() else None,
    "beforeAllocatablePods": int(before_allocatable) if before_allocatable.isdigit() else None,
}, ensure_ascii=False))
PY
  k3s_rc=0
else
  k3s_job="hwlab-node-k3s-config-$(date +%s)"
  k3s_job_manifest=$(mktemp /tmp/hwlab-node-k3s-job.XXXXXX.json)
  k3s_host_script=$(mktemp /tmp/hwlab-node-k3s-host.XXXXXX.sh)
  k3s_job_apply_stdout=/tmp/hwlab-node-k3s-job-apply.out
  k3s_job_apply_stderr=/tmp/hwlab-node-k3s-job-apply.err
  k3s_docker_stdout=/tmp/hwlab-node-k3s-docker.out
  k3s_docker_stderr=/tmp/hwlab-node-k3s-docker.err
  k3s_host_report="/tmp/$k3s_job-report.json"
  rm -f "$k3s_host_report" "$k3s_docker_stdout" "$k3s_docker_stderr"
  python3 - "$k3s_job_manifest" "$k3s_host_script" "$k3s_job" "$k3s_namespace" "$k3s_image" "$k3s_dropin" ${shQuote(encoded)} "$k3s_service" "$k3s_desired_max_pods" "$k3s_expected_sha" "$capacity_restart" "$k3s_host_report" <<'PY'
import json, os, shlex, sys
manifest_path, host_script_path, job, namespace, image, dropin, encoded, service, desired, expected_sha, capacity_restart, report_path = sys.argv[1:13]
script = f"""#!/bin/sh
set -eu
expected=/tmp/unidesk-k3s-dropin.conf
printf %s {shlex.quote(encoded)} | base64 -d > "$expected"
host_dropin=/host{shlex.quote(dropin)}
host_report=/host{shlex.quote(report_path)}
mkdir -p "$(dirname "$host_dropin")"
before_sha=
if [ -f "$host_dropin" ]; then before_sha=$(sha256sum "$host_dropin" | awk '{{print "sha256:"$1}}'); fi
changed=false
if ! cmp -s "$expected" "$host_dropin" 2>/dev/null; then
  cp "$expected" "$host_dropin"
  chown 0:0 "$host_dropin" 2>/dev/null || true
  chmod 0644 "$host_dropin"
  changed=true
fi
nsenter_path=$(command -v nsenter || true)
host_systemctl() {{
  if command -v chroot >/dev/null 2>&1 && [ -x /host/usr/bin/systemctl ]; then
    chroot /host /usr/bin/systemctl "$@"
    return $?
  fi
  if [ -n "$nsenter_path" ]; then
    "$nsenter_path" -t 1 -m -u -i -n -p -- /usr/bin/systemctl "$@"
    return $?
  fi
  return 127
}}
daemon_reload_rc=0
restart_rc=0
restarted=false
if command -v chroot >/dev/null 2>&1 || [ -n "$nsenter_path" ]; then
  host_systemctl daemon-reload || daemon_reload_rc=$?
  if [ "$changed" = true ] || [ {shlex.quote(capacity_restart)} = true ]; then
    restarted=true
    host_systemctl restart {shlex.quote(service)} || restart_rc=$?
  fi
else
  daemon_reload_rc=127
  restart_rc=127
fi
after_sha=
if [ -f "$host_dropin" ]; then after_sha=$(sha256sum "$host_dropin" | awk '{{print "sha256:"$1}}'); fi
service_active=unknown
if command -v chroot >/dev/null 2>&1 || [ -n "$nsenter_path" ]; then service_active=$(host_systemctl is-active {shlex.quote(service)} 2>/dev/null || true); fi
python3 - "$changed" "$restarted" "$daemon_reload_rc" "$restart_rc" "$before_sha" "$after_sha" "$service_active" "$nsenter_path" <<'REPORT' >"$host_report"
import json, sys
changed, restarted = sys.argv[1] == "true", sys.argv[2] == "true"
daemon_reload_rc, restart_rc = int(sys.argv[3] or "0"), int(sys.argv[4] or "0")
print(json.dumps({{
    "jobChanged": changed,
    "jobRestarted": restarted,
    "daemonReloadExitCode": daemon_reload_rc,
    "restartExitCode": restart_rc,
    "beforeDropInSha256": sys.argv[5] or None,
    "dropInSha256": sys.argv[6] or None,
    "expectedDropInSha256": {json.dumps(expected_sha)},
    "dropInMatches": sys.argv[6] == {json.dumps(expected_sha)},
    "serviceActiveText": sys.argv[7] or None,
    "nsenterPresent": bool(sys.argv[8]),
}}))
REPORT
chmod 0644 "$host_report" 2>/dev/null || true
cat "$host_report"
"""
with open(host_script_path, "w", encoding="utf-8") as handle:
    handle.write(script)
os.chmod(host_script_path, 0o755)
manifest = {
    "apiVersion": "batch/v1",
    "kind": "Job",
    "metadata": {"name": job, "namespace": namespace, "labels": {"app.kubernetes.io/part-of": "hwlab-node-control-plane", "unidesk.ai/operation": "k3s-node-config"}},
    "spec": {
        "backoffLimit": 0,
        "ttlSecondsAfterFinished": 300,
        "template": {
            "metadata": {"labels": {"app.kubernetes.io/part-of": "hwlab-node-control-plane", "unidesk.ai/operation": "k3s-node-config"}},
            "spec": {
                "restartPolicy": "Never",
                "hostPID": True,
                "hostNetwork": True,
                "containers": [{
                    "name": "apply-k3s-node-config",
                    "image": image,
                    "imagePullPolicy": "IfNotPresent",
                    "securityContext": {"privileged": True},
                    "command": ["/bin/sh", "-lc", script],
                    "volumeMounts": [{"name": "host-root", "mountPath": "/host"}],
                }],
                "volumes": [{"name": "host-root", "hostPath": {"path": "/", "type": "Directory"}}],
            },
        },
    },
}
with open(manifest_path, "w", encoding="utf-8") as handle:
    json.dump(manifest, handle)
PY
  k3s_render_rc=$?
  if [ "$k3s_render_rc" != 0 ]; then
    python3 - "$k3s_render_rc" "$k3s_expected_sha" "$k3s_service" "$k3s_dropin" "$k3s_node" "$k3s_desired_max_pods" <<'PY' >"$k3s_report_file"
import json, sys
render_rc = int(sys.argv[1] or "1")
expected_sha, service, dropin, node_name, desired = sys.argv[2:7]
print(json.dumps({
    "managed": True,
    "ok": False,
    "mutation": False,
    "renderExitCode": render_rc,
    "serviceName": service,
    "dropInPath": dropin,
    "expectedDropInSha256": expected_sha,
    "nodeName": node_name,
    "desiredMaxPods": int(desired),
}, ensure_ascii=False))
PY
    k3s_rc=$k3s_render_rc
  else
    kubectl apply -f "$k3s_job_manifest" >"$k3s_job_apply_stdout" 2>"$k3s_job_apply_stderr"
    k3s_job_apply_rc=$?
    k3s_apply_mode=kubernetes-job
    k3s_docker_rc=127
    if [ "$k3s_job_apply_rc" != 0 ] && command -v docker >/dev/null 2>&1; then
      k3s_apply_mode=docker-host-fallback
      docker run --rm --privileged --pid=host --network=host -v /:/host --entrypoint /bin/sh "$k3s_image" "/host$k3s_host_script" >"$k3s_docker_stdout" 2>"$k3s_docker_stderr"
      k3s_docker_rc=$?
    fi
    k3s_submit_rc=$k3s_job_apply_rc
    if [ "$k3s_job_apply_rc" != 0 ] && [ "$k3s_docker_rc" = 0 ]; then k3s_submit_rc=0; fi
    python3 - "$k3s_submit_rc" "$k3s_job_apply_rc" "$k3s_docker_rc" "$k3s_apply_mode" "$k3s_before_capacity" "$k3s_before_allocatable" "$k3s_expected_sha" "$k3s_service" "$k3s_dropin" "$k3s_node" "$k3s_desired_max_pods" "$k3s_job" "$k3s_namespace" "$k3s_host_report" "$k3s_job_apply_stdout" "$k3s_job_apply_stderr" "$k3s_docker_stdout" "$k3s_docker_stderr" "$k3s_lane" <<'PY' >"$k3s_report_file"
import json, pathlib, sys
submit_rc, job_apply_rc, docker_rc = [int(value or "0") for value in sys.argv[1:4]]
apply_mode = sys.argv[4]
before_capacity, before_allocatable = sys.argv[5:7]
expected_sha, service, dropin, node_name, desired, job_name, namespace, host_report = sys.argv[7:15]
lane = sys.argv[19]
def read(path):
    return pathlib.Path(path).read_text(errors='replace') if pathlib.Path(path).exists() else ''
try:
    host_report_data = json.loads(read(host_report) or "{}")
except Exception:
    host_report_data = {}
apply_ok = submit_rc == 0
print(json.dumps({
    "managed": True,
    "ok": apply_ok,
    "mutation": apply_ok,
    "completionPending": apply_ok and apply_mode == "kubernetes-job",
    "applyMode": apply_mode,
    "jobName": job_name,
    "namespace": namespace,
    "jobApplyExitCode": job_apply_rc,
    "dockerFallbackExitCode": docker_rc,
    "serviceName": service,
    "dropInPath": dropin,
    "dropInSha256": host_report_data.get("dropInSha256"),
    "expectedDropInSha256": expected_sha,
    "dropInMatches": host_report_data.get("dropInSha256") == expected_sha if host_report_data else None,
    "daemonReloadExitCode": host_report_data.get("daemonReloadExitCode"),
    "restartExitCode": host_report_data.get("restartExitCode"),
    "serviceActive": host_report_data.get("serviceActiveText") == "active" if host_report_data else None,
    "nodeName": node_name,
    "desiredMaxPods": int(desired),
    "beforeCapacityPods": int(before_capacity) if before_capacity.isdigit() else None,
    "beforeAllocatablePods": int(before_allocatable) if before_allocatable.isdigit() else None,
    "hostReportPath": host_report,
    "statusCommand": f"bun scripts/cli.ts hwlab nodes control-plane infra status --node {node_name.upper()} --lane {lane}",
    "jobCompletionCommand": f"kubectl -n {namespace} wait --for=condition=complete job/{job_name} --timeout=120s",
    "jobLogsCommand": f"kubectl -n {namespace} logs job/{job_name} --tail=120",
    "jobApplyStdoutPreview": read(sys.argv[15])[-1000:],
    "jobApplyStderrPreview": read(sys.argv[16])[-1000:],
    "dockerStdoutPreview": read(sys.argv[17])[-1000:],
    "dockerStderrPreview": read(sys.argv[18])[-1000:],
}, ensure_ascii=False))
PY
    k3s_rc=$k3s_submit_rc
  fi
  rm -f "$k3s_job_manifest" "$k3s_host_script"
fi
`;
}
|
||||
|
||||
@@ -1499,6 +1916,7 @@ function statusNext(
|
||||
gitMirror: Record<string, unknown>,
|
||||
argo: Record<string, unknown>,
|
||||
ciNamespace: Record<string, unknown>,
|
||||
k3sNodeConfig: Record<string, unknown>,
|
||||
): Record<string, unknown> {
|
||||
const bootstrapMissing = !boolField(ciNamespace, "exists")
|
||||
|| !boolField(gitMirror, "namespaceExists")
|
||||
@@ -1506,6 +1924,7 @@ function statusNext(
|
||||
|| !boolField(gitMirror, "writeServiceExists")
|
||||
|| (!boolField(gitMirror, "cachePvcExists") && !boolField(gitMirror, "cacheHostPathReady"));
|
||||
const blockers: string[] = [];
|
||||
if (node.k3s !== null && !boolField(k3sNodeConfig, "ready")) blockers.push("k3s-node-config-not-applied");
|
||||
if (!boolField(registry, "ready")) blockers.push("node-local-registry-not-ready");
|
||||
if (!boolField(registry, "toolsImageReady")) blockers.push("tools-image-missing");
|
||||
if (bootstrapMissing) blockers.push("control-plane-bootstrap-missing");
|
||||
@@ -1530,6 +1949,9 @@ function statusNext(
|
||||
if (!boolField(argo, "installed")) {
|
||||
next.installArgo = "准备受控 D601 Argo CD 安装入口后再进入 runtime rollout。";
|
||||
}
|
||||
if (node.k3s !== null && !boolField(k3sNodeConfig, "ready")) {
|
||||
next.applyK3sNodeConfig = `bun scripts/cli.ts hwlab nodes control-plane infra apply --node ${node.id} --lane ${target.lane} --confirm`;
|
||||
}
|
||||
if (bootstrapMissing) next.applyBootstrap = `bun scripts/cli.ts hwlab nodes control-plane infra apply --node ${node.id} --lane ${target.lane} --confirm`;
|
||||
else next.reapplyBootstrap = `bun scripts/cli.ts hwlab nodes control-plane infra apply --node ${node.id} --lane ${target.lane} --confirm`;
|
||||
return next;
|
||||
@@ -1900,6 +2322,10 @@ function boolField(obj: Record<string, unknown>, key: string): boolean {
|
||||
return obj[key] === true;
|
||||
}
|
||||
|
||||
/**
 * Narrows an unknown value to a finite number.
 *
 * @param value Any value, e.g. a field read from parsed JSON.
 * @returns `value` itself when it is a number and finite; `null` for NaN,
 *          +/-Infinity, and every non-number type (numeric strings included).
 */
function numberValue(value: unknown): number | null {
  return typeof value === "number" && Number.isFinite(value) ? value : null;
}
|
||||
|
||||
function requiredOption(args: string[], name: string): string {
|
||||
const index = args.indexOf(name);
|
||||
if (index === -1) throw new Error(`${name} is required`);
|
||||
|
||||
Reference in New Issue
Block a user