From dc19a07478365741909cd6a160eeaa08c9c8ce40 Mon Sep 17 00:00:00 2001 From: Codex Date: Thu, 18 Jun 2026 14:22:33 +0000 Subject: [PATCH] fix: add node scoped hwlab ci cleanup --- .agents/skills/unidesk-cicd/SKILL.md | 4 + docs/reference/cli.md | 2 +- scripts/src/hwlab-node-help.ts | 2 + scripts/src/hwlab-node-impl.ts | 455 ++++++++++++++++++++++++++- 4 files changed, 459 insertions(+), 4 deletions(-) diff --git a/.agents/skills/unidesk-cicd/SKILL.md b/.agents/skills/unidesk-cicd/SKILL.md index 7fcc9abb..0cea5052 100644 --- a/.agents/skills/unidesk-cicd/SKILL.md +++ b/.agents/skills/unidesk-cicd/SKILL.md @@ -214,6 +214,10 @@ bun scripts/cli.ts hwlab g14 tools-image build \ bun scripts/cli.ts hwlab g14 control-plane cleanup-runs \ --lane v02|g14|all [--min-age-minutes N] [--limit N] [--dry-run|--confirm] +# D601/G14 node-scoped runtime lane retention +bun scripts/cli.ts hwlab nodes control-plane cleanup-runs \ + --node D601 --lane v03 [--min-age-minutes N] [--limit N] [--dry-run|--confirm --wait] + # 补充清理 Released PV bun scripts/cli.ts hwlab g14 control-plane cleanup-released-pvs \ --lane all [--limit N] [--dry-run|--confirm] diff --git a/docs/reference/cli.md b/docs/reference/cli.md index 49d5d6fe..2e24466a 100644 --- a/docs/reference/cli.md +++ b/docs/reference/cli.md @@ -77,7 +77,7 @@ G14/D601 v03 的 bootstrap admin password 是 HWLAB runtime Secret 生命周期 `--dry-run` 只报告是否会 pre-sync,不创建 Job;confirmed trigger 默认创建 `.state/jobs/` 异步 job 并立刻返回 `job.id`、`statusCommand`、stdout/stderr 路径,避免 git mirror pre-sync 或 PipelineRun 创建期间长时间阻塞;`--wait` 路径也必须向 stderr 输出 `hwlab.v02.trigger.progress` JSON 事件,覆盖 `control-plane-refresh`、`git-mirror-pre-sync` 和 `create-pipelinerun`,避免异步 job 长时间只有启动命令而无法判断卡点;默认 JSON 必须对 `manifest_b64`、长脚本和远端 stdout/stderr 做有界摘要,保留长度与 hash,最终 trigger 结果只返回阶段摘要和关键 tail,完整内容通过 job stdout/stderr 文件渐进披露;只有现场同步调试才显式加 `--wait`;旧 `rerun-current` 只作为输入别名保留。PipelineRun `Completed`、Argo `Synced/Healthy` 和 `webAssets.ok=true` 只证明 G14 runtime 已更新;交付收口还必须用 `hwlab g14 
git-mirror status` 查看 `cache.summary.pendingFlush`,若为 true,继续执行受控 `hwlab g14 git-mirror flush --confirm` 并用 job status 轮询到 `pendingFlush=false`。 - `hwlab g14 control-plane runtime-migration --lane v02 [--dry-run|--allow-live-db-read --dry-run|--confirm]` 只通过 `hwlab-v02` namespace 当前 `deployment/hwlab-cloud-api -c hwlab-cloud-api` 内 repo-owned migration CLI 执行;不读取或打印 Secret 值、不触碰 PROD、不绕到手工 `psql`。 - `hwlab g14 secret status|ensure --lane v02 --name hwlab-v02-openfga|hwlab-v02-master-server-admin-api-key [--dry-run|--confirm]` 和 `hwlab nodes secret status|ensure --node G14 --lane v03 --name hwlab-v03-master-server-admin-api-key [--dry-run|--confirm]` 是 HWLAB runtime lane SecretRef bootstrap 的保留入口。v03+ Cloud API/OpenFGA datastore SecretRef 已迁移到 G14 platform PostgreSQL,`hwlab nodes secret status --node G14 --lane v03 --name hwlab-cloud-api-v03-db|hwlab-v03-openfga` 只做 redacted SecretRef 与 `g14-platform-postgres` bridge 观测;旧 `ensure` 路径已删除,不再从 `hwlab-v03-postgres` Secret 或 StatefulSet 派生。`hwpod-v03-db`、`hwpod_v03`、`hwpod_v03_app` 是废弃残留,不能作为 status 完成态保留,发现后用 `hwlab nodes secret cleanup-obsolete --node G14 --lane v03 --name hwpod-v03-db [--dry-run|--confirm]` 清理。平台库凭据、桥接 Service 和 SecretRef 轮换边界见 `docs/reference/g14-platform-db.md`。master server admin API key preset 确保本机 `/root/.config/hwlab-v0x/master-server-admin-api-key.env` 以 0600 保存 `HWLAB_API_KEY`,并同步到对应 lane 的 `*-master-server-admin-api-key/api-key`。`status` 只返回 key 是否存在、解码后字节数、key prefix、bridge 存在性和 runtime health 相关结果,永远不读取、不打印、不回传 secret 明文。`hwlab nodes secret cleanup-owned-postgres --node G14 --lane v03 [--dry-run|--confirm]` 是 v0.3+ 迁移到 G14 平台 Postgres 后的受控残留清理入口,精确删除旧 repo-owned `hwlab-v03-postgres` StatefulSet/Service/ConfigMap/Secret 和 `data-hwlab-v03-postgres-0` PVC;它要求 `g14-platform-postgres` Service 已存在,默认 dry-run,不触碰平台数据库、OpenFGA/Cloud API 当前 SecretRef 或 GitOps desired state。`hwlab g14 secret delete --lane v02 --name [--dry-run|--confirm]` 只用于删除确认已不被 workload 引用的 v0.2 废弃 Secret,默认 dry-run,拒绝删除 
OpenFGA/Postgres/master admin API key 等必需 Secret;共享 device-pod API key 已退出当前授权路径,不再提供 ensure/bootstrap 入口。 -- `hwlab g14 control-plane cleanup-runs --lane v02|v03|g14|all [--min-age-minutes N] [--limit N] [--dry-run|--confirm]` 是完成态 PipelineRun 工作区 retention 入口;真实清理只删除已完成 PipelineRun,让 Tekton/local-path 回收临时 PVC,不触碰 registry storage、业务 PVC、Secret、runtime workload 或 GitOps desired state。带 `--pipeline-run ` 或 `--source-commit ` 的定点清理必须先直接查询目标 PipelineRun,而不是只从全量列表过滤;不存在的目标返回 `target-pipelinerun-not-found`,未完成目标返回 `target-pipelinerun-not-terminal`,空查询和读取失败分别返回 `target-pipelinerun-query-empty` / `target-pipelinerun-query-failed`,年龄保护仍返回 `below-min-age`。`hwlab nodes control-plane cleanup-runs --node G14 --lane v03 --pipeline-run ` 是 v0.3 failed run 受控重试前的清理入口。 +- `hwlab g14 control-plane cleanup-runs --lane v02|v03|g14|all [--min-age-minutes N] [--limit N] [--dry-run|--confirm]` 和 `hwlab nodes control-plane cleanup-runs --node --lane [--min-age-minutes N] [--limit N] [--dry-run|--confirm]` 是完成态 PipelineRun 工作区 retention 入口;真实清理只删除已完成 PipelineRun 及其 Tekton TaskRun/Pod 链路,让 Tekton/local-path 回收临时 PVC,不触碰 registry storage、业务 PVC、Secret、runtime workload 或 GitOps desired state。带 `--pipeline-run ` 或 `--source-commit ` 的定点清理必须先直接查询目标 PipelineRun,而不是只从全量列表过滤;不存在的目标返回 `target-pipelinerun-not-found`,未完成目标返回 `target-pipelinerun-not-terminal`,年龄保护仍返回 `below-min-age`。D601 等非默认 node 的 CI Pod capacity 被终态 TaskRun/Pod 占满时,先用 node-scoped `cleanup-runs --dry-run` 查看保护对象、候选 PipelineRun、owned TaskRun/Pod/PVC 和后续 `--confirm --wait` 命令;禁止用原生 `kubectl delete` 长期替代该入口。 - `hwlab g14 control-plane cleanup-released-pvs --lane all [--limit N] [--dry-run|--confirm]` 是 local-path 未自动回收后的补充 retention 入口;只列并删除 `Released`、`local-path`、`Delete`、`claimNamespace=hwlab-ci` 且 claim 名称形如 Tekton 临时 `pvc-*` 的 PV。 - `hwlab g14 git-mirror status|apply|sync|flush [--dry-run|--confirm]` 是 `devops-infra` git mirror/relay 的受控维护入口:`apply` 渲染并 server-side apply `devops-infra/git-mirror.yaml`,同时删除遗留 
`git-mirror-hwlab-sync` CronJob;`sync` 创建一次性 manual Job,把 GitHub allowlist refs 拉入本地 mirror;`flush` 创建一次性 manual Job,把本地 `v0.2-gitops` 快进推回 GitHub。 `status` 返回 read/write URL、last sync/write/flush、本地 ref、GitHub staging ref 和 pending flush 状态,并在 `cache.summary` 给出 `localV02`、`localGitops`、`githubGitops`、`pendingFlush`、`flushNeeded`、`githubInSync` 和下一条受控 `flushCommand`。confirmed `sync` 和 `flush` 默认创建 `.state/jobs/` 异步 job 并立刻返回可查询状态,只有现场同步调试才显式加 `--wait`;mirror 不设置 CronJob。 diff --git a/scripts/src/hwlab-node-help.ts b/scripts/src/hwlab-node-help.ts index eadd88f0..28c7470e 100644 --- a/scripts/src/hwlab-node-help.ts +++ b/scripts/src/hwlab-node-help.ts @@ -19,6 +19,8 @@ export function hwlabNodeHelp(): Record { "bun scripts/cli.ts hwlab nodes control-plane plan --node D601 --lane v03", "bun scripts/cli.ts hwlab nodes control-plane status --node D601 --lane v03", "bun scripts/cli.ts hwlab nodes control-plane status --node D601 --lane v03 --full", + "bun scripts/cli.ts hwlab nodes control-plane cleanup-runs --node D601 --lane v03 --min-age-minutes 60 --limit 20 --dry-run", + "bun scripts/cli.ts hwlab nodes control-plane cleanup-runs --node D601 --lane v03 --min-age-minutes 60 --limit 20 --confirm --wait", "bun scripts/cli.ts hwlab nodes control-plane status --node G14 --lane v03", "bun scripts/cli.ts hwlab nodes control-plane apply --node G14 --lane v03 --dry-run", "bun scripts/cli.ts hwlab nodes control-plane refresh --node G14 --lane v03 --confirm", diff --git a/scripts/src/hwlab-node-impl.ts b/scripts/src/hwlab-node-impl.ts index f0568f30..d6bdd898 100644 --- a/scripts/src/hwlab-node-impl.ts +++ b/scripts/src/hwlab-node-impl.ts @@ -78,6 +78,25 @@ interface NodeRuntimeRenderResult { readonly location: NodeRuntimeRenderLocation; } +interface NodeRuntimeCleanupPipelineRunRow { + name: string; + createdAt: string | null; + ageMinutes: number | null; + status: string | null; + reason: string | null; + selected?: boolean; + selectedReason?: string; +} + +interface 
NodeRuntimeCleanupOptions { + minAgeMinutes: number; + limit: number; + sourceCommit?: string; + pipelineRun?: string; + targetPipelineRun?: string; + dryRun: boolean; +} + interface NodeSecretOptions { action: SecretAction; node: string; @@ -195,6 +214,7 @@ const CODE_AGENT_PROVIDER_OPENAI_KEY = "openai-api-key"; const CODE_AGENT_PROVIDER_OPENCODE_KEY = "opencode-api-key"; const CODE_AGENT_PROVIDER_SOURCE_NAMESPACE = "hwlab-v02"; const CODE_AGENT_PROVIDER_SOURCE_SECRET = "hwlab-v02-code-agent-provider"; +const HWLAB_CI_NAMESPACE = "hwlab-ci"; export async function runHwlabNodeCommand(_config: Config, args: string[]): Promise> { if (args.length === 0) return hwlabNodeHelp(); @@ -280,7 +300,7 @@ async function runNodeDelegatedDomain(config: Config, domain: DelegatedNodeDomai } if (domain === "control-plane" && scoped.node !== defaultSpec.nodeId) { if (scoped.action === "status") return nodeRuntimeControlPlaneStatus(scoped); - if (scoped.action === "apply" || scoped.action === "trigger-current" || scoped.action === "refresh" || scoped.action === "sync" || scoped.action === "runtime-migration") { + if (scoped.action === "apply" || scoped.action === "trigger-current" || scoped.action === "refresh" || scoped.action === "sync" || scoped.action === "runtime-migration" || scoped.action === "cleanup-runs") { if (scoped.confirm && !scoped.dryRun && !scoped.wait) return startNodeDelegatedJob(scoped); return nodeRuntimeControlPlaneRun(scoped); } @@ -1130,6 +1150,15 @@ function compactRuntimeCommand(result: CommandResult): Record { }; } +function compactRuntimeCommandStats(result: CommandResult): Record { + return { + exitCode: result.exitCode, + stdoutBytes: Buffer.byteLength(result.stdout), + stderrBytes: Buffer.byteLength(result.stderr), + timedOut: result.timedOut, + }; +} + function parseLastJsonLineObject(text: string): Record { for (const line of text.split(/\r?\n/u).reverse()) { const trimmed = line.trim(); @@ -1214,7 +1243,7 @@ function 
nodeRuntimeUnsupportedAction(scoped: ReturnType): NodeRuntimeCleanupOptions { + const sourceCommitRaw = optionValue(scoped.originalArgs, "--source-commit"); + const pipelineRunRaw = optionValue(scoped.originalArgs, "--pipeline-run"); + if (sourceCommitRaw !== undefined && pipelineRunRaw !== undefined) { + throw new Error("control-plane cleanup-runs accepts only one of --source-commit or --pipeline-run"); + } + const sourceCommit = sourceCommitRaw?.toLowerCase(); + if (sourceCommit !== undefined && !/^[0-9a-f]{40}$/u.test(sourceCommit)) { + throw new Error("--source-commit must be a full 40-character git sha for cleanup-runs"); + } + const pipelineRun = pipelineRunRaw === undefined ? undefined : validateNodeRuntimePipelineRunName(scoped.spec, pipelineRunRaw); + return { + minAgeMinutes: positiveIntegerOption(scoped.originalArgs, "--min-age-minutes", 60, 10080), + limit: positiveIntegerOption(scoped.originalArgs, "--limit", 20, 200), + sourceCommit, + pipelineRun, + targetPipelineRun: pipelineRun ?? (sourceCommit === undefined ? 
/**
 * Validates a user-supplied `--pipeline-run` value against the lane's PipelineRun naming scheme
 * and returns it lower-cased.
 *
 * Accepted shape: `<pipelineRunPrefix>-<7..40 hex chars>` optionally followed by `-<suffix>`,
 * where the suffix is 1..25 characters of `[a-z0-9-]` that starts and ends with an alphanumeric.
 * Matching is case-insensitive; the returned name is always lower-case.
 *
 * @param spec Lane spec; only `pipelineRunPrefix` and `lane` are read.
 * @param value Raw option value from the command line.
 * @returns The validated name, lower-cased.
 * @throws Error when `value` does not match the lane's naming scheme.
 */
function validateNodeRuntimePipelineRunName(spec: HwlabRuntimeLaneSpec, value: string): string {
  // Escape regex metacharacters so a prefix such as "hw.lab" only matches itself literally.
  const escapedPrefix = spec.pipelineRunPrefix.replace(/[.*+?^${}()|[\]\\]/gu, "\\$&");
  // The suffix must end with an alphanumeric: a name ending in "-" is not a valid Kubernetes
  // object name, so it could never refer to an existing PipelineRun.
  const pattern = new RegExp(
    `^${escapedPrefix}-[0-9a-f]{7,40}(?:-[a-z0-9](?:[a-z0-9-]{0,23}[a-z0-9])?)?$`,
    "iu",
  );
  if (!pattern.test(value)) {
    throw new Error(`--pipeline-run must be a ${spec.pipelineRunPrefix}-<source-commit>[-rerun] PipelineRun name for --lane ${spec.lane}`);
  }
  return value.toLowerCase();
}
"" : `--source-commit ${options.sourceCommit}`, + "--confirm", + "--wait", + ].filter(Boolean).join(" "), + }, + }; + } + const deletion = deleteNodeRuntimeCleanupRuns(scoped.spec, selectedPipelineRuns, scoped.timeoutSeconds); + const afterCounts = nodeRuntimeCiObjectCounts(scoped.spec); + return { + ok: isCommandSuccess(deletion), + command, + mode: "confirmed-cleanup", + node: scoped.node, + lane: scoped.lane, + namespace: HWLAB_CI_NAMESPACE, + minAgeMinutes: options.minAgeMinutes, + limit: options.limit, + sourceCommit: options.sourceCommit, + pipelineRun: options.pipelineRun, + deletedPipelineRuns: selectedPipelineRuns, + deletedPipelineRunCount: selectedPipelineRuns.length, + ownedResourcesBefore: ownedResources, + ciObjectCountsBefore: beforeCounts, + ciObjectCountsAfter: afterCounts, + deletion: compactRuntimeCommand(deletion), + mutation: isCommandSuccess(deletion), + degradedReason: isCommandSuccess(deletion) ? undefined : "node-runtime-ci-cleanup-delete-failed", + next: { + status: `bun scripts/cli.ts hwlab nodes control-plane status --node ${scoped.node} --lane ${scoped.lane}`, + rerunStatus: options.targetPipelineRun === undefined + ? 
/**
 * Lists the PipelineRuns in the CI namespace and returns the rows the cleanup should report.
 *
 * Rows whose `selected` is not `false` are the ones the caller treats as deletion candidates;
 * rows with `selected: false` are informational and carry a `selectedReason`.
 *
 * @param spec Lane spec used to reach the node and to derive the PipelineRun name prefix.
 * @param options Parsed cleanup options (`minAgeMinutes`, `limit`, optional `targetPipelineRun`).
 * @returns Candidate rows, see {@link selectNodeRuntimeCleanupPipelineRuns}.
 * @throws Error when the kubectl listing fails (stderr is truncated to 1000 characters).
 */
function listNodeRuntimeCleanupPipelineRuns(spec: HwlabRuntimeLaneSpec, options: NodeRuntimeCleanupOptions): NodeRuntimeCleanupPipelineRunRow[] {
  const result = runNodeK3sArgs(spec, [
    "kubectl",
    "-n",
    HWLAB_CI_NAMESPACE,
    "get",
    "pipelinerun",
    "-o",
    'jsonpath={range .items[*]}{.metadata.name}{"\\t"}{.metadata.creationTimestamp}{"\\t"}{.status.conditions[0].status}{"\\t"}{.status.conditions[0].reason}{"\\n"}{end}',
  ], 60);
  if (!isCommandSuccess(result)) throw new Error(`failed to list ${HWLAB_CI_NAMESPACE} PipelineRuns: ${result.stderr.trim().slice(0, 1000)}`);
  const rows = nodeRuntimeCleanupPipelineRunRowsFromText(result.stdout);
  return selectNodeRuntimeCleanupPipelineRuns(rows, spec.pipelineRunPrefix, options);
}

/**
 * Pure selection step of the cleanup: decides which parsed rows are reported and deletable.
 *
 * Targeted mode (`options.targetPipelineRun` set) returns exactly one row:
 * - not listed          -> placeholder row, `selected: false`, `target-pipelinerun-not-found`
 * - status not True/False -> `selected: false`, `target-pipelinerun-not-terminal`
 * - no parsable age      -> `selected: false`, `missing-creation-timestamp`
 * - younger than minimum -> `selected: false`, `below-min-age`
 * - otherwise the row itself (deletable).
 *
 * Batch mode keeps terminal runs whose name starts with `<pipelineRunPrefix>-`, oldest first,
 * drops those younger than `minAgeMinutes`, marks the newest terminal run as
 * `protected-latest-pipelinerun`, and truncates to `limit` rows.
 */
function selectNodeRuntimeCleanupPipelineRuns(
  rows: NodeRuntimeCleanupPipelineRunRow[],
  pipelineRunPrefix: string,
  options: Pick<NodeRuntimeCleanupOptions, "minAgeMinutes" | "limit" | "targetPipelineRun">,
): NodeRuntimeCleanupPipelineRunRow[] {
  const isTerminal = (row: NodeRuntimeCleanupPipelineRunRow): boolean => row.status === "True" || row.status === "False";
  const byCreatedAt = (left: NodeRuntimeCleanupPipelineRunRow, right: NodeRuntimeCleanupPipelineRunRow): number =>
    String(left.createdAt ?? "").localeCompare(String(right.createdAt ?? ""));

  const targetName = options.targetPipelineRun;
  if (targetName !== undefined) {
    const target = rows.find((row) => row.name === targetName);
    if (target === undefined) {
      return [{
        name: targetName,
        createdAt: null,
        ageMinutes: null,
        status: null,
        // `reason` is kept for existing consumers; `selectedReason` makes this row consistent
        // with every other skipped row, which reports its skip code there.
        reason: "target-pipelinerun-not-found",
        selected: false,
        selectedReason: "target-pipelinerun-not-found",
      }];
    }
    if (!isTerminal(target)) {
      return [{ ...target, selected: false, selectedReason: "target-pipelinerun-not-terminal" }];
    }
    if (target.ageMinutes === null) {
      return [{ ...target, selected: false, selectedReason: "missing-creation-timestamp" }];
    }
    if (target.ageMinutes < options.minAgeMinutes) {
      return [{ ...target, selected: false, selectedReason: "below-min-age" }];
    }
    return [target];
  }

  const namePrefix = `${pipelineRunPrefix}-`;
  const terminalRuns = rows
    .filter((row) => row.name.startsWith(namePrefix) && isTerminal(row))
    .sort(byCreatedAt);
  // The newest terminal run is never deleted, so the lane always keeps its latest result.
  const protectedLatest = terminalRuns
    .slice()
    .sort((left, right) => byCreatedAt(right, left))[0]?.name ?? null;
  return terminalRuns
    .filter((row) => typeof row.ageMinutes === "number" && row.ageMinutes >= options.minAgeMinutes)
    .map((row) => row.name === protectedLatest ? { ...row, selected: false, selectedReason: "protected-latest-pipelinerun" } : row)
    .slice(0, options.limit);
}

/**
 * Parses the tab-separated `name, creationTimestamp, status, reason` listing into rows.
 *
 * Blank lines and lines without a name are dropped. Empty columns become `null`; a
 * creation timestamp that `Date.parse` cannot read yields `ageMinutes: null`.
 *
 * @param text Raw stdout of the PipelineRun listing.
 * @param nowMs Reference time in epoch milliseconds used to compute `ageMinutes`
 *   (defaults to the current time; injectable so the parser is deterministic).
 */
function nodeRuntimeCleanupPipelineRunRowsFromText(text: string, nowMs: number = Date.now()): NodeRuntimeCleanupPipelineRunRow[] {
  const rows: NodeRuntimeCleanupPipelineRunRow[] = [];
  for (const line of text.split(/\r?\n/u)) {
    const [name = "", createdAtRaw = "", statusRaw = "", reasonRaw = ""] = line.trim().split("\t");
    if (name.length === 0) continue;
    const createdAt = createdAtRaw.length > 0 ? createdAtRaw : null;
    const createdMs = createdAt === null ? NaN : Date.parse(createdAt);
    rows.push({
      name,
      createdAt,
      ageMinutes: Number.isFinite(createdMs) ? Math.floor((nowMs - createdMs) / 60000) : null,
      status: statusRaw || null,
      reason: reasonRaw || null,
    });
  }
  return rows;
}
"tekton.dev/pipelineRun"}}{{"\\t"}}{{.status.phase}}{{"\\t"}}{{.spec.nodeName}}{{"\\n"}}{{end}}', + ], 60); + const pvcsResult = runNodeK3sArgs(spec, [ + "kubectl", + "-n", + HWLAB_CI_NAMESPACE, + "get", + "pvc", + "-o", + 'go-template={{range .items}}{{.metadata.name}}{{"\\t"}}{{.spec.volumeName}}{{"\\t"}}{{.status.phase}}{{"\\t"}}{{range .metadata.ownerReferences}}{{if eq .kind "PipelineRun"}}{{.kind}}{{"\\t"}}{{.name}}{{end}}{{end}}{{"\\t"}}{{.spec.resources.requests.storage}}{{"\\n"}}{{end}}', + ], 60); + const taskRuns = isCommandSuccess(taskRunsResult) ? nodeRuntimeCleanupOwnedTaskRunsFromText(taskRunsResult.stdout, wanted) : []; + const pods = isCommandSuccess(podsResult) ? nodeRuntimeCleanupOwnedPodsFromText(podsResult.stdout, wanted) : []; + const pvcs = isCommandSuccess(pvcsResult) ? nodeRuntimeCleanupOwnedPvcsFromText(pvcsResult.stdout, wanted) : []; + return { + taskRunPreview: taskRuns.slice(0, previewLimit), + podPreview: pods.slice(0, previewLimit), + pvcPreview: pvcs.slice(0, previewLimit), + previewLimit, + truncated: taskRuns.length > previewLimit || pods.length > previewLimit || pvcs.length > previewLimit, + taskRunCount: taskRuns.length, + podCount: pods.length, + pvcCount: pvcs.length, + query: { + taskRuns: compactRuntimeCommandStats(taskRunsResult), + pods: compactRuntimeCommandStats(podsResult), + pvcs: compactRuntimeCommandStats(pvcsResult), + }, + }; +} + +function nodeRuntimeCleanupOwnedTaskRunsFromText(text: string, wanted: Set): Record[] { + return text.split(/\r?\n/u).map((line) => { + const [name = "", pipelineRun = "", status = "", reason = ""] = line.trim().split("\t"); + if (name.length === 0 || !wanted.has(pipelineRun)) return null; + return { + name, + pipelineRun, + status: status || null, + reason: reason || null, + }; + }).filter((item): item is Record => item !== null); +} + +function nodeRuntimeCleanupOwnedPodsFromText(text: string, wanted: Set): Record[] { + return text.split(/\r?\n/u).map((line) => { + const [name = "", 
/**
 * Deletes the given PipelineRuns in the CI namespace and, for small batches, also deletes the
 * Pods and TaskRuns labelled with each PipelineRun name.
 *
 * The names are fed to the remote script on stdin (one per line) rather than interpolated into
 * the script text. PipelineRun deletion is strict: a failing `kubectl delete pipelinerun` aborts
 * the script with a non-zero exit code. The per-PipelineRun Pod/TaskRun deletion is best-effort
 * and only runs when at most 40 names were supplied; its failures do not change the exit code
 * but are reported through `ownedCleanupFailures`.
 *
 * Script stdout is a tab-separated key/value listing: `deletedPipelineRunCount`,
 * `deletedTaskRunLabelGroups`, `deletedPodLabelGroups`, `ownedCleanupFailures`,
 * `explicitOwnedCleanup` (`executed` or `skipped-large-batch`).
 *
 * @param spec Lane spec used to reach the node.
 * @param pipelineRunNames PipelineRun names to delete; an empty list is a successful no-op.
 * @param timeoutSeconds Timeout passed through to the remote script runner.
 * @returns The command result of the remote script, or a synthetic success result
 *   (`stdout: "no candidates"`) when there is nothing to delete.
 */
function deleteNodeRuntimeCleanupRuns(spec: HwlabRuntimeLaneSpec, pipelineRunNames: string[], timeoutSeconds: number): CommandResult {
  if (pipelineRunNames.length === 0) {
    return {
      command: [],
      cwd: repoRoot,
      exitCode: 0,
      stdout: "no candidates",
      stderr: "",
      signal: null,
      timedOut: false,
    };
  }
  const script = [
    "set -eu",
    `namespace=${shellQuote(HWLAB_CI_NAMESPACE)}`,
    "names_file=$(mktemp)",
    // Remove the temp file on every exit path, including a `set -e` abort when xargs/kubectl fails.
    "trap 'rm -f \"$names_file\"' EXIT",
    "cat > \"$names_file\"",
    "deleted_pipeline_runs=$(grep -c . \"$names_file\" | tr -d ' ')",
    "xargs -r -n 50 kubectl -n \"$namespace\" delete pipelinerun --ignore-not-found=true --wait=false < \"$names_file\"",
    "deleted_pod_groups=0",
    "deleted_taskrun_groups=0",
    "owned_cleanup_failures=0",
    "explicit_owned_cleanup=skipped-large-batch",
    "if [ \"$deleted_pipeline_runs\" -le 40 ]; then",
    "  explicit_owned_cleanup=executed",
    "  while IFS= read -r pipeline_run; do",
    "    [ -n \"$pipeline_run\" ] || continue",
    // Count a label group only when its delete actually succeeded; failures stay best-effort
    // but are surfaced instead of being reported as deleted.
    "    if kubectl -n \"$namespace\" delete pod -l tekton.dev/pipelineRun=\"$pipeline_run\" --ignore-not-found=true --wait=false >/dev/null 2>&1; then",
    "      deleted_pod_groups=$((deleted_pod_groups + 1))",
    "    else",
    "      owned_cleanup_failures=$((owned_cleanup_failures + 1))",
    "    fi",
    "    if kubectl -n \"$namespace\" delete taskrun -l tekton.dev/pipelineRun=\"$pipeline_run\" --ignore-not-found=true --wait=false >/dev/null 2>&1; then",
    "      deleted_taskrun_groups=$((deleted_taskrun_groups + 1))",
    "    else",
    "      owned_cleanup_failures=$((owned_cleanup_failures + 1))",
    "    fi",
    "  done < \"$names_file\"",
    "fi",
    "printf 'deletedPipelineRunCount\\t%s\\n' \"$deleted_pipeline_runs\"",
    "printf 'deletedTaskRunLabelGroups\\t%s\\n' \"$deleted_taskrun_groups\"",
    "printf 'deletedPodLabelGroups\\t%s\\n' \"$deleted_pod_groups\"",
    "printf 'ownedCleanupFailures\\t%s\\n' \"$owned_cleanup_failures\"",
    "printf 'explicitOwnedCleanup\\t%s\\n' \"$explicit_owned_cleanup\"",
  ].join("\n");
  return runNodeK3sScript(spec, script, timeoutSeconds, pipelineRunNames.join("\n") + "\n");
}
numericField(fields.taskRuns), + pods: numericField(fields.pods), + pvcs: numericField(fields.pvcs), + result: compactRuntimeCommand(result), + }; +} + function nodeRuntimeBaseImageCommand(scoped: ReturnType): Record { const action = scoped.runtimeImageAction; if (action === null) { @@ -2510,6 +2866,9 @@ function nodeRuntimeControlPlaneStatus(scoped: ReturnType, scoped: ReturnType): Record { const pipelineRun = record(status.pipelineRun); + const pipelineRunDiagnostics = record(status.pipelineRunDiagnostics); const argo = record(status.argo); const runtime = record(status.runtime); const publicProbes = record(status.publicProbes); @@ -2688,6 +3052,14 @@ function summarizeNodeRuntimeControlPlaneStatus(status: Record, message: pipelineRun.message ?? null, createdAt: pipelineRun.createdAt ?? null, ready: pipelineRun.status === "True", + diagnostics: Object.keys(pipelineRunDiagnostics).length === 0 ? null : { + degradedReason: pipelineRunDiagnostics.degradedReason ?? null, + taskRunCount: pipelineRunDiagnostics.taskRunCount ?? null, + podCount: pipelineRunDiagnostics.podCount ?? null, + pendingTaskRuns: pipelineRunDiagnostics.pendingTaskRuns ?? [], + unscheduledPods: pipelineRunDiagnostics.unscheduledPods ?? [], + schedulingMessages: pipelineRunDiagnostics.schedulingMessages ?? [], + }, }, argo: { application: argo.application ?? 
null, @@ -2746,6 +3118,9 @@ function nodeRuntimeStatusNextAction(status: Record, scoped: Re if (reason === "pipelinerun-not-succeeded") { return `bun scripts/cli.ts hwlab nodes control-plane trigger-current --node ${scoped.node} --lane ${scoped.lane} --confirm`; } + if (reason === "node-runtime-ci-pod-capacity-exhausted" || reason === "node-runtime-ci-pod-unschedulable") { + return `bun scripts/cli.ts hwlab nodes control-plane cleanup-runs --node ${scoped.node} --lane ${scoped.lane} --min-age-minutes 60 --limit 20 --dry-run`; + } if (reason === "public-probe-not-ready") { return `bun scripts/cli.ts hwlab nodes web-probe run --node ${scoped.node} --lane ${scoped.lane}`; } @@ -2765,6 +3140,80 @@ function nodeRuntimeStatusCommand(scoped: ReturnType { + const taskRunsResult = runNodeK3sArgs(spec, ["kubectl", "-n", HWLAB_CI_NAMESPACE, "get", "taskrun", "-l", `tekton.dev/pipelineRun=${pipelineRun}`, "-o", "json"], 60); + const podsResult = runNodeK3sArgs(spec, ["kubectl", "-n", HWLAB_CI_NAMESPACE, "get", "pod", "-l", `tekton.dev/pipelineRun=${pipelineRun}`, "-o", "json"], 60); + const taskRuns = isCommandSuccess(taskRunsResult) ? nodeRuntimePipelineDiagnosticTaskRuns(parseJsonRecordFromText(taskRunsResult.stdout)) : []; + const pods = isCommandSuccess(podsResult) ? nodeRuntimePipelineDiagnosticPods(parseJsonRecordFromText(podsResult.stdout)) : []; + const pendingTaskRuns = taskRuns.filter((item) => item.status !== "True" && item.status !== "False"); + const unscheduledPods = pods.filter((item) => item.scheduled === false); + const schedulingMessages = unscheduledPods + .map((item) => typeof item.scheduledMessage === "string" ? 
/**
 * Reduces a `kubectl get taskrun -o json` list document to one summary row per TaskRun.
 *
 * Each row carries the TaskRun name, the status/reason/message of its `Succeeded` condition,
 * and `status.podName`. Missing values are reported as `null`.
 *
 * @param json Parsed list document; a missing or non-array `items` yields an empty result.
 */
function nodeRuntimePipelineDiagnosticTaskRuns(json: Record<string, unknown>): Array<Record<string, unknown>> {
  const items = Array.isArray(json.items) ? json.items.map(record) : [];
  return items.map((item) => {
    const metadata = record(item.metadata);
    const status = record(item.status);
    const conditions: Array<Record<string, unknown>> = Array.isArray(status.conditions) ? status.conditions.map(record) : [];
    const empty: Record<string, unknown> = {};
    // Select the condition by type rather than by position, matching how the Pod diagnostics
    // look up `PodScheduled`; fall back to the first condition when no `Succeeded` entry exists.
    const condition = conditions.find((entry) => entry.type === "Succeeded") ?? conditions[0] ?? empty;
    return {
      name: metadata.name ?? null,
      status: condition.status ?? null,
      reason: condition.reason ?? null,
      message: condition.message ?? null,
      podName: status.podName ?? null,
    };
  });
}
status.conditions.map(record) : []; + const scheduled = conditions.find((condition) => condition.type === "PodScheduled"); + return { + name: metadata.name ?? null, + phase: status.phase ?? null, + nodeName: spec.nodeName ?? null, + scheduled: scheduled === undefined ? null : scheduled.status === "True", + scheduledReason: scheduled?.reason ?? null, + scheduledMessage: scheduled?.message ?? null, + }; + }); +} + function nodeRuntimeRenderToken(): string { return `${process.pid}-${Date.now().toString(36)}-${Math.random().toString(16).slice(2, 10)}`.replace(/[^A-Za-z0-9_.-]/gu, "-"); }