fix: add node scoped hwlab ci cleanup

This commit is contained in:
Codex
2026-06-18 14:22:33 +00:00
parent 1ae7796aa0
commit dc19a07478
4 changed files with 459 additions and 4 deletions
+4
View File
@@ -214,6 +214,10 @@ bun scripts/cli.ts hwlab g14 tools-image build \
bun scripts/cli.ts hwlab g14 control-plane cleanup-runs \
--lane v02|g14|all [--min-age-minutes N] [--limit N] [--dry-run|--confirm]
# D601/G14 node-scoped runtime lane retention
bun scripts/cli.ts hwlab nodes control-plane cleanup-runs \
--node D601 --lane v03 [--min-age-minutes N] [--limit N] [--dry-run|--confirm --wait]
# 补充清理 Released PV
bun scripts/cli.ts hwlab g14 control-plane cleanup-released-pvs \
--lane all [--limit N] [--dry-run|--confirm]
+1 -1
View File
@@ -77,7 +77,7 @@ G14/D601 v03 的 bootstrap admin password 是 HWLAB runtime Secret 生命周期
`--dry-run` 只报告是否会 pre-sync,不创建 Jobconfirmed trigger 默认创建 `.state/jobs/` 异步 job 并立刻返回 `job.id``statusCommand`、stdout/stderr 路径,避免 git mirror pre-sync 或 PipelineRun 创建期间长时间阻塞;`--wait` 路径也必须向 stderr 输出 `hwlab.v02.trigger.progress` JSON 事件,覆盖 `control-plane-refresh``git-mirror-pre-sync``create-pipelinerun`,避免异步 job 长时间只有启动命令而无法判断卡点;默认 JSON 必须对 `manifest_b64`、长脚本和远端 stdout/stderr 做有界摘要,保留长度与 hash,最终 trigger 结果只返回阶段摘要和关键 tail,完整内容通过 job stdout/stderr 文件渐进披露;只有现场同步调试才显式加 `--wait`;旧 `rerun-current` 只作为输入别名保留。PipelineRun `Completed`、Argo `Synced/Healthy``webAssets.ok=true` 只证明 G14 runtime 已更新;交付收口还必须用 `hwlab g14 git-mirror status` 查看 `cache.summary.pendingFlush`,若为 true,继续执行受控 `hwlab g14 git-mirror flush --confirm` 并用 job status 轮询到 `pendingFlush=false`
- `hwlab g14 control-plane runtime-migration --lane v02 [--dry-run|--allow-live-db-read --dry-run|--confirm]` 只通过 `hwlab-v02` namespace 当前 `deployment/hwlab-cloud-api -c hwlab-cloud-api` 内 repo-owned migration CLI 执行;不读取或打印 Secret 值、不触碰 PROD、不绕到手工 `psql`
- `hwlab g14 secret status|ensure --lane v02 --name hwlab-v02-openfga|hwlab-v02-master-server-admin-api-key [--dry-run|--confirm]``hwlab nodes secret status|ensure --node G14 --lane v03 --name hwlab-v03-master-server-admin-api-key [--dry-run|--confirm]` 是 HWLAB runtime lane SecretRef bootstrap 的保留入口。v03+ Cloud API/OpenFGA datastore SecretRef 已迁移到 G14 platform PostgreSQL`hwlab nodes secret status --node G14 --lane v03 --name hwlab-cloud-api-v03-db|hwlab-v03-openfga` 只做 redacted SecretRef 与 `g14-platform-postgres` bridge 观测;旧 `ensure` 路径已删除,不再从 `hwlab-v03-postgres` Secret 或 StatefulSet 派生。`hwpod-v03-db``hwpod_v03``hwpod_v03_app` 是废弃残留,不能作为 status 完成态保留,发现后用 `hwlab nodes secret cleanup-obsolete --node G14 --lane v03 --name hwpod-v03-db [--dry-run|--confirm]` 清理。平台库凭据、桥接 Service 和 SecretRef 轮换边界见 `docs/reference/g14-platform-db.md`。master server admin API key preset 确保本机 `/root/.config/hwlab-v0x/master-server-admin-api-key.env` 以 0600 保存 `HWLAB_API_KEY`,并同步到对应 lane 的 `*-master-server-admin-api-key/api-key``status` 只返回 key 是否存在、解码后字节数、key prefix、bridge 存在性和 runtime health 相关结果,永远不读取、不打印、不回传 secret 明文。`hwlab nodes secret cleanup-owned-postgres --node G14 --lane v03 [--dry-run|--confirm]` 是 v0.3+ 迁移到 G14 平台 Postgres 后的受控残留清理入口,精确删除旧 repo-owned `hwlab-v03-postgres` StatefulSet/Service/ConfigMap/Secret 和 `data-hwlab-v03-postgres-0` PVC;它要求 `g14-platform-postgres` Service 已存在,默认 dry-run,不触碰平台数据库、OpenFGA/Cloud API 当前 SecretRef 或 GitOps desired state。`hwlab g14 secret delete --lane v02 --name <obsolete-hwlab-v02-secret> [--dry-run|--confirm]` 只用于删除确认已不被 workload 引用的 v0.2 废弃 Secret,默认 dry-run,拒绝删除 OpenFGA/Postgres/master admin API key 等必需 Secret;共享 device-pod API key 已退出当前授权路径,不再提供 ensure/bootstrap 入口。
- `hwlab g14 control-plane cleanup-runs --lane v02|v03|g14|all [--min-age-minutes N] [--limit N] [--dry-run|--confirm]` 是完成态 PipelineRun 工作区 retention 入口;真实清理只删除已完成 PipelineRun,让 Tekton/local-path 回收临时 PVC,不触碰 registry storage、业务 PVC、Secret、runtime workload 或 GitOps desired state。带 `--pipeline-run <name>``--source-commit <full-sha>` 的定点清理必须先直接查询目标 PipelineRun,而不是只从全量列表过滤;不存在的目标返回 `target-pipelinerun-not-found`,未完成目标返回 `target-pipelinerun-not-terminal`空查询和读取失败分别返回 `target-pipelinerun-query-empty` / `target-pipelinerun-query-failed`,年龄保护仍返回 `below-min-age``hwlab nodes control-plane cleanup-runs --node G14 --lane v03 --pipeline-run <name>` 是 v0.3 failed run 受控重试前的清理入口。
- `hwlab g14 control-plane cleanup-runs --lane v02|v03|g14|all [--min-age-minutes N] [--limit N] [--dry-run|--confirm]` 和 `hwlab nodes control-plane cleanup-runs --node <node> --lane <lane> [--min-age-minutes N] [--limit N] [--dry-run|--confirm]` 是完成态 PipelineRun 工作区 retention 入口;真实清理只删除已完成 PipelineRun 及其 Tekton TaskRun/Pod 链路,让 Tekton/local-path 回收临时 PVC,不触碰 registry storage、业务 PVC、Secret、runtime workload 或 GitOps desired state。带 `--pipeline-run <name>` 或 `--source-commit <full-sha>` 的定点清理必须先直接查询目标 PipelineRun,而不是只从全量列表过滤;不存在的目标返回 `target-pipelinerun-not-found`,未完成目标返回 `target-pipelinerun-not-terminal`,年龄保护仍返回 `below-min-age`。D601 等非默认 node 的 CI Pod capacity 被终态 TaskRun/Pod 占满时,先用 node-scoped `cleanup-runs --dry-run` 查看保护对象、候选 PipelineRun、owned TaskRun/Pod/PVC 和后续 `--confirm --wait` 命令;禁止用原生 `kubectl delete` 长期替代该入口。
- `hwlab g14 control-plane cleanup-released-pvs --lane all [--limit N] [--dry-run|--confirm]` 是 local-path 未自动回收后的补充 retention 入口;只列并删除 `Released`、`local-path`、`Delete`、`claimNamespace=hwlab-ci` 且 claim 名称形如 Tekton 临时 `pvc-*` 的 PV。
- `hwlab g14 git-mirror status|apply|sync|flush [--dry-run|--confirm]``devops-infra` git mirror/relay 的受控维护入口:`apply` 渲染并 server-side apply `devops-infra/git-mirror.yaml`,同时删除遗留 `git-mirror-hwlab-sync` CronJob`sync` 创建一次性 manual Job,把 GitHub allowlist refs 拉入本地 mirror`flush` 创建一次性 manual Job,把本地 `v0.2-gitops` 快进推回 GitHub。
`status` 返回 read/write URL、last sync/write/flush、本地 ref、GitHub staging ref 和 pending flush 状态,并在 `cache.summary` 给出 `localV02``localGitops``githubGitops``pendingFlush``flushNeeded``githubInSync` 和下一条受控 `flushCommand`。confirmed `sync``flush` 默认创建 `.state/jobs/` 异步 job 并立刻返回可查询状态,只有现场同步调试才显式加 `--wait`mirror 不设置 CronJob。
+2
View File
@@ -19,6 +19,8 @@ export function hwlabNodeHelp(): Record<string, unknown> {
"bun scripts/cli.ts hwlab nodes control-plane plan --node D601 --lane v03",
"bun scripts/cli.ts hwlab nodes control-plane status --node D601 --lane v03",
"bun scripts/cli.ts hwlab nodes control-plane status --node D601 --lane v03 --full",
"bun scripts/cli.ts hwlab nodes control-plane cleanup-runs --node D601 --lane v03 --min-age-minutes 60 --limit 20 --dry-run",
"bun scripts/cli.ts hwlab nodes control-plane cleanup-runs --node D601 --lane v03 --min-age-minutes 60 --limit 20 --confirm --wait",
"bun scripts/cli.ts hwlab nodes control-plane status --node G14 --lane v03",
"bun scripts/cli.ts hwlab nodes control-plane apply --node G14 --lane v03 --dry-run",
"bun scripts/cli.ts hwlab nodes control-plane refresh --node G14 --lane v03 --confirm",
+452 -3
View File
@@ -78,6 +78,25 @@ interface NodeRuntimeRenderResult {
readonly location: NodeRuntimeRenderLocation;
}
/**
 * One PipelineRun considered by the node-scoped `cleanup-runs` action.
 *
 * Rows are parsed from tab-separated kubectl output and then optionally
 * annotated with `selected` / `selectedReason` by the candidate lister.
 */
interface NodeRuntimeCleanupPipelineRunRow {
// PipelineRun object name.
name: string;
// Raw creation timestamp text; null when kubectl printed an empty field.
createdAt: string | null;
// Whole minutes elapsed since `createdAt`; null when the timestamp is missing or unparseable.
ageMinutes: number | null;
// Status of the first reported condition; "True" and "False" are treated as terminal by the lister.
status: string | null;
// Reason of the first reported condition, or "target-pipelinerun-not-found" for a missing targeted run.
reason: string | null;
// Only an explicit `false` excludes the row from deletion; an absent value means the row is selected.
selected?: boolean;
// Machine-readable explanation of why the row was not selected (for example "below-min-age").
selectedReason?: string;
}
/**
 * Parsed options for the node-scoped `cleanup-runs` action.
 */
interface NodeRuntimeCleanupOptions {
// Value of `--min-age-minutes`; runs younger than this are never selected.
minAgeMinutes: number;
// Value of `--limit`; caps the number of candidate rows returned by the untargeted listing.
limit: number;
// Lower-cased full 40-character git sha from `--source-commit`, when given.
sourceCommit?: string;
// Validated, lower-cased PipelineRun name from `--pipeline-run`, when given.
pipelineRun?: string;
// The single PipelineRun to clean: `pipelineRun`, or the name derived from `sourceCommit`.
targetPipelineRun?: string;
// True unless the caller passed `--confirm` without `--dry-run`.
dryRun: boolean;
}
interface NodeSecretOptions {
action: SecretAction;
node: string;
@@ -195,6 +214,7 @@ const CODE_AGENT_PROVIDER_OPENAI_KEY = "openai-api-key";
const CODE_AGENT_PROVIDER_OPENCODE_KEY = "opencode-api-key";
const CODE_AGENT_PROVIDER_SOURCE_NAMESPACE = "hwlab-v02";
const CODE_AGENT_PROVIDER_SOURCE_SECRET = "hwlab-v02-code-agent-provider";
const HWLAB_CI_NAMESPACE = "hwlab-ci";
export async function runHwlabNodeCommand(_config: Config, args: string[]): Promise<Record<string, unknown>> {
if (args.length === 0) return hwlabNodeHelp();
@@ -280,7 +300,7 @@ async function runNodeDelegatedDomain(config: Config, domain: DelegatedNodeDomai
}
if (domain === "control-plane" && scoped.node !== defaultSpec.nodeId) {
if (scoped.action === "status") return nodeRuntimeControlPlaneStatus(scoped);
if (scoped.action === "apply" || scoped.action === "trigger-current" || scoped.action === "refresh" || scoped.action === "sync" || scoped.action === "runtime-migration") {
if (scoped.action === "apply" || scoped.action === "trigger-current" || scoped.action === "refresh" || scoped.action === "sync" || scoped.action === "runtime-migration" || scoped.action === "cleanup-runs") {
if (scoped.confirm && !scoped.dryRun && !scoped.wait) return startNodeDelegatedJob(scoped);
return nodeRuntimeControlPlaneRun(scoped);
}
@@ -1130,6 +1150,15 @@ function compactRuntimeCommand(result: CommandResult): Record<string, unknown> {
};
}
/**
 * Summarises a finished command without echoing its output.
 *
 * Only the exit code, the byte sizes of stdout/stderr and the timeout flag
 * are reported, which keeps JSON results bounded for large kubectl listings.
 */
function compactRuntimeCommandStats(result: CommandResult): Record<string, unknown> {
  const { exitCode, stdout, stderr, timedOut } = result;
  const stats: Record<string, unknown> = {
    exitCode,
    stdoutBytes: Buffer.byteLength(stdout),
    stderrBytes: Buffer.byteLength(stderr),
    timedOut,
  };
  return stats;
}
function parseLastJsonLineObject(text: string): Record<string, unknown> {
for (const line of text.split(/\r?\n/u).reverse()) {
const trimmed = line.trim();
@@ -1214,7 +1243,7 @@ function nodeRuntimeUnsupportedAction(scoped: ReturnType<typeof parseNodeScopedD
lane: scoped.lane,
mutation: false,
degradedReason: "unsupported-node-scoped-runtime-action",
message: "node-scoped runtime currently supports plan/status/apply/refresh/sync/trigger-current/runtime-image/runtime-migration",
message: "node-scoped runtime currently supports plan/status/apply/refresh/sync/trigger-current/cleanup-runs/runtime-image/runtime-migration",
expected: nodeRuntimeExpected(scoped.spec),
};
}
@@ -1430,9 +1459,336 @@ function nodeRuntimeControlPlaneRun(scoped: ReturnType<typeof parseNodeScopedDel
if (scoped.action === "apply") return nodeRuntimeApply(scoped);
if (scoped.action === "trigger-current") return nodeRuntimeTriggerCurrent(scoped);
if (scoped.action === "runtime-migration") return nodeRuntimeMigration(scoped);
if (scoped.action === "cleanup-runs") return nodeRuntimeCleanupRuns(scoped);
return nodeRuntimeUnsupportedAction(scoped);
}
/**
 * Builds the cleanup options from the node-scoped command line.
 *
 * `--source-commit` and `--pipeline-run` are mutually exclusive; the commit
 * must be a full 40-character sha and the PipelineRun name must match the
 * lane's prefix. Anything short of `--confirm` without `--dry-run` is a dry run.
 *
 * @throws Error when both selectors are given or either one is malformed.
 */
function parseNodeRuntimeCleanupOptions(scoped: ReturnType<typeof parseNodeScopedDelegatedOptions>): NodeRuntimeCleanupOptions {
  const args = scoped.originalArgs;
  const rawCommit = optionValue(args, "--source-commit");
  const rawPipelineRun = optionValue(args, "--pipeline-run");
  const hasCommit = rawCommit !== undefined;
  const hasPipelineRun = rawPipelineRun !== undefined;
  if (hasCommit && hasPipelineRun) {
    throw new Error("control-plane cleanup-runs accepts only one of --source-commit or --pipeline-run");
  }

  const sourceCommit = rawCommit?.toLowerCase();
  if (sourceCommit !== undefined) {
    const isFullSha = /^[0-9a-f]{40}$/u.test(sourceCommit);
    if (!isFullSha) {
      throw new Error("--source-commit must be a full 40-character git sha for cleanup-runs");
    }
  }

  let pipelineRun: string | undefined;
  if (rawPipelineRun !== undefined) {
    pipelineRun = validateNodeRuntimePipelineRunName(scoped.spec, rawPipelineRun);
  }

  const minAgeMinutes = positiveIntegerOption(args, "--min-age-minutes", 60, 10080);
  const limit = positiveIntegerOption(args, "--limit", 20, 200);

  // An explicit PipelineRun wins; otherwise derive the name from the commit.
  let targetPipelineRun: string | undefined = pipelineRun;
  if (targetPipelineRun === undefined && sourceCommit !== undefined) {
    targetPipelineRun = nodeRuntimePipelineRunName(scoped.spec, sourceCommit);
  }

  return {
    minAgeMinutes,
    limit,
    sourceCommit,
    pipelineRun,
    targetPipelineRun,
    dryRun: scoped.dryRun || !scoped.confirm,
  };
}
/**
 * Checks that `value` looks like `<prefix>-<sha>[-suffix]` for the given lane.
 *
 * The match is case-insensitive; the accepted name is returned lower-cased.
 *
 * @throws Error when the name does not match the lane's PipelineRun prefix pattern.
 */
function validateNodeRuntimePipelineRunName(spec: HwlabRuntimeLaneSpec, value: string): string {
  // Escape regex metacharacters so the prefix is matched literally.
  const literalPrefix = spec.pipelineRunPrefix.replace(/[.*+?^${}()|[\]\\]/gu, "\\$&");
  const namePattern = new RegExp(`^${literalPrefix}-[0-9a-f]{7,40}(?:-[a-z0-9][a-z0-9-]{0,24})?$`, "iu");
  if (namePattern.test(value)) {
    return value.toLowerCase();
  }
  throw new Error(`--pipeline-run must be a ${spec.pipelineRunPrefix}-<sha>[-rerun] PipelineRun name for --lane ${spec.lane}`);
}
/**
 * Node-scoped retention entry point for terminal PipelineRuns in the CI namespace.
 *
 * In dry-run mode it only reports the candidates, the resources labelled or
 * owned by the selected runs, the current CI object counts and the command to
 * run next. In confirmed mode it deletes the selected PipelineRuns and reports
 * counts before and after.
 *
 * Changes relative to the previous behaviour:
 * - `mutation` is true only when the delete succeeded AND at least one
 *   PipelineRun was actually selected (an empty selection mutates nothing).
 * - The confirmed result also carries `candidates` / `candidateCount`, so skip
 *   reasons such as `target-pipelinerun-not-found` or `below-min-age` are
 *   visible after a confirmed run, not only in dry-run.
 * - `next.confirm` carries the `bun scripts/cli.ts` prefix like the other
 *   `next` commands, so it can be pasted as-is.
 *
 * @param scoped Parsed node-scoped options (node, lane, spec, flags, original args).
 * @returns A JSON-serialisable report; `ok` reflects the delete command in confirmed mode.
 * @throws Error when option parsing or the PipelineRun query fails.
 */
function nodeRuntimeCleanupRuns(scoped: ReturnType<typeof parseNodeScopedDelegatedOptions>): Record<string, unknown> {
  const options = parseNodeRuntimeCleanupOptions(scoped);
  const beforeCounts = nodeRuntimeCiObjectCounts(scoped.spec);
  const candidates = listNodeRuntimeCleanupPipelineRuns(scoped.spec, options);
  // Rows without an explicit `selected: false` are the ones to delete.
  const selectedPipelineRuns = candidates
    .filter((item) => item.selected !== false)
    .map((item) => item.name);
  const ownedResources = listNodeRuntimeCleanupOwnedResources(scoped.spec, selectedPipelineRuns);
  const command = `hwlab nodes control-plane cleanup-runs --node ${scoped.node} --lane ${scoped.lane}`;

  if (options.dryRun) {
    return {
      ok: true,
      command,
      mode: "dry-run",
      node: scoped.node,
      lane: scoped.lane,
      namespace: HWLAB_CI_NAMESPACE,
      minAgeMinutes: options.minAgeMinutes,
      limit: options.limit,
      sourceCommit: options.sourceCommit,
      pipelineRun: options.pipelineRun,
      candidates,
      candidateCount: candidates.length,
      selectedPipelineRuns,
      selectedPipelineRunCount: selectedPipelineRuns.length,
      ownedResources,
      ciObjectCounts: beforeCounts,
      mutation: false,
      next: {
        confirm: [
          `bun scripts/cli.ts ${command}`,
          `--min-age-minutes ${options.minAgeMinutes}`,
          `--limit ${options.limit}`,
          options.pipelineRun === undefined ? "" : `--pipeline-run ${options.pipelineRun}`,
          options.sourceCommit === undefined ? "" : `--source-commit ${options.sourceCommit}`,
          "--confirm",
          "--wait",
        ].filter(Boolean).join(" "),
      },
    };
  }

  const deletion = deleteNodeRuntimeCleanupRuns(scoped.spec, selectedPipelineRuns, scoped.timeoutSeconds);
  const afterCounts = nodeRuntimeCiObjectCounts(scoped.spec);
  const deleteSucceeded = isCommandSuccess(deletion);
  return {
    ok: deleteSucceeded,
    command,
    mode: "confirmed-cleanup",
    node: scoped.node,
    lane: scoped.lane,
    namespace: HWLAB_CI_NAMESPACE,
    minAgeMinutes: options.minAgeMinutes,
    limit: options.limit,
    sourceCommit: options.sourceCommit,
    pipelineRun: options.pipelineRun,
    candidates,
    candidateCount: candidates.length,
    deletedPipelineRuns: selectedPipelineRuns,
    deletedPipelineRunCount: selectedPipelineRuns.length,
    ownedResourcesBefore: ownedResources,
    ciObjectCountsBefore: beforeCounts,
    ciObjectCountsAfter: afterCounts,
    deletion: compactRuntimeCommand(deletion),
    // Nothing was mutated when there was nothing to delete.
    mutation: deleteSucceeded && selectedPipelineRuns.length > 0,
    degradedReason: deleteSucceeded ? undefined : "node-runtime-ci-cleanup-delete-failed",
    next: {
      status: `bun scripts/cli.ts hwlab nodes control-plane status --node ${scoped.node} --lane ${scoped.lane}`,
      rerunStatus: options.targetPipelineRun === undefined
        ? undefined
        : `bun scripts/cli.ts hwlab nodes control-plane status --node ${scoped.node} --lane ${scoped.lane} --pipeline-run ${options.targetPipelineRun}`,
    },
  };
}
/**
 * Resolves the PipelineRuns that `cleanup-runs` may delete.
 *
 * Targeted mode (`options.targetPipelineRun` set): the target is queried
 * directly by name with `--ignore-not-found` instead of being filtered out of
 * a full listing, and exactly one row is returned. A row that must not be
 * deleted carries `selected: false` plus a `selectedReason`
 * (`target-pipelinerun-not-found`, `target-pipelinerun-not-terminal`,
 * `missing-creation-timestamp` or `below-min-age`). The not-found row keeps the
 * code in `reason` as before and now also sets `selectedReason`, matching the
 * other skip rows.
 *
 * Untargeted mode: lists all PipelineRuns, keeps terminal ones with the lane's
 * prefix that are at least `minAgeMinutes` old, oldest first, marks the newest
 * terminal run as protected, and truncates to `options.limit` rows.
 *
 * @throws Error when the kubectl query fails.
 */
function listNodeRuntimeCleanupPipelineRuns(spec: HwlabRuntimeLaneSpec, options: NodeRuntimeCleanupOptions): NodeRuntimeCleanupPipelineRunRow[] {
  // One tab-separated row per PipelineRun: name, creationTimestamp, condition status, condition reason.
  const rowTemplate = '{.metadata.name}{"\\t"}{.metadata.creationTimestamp}{"\\t"}{.status.conditions[0].status}{"\\t"}{.status.conditions[0].reason}{"\\n"}';

  const targetName = options.targetPipelineRun;
  if (targetName !== undefined) {
    // With --ignore-not-found a missing object yields exit 0 and empty output.
    const targetResult = runNodeK3sArgs(spec, [
      "kubectl",
      "-n",
      HWLAB_CI_NAMESPACE,
      "get",
      "pipelinerun",
      targetName,
      "--ignore-not-found=true",
      "-o",
      `jsonpath=${rowTemplate}`,
    ], 60);
    if (!isCommandSuccess(targetResult)) {
      throw new Error(`failed to query ${HWLAB_CI_NAMESPACE} PipelineRun ${targetName}: ${targetResult.stderr.trim().slice(0, 1000)}`);
    }
    const target = nodeRuntimeCleanupPipelineRunRowsFromText(targetResult.stdout)
      .find((item) => item.name === targetName);
    if (target === undefined) {
      return [{
        name: targetName,
        createdAt: null,
        ageMinutes: null,
        status: null,
        reason: "target-pipelinerun-not-found",
        selected: false,
        selectedReason: "target-pipelinerun-not-found",
      }];
    }
    if (target.status !== "True" && target.status !== "False") {
      return [{ ...target, selected: false, selectedReason: "target-pipelinerun-not-terminal" }];
    }
    if (target.ageMinutes === null || target.ageMinutes < options.minAgeMinutes) {
      return [{ ...target, selected: false, selectedReason: target.ageMinutes === null ? "missing-creation-timestamp" : "below-min-age" }];
    }
    return [target];
  }

  const listResult = runNodeK3sArgs(spec, [
    "kubectl",
    "-n",
    HWLAB_CI_NAMESPACE,
    "get",
    "pipelinerun",
    "-o",
    `jsonpath={range .items[*]}${rowTemplate}{end}`,
  ], 60);
  if (!isCommandSuccess(listResult)) {
    throw new Error(`failed to list ${HWLAB_CI_NAMESPACE} PipelineRuns: ${listResult.stderr.trim().slice(0, 1000)}`);
  }
  const rows = nodeRuntimeCleanupPipelineRunRowsFromText(listResult.stdout);
  const prefix = `${spec.pipelineRunPrefix}-`;
  const terminalRuns = rows
    .filter((item) => item.name.startsWith(prefix))
    .filter((item) => item.status === "True" || item.status === "False")
    .sort((left, right) => String(left.createdAt ?? "").localeCompare(String(right.createdAt ?? "")));
  // The newest terminal run is kept as evidence of the latest build.
  const protectedLatest = [...terminalRuns]
    .sort((left, right) => String(right.createdAt ?? "").localeCompare(String(left.createdAt ?? "")))[0]?.name ?? null;
  return terminalRuns
    .filter((item) => typeof item.ageMinutes === "number" && item.ageMinutes >= options.minAgeMinutes)
    .map((item) => item.name === protectedLatest ? { ...item, selected: false, selectedReason: "protected-latest-pipelinerun" } : item)
    .slice(0, options.limit);
}
/**
 * Parses tab-separated kubectl output into PipelineRun rows.
 *
 * Each non-empty line is `name<TAB>creationTimestamp<TAB>status<TAB>reason`;
 * missing fields become null and the age is computed against a single
 * "now" captured once for the whole batch.
 */
function nodeRuntimeCleanupPipelineRunRowsFromText(text: string): NodeRuntimeCleanupPipelineRunRow[] {
  const nowMs = Date.now();
  const parsed: NodeRuntimeCleanupPipelineRunRow[] = [];
  for (const rawLine of text.split(/\r?\n/u)) {
    const fields = rawLine.trim().split("\t");
    const name = fields[0] ?? "";
    if (name.length === 0) continue;
    const timestampText = fields[1] ?? "";
    const createdAt = timestampText.length > 0 ? timestampText : null;
    const createdMs = createdAt === null ? NaN : Date.parse(createdAt);
    let ageMinutes: number | null = null;
    if (Number.isFinite(createdMs)) {
      ageMinutes = Math.floor((nowMs - createdMs) / 60000);
    }
    parsed.push({
      name,
      createdAt,
      ageMinutes,
      status: (fields[2] ?? "") || null,
      reason: (fields[3] ?? "") || null,
    });
  }
  return parsed;
}
/**
 * Previews the TaskRuns, Pods and PVCs that belong to the given PipelineRuns.
 *
 * TaskRuns and Pods are matched through the `tekton.dev/pipelineRun` label,
 * PVCs through a PipelineRun owner reference. A failed query contributes an
 * empty list; its exit code and output sizes are still reported under `query`.
 * At most `previewLimit` items per kind are returned, with full counts alongside.
 */
function listNodeRuntimeCleanupOwnedResources(spec: HwlabRuntimeLaneSpec, pipelineRunNames: string[]): Record<string, unknown> {
  const previewLimit = 24;
  if (pipelineRunNames.length === 0) {
    // Nothing selected: skip the cluster queries entirely.
    return {
      taskRunPreview: [],
      podPreview: [],
      pvcPreview: [],
      previewLimit,
      truncated: false,
      taskRunCount: 0,
      podCount: 0,
      pvcCount: 0,
    };
  }

  const wanted = new Set(pipelineRunNames);
  const getWithTemplate = (kind: string, outputTemplate: string) => runNodeK3sArgs(
    spec,
    ["kubectl", "-n", HWLAB_CI_NAMESPACE, "get", kind, "-o", outputTemplate],
    60,
  );

  const taskRunsResult = getWithTemplate(
    "taskrun",
    'go-template={{range .items}}{{.metadata.name}}{{"\\t"}}{{index .metadata.labels "tekton.dev/pipelineRun"}}{{"\\t"}}{{range .status.conditions}}{{if eq .type "Succeeded"}}{{.status}}{{"\\t"}}{{.reason}}{{end}}{{end}}{{"\\n"}}{{end}}',
  );
  const podsResult = getWithTemplate(
    "pod",
    'go-template={{range .items}}{{.metadata.name}}{{"\\t"}}{{index .metadata.labels "tekton.dev/pipelineRun"}}{{"\\t"}}{{.status.phase}}{{"\\t"}}{{.spec.nodeName}}{{"\\n"}}{{end}}',
  );
  const pvcsResult = getWithTemplate(
    "pvc",
    'go-template={{range .items}}{{.metadata.name}}{{"\\t"}}{{.spec.volumeName}}{{"\\t"}}{{.status.phase}}{{"\\t"}}{{range .metadata.ownerReferences}}{{if eq .kind "PipelineRun"}}{{.kind}}{{"\\t"}}{{.name}}{{end}}{{end}}{{"\\t"}}{{.spec.resources.requests.storage}}{{"\\n"}}{{end}}',
  );

  let taskRuns: Record<string, unknown>[] = [];
  if (isCommandSuccess(taskRunsResult)) {
    taskRuns = nodeRuntimeCleanupOwnedTaskRunsFromText(taskRunsResult.stdout, wanted);
  }
  let pods: Record<string, unknown>[] = [];
  if (isCommandSuccess(podsResult)) {
    pods = nodeRuntimeCleanupOwnedPodsFromText(podsResult.stdout, wanted);
  }
  let pvcs: Record<string, unknown>[] = [];
  if (isCommandSuccess(pvcsResult)) {
    pvcs = nodeRuntimeCleanupOwnedPvcsFromText(pvcsResult.stdout, wanted);
  }

  const truncated = [taskRuns, pods, pvcs].some((items) => items.length > previewLimit);
  return {
    taskRunPreview: taskRuns.slice(0, previewLimit),
    podPreview: pods.slice(0, previewLimit),
    pvcPreview: pvcs.slice(0, previewLimit),
    previewLimit,
    truncated,
    taskRunCount: taskRuns.length,
    podCount: pods.length,
    pvcCount: pvcs.length,
    query: {
      taskRuns: compactRuntimeCommandStats(taskRunsResult),
      pods: compactRuntimeCommandStats(podsResult),
      pvcs: compactRuntimeCommandStats(pvcsResult),
    },
  };
}
/**
 * Extracts TaskRuns whose PipelineRun label is in `wanted`.
 *
 * Input lines are `name<TAB>pipelineRun<TAB>status<TAB>reason`; empty status
 * or reason fields are reported as null.
 */
function nodeRuntimeCleanupOwnedTaskRunsFromText(text: string, wanted: Set<string>): Record<string, unknown>[] {
  const owned: Record<string, unknown>[] = [];
  for (const rawLine of text.split(/\r?\n/u)) {
    const columns = rawLine.trim().split("\t");
    const name = columns[0] ?? "";
    const pipelineRun = columns[1] ?? "";
    if (name.length === 0) continue;
    if (!wanted.has(pipelineRun)) continue;
    owned.push({
      name,
      pipelineRun,
      status: (columns[2] ?? "") || null,
      reason: (columns[3] ?? "") || null,
    });
  }
  return owned;
}
/**
 * Extracts Pods whose PipelineRun label is in `wanted`.
 *
 * Input lines are `name<TAB>pipelineRun<TAB>phase<TAB>nodeName`; empty phase
 * or node name fields are reported as null.
 */
function nodeRuntimeCleanupOwnedPodsFromText(text: string, wanted: Set<string>): Record<string, unknown>[] {
  const owned: Record<string, unknown>[] = [];
  for (const rawLine of text.split(/\r?\n/u)) {
    const columns = rawLine.trim().split("\t");
    const name = columns[0] ?? "";
    const pipelineRun = columns[1] ?? "";
    if (name.length === 0) continue;
    if (!wanted.has(pipelineRun)) continue;
    owned.push({
      name,
      pipelineRun,
      phase: (columns[2] ?? "") || null,
      nodeName: (columns[3] ?? "") || null,
    });
  }
  return owned;
}
/**
 * Extracts PVCs owned by a PipelineRun in `wanted`.
 *
 * Input lines are
 * `name<TAB>volumeName<TAB>phase<TAB>ownerKind<TAB>ownerName<TAB>storage`;
 * only rows whose owner kind is exactly "PipelineRun" are considered.
 */
function nodeRuntimeCleanupOwnedPvcsFromText(text: string, wanted: Set<string>): Record<string, unknown>[] {
  const owned: Record<string, unknown>[] = [];
  for (const rawLine of text.split(/\r?\n/u)) {
    const columns = rawLine.trim().split("\t");
    const name = columns[0] ?? "";
    const ownerKind = columns[3] ?? "";
    const ownerName = columns[4] ?? "";
    const isOwnedByWantedRun = name.length > 0 && ownerKind === "PipelineRun" && wanted.has(ownerName);
    if (!isOwnedByWantedRun) continue;
    owned.push({
      name,
      pipelineRun: ownerName,
      phase: (columns[2] ?? "") || null,
      volumeName: (columns[1] ?? "") || null,
      storage: (columns[5] ?? "") || null,
    });
  }
  return owned;
}
/**
 * Deletes the given PipelineRuns in the CI namespace on the target node.
 *
 * The names are streamed to the remote script on stdin (one per line) and
 * deleted in batches of 50 without waiting for finalisation. For batches of at
 * most 40 runs the script also issues best-effort label-based deletes for the
 * Pods and TaskRuns of each run; larger batches skip that step and report
 * `explicitOwnedCleanup=skipped-large-batch`.
 *
 * The temporary names file is removed through an EXIT trap, so it is cleaned
 * up even when `set -e` aborts the script because a `kubectl delete` failed
 * (previously the trailing `rm -f` was skipped on failure and the file leaked).
 *
 * @param spec Lane spec identifying the node/cluster to run against.
 * @param pipelineRunNames PipelineRun names to delete; an empty list is a successful no-op.
 * @param timeoutSeconds Timeout passed through to the remote script runner.
 * @returns The command result; a synthetic exit-0 result when there is nothing to delete.
 */
function deleteNodeRuntimeCleanupRuns(spec: HwlabRuntimeLaneSpec, pipelineRunNames: string[], timeoutSeconds: number): CommandResult {
  if (pipelineRunNames.length === 0) {
    return {
      command: [],
      cwd: repoRoot,
      exitCode: 0,
      stdout: "no candidates",
      stderr: "",
      signal: null,
      timedOut: false,
    };
  }
  const script = [
    "set -eu",
    `namespace=${shellQuote(HWLAB_CI_NAMESPACE)}`,
    "names_file=$(mktemp)",
    // Guarantee cleanup on every exit path, including set -e aborts.
    "trap 'rm -f \"$names_file\"' EXIT",
    "cat > \"$names_file\"",
    "deleted_pipeline_runs=$(grep -c . \"$names_file\" | tr -d ' ')",
    "xargs -r -n 50 kubectl -n \"$namespace\" delete pipelinerun --ignore-not-found=true --wait=false < \"$names_file\"",
    "deleted_pod_groups=0",
    "deleted_taskrun_groups=0",
    "explicit_owned_cleanup=skipped-large-batch",
    "if [ \"$deleted_pipeline_runs\" -le 40 ]; then",
    "  explicit_owned_cleanup=executed",
    "  while IFS= read -r pipeline_run; do",
    "    [ -n \"$pipeline_run\" ] || continue",
    "    kubectl -n \"$namespace\" delete pod -l tekton.dev/pipelineRun=\"$pipeline_run\" --ignore-not-found=true --wait=false >/dev/null 2>&1 || true",
    "    deleted_pod_groups=$((deleted_pod_groups + 1))",
    "    kubectl -n \"$namespace\" delete taskrun -l tekton.dev/pipelineRun=\"$pipeline_run\" --ignore-not-found=true --wait=false >/dev/null 2>&1 || true",
    "    deleted_taskrun_groups=$((deleted_taskrun_groups + 1))",
    "  done < \"$names_file\"",
    "fi",
    "printf 'deletedPipelineRunCount\\t%s\\n' \"$deleted_pipeline_runs\"",
    "printf 'deletedTaskRunLabelGroups\\t%s\\n' \"$deleted_taskrun_groups\"",
    "printf 'deletedPodLabelGroups\\t%s\\n' \"$deleted_pod_groups\"",
    "printf 'explicitOwnedCleanup\\t%s\\n' \"$explicit_owned_cleanup\"",
  ].join("\n");
  return runNodeK3sScript(spec, script, timeoutSeconds, pipelineRunNames.join("\n") + "\n");
}
/**
 * Counts PipelineRuns, TaskRuns, Pods and PVCs in the CI namespace.
 *
 * The remote script runs with `set +e` and discards kubectl's stderr, so a
 * failing query still yields a line with whatever `wc -l` counted.
 */
function nodeRuntimeCiObjectCounts(spec: HwlabRuntimeLaneSpec): Record<string, unknown> {
  // Output key -> kubectl resource kind, in the order the lines are printed.
  const countedKinds: ReadonlyArray<readonly [string, string]> = [
    ["pipelineRuns", "pipelinerun"],
    ["taskRuns", "taskrun"],
    ["pods", "pod"],
    ["pvcs", "pvc"],
  ];
  const scriptLines = [
    "set +e",
    `namespace=${shellQuote(HWLAB_CI_NAMESPACE)}`,
    `count_kind() { kubectl -n "$namespace" get "$1" -o name 2>/dev/null | wc -l | tr -d ' '; }`,
  ];
  for (const [key, kind] of countedKinds) {
    scriptLines.push(`printf '${key}\\t%s\\n' "$(count_kind ${kind})"`);
  }
  const result = runNodeK3sScript(spec, scriptLines.join("\n"), 30);
  const fields = keyValueLinesFromText(statusText(result));
  const pipelineRuns = numericField(fields.pipelineRuns);
  const taskRuns = numericField(fields.taskRuns);
  const pods = numericField(fields.pods);
  const pvcs = numericField(fields.pvcs);
  return {
    pipelineRuns,
    taskRuns,
    pods,
    pvcs,
    result: compactRuntimeCommand(result),
  };
}
function nodeRuntimeBaseImageCommand(scoped: ReturnType<typeof parseNodeScopedDelegatedOptions>): Record<string, unknown> {
const action = scoped.runtimeImageAction;
if (action === null) {
@@ -2510,6 +2866,9 @@ function nodeRuntimeControlPlaneStatus(scoped: ReturnType<typeof parseNodeScoped
const argo = runNodeK3sArgs(spec, ["kubectl", "-n", "argocd", "get", "application", spec.app, "-o", "jsonpath={.spec.source.repoURL}{\"\\n\"}{.spec.source.targetRevision}{\"\\n\"}{.spec.source.path}{\"\\n\"}{.status.sync.revision}{\"\\n\"}{.status.sync.status}{\"\\n\"}{.status.health.status}{\"\\n\"}"], 60);
const [repoURL = "", targetRevision = "", path = "", syncRevision = "", syncStatus = "", health = ""] = argo.stdout.split(/\r?\n/u);
const pipelineRunProbe = pipelineRun === null ? null : getNodeRuntimePipelineRun(spec, pipelineRun);
const pipelineRunDiagnostics = pipelineRun !== null && pipelineRunProbe?.status === "Unknown"
? nodeRuntimePipelineRunDiagnostics(spec, pipelineRun)
: null;
const workloads = namespaceExists
? runNodeK3sArgs(spec, ["kubectl", "-n", spec.runtimeNamespace, "get", "deploy,statefulset,svc,ingress,configmap", "-l", `hwlab.pikastech.local/gitops-target=${spec.lane}`, "-o", "name"], 60)
: null;
@@ -2528,6 +2887,9 @@ function nodeRuntimeControlPlaneStatus(scoped: ReturnType<typeof parseNodeScoped
const runtimeReady = namespaceExists && localPostgresObjects.length === 0 && workloadsReady && (spec.externalPostgres === undefined || (bridge.ready && secrets.ready));
const argoReady = argo.exitCode === 0 && repoURL === spec.argoRepoUrl && targetRevision === spec.gitopsBranch && path === spec.runtimePath && syncStatus === "Synced" && health === "Healthy";
const pipelineRunReady = pipelineRunProbe !== null && pipelineRunProbe.status === "True";
const pipelineRunDegradedReason = typeof pipelineRunDiagnostics?.degradedReason === "string"
? pipelineRunDiagnostics.degradedReason
: "pipelinerun-not-succeeded";
const publicReady = publicProbes.ready === true;
const gitMirrorReady = gitMirror.ok === true && gitMirrorCompact.pendingFlush === false && gitMirrorCompact.githubInSync === true;
const fullStatus = {
@@ -2561,6 +2923,7 @@ function nodeRuntimeControlPlaneStatus(scoped: ReturnType<typeof parseNodeScoped
result: compactRuntimeCommand(argo),
},
pipelineRun: pipelineRunProbe,
pipelineRunDiagnostics,
runtime: {
ready: runtimeReady,
namespace: spec.runtimeNamespace,
@@ -2593,7 +2956,7 @@ function nodeRuntimeControlPlaneStatus(scoped: ReturnType<typeof parseNodeScoped
? publicReady
? gitMirrorReady ? undefined : "git-mirror-pending-flush"
: "public-probe-not-ready"
: "pipelinerun-not-succeeded"
: pipelineRunDegradedReason
: "argo-not-synced-healthy"
: namespaceExists ? "runtime-not-ready" : "runtime-namespace-missing"
: "control-plane-not-ready",
@@ -2661,6 +3024,7 @@ function joinUrlPath(baseUrl: string, suffix: string): string {
function summarizeNodeRuntimeControlPlaneStatus(status: Record<string, unknown>, scoped: ReturnType<typeof parseNodeScopedDelegatedOptions>): Record<string, unknown> {
const pipelineRun = record(status.pipelineRun);
const pipelineRunDiagnostics = record(status.pipelineRunDiagnostics);
const argo = record(status.argo);
const runtime = record(status.runtime);
const publicProbes = record(status.publicProbes);
@@ -2688,6 +3052,14 @@ function summarizeNodeRuntimeControlPlaneStatus(status: Record<string, unknown>,
message: pipelineRun.message ?? null,
createdAt: pipelineRun.createdAt ?? null,
ready: pipelineRun.status === "True",
diagnostics: Object.keys(pipelineRunDiagnostics).length === 0 ? null : {
degradedReason: pipelineRunDiagnostics.degradedReason ?? null,
taskRunCount: pipelineRunDiagnostics.taskRunCount ?? null,
podCount: pipelineRunDiagnostics.podCount ?? null,
pendingTaskRuns: pipelineRunDiagnostics.pendingTaskRuns ?? [],
unscheduledPods: pipelineRunDiagnostics.unscheduledPods ?? [],
schedulingMessages: pipelineRunDiagnostics.schedulingMessages ?? [],
},
},
argo: {
application: argo.application ?? null,
@@ -2746,6 +3118,9 @@ function nodeRuntimeStatusNextAction(status: Record<string, unknown>, scoped: Re
if (reason === "pipelinerun-not-succeeded") {
return `bun scripts/cli.ts hwlab nodes control-plane trigger-current --node ${scoped.node} --lane ${scoped.lane} --confirm`;
}
if (reason === "node-runtime-ci-pod-capacity-exhausted" || reason === "node-runtime-ci-pod-unschedulable") {
return `bun scripts/cli.ts hwlab nodes control-plane cleanup-runs --node ${scoped.node} --lane ${scoped.lane} --min-age-minutes 60 --limit 20 --dry-run`;
}
if (reason === "public-probe-not-ready") {
return `bun scripts/cli.ts hwlab nodes web-probe run --node ${scoped.node} --lane ${scoped.lane}`;
}
@@ -2765,6 +3140,80 @@ function nodeRuntimeStatusCommand(scoped: ReturnType<typeof parseNodeScopedDeleg
].filter(Boolean).join(" ");
}
/**
 * Explains why a PipelineRun has not reached a terminal state.
 *
 * Looks up the TaskRuns and Pods labelled with the PipelineRun and derives a
 * degraded reason: pod capacity exhausted (an unscheduled pod reports
 * "too many pods"), pod unschedulable, or TaskRun still pending. When pods
 * cannot be scheduled the result also suggests a `cleanup-runs` dry run.
 */
function nodeRuntimePipelineRunDiagnostics(spec: HwlabRuntimeLaneSpec, pipelineRun: string): Record<string, unknown> {
  const labelSelector = `tekton.dev/pipelineRun=${pipelineRun}`;
  const getLabelledJson = (kind: string) => runNodeK3sArgs(
    spec,
    ["kubectl", "-n", HWLAB_CI_NAMESPACE, "get", kind, "-l", labelSelector, "-o", "json"],
    60,
  );
  const taskRunsResult = getLabelledJson("taskrun");
  const podsResult = getLabelledJson("pod");

  let taskRuns: Array<Record<string, unknown>> = [];
  if (isCommandSuccess(taskRunsResult)) {
    taskRuns = nodeRuntimePipelineDiagnosticTaskRuns(parseJsonRecordFromText(taskRunsResult.stdout));
  }
  let pods: Array<Record<string, unknown>> = [];
  if (isCommandSuccess(podsResult)) {
    pods = nodeRuntimePipelineDiagnosticPods(parseJsonRecordFromText(podsResult.stdout));
  }

  const pendingTaskRuns = taskRuns.filter((item) => !(item.status === "True" || item.status === "False"));
  const unscheduledPods = pods.filter((item) => item.scheduled === false);
  const schedulingMessages: string[] = [];
  for (const pod of unscheduledPods) {
    const message = typeof pod.scheduledMessage === "string" ? pod.scheduledMessage : "";
    if (message.length > 0) schedulingMessages.push(message);
  }
  const tooManyPods = schedulingMessages.some((message) => /too many pods/iu.test(message));
  const hasUnscheduledPods = unscheduledPods.length > 0;

  let degradedReason: string | undefined;
  if (tooManyPods) {
    degradedReason = "node-runtime-ci-pod-capacity-exhausted";
  } else if (hasUnscheduledPods) {
    degradedReason = "node-runtime-ci-pod-unschedulable";
  } else if (pendingTaskRuns.length > 0) {
    degradedReason = "node-runtime-ci-taskrun-pending";
  }

  let next: Record<string, unknown> | undefined;
  if (tooManyPods || hasUnscheduledPods) {
    next = { cleanupRuns: `bun scripts/cli.ts hwlab nodes control-plane cleanup-runs --node ${spec.nodeId} --lane ${spec.lane} --min-age-minutes 60 --limit 20 --dry-run` };
  }

  return {
    ok: taskRunsResult.exitCode === 0 && podsResult.exitCode === 0,
    pipelineRun,
    taskRuns,
    pods,
    taskRunCount: taskRuns.length,
    podCount: pods.length,
    pendingTaskRuns,
    unscheduledPods,
    schedulingMessages,
    degradedReason,
    query: {
      taskRuns: compactRuntimeCommand(taskRunsResult),
      pods: compactRuntimeCommand(podsResult),
    },
    next,
  };
}
/**
 * Reduces a `kubectl get taskrun -o json` list to the fields used for diagnostics.
 *
 * The reported status/reason/message come from the condition of type
 * "Succeeded" when present, instead of assuming it is the first entry; this
 * matches how the pod parser selects "PodScheduled" by type. If no such
 * condition exists the first condition is used, as before.
 *
 * @param json Parsed kubectl list object; anything without an `items` array yields [].
 * @returns One summary record per TaskRun with null for missing fields.
 */
function nodeRuntimePipelineDiagnosticTaskRuns(json: Record<string, unknown>): Array<Record<string, unknown>> {
  const items = Array.isArray(json.items) ? json.items.map(record) : [];
  return items.map((item) => {
    const metadata = record(item.metadata);
    const status = record(item.status);
    const conditions = Array.isArray(status.conditions) ? status.conditions.map(record) : [];
    const condition: Record<string, unknown> = conditions.find((entry) => entry.type === "Succeeded")
      ?? conditions[0]
      ?? {};
    return {
      name: metadata.name ?? null,
      status: condition.status ?? null,
      reason: condition.reason ?? null,
      message: condition.message ?? null,
      podName: status.podName ?? null,
    };
  });
}
/**
 * Reduces a `kubectl get pod -o json` list to scheduling-related fields.
 *
 * `scheduled` is true/false from the "PodScheduled" condition and null when
 * the pod reports no such condition.
 */
function nodeRuntimePipelineDiagnosticPods(json: Record<string, unknown>): Array<Record<string, unknown>> {
  if (!Array.isArray(json.items)) return [];
  const summaries: Array<Record<string, unknown>> = [];
  for (const item of json.items.map(record)) {
    const metadata = record(item.metadata);
    const spec = record(item.spec);
    const status = record(item.status);
    const conditions = Array.isArray(status.conditions) ? status.conditions.map(record) : [];
    const podScheduled = conditions.find((condition) => condition.type === "PodScheduled");
    let scheduled: boolean | null = null;
    if (podScheduled !== undefined) {
      scheduled = podScheduled.status === "True";
    }
    summaries.push({
      name: metadata.name ?? null,
      phase: status.phase ?? null,
      nodeName: spec.nodeName ?? null,
      scheduled,
      scheduledReason: podScheduled?.reason ?? null,
      scheduledMessage: podScheduled?.message ?? null,
    });
  }
  return summaries;
}
/**
 * Produces a token of the form `<pid>-<base36 time>-<hex random>`.
 *
 * Any character outside `A-Za-z0-9_.-` is replaced with `-`.
 */
function nodeRuntimeRenderToken(): string {
  const pidPart = String(process.pid);
  const timePart = Date.now().toString(36);
  const randomPart = Math.random().toString(16).slice(2, 10);
  const token = [pidPart, timePart, randomPart].join("-");
  return token.replace(/[^A-Za-z0-9_.-]/gu, "-");
}