From 63030fa2552f3ae9d70867c71d3c2e61d4d6389b Mon Sep 17 00:00:00 2001 From: Codex Date: Thu, 11 Jun 2026 10:34:35 +0000 Subject: [PATCH] fix: make CI install observable --- docs/reference/ci.md | 6 +- docs/reference/cli.md | 2 +- .../ci-install-visibility-contract-test.ts | 88 ++++++ scripts/src/ci.ts | 264 +++++++++++++++--- scripts/src/jobs.ts | 55 +++- 5 files changed, 369 insertions(+), 46 deletions(-) create mode 100644 scripts/ci-install-visibility-contract-test.ts diff --git a/docs/reference/ci.md b/docs/reference/ci.md index 32193d30..02493871 100644 --- a/docs/reference/ci.md +++ b/docs/reference/ci.md @@ -9,7 +9,7 @@ UniDesk CI is hosted on the D601 native k3s cluster with Tekton Pipelines and Te - UniDesk CI namespace: `unidesk-ci`. - Manifests: `src/components/microservices/k3sctl-adapter/k3s/ci/`. - Artifact catalog: root `CI.json`, which is CI artifact catalog only. It describes build inputs, image naming and summary fields; runtime topology, rollout target, ports, namespaces and desired service commits remain in `config.json`, service manifests and `deploy.json`. -- CLI entry: `bun scripts/cli.ts ci install|status|run|publish-backend-core|publish-user-service|run-dev-e2e|logs`. +- CLI entry: `bun scripts/cli.ts ci install|install-status|status|run|publish-backend-core|publish-user-service|run-dev-e2e|logs`. - Dev namespace e2e runner: `bun scripts/cli.ts ci run-dev-e2e`; authoritative runner path, manifest contract and safety boundary are in `docs/reference/dev-ci-runner.md`. - Rust backend-core check/build boundary: CI may run `UNIDESK_D601_RUST_CHECK=1 bun scripts/cli.ts check --full --rust` on D601; the master server must not compile Rust for backend-core iteration. The authoritative dev environment rule is `docs/reference/dev-environment.md`. @@ -27,7 +27,9 @@ Each commit CI run performs: CI/CD bootstrap, repair and upgrade actions are infrastructure operations. 
They are manually tested and may be promoted directly to production when the infrastructure itself is the target; do not add CI jobs whose purpose is to prove that CI/CD can bootstrap or repair itself. -`ci install` also prewarms the D601 k3s containerd runtime with the Tekton entrypoint/workingdir helper images, `oven/bun:1-debian`, `alpine/git:2.45.2` and `unidesk-code-queue:dev`. Missing images are pulled through the node-local provider-gateway WS egress proxy and then imported into native k3s containerd with digests preserved, so PipelineRun pods do not hang on external registry pulls. Sustained pull throughput below 1 MB/s is treated as a provider/main-server network or proxy degradation first, not as a Dockerfile or application failure. +`ci install` is fire-and-forget by default: it creates a `.state/jobs` job and immediately returns `job.id`, stdout/stderr paths and `ci install-status <jobId>`. Use `--wait` only for explicit synchronous debugging. `ci install-status` must show bounded log tails and `ci.install.progress` stages for prewarm, Tekton install, manifest upload, `kubectl apply` and final status, so a stalled install is diagnosable without waiting on a silent foreground command. + +`ci install` also prewarms the D601 k3s containerd runtime with the Tekton entrypoint/workingdir helper images, `oven/bun:1-debian`, `alpine/git:2.45.2` and `unidesk-code-queue:dev`. Missing images are pulled through the node-local provider-gateway WS egress proxy and then imported into native k3s containerd with digests preserved, so PipelineRun pods do not hang on external registry pulls. Sustained pull throughput below 1 MB/s is treated as a provider/main-server network or proxy degradation first, not as a Dockerfile or application failure. 
When Tekton is already installed and only UniDesk CI manifests/triggers need refreshing, use `ci install --skip-prewarm --skip-tekton-install`; manifest apply still reports byte counts and per-manifest `kubectl-apply` progress through `install-status`. Git clone and dependency downloads inside the repo check task use `d601-provider-egress-proxy.unidesk.svc.cluster.local:18789`; the NO_PROXY list keeps the in-cluster read service and D601 TCP egress gateway on the cluster network. diff --git a/docs/reference/cli.md b/docs/reference/cli.md index 27ab1f6a..bde7d125 100644 --- a/docs/reference/cli.md +++ b/docs/reference/cli.md @@ -90,7 +90,7 @@ CI/CD、GitOps、rollout、artifact 发布、PR 合并后的 runtime lane 滚动 - `gh pr merge` 的 already-merged 终态是 guarded merge 的幂等成功例外:当目标 PR 已经处于 `merged` 状态时,命令返回 `ok=true`、`alreadyMerged=true`、`pullRequest.merged=true` 和 merge commit 摘要,不再把并发 monitor、GitHub UI 或人工合并后的 `closed` 状态误报为 validation-failed。 - `gh pr list` 与 `gh issue list` 一样接受单个位置参数 `owner/repo` 作为 `--repo owner/repo` 兼容别名;位置 repo 与显式 `--repo` 冲突时会结构化失败,输出里的 `repo` 始终反映真实请求目标。`--number N --repo owner/repo` 是单 PR/comment 数字目标命令的位置参数兼容别名,适用于 `view/read/files/diff/preflight/closeout/edit/update/comment create/comment delete/close/reopen/merge`,成功输出必须带 `standardSyntaxHint`;comment delete 中的 `--number` 表示 commentId,不是 PR number;`list/create` 不能静默忽略 `--number`。 - PR dry-run/probe 的最小手动序列是:`bun scripts/cli.ts gh auth status --repo pikasTech/unidesk` 只读检查 token 来源、GitHub REST egress、repo 可见性和 issue read;`bun scripts/cli.ts gh pr create --repo pikasTech/unidesk --title --body-stdin --base master --head <head> --dry-run <<'EOF' ... 
EOF` 检查创建计划;`bun scripts/cli.ts gh pr list --repo pikasTech/unidesk --state open --limit 5 --json number,title,state,url,head,base`、`bun scripts/cli.ts gh pr files <number> --repo pikasTech/unidesk --limit 30`、`bun scripts/cli.ts gh pr view <number> --repo pikasTech/unidesk --json body,title,state,stateDetail,closed,closedAt,merged,mergedAt,mergeCommit,head,base,headRefName,baseRefName,mergeable,mergeStateStatus,statusCheckRollup` 和 `bun scripts/cli.ts gh pr preflight <number> --repo pikasTech/unidesk` 做只读 PR 观察、文件统计和收口元数据检查;`bun scripts/cli.ts gh pr edit <number> --repo pikasTech/unidesk --title <title> --body-stdin --dry-run <<'EOF' ... EOF` 检查低噪声 PR 标题/正文编辑计划;`bun scripts/cli.ts gh pr comment create <number> --repo pikasTech/unidesk --body-stdin --dry-run <<'EOF' ... EOF` 检查评论计划;`bun scripts/cli.ts gh pr merge <number> --repo pikasTech/unidesk --dry-run` 检查 guarded merge plan,真实 merge 只能在任务边界明确允许且 preflight ready 后执行。Code Queue runner 可用 `bun scripts/code-queue-pr-preflight-example.ts --repo pikasTech/unidesk --base master --head <head> --comment-pr <number>` 一次性跑只读 auth status 与 PR create/comment dry-run;该脚本不得输出 token 值,也不会创建、评论或 merge PR。 -- `ci install|status|run|publish-backend-core|publish-user-service|run-dev-e2e|logs` 管理 D601 原生 k3s 上的 Tekton CI。`run` 手动创建每 commit 检查和 Code Queue 只读性能门禁;`publish-backend-core` 与 `publish-user-service` 从 pushed Git commit 构建并发布 `127.0.0.1:5000/unidesk/<service>:<commit>` commit-pinned artifacts,输出 `artifactSummary`(含 `serviceId`、`sourceCommit`、`sourceRepo`、`dockerfile`、`imageRef`、`tag`、`digest`、`digestRef`),但不部署生产;`run-dev-e2e` 的 Git 控制 runner、短 launcher、host fetch 边界、临时 smoke namespace 和 no-CD 规则只在 `docs/reference/dev-ci-runner.md` 定义;Tekton CI 通用规则见 `docs/reference/ci.md`。 +- `ci install|install-status|status|run|publish-backend-core|publish-user-service|run-dev-e2e|logs` 管理 D601 原生 k3s 上的 Tekton CI。`install` 默认创建 `.state/jobs` 异步 job 并立即返回,`install-status <jobId|latest>` 读取阶段化 progress 和 bounded log tail;只有现场同步调试才显式加 
`--wait`。`run` 手动创建每 commit 检查和 Code Queue 只读性能门禁;`publish-backend-core` 与 `publish-user-service` 从 pushed Git commit 构建并发布 `127.0.0.1:5000/unidesk/<service>:<commit>` commit-pinned artifacts,输出 `artifactSummary`(含 `serviceId`、`sourceCommit`、`sourceRepo`、`dockerfile`、`imageRef`、`tag`、`digest`、`digestRef`),但不部署生产;`run-dev-e2e` 的 Git 控制 runner、短 launcher、host fetch 边界、临时 smoke namespace 和 no-CD 规则只在 `docs/reference/dev-ci-runner.md` 定义;Tekton CI 通用规则见 `docs/reference/ci.md`。 - `schedule list|get|runs|run|retry-run|delete|upsert-pgdata-backup` 管理 backend-core 定时任务和运行历史。`schedule list`、`schedule get`、`schedule runs --limit N` 和 `schedule runs <scheduleId> --limit N` 是只读观察入口;`schedule run`、`schedule retry-run`、`schedule delete` 和 `schedule upsert-pgdata-backup` 会触发运行或写入配置,生产恢复时必须有明确授权。`schedule runs --limit N` 是全局历史视图,返回 `scope=global` 和 `scheduleId=null`;`schedule runs <scheduleId> --limit N` 是指定 schedule 历史视图,返回 `scope=schedule` 和对应 `scheduleId`。CLI 必须拒绝 `schedule runs 50` 这类纯数字位置参数,并提示使用 `schedule runs --limit 50`,避免把空数组误判成“没有历史 run”。`schedule run <id> --wait-ms N` 触发同一 schedule,并且即使 wait 超时也必须返回 `newRunId` 和 `observeCommand`;`schedule retry-run <failedRunId>` 只接受 failed run,从原 run 反查 `scheduleId` 后重触发同一 schedule,并输出 `originalRunId`、`scheduleId`、`newRunId` 和 `observeCommand`。当 backend-core 目标容器缺失或只观察到 verify-only 容器时,schedule/microservice 命令必须以非零退出并返回 `failureKind=target-stack-not-running`、`runnerDisposition=infra-blocked`、`readOnlyCommands` 和 `authorizationRequiredForRecovery`,不得把 Docker 的 `No such container` 当成成功的空历史。 - `codex deploy <commitId>` 是旧 Code Queue 兼容部署入口,已禁用以防止维护通道直连 D601 部署 Code Queue;当前 dev 自动化只做 `ci run-dev-e2e` smoke,不提供 Code Queue CD,详细规则见 `docs/reference/codex-deploy.md`。 - `agentrun get|describe|events|logs|result|ack|cancel|dispatch|create|apply|steer|send` 是当前指挥官新任务和 AgentRun session 控制入口。UniDesk CLI 是 render-only client:客户端保留 k8s 风格命令解析、human 表格、生命周期摘要、下一步命令、分页、`-o json|yaml` 稳定客户端 schema 和错误展示;AgentRun 服务端只提供稳定 RESTful API、鉴权和业务事实,不承载 UniDesk 
CLI 渲染。日常查看用 `get tasks --queue commander`、`describe task/<taskId>`、`events run/<runId>`、`logs session/<sessionId>`、`result run/<runId> --command <commandId>`;日常写入用 `create task --aipod Artificer --prompt-stdin`、`apply -f -`、`dispatch task/<taskId>`、`steer/send session/<sessionId>`、`ack/cancel task|session/<id>`。兼容 group `queue|runs|commands|runner|sessions|aipod-specs` 也走同一 direct HTTP transport,`--raw` 只披露直连 AgentRun REST envelope。 diff --git a/scripts/ci-install-visibility-contract-test.ts b/scripts/ci-install-visibility-contract-test.ts new file mode 100644 index 00000000..25f445e5 --- /dev/null +++ b/scripts/ci-install-visibility-contract-test.ts @@ -0,0 +1,88 @@ +import { readFileSync } from "node:fs"; +import { ciHelp } from "./src/ci"; + +function assertCondition(condition: unknown, message: string, detail: unknown = {}): void { + if (!condition) throw new Error(`${message}: ${JSON.stringify(detail)}`); +} + +const source = readFileSync("scripts/src/ci.ts", "utf8"); +const jobsSource = readFileSync("scripts/src/jobs.ts", "utf8"); +const help = JSON.stringify(ciHelp()); + +assertCondition( + source.includes("interface CiInstallOptions") + && source.includes("skipPrewarm: boolFlag(args, \"--skip-prewarm\")") + && source.includes("skipTektonInstall: boolFlag(args, \"--skip-tekton-install\")") + && source.includes("wait: boolFlag(args, \"--wait\")") + && source.includes("prewarm: options.skipPrewarm ? \"skipped\" : \"completed\"") + && source.includes("tektonInstall: options.skipTektonInstall ? \"skipped\" : \"completed\""), + "ci install must expose controlled manifest refresh skip modes", +); + +assertCondition( + source.includes("function installAsync(options: CiInstallOptions)") + && source.includes("\"ci_install\"") + && source.includes("ciInstallCommand(options)") + && source.includes("options.wait ? install(config, options) : installAsync(options)") + && source.includes("if (action === \"install-status\") return installStatus(nameArg ?? 
\"latest\")"), + "ci install must default to async job mode with status follow-up", +); + +assertCondition( + source.includes("event: \"ci.install.progress\"") + && source.includes("emitCiInstallProgress(\"apply-manifest\", \"started\"") + && source.includes("emitCiInstallProgress(\"upload-manifest\", \"started\"") + && source.includes("emitCiInstallProgress(\"kubectl-apply\", \"started\"") + && source.includes("emitCiInstallProgress(\"install\", \"failed\"") + && jobsSource.includes("summarizeCiInstallJobProgress") + && jobsSource.includes("parseJsonLineEvents(stderrTail, \"ci.install.progress\")"), + "ci install job status must expose stage progress events", +); + +assertCondition( + source.includes("timeoutMs: Math.min(Math.max(waitMs, 15_000), 45_000)") + && source.includes("maxResponseBytes: 3_000_000, timeoutMs: 15_000") + && source.includes("backend-core-dispatch-timeout"), + "ci dispatch submit must use bounded backend-core fetch timeout", +); + +assertCondition( + source.includes("runSshCommandCapture(config, `${target.providerId}:k3s`, [\"script\"], script)") + && source.includes("base64 -d > \\\"$tmp\\\" <<'UNIDESK_CI_MANIFEST_B64'") + && source.includes("test -s \\\"$tmp\\\"") + && source.includes("manifest_bytes="), + "ci manifest apply must embed YAML through k3s route script with byte visibility", +); + +assertCondition( + source.includes("ci_runtime_image_containerd_root_required=true") + && source.includes("failureLines") + && source.includes("rerun with: bun scripts/cli.ts ci install --skip-prewarm"), + "ci prewarm failure must expose concise root/containerd visibility and recovery", +); + +assertCondition( + help.includes("ci install --skip-prewarm") + && help.includes("ci install --skip-prewarm --skip-tekton-install") + && help.includes("ci install-status latest") + && help.includes("ci install --wait --skip-prewarm --skip-tekton-install") + && help.includes("async-job") + && help.includes("Use --wait only for explicit synchronous debugging") + 
&& help.includes("Use --skip-prewarm only to refresh Tekton/CI manifests") + && help.includes("Use --skip-tekton-install with --skip-prewarm only when Tekton is already installed"), + "ci help must document manifest refresh boundaries", + ciHelp(), +); + +console.log(JSON.stringify({ + ok: true, + checks: [ + "ci install exposes controlled manifest refresh skip modes", + "ci install defaults to async job mode with status follow-up", + "ci install job status exposes stage progress events", + "ci dispatch submit uses bounded backend-core fetch timeout", + "ci manifest apply embeds YAML through k3s route script with byte visibility", + "ci prewarm failure exposes concise root/containerd recovery", + "ci help documents manifest refresh boundaries", + ], +})); diff --git a/scripts/src/ci.ts b/scripts/src/ci.ts index ffa28da9..5167d010 100644 --- a/scripts/src/ci.ts +++ b/scripts/src/ci.ts @@ -6,7 +6,7 @@ import { blockedCatalogArtifactIds, catalogSummary, findCiCatalogArtifact, loadC import { runCommand } from "./command"; import { type UniDeskConfig, repoRoot, rootPath } from "./config"; import { ensureGithubSshIdentityForProvider, gitSshHttpConnectProxySource } from "./deploy-ssh-identity"; -import { startJob } from "./jobs"; +import { jobWithTail, listJobs, readJob, startJob } from "./jobs"; import { coreInternalFetch } from "./microservices"; import { artifactRegistryReadonlyResultFromCommand, @@ -83,6 +83,13 @@ interface CiOptions { target: CiTarget; } +interface CiInstallOptions { + target: CiTarget; + skipPrewarm: boolean; + skipTektonInstall: boolean; + wait: boolean; +} + interface CiTarget { providerId: string; kubeconfig: string; @@ -553,6 +560,16 @@ function coreBody(response: unknown): Record<string, unknown> | null { return asRecord(asRecord(response)?.body); } +function emitCiInstallProgress(stage: string, status: "started" | "skipped" | "succeeded" | "failed", detail: Record<string, unknown> = {}): void { + console.error(JSON.stringify({ + event: 
"ci.install.progress", + at: new Date().toISOString(), + stage, + status, + ...detail, + })); +} + function responseOk(response: unknown): boolean { if (typeof response !== "object" || response === null) return false; const record = response as Record<string, unknown>; @@ -850,6 +867,7 @@ function requireCiScriptPath(value: unknown): string { async function dispatchSsh(command: string, waitMs: number, remoteTimeoutMs: number, pollCompletion = true, target = ciTarget(null)): Promise<DispatchResult> { const dispatchResponse = coreInternalFetch("/api/dispatch", { method: "POST", + timeoutMs: Math.min(Math.max(waitMs, 15_000), 45_000), body: { providerId: target.providerId, command: "host.ssh", @@ -865,14 +883,19 @@ async function dispatchSsh(command: string, waitMs: number, remoteTimeoutMs: num const dispatchBody = coreBody(dispatchResponse); const taskId = asString(dispatchBody?.taskId); if (dispatchBody?.ok !== true || taskId.length === 0) { + const failureKind = asRecord(dispatchResponse)?.timedOut === true ? 
"backend-core-dispatch-timeout" : "backend-core-dispatch-submit-failed"; return { ok: false, taskId: taskId || null, status: null, stdout: "", - stderr: asString(dispatchBody?.error) || "dispatch did not return a task id", + stderr: asString(dispatchBody?.error) || `${failureKind}: dispatch did not return a task id`, exitCode: null, - raw: dispatchResponse, + raw: { + failureKind, + dispatchResponse, + timeoutMs: Math.min(Math.max(waitMs, 15_000), 45_000), + }, }; } if (!pollCompletion) { @@ -890,7 +913,7 @@ async function dispatchSsh(command: string, waitMs: number, remoteTimeoutMs: num const deadline = Date.now() + Math.max(effectiveWaitMs, 1_000); let latest: unknown = null; while (Date.now() < deadline) { - latest = coreInternalFetch(`/api/tasks/${encodeURIComponent(taskId)}`, { maxResponseBytes: 3_000_000 }); + latest = coreInternalFetch(`/api/tasks/${encodeURIComponent(taskId)}`, { maxResponseBytes: 3_000_000, timeoutMs: 15_000 }); const task = asRecord(coreBody(latest)?.task); const status = asString(task?.status); if (status === "succeeded" || status === "failed") { @@ -947,7 +970,7 @@ async function uploadRemoteBase64(path: string, encoded: string, target = ciTarg "chmod 600 \"$target\"", ].join("\n"), 20_000, 10_000, true, target); if (!init.ok) return init; - for (const chunk of chunks(encoded, 950)) { + for (const chunk of chunks(encoded, 6_000)) { const append = await dispatchSsh([ "set -euo pipefail", `target=${shellQuote(path)}`, @@ -958,7 +981,10 @@ async function uploadRemoteBase64(path: string, encoded: string, target = ciTarg return dispatchSsh([ "set -euo pipefail", `target=${shellQuote(path)}`, - "wc -c \"$target\"", + `expected=${shellQuote(String(Buffer.byteLength(encoded)))}`, + "actual=$(wc -c < \"$target\" | tr -d ' ')", + "printf 'uploaded_bytes=%s expected_bytes=%s path=%s\\n' \"$actual\" \"$expected\" \"$target\"", + "test \"$actual\" = \"$expected\"", ].join("\n"), 20_000, 10_000, true, target); } @@ -1028,25 +1054,29 @@ async function 
runRemoteBackground(label: string, script: string, timeoutMs: num }; } -async function remoteApplyManifest(path: string, target = ciTarget(null)): Promise<void> { +async function remoteApplyManifest(config: UniDeskConfig, path: string, target = ciTarget(null)): Promise<void> { const absolute = rootPath(path); if (!existsSync(absolute)) throw new Error(`manifest not found: ${path}`); - const encoded = Buffer.from(readFileSync(absolute, "utf8"), "utf8").toString("base64"); - const token = randomUUID().replace(/-/gu, "").slice(0, 12); - const b64Path = `/tmp/unidesk-ci-apply-${token}.b64`; - const upload = await uploadRemoteBase64(b64Path, encoded, target); - if (!upload.ok) throw new Error(`failed to upload manifest ${path}: ${upload.stderr || upload.stdout}`); + const manifest = readFileSync(absolute, "utf8"); + const encoded = Buffer.from(manifest, "utf8").toString("base64"); + emitCiInstallProgress("upload-manifest", "started", { providerId: target.providerId, manifest: path, bytes: Buffer.byteLength(manifest) }); const script = [ - "set -euo pipefail", + "set -eu", ...ciTargetGuardShellLines(target), "tmp=$(mktemp /tmp/unidesk-ci-apply.XXXXXX.yaml)", - `b64_path=${shellQuote(b64Path)}`, - "trap 'rm -f \"$tmp\" \"$b64_path\"' EXIT", - "base64 -d \"$b64_path\" > \"$tmp\"", + "trap 'rm -f \"$tmp\"' EXIT", + "base64 -d > \"$tmp\" <<'UNIDESK_CI_MANIFEST_B64'", + encoded, + "UNIDESK_CI_MANIFEST_B64", + "test -s \"$tmp\"", + "printf 'manifest_bytes=%s\\n' \"$(wc -c < \"$tmp\" | tr -d ' ')\"", "kubectl apply -f \"$tmp\"", ].join("\n"); - const result = await runRemoteBackground(`apply-${path.split("/").pop() ?? 
"manifest"}`, script, 180_000, target); - if (!result.ok) throw new Error(`kubectl apply failed for ${path}: ${result.stderr || result.stdout}`); + emitCiInstallProgress("kubectl-apply", "started", { providerId: target.providerId, manifest: path }); + const result = await runSshCommandCapture(config, `${target.providerId}:k3s`, ["script"], script); + if (result.exitCode !== 0) throw new Error(`kubectl apply failed for ${path}: ${result.stderr || result.stdout}`); + emitCiInstallProgress("upload-manifest", "succeeded", { providerId: target.providerId, manifest: path, upload: result.stdout.split(/\r?\n/u).find((line) => line.startsWith("manifest_bytes=")) ?? "" }); + emitCiInstallProgress("kubectl-apply", "succeeded", { providerId: target.providerId, manifest: path }); } async function prewarmCiRuntimeImages(target = ciTarget(null)): Promise<void> { @@ -1070,8 +1100,9 @@ async function prewarmCiRuntimeImages(target = ciTarget(null)): Promise<void> { "if ! printf '%s' \"$pause_entrypoint\" | grep -q '\"/pause\"'; then echo native_k3s_pause_image_invalid_entrypoint=$pause_entrypoint >&2; exit 1; fi", "root_exec() {", " if [ \"$(id -u)\" = \"0\" ]; then \"$@\"; return $?; fi", - " if command -v sudo >/dev/null 2>&1; then sudo \"$@\"; return $?; fi", + " if command -v sudo >/dev/null 2>&1 && sudo -n true >/dev/null 2>&1; then sudo \"$@\"; return $?; fi", " if [ -x /mnt/c/Windows/System32/wsl.exe ]; then /mnt/c/Windows/System32/wsl.exe -u root -- \"$@\"; return $?; fi", + " echo ci_runtime_image_containerd_root_required=true >&2", " \"$@\"", "}", "containerd_images=$(root_exec ctr --address /run/k3s/containerd/containerd.sock -n k8s.io images ls 2>/tmp/unidesk-ci-containerd-images.err || true)", @@ -1100,7 +1131,23 @@ async function prewarmCiRuntimeImages(target = ciTarget(null)): Promise<void> { `root_exec ctr --address /run/k3s/containerd/containerd.sock -n k8s.io images ls | grep -F ${shellQuote(`docker.io/library/${target.codeQueueImage}`)} >/dev/null`, ].join("\n"); 
const result = await runRemoteBackground("prewarm-runtime-images", script, 900_000, target); - if (!result.ok) throw new Error(`CI runtime image prewarm failed: ${result.stderr || result.stdout}`); + if (!result.ok) { + const combined = `${result.stdout}\n${result.stderr}`.trim(); + const failureLines = extractCiLogFailureHints(combined).concat( + combined.split(/\r?\n/u).filter((line) => /ci_runtime_image_|containerd_|root_required|sudo/iu.test(line)).map(boundedHintLine).slice(-40), + ); + throw new Error(`CI runtime image prewarm failed: ${JSON.stringify({ + providerId: target.providerId, + exitCode: result.exitCode, + failureLines: Array.from(new Set(failureLines)).slice(-50), + stdoutTail: tailTextLines(result.stdout, 80), + stderrTail: tailTextLines(result.stderr, 80), + recovery: [ + "If Tekton is already installed and only CI manifests need refreshing, rerun with: bun scripts/cli.ts ci install --skip-prewarm", + "If runtime helper images are missing from containerd, restore passwordless/root containerd import on the provider and rerun without --skip-prewarm.", + ], + })}`); + } } async function status(target = ciTarget(null)): Promise<Record<string, unknown>> { @@ -1126,27 +1173,135 @@ async function status(target = ciTarget(null)): Promise<Record<string, unknown>> }; } -async function install(target = ciTarget(null)): Promise<Record<string, unknown>> { - if (!existsSync(rootPath(target.pipelineManifest))) { +async function install(config: UniDeskConfig, options: CiInstallOptions): Promise<Record<string, unknown>> { + if (!existsSync(rootPath(options.target.pipelineManifest))) { throw new Error("CI manifests are missing"); } - await prewarmCiRuntimeImages(target); - const installTektonScript = [ - "set -euo pipefail", - ...ciTargetGuardShellLines(target), - `kubectl apply -f ${shellQuote(tektonPipelineReleaseUrl)}`, - "kubectl wait --for=condition=Available deployment --all -n tekton-pipelines --timeout=900s", - `kubectl apply -f 
${shellQuote(tektonTriggersReleaseUrl)}`, - `kubectl apply -f ${shellQuote(tektonTriggersInterceptorsUrl)}`, - "kubectl wait --for=condition=Available deployment --all -n tekton-pipelines --timeout=900s", - "kubectl wait --for=condition=Available deployment --all -n tekton-pipelines-resolvers --timeout=900s", - ].join("\n"); - const installTekton = await runRemoteBackground("install-tekton", installTektonScript, 1_200_000, target); - if (!installTekton.ok) throw new Error(`Tekton install failed: ${installTekton.stderr || installTekton.stdout}`); - await remoteApplyManifest("src/components/microservices/k3sctl-adapter/k3s/ci/tekton-install.yaml", target); - await remoteApplyManifest(target.pipelineManifest, target); - await remoteApplyManifest("src/components/microservices/k3sctl-adapter/k3s/ci/unidesk-ci.triggers.yaml", target); - return status(target); + emitCiInstallProgress("install", "started", { + providerId: options.target.providerId, + skipPrewarm: options.skipPrewarm, + skipTektonInstall: options.skipTektonInstall, + }); + try { + if (options.skipPrewarm) { + emitCiInstallProgress("prewarm", "skipped", { providerId: options.target.providerId }); + } else { + emitCiInstallProgress("prewarm", "started", { providerId: options.target.providerId }); + await prewarmCiRuntimeImages(options.target); + emitCiInstallProgress("prewarm", "succeeded", { providerId: options.target.providerId }); + } + if (options.skipTektonInstall) { + emitCiInstallProgress("install-tekton", "skipped", { providerId: options.target.providerId }); + } else { + emitCiInstallProgress("install-tekton", "started", { providerId: options.target.providerId }); + const installTektonScript = [ + "set -euo pipefail", + ...ciTargetGuardShellLines(options.target), + `kubectl apply -f ${shellQuote(tektonPipelineReleaseUrl)}`, + "kubectl wait --for=condition=Available deployment --all -n tekton-pipelines --timeout=900s", + `kubectl apply -f ${shellQuote(tektonTriggersReleaseUrl)}`, + `kubectl apply -f 
${shellQuote(tektonTriggersInterceptorsUrl)}`, + "kubectl wait --for=condition=Available deployment --all -n tekton-pipelines --timeout=900s", + "kubectl wait --for=condition=Available deployment --all -n tekton-pipelines-resolvers --timeout=900s", + ].join("\n"); + const installTekton = await runRemoteBackground("install-tekton", installTektonScript, 1_200_000, options.target); + if (!installTekton.ok) throw new Error(`Tekton install failed: ${installTekton.stderr || installTekton.stdout}`); + emitCiInstallProgress("install-tekton", "succeeded", { providerId: options.target.providerId }); + } + for (const manifest of [ + "src/components/microservices/k3sctl-adapter/k3s/ci/tekton-install.yaml", + options.target.pipelineManifest, + "src/components/microservices/k3sctl-adapter/k3s/ci/unidesk-ci.triggers.yaml", + ]) { + emitCiInstallProgress("apply-manifest", "started", { providerId: options.target.providerId, manifest }); + await remoteApplyManifest(config, manifest, options.target); + emitCiInstallProgress("apply-manifest", "succeeded", { providerId: options.target.providerId, manifest }); + } + emitCiInstallProgress("status", "started", { providerId: options.target.providerId }); + const summary = await status(options.target); + emitCiInstallProgress("status", "succeeded", { providerId: options.target.providerId }); + emitCiInstallProgress("install", "succeeded", { providerId: options.target.providerId }); + return { + ...summary, + install: { + providerId: options.target.providerId, + prewarm: options.skipPrewarm ? "skipped" : "completed", + tektonInstall: options.skipTektonInstall ? "skipped" : "completed", + }, + }; + } catch (error) { + emitCiInstallProgress("install", "failed", { + providerId: options.target.providerId, + errorTail: (error instanceof Error ? 
error.message : String(error)).slice(-1200), + }); + throw error; + } +} + +function ciInstallCommand(options: CiInstallOptions): string[] { + return [ + "bun", + "scripts/cli.ts", + "ci", + "install", + "--provider-id", + options.target.providerId, + "--wait", + ...(options.skipPrewarm ? ["--skip-prewarm"] : []), + ...(options.skipTektonInstall ? ["--skip-tekton-install"] : []), + ]; +} + +function installAsync(options: CiInstallOptions): Record<string, unknown> { + const command = ciInstallCommand(options); + const job = startJob( + "ci_install", + command, + `Install/refresh Tekton CI on ${options.target.providerId} native k3s`, + ); + return { + ok: true, + mode: "async", + providerId: options.target.providerId, + skipped: { + prewarm: options.skipPrewarm, + tektonInstall: options.skipTektonInstall, + }, + job: { + id: job.id, + status: job.status, + command: job.command, + stdoutFile: job.stdoutFile, + stderrFile: job.stderrFile, + note: job.note, + }, + next: [ + `bun scripts/cli.ts ci install-status ${job.id}`, + `bun scripts/cli.ts job status ${job.id} --tail-bytes 12000`, + ], + boundary: "ci install is fire-and-forget by default; use --wait only for explicit synchronous debugging.", + }; +} + +function latestCiInstallJobId(): string { + const job = listJobs().find((item) => item.name === "ci_install"); + if (job === undefined) throw new Error("no ci_install job found"); + return job.id; +} + +function installStatus(id: string): Record<string, unknown> { + const jobId = id === "latest" || id.length === 0 ? latestCiInstallJobId() : id; + const job = readJob(jobId); + if (job.name !== "ci_install") { + throw new Error(`job ${jobId} is ${job.name}, not ci_install`); + } + return { + ok: job.status !== "failed" && job.status !== "canceled", + job: jobWithTail(job, 12_000), + next: job.status === "running" || job.status === "queued" + ? 
[`bun scripts/cli.ts ci install-status ${job.id}`] + : ["bun scripts/cli.ts ci status"], + }; } function pipelineRunManifest(options: CiOptions): string { @@ -2893,6 +3048,10 @@ export function ciHelp(): Record<string, unknown> { description: "Manage native k3s Tekton CI on D601 or G14. CI may publish commit-pinned image artifacts, but it intentionally does not deploy CD.", examples: [ "bun scripts/cli.ts ci install", + "bun scripts/cli.ts ci install --skip-prewarm", + "bun scripts/cli.ts ci install --skip-prewarm --skip-tekton-install", + "bun scripts/cli.ts ci install-status latest", + "bun scripts/cli.ts ci install --wait --skip-prewarm --skip-tekton-install", "bun scripts/cli.ts ci install --provider-id G14", "bun scripts/cli.ts ci run --revision <commit>", "bun scripts/cli.ts ci run --provider-id G14 --revision <commit>", @@ -2923,6 +3082,14 @@ export function ciHelp(): Record<string, unknown> { }, }, }, + install: { + defaultMode: "async-job", + waitFlag: "Use --wait only for explicit synchronous debugging; the default returns a .state/jobs job immediately with install-status follow-up.", + statusCommand: "bun scripts/cli.ts ci install-status <jobId|latest>", + prewarmDefault: true, + skipPrewarm: "Use --skip-prewarm only to refresh Tekton/CI manifests when runtime images are already present or prewarm is blocked by provider root/containerd permissions.", + skipTektonInstall: "Use --skip-tekton-install with --skip-prewarm only when Tekton is already installed and only UniDesk CI manifests/triggers need refreshing.", + }, backendCoreArtifact: { producer: "D601 CI", registry: "127.0.0.1:5000/unidesk/backend-core:<commit>", @@ -2973,7 +3140,16 @@ function requireRunId(value: string): string { export async function runCiCommand(config: UniDeskConfig, args: string[]): Promise<Record<string, unknown>> { const [action = "status", nameArg] = args; if (isHelpArg(action) || args.slice(1).some(isHelpArg)) return ciHelp(); - if (action === "install") return 
install(ciTarget(providerIdOption(args))); + if (action === "install") { + const options = { + target: ciTarget(providerIdOption(args)), + skipPrewarm: boolFlag(args, "--skip-prewarm"), + skipTektonInstall: boolFlag(args, "--skip-tekton-install"), + wait: boolFlag(args, "--wait"), + }; + return options.wait ? install(config, options) : installAsync(options); + } + if (action === "install-status") return installStatus(nameArg ?? "latest"); if (action === "status") return status(ciTarget(providerIdOption(args))); if (action === "run") { const target = ciTarget(providerIdOption(args)); @@ -3062,6 +3238,10 @@ export async function runCiCommand(config: UniDeskConfig, args: string[]): Promi } export function startCiInstallJob(providerId = d601ProviderId): Record<string, unknown> { - const job = startJob("ci_install", ["bun", "scripts/cli.ts", "ci", "install", "--provider-id", providerId], `Install/refresh Tekton CI on ${providerId} native k3s`); - return { ok: true, job }; + return installAsync({ + target: ciTarget(providerId), + skipPrewarm: false, + skipTektonInstall: false, + wait: false, + }); } diff --git a/scripts/src/jobs.ts b/scripts/src/jobs.ts index dc17a86a..2d1101f4 100644 --- a/scripts/src/jobs.ts +++ b/scripts/src/jobs.ts @@ -223,12 +223,14 @@ export function jobWithTail(job: JobRecord, maxBytes = 12000): JobRecord & { } function summarizeJobProgress(job: JobRecord, maxBytes = 96_000, tails?: { stdoutTail: string; stderrTail: string }): JobProgressSummary { + const nowMs = Date.now(); const knownWorkflow = job.name === "hwlab_g14_v02_trigger_current"; const v02PrMonitorWorkflow = job.name === "hwlab_g14_v02_pr_monitor"; const runtimeLaneTriggerWorkflow = /^hwlab_nodes_v[0-9]{2}_control-plane_trigger-current$/u.test(job.name); const gitMirrorWorkflow = job.name === "hwlab_g14_git_mirror_sync" || job.name === "hwlab_g14_git_mirror_flush" || job.name === "agentrun_v01_git_mirror_sync" || job.name === "agentrun_v01_git_mirror_flush"; + const ciInstallWorkflow = 
job.name === "ci_install"; + if (ciInstallWorkflow) return summarizeCiInstallJobProgress(job, tails?.stderrTail, nowMs); if (!knownWorkflow && !v02PrMonitorWorkflow && !runtimeLaneTriggerWorkflow && !gitMirrorWorkflow) return genericJobProgress(job, tails?.stderrTail); - const nowMs = Date.now(); const progressTailBytes = Math.max(4096, Math.floor(maxBytes)); const stderrTail = tails?.stderrTail ?? tailFile(job.stderrFile, progressTailBytes); const stdoutTail = tails?.stdoutTail ?? tailFile(job.stdoutFile, progressTailBytes); @@ -357,6 +359,57 @@ function summarizeGitMirrorJobProgress(job: JobRecord, stdoutTail: string, stder }; } +function summarizeCiInstallJobProgress(job: JobRecord, stderrTailOverride?: string, nowMs = Date.now()): JobProgressSummary { + const stderrTail = stderrTailOverride ?? tailFile(job.stderrFile, 96_000); + const events = parseJsonLineEvents(stderrTail, "ci.install.progress") + .sort((left, right) => String(left.at ?? "").localeCompare(String(right.at ?? ""))); + const lastEvent = events.at(-1) ?? {}; + const stage = stringField(lastEvent.stage); + const stageStatus = stringField(lastEvent.status); + const lastEventAt = stringField(lastEvent.at); + const elapsedSeconds = jobElapsedSeconds(job, nowMs); + const stageElapsedSeconds = currentStageElapsedSeconds(events, stage, stageStatus, job, nowMs); + const lastEventAgeSeconds = lastEventAt === null ? null : secondsSince(lastEventAt, job.finishedAt ?? nowMs); + const warnings = jobProgressWarnings({ + job, + eventsObserved: events.length, + elapsedSeconds, + stage, + stageStatus, + stageElapsedSeconds, + lastEventAgeSeconds, + }); + return { + kind: "generic", + stage, + stageStatus, + sourceCommit: null, + pipelineRun: null, + pipelineCreated: null, + elapsedSeconds, + stageElapsedSeconds, + lastEventAt, + lastEventAgeSeconds, + eventsObserved: events.length, + slow: warnings.length > 0, + warnings, + timings: {}, + summary: [ + job.status, + stage ? `${stage}${stageStatus ? 
`:${stageStatus}` : ""}` : "stage:unknown", + typeof lastEvent.providerId === "string" ? `provider=${lastEvent.providerId}` : null, + typeof lastEvent.manifest === "string" ? `manifest=${lastEvent.manifest}` : null, + elapsedSeconds !== null ? `elapsed=${elapsedSeconds}s` : null, + stageElapsedSeconds !== null && job.status === "running" ? `stageElapsed=${stageElapsedSeconds}s` : null, + lastEventAgeSeconds !== null && job.status === "running" ? `lastEventAge=${lastEventAgeSeconds}s` : null, + warnings.length > 0 ? "visibility-warning" : null, + ].filter(Boolean).join(" "), + nextCommand: job.status === "running" + ? `bun scripts/cli.ts ci install-status ${job.id}` + : "bun scripts/cli.ts ci status", + }; +} + function summarizeRuntimeLaneTriggerJobProgress(job: JobRecord, stdoutTail: string, stderrTail: string, nowMs = Date.now()): JobProgressSummary { const events = parseJsonLineEvents(stderrTail, "hwlab.runtime-lane.trigger.progress") .sort((left, right) => String(left.at ?? "").localeCompare(String(right.at ?? "")));