From ed32eb84b2545c5b36a3fa1ed7624f7c59f4dc17 Mon Sep 17 00:00:00 2001 From: Codex Date: Thu, 2 Jul 2026 04:38:46 +0000 Subject: [PATCH] fix: bound node runtime cicd wait closeout --- .agents/skills/unidesk-cicd/SKILL.md | 19 +- .../unidesk-cicd/references/agentrun.md | 63 ++++ .../unidesk-cicd/references/control-plane.md | 75 ++++ .../skills/unidesk-cicd/references/full.md | 324 ----------------- .../unidesk-cicd/references/git-mirror.md | 34 ++ .../unidesk-cicd/references/platform-ops.md | 81 +++++ .../unidesk-cicd/references/pr-monitor.md | 31 ++ docs/reference/cli.md | 2 +- scripts/src/hwlab-node/git-mirror.ts | 331 ++++++++++++++++-- scripts/src/hwlab-node/render.ts | 175 +++++++-- scripts/src/hwlab-node/status.ts | 133 +++++-- scripts/src/hwlab-node/web-probe.ts | 21 +- 12 files changed, 880 insertions(+), 409 deletions(-) create mode 100644 .agents/skills/unidesk-cicd/references/agentrun.md create mode 100644 .agents/skills/unidesk-cicd/references/control-plane.md delete mode 100644 .agents/skills/unidesk-cicd/references/full.md create mode 100644 .agents/skills/unidesk-cicd/references/git-mirror.md create mode 100644 .agents/skills/unidesk-cicd/references/platform-ops.md create mode 100644 .agents/skills/unidesk-cicd/references/pr-monitor.md diff --git a/.agents/skills/unidesk-cicd/SKILL.md b/.agents/skills/unidesk-cicd/SKILL.md index c1a1dbed..61bfe3ef 100644 --- a/.agents/skills/unidesk-cicd/SKILL.md +++ b/.agents/skills/unidesk-cicd/SKILL.md @@ -17,7 +17,13 @@ bun scripts/cli.ts hwlab g14 git-mirror status --lane v02 bun scripts/cli.ts agentrun control-plane status ``` -完整 PR monitor、control-plane、git-mirror、Secret、observability、platform-infra、CI tools image、PipelineRun 清理、rollout 补记和 AgentRun v0.1 部署矩阵见 [references/full.md](references/full.md)。 +按职责读取拆分后的 reference: + +- PR monitor 与自动合并: [references/pr-monitor.md](references/pr-monitor.md)。 +- Tekton/Argo、node-scoped runtime lane、D601 infra bootstrap: 
[references/control-plane.md](references/control-plane.md)。 +- HWLAB/AgentRun git-mirror source authority 与 flush: [references/git-mirror.md](references/git-mirror.md)。 +- Secret、observability、platform-infra、CI tools image、PipelineRun 清理和 rollout 补记: [references/platform-ops.md](references/platform-ops.md)。 +- AgentRun YAML-only lane、v0.1 兼容入口和 AgentRun git-mirror: [references/agentrun.md](references/agentrun.md)。 ## P0 边界 @@ -29,14 +35,15 @@ bun scripts/cli.ts agentrun control-plane status - CI/CD 一键交付的端到端 wall-clock 目标是低于 2 分钟;计时从操作者触发受控命令开始,到 runtime ready 且 `/health` 端点验证完成为止。具体 wait/timeout/budget 字段必须从 YAML/source-of-truth 读取并配置到满足该目标。 - CI/CD validation 阶段只能验证部署对象的 `/health` 端点和必要 provenance;禁止在 CI/CD gate 中运行 web-probe、Playwright、远程浏览器截图、用户路径 E2E 或等价重型业务探针。业务/用户入口验证只能作为发布后的独立 post-deploy validation 证据,不得阻塞 CI/CD 一键交付。 - 任一 CI/CD 阶段或总耗时超过 2 分钟时,不要继续死等或把超长等待视为正常;先输出阶段耗时分解,并优先从 env reuse、git mirror、BuildKit/cache、GitOps/Argo watch 和 runtime readiness 探测方向优化后再继续交付。 -- `trigger-current --wait` 超过 2 分钟后,先用受控 `control-plane status --full` / `git-mirror status` 区分构建、GitOps flush、Argo sync 和 runtime readiness;若 PipelineRun 已成功但 `git-mirror pending=true`,按 CLI 提示走受控 `git-mirror flush --confirm --wait`,再 `control-plane refresh/status` 复查收敛。不要为了推进发布改应用源码或绕过 GitOps/Argo。 +- node-scoped `trigger-current --wait` 必须把 source sync、pre/post flush、PipelineRun、GitOps/Argo、runtime readiness 和 `/health` closeout 放进同一 120s 端到端预算;超预算时由 CLI 输出阶段分解、Argo target revision、runtime/public 状态和 TaskRun/Pod drill-down,不继续死等,也不要求操作者手动串联多个状态/flush 命令才能完成一次交付。 - 触发或验收 rollout 时必须绑定 lane、source commit、PipelineRun/GitOps revision、runtime ready 和 `/health` 端点验证结果;web-probe/Playwright 结果只能作为单独的 post-deploy 证据。 - Secret 只通过 YAML sourceRef/targetKey 和受控 CLI 下发;输出只披露 presence/fingerprint。 - 长命令用异步 job 或短轮询;不要长时间挂住 trans/ssh。 ## 何时读取 reference -- PR 自动合并、v0.2/v0.3 lane 差异:读 [references/full.md](references/full.md) 的 PR 监控段。 -- 手动触发、定点 PipelineRun/source commit、RBAC/Pipeline/Argo:读控制面段。 -- 
git-mirror、Secret、observability、CI tools image、PipelineRun/PV 清理:读对应段。 -- AgentRun v0.1 或 YAML-only lane 部署:读 AgentRun 控制面段。 +- PR 自动合并、v0.2/v0.3 lane 差异:读 [references/pr-monitor.md](references/pr-monitor.md)。 +- 手动触发、定点 PipelineRun/source commit、RBAC/Pipeline/Argo、node-scoped runtime lane:读 [references/control-plane.md](references/control-plane.md)。 +- git-mirror source authority 或 flush:读 [references/git-mirror.md](references/git-mirror.md)。 +- Secret、observability、CI tools image、PipelineRun/PV 清理:读 [references/platform-ops.md](references/platform-ops.md)。 +- AgentRun v0.1 或 YAML-only lane 部署:读 [references/agentrun.md](references/agentrun.md)。 diff --git a/.agents/skills/unidesk-cicd/references/agentrun.md b/.agents/skills/unidesk-cicd/references/agentrun.md new file mode 100644 index 00000000..4e0016b1 --- /dev/null +++ b/.agents/skills/unidesk-cicd/references/agentrun.md @@ -0,0 +1,63 @@ +# AgentRun Control Plane + +AgentRun YAML-only lane 以 `config/agentrun.yaml` 为部署真相;node/lane、source workspace/branch、image build、GitOps branch/path、runtime namespace、Secret、外置数据库、manager env、git-mirror 和 edge 暴露都从 YAML 进入 CLI。 + +## YAML-Only Lane + +```bash +bun scripts/cli.ts agentrun control-plane plan --node D601 --lane v02 +bun scripts/cli.ts agentrun control-plane apply --node D601 --lane v02 [--dry-run|--confirm] +bun scripts/cli.ts agentrun control-plane secret-sync --node D601 --lane v02 [--dry-run|--confirm] +bun scripts/cli.ts agentrun control-plane restart --node D601 --lane v02 [--dry-run|--confirm] +bun scripts/cli.ts agentrun control-plane trigger-current --node D601 --lane v02 [--dry-run|--confirm] +bun scripts/cli.ts agentrun control-plane cleanup-runners --node D601 --lane v02 [--force-active] [--dry-run|--confirm] +bun scripts/cli.ts agentrun control-plane status --node D601 --lane v02 [--pipeline-run <name>|--source-commit <sha>] [--full|--raw] +``` + +- `plan`: 只读解析 YAML,输出控制面、source、image build、GitOps、runtime 和 Secret plan,不打印 Secret value。 +- `apply`: 按 YAML 渲染并 
apply Tekton RBAC/Pipeline、Argo AppProject/Application 和 runtime namespace。 +- `secret-sync`: 按 YAML 的 Secret sourceRef/keyMapping 同步 runtime Secret 和外置 DB Secret,只输出 fingerprint。 +- `restart`: patch manager Deployment restart annotation 并等待 rollout,用于 Secret export/DB 连接串变化后让 workload 读取新 Secret;不要手工删除 Pod。 +- `trigger-current`: v0.2 lane source authority 只读 k8s git-mirror snapshot。confirmed 运行先触发受控 `git-mirror sync`,为 source branch tip 创建 `refs/unidesk/snapshots/agentrun-yaml-lane//`,再从该 snapshot 构建并推送 YAML 声明的 image,渲染 GitOps/artifact catalog,flush git-mirror 并创建 provenance PipelineRun。 +- `cleanup-runners`: 只清 YAML 选中 lane runtime namespace 中匹配 `deployment.runner.retention.selectors` 的 runner Job/Pod;runner 上限、最后活跃排序、active heartbeat 窗口、age-based cleanup 开关和 selector 都以 YAML 为准。 +- `status`: 默认返回 compact commander JSON,关键结论在 `.data.summary` 和 `.data.alignment`;完整 YAML target、原始 source/runtime/gitMirror payload 和 probe tail 只在 `--full|--raw` 展开。 + +YAML-only lane 的长步骤必须由 CLI 拆成短提交和状态轮询:k8s git-mirror snapshot sync、image build、GitOps publish、git-mirror flush 和 PipelineRun 创建不得塞进一个顶层 `trans` 长连接。GitOps publish 必须使用隔离临时 clone/worktree,不能切换或污染 YAML 声明的固定 source workspace。 + +AgentRun YAML-only lane closeout 必须同时看当前 k8s git-mirror source snapshot、目标 PipelineRun、GitOps revision、Argo revision 和 manager source commit。发布过程中如果 source branch 被并行 PR 推进,`status --pipeline-run ` 会通过 `summary.branchDrift` / `alignment.branchDrift` 标记目标 PipelineRun 是否已被当前 snapshot tip supersede。最终只用最新 PipelineRun 的 `status` 中 `aligned=true`、`blockers=[]`、`argoSyncedToGitops=true` 和 `managerSourceMatchesExpected=true` 收口。 + +Runner egress proxy、持久化、idle timeout 和 retention 只从 `config/agentrun.yaml` 的 `deployment.runner.*` 进入部署。验收不能只看 manager Deployment/Pod env;必须用 HWLAB/AgentRun 原入口创建新 turn 或 runner Job,并检查新 runner Job env、session PVC、`AGENTRUN_SOURCE_COMMIT` 和 trace/result 是否复用同一 run 且没有 `reuse-blocked`。 + +Provider credential 的 `config.toml` 变更同样走 YAML `sourceRef`、`secret-sync` 和 
`restart`;lane config 只声明该 lane 需要的 Codex CLI runtime options。不要复制指挥机全局 `~/.codex/config.toml` 作为长期事实。 + +## AgentRun v0.1 Compatibility + +```bash +bun scripts/cli.ts agentrun control-plane status [--dry-run|--confirm] +bun scripts/cli.ts agentrun control-plane trigger-current [--dry-run|--confirm] +bun scripts/cli.ts agentrun control-plane refresh [--dry-run|--confirm] +bun scripts/cli.ts agentrun control-plane cleanup-runs [--min-age-minutes N] [--limit N] [--dry-run|--confirm] +bun scripts/cli.ts agentrun control-plane cleanup-released-pvs [--limit N] [--dry-run|--confirm] +``` + +- `status`: 只读汇总 source commit、PipelineRun、Argo、manager image、git mirror 和 `aligned` 结论。 +- `trigger-current`: 快进 `G14:/root/agentrun-v01`,mirror sync,创建 `agentrun-v01-ci-` PipelineRun。 +- `refresh`: Argo hard refresh,不 patch runtime workload。 +- `cleanup-runs`: 只清理 `agentrun-ci` 中已完成 PipelineRun 和临时 PVC;不清理 runtime runner Job/Pod/Secret。 +- `cleanup-released-pvs`: 回收 Released PV。 + +AgentRun compact JSON 关键字段在 `.data.summary.sourceCommit`、`.data.summary.expectedPipelineRun`、`.data.summary.runtimeAlignment`、`.data.summary.gitMirror`、`.data.summary.ci.pipelineRun`、`.data.summary.argo` 和 `.data.alignment`,不要假设存在 `.data.status`。 + +## AgentRun Git Mirror + +```bash +bun scripts/cli.ts agentrun git-mirror status [--full|--raw] +bun scripts/cli.ts agentrun git-mirror sync [--dry-run|--confirm] [--wait] +bun scripts/cli.ts agentrun git-mirror flush [--dry-run|--confirm] [--wait] +``` + +- `status`: 返回 `localV01`/`githubV01`/`localGitops`/`githubGitops`/`pendingFlush`/`githubInSync`。 +- `sync`: 拉取 GitHub `v0.1` 和 `v0.1-gitops` refs。 +- `flush`: 推送本地 `v0.1-gitops` 到 GitHub。 + +AgentRun mirror 与 HWLAB mirror 共用 `devops-infra` 服务和 cache PVC,但 repo path、refs、status 文件和 CLI 命令彼此独立。 diff --git a/.agents/skills/unidesk-cicd/references/control-plane.md b/.agents/skills/unidesk-cicd/references/control-plane.md new file mode 100644 index 00000000..b6f58841 --- /dev/null +++ 
b/.agents/skills/unidesk-cicd/references/control-plane.md @@ -0,0 +1,75 @@ +# Tekton And Argo Control Plane + +控制面负责 PipelineRun、RBAC/Pipeline/Argo、node-scoped runtime lane 和 D601 节点本地 infra bootstrap。正式入口都必须使用 `bun scripts/cli.ts`,不要把裸 `kubectl`、`argo` 或 `tkn` 当长期操作面。 + +## G14 Control Plane + +```bash +# 最新 head +bun scripts/cli.ts hwlab g14 control-plane status --lane v02 + +# 定点 PipelineRun +bun scripts/cli.ts hwlab g14 control-plane status \ + --lane v02 --pipeline-run hwlab-v02-ci-poll-<short-sha> + +# 定点 source commit +bun scripts/cli.ts hwlab g14 control-plane status \ + --lane v02 --source-commit <sha> + +# 手动触发 +bun scripts/cli.ts hwlab g14 control-plane trigger-current \ + --lane v02|v03 [--dry-run|--confirm] + +# 应用 RBAC/Pipeline/Argo +bun scripts/cli.ts hwlab g14 control-plane apply --lane v02 [--dry-run|--confirm] +``` + +定点 status 输出 `targetValidation.state=passed|superseded`,只检查指定 target 的证据。confirmed trigger 创建 commit-pinned PipelineRun;同名 PipelineRun 存在时默认复用现有状态,不做隐式 delete/create。 + +## Node-Scoped Runtime Lane + +```bash +bun scripts/cli.ts hwlab nodes control-plane status --node <node> --lane v03 [--pipeline-run <name>|--source-commit <sha>] [--full|--raw] +bun scripts/cli.ts hwlab nodes control-plane trigger-current --node <node> --lane v03 --confirm --wait +bun scripts/cli.ts hwlab nodes control-plane sync --node <node> --lane v03 --confirm +bun scripts/cli.ts hwlab nodes control-plane refresh --node <node> --lane v03 --confirm +``` + +`status` 默认返回 compact summary,只保留 source commit、PipelineRun、Argo、runtime readiness、public probe、git mirror 和 next action;完整 expected YAML/render target、kubectl result tail、Secret/sourceRef 详情和 probe 原始结果只在 `--full` 或 `--raw` 下展开。 + +`trigger-current --confirm --wait` 是 node/lane CI/CD 一键入口:按 YAML 解析 source head,执行 k8s git-mirror source snapshot sync 和必要 pre-flush,刷新 control-plane,创建或复用 commit-pinned PipelineRun,等待 PipelineRun 终态,并在终态成功后继续执行 post-flush、GitOps/Argo、runtime readiness 和 public `/health` closeout。 + +120 秒是同一条 CLI 的端到端预算,不是每个子阶段各等一轮。嵌套 git-mirror 
sync/flush 必须按剩余预算裁剪并跳过多次 retry;状态探测也必须按剩余预算收窄。超预算时 CLI 返回 `pending` warning,输出阶段分解、PipelineRun 状态、GitOps flush revision、Argo observed/target revision、runtime/public readiness、pending TaskRun/Pod drill-down 和下一条受控 status 命令。PipelineRun 已成功但 Argo/runtime/public 仍在收敛时,状态原因应落在 Argo/runtime closeout,不再泛化成 CI TaskRun pending。 + +`sync --confirm` 是 Argo runtime 收敛修复入口:先按 YAML 同步本地 postgres bootstrap Secret,再终止卡住的 running Argo operation、删除失败 hook Job,并在 StatefulSet template 已更新但旧 controller-revision pod 因 `ImagePullBackOff` / `ErrImagePull` / `CrashLoopBackOff` 卡住时受控删除该旧 pod。不要手工裸删 pod。 + +## D601 Infra Bootstrap + +```bash +bun scripts/cli.ts hwlab nodes control-plane infra plan --node D601 --lane v03 +bun scripts/cli.ts hwlab nodes control-plane infra status --node D601 --lane v03 +bun scripts/cli.ts hwlab nodes control-plane infra apply --node D601 --lane v03 --dry-run +bun scripts/cli.ts hwlab nodes control-plane infra apply --node D601 --lane v03 --confirm +bun scripts/cli.ts hwlab nodes control-plane infra ci-build-benchmark --node D601 --lane v03 --profile no-mirror-full --confirm +bun scripts/cli.ts hwlab nodes control-plane infra ci-build-benchmark status --node D601 --lane v03 --profile no-mirror-full +bun scripts/cli.ts hwlab nodes control-plane infra ci-build-benchmark logs --node D601 --lane v03 --profile no-mirror-full +``` + +从 `config/hwlab-node-control-plane.yaml` 渲染 D601 HWLAB v03 的节点本地 CI/CD、git-mirror、Tekton、runtime dependency image preload 和 Argo 前置对象。confirmed apply 只做 control-plane bootstrap,不触发 runtime rollout,不创建 PK01 DB,也不修改 Caddy/FRP。D601 Argo CD 安装也必须由 YAML 声明:官方 manifest URL、版本、镜像 rewrite/preload、CRD、期望 workload 和 AppProject/Application 都来自 YAML。 + +`ci-build-benchmark` 是 HWLAB v0.3 k3s CI/CD 全量无缓存构建出网测速入口。profile、独立 catalog path 模板、cache policy、必须输出的 timing 阶段和失败族来自 `config/hwlab-node-control-plane.yaml`;实际 service set、git mirror URL、Pipeline、ServiceAccount、registry prefix 和 base image 仍以 `config/hwlab-node-lanes.yaml` 
为准。`forbidBuildkitCache=true` 时会向 PipelineRun 传 `build-cache-mode=disabled`。通过证据必须包含每个 `build-` TaskRun;PipelineRun 成功但缺少 build task 要按 `cache-hit-forbidden` 处理。 + +## Runtime Base Image + +```bash +bun scripts/cli.ts hwlab nodes control-plane runtime-image status --node G14 --lane v03 +bun scripts/cli.ts hwlab nodes control-plane runtime-image preload --node G14 --lane v03 --confirm +``` + +runtime base image 走 `config/hwlab-node-lanes.yaml`:`baseImageSource` 是公开来源,`baseImage` 是 node-local registry 目标。缺失 base image 时先用 `runtime-image status` 判断 `registryTagPresent`,再用 `preload --confirm` seed;不要手工 `docker tag/push`。 + +## Failure Triage + +PipelineRun 失败或长时间未完成时,先按定点 `control-plane status --pipeline-run ` 和 bounded 只读诊断定位失败 TaskRun/Pod/container。env-reuse service build 常见失败点是 `build-` 的 `step-publish` 日志;先用 `platform-infra sub2api status|validate` 区分共享 proxy 整体故障和单个上游 transient。proxy 健康但单个依赖下载 transient 时,可以受控 `trigger-current --rerun`;重复失败应修 `artifact-publish`/envRecipe 的有限 retry 后重新合并发布。 + +小范围 PR 触发 120s 时必须看 plan artifacts 的 `affectedServices/buildServices/reusedServices`:如果 source diff 很小却出现所有 envreuse 服务都在 `buildServices` 且 `reusedServices=[]`,优先怀疑 current GitOps artifact catalog 没有 hydrate 到 source plan 阶段,而不是继续盲目重跑 PipelineRun。 diff --git a/.agents/skills/unidesk-cicd/references/full.md b/.agents/skills/unidesk-cicd/references/full.md deleted file mode 100644 index 793ab2d2..00000000 --- a/.agents/skills/unidesk-cicd/references/full.md +++ /dev/null @@ -1,324 +0,0 @@ ---- -name: unidesk-cicd -description: UniDesk CI/CD 控制面 — `hwlab g14` 和 `agentrun` 子命令,覆盖 PR 监控自动合并、Tekton/Argo 控制面、git-mirror、Secret、observability、CI tools image、PipelineRun 清理、AgentRun v0.1 部署和 AgentRun YAML-only lane 部署。用户提到 CI/CD、deploy、rollout、PipelineRun、trigger、git-mirror、control-plane、k3s 部署、agentrun 部署、hwlab g14、monitor-prs、trigger-current 时使用。任何需要把代码变更推送部署到 G14 k3s 的操作都必须走本 skill。 ---- - -# UniDesk HWLAB G14 CI/CD CLI - -HWLAB G14 的 PR → CI → CD 控制面和运维入口,统一通过 `bun scripts/cli.ts 
hwlab g14 ...` 管理。 - -**固定入口前缀**: `cd /root/unidesk && bun scripts/cli.ts hwlab g14 ...` - ---- - -## PR 监控与自动合并 - -### G14 主线 - -```bash -bun scripts/cli.ts hwlab g14 monitor-prs \ - [--lane g14|v02] [--once] [--dry-run] \ - [--interval-seconds N] [--max-cycles N] [--timeout-seconds N] -``` - -后台 worker 监控 `pikasTech/HWLAB` 的 open PR → preflight → 自动合并 → 观察 CI/CD 直到 DEV `Synced/Healthy`。成功 rollout 后自动追加指挥简报。状态指针按用途分离(`latest-monitor-job.json` / `latest-once-job.json` 等)。 - -### v0.2 lane - -```bash -bun scripts/cli.ts hwlab g14 monitor-prs --lane v02 [--once] [--dry-run] -``` - -只监控 base=`v0.2` 的 PR。CD 采用 latest-only:旧 PipelineRun 不取消不等待,stale commit 以 superseded/no-op 收口。合并后在原 PR 下追加语义化状态评论(含起止时间、source commit、PipelineRun、targetValidation、git mirror 状态)。 - -### v0.3 lane - -```bash -bun scripts/cli.ts hwlab g14 monitor-prs --lane v03 [--once] [--dry-run] -``` - -只监控 base=`v0.3` 的 PR。ready PR 经 UniDesk `gh pr merge` 合并后触发 runtime lane CD,检查 PipelineRun、Argo、`hwlab-v03` runtime `/health` endpoint 和 Git mirror flush,并对失败 check、冲突、CD failure/timeout 创建或更新 failure issue。CI/CD validation 只允许使用部署对象的 `/health` 端点和必要 provenance;禁止在 CI/CD gate 中运行 web-probe、Playwright、远程浏览器截图或用户路径 E2E。public health probe 必须使用 `config/hwlab-node-lanes.yaml` 选中 node/lane 的 formal public URL;D601 `v0.3` 当前是 `https://hwlab.pikapython.com`,裸 IP、FRP 端口和 legacy `20666/20667` 只作为边缘诊断证据,不能作为 CI/CD 验收口径。 - ---- - -## 控制面(Tekton/Argo) - -### 状态查询 - -```bash -# 最新 head -bun scripts/cli.ts hwlab g14 control-plane status --lane v02 - -# 定点 PipelineRun -bun scripts/cli.ts hwlab g14 control-plane status \ - --lane v02 --pipeline-run hwlab-v02-ci-poll- - -# 定点 source commit -bun scripts/cli.ts hwlab g14 control-plane status \ - --lane v02 --source-commit -``` - -定点 status 输出 `targetValidation.state=passed|superseded`,只检查指定 target 的证据。 - -### 手动触发 - -```bash -bun scripts/cli.ts hwlab g14 control-plane trigger-current \ - --lane v02|v03 [--dry-run|--confirm] -``` - -从 `/root/hwlab-v02-cicd.git` 解析当前 
`origin/v0.2` full SHA,创建 commit-pinned PipelineRun。confirmed trigger 创建异步 job 并立即返回 `job.id`。 - -### 应用 RBAC/Pipeline/Argo - -```bash -bun scripts/cli.ts hwlab g14 control-plane apply --lane v02 [--dry-run|--confirm] -``` - -server-side apply v02 的 Tekton RBAC、Pipeline 和 Argo Application。 - -### D601 节点本地 infra bootstrap - -```bash -bun scripts/cli.ts hwlab nodes control-plane infra plan --node D601 --lane v03 -bun scripts/cli.ts hwlab nodes control-plane infra status --node D601 --lane v03 -bun scripts/cli.ts hwlab nodes control-plane infra apply --node D601 --lane v03 --dry-run -bun scripts/cli.ts hwlab nodes control-plane infra apply --node D601 --lane v03 --confirm -bun scripts/cli.ts hwlab nodes control-plane infra tools-image status --node D601 --lane v03 -bun scripts/cli.ts hwlab nodes control-plane infra tools-image build --node D601 --lane v03 --confirm -bun scripts/cli.ts hwlab nodes control-plane infra runtime-image status --node D601 --lane v03 -bun scripts/cli.ts hwlab nodes control-plane infra runtime-image preload --node D601 --lane v03 --confirm -bun scripts/cli.ts hwlab nodes control-plane infra runtime-image logs --node D601 --lane v03 -bun scripts/cli.ts hwlab nodes control-plane infra argo status --node D601 --lane v03 -bun scripts/cli.ts hwlab nodes control-plane infra argo apply --node D601 --lane v03 --confirm -bun scripts/cli.ts hwlab nodes control-plane infra ci-build-benchmark --node D601 --lane v03 --profile no-mirror-full --confirm -bun scripts/cli.ts hwlab nodes control-plane infra ci-build-benchmark status --node D601 --lane v03 --profile no-mirror-full -bun scripts/cli.ts hwlab nodes control-plane infra ci-build-benchmark logs --node D601 --lane v03 --profile no-mirror-full -bun scripts/cli.ts hwlab nodes control-plane status --node D601 --lane v03 [--pipeline-run |--source-commit ] [--full|--raw] -bun scripts/cli.ts hwlab nodes control-plane trigger-current --node D601 --lane v03 --confirm --wait -bun scripts/cli.ts hwlab nodes 
control-plane sync --node D601 --lane v03 --confirm -``` - -从 `config/hwlab-node-control-plane.yaml` 渲染 D601 HWLAB v03 的节点本地 CI/CD、git-mirror、Tekton、runtime dependency image preload 和 Argo 前置对象。confirmed apply 只做 control-plane bootstrap,不触发 runtime rollout,不创建 PK01 DB,也不修改 Caddy/FRP。node-local registry 镜像只能作为 tools image 或 runtime dependency 的输出 artifact;输入 base/pull image 必须是 YAML 中声明的公开 registry 来源,缺失 output image 时通过 `status.next.blockers` 或 `runtime-image status` 暴露。D601 Argo CD 安装也必须由 YAML 声明:官方 manifest URL、版本、镜像 rewrite/preload、CRD、期望 workload 和 AppProject/Application 都来自 YAML,不能使用手工 kubectl/argo CLI 作为正式安装路径。 - -`ci-build-benchmark` 是 HWLAB v0.3 k3s CI/CD 全量无缓存构建出网测速入口。profile、独立 catalog path 模板、cache policy、必须输出的 timing 阶段和失败族来自 `config/hwlab-node-control-plane.yaml`;实际 service set、git mirror URL、Pipeline、ServiceAccount、registry prefix 和 base image 仍以 `config/hwlab-node-lanes.yaml` 为准。`forbidBuildkitCache=true` 时会向 PipelineRun 传 `build-cache-mode=disabled`。confirmed benchmark 只创建唯一 PipelineRun 并返回 status/logs 轮询命令;通过证据必须包含每个 `build-` TaskRun,PipelineRun 成功但缺少 build task 要按 `cache-hit-forbidden` 处理。 - -`hwlab nodes control-plane status` 默认返回 compact commander summary,只保留 source commit、PipelineRun、Argo、runtime readiness、public probe 和 next action;完整 expected YAML/render target、kubectl result tail、Secret/sourceRef 详情和 probe 原始结果只在 `--full` 或 `--raw` 下展开。 - -`hwlab nodes control-plane sync --confirm` 是 Argo runtime 收敛修复入口:会先按 YAML `runtimeStore.postgres.mode=local-k3s` 同步本地 postgres bootstrap Secret,再终止卡住的 running Argo operation、删除失败 hook Job,并在 StatefulSet template 已更新但旧 controller-revision pod 因 `ImagePullBackOff` / `ErrImagePull` / `CrashLoopBackOff` 卡住时受控删除该旧 pod,让 StatefulSet 按最新 revision 重建。不要手工裸删 pod;需要解除这类死锁时走该入口。 - -`hwlab nodes control-plane trigger-current --node --lane --confirm --wait` 是 node/lane CI/CD 一键入口:按 YAML 解析 source head,执行 git-mirror pre-sync/pre-flush,刷新 control-plane,创建或复用 commit-pinned PipelineRun,等待 PipelineRun 终态,并在终态成功后执行 
post-flush。默认输出必须是低噪声 CICD 表格摘要;完整 JSON 只能通过 `--full` 或 `--raw` 展开。120 秒是严重超时阈值:PipelineRun wait 或 `trigger-current` total elapsed 超过 120 秒时,即使最终 status=ok/completed,也必须输出并在 closeout 中记录 `node-runtime-trigger-over-120s` warning、total elapsed、pipeline wait、git mirror status,并从 env-reuse 和 git-mirror/control-plane path 着手排查;未到终态时 CLI 返回 `pending` warning,不继续长时间阻塞,也不把仍在运行误报为构建失败。小范围 PR 触发 120s 时必须看 `plan-artifacts` 的 `affectedServices/buildServices/reusedServices`:如果 source diff 很小却出现所有 envreuse 服务都在 `buildServices` 且 `reusedServices=[]`,优先怀疑 current GitOps artifact catalog 没有 hydrate 到 source plan 阶段,而不是继续盲目重跑 PipelineRun。 - -Web sentinel `trigger-current --confirm --wait` can exhaust its wait budget while the Tekton publish continues in the background and the top-level summary still says `source-fetch`. Do not immediately rerun or patch the workload. First run `web-probe sentinel control-plane status --node --lane --sentinel --full`: if source, registry and GitOps have advanced to the expected source commit/digest but runtime still points at the previous digest, continue with the controlled `web-probe sentinel control-plane apply --confirm --wait` path and then recheck status. If status shows the expected source object or registry digest is still absent, inspect the reported PipelineRun logs/status drill-down and track it as a CI/CD visibility or publish defect. Closeout must record the source commit, registry digest, GitOps revision, Argo revision and runtime digest separately; a wait timeout alone is not proof that publish failed. 
- -### G14 v0.3 runtime base image - -```bash -bun scripts/cli.ts hwlab nodes control-plane runtime-image status --node G14 --lane v03 -bun scripts/cli.ts hwlab nodes control-plane runtime-image preload --node G14 --lane v03 --confirm -``` - -G14 v0.3 的 Tekton/BuildKit base image 也走 `config/hwlab-node-lanes.yaml`:`baseImageSource` 是公开来源,`baseImage` 是 node-local registry 目标。缺失 base image 时先用 `runtime-image status` 判断 `registryTagPresent`,再用 `preload --confirm` seed;不要手工 `docker tag/push`。`trigger-current` 后若 PipelineRun 已越过 base image 阶段但卡在某个 service build task,按 TaskRun 单独提 issue/修复,不把它并回 base-image preload 问题。长期边界见 `docs/reference/g14.md`。 - -D601/v03 env-reuse service build task 失败时,先看 `build-` TaskRun 的 `step-publish` 日志;Debian apt、npm、Go module 等外部依赖下载通过 lane YAML 注入 egress proxy 后可能出现 502、reset 或超时。先用 `platform-infra sub2api status|validate` 区分 proxy 整体故障和单个上游 transient;proxy 健康但单次下载 transient 时可以受控 `trigger-current --rerun`,重复失败应修 HWLAB `scripts/artifact-publish.mjs` / envRecipe 的有限 retry 后合并发布,不手工 patch pod 或裸删 PipelineRun。若 Pod 内 unset `HTTP_PROXY/HTTPS_PROXY/ALL_PROXY` 后外部 registry/DNS 不可达,说明该 lane 的外部依赖下载依赖 egress proxy;此时 npm/Bun retry 只能降噪,根因仍是 proxy upstream 或 catalog/plan 误触发的过量 build。凡是为证明 proxy 加速 CI/CD 而跑测速,必须同时采集 `platform-infra egress-proxy traffic --target ` 的 proxyserver 侧每客户端速率和窗口累计流量;只贴 PipelineRun 总耗时或 client-side benchmark 不能证明 workload 确实走了 proxy。 - ---- - -## Git Mirror - -```bash -bun scripts/cli.ts hwlab g14 git-mirror status -bun scripts/cli.ts hwlab g14 git-mirror apply [--dry-run|--confirm] -bun scripts/cli.ts hwlab g14 git-mirror sync [--dry-run|--confirm] -bun scripts/cli.ts hwlab g14 git-mirror flush [--dry-run|--confirm] - -# D601 node-local v0.3 lane -bun scripts/cli.ts hwlab nodes git-mirror status --node D601 --lane v03 -bun scripts/cli.ts hwlab nodes git-mirror sync --node D601 --lane v03 --confirm --wait -bun scripts/cli.ts hwlab nodes git-mirror flush --node D601 --lane v03 --confirm --wait -``` - -- `apply`: 渲染并 apply 
`devops-infra/git-mirror.yaml` -- `sync`: 把当前配置声明的 GitHub refs 拉入本地 mirror -- `flush`: 把本地 lane GitOps ref 快进推回 GitHub - -PipelineRun `gitops-promote` 如果报 git mirror 控制面漂移、refs 不一致或 flush/publish 未完成,优先按当前 `devops-infra/git-mirror.yaml` 收敛:先 `git-mirror apply --confirm`,再 `git-mirror sync --confirm --wait`,然后用 `control-plane cleanup-runs --pipeline-run --confirm` 受控清理失败 PipelineRun 后重试。旧 branch/path allowlist gate 已删除,不要恢复旧 hook、直接 `kubectl delete`、手工 patch pod 内 hook 或绕过 `flush`。 - -手动 trigger closeout 不能只看 PipelineRun `Completed`。必须继续查 `control-plane status --pipeline-run ` 和 `git-mirror status`;node-scoped `trigger-current --confirm --wait` 会自动做必要的 mirror pre/post flush,但 closeout 仍要确认最终 `pendingFlush=false`、`githubInSync=true`。如果 lower-level 手工路径或旧 job 留下 `pendingFlush=true`,执行 `git-mirror flush --confirm --wait` 到 `githubInSync=true`。 - -node-scoped lane 可能在本次 PR 合并后又被后续 PR 推进。`control-plane status --pipeline-run ` 是定点观察某个 PipelineRun,但输出里的当前 `sourceHead` / `summary.sourceCommit` 可能已经是最新 branch tip,而不是该 PipelineRun 名称对应的 merge commit。closeout 必须同时记录 PR merge commit、PipelineRun 名称/状态、Argo sync revision、当前 branch tip,并用 `git merge-base --is-ancestor HEAD` 或等价证据说明最新 tip 包含本次 PR;不要只凭当前 source head 判断本次 rollout。 - -`trigger-current --node D601|D518|JD01 --lane v03 --confirm --wait` 的 source selection 必须走 k8s git-mirror source snapshot:confirmed trigger 先执行受控 `git-mirror sync`,sync 在 mirror cache 中为本轮 branch tip 创建不可变 `refs/unidesk/snapshots/hwlab-node-runtime//`,随后 trigger/status/build 只读取该 snapshot ref 作为 authoritative source。旧 `source-render` / `local-git-clone-worktree` / 可变 branch ref 追 branch tip 的问题不得再用固定 worktree fetch/pull 修复;如果 mirror 缺对象或 snapshot ref 缺失,命令应以 `source-snapshot-missing` 或 git-mirror retry exhausted 类故障停止,并给出受控 sync/status 下一步。 - -D601/v03 `git-mirror` 的 GitHub upstream 标准传输固定为 YAML 声明的 SSH:`githubTransport.mode=ssh`,脚本通过 `GIT_SSH` wrapper 访问 `ssh://git@ssh.github.com:443/...`;node-global HTTP proxy 只作为 SSH CONNECT tunnel,不是 GitHub HTTPS 
auth/token transport。若 CLI 输出 `transport=https`、`GITHUB_TOKEN`、`git-mirror-github-token` 或 HTTPS token sourceRef,按 control-plane drift/配置回归处理:先修 `config/hwlab-node-control-plane.yaml` 并执行 `hwlab nodes control-plane apply --node D601 --lane v03 --confirm`,不要改走 HTTPS、不要增加 fallback、不要用 host workspace repair。`sync/flush` 的 retry 只消费 SSH upstream transient,并在耗尽后输出 stopped/exhausted;promotion 后若 node-local `git-mirror status` 显示 `pendingFlush=true`,执行 node-local flush 并等到 `pendingFlush=false`、`githubInSync=true`。 - -D601/node-scoped mirror status 的 `githubGitops` 来自本地 mirror cache 的 `refs/mirror-stage/...`;`status` 输出应通过 `refSources.githubFieldsAreMirrorStageCache=true` 显示这一点。`hwlab nodes git-mirror flush --node D601 --lane v03 --confirm --wait` 如果已经显示 `v0.3-gitops -> v0.3-gitops` 推送成功,但随后因 GitHub SSH `kex_exchange_identification` 或 fetch 确认失败导致命令非零退出,会标记 `partialSuccess=push-succeeded-fetch-failed`。当前 CLI 会自动做一次受控 sync/recheck;恢复后输出 `partialSuccessRecovered=true`、`postPushRecovery` 且整体 `ok=true`,未恢复时才把下一步指向 `hwlab nodes git-mirror sync --node D601 --lane v03 --confirm --wait`。不要连续盲目 flush;先刷新 mirror-stage,再用 status 确认 `localGitops=githubGitops`、`pendingFlush=false`、`githubInSync=true`。 - ---- - -## Secret 管理 - -```bash -# 查看 -bun scripts/cli.ts hwlab g14 secret status --lane v02 \ - --name hwlab-v02-openfga|hwlab-v02-master-server-admin-api-key - -# 确保 -bun scripts/cli.ts hwlab g14 secret ensure --lane v02 \ - --name hwlab-v02-master-server-admin-api-key [--dry-run|--confirm] - -# 删除废弃 Secret -bun scripts/cli.ts hwlab g14 secret delete --lane v02 \ - --name [--dry-run|--confirm] -``` - ---- - -## 运行时迁移 - -```bash -bun scripts/cli.ts hwlab g14 control-plane runtime-migration \ - --lane v02 [--dry-run|--confirm] -``` - -通过 `deployment/hwlab-cloud-api` 容器内 migration CLI 执行。 - ---- - -## Observability - -```bash -bun scripts/cli.ts hwlab g14 observability status|apply|query|targets|boundary|closeout \ - [--lane v02] [--promql ] [--expect-count N] [--expect-value V] 
[--dry-run|--confirm] -``` - -管理 G14 Prometheus 基础设施和 HWLAB v0.2 监控 closeout。 - ---- - -## Platform Infra - -```bash -bun scripts/cli.ts platform-infra sub2api plan|apply|status|validate -bun scripts/cli.ts platform-infra sub2api codex-pool plan|sync|validate|expose|configure-local -bun scripts/cli.ts platform-infra wechat-archive plan|apply|status|validate|pull -bun scripts/cli.ts platform-infra wechat-archive wcf-host-status|collector-plan|collector-apply|collector-status -``` - -- `platform-infra` 是 G14 k3s 上 UniDesk 运维的平台基础设施 namespace;新增平台服务优先进入该 namespace,旧 `devops-infra` 只作为渐进迁移来源。 -- Sub2API 的日常部署、Codex pool、FRP 暴露、master `~/.codex` 配置、验收和排障统一使用 `$unidesk-sub2api`(UniDesk 仓库 `.agents/skills/unidesk-sub2api/SKILL.md`)。 -- WeChat archive 是 platform-infra 的 YAML-first 工作流入口;D601 personal WeChat upstream 必须复用既有 D601 `platform-infra` namespace,`createNamespace=false`,只读 collector 的副本、镜像、WCF host、端口和版本 pin 都以 `config/platform-infra/wechat-archive.yaml` 为准。 -- 如果 WeChatFerry 配套的 PC 微信版本被微信服务端拒绝登录,按上游兼容阻塞处理:把 collector 的 YAML 副本数调为 `0` 并通过 `collector-apply --confirm --wait` 同步,保留 Secret/ConfigMap/PVC 和 Windows 准备态;不要手工 `kubectl scale`、新建 namespace 或采用版本检查绕过工具作为长期路径。 -- UniDesk 仓库 `docs/reference/platform-infra.md` 只保留开发边界、YAML-first 真相和探针口径,不重复日常操作手册。 - ---- - -## CI Tools Image - -```bash -bun scripts/cli.ts hwlab g14 tools-image status -bun scripts/cli.ts hwlab g14 tools-image build \ - --name ci-node-tools --tag \ - [--dockerfile deploy/ci/hwlab-ci-node-tools.Dockerfile] [--dry-run|--confirm] -``` - -在 G14 host 构建并 push 到本地 registry。 - ---- - -## PipelineRun 清理 - -```bash -# 清理已完成 PipelineRun -bun scripts/cli.ts hwlab g14 control-plane cleanup-runs \ - --lane v02|g14|all [--min-age-minutes N] [--limit N] [--dry-run|--confirm] - -# D601/G14 node-scoped runtime lane retention -bun scripts/cli.ts hwlab nodes control-plane cleanup-runs \ - --node D601 --lane v03 [--min-age-minutes N] [--limit N] [--dry-run|--confirm --wait] - -# 补充清理 Released PV -bun scripts/cli.ts 
hwlab g14 control-plane cleanup-released-pvs \ - --lane all [--limit N] [--dry-run|--confirm] -``` - ---- - -## 手动补记 rollout - -```bash -bun scripts/cli.ts hwlab g14 record-rollout --pr --source-commit -``` - -手动补记 CI/CD 耗时、TaskRun 指标和语义化 changelog 到指挥简报。 - ---- - -## AgentRun 控制面 - -YAML-only lane 以 `config/agentrun.yaml` 为部署真相;node/lane、source workspace/branch、image build、GitOps branch/path、runtime namespace、Secret、外置数据库、manager env、git-mirror 和 edge 暴露都从 YAML 进入 CLI。AgentRun service repo 的 `deploy/deploy.json` 不能作为 UniDesk deployment truth;新 lane 不再维护该文件。 - -```bash -bun scripts/cli.ts agentrun control-plane plan --node D601 --lane v02 -bun scripts/cli.ts agentrun control-plane apply --node D601 --lane v02 [--dry-run|--confirm] -bun scripts/cli.ts agentrun control-plane secret-sync --node D601 --lane v02 [--dry-run|--confirm] -bun scripts/cli.ts agentrun control-plane restart --node D601 --lane v02 [--dry-run|--confirm] -bun scripts/cli.ts agentrun control-plane trigger-current --node D601 --lane v02 [--dry-run|--confirm] -bun scripts/cli.ts agentrun control-plane cleanup-runners --node D601 --lane v02 [--force-active] [--dry-run|--confirm] -bun scripts/cli.ts agentrun control-plane status --node D601 --lane v02 [--pipeline-run |--source-commit ] [--full|--raw] -``` - -- `plan`: 只读解析 YAML,输出控制面、source、image build、GitOps、runtime 和 Secret plan,不打印 Secret value -- `apply`: 按 YAML 渲染并 apply Tekton RBAC/Pipeline、Argo AppProject/Application 和 runtime namespace -- `secret-sync`: 按 YAML 的 Secret sourceRef/keyMapping 同步 runtime Secret 和外置 DB Secret,只输出 fingerprint -- `restart`: patch manager Deployment 的 restart annotation 并等待 rollout,用于 Secret export/DB 连接串变化后让 workload 读取新 Secret;不要手工删除 Pod -- `trigger-current`: v0.2 lane 的 source authority 只读 k8s git-mirror snapshot。confirmed 运行先触发受控 `git-mirror sync`,为 source branch tip 创建 `refs/unidesk/snapshots/agentrun-yaml-lane//`,再从该 snapshot 构建并推送 YAML 声明的 image,渲染 GitOps/artifact catalog,flush git-mirror 并创建 provenance 
PipelineRun;dry-run/status 也只展示 snapshot/sourceStageRef,不把 host workspace 当 source。confirmed 运行可返回异步 job,必须用 `job status --tail-bytes 12000` 看 `agentrun-yaml-lane-trigger` progress,再用 `status --pipeline-run ` 轮询收口。v0.1 兼容入口仍可能使用固定 source workspace;v0.2 不恢复也不修复 host worktree。 -- `cleanup-runners`: 只清 YAML 选中 lane runtime namespace 中匹配 `deployment.runner.retention.selectors` 的 runner Job/Pod;runner 上限、最后活跃排序、active heartbeat 窗口、age-based cleanup 开关和 selector 都以 YAML 为准。`dry-run` 必须先看 manager facts、inactive candidates、selected 和 `activeRunRisk`;普通 `--confirm` 只删除 selected inactive runner,不替代 CI PipelineRun 清理。`--force-active` 只用于 operator 明确决定强杀所有匹配 runner pod/job 的资源恢复场景,必须先 dry-run 确认 `criteria.forceActive=true` 和 selection 范围;它会中断 active run/command/session,但仍优先于裸 `kubectl delete pod/job`。 -- `status`: 默认返回 compact commander JSON,关键结论在 `.data.summary` 和 `.data.alignment`,完整 YAML target、原始 source/runtime/gitMirror payload 和成功 probe tail 只在 `--full|--raw` 展开 - -YAML-only lane 的长步骤必须由 CLI 拆成短提交和状态轮询:k8s git-mirror snapshot sync、image build、GitOps publish、git-mirror flush 和 PipelineRun 创建不得塞进一个顶层 `trans` 长连接。GitOps publish 必须使用隔离临时 clone/worktree,不能切换或污染 YAML 声明的固定 source workspace;v0.2 若历史失败发布留下 dirty/detached/GitOps branch 状态,也不得把 host workspace 作为 source 修复入口,只清理已知发布残留并从 git-mirror snapshot 重新触发。后台步骤的 `status` 和 `ok` 要共同判定,`status=succeeded` 但 `ok=false` 是终态失败,不继续轮询到超时。 - -AgentRun YAML-only lane closeout 必须同时看当前 k8s git-mirror source snapshot、目标 PipelineRun、GitOps revision、Argo revision 和 manager source commit。发布过程中如果 source branch 被并行 PR 推进,`status --pipeline-run ` 会通过 `summary.branchDrift` / `alignment.branchDrift` 标记目标 PipelineRun 是否已被当前 snapshot tip supersede;先确认最新 snapshot commit 包含本次修复,再按最新 snapshot 重新 `trigger-current`。最终只用最新 PipelineRun 的 `status` 中 `aligned=true`、`blockers=[]`、`argoSyncedToGitops=true` 和 `managerSourceMatchesExpected=true` 收口。v0.2 `trigger-current` 不再恢复 YAML 声明的固定 source workspace;若看到 `source-worktree-restore-failed`、workspace dirty 或 
`summary.source.workspaceDetached=true` 作为 v0.2 blocker,按 source authority 回归处理,修 CLI/配置让状态来自 git-mirror snapshot。 - -Runner egress proxy 只从 `config/agentrun.yaml` 的 `deployment.runner.egressProxyUrl` 与 `deployment.runner.noProxyExtra` 进入部署;manager Deployment 必须带 `AGENTRUN_RUNNER_EGRESS_PROXY_URL` 与 `AGENTRUN_RUNNER_NO_PROXY_EXTRA`,验收时还要用真实 `create/apply/send` 触发 runner Job,检查 Pod env、event/trace 和 final response。GitOps 已更新但 Argo 仍在旧 revision 时,走 `agentrun control-plane refresh --node --lane --confirm`,不要手工 patch runtime。 - -Runner 持久化、idle timeout 和 runner retention 只从 `config/agentrun.yaml` 的 `deployment.runner.*` / `deployment.runner.retention` 进入部署,不在 HWLAB 仓库放运维 YAML。验收不能只看 manager Deployment/Pod env;必须用 HWLAB/AgentRun 原入口创建新 turn 或 runner Job,并检查新 runner Job env、session PVC、`AGENTRUN_SOURCE_COMMIT` 和 trace/result 是否复用同一 run 且没有 `reuse-blocked`;runner retention closeout 还要用 `cleanup-runners --dry-run` 证明 over-limit selection 不触碰 active runner。 - -Provider credential 的 `config.toml` 变更同样走 YAML `sourceRef`、`secret-sync` 和 `restart`;lane config 只声明该 lane 需要的 Codex CLI runtime options。不要复制指挥机全局 `~/.codex/config.toml` 作为长期事实,也不要在没有同 lane `auth.json` / API key source 验证的情况下覆盖 provider endpoint。 - -### AgentRun v0.1 兼容入口 - -```bash -bun scripts/cli.ts agentrun control-plane status \ - [--dry-run|--confirm] -bun scripts/cli.ts agentrun control-plane trigger-current \ - [--dry-run|--confirm] -bun scripts/cli.ts agentrun control-plane refresh \ - [--dry-run|--confirm] -bun scripts/cli.ts agentrun control-plane cleanup-runs \ - [--min-age-minutes N] [--limit N] [--dry-run|--confirm] -bun scripts/cli.ts agentrun control-plane cleanup-released-pvs \ - [--limit N] [--dry-run|--confirm] -``` - -- `status`: 只读汇总 source commit、PipelineRun、Argo、manager image、git mirror 和 `aligned` 结论 -- `trigger-current`: 快进 `G14:/root/agentrun-v01` → mirror sync → 创建 `agentrun-v01-ci-` PipelineRun -- `refresh`: Argo hard refresh(不 patch runtime workload) -- `cleanup-runs`: 只清理 `agentrun-ci` 
中已完成 PipelineRun + 临时 PVC;不清理 `agentrun-v01` runtime runner Job/Pod/Secret -- `cleanup-released-pvs`: 回收 Released PV - -AgentRun `control-plane status` 的 compact JSON 关键字段在 `.data.summary.sourceCommit`、`.data.summary.expectedPipelineRun`、`.data.summary.runtimeAlignment`、`.data.summary.gitMirror`、`.data.summary.ci.pipelineRun`、`.data.summary.argo` 和 `.data.alignment`,不要假设存在 `.data.status`。触发部署后如果 GitOps 已 promotion 但 git mirror `pendingFlush=true`,先执行 `bun scripts/cli.ts agentrun git-mirror flush --confirm --wait`,再 `control-plane refresh --confirm`,最后用 `control-plane status --full` 证明 `.data.summary.runtimeAlignment.argoSyncedToGitops=true`、`.data.summary.runtimeAlignment.managerSourceMatchesExpected=true` 且 `.data.summary.ci.pipelineRun.status=True`。 - -## AgentRun v0.1 Git Mirror - -```bash -bun scripts/cli.ts agentrun git-mirror status [--full|--raw] -bun scripts/cli.ts agentrun git-mirror sync [--dry-run|--confirm] [--wait] -bun scripts/cli.ts agentrun git-mirror flush [--dry-run|--confirm] [--wait] -``` - -- `status`: 返回 `localV01`/`githubV01`/`localGitops`/`githubGitops`/`pendingFlush`/`githubInSync` -- `sync`: 拉取 GitHub `v0.1` + `v0.1-gitops` refs -- `flush`: 推送本地 `v0.1-gitops` → GitHub - -与 HWLAB v0.2 mirror 共用 `devops-infra` 服务和 cache PVC。 diff --git a/.agents/skills/unidesk-cicd/references/git-mirror.md b/.agents/skills/unidesk-cicd/references/git-mirror.md new file mode 100644 index 00000000..ea83e045 --- /dev/null +++ b/.agents/skills/unidesk-cicd/references/git-mirror.md @@ -0,0 +1,34 @@ +# Git Mirror + +Git mirror 是 HWLAB/AgentRun CI/CD 的 source authority 和 GitOps flush 入口。不要回退到 operator host git、目标 host fixed workspace 或第二套 source resolver。 + +## HWLAB Git Mirror + +```bash +bun scripts/cli.ts hwlab g14 git-mirror status +bun scripts/cli.ts hwlab g14 git-mirror apply [--dry-run|--confirm] +bun scripts/cli.ts hwlab g14 git-mirror sync [--dry-run|--confirm] +bun scripts/cli.ts hwlab g14 git-mirror flush [--dry-run|--confirm] + +bun scripts/cli.ts hwlab 
nodes git-mirror status --node --lane v03 +bun scripts/cli.ts hwlab nodes git-mirror sync --node --lane v03 --confirm --wait +bun scripts/cli.ts hwlab nodes git-mirror flush --node --lane v03 --confirm --wait +``` + +- `apply`: 渲染并 apply `devops-infra/git-mirror.yaml`。 +- `sync`: 把当前配置声明的 GitHub refs 拉入本地 mirror,并为 source branch tip 创建不可变 snapshot ref。 +- `flush`: 把本地 lane GitOps ref 快进推回 GitHub。 + +PipelineRun `gitops-promote` 如果报 git mirror 控制面漂移、refs 不一致或 flush/publish 未完成,优先按当前 `devops-infra/git-mirror.yaml` 收敛:先 `git-mirror apply --confirm`,再 `git-mirror sync --confirm --wait`,然后用 `control-plane cleanup-runs --pipeline-run --confirm` 受控清理失败 PipelineRun 后重试。旧 branch/path allowlist gate 已删除,不要恢复旧 hook、直接 `kubectl delete`、手工 patch pod 内 hook 或绕过 `flush`。 + +手动 trigger closeout 不能只看 PipelineRun `Completed`。必须继续查 `control-plane status --pipeline-run ` 和 `git-mirror status`;node-scoped `trigger-current --confirm --wait` 会自动做必要的 mirror pre/post flush,但 closeout 仍要确认最终 `pendingFlush=false`、`githubInSync=true`。如果 lower-level 手工路径或旧 job 留下 `pendingFlush=true`,执行 `git-mirror flush --confirm --wait` 到 `pendingFlush=false`。 + +`trigger-current --node D601|D518|JD01 --lane v03 --confirm --wait` 的 source selection 必须走 k8s git-mirror source snapshot:confirmed trigger 先执行受控 `git-mirror sync`,sync 在 mirror cache 中为本轮 branch tip 创建不可变 `refs/unidesk/snapshots/hwlab-node-runtime//`,随后 trigger/status/build 只读取该 snapshot ref 作为 authoritative source。旧 `source-render` / `local-git-clone-worktree` / 可变 branch ref 追 branch tip 的问题不得再用固定 worktree fetch/pull 修复。 + +node-scoped lane 可能在本次 PR 合并后又被后续 PR 推进。`control-plane status --pipeline-run ` 是定点观察某个 PipelineRun,但输出里的当前 `sourceHead` / `summary.sourceCommit` 可能已经是最新 branch tip。closeout 必须同时记录 PR merge commit、PipelineRun 名称/状态、Argo sync revision、当前 branch tip,并证明最新 tip 包含本次 PR。 + +## D601 SSH Transport + +D601/v03 `git-mirror` 的 GitHub upstream 标准传输固定为 YAML 声明的 SSH:`githubTransport.mode=ssh`,脚本通过 `GIT_SSH` wrapper 访问 
`ssh://git@ssh.github.com:443/...`;node-global HTTP proxy 只作为 SSH CONNECT tunnel,不是 GitHub HTTPS auth/token transport。若 CLI 输出 `transport=https`、`GITHUB_TOKEN`、`git-mirror-github-token` 或 HTTPS token sourceRef,按 control-plane drift/配置回归处理:先修 YAML 并执行 `hwlab nodes control-plane apply --node D601 --lane v03 --confirm`,不要改走 HTTPS、不要增加 fallback、不要用 host workspace repair。 + +D601/node-scoped mirror status 的 `githubGitops` 来自本地 mirror cache 的 `refs/mirror-stage/...`;`status` 输出应通过 `refSources.githubFieldsAreMirrorStageCache=true` 显示这一点。`flush --confirm --wait` 如果已经显示推送成功,但 post-push fetch/recheck 因 GitHub SSH transient 失败导致非零退出,会标记 `partialSuccess=push-succeeded-fetch-failed`。CLI 应自动做一次受控 sync/recheck;恢复后输出 `partialSuccessRecovered=true`、`postPushRecovery` 且整体 `ok=true`,未恢复时才指向 `sync --confirm --wait`。 diff --git a/.agents/skills/unidesk-cicd/references/platform-ops.md b/.agents/skills/unidesk-cicd/references/platform-ops.md new file mode 100644 index 00000000..b127c0c4 --- /dev/null +++ b/.agents/skills/unidesk-cicd/references/platform-ops.md @@ -0,0 +1,81 @@ +# Platform Ops + +本文件覆盖 Secret、runtime migration、observability、platform infra、CI tools image、PipelineRun 清理和 rollout 补记。 + +## Secret + +```bash +bun scripts/cli.ts hwlab g14 secret status --lane v02 \ + --name hwlab-v02-openfga|hwlab-v02-master-server-admin-api-key + +bun scripts/cli.ts hwlab g14 secret ensure --lane v02 \ + --name hwlab-v02-master-server-admin-api-key [--dry-run|--confirm] + +bun scripts/cli.ts hwlab g14 secret delete --lane v02 \ + --name [--dry-run|--confirm] +``` + +Secret 只通过 YAML sourceRef/targetKey 和受控 CLI 下发;输出只披露对象名、key 名、presence、fingerprint 和摘要,不打印完整凭据。 + +## Runtime Migration + +```bash +bun scripts/cli.ts hwlab g14 control-plane runtime-migration \ + --lane v02 [--dry-run|--confirm] +``` + +通过 runtime 容器内 migration CLI 执行迁移。不要用手工 pod shell 或裸数据库命令作为长期入口。 + +## Observability + +```bash +bun scripts/cli.ts hwlab g14 observability status|apply|query|targets|boundary|closeout \ + [--lane 
v02] [--promql <query>] [--expect-count N] [--expect-value V] [--dry-run|--confirm] +``` + +管理 G14 Prometheus 基础设施和 HWLAB lane 监控 closeout。状态、耗时、失败原因、trace、命令结果或关键证据不可见时,先补 CLI/日志/状态输出再继续。 + +## Platform Infra + +```bash +bun scripts/cli.ts platform-infra sub2api plan|apply|status|validate +bun scripts/cli.ts platform-infra sub2api codex-pool plan|sync|validate|expose|configure-local +bun scripts/cli.ts platform-infra wechat-archive plan|apply|status|validate|pull +bun scripts/cli.ts platform-infra wechat-archive wcf-host-status|collector-plan|collector-apply|collector-status +``` + +`platform-infra` 是 G14 k3s 上 UniDesk 运维的平台基础设施 namespace;新增平台服务优先进入该 namespace,旧 `devops-infra` 只作为渐进迁移来源。Sub2API 日常部署、Codex pool、FRP 暴露、master `~/.codex` 配置、验收和排障统一使用 `$unidesk-sub2api`。WeChat archive 是 platform-infra 的 YAML-first 工作流入口;只读 collector 的副本、镜像、WCF host、端口和版本 pin 都以 YAML 为准。 + +## CI Tools Image + +```bash +bun scripts/cli.ts hwlab g14 tools-image status +bun scripts/cli.ts hwlab g14 tools-image build \ + --name ci-node-tools --tag <tag> \ + [--dockerfile deploy/ci/hwlab-ci-node-tools.Dockerfile] [--dry-run|--confirm] +``` + +tools image 构建只在目标 host 和本地 registry 的受控路径中执行,不把 master server 当通用构建机。 + +## PipelineRun Cleanup + +```bash +bun scripts/cli.ts hwlab g14 control-plane cleanup-runs \ + --lane v02|g14|all [--min-age-minutes N] [--limit N] [--dry-run|--confirm] + +bun scripts/cli.ts hwlab nodes control-plane cleanup-runs \ + --node <node> --lane v03 [--min-age-minutes N] [--limit N] [--dry-run|--confirm --wait] + +bun scripts/cli.ts hwlab g14 control-plane cleanup-released-pvs \ + --lane all [--limit N] [--dry-run|--confirm] +``` + +清理只删除已完成 PipelineRun 及其 Tekton TaskRun/Pod/PVC 链路,不触碰 registry storage、业务 PVC、Secret、runtime workload 或 GitOps desired state。带 `--pipeline-run` 或 `--source-commit` 的定点清理必须直接查询目标对象;不存在返回 `target-pipelinerun-not-found`,未完成返回 `target-pipelinerun-not-terminal`。 + +## Rollout Record + +```bash +bun scripts/cli.ts hwlab g14 record-rollout --pr <number> --source-commit <sha> +``` + 
+用于手动补记 CI/CD 耗时、TaskRun 指标和语义化 changelog 到指挥简报。补记不能替代真实 runtime closeout。 diff --git a/.agents/skills/unidesk-cicd/references/pr-monitor.md b/.agents/skills/unidesk-cicd/references/pr-monitor.md new file mode 100644 index 00000000..478da7a9 --- /dev/null +++ b/.agents/skills/unidesk-cicd/references/pr-monitor.md @@ -0,0 +1,31 @@ +# PR Monitor + +HWLAB PR monitor 统一使用 `bun scripts/cli.ts hwlab g14 monitor-prs ...`,GitHub 写入必须走 UniDesk `gh` 受控入口。 + +## G14 主线 + +```bash +bun scripts/cli.ts hwlab g14 monitor-prs \ + [--lane g14|v02] [--once] [--dry-run] \ + [--interval-seconds N] [--max-cycles N] [--timeout-seconds N] +``` + +后台 worker 监控 `pikasTech/HWLAB` 的 open PR,经 preflight、自动合并、CI/CD 观察到 DEV `Synced/Healthy` 后追加指挥简报。状态指针按用途分离,避免 once/dry-run 覆盖长期 monitor 指针。 + +## v0.2 Lane + +```bash +bun scripts/cli.ts hwlab g14 monitor-prs --lane v02 [--once] [--dry-run] +``` + +只监控 base=`v0.2` 的 PR。CD 采用 latest-only:旧 PipelineRun 不取消、不等待,stale commit 以 superseded/no-op 收口。合并后在原 PR 下追加语义化状态评论,正文必须包含起止时间、source commit、PipelineRun、targetValidation、Argo/webAssets 和 git mirror `pendingFlush/githubInSync`。 + +## v0.3 Lane + +```bash +bun scripts/cli.ts hwlab g14 monitor-prs --lane v03 [--once] [--dry-run] +``` + +只监控 base=`v0.3` 的 PR。ready PR 经 UniDesk `gh pr merge` 合并后触发 runtime lane CD,检查 PipelineRun、Argo、`hwlab-v03` runtime `/health` endpoint 和 Git mirror flush,并对失败 check、冲突、CD failure/timeout 创建或更新 failure issue。 + +CI/CD validation 只允许使用部署对象的 `/health` 端点和必要 provenance;禁止在 CI/CD gate 中运行 web-probe、Playwright、远程浏览器截图或用户路径 E2E。public health probe 必须使用 `config/hwlab-node-lanes.yaml` 选中 node/lane 的 formal public URL;裸 IP、FRP 端口和 legacy 端口只作为边缘诊断证据,不能作为 CI/CD 验收口径。 diff --git a/docs/reference/cli.md b/docs/reference/cli.md index 92d13da9..9e0554b8 100644 --- a/docs/reference/cli.md +++ b/docs/reference/cli.md @@ -34,7 +34,7 @@ G14/D601 v03 的 bootstrap admin password 是 HWLAB runtime Secret 生命周期 `hwlab nodes control-plane infra ci-build-benchmark --node D601 --lane v03 --profile 
--confirm` 是 HWLAB v0.3 k3s CI/CD 全量无缓存构建出网测速入口,profile、cache policy、独立 catalog path 模板、PipelineRun prefix、必须输出的 timing 阶段和失败族都来自 `config/hwlab-node-control-plane.yaml`。confirmed benchmark 只创建一次唯一 PipelineRun,使用 node-lane YAML 中的实际 HWLAB v0.3 service set、git mirror read/write URL、registry prefix、base image 和 Tekton pipeline;`forbidBuildkitCache=true` 时会向 PipelineRun 传 `build-cache-mode=disabled`。status/logs 通过短连接轮询 PipelineRun/TaskRun 摘要和有界日志。成功的 benchmark 必须出现每个 `build-` TaskRun;如果 PipelineRun 成功但缺少任一 service build task,CLI 必须把该 service 报为 `cache-hit-forbidden`,不能把 catalog/env reuse 当作 #1010 这类性能验收的通过证据。 -`hwlab nodes git-mirror status|sync|flush --node --lane ` 是 node-scoped runtime lane 的 Git mirror 维护入口。`status` 的 `githubSource` / `githubGitops` 来自本地 mirror cache 的 `refs/mirror-stage/...`,不是实时 GitHub API;输出中的 `refSources.githubFieldsAreMirrorStageCache=true` 和 `refSources.cacheRefresh` 给出这一来源和刷新命令。`sync --confirm --wait` 的 k8s Job 是 upstream GitHub fetch 的唯一正式入口;遇到 GitHub SSH transient 时按受控 retry/backoff 停止并输出下一步,不回退到 operator host git、目标 host fixed workspace 或第二套 source resolver。`flush --confirm --wait` 如果已经把 GitOps ref push 到 GitHub,但 post-push fetch/recheck 因 transient SSH 失败而无法刷新 mirror-stage,会标记 `partialSuccess=push-succeeded-fetch-failed`;CLI 应自动执行一次受控 sync 刷新 mirror-stage,若恢复后 `pendingFlush=false` 且 `githubInSync=true`,结果应为 `ok=true` 并输出 `partialSuccessRecovered` / `postPushRecovery`,否则才保留 `degradedReason=node-runtime-git-mirror-flush-post-push-fetch-failed` 和下一步 `sync --confirm --wait`。不要把这种 partial success 解读为需要连续盲目 flush。`hwlab nodes control-plane trigger-current --node --lane --confirm --wait` 会先执行 k8s git-mirror source snapshot sync,再从 mirror cache 选择 source commit,随后自动执行必要的 pre-flush,并在 PipelineRun terminal 后自动执行必要的 post-flush;progress 事件必须显式输出 `git-mirror-pre-flush` / `git-mirror-post-flush` 的 executed/skipped、jobName、local/github source、local/github GitOps、`pendingFlush` 和 `githubInSync`,且已恢复的 partial success 不能让顶层 trigger-current 
false-fail。`control-plane status` 仍是只读入口,只读 k8s mirror cache 并暴露 compact `gitMirror` 摘要和下一步 flush 命令,不隐式执行写操作。 +`hwlab nodes git-mirror status|sync|flush --node --lane ` 是 node-scoped runtime lane 的 Git mirror 维护入口。`status` 的 `githubSource` / `githubGitops` 来自本地 mirror cache 的 `refs/mirror-stage/...`,不是实时 GitHub API;输出中的 `refSources.githubFieldsAreMirrorStageCache=true` 和 `refSources.cacheRefresh` 给出这一来源和刷新命令。`sync --confirm --wait` 的 k8s Job 是 upstream GitHub fetch 的唯一正式入口;遇到 GitHub SSH transient 时按受控 retry/backoff 停止并输出下一步,不回退到 operator host git、目标 host fixed workspace 或第二套 source resolver。`flush --confirm --wait` 如果已经把 GitOps ref push 到 GitHub,但 post-push fetch/recheck 因 transient SSH 失败而无法刷新 mirror-stage,会标记 `partialSuccess=push-succeeded-fetch-failed`;CLI 应自动执行一次受控 sync 刷新 mirror-stage,若恢复后 `pendingFlush=false` 且 `githubInSync=true`,结果应为 `ok=true` 并输出 `partialSuccessRecovered` / `postPushRecovery`,否则才保留 `degradedReason=node-runtime-git-mirror-flush-post-push-fetch-failed` 和下一步 `sync --confirm --wait`。不要把这种 partial success 解读为需要连续盲目 flush。`hwlab nodes control-plane trigger-current --node --lane --confirm --wait` 会先执行 k8s git-mirror source snapshot sync,再从 mirror cache 选择 source commit,随后自动执行必要的 pre-flush,并在 PipelineRun terminal 后继续完成 post-flush、Argo refresh/status closeout、runtime readiness 和 YAML public `/health` 探测;同一条 CLI 的端到端预算是 120s,嵌套 git-mirror sync/flush 与 status 探测必须按剩余预算裁剪,不能每阶段各自等待一轮。progress/summary 必须显式输出 `git-mirror-pre-flush` / `git-mirror-post-flush` 的 executed/skipped、jobName、local/github source、local/github GitOps、`pendingFlush`、`githubInSync`、Argo observed/target revision、runtime/public readiness 和 pending TaskRun/Pod drill-down;PipelineRun 已成功但 Argo/runtime/public 仍在收敛时,状态原因应落在 Argo/runtime closeout,不再泛化成 CI TaskRun pending。`control-plane status` 仍是只读入口,只读 k8s mirror cache 并暴露 compact `gitMirror` 摘要和下一步 flush 命令,不隐式执行写操作。 PR 合并后触发 node-scoped runtime lane 时,`control-plane status --pipeline-run ` 是某次 PipelineRun 的定点观察入口,但同一输出中的 `sourceHead` / 
`summary.sourceCommit` 仍可能反映当前分支最新 head;如果触发后又有后续 PR 合并,当前 head 可能已经不是该 PipelineRun 名称中的短 SHA。closeout 证据必须同时写明:PR merge commit、定点 PipelineRun 名称和状态、最终 runtime/GitOps revision、当前 branch tip,以及当前 branch tip 是否包含本次 PR merge commit。不要只凭 `summary.sourceCommit` 反推某个旧 PipelineRun 的源码身份。 diff --git a/scripts/src/hwlab-node/git-mirror.ts b/scripts/src/hwlab-node/git-mirror.ts index 98e9ec01..c7bd53e0 100644 --- a/scripts/src/hwlab-node/git-mirror.ts +++ b/scripts/src/hwlab-node/git-mirror.ts @@ -29,12 +29,12 @@ import { runDelegatedHwlabNodeCommand, type DelegatedNodeDomain } from "../hwlab import type { RenderedCliResult } from "../output"; import { formatElapsedMs, isCommandSuccess, nodeRuntimePipelineRunName, nodeRuntimeRerunPipelineRunName, resolveNodeRuntimeLaneHead, shortValue } from "./cleanup"; -import { nodeRuntimeApply } from "./control-actions"; +import { nodeRuntimeApply, nodeRuntimeRefresh } from "./control-actions"; import { NODE_RUNTIME_CICD_WAIT_WARNING_SECONDS, NODE_RUNTIME_TRIGGER_SEVERE_WARNING_MS } from "./entry"; import { parseNodeScopedDelegatedOptions } from "./plan"; -import { compactNodeRuntimeTaskRunDiagnostic, nodeRuntimePipelineFailureSummary } from "./render"; +import { compactNodeRuntimeTaskRunDiagnostic, nodeRuntimeControlPlaneStatus, nodeRuntimePipelineFailureSummary } from "./render"; import { compactRuntimeCommand } from "./runtime-common"; -import { compactNodeRuntimeGitMirrorObservation, compactNodeRuntimeGitMirrorRun, nodeRuntimeEnsureGitMirrorFlushed, nodeRuntimeEnsureGitMirrorSourceCurrent, nodeRuntimeExternalPostgresSecretRows, nodeRuntimeGitMirrorRun, nodeRuntimeGitMirrorStatus, nodeRuntimeOpportunisticGitMirrorFlush, nodeRuntimeOpportunisticGitMirrorSync, nodeScopedFullOutput } from "./status"; +import { compactNodeRuntimeGitMirrorObservation, compactNodeRuntimeGitMirrorRun, nodeRuntimeEnsureGitMirrorFlushed, nodeRuntimeEnsureGitMirrorSourceCurrent, nodeRuntimeExternalPostgresSecretRows, nodeRuntimeGitMirrorRun, 
nodeRuntimeGitMirrorStatus, nodeRuntimeOpportunisticGitMirrorFlush, nodeRuntimeOpportunisticGitMirrorSync, nodeScopedFullOutput, type NodeRuntimeGitMirrorRunOptions } from "./status"; import { record } from "./utils"; import { webObserveTable } from "./web-observe-render"; import { createNodeRuntimePipelineRun, getNodeRuntimePipelineRun, nodeRuntimePipelineRunManifest, printNodeRuntimeTriggerProgress, waitForNodeRuntimePipelineRunTerminal } from "./web-probe"; @@ -48,20 +48,25 @@ export function nodeRuntimeTriggerCurrentOutput(scoped: ReturnType): Record { const spec = scoped.spec; - const pipelineWaitSeconds = nodeRuntimeCicdWaitSeconds(scoped); const triggerStartedAt = Date.now(); const triggerElapsedMs = () => Date.now() - triggerStartedAt; + const triggerDeadlineMs = triggerStartedAt + (nodeRuntimeCicdWaitSeconds(scoped) * 1000); + const remainingTriggerSeconds = () => Math.max(0, Math.floor((triggerDeadlineMs - Date.now()) / 1000)); + const triggerGitMirrorOptions = nodeRuntimeTriggerGitMirrorOptions(triggerDeadlineMs); + const sourceSyncScoped = nodeRuntimeScopedForTriggerDeadline(scoped, triggerDeadlineMs); const sourceSnapshotSync = scoped.dryRun ? null - : nodeRuntimeGitMirrorRun({ - ...scoped, + : sourceSyncScoped === null + ? nodeRuntimeTriggerBudgetExhausted(scoped, "source-snapshot-sync", "-", "-") + : nodeRuntimeGitMirrorRun({ + ...sourceSyncScoped, domain: "git-mirror", action: "sync", confirm: true, dryRun: false, wait: true, discardStaleGitops: scoped.discardStaleGitops === true || scoped.rerun === true, - }); + }, triggerGitMirrorOptions); if (sourceSnapshotSync !== null && sourceSnapshotSync.ok !== true) { return { ok: false, @@ -116,20 +121,24 @@ export function nodeRuntimeTriggerCurrent(scoped: ReturnType nodeRuntimeOpportunisticGitMirrorSync(scoped, sourceCommit, pipelineRun), - opportunisticPostFlush: () => nodeRuntimeOpportunisticGitMirrorFlush(scoped, sourceCommit, pipelineRun), + ? 
waitForNodeRuntimePipelineRunTerminal(spec, pipelineRun, remainingTriggerSeconds(), { + opportunisticPostSync: () => nodeRuntimeOpportunisticGitMirrorSync(scoped, sourceCommit, pipelineRun, triggerGitMirrorOptions), + opportunisticPostFlush: () => nodeRuntimeOpportunisticGitMirrorFlush(scoped, sourceCommit, pipelineRun, triggerGitMirrorOptions), }) : { ok: true, status: "already-succeeded", pipelineRun: before, polls: 0, elapsedMs: 0 }; const waitedPipelineRun = record(pipelineWait.pipelineRun); const pipelineFailureSummary = nodeRuntimePipelineFailureSummary(pipelineWait); const postFlush = waitedPipelineRun.status === "True" - ? nodeRuntimeEnsureGitMirrorFlushed(scoped, "post", sourceCommit, pipelineRun) + ? nodeRuntimeEnsureGitMirrorFlushed(scoped, "post", sourceCommit, pipelineRun, null, triggerGitMirrorOptions) + : null; + const closeout = waitedPipelineRun.status === "True" && (postFlush === null || postFlush.ok === true) + ? waitForNodeRuntimeCicdCloseout(scoped, pipelineRun, sourceCommit, triggerDeadlineMs, { allowRefresh: true }) : null; const pipelinePending = pipelineWait.status === "pending"; - const ok = pipelineWait.ok === true && (postFlush === null || postFlush.ok === true); + const closeoutPending = closeout?.status === "pending"; + const ok = pipelineWait.ok === true && (postFlush === null || postFlush.ok === true) && closeout?.ok === true; const elapsedMs = triggerElapsedMs(); - const triggerWarning = pipelinePending ? null : nodeRuntimeTriggerElapsedWarning(spec, pipelineRun, elapsedMs); + const triggerWarning = pipelinePending || closeoutPending ? 
null : nodeRuntimeTriggerElapsedWarning(spec, pipelineRun, elapsedMs); if (triggerWarning !== null) printNodeRuntimeTriggerProgress(spec, { stage: "trigger-current", status: "warning", ...triggerWarning }); return { ok, @@ -137,15 +146,21 @@ export function nodeRuntimeTriggerCurrent(scoped: ReturnType nodeRuntimeOpportunisticGitMirrorSync(scoped, sourceCommit, pipelineRun), - opportunisticPostFlush: () => nodeRuntimeOpportunisticGitMirrorFlush(scoped, sourceCommit, pipelineRun), + ? waitForNodeRuntimePipelineRunTerminal(spec, pipelineRun, remainingTriggerSeconds(), { + opportunisticPostSync: () => nodeRuntimeOpportunisticGitMirrorSync(scoped, sourceCommit, pipelineRun, triggerGitMirrorOptions), + opportunisticPostFlush: () => nodeRuntimeOpportunisticGitMirrorFlush(scoped, sourceCommit, pipelineRun, triggerGitMirrorOptions), }) : null; const waitedPipelineRun = record(pipelineWait?.pipelineRun); const pipelineFailureSummary = nodeRuntimePipelineFailureSummary(pipelineWait); const postFlush = waitedPipelineRun.status === "True" - ? nodeRuntimeEnsureGitMirrorFlushed(scoped, "post", sourceCommit, pipelineRun) + ? nodeRuntimeEnsureGitMirrorFlushed(scoped, "post", sourceCommit, pipelineRun, null, triggerGitMirrorOptions) + : null; + const closeout = waitedPipelineRun.status === "True" && (postFlush === null || postFlush.ok === true) + ? waitForNodeRuntimeCicdCloseout(scoped, pipelineRun, sourceCommit, triggerDeadlineMs, { allowRefresh: true }) : null; const pipelineReady = pipelineWait !== null && pipelineWait.ok === true; const postFlushOk = postFlush === null || postFlush.ok === true; const pipelinePending = pipelineWait !== null && pipelineWait.status === "pending"; - const ok = createOk && (pipelineReady || pipelinePending) && postFlushOk; + const closeoutPending = closeout?.status === "pending"; + const ok = createOk && pipelineReady && postFlushOk && closeout?.ok === true; const elapsedMs = triggerElapsedMs(); - const triggerWarning = pipelinePending ? 
null : nodeRuntimeTriggerElapsedWarning(spec, pipelineRun, elapsedMs); + const triggerWarning = pipelinePending || closeoutPending ? null : nodeRuntimeTriggerElapsedWarning(spec, pipelineRun, elapsedMs); if (triggerWarning !== null) printNodeRuntimeTriggerProgress(spec, { stage: "trigger-current", status: "warning", ...triggerWarning }); return { ok, @@ -241,9 +271,14 @@ export function nodeRuntimeTriggerCurrent(scoped: ReturnType, + deadlineMs: number, + minRemainingSeconds = 1, +): ReturnType | null { + const remainingSeconds = Math.floor((deadlineMs - Date.now()) / 1000); + if (remainingSeconds < minRemainingSeconds) return null; + return { ...scoped, timeoutSeconds: Math.max(1, Math.min(scoped.timeoutSeconds, remainingSeconds)) }; +} + +function nodeRuntimeTriggerBudgetExhausted( + scoped: ReturnType, + phase: string, + sourceCommit: string, + pipelineRun: string, +): Record { + return { + ok: false, + command: `hwlab nodes control-plane trigger-current --node ${scoped.node} --lane ${scoped.lane}`, + node: scoped.node, + lane: scoped.lane, + phase, + sourceCommit, + pipelineRun, + status: "pending", + degradedReason: "node-runtime-cicd-budget-exhausted", + next: { status: `bun scripts/cli.ts hwlab nodes control-plane status --node ${scoped.node} --lane ${scoped.lane} --pipeline-run ${pipelineRun} --full` }, + }; +} + +export function waitForNodeRuntimeCicdCloseout( + scoped: ReturnType, + pipelineRun: string, + sourceCommit: string, + deadlineMs: number, + options: { allowRefresh?: boolean } = {}, +): Record { + const spec = scoped.spec; + const startedAt = Date.now(); + let polls = 0; + let lastSummary: Record | null = null; + let refresh: Record | null = null; + printNodeRuntimeTriggerProgress(spec, { stage: "runtime-closeout", status: "started", pipelineRun, sourceCommit, remainingSeconds: Math.max(0, Math.floor((deadlineMs - Date.now()) / 1000)) }); + while (Date.now() <= deadlineMs) { + const statusScoped = nodeRuntimeScopedForTriggerDeadline(scoped, 
deadlineMs, 5); + if (statusScoped === null) break; + polls += 1; + const status = nodeRuntimeControlPlaneStatus({ + ...statusScoped, + action: "status", + confirm: false, + dryRun: true, + wait: false, + originalArgs: ["status", "--node", scoped.node, "--lane", scoped.lane, "--source-commit", sourceCommit, "--pipeline-run", pipelineRun], + }); + const summary = record(record(status).summary ?? status); + lastSummary = summary; + const pipeline = record(summary.pipelineRun); + const argo = record(summary.argo); + const runtime = record(summary.runtime); + const publicProbe = record(summary.publicProbe); + const gitMirror = record(summary.gitMirror); + const degradedReason = typeof summary.degradedReason === "string" ? summary.degradedReason : null; + printNodeRuntimeTriggerProgress(spec, { + stage: "runtime-closeout", + status: "poll", + pipelineRun, + sourceCommit, + polls, + elapsedMs: Date.now() - startedAt, + degradedReason, + pipelineStatus: pipeline.status ?? null, + argoSync: argo.syncStatus ?? null, + argoHealth: argo.health ?? null, + argoRevision: argo.syncRevision ?? null, + argoTargetRevision: argo.targetGitopsRevision ?? null, + runtimeReady: runtime.ready === true, + publicReady: publicProbe.ready === true, + gitMirrorPending: gitMirror.pendingFlush ?? null, + gitMirrorInSync: gitMirror.githubInSync ?? 
null, + }); + if (summary.ok === true) { + printNodeRuntimeTriggerProgress(spec, { stage: "runtime-closeout", status: "succeeded", pipelineRun, sourceCommit, polls, elapsedMs: Date.now() - startedAt }); + return { + ok: true, + status: "succeeded", + pipelineRun, + sourceCommit, + polls, + elapsedMs: Date.now() - startedAt, + summary, + refresh, + }; + } + const refreshScoped = nodeRuntimeScopedForTriggerDeadline(scoped, deadlineMs, 5); + if (options.allowRefresh === true && refresh === null && refreshScoped !== null && (degradedReason === "argo-revision-not-observed" || degradedReason === "argo-target-revision-progressing" || degradedReason === "argo-health-progressing")) { + refresh = nodeRuntimeRefresh({ ...refreshScoped, action: "refresh", confirm: true, dryRun: false, wait: true }); + printNodeRuntimeTriggerProgress(spec, { + stage: "runtime-closeout-refresh", + status: refresh.ok === true ? "succeeded" : "failed", + pipelineRun, + sourceCommit, + degradedReason: refresh.degradedReason ?? 
null, + }); + } + if (degradedReason !== null && !nodeRuntimeCloseoutRetryableReason(degradedReason)) { + printNodeRuntimeTriggerProgress(spec, { stage: "runtime-closeout", status: "failed", pipelineRun, sourceCommit, polls, elapsedMs: Date.now() - startedAt, degradedReason }); + return { + ok: false, + status: "failed", + pipelineRun, + sourceCommit, + polls, + elapsedMs: Date.now() - startedAt, + summary, + refresh, + degradedReason, + next: { status: `bun scripts/cli.ts hwlab nodes control-plane status --node ${scoped.node} --lane ${scoped.lane} --pipeline-run ${pipelineRun} --full` }, + }; + } + const remainingMs = deadlineMs - Date.now(); + if (remainingMs <= 0) break; + sleepCloseout(Math.min(5_000, Math.max(1000, remainingMs))); + } + const elapsedMs = Date.now() - startedAt; + printNodeRuntimeTriggerProgress(spec, { + stage: "runtime-closeout", + status: "warning", + pipelineRun, + sourceCommit, + polls, + elapsedMs, + waitLimitSeconds: NODE_RUNTIME_CICD_WAIT_WARNING_SECONDS, + degradedReason: lastSummary?.degradedReason ?? 
null, + }); + return { + ok: false, + status: "pending", + pipelineRun, + sourceCommit, + polls, + elapsedMs, + summary: lastSummary, + refresh, + waitLimitSeconds: NODE_RUNTIME_CICD_WAIT_WARNING_SECONDS, + degradedReason: "node-runtime-cicd-closeout-pending", + next: { status: `bun scripts/cli.ts hwlab nodes control-plane status --node ${scoped.node} --lane ${scoped.lane} --pipeline-run ${pipelineRun} --full` }, + }; +} + +export function nodeRuntimeCloseoutRetryableReason(reason: string): boolean { + return reason === "argo-revision-not-observed" + || reason === "argo-target-revision-progressing" + || reason === "argo-health-progressing" + || reason === "runtime-workloads-not-ready" + || reason === "public-probe-not-ready" + || reason === "git-mirror-pending-flush"; +} + +function sleepCloseout(ms: number): void { + const buffer = new SharedArrayBuffer(4); + Atomics.wait(new Int32Array(buffer), 0, 0, Math.max(0, ms)); +} + export function nodeRuntimeCicdWaitWarning(spec: HwlabRuntimeLaneSpec, pipelineRun: string, pipelineWait: unknown): Record { const wait = record(pipelineWait); return { @@ -287,6 +497,21 @@ export function nodeRuntimeCicdWaitWarning(spec: HwlabRuntimeLaneSpec, pipelineR }; } +export function nodeRuntimeCloseoutWaitWarning(spec: HwlabRuntimeLaneSpec, pipelineRun: string, closeout: unknown): Record { + const wait = record(closeout); + const summary = record(wait.summary); + return { + code: "node-runtime-cicd-closeout-over-120s", + message: `PipelineRun ${pipelineRun} reached terminal state, but GitOps/Argo/runtime/public closeout did not converge inside the ${NODE_RUNTIME_CICD_WAIT_WARNING_SECONDS}s one-command CICD budget.`, + waitedSeconds: NODE_RUNTIME_CICD_WAIT_WARNING_SECONDS, + elapsedMs: wait.elapsedMs ?? null, + polls: wait.polls ?? null, + degradedReason: summary.degradedReason ?? wait.degradedReason ?? 
null, + inspectCloseout: `bun scripts/cli.ts hwlab nodes control-plane status --node ${spec.nodeId} --lane ${spec.lane} --pipeline-run ${pipelineRun} --full`, + inspectGitMirror: `bun scripts/cli.ts hwlab nodes git-mirror status --node ${spec.nodeId} --lane ${spec.lane}`, + }; +} + export function nodeRuntimeTriggerElapsedWarning(spec: HwlabRuntimeLaneSpec, pipelineRun: string, elapsedMs: number): Record | null { if (elapsedMs < NODE_RUNTIME_TRIGGER_SEVERE_WARNING_MS) return null; return { @@ -304,6 +529,12 @@ export function withNodeRuntimeTriggerRendered(result: Record, const pipelineRunRecord = record(pipelineWait.pipelineRun ?? result.after ?? result.before); const warning = record(result.warning ?? pipelineWait.warning); const postFlush = record(result.postFlush); + const closeout = record(result.closeout); + const closeoutSummary = record(closeout.summary); + const closeoutArgo = record(closeoutSummary.argo); + const closeoutRuntime = record(closeoutSummary.runtime); + const closeoutPublic = record(closeoutSummary.publicProbe); + const closeoutGitMirror = record(closeoutSummary.gitMirror); const gitMirror = record(result.gitMirror); const gitMirrorSummary = record(postFlush.afterSummary ?? postFlush.beforeSummary ?? gitMirror.statusSummary ?? gitMirror.afterSummary ?? gitMirror.beforeSummary ?? gitMirror.summary); const refresh = record(result.refresh); @@ -311,12 +542,13 @@ export function withNodeRuntimeTriggerRendered(result: Record, const next = record(result.next); const pipelineStatus = pipelineRunRecord.status ?? pipelineWait.status ?? "-"; const completion = result.completion ?? (result.ok === true ? "completed" : "failed"); + const topStatus = result.ok === true ? "ok" : completion === "pending" ? "pending" : "failed"; const renderedText = [ "hwlab nodes control-plane trigger-current", "", webObserveTable( ["NODE", "LANE", "STATUS", "COMPLETION", "SOURCE", "PIPELINERUN"], - [[scoped.node, scoped.lane, result.ok === true ? 
"ok" : "failed", webObserveText(completion), shortValue(result.sourceCommit), webObserveText(result.pipelineRun)]], + [[scoped.node, scoped.lane, topStatus, webObserveText(completion), shortValue(result.sourceCommit), webObserveText(result.pipelineRun)]], ), "", webObserveTable( @@ -327,6 +559,7 @@ export function withNodeRuntimeTriggerRendered(result: Record, ["create", create.exitCode === 0 || result.mutation === true ? "ok" : create.exitCode === undefined ? "-" : "failed", result.createObservedAfterTimeout === true ? "observed-after-timeout" : `exit=${webObserveText(create.exitCode)}`], ["pipeline-wait", webObserveText(pipelineWait.status), `pipeline=${webObserveText(pipelineStatus)} polls=${webObserveText(pipelineWait.polls)} elapsed=${formatElapsedMs(pipelineWait.elapsedMs)}`], ["post-flush", postFlush.ok === true ? "ok" : postFlush.ok === false ? "failed" : "-", webObserveText(postFlush.mode ?? postFlush.degradedReason)], + ["closeout", closeout.ok === true ? "ok" : closeout.status === "pending" ? "pending" : closeout.ok === false ? "failed" : "-", `reason=${webObserveText(closeoutSummary.degradedReason ?? closeout.degradedReason)} polls=${webObserveText(closeout.polls)} elapsed=${formatElapsedMs(closeout.elapsedMs)}`], ["total", result.triggerElapsedMs === undefined ? "-" : "elapsed", formatElapsedMs(result.triggerElapsedMs)], ], ), @@ -343,11 +576,24 @@ export function withNodeRuntimeTriggerRendered(result: Record, : webObserveTable( ["CHECK", "COMMAND"], [ - ["env-reuse", webObserveText(warning.inspectEnvReuse ?? `bun scripts/cli.ts hwlab nodes control-plane status --node ${scoped.node} --lane ${scoped.lane} --pipeline-run ${result.pipelineRun} --full`)], + ["closeout", webObserveText(warning.inspectCloseout ?? warning.inspectEnvReuse ?? `bun scripts/cli.ts hwlab nodes control-plane status --node ${scoped.node} --lane ${scoped.lane} --pipeline-run ${result.pipelineRun} --full`)], ["git-mirror", webObserveText(warning.inspectGitMirror ?? 
`bun scripts/cli.ts hwlab nodes git-mirror status --node ${scoped.node} --lane ${scoped.lane}`)], ], ), "", + Object.keys(closeoutSummary).length === 0 + ? "CLOSEOUT\n-" + : webObserveTable( + ["ARGO", "REV", "RUNTIME", "PUBLIC", "GIT_MIRROR"], + [[ + `${webObserveText(closeoutArgo.syncStatus)}/${webObserveText(closeoutArgo.health)}`, + `${shortValue(closeoutArgo.syncRevision)}->${shortValue(closeoutArgo.targetGitopsRevision)}`, + `${webObserveText(closeoutRuntime.ready)} workloads=${webObserveText(closeoutRuntime.workloadReady)}`, + webObserveText(closeoutPublic.ready), + `pending=${webObserveText(closeoutGitMirror.pendingFlush)} inSync=${webObserveText(closeoutGitMirror.githubInSync)}`, + ]], + ), + "", Object.keys(gitMirrorSummary).length === 0 ? "GIT_MIRROR\n-" : webObserveTable( @@ -428,6 +674,9 @@ export function withNodeRuntimeControlPlaneStatusRendered(result: Record 0 ? [`failed=${webObserveShort(failedTaskRuns.join(","), 80)}`] : []), @@ -453,12 +702,18 @@ export function withNodeRuntimeControlPlaneStatusRendered(result: Record [ @@ -531,12 +790,18 @@ export function withNodeRuntimeControlPlaneStatusFullRendered(result: Record): Record { const spec = scoped.spec; + const probeTimeoutSeconds = Math.max(1, Math.min(60, scoped.timeoutSeconds)); const sourceCommitOverride = optionValue(scoped.originalArgs, "--source-commit"); const pipelineRunOverride = optionValue(scoped.originalArgs, "--pipeline-run"); const head = sourceCommitOverride === undefined ? resolveNodeRuntimeLaneHead(spec) : null; const sourceCommit = sourceCommitOverride ?? head?.sourceCommit ?? null; const pipelineRun = pipelineRunOverride ?? (sourceCommit === null ? 
null : nodeRuntimePipelineRunName(spec, sourceCommit)); - const namespace = runNodeK3sArgs(spec, ["kubectl", "get", "ns", spec.runtimeNamespace, "-o", "name"], 60); + const namespace = runNodeK3sArgs(spec, ["kubectl", "get", "ns", spec.runtimeNamespace, "-o", "name"], probeTimeoutSeconds); const namespaceExists = namespace.exitCode === 0; const postgresObjects = namespaceExists - ? runNodeK3sArgs(spec, ["kubectl", "-n", spec.runtimeNamespace, "get", "statefulset,svc,pvc", "-o", "name"], 60) + ? runNodeK3sArgs(spec, ["kubectl", "-n", spec.runtimeNamespace, "get", "statefulset,svc,pvc", "-o", "name"], probeTimeoutSeconds) : null; const localPostgresObjects = postgresObjects === null ? [] : postgresObjects.stdout.split(/\r?\n/u).map((line) => line.trim()).filter((line) => isLocalPostgresObject(line, spec)); - const serviceAccount = runNodeK3sArgs(spec, ["kubectl", "-n", "hwlab-ci", "get", "serviceaccount", spec.serviceAccountName, "-o", "name"], 60); - const pipeline = runNodeK3sArgs(spec, ["kubectl", "-n", "hwlab-ci", "get", "pipeline", spec.pipeline, "-o", "name"], 60); - const argo = runNodeK3sArgs(spec, ["kubectl", "-n", "argocd", "get", "application", spec.app, "-o", "jsonpath={.spec.source.repoURL}{\"\\n\"}{.spec.source.targetRevision}{\"\\n\"}{.spec.source.path}{\"\\n\"}{.status.sync.revision}{\"\\n\"}{.status.sync.status}{\"\\n\"}{.status.health.status}{\"\\n\"}"], 60); + const serviceAccount = runNodeK3sArgs(spec, ["kubectl", "-n", "hwlab-ci", "get", "serviceaccount", spec.serviceAccountName, "-o", "name"], probeTimeoutSeconds); + const pipeline = runNodeK3sArgs(spec, ["kubectl", "-n", "hwlab-ci", "get", "pipeline", spec.pipeline, "-o", "name"], probeTimeoutSeconds); + const argo = runNodeK3sArgs(spec, ["kubectl", "-n", "argocd", "get", "application", spec.app, "-o", 
"jsonpath={.spec.source.repoURL}{\"\\n\"}{.spec.source.targetRevision}{\"\\n\"}{.spec.source.path}{\"\\n\"}{.status.sync.revision}{\"\\n\"}{.status.sync.status}{\"\\n\"}{.status.health.status}{\"\\n\"}"], probeTimeoutSeconds); const [repoURL = "", targetRevision = "", path = "", syncRevision = "", syncStatus = "", health = ""] = argo.stdout.split(/\r?\n/u); const pipelineRunProbe = pipelineRun === null ? null : getNodeRuntimePipelineRun(spec, pipelineRun); const pipelineRunDiagnostics = pipelineRun !== null && pipelineRunProbe?.exists === true && pipelineRunProbe?.status !== "True" ? nodeRuntimePipelineRunDiagnostics(spec, pipelineRun) : null; const workloads = namespaceExists - ? runNodeK3sArgs(spec, ["kubectl", "-n", spec.runtimeNamespace, "get", "deploy,statefulset,svc,ingress,configmap", "-l", `hwlab.pikastech.local/gitops-target=${spec.lane}`, "-o", "name"], 60) + ? runNodeK3sArgs(spec, ["kubectl", "-n", spec.runtimeNamespace, "get", "deploy,statefulset,svc,ingress,configmap", "-l", `hwlab.pikastech.local/gitops-target=${spec.lane}`, "-o", "name"], probeTimeoutSeconds) : null; const workloadNames = workloads === null ? [] : workloads.stdout.split(/\r?\n/u).map((line) => line.trim()).filter(Boolean); const workloadReadinessProbe = namespaceExists - ? runNodeK3sArgs(spec, ["kubectl", "-n", spec.runtimeNamespace, "get", "deploy,statefulset", "-l", `hwlab.pikastech.local/gitops-target=${spec.lane}`, "-o", "jsonpath={range .items[*]}{.kind}{\"/\"}{.metadata.name}{\"\\t\"}{.status.readyReplicas}{\"/\"}{.status.replicas}{\"/\"}{.spec.replicas}{\"\\n\"}{end}"], 60) + ? 
runNodeK3sArgs(spec, ["kubectl", "-n", spec.runtimeNamespace, "get", "deploy,statefulset", "-l", `hwlab.pikastech.local/gitops-target=${spec.lane}`, "-o", "jsonpath={range .items[*]}{.kind}{\"/\"}{.metadata.name}{\"\\t\"}{.status.readyReplicas}{\"/\"}{.status.replicas}{\"/\"}{.spec.replicas}{\"\\n\"}{end}"], probeTimeoutSeconds) : null; const workloadReadiness = parseNodeRuntimeWorkloadReadiness(workloadReadinessProbe?.stdout ?? ""); const bridge = externalPostgresBridgeStatus(spec, namespaceExists); @@ -228,7 +229,9 @@ export function nodeRuntimeControlPlaneStatus(scoped: ReturnType): string | null { + return typeof gitMirrorCompact.localGitops === "string" && /^[0-9a-f]{40}$/iu.test(gitMirrorCompact.localGitops) + ? gitMirrorCompact.localGitops + : typeof gitMirrorCompact.githubGitops === "string" && /^[0-9a-f]{40}$/iu.test(gitMirrorCompact.githubGitops) + ? gitMirrorCompact.githubGitops + : null; +} + +export function nodeRuntimeArgoDegradedReason(input: { + argoCommandOk: boolean; + repoURL: string; + expectedRepoURL: string; + targetRevision: string; + expectedTargetRevision: string; + path: string; + expectedPath: string; + syncRevision: string; + syncStatus: string; + health: string; + targetGitopsRevision: string | null; + runtimeReady: boolean; + publicReady: boolean; +}): string | null { + if (!input.argoCommandOk) return "argo-application-not-readable"; + if (input.repoURL !== input.expectedRepoURL || input.targetRevision !== input.expectedTargetRevision || input.path !== input.expectedPath) { + return "argo-application-spec-drift"; + } + const argoAtTarget = input.targetGitopsRevision !== null && input.syncRevision === input.targetGitopsRevision; + if (argoAtTarget && input.syncStatus === "Synced" && input.health !== "Healthy") return "argo-health-progressing"; + if (argoAtTarget && input.syncStatus !== "Synced" && input.runtimeReady && input.publicReady) return "argo-health-progressing"; + if (argoAtTarget) return "argo-target-revision-progressing"; + 
if (input.targetGitopsRevision !== null) return "argo-revision-not-observed"; + return "argo-not-synced-healthy"; +} + +export function nodeRuntimeStatusDegradedReason(input: { + controlPlaneReady: boolean; + pipelineRunReady: boolean; + pipelineRunDegradedReason: string; + gitMirrorReady: boolean; + gitMirrorDegradedReason: string; + argoReady: boolean; + argoDegradedReason: string | null; + runtimeReady: boolean; + runtimeDegradedReason: string; + publicReady: boolean; +}): string | undefined { + if (!input.controlPlaneReady) return "control-plane-not-ready"; + if (!input.pipelineRunReady) return input.pipelineRunDegradedReason; + if (!input.gitMirrorReady) return input.gitMirrorDegradedReason; + if (!input.argoReady) return input.argoDegradedReason ?? "argo-not-synced-healthy"; + if (!input.runtimeReady) return input.runtimeDegradedReason; + if (!input.publicReady) return "public-probe-not-ready"; + return undefined; +} + export function nodeRuntimePublicProbeStatus(spec: HwlabRuntimeLaneSpec): Record { const web = publicHttpProbe("web", spec.publicWebUrl); const apiHealth = publicHttpProbe("apiHealth", joinUrlPath(spec.publicApiUrl, "/health/live")); @@ -479,6 +557,42 @@ export function compactNodeRuntimeTaskRunDiagnostic(value: unknown): string { return [left, reason ? `(${webObserveShort(reason, 36)})` : ""].filter(Boolean).join(""); } +export function nodeRuntimePipelinePendingTaskRunSummaries( + spec: HwlabRuntimeLaneSpec, + pendingTaskRuns: Array>, + pods: Array>, +): Array> { + return pendingTaskRuns.slice(0, 16).map((taskRun) => { + const taskRunName = stringOrNull(taskRun.name); + const podName = stringOrNull(taskRun.podName); + const pod = pods.find((item) => item.name === podName || (taskRunName !== null && item.taskRun === taskRunName)) ?? {}; + const containers = Array.isArray(pod.containers) ? pod.containers.map(record) : []; + const initContainers = Array.isArray(pod.initContainers) ? 
pod.initContainers.map(record) : []; + const waitingContainers = [...initContainers, ...containers].filter((container) => container.state === "waiting"); + const runningContainers = [...initContainers, ...containers].filter((container) => container.state === "running"); + return { + name: taskRunName, + taskRun: taskRunName, + pipelineTask: taskRun.pipelineTask ?? null, + taskRef: taskRun.taskRef ?? null, + status: taskRun.status ?? null, + reason: taskRun.reason ?? null, + message: diagnosticText(taskRun.message), + pod: podName, + podPhase: pod.phase ?? null, + scheduled: pod.scheduled ?? null, + scheduledReason: pod.scheduledReason ?? null, + scheduledMessage: diagnosticText(pod.scheduledMessage), + waitingContainers, + runningContainers, + taskRunCommand: taskRunName === null ? null : nodeRuntimeK3sCommand(spec, ["get", "taskrun", "-n", HWLAB_CI_NAMESPACE, taskRunName, "-o", "yaml"]), + taskRunDescribeCommand: taskRunName === null ? null : nodeRuntimeK3sCommand(spec, ["describe", "taskrun", "-n", HWLAB_CI_NAMESPACE, taskRunName]), + podDescribeCommand: podName === null ? null : nodeRuntimeK3sCommand(spec, ["describe", "pod", "-n", HWLAB_CI_NAMESPACE, podName]), + podLogsCommand: podName === null ? 
null : nodeRuntimePipelineLogsCommand(spec, podName, null), + }; + }); +} + export function summarizeNodeRuntimeControlPlaneStatus(status: Record, scoped: ReturnType): Record { const pipelineRun = record(status.pipelineRun); const pipelineRunDiagnostics = record(status.pipelineRunDiagnostics); @@ -531,6 +645,8 @@ export function summarizeNodeRuntimeControlPlaneStatus(status: Record, sco if (reason === "argo-not-synced-healthy") { return `bun scripts/cli.ts hwlab nodes control-plane refresh --node ${scoped.node} --lane ${scoped.lane} --confirm`; } + if (reason === "argo-revision-not-observed" || reason === "argo-target-revision-progressing" || reason === "argo-health-progressing") { + return `${nodeRuntimeStatusCommand(scoped)} --full`; + } if (reason === "pipelinerun-not-succeeded") { return `bun scripts/cli.ts hwlab nodes control-plane trigger-current --node ${scoped.node} --lane ${scoped.lane} --confirm`; } + if (reason === "node-runtime-ci-taskrun-pending") { + const next = record(record(status.pipelineRunDiagnostics).next); + const pendingTaskRun = typeof next.pendingTaskRun === "string" ? next.pendingTaskRun : null; + return pendingTaskRun ?? 
`${nodeRuntimeStatusCommand(scoped)} --full`; + } if (reason === "node-runtime-ci-step-publish-failed") { return `bun scripts/cli.ts platform-infra sub2api status --target ${scoped.node}`; } @@ -659,6 +783,7 @@ export function nodeRuntimePipelineRunDiagnostics(spec: HwlabRuntimeLaneSpec, pi const pendingTaskRuns = taskRuns.filter((item) => item.status !== "True" && item.status !== "False"); const failedTaskRuns = taskRuns.filter((item) => item.status === "False"); const failedTaskRunSummaries = nodeRuntimePipelineFailedTaskRunSummaries(spec, failedTaskRuns, pods); + const pendingTaskRunSummaries = nodeRuntimePipelinePendingTaskRunSummaries(spec, pendingTaskRuns, pods); const stepPublishFailures = failedTaskRunSummaries.filter((item) => item.container === "step-publish" || item.step === "publish" || item.step === "step-publish"); const unscheduledPods = pods.filter((item) => item.scheduled === false); const schedulingMessages = unscheduledPods @@ -690,7 +815,8 @@ export function nodeRuntimePipelineRunDiagnostics(spec: HwlabRuntimeLaneSpec, pi failedTaskRuns: failedTaskRunSummaries, stepPublishFailures, failureSummary, - pendingTaskRuns, + pendingTaskRuns: pendingTaskRunSummaries, + pendingTaskRunCount: pendingTaskRunSummaries.length, unscheduledPods, schedulingMessages, degradedReason: tooManyPods @@ -723,7 +849,14 @@ export function nodeRuntimePipelineRunDiagnostics(spec: HwlabRuntimeLaneSpec, pi failedTaskRun: failedTaskRunSummaries[0]?.taskRunCommand ?? null, status: `bun scripts/cli.ts hwlab nodes control-plane status --node ${spec.nodeId} --lane ${spec.lane} --pipeline-run ${pipelineRun} --full`, } - : undefined, + : pendingTaskRunSummaries.length > 0 + ? { + pendingTaskRun: pendingTaskRunSummaries[0]?.taskRunDescribeCommand ?? pendingTaskRunSummaries[0]?.taskRunCommand ?? null, + pendingPod: pendingTaskRunSummaries[0]?.podDescribeCommand ?? null, + pendingPodLogs: pendingTaskRunSummaries[0]?.podLogsCommand ?? 
null, + status: `bun scripts/cli.ts hwlab nodes control-plane status --node ${spec.nodeId} --lane ${spec.lane} --pipeline-run ${pipelineRun} --full`, + } + : undefined, }; } diff --git a/scripts/src/hwlab-node/status.ts b/scripts/src/hwlab-node/status.ts index 920a0993..7328a2f4 100644 --- a/scripts/src/hwlab-node/status.ts +++ b/scripts/src/hwlab-node/status.ts @@ -38,6 +38,12 @@ import { webObserveTable } from "./web-observe-render"; import { nodeRuntimeGitMirrorTarget, printNodeRuntimeTriggerProgress, sleepSync } from "./web-probe"; import { webObserveShort, webObserveText } from "./web-probe-observe"; +type NodeScopedDelegatedOptions = ReturnType; +export type NodeRuntimeGitMirrorRunOptions = { + maxAttempts?: number; + deadlineMs?: number; +}; + export function nodeRuntimeExternalPostgresSecretRows(secrets: Record): unknown[][] { if (secrets.required === false) return [["not-required", "-", "-", "-", "false"]]; if (secrets.required === true && secrets.ready !== true && secrets.degradedReason !== undefined) { @@ -240,12 +246,12 @@ export function nodeRuntimeGitMirrorStatus(scoped: ReturnType): Record { +export function nodeRuntimeGitMirrorRun(scoped: NodeScopedDelegatedOptions, options: NodeRuntimeGitMirrorRunOptions = {}): Record { if (scoped.action !== "sync" && scoped.action !== "flush") return nodeRuntimeUnsupportedAction(scoped); if (!scoped.confirm && !scoped.dryRun) throw new Error(`git-mirror ${scoped.action} requires --dry-run or --confirm`); const spec = scoped.spec; const mirror = nodeRuntimeGitMirrorTarget(spec); - const retryMaxAttempts = !scoped.dryRun && (scoped.action === "sync" || scoped.action === "flush") ? 5 : 1; + const retryMaxAttempts = options.maxAttempts ?? (!scoped.dryRun && (scoped.action === "sync" || scoped.action === "flush") ? 
5 : 1); const attempts: Record[] = []; let finalAttempt: { attempt: number; @@ -259,13 +265,15 @@ export function nodeRuntimeGitMirrorRun(scoped: ReturnType= options.deadlineMs) break; printNodeRuntimeTriggerProgress(spec, { stage: `git-mirror-${scoped.action}-retry`, status: "waiting", @@ -328,11 +337,14 @@ export function nodeRuntimeGitMirrorRun(scoped: ReturnType= retryMaxAttempts; const stopped = !actionSucceeded && (retryExhausted || retryableFailure?.stopped === true || retryableFailure === null); return { @@ -348,6 +360,7 @@ export function nodeRuntimeGitMirrorRun(scoped: ReturnType 1 ? { policy: "exponential-backoff", maxAttempts: retryMaxAttempts, @@ -370,6 +383,27 @@ export function nodeRuntimeGitMirrorRun(scoped: ReturnType { + return { + ok: false, + command: `hwlab nodes git-mirror ${scoped.action} --node ${scoped.node} --lane ${scoped.lane}`, + node: scoped.node, + lane: scoped.lane, + mode: `confirmed-${scoped.action}`, + mutation: false, + status: "pending", + degradedReason: "node-runtime-cicd-budget-exhausted", + next: { status: `bun scripts/cli.ts hwlab nodes git-mirror status --node ${scoped.node} --lane ${scoped.lane}` }, + }; +} + export function nodeRuntimeGitMirrorRetryableFailure( scoped: ReturnType, mirror: NodeRuntimeGitMirrorTargetSpec, @@ -480,9 +514,20 @@ export function nodeRuntimeGitMirrorHostWriteUrl(spec: HwlabRuntimeLaneSpec, mir return `http://${value}/${mirror.sourceRepository}.git`; } -export function nodeRuntimeEnsureGitMirrorSourceCurrent(scoped: ReturnType, sourceCommit: string, pipelineRun: string): Record { +export function nodeRuntimeEnsureGitMirrorSourceCurrent(scoped: NodeScopedDelegatedOptions, sourceCommit: string, pipelineRun: string, options: NodeRuntimeGitMirrorRunOptions = {}): Record { const full = nodeScopedFullOutput(scoped); - const before = nodeRuntimeGitMirrorStatus({ ...scoped, action: "status", dryRun: true, confirm: false }); + const beforeScoped = nodeRuntimeScopedWithinDeadline(scoped, 
options.deadlineMs); + if (beforeScoped === null) { + return { + ok: false, + mode: "budget-exhausted-before-status", + sourceCommit, + status: "pending", + degradedReason: "node-runtime-cicd-budget-exhausted", + next: { status: `bun scripts/cli.ts hwlab nodes git-mirror status --node ${scoped.node} --lane ${scoped.lane}` }, + }; + } + const before = nodeRuntimeGitMirrorStatus({ ...beforeScoped, action: "status", dryRun: true, confirm: false }); const beforeSummary = compactNodeRuntimeGitMirrorStatus(before); const beforeGithubTransport = record(before.githubTransport); if (beforeGithubTransport.required === true && beforeGithubTransport.ready === false) { @@ -501,7 +546,7 @@ export function nodeRuntimeEnsureGitMirrorSourceCurrent(scoped: ReturnType 0 ? compactNodeRuntimeGitMirrorStatus(after) : {}; const sourceOk = sync.ok === true && afterSummary.localSource === sourceCommit && afterSummary.githubSource === sourceCommit; - const flush = sourceOk ? nodeRuntimeEnsureGitMirrorFlushed(scoped, "pre", sourceCommit, pipelineRun, after) : null; + const flush = sourceOk ? nodeRuntimeEnsureGitMirrorFlushed(scoped, "pre", sourceCommit, pipelineRun, after, options) : null; const ok = sourceOk && (flush === null || flush.ok === true); return { ok, @@ -545,15 +603,28 @@ export function nodeRuntimeEnsureGitMirrorSourceCurrent(scoped: ReturnType, + scoped: NodeScopedDelegatedOptions, phase: "pre" | "post" | "parallel", sourceCommit: string, pipelineRun: string | null, statusInput: Record | null = null, + options: NodeRuntimeGitMirrorRunOptions = {}, ): Record { const stage = `git-mirror-${phase}-flush`; const full = nodeScopedFullOutput(scoped); - const before = statusInput ?? 
nodeRuntimeGitMirrorStatus({ ...scoped, action: "status", dryRun: true, confirm: false }); + const beforeScoped = nodeRuntimeScopedWithinDeadline(scoped, options.deadlineMs); + if (beforeScoped === null) { + return { + ok: false, + phase, + mode: "budget-exhausted-before-status", + executed: false, + status: "pending", + degradedReason: "node-runtime-cicd-budget-exhausted", + next: { status: `bun scripts/cli.ts hwlab nodes git-mirror status --node ${scoped.node} --lane ${scoped.lane}` }, + }; + } + const before = statusInput ?? nodeRuntimeGitMirrorStatus({ ...beforeScoped, action: "status", dryRun: true, confirm: false }); const beforeSummary = compactNodeRuntimeGitMirrorStatus(before); if (before.ok !== true) { printNodeRuntimeTriggerProgress(scoped.spec, { stage, status: "failed", sourceCommit, pipelineRun, reason: "git-mirror-status-failed", ...beforeSummary }); @@ -583,7 +654,21 @@ export function nodeRuntimeEnsureGitMirrorFlushed( }; } printNodeRuntimeTriggerProgress(scoped.spec, { stage, status: "started", sourceCommit, pipelineRun, flushNeeded: true, ...beforeSummary }); - const flush = nodeRuntimeGitMirrorRun({ ...scoped, domain: "git-mirror", action: "flush", confirm: true, dryRun: false, wait: true }); + const flushScoped = nodeRuntimeScopedWithinDeadline(scoped, options.deadlineMs); + if (flushScoped === null) { + return { + ok: false, + phase, + mode: "budget-exhausted-before-flush", + executed: false, + before: full ? before : undefined, + beforeSummary, + status: "pending", + degradedReason: "node-runtime-cicd-budget-exhausted", + next: { status: `bun scripts/cli.ts hwlab nodes git-mirror status --node ${scoped.node} --lane ${scoped.lane}` }, + }; + } + const flush = nodeRuntimeGitMirrorRun({ ...flushScoped, domain: "git-mirror", action: "flush", confirm: true, dryRun: false, wait: true }, options); const after = record(flush.status); const afterSummary = Object.keys(after).length > 0 ? 
compactNodeRuntimeGitMirrorStatus(after) : {}; const ok = flush.ok === true && Object.keys(after).length > 0 && !nodeRuntimeGitMirrorNeedsFlush(after); @@ -613,14 +698,17 @@ export function nodeRuntimeEnsureGitMirrorFlushed( } export function nodeRuntimeOpportunisticGitMirrorSync( - scoped: ReturnType, + scoped: NodeScopedDelegatedOptions, sourceCommit: string, pipelineRun: string, + options: NodeRuntimeGitMirrorRunOptions = {}, ): Record | null { const stage = "git-mirror-parallel-sync"; + const syncScoped = nodeRuntimeScopedWithinDeadline(scoped, options.deadlineMs); + if (syncScoped === null) return null; const full = nodeScopedFullOutput(scoped); printNodeRuntimeTriggerProgress(scoped.spec, { stage, status: "started", sourceCommit, pipelineRun }); - const sync = nodeRuntimeGitMirrorRun({ ...scoped, domain: "git-mirror", action: "sync", confirm: true, dryRun: false, wait: true }); + const sync = nodeRuntimeGitMirrorRun({ ...syncScoped, domain: "git-mirror", action: "sync", confirm: true, dryRun: false, wait: true }, options); const after = record(sync.status); const afterSummary = Object.keys(after).length > 0 ? 
compactNodeRuntimeGitMirrorStatus(after) : {}; const ok = sync.ok === true; @@ -649,13 +737,16 @@ export function nodeRuntimeOpportunisticGitMirrorSync( } export function nodeRuntimeOpportunisticGitMirrorFlush( - scoped: ReturnType, + scoped: NodeScopedDelegatedOptions, sourceCommit: string, pipelineRun: string, + options: NodeRuntimeGitMirrorRunOptions = {}, ): Record | null { - const status = nodeRuntimeGitMirrorStatus({ ...scoped, action: "status", dryRun: true, confirm: false }); + const statusScoped = nodeRuntimeScopedWithinDeadline(scoped, options.deadlineMs); + if (statusScoped === null) return null; + const status = nodeRuntimeGitMirrorStatus({ ...statusScoped, action: "status", dryRun: true, confirm: false }); if (status.ok !== true || !nodeRuntimeGitMirrorNeedsFlush(status)) return null; - return nodeRuntimeEnsureGitMirrorFlushed(scoped, "parallel", sourceCommit, pipelineRun, status); + return nodeRuntimeEnsureGitMirrorFlushed(scoped, "parallel", sourceCommit, pipelineRun, status, options); } export function nodeRuntimeGitMirrorNeedsFlush(status: Record): boolean { diff --git a/scripts/src/hwlab-node/web-probe.ts b/scripts/src/hwlab-node/web-probe.ts index e3fcc159..e8e2aaf5 100644 --- a/scripts/src/hwlab-node/web-probe.ts +++ b/scripts/src/hwlab-node/web-probe.ts @@ -348,8 +348,9 @@ export function waitForNodeRuntimePipelineRunTerminal( options: { opportunisticPostFlush?: () => Record | null; opportunisticPostSync?: () => Record | null } = {}, ): Record { const severeWarningThresholdMs = 120_000; + const effectiveTimeoutSeconds = Math.max(0, timeoutSeconds); const startedAt = Date.now(); - const deadline = startedAt + timeoutSeconds * 1000; + const deadline = startedAt + effectiveTimeoutSeconds * 1000; let polls = 0; let last: Record = { exists: false, name: pipelineRun }; let lastOpportunisticPostFlushAt = 0; @@ -357,7 +358,21 @@ export function waitForNodeRuntimePipelineRunTerminal( let opportunisticPostSyncAttempted = false; const 
opportunisticPostFlushes: Record[] = []; const opportunisticPostSyncs: Record[] = []; - printNodeRuntimeTriggerProgress(spec, { stage: "pipelinerun-wait", status: "started", pipelineRun, timeoutSeconds }); + printNodeRuntimeTriggerProgress(spec, { stage: "pipelinerun-wait", status: "started", pipelineRun, timeoutSeconds: effectiveTimeoutSeconds }); + if (effectiveTimeoutSeconds <= 0) { + const warning = nodeRuntimeCicdWaitWarning(spec, pipelineRun, { polls, elapsedMs: 0 }); + printNodeRuntimeTriggerProgress(spec, { stage: "pipelinerun-wait", status: "warning", pipelineRun, polls, elapsedMs: 0, waitLimitSeconds: effectiveTimeoutSeconds }); + return { + ok: false, + status: "pending", + pipelineRun: last, + polls, + elapsedMs: 0, + waitLimitSeconds: effectiveTimeoutSeconds, + warning, + next: { status: `bun scripts/cli.ts hwlab nodes control-plane status --node ${spec.nodeId} --lane ${spec.lane} --pipeline-run ${pipelineRun}` }, + }; + } while (Date.now() <= deadline) { polls += 1; last = getNodeRuntimePipelineRun(spec, pipelineRun); @@ -445,7 +460,7 @@ export function waitForNodeRuntimePipelineRunTerminal( const warning = nodeRuntimeCicdWaitWarning(spec, pipelineRun, { polls, elapsedMs }); printNodeRuntimeTriggerProgress(spec, { stage: "pipelinerun-wait", status: "warning", pipelineRun, polls, elapsedMs, waitLimitSeconds: timeoutSeconds }); return { - ok: true, + ok: false, status: "pending", pipelineRun: last, polls,