fix: align v02 target validation with rollout services
This commit is contained in:
@@ -45,7 +45,7 @@ CI/CD、GitOps、rollout、artifact 发布、PR 合并后的 DEV/PROD 滚动、P
|
||||
- `hwlab g14 monitor-prs [--once] [--dry-run] [--interval-seconds N] [--max-cycles N] [--timeout-seconds N]` 是当前 HWLAB G14 PR -> CI/CD -> DEV rollout 的一行式入口。普通调用创建 `.state/jobs/` 异步 job 并立刻返回 `job.id`、`statusCommand` 和 stdout/stderr 路径;后台 worker 每轮通过 UniDesk `gh pr list/preflight/merge` 监控 `pikasTech/HWLAB` base=`G14` 的 open PR,ready 时合并,然后通过 UniDesk `ssh G14:k3s` 观察 `hwlab-g14-ci-poll-<short>`、Argo `hwlab-g14-dev` 和 DEV `/health/live`,直到 DEV `Synced/Healthy` 且 Deployment/StatefulSet ready;历史 `Completed` smoke/debug pod 不作为 rollout blocker。每次成功 DEV rollout 后,worker 会定位或创建 #7“指挥简报索引”中的北京日期每日简报 issue,并追加 CI/CD 耗时、CI/CD 关键指标、语义化上线 changelog、自动 diff 摘要、PipelineRun、GitOps revision 和 DEV 验证摘要;关键指标来自 G14 Tekton TaskRun results,固定包含 `lazy build reused: x/y`、reused services、rebuild services 和每个 service 的独立耗时/状态/backend,用于观察 lazy build 机制效果。语义化 changelog 优先从 PR body 的 `## 修改`/`## 变更`/`## Changelog` 等段落提取,diff 摘要只作为文件和统计证据保留,不替代 changelog。也可用 `hwlab g14 record-rollout --pr <number> --source-commit <sha>` 手动补记,手动补记同样会按 PipelineRun 采集 TaskRun 指标。状态指针按用途分离:长期监控只写 `.state/hwlab-g14/latest-monitor-job.json`,`--once` 写 `latest-once-job.json`,`--dry-run` 写 `latest-dry-run-job.json`,`--once --dry-run` 写 `latest-once-dry-run-job.json`,避免一次性收口覆盖持续监控入口。`--once --dry-run` 只做单轮监控和 merge plan,不写 GitHub、不等待 rollout。该命令禁止使用原生 `gh` 或手拼 GitHub 请求;如果 UniDesk `gh` 子命令字段或行为不够,必须先改进 `scripts/src/gh.ts` 后再使用。
|
||||
- `agentrun v01 control-plane status|trigger-current|refresh [--dry-run|--confirm]` 是 AgentRun `v0.1` 在 G14 k3s 的受控 Tekton/Argo 入口。`status` 只读汇总固定 source worktree commit、对应 commit-pinned PipelineRun、GitOps latest、Argo Application、`agentrun-v01` workload、`planArtifacts.summary`、env image result 和 git mirror 摘要,并报告 Argo revision 是否对齐 `v0.1-gitops` latest;`trigger-current` 先快进 `G14:/root/agentrun-v01` 到 `origin/v0.1`,检查 `devops-infra` mirror 的 `localV01` 是否等于目标 source commit,必要时先执行受控 mirror sync,再创建 `agentrun-v01-ci-<short12>` PipelineRun。confirmed trigger 只提交 CI/CD 工作并返回后续 `status` 命令,不等待完整 PipelineRun;同名 PipelineRun 运行中或已成功时拒绝重复触发,只允许失败态重建或首次创建。`refresh` 只对 `argocd/agentrun-g14-v01` 执行 hard refresh,用于 GitOps promotion 已完成但 Argo 仍停留旧 revision 时的受控同步入口;它不直接 patch runtime workload。AgentRun 运行时和 SPEC 事实来源仍在 AgentRun 仓库,UniDesk 只维护受控运维入口。
|
||||
- `agentrun v01 git-mirror status|sync|flush [--dry-run|--confirm]` 是 AgentRun `v0.1` 使用 `devops-infra` git mirror/relay 的受控维护入口。`status` 返回 read/write URL、`localV01`、`githubV01`、`localGitops`、`githubGitops`、`pendingFlush`、`githubInSync` 和 exact full-SHA shallow fetch 结果;`sync` 创建 manual Job,把 GitHub `v0.1` 和 `v0.1-gitops` refs 拉入 `/cache/pikasTech/agentrun.git`;`flush` 把本地 `v0.1-gitops` 快进推回 GitHub。confirmed `sync`/`flush` 默认创建 `.state/jobs/` 异步 job 并立刻返回 `job.id`、`statusCommand` 和日志路径;只有现场同步调试才显式加 `--wait`。该入口与 HWLAB v0.2 mirror 共用 `devops-infra` 服务和 cache PVC,但 repo path、refs、status 文件和 CLI 命令彼此独立。
|
||||
- `hwlab g14 control-plane status|apply --lane v02 [--dry-run|--confirm]` 是 HWLAB `v0.2` 加法 lane 的受控 Tekton/Argo 控制面维护入口,source commit 只来自 G14 专用 bare repo `/root/hwlab-v02-cicd.git` 的 `refs/remotes/origin/v0.2`;`/root/hwlab-v02` 只作为人工开发和短连接源码工具 workspace 被观测,dirty/stale 状态必须输出为 isolated warning 而不能阻塞 CI/CD。该入口面向 branch `v0.2`、namespace `hwlab-ci` 和 Argo application `hwlab-g14-v02`;默认 `status` 只读汇总最新 source head 的 pipeline、RBAC/ServiceAccount、Argo、当前 commit PipelineRun、当前 PipelineRun 的 TaskRun 条件摘要、最近 PipelineRun 摘要、活跃 PipelineRun、遗留 v02 CronJob 清理状态、commit alignment,以及 19666/19667 的 Cloud Web 静态资源和 API live 探针。分支被后续提交推进后,要复查已完成 run 时使用 `status --lane v02 --pipeline-run hwlab-v02-ci-poll-<short-sha>`;已知完整 source SHA 但不想依赖最新 head 时使用 `status --lane v02 --source-commit <full-sha>`。定点 `status` 输出 `statusTarget.mode` 和 `targetValidation`,只检查指定 PipelineRun/source commit 的证据;`targetValidation.state=passed` 表示该目标已满足 PipelineRun succeeded、Argo `Synced/Healthy`、19666/19667 探针、Git mirror flushed、`hwlab-cloud-api`/`hwlab-cloud-web` runtime source commit 对齐;`targetValidation.state=superseded` 表示该目标已成功且 runtime 已被同一分支后续成功 PipelineRun 取代,`falseGreenGuard` 在该状态下应标为 superseded/not-applicable。两种状态都不得因为 `origin/v0.2` 后续推进而把历史 run 判为失败;默认不带定点参数时仍严格判定最新 source head alignment。TaskRun 摘要的 `performance` 字段会把超过 120s 的 build TaskRun 标为慢任务、超过 180s 标为 critical warning,用于暴露 env reuse/git mirror 命中率回归,但不作为阻断门禁;CI/CD 性能验收应同时看 `planArtifacts.summary`、`taskRuns.performance.warningCount` 和 PipelineRun duration,纯 CLI/文档或无 runtime 重建需求的后续提交应稳定表现为 `build=0 reuse=<service-count>` 且无 build TaskRun warning,首次引入或切换 env image 时允许只构建必要 env image 一次。`webAssets` 必须直接给出 `readonly-rpc` 删除、sidebar/workspace/event panel 关键 CSS、`/app.js` 是否可读取和字节数、`/health/live` 与 API revision;`apiRevision` 是 cloud-api 服务自身 revision,Cloud Web 静态资源变更时允许它与 source commit 不同,不能把这种差异误判成 Cloud Web 未发布。默认只读取必要字段,禁止把完整 PipelineRun spec、Tekton 内联脚本、历史大对象或整份 CSS/HTML/JS 展开到默认输出;`apply` 先自动 fetch `/root/hwlab-v02-cicd.git` 并从 commit-pinned detached worktree 执行 render check,再经 `G14:k3s` server-side apply `tekton-v02/rbac.yaml`、`pipeline.yaml`、`argocd/project.yaml` 和 `argocd/application-v02.yaml`,confirmed apply 会删除遗留 v02 CronJob,但不会应用 runtime-v02 workload、Secret 或数据迁移。
|
||||
- `hwlab g14 control-plane status|apply --lane v02 [--dry-run|--confirm]` 是 HWLAB `v0.2` 加法 lane 的受控 Tekton/Argo 控制面维护入口,source commit 只来自 G14 专用 bare repo `/root/hwlab-v02-cicd.git` 的 `refs/remotes/origin/v0.2`;`/root/hwlab-v02` 只作为人工开发和短连接源码工具 workspace 被观测,dirty/stale 状态必须输出为 isolated warning 而不能阻塞 CI/CD。该入口面向 branch `v0.2`、namespace `hwlab-ci` 和 Argo application `hwlab-g14-v02`;默认 `status` 只读汇总最新 source head 的 pipeline、RBAC/ServiceAccount、Argo、当前 commit PipelineRun、当前 PipelineRun 的 TaskRun 条件摘要、最近 PipelineRun 摘要、活跃 PipelineRun、遗留 v02 CronJob 清理状态、commit alignment,以及 19666/19667 的 Cloud Web 静态资源和 API live 探针。分支被后续提交推进后,要复查已完成 run 时使用 `status --lane v02 --pipeline-run hwlab-v02-ci-poll-<short-sha>`;已知完整 source SHA 但不想依赖最新 head 时使用 `status --lane v02 --source-commit <full-sha>`。定点 `status` 输出 `statusTarget.mode` 和 `targetValidation`,只检查指定 PipelineRun/source commit 的证据;`targetValidation.state=passed` 表示该目标已满足 PipelineRun succeeded、Argo `Synced/Healthy`、19666/19667 探针、Git mirror flushed,并且该 run 的 `planArtifacts.rolloutServices` 运行时 source commit 对齐;`planArtifacts.reusedServices` 作为 runtime/provenance 证据呈现,但不能被强制要求等于目标 source commit。`targetValidation.state=superseded` 表示该目标已成功且 runtime 已被同一分支后续成功 PipelineRun 取代,`falseGreenGuard` 在该状态下应标为 superseded/not-applicable。两种状态都不得因为 `origin/v0.2` 后续推进而把历史 run 判为失败;默认不带定点参数时仍严格判定最新 source head alignment。TaskRun 摘要的 `performance` 字段会把超过 120s 的 build TaskRun 标为慢任务、超过 180s 标为 critical warning,用于暴露 env reuse/git mirror 命中率回归,但不作为阻断门禁;CI/CD 性能验收应同时看 `planArtifacts.summary`、`taskRuns.performance.warningCount` 和 PipelineRun duration,纯 CLI/文档或无 runtime 重建需求的后续提交应稳定表现为 `build=0 reuse=<service-count>` 且无 build TaskRun warning,首次引入或切换 env image 时允许只构建必要 env image 一次。`webAssets` 必须直接给出 `readonly-rpc` 删除、sidebar/workspace/event panel 关键 CSS、`/app.js` 是否可读取和字节数、`/health/live` 与 API revision;`apiRevision` 是 cloud-api 服务自身 revision,Cloud Web 静态资源变更时允许它与 source commit 不同,不能把这种差异误判成 Cloud Web 未发布。默认只读取必要字段,禁止把完整 PipelineRun spec、Tekton 内联脚本、历史大对象或整份 CSS/HTML/JS 展开到默认输出;`apply` 先自动 fetch `/root/hwlab-v02-cicd.git` 并从 commit-pinned detached worktree 执行 render check,再经 `G14:k3s` server-side apply `tekton-v02/rbac.yaml`、`pipeline.yaml`、`argocd/project.yaml` 和 `argocd/application-v02.yaml`,confirmed apply 会删除遗留 v02 CronJob,但不会应用 runtime-v02 workload、Secret 或数据迁移。
|
||||
- `hwlab g14 control-plane trigger-current --lane v02 [--dry-run|--confirm]` 是 v02 标准手动触发入口:先自动 fetch `/root/hwlab-v02-cicd.git`,解析当前 `origin/v0.2` full SHA,创建 commit-pinned `hwlab-v02-ci-poll-<short12>` PipelineRun;读 Git 走 `git-mirror-http.devops-infra.svc.cluster.local`,GitOps promotion 写 `git-mirror-write.devops-infra.svc.cluster.local`;confirmed trigger 在删除/创建 PipelineRun 前会先按当前 source commit 在 G14 临时 detached worktree 中 render,再 server-side apply v02 Tekton RBAC、Pipeline 与 Argo Application,避免 CI/CD 脚本或 runtime-ready 逻辑已合并但集群仍执行旧 Pipeline 定义;该 render 不要求固定 `/root/hwlab-v02` 工作树 clean,也不得因 `.worktree/` 或其他并行未提交修改阻塞;同名 PipelineRun 成功或运行中时拒绝重复触发,失败或不存在时才删除旧对象并重新创建。
|
||||
创建 PipelineRun 前会读取 `devops-infra` mirror refs,若 `localV02` 未等于当前 source commit,则自动执行一次受控 manual `git-mirror sync` Job 并复核 ref,复核失败时停止触发,避免 Tekton `prepare-source` 已知失败;services 参数只包含 v02 runtime service matrix,`hwlab-cli` 是固定 repo 短连接源码工具,不进入 PipelineRun service build。
|
||||
`--dry-run` 只报告是否会 pre-sync,不创建 Job;confirmed trigger 默认创建 `.state/jobs/` 异步 job 并立刻返回 `job.id`、`statusCommand`、stdout/stderr 路径,避免 git mirror pre-sync 或 PipelineRun 创建期间长时间阻塞;`--wait` 路径也必须向 stderr 输出 `hwlab.v02.trigger.progress` JSON 事件,覆盖 `control-plane-refresh`、`git-mirror-pre-sync`、`delete-existing-pipelinerun` 和 `create-pipelinerun`,避免异步 job 长时间只有启动命令而无法判断卡点;默认 JSON 必须对 `manifest_b64`、长脚本和远端 stdout/stderr 做有界摘要,保留长度与 hash,最终 trigger 结果只返回阶段摘要和关键 tail,完整内容通过 job stdout/stderr 文件渐进披露;只有现场同步调试才显式加 `--wait`;旧 `rerun-current` 只作为输入别名保留。PipelineRun `Completed`、Argo `Synced/Healthy` 和 `webAssets.ok=true` 只证明 G14 runtime 已更新;交付收口还必须用 `hwlab g14 git-mirror status` 查看 `cache.summary.pendingFlush`,若为 true,继续执行受控 `hwlab g14 git-mirror flush --confirm` 并用 job status 轮询到 `pendingFlush=false`。
|
||||
|
||||
@@ -78,7 +78,7 @@ bun scripts/cli.ts hwlab g14 control-plane status --lane v02 --pipeline-run hwla
|
||||
bun scripts/cli.ts hwlab g14 control-plane status --lane v02 --source-commit <full-sha>
|
||||
```
|
||||
|
||||
Targeted status must expose `statusTarget.mode` and `targetValidation`. `targetValidation.state=passed` means the requested PipelineRun/source commit reached a succeeded PipelineRun, Argo `Synced/Healthy`, public web/API probes, flushed Git mirror, and matching `hwlab-cloud-api` / `hwlab-cloud-web` runtime source commits. `targetValidation.state=superseded` means the requested PipelineRun succeeded and was later replaced in runtime by a newer succeeded `v0.2` PipelineRun; this is valid closure evidence for the requested run when the newer commit is on the same branch lineage. In both states, `commitAlignment.staleReasons` may still mention later `origin/v0.2` or CI/CD source head movement; that is parallel-head context, not a failure of the requested run. `falseGreenGuard` is a current-runtime guard and should report not-applicable/superseded for such historical targets instead of turning later runtime movement into a false failure. Default status without a target remains strict for the latest source head.
|
||||
Targeted status must expose `statusTarget.mode` and `targetValidation`. `targetValidation.state=passed` means the requested PipelineRun/source commit reached a succeeded PipelineRun, Argo `Synced/Healthy`, public web/API probes, flushed Git mirror, and matching runtime source commits for the services listed in that run's `planArtifacts.rolloutServices`; services listed in `planArtifacts.reusedServices` remain visible as runtime/provenance evidence but must not be forced to the target source commit. `targetValidation.state=superseded` means the requested PipelineRun succeeded and was later replaced in runtime by a newer succeeded `v0.2` PipelineRun; this is valid closure evidence for the requested run when the newer commit is on the same branch lineage. In both states, `commitAlignment.staleReasons` may still mention later `origin/v0.2` or CI/CD source head movement; that is parallel-head context, not a failure of the requested run. `falseGreenGuard` is a current-runtime guard and should report not-applicable/superseded for such historical targets instead of turning later runtime movement into a false failure. Default status without a target remains strict for the latest source head.
|
||||
|
||||
For HWLAB user-feedback, CLI, Cloud Web, AgentRun, device-pod, public API, or runtime workflow issues, source-level validation is not enough to close the issue. Unit tests, contract tests, `git diff --check`, targeted build checks, PR merge metadata, and source commit rollout evidence are supporting evidence only. The issue may be closed only after the affected user entry or original entry has been exercised against the target runtime. For CLI issues, that means running the relevant `hwlab-cli` or UniDesk-controlled CLI command from the G14 `v0.2` workspace or approved execution plane against the intended lane/URL/namespace and proving the observed behavior, not just proving the helper code compiles. For Cloud Web or public API issues, use the public endpoint or a bounded browser/API smoke that reaches the deployed runtime. For AgentRun or device-pod issues, capture the trace/session/thread/run/job/device evidence that proves the specific continuation or hardware workflow reached the live backend.
|
||||
|
||||
|
||||
@@ -1238,6 +1238,7 @@ function v02TargetValidation(input: {
|
||||
sourceCommit: string | null;
|
||||
pipelineRun: Record<string, unknown> | null;
|
||||
argo: Record<string, unknown>;
|
||||
planArtifacts: Record<string, unknown>;
|
||||
runtimeWorkloads: Record<string, unknown>;
|
||||
webAssets: Record<string, unknown>;
|
||||
gitMirror: Record<string, unknown>;
|
||||
@@ -1259,6 +1260,9 @@ function v02TargetValidation(input: {
|
||||
const serviceSourceCommits = Object.fromEntries(workloadItems
|
||||
.filter((item) => item.serviceId === "hwlab-cloud-api" || item.serviceId === "hwlab-cloud-web")
|
||||
.map((item) => [String(item.serviceId), stringOrNull(item.sourceCommit) ?? stringOrNull(item.artifactSourceCommit)]));
|
||||
const rolloutServices = stringArray(input.planArtifacts.rolloutServices)
|
||||
.filter((serviceId) => serviceId === "hwlab-cloud-api" || serviceId === "hwlab-cloud-web");
|
||||
const requiredRuntimeServices = rolloutServices.length > 0 ? rolloutServices : ["hwlab-cloud-api"];
|
||||
const recentItems = Array.isArray(input.recentPipelineRuns.items)
|
||||
? input.recentPipelineRuns.items.map((item) => record(item))
|
||||
: [];
|
||||
@@ -1290,7 +1294,7 @@ function v02TargetValidation(input: {
|
||||
githubInSync: gitMirrorSummary.githubInSync ?? null,
|
||||
});
|
||||
}
|
||||
for (const serviceId of ["hwlab-cloud-api", "hwlab-cloud-web"]) {
|
||||
for (const serviceId of requiredRuntimeServices) {
|
||||
const serviceCommit = serviceSourceCommits[serviceId];
|
||||
if (sourceCommit !== null && serviceCommit !== sourceCommit) {
|
||||
const mismatch = { reason: "runtime-service-source-mismatch", serviceId, expectedSourceCommit: sourceCommit, actualSourceCommit: serviceCommit ?? null };
|
||||
@@ -1314,6 +1318,9 @@ function v02TargetValidation(input: {
|
||||
: { name: input.pipelineRun.pipelineRun ?? null, status: input.pipelineRun.status ?? null, reason: input.pipelineRun.reason ?? null },
|
||||
argo: { syncRevision: argoFields.syncRevision ?? null, syncStatus: argoFields.syncStatus ?? null, health: argoFields.health ?? null },
|
||||
runtimeServices: serviceSourceCommits,
|
||||
requiredRuntimeServices,
|
||||
rolloutServices: stringArray(input.planArtifacts.rolloutServices),
|
||||
reusedServices: stringArray(input.planArtifacts.reusedServices),
|
||||
superseded,
|
||||
supersededRuntimeServices,
|
||||
newerSucceededRuns: newerSucceededRuns.slice(0, 3).map((item) => ({
|
||||
@@ -2042,6 +2049,7 @@ function v02ControlPlaneStatus(target: V02ControlPlaneStatusTarget = {}): Record
|
||||
ok: shellSectionOk(argo),
|
||||
fields: { targetRevision, path, syncRevision, syncStatus, health },
|
||||
},
|
||||
planArtifacts,
|
||||
runtimeWorkloads,
|
||||
webAssets,
|
||||
gitMirror,
|
||||
|
||||
Reference in New Issue
Block a user