From 0cecdd9ed3d32f528451d68e6a2c0c4a28914c61 Mon Sep 17 00:00:00 2001 From: Codex Date: Mon, 1 Jun 2026 14:27:52 +0000 Subject: [PATCH] fix: warn on slow HWLAB v02 build taskruns --- docs/reference/cli.md | 2 +- scripts/hwlab-g14-contract-test.ts | 28 +++++++++++++- scripts/src/hwlab-g14.ts | 60 +++++++++++++++++++++++++++++- 3 files changed, 87 insertions(+), 3 deletions(-) diff --git a/docs/reference/cli.md b/docs/reference/cli.md index a14f0271..7a5db581 100644 --- a/docs/reference/cli.md +++ b/docs/reference/cli.md @@ -44,7 +44,7 @@ CI/CD、GitOps、rollout、artifact 发布、PR 合并后的 DEV/PROD 滚动、P - `commander contract|plan --dry-run|smoke --dry-run|approval request --dry-run|prompt-lint --kind gpt55-pr` 是 host Codex 指挥官直管微服务 skeleton 入口。当前命令返回 `phase=source-contract`、service/API/state/bridge/prompt/trace/#20/#46/ClaudeQQ 审批边界、.state/commander/ 状态模型、dev 无 daemon smoke contract、dry-run 计划和 GPT-5.5 PR prompt 边界辅助 lint,不接 live bridge、不注入 prompt、不发送 ClaudeQQ。`approval request --dry-run` 会生成 200 字以内中文纯文本 ClaudeQQ 审批草案、`notification-path-unavailable` blocker 和授权后唯一可用的 `bun scripts/cli.ts microservice proxy claudeqq /api/push/text --method POST --body-json '' --raw` 命令;不得提示使用本机 ClaudeQQ skill、powershell 或本地 server。`prompt-lint` 支持 `--prompt-file` 与 `--stdin`,输出 `ok`、`missingClauses`、`riskLevel`、`suggestedPatchSnippet` 且不回显完整 prompt;它是 commander 辅助检查,不是业务 PR 门禁,也不改变 `codex submit` 默认行为。`plan`、`smoke` 与 `approval request` 必须带 `--dry-run`;缺少时返回 `error=dry-run-required`。长期规则见 `docs/reference/host-codex-commander.md`。 - `hwlab g14 monitor-prs [--once] [--dry-run] [--interval-seconds N] [--max-cycles N] [--timeout-seconds N]` 是当前 HWLAB G14 PR -> CI/CD -> DEV rollout 的一行式入口。普通调用创建 `.state/jobs/` 异步 job 并立刻返回 `job.id`、`statusCommand` 和 stdout/stderr 路径;后台 worker 每轮通过 UniDesk `gh pr list/preflight/merge` 监控 `pikasTech/HWLAB` base=`G14` 的 open PR,ready 时合并,然后通过 UniDesk `ssh G14:k3s` 观察 `hwlab-g14-ci-poll-`、Argo `hwlab-g14-dev` 和 DEV `/health/live`,直到 DEV `Synced/Healthy` 且 
Deployment/StatefulSet ready;历史 `Completed` smoke/debug pod 不作为 rollout blocker。每次成功 DEV rollout 后,worker 会定位或创建 #7“指挥简报索引”中的北京日期每日简报 issue,并追加 CI/CD 耗时、CI/CD 关键指标、语义化上线 changelog、自动 diff 摘要、PipelineRun、GitOps revision 和 DEV 验证摘要;关键指标来自 G14 Tekton TaskRun results,固定包含 `lazy build reused: x/y`、reused services、rebuild services 和每个 service 的独立耗时/状态/backend,用于观察 lazy build 机制效果。语义化 changelog 优先从 PR body 的 `## 修改`/`## 变更`/`## Changelog` 等段落提取,diff 摘要只作为文件和统计证据保留,不替代 changelog。也可用 `hwlab g14 record-rollout --pr --source-commit ` 手动补记,手动补记同样会按 PipelineRun 采集 TaskRun 指标。状态指针按用途分离:长期监控只写 `.state/hwlab-g14/latest-monitor-job.json`,`--once` 写 `latest-once-job.json`,`--dry-run` 写 `latest-dry-run-job.json`,`--once --dry-run` 写 `latest-once-dry-run-job.json`,避免一次性收口覆盖持续监控入口。`--once --dry-run` 只做单轮监控和 merge plan,不写 GitHub、不等待 rollout。该命令禁止使用原生 `gh` 或手拼 GitHub 请求;如果 UniDesk `gh` 子命令字段或行为不够,必须先改进 `scripts/src/gh.ts` 后再使用。 - `agentrun v01 control-plane status|trigger-current [--dry-run|--confirm]` 是 AgentRun `v0.1` 在 G14 k3s 的受控 Tekton/Argo 入口。`status` 只读汇总固定 source worktree commit、对应 commit-pinned PipelineRun、Argo Application 和 `agentrun-v01` workload;`trigger-current` 先快进 `G14:/root/agentrun-v01` 到 `origin/v0.1`,再创建 `agentrun-v01-ci-` PipelineRun。confirmed trigger 只提交 CI/CD 工作并返回后续 `status` 命令,不等待完整 PipelineRun;同名 PipelineRun 运行中或已成功时拒绝重复触发,只允许失败态重建或首次创建。AgentRun 运行时和 SPEC 事实来源仍在 AgentRun 仓库,UniDesk 只维护受控运维入口。 -- `hwlab g14 control-plane status|apply --lane v02 [--dry-run|--confirm]` 是 HWLAB `v0.2` 加法 lane 的受控 Tekton/Argo 控制面维护入口,source commit 只来自 G14 专用 bare repo `/root/hwlab-v02-cicd.git` 的 `refs/remotes/origin/v0.2`;`/root/hwlab-v02` 只作为人工开发和短连接源码工具 workspace 被观测,dirty/stale 状态必须输出为 isolated warning 而不能阻塞 CI/CD。该入口面向 branch `v0.2`、namespace `hwlab-ci` 和 Argo application `hwlab-g14-v02`;`status` 只读汇总 pipeline、RBAC/ServiceAccount、Argo、当前 commit PipelineRun、当前 PipelineRun 的 TaskRun 条件摘要、最近 PipelineRun 摘要、活跃 PipelineRun、遗留 v02 CronJob 清理状态、commit alignment,以及 19666/19667 的 Cloud 
Web 静态资源和 API live 探针。`webAssets` 必须直接给出 `readonly-rpc` 删除、sidebar/workspace/event panel 关键 CSS、`/app.js` 是否可读取和字节数、`/health/live` 与 API revision;`apiRevision` 是 cloud-api 服务自身 revision,Cloud Web 静态资源变更时允许它与 source commit 不同,不能把这种差异误判成 Cloud Web 未发布。默认只读取必要字段,禁止把完整 PipelineRun spec、Tekton 内联脚本、历史大对象或整份 CSS/HTML/JS 展开到默认输出;`apply` 先自动 fetch `/root/hwlab-v02-cicd.git` 并从 commit-pinned detached worktree 执行 render check,再经 `G14:k3s` server-side apply `tekton-v02/rbac.yaml`、`pipeline.yaml`、`argocd/project.yaml` 和 `argocd/application-v02.yaml`,confirmed apply 会删除遗留 v02 CronJob,但不会应用 runtime-v02 workload、Secret 或数据迁移。 +- `hwlab g14 control-plane status|apply --lane v02 [--dry-run|--confirm]` 是 HWLAB `v0.2` 加法 lane 的受控 Tekton/Argo 控制面维护入口,source commit 只来自 G14 专用 bare repo `/root/hwlab-v02-cicd.git` 的 `refs/remotes/origin/v0.2`;`/root/hwlab-v02` 只作为人工开发和短连接源码工具 workspace 被观测,dirty/stale 状态必须输出为 isolated warning 而不能阻塞 CI/CD。该入口面向 branch `v0.2`、namespace `hwlab-ci` 和 Argo application `hwlab-g14-v02`;`status` 只读汇总 pipeline、RBAC/ServiceAccount、Argo、当前 commit PipelineRun、当前 PipelineRun 的 TaskRun 条件摘要、最近 PipelineRun 摘要、活跃 PipelineRun、遗留 v02 CronJob 清理状态、commit alignment,以及 19666/19667 的 Cloud Web 静态资源和 API live 探针。TaskRun 摘要的 `performance` 字段会把超过 120s 的 build TaskRun 标为慢任务、超过 180s 标为 critical warning,用于暴露 env reuse/git mirror 命中率回归,但不作为阻断门禁。`webAssets` 必须直接给出 `readonly-rpc` 删除、sidebar/workspace/event panel 关键 CSS、`/app.js` 是否可读取和字节数、`/health/live` 与 API revision;`apiRevision` 是 cloud-api 服务自身 revision,Cloud Web 静态资源变更时允许它与 source commit 不同,不能把这种差异误判成 Cloud Web 未发布。默认只读取必要字段,禁止把完整 PipelineRun spec、Tekton 内联脚本、历史大对象或整份 CSS/HTML/JS 展开到默认输出;`apply` 先自动 fetch `/root/hwlab-v02-cicd.git` 并从 commit-pinned detached worktree 执行 render check,再经 `G14:k3s` server-side apply `tekton-v02/rbac.yaml`、`pipeline.yaml`、`argocd/project.yaml` 和 `argocd/application-v02.yaml`,confirmed apply 会删除遗留 v02 CronJob,但不会应用 runtime-v02 workload、Secret 或数据迁移。 - `hwlab g14 control-plane trigger-current --lane v02 
[--dry-run|--confirm]` 是 v02 标准手动触发入口:先自动 fetch `/root/hwlab-v02-cicd.git`,解析当前 `origin/v0.2` full SHA,创建 commit-pinned `hwlab-v02-ci-poll-` PipelineRun;读 Git 走 `git-mirror-http.devops-infra.svc.cluster.local`,GitOps promotion 写 `git-mirror-write.devops-infra.svc.cluster.local`;confirmed trigger 在删除/创建 PipelineRun 前会先按当前 source commit 在 G14 临时 detached worktree 中 render,再 server-side apply v02 Tekton RBAC、Pipeline 与 Argo Application,避免 CI/CD 脚本或 runtime-ready 逻辑已合并但集群仍执行旧 Pipeline 定义;该 render 不要求固定 `/root/hwlab-v02` 工作树 clean,也不得因 `.worktree/` 或其他并行未提交修改阻塞;同名 PipelineRun 成功或运行中时拒绝重复触发,失败或不存在时才删除旧对象并重新创建。 创建 PipelineRun 前会读取 `devops-infra` mirror refs,若 `localV02` 未等于当前 source commit,则自动执行一次受控 manual `git-mirror sync` Job 并复核 ref,复核失败时停止触发,避免 Tekton `prepare-source` 已知失败;services 参数只包含 v02 runtime service matrix,`hwlab-cli` 是固定 repo 短连接源码工具,不进入 PipelineRun service build。 `--dry-run` 只报告是否会 pre-sync,不创建 Job;confirmed trigger 默认创建 `.state/jobs/` 异步 job 并立刻返回 `job.id`、`statusCommand`、stdout/stderr 路径,避免 git mirror pre-sync 或 PipelineRun 创建期间长时间阻塞;`--wait` 路径也必须向 stderr 输出 `hwlab.v02.trigger.progress` JSON 事件,覆盖 `control-plane-refresh`、`git-mirror-pre-sync`、`delete-existing-pipelinerun` 和 `create-pipelinerun`,避免异步 job 长时间只有启动命令而无法判断卡点;默认 JSON 必须对 `manifest_b64`、长脚本和远端 stdout/stderr 做有界摘要,保留长度与 hash,最终 trigger 结果只返回阶段摘要和关键 tail,完整内容通过 job stdout/stderr 文件渐进披露;只有现场同步调试才显式加 `--wait`;旧 `rerun-current` 只作为输入别名保留。PipelineRun `Completed`、Argo `Synced/Healthy` 和 `webAssets.ok=true` 只证明 G14 runtime 已更新;交付收口还必须用 `hwlab g14 git-mirror status` 查看 `cache.summary.pendingFlush`,若为 true,继续执行受控 `hwlab g14 git-mirror flush --confirm` 并用 job status 轮询到 `pendingFlush=false`。 diff --git a/scripts/hwlab-g14-contract-test.ts b/scripts/hwlab-g14-contract-test.ts index 1f7c6a72..9d9b2544 100644 --- a/scripts/hwlab-g14-contract-test.ts +++ b/scripts/hwlab-g14-contract-test.ts @@ -1,4 +1,4 @@ -import { gitMirrorFlushJobManifest, gitMirrorStatusSummary, gitMirrorSyncJobManifest, 
gitMirrorV02SyncRequirement, hwlabG14MonitorStateFileName, parseGitMirrorStatusRefs, parsePipelineTaskRunMetrics, rolloutRecordBody, semanticChangelogBullets, v02CommitAlignment, v02ControlPlaneRenderScript, v02FalseGreenGuard, v02PipelineServiceIds } from "./src/hwlab-g14"; +import { gitMirrorFlushJobManifest, gitMirrorStatusSummary, gitMirrorSyncJobManifest, gitMirrorV02SyncRequirement, hwlabG14MonitorStateFileName, parseGitMirrorStatusRefs, parsePipelineTaskRunMetrics, rolloutRecordBody, semanticChangelogBullets, v02CommitAlignment, v02ControlPlaneRenderScript, v02FalseGreenGuard, v02PipelineServiceIds, v02TaskRunPerformanceSummary } from "./src/hwlab-g14"; function assertCondition(condition: unknown, message: string, detail: unknown = {}): void { if (!condition) throw new Error(`${message}: ${JSON.stringify(detail)}`); @@ -184,6 +184,31 @@ assertCondition( falseGreenRuntimeMismatch, ); +const slowBuildSummary = v02TaskRunPerformanceSummary([ + { + name: "hwlab-v02-ci-poll-f8a090b66616-build-hwlab-agent-worker", + status: "True", + reason: "Succeeded", + durationSeconds: 234, + }, + { + name: "hwlab-v02-ci-poll-f8a090b66616-build-hwlab-cloud-web", + status: "True", + reason: "Succeeded", + durationSeconds: 37, + }, +]); +const slowBuildItems = Array.isArray(record(slowBuildSummary).slowTaskRuns) ? 
/** Build TaskRuns that run strictly longer than this many seconds are reported as slow. */
const V02_BUILD_TASKRUN_WARNING_SECONDS = 120;
/** Slow build TaskRuns lasting at least this many seconds are escalated from "warning" to "critical". */
const V02_BUILD_TASKRUN_CRITICAL_SECONDS = 180;

/**
 * Non-build pipeline task names that can be recognised as the trailing part of
 * a TaskRun name. Order is preserved from the original lookup; the first
 * matching suffix wins.
 */
const V02_KNOWN_PIPELINE_TASK_SUFFIXES: readonly string[] = [
  "prepare-source",
  "plan-artifacts",
  "publish-artifact-catalog",
  "gitops-render",
  "gitops-promote",
  "runtime-ready",
  "collect-artifacts",
];

/** One slow build TaskRun as exposed in the `slowTaskRuns` array of the performance summary. */
interface V02SlowBuildTaskRun {
  name: string;
  pipelineTask: string;
  serviceId: string | null;
  status: string | null;
  reason: string | null;
  durationSeconds: number;
  budgetSeconds: number;
  severity: "critical" | "warning";
  message: string;
}

/**
 * Derives the pipeline task name from a TaskRun name.
 *
 * Everything after the last `-build-` separator (including the `build-`
 * prefix itself) is treated as a build task; otherwise the name is matched
 * against the known non-build task suffixes.
 *
 * @param name TaskRun name to inspect.
 * @returns The pipeline task name, or `null` when no known pattern matches.
 */
function v02TaskRunPipelineTaskName(name: string): string | null {
  const buildIndex = name.lastIndexOf("-build-");
  // +1 skips the leading dash so the result starts with "build-".
  if (buildIndex >= 0) return name.slice(buildIndex + 1);
  return V02_KNOWN_PIPELINE_TASK_SUFFIXES.find((suffix) => name.endsWith(`-${suffix}`)) ?? null;
}

/**
 * Summarises build TaskRuns that exceeded the v0.2 build duration budget.
 *
 * Only items whose pipeline task starts with `build-` are considered. The
 * pipeline task is read from `item.pipelineTask` when present, otherwise it is
 * derived from `item.name`. Items whose `durationSeconds` is missing, not a
 * number, or not finite (NaN / Infinity) are skipped: their duration is
 * unknown, so they must not be reported as slow.
 *
 * @param taskRuns Loosely typed TaskRun summaries (`name`, `pipelineTask`,
 *   `status`, `reason`, `durationSeconds` are the fields read here).
 * @returns A record with `ok` (true when nothing is slow), `warningCount`,
 *   `thresholds`, `slowTaskRuns` (sorted slowest first) and a one-line
 *   `summary`.
 */
export function v02TaskRunPerformanceSummary(taskRuns: unknown[]): Record<string, unknown> {
  const slowTaskRuns: V02SlowBuildTaskRun[] = [];
  for (const itemRaw of taskRuns) {
    const item = record(itemRaw);
    const name = stringOrNull(item.name) ?? "";
    const pipelineTask = stringOrNull(item.pipelineTask) ?? v02TaskRunPipelineTaskName(name);
    if (!pipelineTask?.startsWith("build-")) continue;
    const rawDuration = item.durationSeconds;
    // typeof NaN === "number" and NaN <= budget is false, so non-finite values
    // have to be rejected explicitly or they would be reported as slow.
    if (typeof rawDuration !== "number" || !Number.isFinite(rawDuration)) continue;
    if (rawDuration <= V02_BUILD_TASKRUN_WARNING_SECONDS) continue;
    slowTaskRuns.push({
      name,
      pipelineTask,
      serviceId: pipelineTask.slice("build-".length) || null,
      status: stringOrNull(item.status),
      reason: stringOrNull(item.reason),
      durationSeconds: rawDuration,
      budgetSeconds: V02_BUILD_TASKRUN_WARNING_SECONDS,
      severity: rawDuration >= V02_BUILD_TASKRUN_CRITICAL_SECONDS ? "critical" : "warning",
      message: `${pipelineTask} took ${rawDuration}s, above v0.2 build TaskRun warning budget ${V02_BUILD_TASKRUN_WARNING_SECONDS}s`,
    });
  }
  // Slowest first, so the summary line can name the worst offender.
  slowTaskRuns.sort((left, right) => right.durationSeconds - left.durationSeconds);
  const worst = slowTaskRuns[0];
  return {
    ok: slowTaskRuns.length === 0,
    warningCount: slowTaskRuns.length,
    thresholds: {
      buildTaskRunWarningSeconds: V02_BUILD_TASKRUN_WARNING_SECONDS,
      buildTaskRunCriticalSeconds: V02_BUILD_TASKRUN_CRITICAL_SECONDS,
    },
    slowTaskRuns,
    summary: worst
      ? `slow build taskruns=${slowTaskRuns.length}; worst=${worst.pipelineTask} ${worst.durationSeconds}s budget=${V02_BUILD_TASKRUN_WARNING_SECONDS}s`
      : `no build taskrun over ${V02_BUILD_TASKRUN_WARNING_SECONDS}s`,
  };
}