From 0cecdd9ed3d32f528451d68e6a2c0c4a28914c61 Mon Sep 17 00:00:00 2001
From: Codex <codex@noreply.local>
Date: Mon, 1 Jun 2026 14:27:52 +0000
Subject: [PATCH] fix: warn on slow HWLAB v02 build taskruns

---
 docs/reference/cli.md              |  2 +-
 scripts/hwlab-g14-contract-test.ts | 28 +++++++++++++-
 scripts/src/hwlab-g14.ts           | 60 +++++++++++++++++++++++++++++-
 3 files changed, 87 insertions(+), 3 deletions(-)
diff --git a/docs/reference/cli.md b/docs/reference/cli.md
index a14f0271..7a5db581 100644
--- a/docs/reference/cli.md
+++ b/docs/reference/cli.md
@@ -44,7 +44,7 @@ CI/CD、GitOps、rollout、artifact 发布、PR 合并后的 DEV/PROD 滚动、P
 - `commander contract|plan --dry-run|smoke --dry-run|approval request --dry-run|prompt-lint --kind gpt55-pr` 是 host Codex 指挥官直管微服务 skeleton 入口。当前命令返回 `phase=source-contract`、service/API/state/bridge/prompt/trace/#20/#46/ClaudeQQ 审批边界、.state/commander/ 状态模型、dev 无 daemon smoke contract、dry-run 计划和 GPT-5.5 PR prompt 边界辅助 lint，不接 live bridge、不注入 prompt、不发送 ClaudeQQ。`approval request --dry-run` 会生成 200 字以内中文纯文本 ClaudeQQ 审批草案、`notification-path-unavailable` blocker 和授权后唯一可用的 `bun scripts/cli.ts microservice proxy claudeqq /api/push/text --method POST --body-json '<payload>' --raw` 命令；不得提示使用本机 ClaudeQQ skill、powershell 或本地 server。`prompt-lint` 支持 `--prompt-file` 与 `--stdin`，输出 `ok`、`missingClauses`、`riskLevel`、`suggestedPatchSnippet` 且不回显完整 prompt；它是 commander 辅助检查，不是业务 PR 门禁，也不改变 `codex submit` 默认行为。`plan`、`smoke` 与 `approval request` 必须带 `--dry-run`；缺少时返回 `error=dry-run-required`。长期规则见 `docs/reference/host-codex-commander.md`。
 - `hwlab g14 monitor-prs [--once] [--dry-run] [--interval-seconds N] [--max-cycles N] [--timeout-seconds N]` 是当前 HWLAB G14 PR -> CI/CD -> DEV rollout 的一行式入口。普通调用创建 `.state/jobs/` 异步 job 并立刻返回 `job.id`、`statusCommand` 和 stdout/stderr 路径；后台 worker 每轮通过 UniDesk `gh pr list/preflight/merge` 监控 `pikasTech/HWLAB` base=`G14` 的 open PR，ready 时合并，然后通过 UniDesk `ssh G14:k3s` 观察 `hwlab-g14-ci-poll-<short>`、Argo `hwlab-g14-dev` 和 DEV `/health/live`，直到 DEV `Synced/Healthy` 且 Deployment/StatefulSet ready；历史 `Completed` smoke/debug pod 不作为 rollout blocker。每次成功 DEV rollout 后，worker 会定位或创建 #7“指挥简报索引”中的北京日期每日简报 issue，并追加 CI/CD 耗时、CI/CD 关键指标、语义化上线 changelog、自动 diff 摘要、PipelineRun、GitOps revision 和 DEV 验证摘要；关键指标来自 G14 Tekton TaskRun results，固定包含 `lazy build reused: x/y`、reused services、rebuild services 和每个 service 的独立耗时/状态/backend，用于观察 lazy build 机制效果。语义化 changelog 优先从 PR body 的 `## 修改`/`## 变更`/`## Changelog` 等段落提取，diff 摘要只作为文件和统计证据保留，不替代 changelog。也可用 `hwlab g14 record-rollout --pr <number> --source-commit <sha>` 手动补记，手动补记同样会按 PipelineRun 采集 TaskRun 指标。状态指针按用途分离：长期监控只写 `.state/hwlab-g14/latest-monitor-job.json`，`--once` 写 `latest-once-job.json`，`--dry-run` 写 `latest-dry-run-job.json`，`--once --dry-run` 写 `latest-once-dry-run-job.json`，避免一次性收口覆盖持续监控入口。`--once --dry-run` 只做单轮监控和 merge plan，不写 GitHub、不等待 rollout。该命令禁止使用原生 `gh` 或手拼 GitHub 请求；如果 UniDesk `gh` 子命令字段或行为不够，必须先改进 `scripts/src/gh.ts` 后再使用。
 - `agentrun v01 control-plane status|trigger-current [--dry-run|--confirm]` 是 AgentRun `v0.1` 在 G14 k3s 的受控 Tekton/Argo 入口。`status` 只读汇总固定 source worktree commit、对应 commit-pinned PipelineRun、Argo Application 和 `agentrun-v01` workload；`trigger-current` 先快进 `G14:/root/agentrun-v01` 到 `origin/v0.1`，再创建 `agentrun-v01-ci-<short12>` PipelineRun。confirmed trigger 只提交 CI/CD 工作并返回后续 `status` 命令，不等待完整 PipelineRun；同名 PipelineRun 运行中或已成功时拒绝重复触发，只允许失败态重建或首次创建。AgentRun 运行时和 SPEC 事实来源仍在 AgentRun 仓库，UniDesk 只维护受控运维入口。
-- `hwlab g14 control-plane status|apply --lane v02 [--dry-run|--confirm]` 是 HWLAB `v0.2` 加法 lane 的受控 Tekton/Argo 控制面维护入口，source commit 只来自 G14 专用 bare repo `/root/hwlab-v02-cicd.git` 的 `refs/remotes/origin/v0.2`；`/root/hwlab-v02` 只作为人工开发和短连接源码工具 workspace 被观测，dirty/stale 状态必须输出为 isolated warning 而不能阻塞 CI/CD。该入口面向 branch `v0.2`、namespace `hwlab-ci` 和 Argo application `hwlab-g14-v02`；`status` 只读汇总 pipeline、RBAC/ServiceAccount、Argo、当前 commit PipelineRun、当前 PipelineRun 的 TaskRun 条件摘要、最近 PipelineRun 摘要、活跃 PipelineRun、遗留 v02 CronJob 清理状态、commit alignment，以及 19666/19667 的 Cloud Web 静态资源和 API live 探针。`webAssets` 必须直接给出 `readonly-rpc` 删除、sidebar/workspace/event panel 关键 CSS、`/app.js` 是否可读取和字节数、`/health/live` 与 API revision；`apiRevision` 是 cloud-api 服务自身 revision，Cloud Web 静态资源变更时允许它与 source commit 不同，不能把这种差异误判成 Cloud Web 未发布。默认只读取必要字段，禁止把完整 PipelineRun spec、Tekton 内联脚本、历史大对象或整份 CSS/HTML/JS 展开到默认输出；`apply` 先自动 fetch `/root/hwlab-v02-cicd.git` 并从 commit-pinned detached worktree 执行 render check，再经 `G14:k3s` server-side apply `tekton-v02/rbac.yaml`、`pipeline.yaml`、`argocd/project.yaml` 和 `argocd/application-v02.yaml`，confirmed apply 会删除遗留 v02 CronJob，但不会应用 runtime-v02 workload、Secret 或数据迁移。
+- `hwlab g14 control-plane status|apply --lane v02 [--dry-run|--confirm]` 是 HWLAB `v0.2` 加法 lane 的受控 Tekton/Argo 控制面维护入口，source commit 只来自 G14 专用 bare repo `/root/hwlab-v02-cicd.git` 的 `refs/remotes/origin/v0.2`；`/root/hwlab-v02` 只作为人工开发和短连接源码工具 workspace 被观测，dirty/stale 状态必须输出为 isolated warning 而不能阻塞 CI/CD。该入口面向 branch `v0.2`、namespace `hwlab-ci` 和 Argo application `hwlab-g14-v02`；`status` 只读汇总 pipeline、RBAC/ServiceAccount、Argo、当前 commit PipelineRun、当前 PipelineRun 的 TaskRun 条件摘要、最近 PipelineRun 摘要、活跃 PipelineRun、遗留 v02 CronJob 清理状态、commit alignment，以及 19666/19667 的 Cloud Web 静态资源和 API live 探针。TaskRun 摘要的 `performance` 字段会把超过 120s 的 build TaskRun 标为慢任务、超过 180s 标为 critical warning，用于暴露 env reuse/git mirror 命中率回归，但不作为阻断门禁。`webAssets` 必须直接给出 `readonly-rpc` 删除、sidebar/workspace/event panel 关键 CSS、`/app.js` 是否可读取和字节数、`/health/live` 与 API revision；`apiRevision` 是 cloud-api 服务自身 revision，Cloud Web 静态资源变更时允许它与 source commit 不同，不能把这种差异误判成 Cloud Web 未发布。默认只读取必要字段，禁止把完整 PipelineRun spec、Tekton 内联脚本、历史大对象或整份 CSS/HTML/JS 展开到默认输出；`apply` 先自动 fetch `/root/hwlab-v02-cicd.git` 并从 commit-pinned detached worktree 执行 render check，再经 `G14:k3s` server-side apply `tekton-v02/rbac.yaml`、`pipeline.yaml`、`argocd/project.yaml` 和 `argocd/application-v02.yaml`，confirmed apply 会删除遗留 v02 CronJob，但不会应用 runtime-v02 workload、Secret 或数据迁移。
 - `hwlab g14 control-plane trigger-current --lane v02 [--dry-run|--confirm]` 是 v02 标准手动触发入口：先自动 fetch `/root/hwlab-v02-cicd.git`，解析当前 `origin/v0.2` full SHA，创建 commit-pinned `hwlab-v02-ci-poll-<short12>` PipelineRun；读 Git 走 `git-mirror-http.devops-infra.svc.cluster.local`，GitOps promotion 写 `git-mirror-write.devops-infra.svc.cluster.local`；confirmed trigger 在删除/创建 PipelineRun 前会先按当前 source commit 在 G14 临时 detached worktree 中 render，再 server-side apply v02 Tekton RBAC、Pipeline 与 Argo Application，避免 CI/CD 脚本或 runtime-ready 逻辑已合并但集群仍执行旧 Pipeline 定义；该 render 不要求固定 `/root/hwlab-v02` 工作树 clean，也不得因 `.worktree/` 或其他并行未提交修改阻塞；同名 PipelineRun 成功或运行中时拒绝重复触发，失败或不存在时才删除旧对象并重新创建。
   创建 PipelineRun 前会读取 `devops-infra` mirror refs，若 `localV02` 未等于当前 source commit，则自动执行一次受控 manual `git-mirror sync` Job 并复核 ref，复核失败时停止触发，避免 Tekton `prepare-source` 已知失败；services 参数只包含 v02 runtime service matrix，`hwlab-cli` 是固定 repo 短连接源码工具，不进入 PipelineRun service build。
   `--dry-run` 只报告是否会 pre-sync，不创建 Job；confirmed trigger 默认创建 `.state/jobs/` 异步 job 并立刻返回 `job.id`、`statusCommand`、stdout/stderr 路径，避免 git mirror pre-sync 或 PipelineRun 创建期间长时间阻塞；`--wait` 路径也必须向 stderr 输出 `hwlab.v02.trigger.progress` JSON 事件，覆盖 `control-plane-refresh`、`git-mirror-pre-sync`、`delete-existing-pipelinerun` 和 `create-pipelinerun`，避免异步 job 长时间只有启动命令而无法判断卡点；默认 JSON 必须对 `manifest_b64`、长脚本和远端 stdout/stderr 做有界摘要，保留长度与 hash，最终 trigger 结果只返回阶段摘要和关键 tail，完整内容通过 job stdout/stderr 文件渐进披露；只有现场同步调试才显式加 `--wait`；旧 `rerun-current` 只作为输入别名保留。PipelineRun `Completed`、Argo `Synced/Healthy` 和 `webAssets.ok=true` 只证明 G14 runtime 已更新；交付收口还必须用 `hwlab g14 git-mirror status` 查看 `cache.summary.pendingFlush`，若为 true，继续执行受控 `hwlab g14 git-mirror flush --confirm` 并用 job status 轮询到 `pendingFlush=false`。
diff --git a/scripts/hwlab-g14-contract-test.ts b/scripts/hwlab-g14-contract-test.ts
index 1f7c6a72..9d9b2544 100644
--- a/scripts/hwlab-g14-contract-test.ts
+++ b/scripts/hwlab-g14-contract-test.ts
@@ -1,4 +1,4 @@
-import { gitMirrorFlushJobManifest, gitMirrorStatusSummary, gitMirrorSyncJobManifest, gitMirrorV02SyncRequirement, hwlabG14MonitorStateFileName, parseGitMirrorStatusRefs, parsePipelineTaskRunMetrics, rolloutRecordBody, semanticChangelogBullets, v02CommitAlignment, v02ControlPlaneRenderScript, v02FalseGreenGuard, v02PipelineServiceIds } from "./src/hwlab-g14";
+import { gitMirrorFlushJobManifest, gitMirrorStatusSummary, gitMirrorSyncJobManifest, gitMirrorV02SyncRequirement, hwlabG14MonitorStateFileName, parseGitMirrorStatusRefs, parsePipelineTaskRunMetrics, rolloutRecordBody, semanticChangelogBullets, v02CommitAlignment, v02ControlPlaneRenderScript, v02FalseGreenGuard, v02PipelineServiceIds, v02TaskRunPerformanceSummary } from "./src/hwlab-g14";
 
 function assertCondition(condition: unknown, message: string, detail: unknown = {}): void {
   if (!condition) throw new Error(`${message}: ${JSON.stringify(detail)}`);
@@ -184,6 +184,31 @@ assertCondition(
   falseGreenRuntimeMismatch,
 );
 
+const slowBuildSummary = v02TaskRunPerformanceSummary([ // fixture items carry no pipelineTask, so the build task is derived from the TaskRun name
+  {
+    name: "hwlab-v02-ci-poll-f8a090b66616-build-hwlab-agent-worker",
+    status: "True",
+    reason: "Succeeded",
+    durationSeconds: 234, // past both the 120s warning budget and the 180s critical threshold
+  },
+  {
+    name: "hwlab-v02-ci-poll-f8a090b66616-build-hwlab-cloud-web",
+    status: "True",
+    reason: "Succeeded",
+    durationSeconds: 37, // within budget; must not appear in slowTaskRuns
+  },
+]);
+const slowBuildItems = Array.isArray(record(slowBuildSummary).slowTaskRuns) ? record(slowBuildSummary).slowTaskRuns as Record<string, unknown>[] : [];
+assertCondition(
+  slowBuildSummary.ok === false
+    && record(record(slowBuildSummary).thresholds).buildTaskRunWarningSeconds === 120
+    && slowBuildItems.length === 1 // only the 234s agent-worker build is over budget
+    && slowBuildItems[0]?.serviceId === "hwlab-agent-worker"
+    && slowBuildItems[0]?.severity === "critical",
+  "v0.2 status must warn on slow build TaskRuns like issue #659",
+  slowBuildSummary,
+);
+
 const prBody = [
   "## 背景",
   "",
@@ -295,6 +320,7 @@ console.log(JSON.stringify({
     "v0.2 status alignment reports stale-success without coupling CI to dirty workspace state",
     "v0.2 PipelineRun service matrix excludes hwlab-cli",
     "v0.2 false-green guard checks build TaskRuns, runtime artifact source commits, and reuse provenance",
+    "v0.2 status warns on slow build TaskRuns",
     "rollout brief includes natural-language changelog before automatic diff summary",
     "semantic changelog extracts Chinese summary sections",
     "rollout brief includes lazy-build reused/rebuild metrics and service durations",
diff --git a/scripts/src/hwlab-g14.ts b/scripts/src/hwlab-g14.ts
index a8891777..f90c3044 100644
--- a/scripts/src/hwlab-g14.ts
+++ b/scripts/src/hwlab-g14.ts
@@ -58,6 +58,8 @@ const DEFAULT_MAX_CYCLES = 0;
 const DEFAULT_TIMEOUT_SECONDS = 1800;
 const G14_BRIEF_INDEX_ISSUE = 7;
 const BEIJING_OFFSET_MS = 8 * 60 * 60 * 1000;
+const V02_BUILD_TASKRUN_WARNING_SECONDS = 120; // seconds; build TaskRuns running longer than this are reported as slow
+const V02_BUILD_TASKRUN_CRITICAL_SECONDS = 180; // seconds; duration threshold for tagging a slow build TaskRun as "critical"
 
 interface G14MonitorOptions {
   intervalSeconds: number;
@@ -840,6 +842,7 @@ function taskRunsCompactFromText(text: string, commandOk: boolean, pipelineRun:
       stderr: stderr.trim().slice(0, 2000),
       counts: { succeeded: 0, failed: 0, running: 0, unknown: 0 },
       items: [],
+      performance: v02TaskRunPerformanceSummary([]),
     };
   }
   const items = text
@@ -863,16 +866,71 @@ function taskRunsCompactFromText(text: string, commandOk: boolean, pipelineRun:
     running: items.filter((item) => item.status === "Unknown").length,
     unknown: items.filter((item) => item.status !== "True" && item.status !== "False" && item.status !== "Unknown").length,
   };
+  const performance = v02TaskRunPerformanceSummary(items);
+  const performanceWarning = performance.ok === false ? `; ${String(performance.summary ?? "")}` : "";
   return {
     ok: true,
     pipelineRun,
     counts,
     items,
-    summary: `taskruns succeeded=${counts.succeeded} failed=${counts.failed} running=${counts.running} unknown=${counts.unknown}`,
+    performance,
+    summary: `taskruns succeeded=${counts.succeeded} failed=${counts.failed} running=${counts.running} unknown=${counts.unknown}${performanceWarning}`,
     disclosure: items.length > 0 ? "complete taskrun condition summary" : "no taskruns observed yet",
   };
 }
 
+/**
+ * Derives the pipeline task name from a TaskRun name: the text from the last "build-" segment
+ * onward, otherwise the first known task suffix the name ends with, otherwise null.
+ */
+function v02TaskRunPipelineTaskName(name: string): string | null {
+  const markerAt = name.lastIndexOf("-build-");
+  if (markerAt !== -1) return name.substring(markerAt + 1);
+  const suffixes = ["prepare-source", "plan-artifacts", "publish-artifact-catalog", "gitops-render", "gitops-promote", "runtime-ready", "collect-artifacts"];
+  for (const suffix of suffixes) {
+    if (name.endsWith(`-${suffix}`)) return suffix;
+  }
+  return null;
+}
+
+
+/**
+ * Lists build TaskRuns strictly above the v0.2 warning budget, slowest first ("critical" when strictly above
+ * the critical threshold). Missing or non-finite durations are skipped; NaN would otherwise pass the `<=` check.
+ */
+export function v02TaskRunPerformanceSummary(taskRuns: unknown[]): Record<string, unknown> {
+  const slowTaskRuns: Record<string, unknown>[] = [];
+  for (const itemRaw of taskRuns) {
+    const item = record(itemRaw);
+    const name = stringOrNull(item.name) ?? "";
+    const pipelineTask = stringOrNull(item.pipelineTask) ?? v02TaskRunPipelineTaskName(name);
+    const durationSeconds = typeof item.durationSeconds === "number" && Number.isFinite(item.durationSeconds) ? item.durationSeconds : null;
+    if (!pipelineTask?.startsWith("build-") || durationSeconds === null || durationSeconds <= V02_BUILD_TASKRUN_WARNING_SECONDS) continue;
+    slowTaskRuns.push({
+      name,
+      pipelineTask,
+      serviceId: pipelineTask.slice("build-".length) || null,
+      status: stringOrNull(item.status),
+      reason: stringOrNull(item.reason),
+      durationSeconds,
+      budgetSeconds: V02_BUILD_TASKRUN_WARNING_SECONDS,
+      severity: durationSeconds > V02_BUILD_TASKRUN_CRITICAL_SECONDS ? "critical" : "warning",
+      message: `${pipelineTask} took ${durationSeconds}s, above v0.2 build TaskRun warning budget ${V02_BUILD_TASKRUN_WARNING_SECONDS}s`,
+    });
+  }
+  slowTaskRuns.sort((left, right) => Number(right.durationSeconds ?? 0) - Number(left.durationSeconds ?? 0));
+  const worst = slowTaskRuns[0];
+  return {
+    ok: slowTaskRuns.length === 0,
+    warningCount: slowTaskRuns.length,
+    thresholds: { buildTaskRunWarningSeconds: V02_BUILD_TASKRUN_WARNING_SECONDS, buildTaskRunCriticalSeconds: V02_BUILD_TASKRUN_CRITICAL_SECONDS },
+    slowTaskRuns,
+    summary: worst
+      ? `slow build taskruns=${slowTaskRuns.length}; worst=${String(worst.pipelineTask)} ${String(worst.durationSeconds)}s budget=${V02_BUILD_TASKRUN_WARNING_SECONDS}s`
+      : `no build taskrun over ${V02_BUILD_TASKRUN_WARNING_SECONDS}s`,
+  };
+}
+
 function v02WebAssetsFromText(
   text: string,
   commandOk: boolean,