From a37d2b1374b2f00a56079162649a1e063038eee2 Mon Sep 17 00:00:00 2001 From: Codex Date: Sun, 31 May 2026 09:36:50 +0000 Subject: [PATCH] fix: improve hwlab v02 cli observability --- docs/reference/cli.md | 4 +- docs/reference/observability.md | 2 +- scripts/src/help.ts | 6 +- scripts/src/hwlab-g14.ts | 160 ++++++++++++++++++++++++++++++++ scripts/src/jobs.ts | 119 +++++++++++++++++++++++- 5 files changed, 283 insertions(+), 8 deletions(-) diff --git a/docs/reference/cli.md b/docs/reference/cli.md index d13395aa..74719d36 100644 --- a/docs/reference/cli.md +++ b/docs/reference/cli.md @@ -43,7 +43,7 @@ CI/CD、GitOps、rollout、artifact 发布、PR 合并后的 DEV/PROD 滚动、P - `artifact-registry plan|render|status|health|install|deploy-backend-core|deploy-service` 管理 D601 host-managed CNCF Distribution registry 的声明、安装、只读检查和 pull-only artifact CD。该 registry 固定为 D601 loopback `127.0.0.1:5000`,由 systemd + Docker Compose 管理,位于 native k3s 故障域外;`deploy-service` 只拉取 CI 已发布的 commit-pinned 镜像、retag/recreate 或导入 native k3s,并做 live commit 验证,不构建 runtime source。`deploy-backend-core` 是 deprecated 兼容名,标准 backend-core prod CD 入口是 `deploy apply --env prod --service backend-core`。长期规则见 `docs/reference/artifact-registry.md`。 - `commander contract|plan --dry-run|smoke --dry-run|approval request --dry-run|prompt-lint --kind gpt55-pr` 是 host Codex 指挥官直管微服务 skeleton 入口。当前命令返回 `phase=source-contract`、service/API/state/bridge/prompt/trace/#20/#46/ClaudeQQ 审批边界、.state/commander/ 状态模型、dev 无 daemon smoke contract、dry-run 计划和 GPT-5.5 PR prompt 边界辅助 lint,不接 live bridge、不注入 prompt、不发送 ClaudeQQ。`approval request --dry-run` 会生成 200 字以内中文纯文本 ClaudeQQ 审批草案、`notification-path-unavailable` blocker 和授权后唯一可用的 `bun scripts/cli.ts microservice proxy claudeqq /api/push/text --method POST --body-json '' --raw` 命令;不得提示使用本机 ClaudeQQ skill、powershell 或本地 server。`prompt-lint` 支持 `--prompt-file` 与 `--stdin`,输出 `ok`、`missingClauses`、`riskLevel`、`suggestedPatchSnippet` 且不回显完整 prompt;它是 commander 辅助检查,不是业务 PR 门禁,也不改变 `codex submit` 默认行为。`plan`、`smoke` 与 `approval request` 必须带 `--dry-run`;缺少时返回 `error=dry-run-required`。长期规则见 `docs/reference/host-codex-commander.md`。 - `hwlab g14 monitor-prs [--once] [--dry-run] [--interval-seconds N] [--max-cycles N] [--timeout-seconds N]` 是当前 HWLAB G14 PR -> CI/CD -> DEV rollout 的一行式入口。普通调用创建 `.state/jobs/` 异步 job 并立刻返回 `job.id`、`statusCommand` 和 stdout/stderr 路径;后台 worker 每轮通过 UniDesk `gh pr list/preflight/merge` 监控 `pikasTech/HWLAB` base=`G14` 的 open PR,ready 时合并,然后通过 UniDesk `ssh G14:k3s` 观察 `hwlab-g14-ci-poll-`、Argo `hwlab-g14-dev` 和 DEV `/health/live`,直到 DEV `Synced/Healthy` 且 Deployment/StatefulSet ready;历史 `Completed` smoke/debug pod 不作为 rollout blocker。每次成功 DEV rollout 后,worker 会定位或创建 #7“指挥简报索引”中的北京日期每日简报 issue,并追加 CI/CD 耗时、CI/CD 关键指标、语义化上线 changelog、自动 diff 摘要、PipelineRun、GitOps revision 和 DEV 验证摘要;关键指标来自 G14 Tekton TaskRun results,固定包含 `lazy build reused: x/y`、reused services、rebuild services 和每个 service 的独立耗时/状态/backend,用于观察 lazy build 机制效果。语义化 changelog 优先从 PR body 的 `## 修改`/`## 变更`/`## Changelog` 等段落提取,diff 摘要只作为文件和统计证据保留,不替代 changelog。也可用 `hwlab g14 record-rollout --pr --source-commit ` 手动补记,手动补记同样会按 PipelineRun 采集 TaskRun 指标。状态指针按用途分离:长期监控只写 `.state/hwlab-g14/latest-monitor-job.json`,`--once` 写 `latest-once-job.json`,`--dry-run` 写 `latest-dry-run-job.json`,`--once --dry-run` 写 `latest-once-dry-run-job.json`,避免一次性收口覆盖持续监控入口。`--once --dry-run` 只做单轮监控和 merge plan,不写 GitHub、不等待 rollout。该命令禁止使用原生 `gh` 或手拼 GitHub 请求;如果 UniDesk `gh` 子命令字段或行为不够,必须先改进 `scripts/src/gh.ts` 后再使用。 -- `hwlab g14 control-plane status|apply --lane v02 [--dry-run|--confirm]` 是 HWLAB `v0.2` 加法 lane 的受控 Tekton/Argo 控制面维护入口,只面向 G14 `/root/hwlab-v02`、branch `v0.2`、namespace `hwlab-ci` 和 Argo application `hwlab-g14-v02`;`status` 只读汇总 pipeline、RBAC/ServiceAccount、Argo、当前 commit PipelineRun、最近 PipelineRun 摘要、活跃 PipelineRun 和遗留 v02 CronJob 清理状态,默认只读取必要字段,禁止把完整 PipelineRun spec、Tekton 内联脚本或历史大对象展开到默认输出;`apply` 先在 G14 workspace 快进并执行 render check,再经 `G14:k3s` server-side apply `tekton-v02/rbac.yaml`、`pipeline.yaml`、`argocd/project.yaml` 和 `argocd/application-v02.yaml`,confirmed apply 会删除遗留 v02 CronJob,但不会应用 runtime-v02 workload、Secret 或数据迁移。 +- `hwlab g14 control-plane status|apply --lane v02 [--dry-run|--confirm]` 是 HWLAB `v0.2` 加法 lane 的受控 Tekton/Argo 控制面维护入口,只面向 G14 `/root/hwlab-v02`、branch `v0.2`、namespace `hwlab-ci` 和 Argo application `hwlab-g14-v02`;`status` 只读汇总 pipeline、RBAC/ServiceAccount、Argo、当前 commit PipelineRun、当前 PipelineRun 的 TaskRun 条件摘要、最近 PipelineRun 摘要、活跃 PipelineRun、遗留 v02 CronJob 清理状态,以及 19666/19667 的 Cloud Web 静态资源和 API live 探针。`webAssets` 必须直接给出 `readonly-rpc` 删除、sidebar/workspace/event panel 关键 CSS、`/health/live` 和 API revision;Cloud Web 静态资源变更时允许 `apiRevision` 与 source commit 不同,但不得把这种差异误判成 19666 未发布。默认只读取必要字段,禁止把完整 PipelineRun spec、Tekton 内联脚本、历史大对象或整份 CSS/HTML 展开到默认输出;`apply` 先在 G14 workspace 快进并执行 render check,再经 `G14:k3s` server-side apply `tekton-v02/rbac.yaml`、`pipeline.yaml`、`argocd/project.yaml` 和 `argocd/application-v02.yaml`,confirmed apply 会删除遗留 v02 CronJob,但不会应用 runtime-v02 workload、Secret 或数据迁移。 - `hwlab g14 control-plane trigger-current --lane v02 [--dry-run|--confirm]` 是 v02 标准手动触发入口:解析当前 `origin/v0.2` full SHA,创建 commit-pinned `hwlab-v02-ci-poll-` PipelineRun;读 Git 走 `git-mirror-http.devops-infra.svc.cluster.local`,GitOps promotion 写 `git-mirror-write.devops-infra.svc.cluster.local`;confirmed trigger 在删除/创建 PipelineRun 前会先按当前 source commit render 并 server-side apply v02 Tekton RBAC、Pipeline 与 Argo Application,避免 CI/CD 脚本或 runtime-ready 逻辑已合并但集群仍执行旧 Pipeline 定义;同名 PipelineRun 成功或运行中时拒绝重复触发,失败或不存在时才删除旧对象并重新创建。 创建 PipelineRun 前会读取 `devops-infra` mirror refs,若 `localV02` 未等于当前 source commit,则自动执行一次受控 manual `git-mirror sync` Job 并复核 ref,复核失败时停止触发,避免 Tekton `prepare-source` 已知失败;services 参数只包含 v02 runtime service matrix,`hwlab-cli` 是固定 repo 短连接源码工具,不进入 PipelineRun service build。 `--dry-run` 只报告是否会 pre-sync,不创建 Job;confirmed trigger 默认创建 `.state/jobs/` 异步 job 并立刻返回 `job.id`、`statusCommand`、stdout/stderr 路径,避免 git mirror pre-sync 或 PipelineRun 创建期间长时间阻塞;`--wait` 路径也必须向 stderr 输出 `hwlab.v02.trigger.progress` JSON 事件,覆盖 `control-plane-refresh`、`git-mirror-pre-sync`、`delete-existing-pipelinerun` 和 `create-pipelinerun`,避免异步 job 长时间只有启动命令而无法判断卡点;默认 JSON 必须对 `manifest_b64`、长脚本和远端 stdout/stderr 做有界摘要,保留长度与 hash,最终 trigger 结果只返回阶段摘要和关键 tail,完整内容通过 job stdout/stderr 文件渐进披露;只有现场同步调试才显式加 `--wait`;旧 `rerun-current` 只作为输入别名保留。 @@ -88,7 +88,7 @@ CI/CD、GitOps、rollout、artifact 发布、PR 合并后的 DEV/PROD 滚动、P - `codex interrupt|cancel ` 通过 Code Queue 私有代理请求中断;running/judging 任务会请求 D601 当前 agent run 停止,queued/retry_wait 任务的取消也必须保持与 WebUI 相同代理路径,返回有界 task 摘要和后续查询命令。任何需要接触 active run 的动作仍属于 D601 执行面。 - Code Queue 多队列 lane 由 `codex` 命令命名空间管理:`queues [--full|--all] [--limit N] [--page N|--offset N]` 列表、`queue create ` 创建、`queue merge --into ` 合并、`move --queue ` 迁移;这些队列管理入口默认由主 server `code-queue-mgr` 直管 PostgreSQL,仍通过稳定 `code-queue` 用户服务代理路径访问。`codex queues` 默认只返回 active/nonempty/unread/runnable queue 摘要、activity、commanderConcurrency、全局 counts 和 execution diagnostics;`--full` 或 `--all` 只切换为完整队列行视图的一页,仍受 `--limit`/`--page`/`--offset` 分页约束,不再默认携带 deprecated full array。summary 和 full 的稳定机读路径都是 `.data.queues.items[]`,全局元数据固定在 `.data.queues.commanderConcurrency`、`.data.queues.activity`、`.data.queues.counts`、`.data.queues.executionDiagnostics`、`.data.queues.activeTaskIds` 和 `.data.queues.queuedTaskIds`;需要完整 upstream 时使用输出中的 raw command。`commanderConcurrency.activeRunnerCount` / `activity.effectiveActiveTaskCount` 是指挥官并发判断的有效活跃数,`schedulerLocalActiveQueueCount`/`activeQueueIds` 只描述本地 scheduler active-run slots,不能覆盖数据库 running 计数或 heartbeat-fresh runner 计数。旧 full 顶层数组语义已作为 deprecated 兼容信息记录,不再作为 `.data.queues` 主形态。同一个 queue 内部串行执行,不同 queue 之间并行执行。迁移只允许尚未被 scheduler claim 的 `queued`/`retry_wait` 任务,必须满足 `startedAt=null`、`currentAttempt=0` 且没有 active thread/turn;已进入 `running`/`judging` 或已有 claim 标记的任务返回 409,不得被 move/merge 回写成 queued。合并会移动可迁移任务归属并自动删除源 queue 记录,只保留合并后的目标 queue;若 source 或 target queue 存在 active/claimed 任务,合并整体返回 409。合并后的目标 queue 按任务原 `queueEnteredAt`/`createdAt` 时间顺序串行,成功迁移 queued/retry_wait 任务后由 D601 scheduler 轮询推进。 - 所有 `codex` 查询和管理命令必须走与 WebUI 相同的 backend-core 私有代理路径 `/api/microservices/code-queue/proxy/...`;CLI 不得为了提交、移动、中断、取消或队列管理直接调用 D601 内部 Service、数据库、pod curl 或 k3sctl scheduler 子服务。若该路径失败,应先修复 CLI/backend/provider tunnel 链路,而不是绕过控制面。 -- `job list [--limit N] [--include-command]` 与 `job status [--tail-bytes N]` 查询 `.state/jobs/` 文件系统状态,是异步命令的可观测入口。`job list` 默认只返回最新 50 条摘要;`job status` 默认只返回 stdout/stderr 末尾 12000 字节,并带 `tailPolicy` 与完整日志路径。 +- `job list [--limit N] [--include-command]` 与 `job status [--tail-bytes N]` 查询 `.state/jobs/` 文件系统状态,是异步命令的可观测入口。`job list` 默认只返回最新 50 条摘要,并为已知异步工作流返回轻量 `progress.summary` 与后续查询命令;`job status` 默认返回结构化 `progress`、stdout/stderr 末尾 12000 字节、`tailPolicy` 与完整日志路径。已知工作流应从有界日志尾部抽取阶段、关键对象名和下一步命令,避免为了判断当前阶段而手工打开完整 stdout/stderr。 - `debug health`、`debug dispatch` 与 `debug task` 走真实内部 core、WebSocket、数据库、provider、系统指标、Docker 状态和 Host SSH 维护桥流程,只用于开发调试,不写入 `TEST.md` 的正式验收步骤。 - `e2e run [--only pattern[,pattern...]] [--skip pattern[,pattern...]]` 使用 publicHost 派生的公开 production frontend/dev frontend/provider ingress URL,并通过 Docker 内网验证 core API、PostgreSQL、provider self-connection、系统指标曲线、Docker 状态快照、provider.upgrade 预检和 Playwright 前端页面,是交付前的自动化 E2E 门禁;CLI 默认输出 check 状态摘要,完整诊断写入 `resultPath`,日常迭代应优先用 `--only` / `--skip` 跑最小必要集合。 diff --git a/docs/reference/observability.md b/docs/reference/observability.md index ed439723..bbe9ea65 100644 --- a/docs/reference/observability.md +++ b/docs/reference/observability.md @@ -4,7 +4,7 @@ UniDesk 的可观测性优先级高于静默成功。CLI、服务日志、Docker ## CLI Logs -异步 job 的 stdout 和 stderr 位于 `.state/jobs/`。`job list` 默认只返回最新 50 条摘要;`job status` 会返回有限尾部,避免输出爆炸,同时保留完整日志文件路径便于继续排查。实现必须只读取日志尾部字节,不得先把完整 job 日志读入 CLI 内存。 +异步 job 的 stdout 和 stderr 位于 `.state/jobs/`。`job list` 默认只返回最新 50 条摘要,并为已知异步工作流返回轻量 `progress.summary`;`job status` 会返回结构化 `progress` 与有限尾部,避免输出爆炸,同时保留完整日志文件路径便于继续排查。实现必须只读取日志尾部字节,不得先把完整 job 日志读入 CLI 内存;长时命令的阶段、关键对象名和下一步查询命令应优先沉淀到 `progress`,不能要求调用者先阅读完整日志才能知道是否卡在提交、构建、发布或观测阶段。 ## Service Logs diff --git a/scripts/src/help.ts b/scripts/src/help.ts index e6785d46..ba6e5d5c 100644 --- a/scripts/src/help.ts +++ b/scripts/src/help.ts @@ -80,8 +80,8 @@ export function rootHelp(): unknown { { command: "codex steer-confirm --steer-id [--raw]", description: "Read-only lookup for a steerId in task trace so deliveryUnconfirmed can be resolved without resending the corrective prompt." }, { command: "codex interrupt|cancel ", description: "Request interrupt for a running Code Queue task, or cancel a queued/retry_wait task, through the same private proxy." }, { command: "codex (queues [--full|--all] | queue create | queue merge --into | move --queue )", description: "List low-noise queue summaries by default, including effective activity counts that distinguish scheduler-local queues, DB running tasks, and heartbeat-fresh runners; full queue rows require --full/--all." }, - { command: "job list [--limit N] [--include-command]", description: "List async jobs from .state/jobs with a bounded default page." }, - { command: "job status [--tail-bytes N]", description: "Show job state with bounded stdout/stderr tails." }, + { command: "job list [--limit N] [--include-command]", description: "List async jobs from .state/jobs with a bounded default page and progress summaries." }, + { command: "job status [--tail-bytes N]", description: "Show job state with a structured progress summary and bounded stdout/stderr tails." }, { command: "debug health", description: "Probe internal core, nodes, system/Docker status, frontend, provider ingress, and public boundary." }, { command: "debug dispatch [providerId] [docker.ps|provider.upgrade|host.ssh|microservice.http|echo] [--wait-ms N]", description: "Submit a real internal-core dispatch request for CLI debugging." }, { command: "debug task ", description: "Read a dispatched task record from internal core for CLI debugging." }, @@ -481,7 +481,7 @@ function jobHelp(): unknown { "bun scripts/cli.ts job list [--limit N] [--include-command]", "bun scripts/cli.ts job status [--tail-bytes N]", ], - description: "Inspect fire-and-forget job state from .state/jobs without streaming unbounded logs.", + description: "Inspect fire-and-forget job state from .state/jobs with structured progress summaries and bounded log tails.", }; } diff --git a/scripts/src/hwlab-g14.ts b/scripts/src/hwlab-g14.ts index 63577d01..07d6004f 100644 --- a/scripts/src/hwlab-g14.ts +++ b/scripts/src/hwlab-g14.ts @@ -43,6 +43,8 @@ const V02_SERVICE_IDS = [ "hwlab-edge-proxy", "hwlab-agent-skills", ]; +const V02_CLOUD_WEB_URL = "http://74.48.78.17:19666"; +const V02_CLOUD_API_URL = "http://74.48.78.17:19667"; export function v02PipelineServiceIds(): string[] { return [...V02_SERVICE_IDS]; @@ -712,11 +714,152 @@ function v02ControlPlaneStatusBundle(sourceCommit: string | null | undefined): C `section obsoleteCronJobs kubectl get cronjob -n ${shellQuote(CI_NAMESPACE)} ${shellQuote(V02_POLLER)} ${shellQuote(V02_RECONCILER)} --ignore-not-found -o name`, `section argo kubectl get application -n ${shellQuote(ARGO_NAMESPACE)} ${shellQuote(V02_APP)} -o 'jsonpath={.spec.source.targetRevision}{"\\n"}{.spec.source.path}{"\\n"}{.status.sync.revision}{"\\n"}{.status.sync.status}{"\\n"}{.status.health.status}{"\\n"}'`, `if [ -n "$pipeline_run" ]; then section pipelineRun kubectl get pipelinerun -n ${shellQuote(CI_NAMESPACE)} "$pipeline_run" -o 'jsonpath={.status.conditions[0].status}{"\\n"}{.status.conditions[0].reason}{"\\n"}{.status.conditions[0].message}{"\\n"}'; else section pipelineRun sh -c 'true'; fi`, + `if [ -n "$pipeline_run" ]; then section taskRuns kubectl get taskrun -n ${shellQuote(CI_NAMESPACE)} -l "tekton.dev/pipelineRun=$pipeline_run" -o 'jsonpath={range .items[*]}{.metadata.name}{"\\t"}{.status.conditions[0].status}{"\\t"}{.status.conditions[0].reason}{"\\t"}{.status.startTime}{"\\t"}{.status.completionTime}{"\\n"}{end}'; else section taskRuns sh -c 'true'; fi`, `section recentPipelineRuns kubectl get pipelinerun -n ${shellQuote(CI_NAMESPACE)} -l hwlab.pikastech.local/gitops-target=v02 -o ${shellQuote(pipelineRunRowsJsonPath())}`, + `section webAssets sh -c ${shellQuote(v02WebAssetsProbeScript())}`, ].join("\n"); return g14K3s(["script", "--", script], 60_000); } +function v02WebAssetsProbeScript(): string { + return [ + "set +e", + `base=${shellQuote(V02_CLOUD_WEB_URL)}`, + `api=${shellQuote(V02_CLOUD_API_URL)}`, + "fetch_url() {", + " if command -v curl >/dev/null 2>&1; then", + " curl -fsS --connect-timeout 2 --max-time 5 \"$1\"", + " elif command -v wget >/dev/null 2>&1; then", + " wget -q -T 5 -O - \"$1\"", + " else", + " return 127", + " fi", + "}", + "printf 'baseUrl\\t%s\\n' \"$base\"", + "printf 'apiUrl\\t%s\\n' \"$api\"", + "html=$(fetch_url \"$base/\" 2>/dev/null)", + "html_code=$?", + "printf 'htmlOk\\t%s\\n' \"$html_code\"", + "printf 'readonlyNote\\t%s\\n' \"$(printf '%s' \"$html\" | grep -Eiq 'readonly-rpc|复核入口'; printf '%s' \"$?\")\"", + "css=$(fetch_url \"$base/styles.css\" 2>/dev/null)", + "css_code=$?", + "printf 'cssOk\\t%s\\n' \"$css_code\"", + "printf 'sidebarFitCss\\t%s\\n' \"$(printf '%s' \"$css\" | grep -Eq 'grid-template-rows:[[:space:]]*auto[[:space:]]+auto[[:space:]]+auto[[:space:]]+auto[[:space:]]+minmax\\(0,[[:space:]]*1fr\\)'; printf '%s' \"$?\")\"", + "printf 'workspaceFitCss\\t%s\\n' \"$(printf '%s' \"$css\" | grep -Eq 'grid-template-rows:[[:space:]]*auto[[:space:]]+minmax\\(0,[[:space:]]*1fr\\)'; printf '%s' \"$?\")\"", + "printf 'eventPanelFitCss\\t%s\\n' \"$(printf '%s' \"$css\" | grep -Eq 'grid-template-rows:[[:space:]]*auto[[:space:]]+minmax\\(132px,[[:space:]]*1fr\\)'; printf '%s' \"$?\")\"", + "health=$(fetch_url \"$api/health/live\" 2>/dev/null)", + "health_code=$?", + "printf 'apiHealthOk\\t%s\\n' \"$health_code\"", + "printf 'apiRevision\\t%s\\n' \"$(printf '%s' \"$health\" | sed -n 's/.*\"revision\"[[:space:]]*:[[:space:]]*\"\\([0-9A-Za-z._-]*\\)\".*/\\1/p' | head -1)\"", + ].join("\n"); +} + +function taskRunsCompactFromText(text: string, commandOk: boolean, pipelineRun: string | null, exitCode: number | null, stderr: string): Record { + if (!commandOk) { + return { + ok: false, + pipelineRun, + exitCode, + stderr: stderr.trim().slice(0, 2000), + counts: { succeeded: 0, failed: 0, running: 0, unknown: 0 }, + items: [], + }; + } + const items = text + .split(/\r?\n/u) + .map((line) => line.trim()) + .filter(Boolean) + .map((line) => { + const [name = "", status = "", reason = "", startTime = "", completionTime = ""] = line.split("\t"); + return { + name, + status: status || null, + reason: reason || null, + startTime: startTime || null, + completionTime: completionTime || null, + durationSeconds: secondsBetween(startTime, completionTime), + }; + }); + const counts = { + succeeded: items.filter((item) => item.status === "True").length, + failed: items.filter((item) => item.status === "False").length, + running: items.filter((item) => item.status === "Unknown").length, + unknown: items.filter((item) => item.status !== "True" && item.status !== "False" && item.status !== "Unknown").length, + }; + return { + ok: true, + pipelineRun, + counts, + items, + summary: `taskruns succeeded=${counts.succeeded} failed=${counts.failed} running=${counts.running} unknown=${counts.unknown}`, + disclosure: items.length > 0 ? "complete taskrun condition summary" : "no taskruns observed yet", + }; +} + +function v02WebAssetsFromText(text: string, commandOk: boolean, sourceCommit: string | null, argoSyncRevision: string | null, exitCode: number | null, stderr: string): Record { + const fields: Record = {}; + for (const line of text.split(/\r?\n/u)) { + const [key = "", ...rest] = line.split("\t"); + if (key.length > 0) fields[key] = rest.join("\t"); + } + const htmlOk = fields.htmlOk === "0"; + const cssOk = fields.cssOk === "0"; + const apiHealthOk = fields.apiHealthOk === "0"; + const readonlyNoteAbsent = fields.readonlyNote === "1"; + const sidebarFitCss = fields.sidebarFitCss === "0"; + const workspaceFitCss = fields.workspaceFitCss === "0"; + const eventPanelFitCss = fields.eventPanelFitCss === "0"; + const apiRevision = fields.apiRevision || null; + const webChecksPass = htmlOk && cssOk && readonlyNoteAbsent && sidebarFitCss && workspaceFitCss && eventPanelFitCss && apiHealthOk; + const failedChecks = Object.entries({ + htmlOk, + cssOk, + readonlyNoteAbsent, + sidebarFitCss, + workspaceFitCss, + eventPanelFitCss, + apiHealthOk, + }).filter(([, ok]) => !ok).map(([name]) => name); + return { + ok: commandOk && webChecksPass, + summary: commandOk && webChecksPass ? "19666/19667 probes passed" : `19666/19667 probe issues: ${failedChecks.join(", ") || "command failed"}`, + baseUrl: fields.baseUrl || V02_CLOUD_WEB_URL, + apiUrl: fields.apiUrl || V02_CLOUD_API_URL, + sourceCommit, + argoSyncRevision: argoSyncRevision || null, + checks: { + htmlOk, + cssOk, + readonlyNoteAbsent, + sidebarFitCss, + workspaceFitCss, + eventPanelFitCss, + apiHealthOk, + }, + probeExitCodes: { + html: numericField(fields.htmlOk), + css: numericField(fields.cssOk), + readonlyNoteGrep: numericField(fields.readonlyNote), + sidebarFitCssGrep: numericField(fields.sidebarFitCss), + workspaceFitCssGrep: numericField(fields.workspaceFitCss), + eventPanelFitCssGrep: numericField(fields.eventPanelFitCss), + apiHealth: numericField(fields.apiHealthOk), + }, + apiRevision, + note: apiRevision && sourceCommit && apiRevision !== sourceCommit + ? "cloud-api image revision can differ when a change only republishes Cloud Web static assets; use webAssets.checks for 19666 frontend asset readiness." + : null, + exitCode, + stderr: commandOk ? "" : stderr.trim().slice(0, 2000), + }; +} + +function numericField(value: string | undefined): number | null { + if (value === undefined || value.trim().length === 0) return null; + const parsed = Number(value); + return Number.isFinite(parsed) ? parsed : null; +} + function listV02PipelineRunsCompactFromText(text: string, commandOk: boolean, command: string[] | string, exitCode: number | null, stderr: string, limit = 8, nowMs = Date.now()): Record { if (!commandOk) { return { @@ -1184,6 +1327,8 @@ function v02ControlPlaneStatus(sourceCommitInput?: string | null): Record { } export function jobWithTail(job: JobRecord, maxBytes = 12000): JobRecord & { + progress: JobProgressSummary; tailPolicy: { requestedTailBytes: number; stdoutBytes: number; @@ -155,8 +169,12 @@ export function jobWithTail(job: JobRecord, maxBytes = 12000): JobRecord & { } { const stdoutBytes = existsSync(job.stdoutFile) ? statSync(job.stdoutFile).size : 0; const stderrBytes = existsSync(job.stderrFile) ? statSync(job.stderrFile).size : 0; + const progressTailBytes = Math.max(maxBytes, 96_000); + const stdoutProgressTail = tailFile(job.stdoutFile, progressTailBytes); + const stderrProgressTail = tailFile(job.stderrFile, progressTailBytes); return { ...job, + progress: summarizeJobProgress(job, progressTailBytes, { stdoutTail: stdoutProgressTail, stderrTail: stderrProgressTail }), tailPolicy: { requestedTailBytes: maxBytes, stdoutBytes, @@ -165,11 +183,107 @@ export function jobWithTail(job: JobRecord, maxBytes = 12000): JobRecord & { stderrTruncated: stderrBytes > maxBytes, fullLogPaths: { stdoutFile: job.stdoutFile, stderrFile: job.stderrFile }, }, - stdoutTail: tailFile(job.stdoutFile, maxBytes), - stderrTail: tailFile(job.stderrFile, maxBytes), + stdoutTail: tailTextByBytes(stdoutProgressTail, maxBytes), + stderrTail: tailTextByBytes(stderrProgressTail, maxBytes), }; } +function summarizeJobProgress(job: JobRecord, maxBytes = 96_000, tails?: { stdoutTail: string; stderrTail: string }): JobProgressSummary { + const knownWorkflow = job.name === "hwlab_g14_v02_trigger_current"; + if (!knownWorkflow && tails === undefined) return genericJobProgress(job); + const progressTailBytes = Math.max(4096, Math.floor(maxBytes)); + const stderrTail = tails?.stderrTail ?? tailFile(job.stderrFile, progressTailBytes); + const stdoutTail = tails?.stdoutTail ?? tailFile(job.stdoutFile, progressTailBytes); + const events = parseJsonLineEvents(stderrTail, "hwlab.v02.trigger.progress"); + const lastEvent = events.at(-1) ?? {}; + const stage = stringField(lastEvent.stage); + const stageStatus = stringField(lastEvent.status); + const sourceCommit = stringField(lastEvent.sourceCommit) ?? firstMatch(stdoutTail, /"sourceCommit"\s*:\s*"([0-9a-f]{40})"/iu); + const pipelineRun = stringField(lastEvent.pipelineRun) ?? firstMatch(stdoutTail, /"pipelineRun"\s*:\s*"([^"]+)"/u); + const pipelineCreated = /pipelinerun\.tekton\.dev\/[^ \n]+ created/u.test(stdoutTail) + ? true + : stage === "create-pipelinerun" && stageStatus === "failed" + ? false + : null; + const lastEventAt = stringField(lastEvent.at); + const kind = events.length > 0 || knownWorkflow ? "hwlab-v02-trigger" : "generic"; + const nextCommand = pipelineRun + ? `bun scripts/cli.ts hwlab g14 control-plane status --lane v02` + : job.status === "running" + ? `bun scripts/cli.ts job status ${job.id} --tail-bytes 12000` + : null; + const summary = kind === "hwlab-v02-trigger" + ? [ + job.status, + stage ? `${stage}${stageStatus ? `:${stageStatus}` : ""}` : "stage:unknown", + sourceCommit ? `source=${sourceCommit.slice(0, 12)}` : null, + pipelineRun ? `pipelineRun=${pipelineRun}` : null, + pipelineCreated === true ? "created" : pipelineCreated === false ? "create-failed" : null, + ].filter(Boolean).join(" ") + : `${job.status}${job.exitCode === null ? "" : ` exit=${job.exitCode}`}`; + return { + kind, + stage, + stageStatus, + sourceCommit, + pipelineRun, + pipelineCreated, + lastEventAt, + eventsObserved: events.length, + summary, + nextCommand, + }; +} + +function genericJobProgress(job: JobRecord): JobProgressSummary { + return { + kind: "generic", + stage: null, + stageStatus: null, + sourceCommit: null, + pipelineRun: null, + pipelineCreated: null, + lastEventAt: null, + eventsObserved: 0, + summary: `${job.status}${job.exitCode === null ? "" : ` exit=${job.exitCode}`}`, + nextCommand: job.status === "running" ? `bun scripts/cli.ts job status ${job.id} --tail-bytes 12000` : null, + }; +} + +function tailTextByBytes(text: string, maxBytes: number): string { + const safeMaxBytes = Math.max(0, Math.floor(maxBytes)); + if (safeMaxBytes === 0) return ""; + const buffer = Buffer.from(text, "utf8"); + if (buffer.length <= safeMaxBytes) return text; + return buffer.subarray(buffer.length - safeMaxBytes).toString("utf8"); +} + +function parseJsonLineEvents(text: string, eventName: string): Record[] { + const events: Record[] = []; + for (const line of text.split(/\r?\n/u)) { + const trimmed = line.trim(); + if (!trimmed.startsWith("{")) continue; + try { + const parsed = JSON.parse(trimmed) as unknown; + if (typeof parsed === "object" && parsed !== null && !Array.isArray(parsed) && (parsed as Record).event === eventName) { + events.push(parsed as Record); + } + } catch { + // Ignore non-JSON stderr lines; the raw tail remains available in stderrTail. + } + } + return events; +} + +function stringField(value: unknown): string | null { + return typeof value === "string" && value.trim().length > 0 ? value.trim() : null; +} + +function firstMatch(text: string, pattern: RegExp): string | null { + const match = pattern.exec(text); + return typeof match?.[1] === "string" && match[1].length > 0 ? match[1] : null; +} + export interface JobListOptions { limit?: number; includeCommand?: boolean; @@ -182,6 +296,7 @@ export function listJobsSummary(options: JobListOptions = {}): unknown { id: job.id, name: job.name, status: job.status, + progress: summarizeJobProgress(job, 32_000), runner: job.runner, runnerPid: job.runnerPid ?? null, runnerContainer: job.runnerContainer ?? null,