fix: improve hwlab v02 cli observability
This commit is contained in:
@@ -43,7 +43,7 @@ CI/CD、GitOps、rollout、artifact 发布、PR 合并后的 DEV/PROD 滚动、P
|
||||
- `artifact-registry plan|render|status|health|install|deploy-backend-core|deploy-service` 管理 D601 host-managed CNCF Distribution registry 的声明、安装、只读检查和 pull-only artifact CD。该 registry 固定为 D601 loopback `127.0.0.1:5000`,由 systemd + Docker Compose 管理,位于 native k3s 故障域外;`deploy-service` 只拉取 CI 已发布的 commit-pinned 镜像、retag/recreate 或导入 native k3s,并做 live commit 验证,不构建 runtime source。`deploy-backend-core` 是 deprecated 兼容名,标准 backend-core prod CD 入口是 `deploy apply --env prod --service backend-core`。长期规则见 `docs/reference/artifact-registry.md`。
|
||||
- `commander contract|plan --dry-run|smoke --dry-run|approval request --dry-run|prompt-lint --kind gpt55-pr` 是 host Codex 指挥官直管微服务 skeleton 入口。当前命令返回 `phase=source-contract`、service/API/state/bridge/prompt/trace/#20/#46/ClaudeQQ 审批边界、.state/commander/ 状态模型、dev 无 daemon smoke contract、dry-run 计划和 GPT-5.5 PR prompt 边界辅助 lint,不接 live bridge、不注入 prompt、不发送 ClaudeQQ。`approval request --dry-run` 会生成 200 字以内中文纯文本 ClaudeQQ 审批草案、`notification-path-unavailable` blocker 和授权后唯一可用的 `bun scripts/cli.ts microservice proxy claudeqq /api/push/text --method POST --body-json '<payload>' --raw` 命令;不得提示使用本机 ClaudeQQ skill、powershell 或本地 server。`prompt-lint` 支持 `--prompt-file` 与 `--stdin`,输出 `ok`、`missingClauses`、`riskLevel`、`suggestedPatchSnippet` 且不回显完整 prompt;它是 commander 辅助检查,不是业务 PR 门禁,也不改变 `codex submit` 默认行为。`plan`、`smoke` 与 `approval request` 必须带 `--dry-run`;缺少时返回 `error=dry-run-required`。长期规则见 `docs/reference/host-codex-commander.md`。
|
||||
- `hwlab g14 monitor-prs [--once] [--dry-run] [--interval-seconds N] [--max-cycles N] [--timeout-seconds N]` 是当前 HWLAB G14 PR -> CI/CD -> DEV rollout 的一行式入口。普通调用创建 `.state/jobs/` 异步 job 并立刻返回 `job.id`、`statusCommand` 和 stdout/stderr 路径;后台 worker 每轮通过 UniDesk `gh pr list/preflight/merge` 监控 `pikasTech/HWLAB` base=`G14` 的 open PR,ready 时合并,然后通过 UniDesk `ssh G14:k3s` 观察 `hwlab-g14-ci-poll-<short>`、Argo `hwlab-g14-dev` 和 DEV `/health/live`,直到 DEV `Synced/Healthy` 且 Deployment/StatefulSet ready;历史 `Completed` smoke/debug pod 不作为 rollout blocker。每次成功 DEV rollout 后,worker 会定位或创建 #7“指挥简报索引”中的北京日期每日简报 issue,并追加 CI/CD 耗时、CI/CD 关键指标、语义化上线 changelog、自动 diff 摘要、PipelineRun、GitOps revision 和 DEV 验证摘要;关键指标来自 G14 Tekton TaskRun results,固定包含 `lazy build reused: x/y`、reused services、rebuild services 和每个 service 的独立耗时/状态/backend,用于观察 lazy build 机制效果。语义化 changelog 优先从 PR body 的 `## 修改`/`## 变更`/`## Changelog` 等段落提取,diff 摘要只作为文件和统计证据保留,不替代 changelog。也可用 `hwlab g14 record-rollout --pr <number> --source-commit <sha>` 手动补记,手动补记同样会按 PipelineRun 采集 TaskRun 指标。状态指针按用途分离:长期监控只写 `.state/hwlab-g14/latest-monitor-job.json`,`--once` 写 `latest-once-job.json`,`--dry-run` 写 `latest-dry-run-job.json`,`--once --dry-run` 写 `latest-once-dry-run-job.json`,避免一次性收口覆盖持续监控入口。`--once --dry-run` 只做单轮监控和 merge plan,不写 GitHub、不等待 rollout。该命令禁止使用原生 `gh` 或手拼 GitHub 请求;如果 UniDesk `gh` 子命令字段或行为不够,必须先改进 `scripts/src/gh.ts` 后再使用。
|
||||
- `hwlab g14 control-plane status|apply --lane v02 [--dry-run|--confirm]` 是 HWLAB `v0.2` 加法 lane 的受控 Tekton/Argo 控制面维护入口,只面向 G14 `/root/hwlab-v02`、branch `v0.2`、namespace `hwlab-ci` 和 Argo application `hwlab-g14-v02`;`status` 只读汇总 pipeline、RBAC/ServiceAccount、Argo、当前 commit PipelineRun、最近 PipelineRun 摘要、活跃 PipelineRun 和遗留 v02 CronJob 清理状态,默认只读取必要字段,禁止把完整 PipelineRun spec、Tekton 内联脚本或历史大对象展开到默认输出;`apply` 先在 G14 workspace 快进并执行 render check,再经 `G14:k3s` server-side apply `tekton-v02/rbac.yaml`、`pipeline.yaml`、`argocd/project.yaml` 和 `argocd/application-v02.yaml`,confirmed apply 会删除遗留 v02 CronJob,但不会应用 runtime-v02 workload、Secret 或数据迁移。
|
||||
- `hwlab g14 control-plane status|apply --lane v02 [--dry-run|--confirm]` 是 HWLAB `v0.2` 加法 lane 的受控 Tekton/Argo 控制面维护入口,只面向 G14 `/root/hwlab-v02`、branch `v0.2`、namespace `hwlab-ci` 和 Argo application `hwlab-g14-v02`;`status` 只读汇总 pipeline、RBAC/ServiceAccount、Argo、当前 commit PipelineRun、当前 PipelineRun 的 TaskRun 条件摘要、最近 PipelineRun 摘要、活跃 PipelineRun、遗留 v02 CronJob 清理状态,以及 19666/19667 的 Cloud Web 静态资源和 API live 探针。`webAssets` 必须直接给出 `readonly-rpc` 删除、sidebar/workspace/event panel 关键 CSS、`/health/live` 和 API revision;Cloud Web 静态资源变更时允许 `apiRevision` 与 source commit 不同,但不得把这种差异误判成 19666 未发布。默认只读取必要字段,禁止把完整 PipelineRun spec、Tekton 内联脚本、历史大对象或整份 CSS/HTML 展开到默认输出;`apply` 先在 G14 workspace 快进并执行 render check,再经 `G14:k3s` server-side apply `tekton-v02/rbac.yaml`、`pipeline.yaml`、`argocd/project.yaml` 和 `argocd/application-v02.yaml`,confirmed apply 会删除遗留 v02 CronJob,但不会应用 runtime-v02 workload、Secret 或数据迁移。
|
||||
- `hwlab g14 control-plane trigger-current --lane v02 [--dry-run|--confirm]` 是 v02 标准手动触发入口:解析当前 `origin/v0.2` full SHA,创建 commit-pinned `hwlab-v02-ci-poll-<short12>` PipelineRun;读 Git 走 `git-mirror-http.devops-infra.svc.cluster.local`,GitOps promotion 写 `git-mirror-write.devops-infra.svc.cluster.local`;confirmed trigger 在删除/创建 PipelineRun 前会先按当前 source commit render 并 server-side apply v02 Tekton RBAC、Pipeline 与 Argo Application,避免 CI/CD 脚本或 runtime-ready 逻辑已合并但集群仍执行旧 Pipeline 定义;同名 PipelineRun 成功或运行中时拒绝重复触发,失败或不存在时才删除旧对象并重新创建。
|
||||
创建 PipelineRun 前会读取 `devops-infra` mirror refs,若 `localV02` 未等于当前 source commit,则自动执行一次受控 manual `git-mirror sync` Job 并复核 ref,复核失败时停止触发,避免 Tekton `prepare-source` 已知失败;services 参数只包含 v02 runtime service matrix,`hwlab-cli` 是固定 repo 短连接源码工具,不进入 PipelineRun service build。
|
||||
`--dry-run` 只报告是否会 pre-sync,不创建 Job;confirmed trigger 默认创建 `.state/jobs/` 异步 job 并立刻返回 `job.id`、`statusCommand`、stdout/stderr 路径,避免 git mirror pre-sync 或 PipelineRun 创建期间长时间阻塞;`--wait` 路径也必须向 stderr 输出 `hwlab.v02.trigger.progress` JSON 事件,覆盖 `control-plane-refresh`、`git-mirror-pre-sync`、`delete-existing-pipelinerun` 和 `create-pipelinerun`,避免异步 job 长时间只有启动命令而无法判断卡点;默认 JSON 必须对 `manifest_b64`、长脚本和远端 stdout/stderr 做有界摘要,保留长度与 hash,最终 trigger 结果只返回阶段摘要和关键 tail,完整内容通过 job stdout/stderr 文件渐进披露;只有现场同步调试才显式加 `--wait`;旧 `rerun-current` 只作为输入别名保留。
|
||||
@@ -88,7 +88,7 @@ CI/CD、GitOps、rollout、artifact 发布、PR 合并后的 DEV/PROD 滚动、P
|
||||
- `codex interrupt|cancel <taskId>` 通过 Code Queue 私有代理请求中断;running/judging 任务会请求 D601 当前 agent run 停止,queued/retry_wait 任务的取消也必须保持与 WebUI 相同代理路径,返回有界 task 摘要和后续查询命令。任何需要接触 active run 的动作仍属于 D601 执行面。
|
||||
- Code Queue 多队列 lane 由 `codex` 命令命名空间管理:`queues [--full|--all] [--limit N] [--page N|--offset N]` 列表、`queue create <queueId>` 创建、`queue merge <sourceQueueId> --into <targetQueueId>` 合并、`move <taskId> --queue <queueId>` 迁移;这些队列管理入口默认由主 server `code-queue-mgr` 直管 PostgreSQL,仍通过稳定 `code-queue` 用户服务代理路径访问。`codex queues` 默认只返回 active/nonempty/unread/runnable queue 摘要、activity、commanderConcurrency、全局 counts 和 execution diagnostics;`--full` 或 `--all` 只切换为完整队列行视图的一页,仍受 `--limit`/`--page`/`--offset` 分页约束,不再默认携带 deprecated full array。summary 和 full 的稳定机读路径都是 `.data.queues.items[]`,全局元数据固定在 `.data.queues.commanderConcurrency`、`.data.queues.activity`、`.data.queues.counts`、`.data.queues.executionDiagnostics`、`.data.queues.activeTaskIds` 和 `.data.queues.queuedTaskIds`;需要完整 upstream 时使用输出中的 raw command。`commanderConcurrency.activeRunnerCount` / `activity.effectiveActiveTaskCount` 是指挥官并发判断的有效活跃数,`schedulerLocalActiveQueueCount`/`activeQueueIds` 只描述本地 scheduler active-run slots,不能覆盖数据库 running 计数或 heartbeat-fresh runner 计数。旧 full 顶层数组语义已作为 deprecated 兼容信息记录,不再作为 `.data.queues` 主形态。同一个 queue 内部串行执行,不同 queue 之间并行执行。迁移只允许尚未被 scheduler claim 的 `queued`/`retry_wait` 任务,必须满足 `startedAt=null`、`currentAttempt=0` 且没有 active thread/turn;已进入 `running`/`judging` 或已有 claim 标记的任务返回 409,不得被 move/merge 回写成 queued。合并会移动可迁移任务归属并自动删除源 queue 记录,只保留合并后的目标 queue;若 source 或 target queue 存在 active/claimed 任务,合并整体返回 409。合并后的目标 queue 按任务原 `queueEnteredAt`/`createdAt` 时间顺序串行,成功迁移 queued/retry_wait 任务后由 D601 scheduler 轮询推进。
|
||||
- 所有 `codex` 查询和管理命令必须走与 WebUI 相同的 backend-core 私有代理路径 `/api/microservices/code-queue/proxy/...`;CLI 不得为了提交、移动、中断、取消或队列管理直接调用 D601 内部 Service、数据库、pod curl 或 k3sctl scheduler 子服务。若该路径失败,应先修复 CLI/backend/provider tunnel 链路,而不是绕过控制面。
|
||||
- `job list [--limit N] [--include-command]` 与 `job status <jobId|latest> [--tail-bytes N]` 查询 `.state/jobs/` 文件系统状态,是异步命令的可观测入口。`job list` 默认只返回最新 50 条摘要;`job status` 默认只返回 stdout/stderr 末尾 12000 字节,并带 `tailPolicy` 与完整日志路径。
|
||||
- `job list [--limit N] [--include-command]` 与 `job status <jobId|latest> [--tail-bytes N]` 查询 `.state/jobs/` 文件系统状态,是异步命令的可观测入口。`job list` 默认只返回最新 50 条摘要,并为已知异步工作流返回轻量 `progress.summary` 与后续查询命令;`job status` 默认返回结构化 `progress`、stdout/stderr 末尾 12000 字节、`tailPolicy` 与完整日志路径。已知工作流应从有界日志尾部抽取阶段、关键对象名和下一步命令,避免为了判断当前阶段而手工打开完整 stdout/stderr。
|
||||
- `debug health`、`debug dispatch` 与 `debug task` 走真实内部 core、WebSocket、数据库、provider、系统指标、Docker 状态和 Host SSH 维护桥流程,只用于开发调试,不写入 `TEST.md` 的正式验收步骤。
|
||||
- `e2e run [--only pattern[,pattern...]] [--skip pattern[,pattern...]]` 使用 publicHost 派生的公开 production frontend/dev frontend/provider ingress URL,并通过 Docker 内网验证 core API、PostgreSQL、provider self-connection、系统指标曲线、Docker 状态快照、provider.upgrade 预检和 Playwright 前端页面,是交付前的自动化 E2E 门禁;CLI 默认输出 check 状态摘要,完整诊断写入 `resultPath`,日常迭代应优先用 `--only` / `--skip` 跑最小必要集合。
|
||||
|
||||
|
||||
@@ -4,7 +4,7 @@ UniDesk 的可观测性优先级高于静默成功。CLI、服务日志、Docker
|
||||
|
||||
## CLI Logs
|
||||
|
||||
异步 job 的 stdout 和 stderr 位于 `.state/jobs/`。`job list` 默认只返回最新 50 条摘要;`job status` 会返回有限尾部,避免输出爆炸,同时保留完整日志文件路径便于继续排查。实现必须只读取日志尾部字节,不得先把完整 job 日志读入 CLI 内存。
|
||||
异步 job 的 stdout 和 stderr 位于 `.state/jobs/`。`job list` 默认只返回最新 50 条摘要,并为已知异步工作流返回轻量 `progress.summary`;`job status` 会返回结构化 `progress` 与有限尾部,避免输出爆炸,同时保留完整日志文件路径便于继续排查。实现必须只读取日志尾部字节,不得先把完整 job 日志读入 CLI 内存;长时命令的阶段、关键对象名和下一步查询命令应优先沉淀到 `progress`,不能要求调用者先阅读完整日志才能知道是否卡在提交、构建、发布或观测阶段。
|
||||
|
||||
## Service Logs
|
||||
|
||||
|
||||
+3
-3
@@ -80,8 +80,8 @@ export function rootHelp(): unknown {
|
||||
{ command: "codex steer-confirm <taskId> --steer-id <id> [--raw]", description: "Read-only lookup for a steerId in task trace so deliveryUnconfirmed can be resolved without resending the corrective prompt." },
|
||||
{ command: "codex interrupt|cancel <taskId>", description: "Request interrupt for a running Code Queue task, or cancel a queued/retry_wait task, through the same private proxy." },
|
||||
{ command: "codex (queues [--full|--all] | queue create <queueId> | queue merge <sourceQueueId> --into <targetQueueId> | move <taskId> --queue <queueId>)", description: "List low-noise queue summaries by default, including effective activity counts that distinguish scheduler-local queues, DB running tasks, and heartbeat-fresh runners; full queue rows require --full/--all." },
|
||||
{ command: "job list [--limit N] [--include-command]", description: "List async jobs from .state/jobs with a bounded default page." },
|
||||
{ command: "job status <jobId|latest> [--tail-bytes N]", description: "Show job state with bounded stdout/stderr tails." },
|
||||
{ command: "job list [--limit N] [--include-command]", description: "List async jobs from .state/jobs with a bounded default page and progress summaries." },
|
||||
{ command: "job status <jobId|latest> [--tail-bytes N]", description: "Show job state with a structured progress summary and bounded stdout/stderr tails." },
|
||||
{ command: "debug health", description: "Probe internal core, nodes, system/Docker status, frontend, provider ingress, and public boundary." },
|
||||
{ command: "debug dispatch [providerId] [docker.ps|provider.upgrade|host.ssh|microservice.http|echo] [--wait-ms N]", description: "Submit a real internal-core dispatch request for CLI debugging." },
|
||||
{ command: "debug task <taskId|latest>", description: "Read a dispatched task record from internal core for CLI debugging." },
|
||||
@@ -481,7 +481,7 @@ function jobHelp(): unknown {
|
||||
"bun scripts/cli.ts job list [--limit N] [--include-command]",
|
||||
"bun scripts/cli.ts job status <jobId|latest> [--tail-bytes N]",
|
||||
],
|
||||
description: "Inspect fire-and-forget job state from .state/jobs without streaming unbounded logs.",
|
||||
description: "Inspect fire-and-forget job state from .state/jobs with structured progress summaries and bounded log tails.",
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
@@ -43,6 +43,8 @@ const V02_SERVICE_IDS = [
|
||||
"hwlab-edge-proxy",
|
||||
"hwlab-agent-skills",
|
||||
];
|
||||
// Cloud Web base URL (port 19666); the webAssets probe script fetches `/` and `/styles.css` from it.
const V02_CLOUD_WEB_URL = "http://74.48.78.17:19666";
|
||||
// Cloud API base URL (port 19667); the webAssets probe script fetches `/health/live` from it.
const V02_CLOUD_API_URL = "http://74.48.78.17:19667";
|
||||
|
||||
export function v02PipelineServiceIds(): string[] {
|
||||
return [...V02_SERVICE_IDS];
|
||||
@@ -712,11 +714,152 @@ function v02ControlPlaneStatusBundle(sourceCommit: string | null | undefined): C
|
||||
`section obsoleteCronJobs kubectl get cronjob -n ${shellQuote(CI_NAMESPACE)} ${shellQuote(V02_POLLER)} ${shellQuote(V02_RECONCILER)} --ignore-not-found -o name`,
|
||||
`section argo kubectl get application -n ${shellQuote(ARGO_NAMESPACE)} ${shellQuote(V02_APP)} -o 'jsonpath={.spec.source.targetRevision}{"\\n"}{.spec.source.path}{"\\n"}{.status.sync.revision}{"\\n"}{.status.sync.status}{"\\n"}{.status.health.status}{"\\n"}'`,
|
||||
`if [ -n "$pipeline_run" ]; then section pipelineRun kubectl get pipelinerun -n ${shellQuote(CI_NAMESPACE)} "$pipeline_run" -o 'jsonpath={.status.conditions[0].status}{"\\n"}{.status.conditions[0].reason}{"\\n"}{.status.conditions[0].message}{"\\n"}'; else section pipelineRun sh -c 'true'; fi`,
|
||||
`if [ -n "$pipeline_run" ]; then section taskRuns kubectl get taskrun -n ${shellQuote(CI_NAMESPACE)} -l "tekton.dev/pipelineRun=$pipeline_run" -o 'jsonpath={range .items[*]}{.metadata.name}{"\\t"}{.status.conditions[0].status}{"\\t"}{.status.conditions[0].reason}{"\\t"}{.status.startTime}{"\\t"}{.status.completionTime}{"\\n"}{end}'; else section taskRuns sh -c 'true'; fi`,
|
||||
`section recentPipelineRuns kubectl get pipelinerun -n ${shellQuote(CI_NAMESPACE)} -l hwlab.pikastech.local/gitops-target=v02 -o ${shellQuote(pipelineRunRowsJsonPath())}`,
|
||||
`section webAssets sh -c ${shellQuote(v02WebAssetsProbeScript())}`,
|
||||
].join("\n");
|
||||
return g14K3s(["script", "--", script], 60_000);
|
||||
}
|
||||
|
||||
/**
 * Builds the shell script executed by the `webAssets` status section.
 *
 * The script never aborts on a failed probe (`set +e`). It prints one
 * `<field>\t<value>` line per probe: the two base URLs, the exit status of each
 * fetch (HTML root, `styles.css`, API `/health/live`), the exit status of each
 * `grep` over the fetched body, and the `revision` value extracted from the
 * health payload. Fetches use `curl` when available, fall back to `wget`, and
 * return 127 when neither exists.
 *
 * @returns The newline-joined script text.
 */
function v02WebAssetsProbeScript(): string {
  // Stores a fetched body in `<variable>` and reports the fetch exit status as `<field>`.
  const fetchProbe = (variable: string, url: string, field: string): string[] => [
    `${variable}=$(fetch_url "${url}" 2>/dev/null)`,
    `${variable}_code=$?`,
    `printf '${field}\\t%s\\n' "$${variable}_code"`,
  ];
  // Reports the exit status of grepping `<variable>` for `pattern` as `<field>`.
  const grepProbe = (field: string, variable: string, flags: string, pattern: string): string =>
    `printf '${field}\\t%s\\n' "$(printf '%s' "$${variable}" | grep ${flags} '${pattern}'; printf '%s' "$?")"`;

  const preamble = [
    "set +e",
    `base=${shellQuote(V02_CLOUD_WEB_URL)}`,
    `api=${shellQuote(V02_CLOUD_API_URL)}`,
  ];
  const fetchHelper = [
    "fetch_url() {",
    " if command -v curl >/dev/null 2>&1; then",
    " curl -fsS --connect-timeout 2 --max-time 5 \"$1\"",
    " elif command -v wget >/dev/null 2>&1; then",
    " wget -q -T 5 -O - \"$1\"",
    " else",
    " return 127",
    " fi",
    "}",
  ];
  const urlReport = [
    "printf 'baseUrl\\t%s\\n' \"$base\"",
    "printf 'apiUrl\\t%s\\n' \"$api\"",
  ];
  const htmlProbes = [
    ...fetchProbe("html", "$base/", "htmlOk"),
    grepProbe("readonlyNote", "html", "-Eiq", "readonly-rpc|复核入口"),
  ];
  const cssProbes = [
    ...fetchProbe("css", "$base/styles.css", "cssOk"),
    grepProbe("sidebarFitCss", "css", "-Eq", "grid-template-rows:[[:space:]]*auto[[:space:]]+auto[[:space:]]+auto[[:space:]]+auto[[:space:]]+minmax\\(0,[[:space:]]*1fr\\)"),
    grepProbe("workspaceFitCss", "css", "-Eq", "grid-template-rows:[[:space:]]*auto[[:space:]]+minmax\\(0,[[:space:]]*1fr\\)"),
    grepProbe("eventPanelFitCss", "css", "-Eq", "grid-template-rows:[[:space:]]*auto[[:space:]]+minmax\\(132px,[[:space:]]*1fr\\)"),
  ];
  const apiProbes = [
    ...fetchProbe("health", "$api/health/live", "apiHealthOk"),
    "printf 'apiRevision\\t%s\\n' \"$(printf '%s' \"$health\" | sed -n 's/.*\"revision\"[[:space:]]*:[[:space:]]*\"\\([0-9A-Za-z._-]*\\)\".*/\\1/p' | head -1)\"",
  ];

  return [...preamble, ...fetchHelper, ...urlReport, ...htmlProbes, ...cssProbes, ...apiProbes].join("\n");
}
|
||||
|
||||
/**
 * Condenses tab-separated TaskRun rows into a bounded status summary.
 *
 * Each non-blank line is read as `name\tstatus\treason\tstartTime\tcompletionTime`;
 * missing columns become `null` in the item. Rows are tallied by condition
 * status: `True` is succeeded, `False` is failed, `Unknown` is running, and
 * anything else is unknown.
 *
 * @param text Raw stdout of the taskRuns section.
 * @param commandOk Whether the section command succeeded.
 * @param pipelineRun PipelineRun name echoed back in the result.
 * @param exitCode Section exit code, reported only on failure.
 * @param stderr Stderr text; trimmed and capped at 2000 characters on failure.
 * @returns A failure record when `commandOk` is false, otherwise counts, items and a one-line summary.
 */
function taskRunsCompactFromText(text: string, commandOk: boolean, pipelineRun: string | null, exitCode: number | null, stderr: string): Record<string, unknown> {
  const counts = { succeeded: 0, failed: 0, running: 0, unknown: 0 };
  if (!commandOk) {
    return {
      ok: false,
      pipelineRun,
      exitCode,
      stderr: stderr.trim().slice(0, 2000),
      counts,
      items: [],
    };
  }

  const items: Array<Record<string, unknown>> = [];
  for (const rawLine of text.split(/\r?\n/u)) {
    const row = rawLine.trim();
    if (row.length === 0) continue;
    const [name = "", status = "", reason = "", startTime = "", completionTime = ""] = row.split("\t");
    items.push({
      name,
      status: status || null,
      reason: reason || null,
      startTime: startTime || null,
      completionTime: completionTime || null,
      durationSeconds: secondsBetween(startTime, completionTime),
    });
    switch (status) {
      case "True":
        counts.succeeded += 1;
        break;
      case "False":
        counts.failed += 1;
        break;
      case "Unknown":
        counts.running += 1;
        break;
      default:
        counts.unknown += 1;
    }
  }

  const disclosure = items.length > 0 ? "complete taskrun condition summary" : "no taskruns observed yet";
  return {
    ok: true,
    pipelineRun,
    counts,
    items,
    summary: `taskruns succeeded=${counts.succeeded} failed=${counts.failed} running=${counts.running} unknown=${counts.unknown}`,
    disclosure,
  };
}
|
||||
|
||||
/**
 * Interprets the `<field>\t<value>` lines printed by the webAssets probe script.
 *
 * A fetch or CSS grep check passes when its field is `"0"`. `readonlyNoteAbsent`
 * passes when the grep status is `"1"`, i.e. the pattern was not found. The
 * overall `ok` requires the section command to have succeeded and every check
 * to pass.
 *
 * @param text Raw stdout of the webAssets section.
 * @param commandOk Whether the section command succeeded.
 * @param sourceCommit Source commit echoed back and compared with the API revision.
 * @param argoSyncRevision Argo sync revision echoed back (empty becomes `null`).
 * @param exitCode Section exit code, echoed back.
 * @param stderr Stderr text; reported (trimmed, capped at 2000 characters) only when the command failed.
 * @returns Verdict, summary, per-check booleans, raw probe exit codes, and the API revision with an explanatory note when it differs from `sourceCommit`.
 */
function v02WebAssetsFromText(text: string, commandOk: boolean, sourceCommit: string | null, argoSyncRevision: string | null, exitCode: number | null, stderr: string): Record<string, unknown> {
  const fields: Record<string, string> = {};
  text.split(/\r?\n/u).forEach((line) => {
    // Key is everything before the first tab; the value keeps any later tabs.
    const tabAt = line.indexOf("\t");
    const key = tabAt === -1 ? line : line.slice(0, tabAt);
    if (key.length === 0) return;
    fields[key] = tabAt === -1 ? "" : line.slice(tabAt + 1);
  });

  const exitedZero = (name: string): boolean => fields[name] === "0";
  const checks = {
    htmlOk: exitedZero("htmlOk"),
    cssOk: exitedZero("cssOk"),
    readonlyNoteAbsent: fields.readonlyNote === "1",
    sidebarFitCss: exitedZero("sidebarFitCss"),
    workspaceFitCss: exitedZero("workspaceFitCss"),
    eventPanelFitCss: exitedZero("eventPanelFitCss"),
    apiHealthOk: exitedZero("apiHealthOk"),
  };
  const failedChecks = Object.entries(checks)
    .filter(([, passed]) => !passed)
    .map(([name]) => name);
  const ok = commandOk && failedChecks.length === 0;

  const apiRevision = fields.apiRevision || null;
  const revisionDiffers = Boolean(apiRevision && sourceCommit && apiRevision !== sourceCommit);

  return {
    ok,
    summary: ok ? "19666/19667 probes passed" : `19666/19667 probe issues: ${failedChecks.join(", ") || "command failed"}`,
    baseUrl: fields.baseUrl || V02_CLOUD_WEB_URL,
    apiUrl: fields.apiUrl || V02_CLOUD_API_URL,
    sourceCommit,
    argoSyncRevision: argoSyncRevision || null,
    checks,
    probeExitCodes: {
      html: numericField(fields.htmlOk),
      css: numericField(fields.cssOk),
      readonlyNoteGrep: numericField(fields.readonlyNote),
      sidebarFitCssGrep: numericField(fields.sidebarFitCss),
      workspaceFitCssGrep: numericField(fields.workspaceFitCss),
      eventPanelFitCssGrep: numericField(fields.eventPanelFitCss),
      apiHealth: numericField(fields.apiHealthOk),
    },
    apiRevision,
    note: revisionDiffers
      ? "cloud-api image revision can differ when a change only republishes Cloud Web static assets; use webAssets.checks for 19666 frontend asset readiness."
      : null,
    exitCode,
    stderr: commandOk ? "" : stderr.trim().slice(0, 2000),
  };
}
|
||||
|
||||
/**
 * Converts a probe field to a finite number.
 *
 * @param value Raw field text, or `undefined` when the field was not reported.
 * @returns The parsed number, or `null` for missing, blank, or non-finite input.
 */
function numericField(value: string | undefined): number | null {
  if (value === undefined) return null;
  if (value.trim() === "") return null;
  const candidate = Number(value);
  return Number.isFinite(candidate) ? candidate : null;
}
|
||||
|
||||
function listV02PipelineRunsCompactFromText(text: string, commandOk: boolean, command: string[] | string, exitCode: number | null, stderr: string, limit = 8, nowMs = Date.now()): Record<string, unknown> {
|
||||
if (!commandOk) {
|
||||
return {
|
||||
@@ -1184,6 +1327,8 @@ function v02ControlPlaneStatus(sourceCommitInput?: string | null): Record<string
|
||||
const obsoleteCronJobs = sections.obsoleteCronJobs;
|
||||
const argo = sections.argo;
|
||||
const pipelineRunSection = sections.pipelineRun;
|
||||
const taskRunsSection = sections.taskRuns;
|
||||
const webAssetsSection = sections.webAssets;
|
||||
const recentPipelineRuns = listV02PipelineRunsCompactFromText(
|
||||
sections.recentPipelineRuns?.stdout ?? "",
|
||||
shellSectionOk(sections.recentPipelineRuns),
|
||||
@@ -1237,6 +1382,21 @@ function v02ControlPlaneStatus(sourceCommitInput?: string | null): Record<string
|
||||
pipelineRunSection?.exitCode ?? null,
|
||||
bundle.stderr,
|
||||
),
|
||||
taskRuns: taskRunsCompactFromText(
|
||||
taskRunsSection?.stdout ?? "",
|
||||
shellSectionOk(taskRunsSection),
|
||||
pipelineRun,
|
||||
taskRunsSection?.exitCode ?? null,
|
||||
bundle.stderr,
|
||||
),
|
||||
webAssets: v02WebAssetsFromText(
|
||||
webAssetsSection?.stdout ?? "",
|
||||
shellSectionOk(webAssetsSection),
|
||||
sourceCommit,
|
||||
syncRevision,
|
||||
webAssetsSection?.exitCode ?? null,
|
||||
bundle.stderr,
|
||||
),
|
||||
activePipelineRuns,
|
||||
recentPipelineRuns,
|
||||
query: {
|
||||
|
||||
+117
-2
@@ -24,6 +24,19 @@ export interface JobRecord {
|
||||
note: string;
|
||||
}
|
||||
|
||||
/** Structured, bounded progress digest attached to job list and job status output. */
export interface JobProgressSummary {
  /** `"hwlab-v02-trigger"` when trigger progress events were seen or the job is the v02 trigger workflow; otherwise `"generic"`. */
  kind: "hwlab-v02-trigger" | "generic";
  /** `stage` of the last progress event, or `null` when none was observed. */
  stage: string | null;
  /** `status` of the last progress event, or `null` when none was observed. */
  stageStatus: string | null;
  /** Source commit from the last event, else the first one matched in the stdout tail. */
  sourceCommit: string | null;
  /** PipelineRun name from the last event, else the first one matched in the stdout tail. */
  pipelineRun: string | null;
  /** `true` once stdout reports the PipelineRun as created, `false` when the create stage failed, `null` otherwise. */
  pipelineCreated: boolean | null;
  /** `at` value of the last progress event. */
  lastEventAt: string | null;
  /** Number of matching progress events found in the inspected stderr tail. */
  eventsObserved: number;
  /** One-line human-readable digest. */
  summary: string;
  /** Suggested follow-up command, or `null` when there is none. */
  nextCommand: string | null;
}
|
||||
|
||||
export interface StartJobOptions {
|
||||
runner?: "local" | "docker";
|
||||
dockerImage?: string;
|
||||
@@ -142,6 +155,7 @@ export async function runJob(id: string): Promise<JobRecord> {
|
||||
}
|
||||
|
||||
export function jobWithTail(job: JobRecord, maxBytes = 12000): JobRecord & {
|
||||
progress: JobProgressSummary;
|
||||
tailPolicy: {
|
||||
requestedTailBytes: number;
|
||||
stdoutBytes: number;
|
||||
@@ -155,8 +169,12 @@ export function jobWithTail(job: JobRecord, maxBytes = 12000): JobRecord & {
|
||||
} {
|
||||
const stdoutBytes = existsSync(job.stdoutFile) ? statSync(job.stdoutFile).size : 0;
|
||||
const stderrBytes = existsSync(job.stderrFile) ? statSync(job.stderrFile).size : 0;
|
||||
const progressTailBytes = Math.max(maxBytes, 96_000);
|
||||
const stdoutProgressTail = tailFile(job.stdoutFile, progressTailBytes);
|
||||
const stderrProgressTail = tailFile(job.stderrFile, progressTailBytes);
|
||||
return {
|
||||
...job,
|
||||
progress: summarizeJobProgress(job, progressTailBytes, { stdoutTail: stdoutProgressTail, stderrTail: stderrProgressTail }),
|
||||
tailPolicy: {
|
||||
requestedTailBytes: maxBytes,
|
||||
stdoutBytes,
|
||||
@@ -165,11 +183,107 @@ export function jobWithTail(job: JobRecord, maxBytes = 12000): JobRecord & {
|
||||
stderrTruncated: stderrBytes > maxBytes,
|
||||
fullLogPaths: { stdoutFile: job.stdoutFile, stderrFile: job.stderrFile },
|
||||
},
|
||||
stdoutTail: tailFile(job.stdoutFile, maxBytes),
|
||||
stderrTail: tailFile(job.stderrFile, maxBytes),
|
||||
stdoutTail: tailTextByBytes(stdoutProgressTail, maxBytes),
|
||||
stderrTail: tailTextByBytes(stderrProgressTail, maxBytes),
|
||||
};
|
||||
}
|
||||
|
||||
/**
 * Derives a structured progress digest for a job from bounded log tails.
 *
 * Jobs that are not the v02 trigger workflow and come without pre-read tails
 * get the generic digest without touching their log files. Otherwise the
 * stderr tail is scanned for `hwlab.v02.trigger.progress` JSON lines and the
 * last one supplies stage, status and timestamps. The stdout tail is the
 * fallback source for the source commit and PipelineRun name, and the place
 * where PipelineRun creation is detected.
 *
 * @param job Job record to summarize.
 * @param maxBytes Tail size to read when `tails` is not supplied (at least 4096 bytes are read).
 * @param tails Pre-read stdout/stderr tails; when given, the log files are not read again.
 * @returns The progress digest for the job.
 */
function summarizeJobProgress(job: JobRecord, maxBytes = 96_000, tails?: { stdoutTail: string; stderrTail: string }): JobProgressSummary {
  const isV02Trigger = job.name === "hwlab_g14_v02_trigger_current";
  if (tails === undefined && !isV02Trigger) return genericJobProgress(job);

  const readBytes = Math.max(4096, Math.floor(maxBytes));
  const stderrText = tails?.stderrTail ?? tailFile(job.stderrFile, readBytes);
  const stdoutText = tails?.stdoutTail ?? tailFile(job.stdoutFile, readBytes);

  const events = parseJsonLineEvents(stderrText, "hwlab.v02.trigger.progress");
  const latest = events.at(-1) ?? {};
  const stage = stringField(latest.stage);
  const stageStatus = stringField(latest.status);
  const sourceCommit = stringField(latest.sourceCommit) ?? firstMatch(stdoutText, /"sourceCommit"\s*:\s*"([0-9a-f]{40})"/iu);
  const pipelineRun = stringField(latest.pipelineRun) ?? firstMatch(stdoutText, /"pipelineRun"\s*:\s*"([^"]+)"/u);
  const lastEventAt = stringField(latest.at);

  // A kubectl "created" line in stdout wins over a failed create stage event.
  let pipelineCreated: boolean | null = null;
  if (/pipelinerun\.tekton\.dev\/[^ \n]+ created/u.test(stdoutText)) {
    pipelineCreated = true;
  } else if (stage === "create-pipelinerun" && stageStatus === "failed") {
    pipelineCreated = false;
  }

  const kind: JobProgressSummary["kind"] = events.length > 0 || isV02Trigger ? "hwlab-v02-trigger" : "generic";

  let nextCommand: string | null = null;
  if (pipelineRun) {
    nextCommand = `bun scripts/cli.ts hwlab g14 control-plane status --lane v02`;
  } else if (job.status === "running") {
    nextCommand = `bun scripts/cli.ts job status ${job.id} --tail-bytes 12000`;
  }

  let summary: string;
  if (kind === "hwlab-v02-trigger") {
    const stageLabel = stage === null ? "stage:unknown" : stageStatus === null ? stage : `${stage}:${stageStatus}`;
    const creationLabel = pipelineCreated === null ? null : pipelineCreated ? "created" : "create-failed";
    summary = [
      job.status,
      stageLabel,
      sourceCommit ? `source=${sourceCommit.slice(0, 12)}` : null,
      pipelineRun ? `pipelineRun=${pipelineRun}` : null,
      creationLabel,
    ].filter(Boolean).join(" ");
  } else {
    summary = `${job.status}${job.exitCode === null ? "" : ` exit=${job.exitCode}`}`;
  }

  return {
    kind,
    stage,
    stageStatus,
    sourceCommit,
    pipelineRun,
    pipelineCreated,
    lastEventAt,
    eventsObserved: events.length,
    summary,
    nextCommand,
  };
}
|
||||
|
||||
/**
 * Builds the progress digest for a job with no recognized workflow.
 *
 * @param job Job record to summarize.
 * @returns A `"generic"` digest whose summary is the job status plus the exit code when one is recorded, with a follow-up `job status` command only while the job is running.
 */
function genericJobProgress(job: JobRecord): JobProgressSummary {
  const exitSuffix = job.exitCode === null ? "" : ` exit=${job.exitCode}`;
  const stillRunning = job.status === "running";
  return {
    kind: "generic",
    stage: null,
    stageStatus: null,
    sourceCommit: null,
    pipelineRun: null,
    pipelineCreated: null,
    lastEventAt: null,
    eventsObserved: 0,
    summary: `${job.status}${exitSuffix}`,
    nextCommand: stillRunning ? `bun scripts/cli.ts job status ${job.id} --tail-bytes 12000` : null,
  };
}
|
||||
|
||||
/**
 * Returns the trailing part of `text` that fits within `maxBytes` of UTF-8.
 *
 * The cut is moved forward to the next character boundary so the result never
 * starts in the middle of a multi-byte sequence (which would decode as U+FFFD).
 * The result may therefore be up to three bytes shorter than `maxBytes`.
 *
 * @param text Text to take the tail of.
 * @param maxBytes Byte budget; floored, and values at or below zero yield `""`.
 * @returns `text` unchanged when it already fits, otherwise its longest whole-character tail within the budget.
 */
function tailTextByBytes(text: string, maxBytes: number): string {
  const safeMaxBytes = Math.max(0, Math.floor(maxBytes));
  if (safeMaxBytes === 0) return "";
  const buffer = Buffer.from(text, "utf8");
  if (buffer.length <= safeMaxBytes) return text;
  let start = buffer.length - safeMaxBytes;
  // UTF-8 continuation bytes look like 0b10xxxxxx; skip them to land on a lead byte.
  while (start < buffer.length && ((buffer[start] ?? 0) & 0xc0) === 0x80) start += 1;
  return buffer.subarray(start).toString("utf8");
}
|
||||
|
||||
/**
 * Extracts JSON-line events with a given `event` name from log text.
 *
 * Only lines whose trimmed form starts with `{` are parsed. Lines that are not
 * valid JSON, are not plain objects, or carry a different `event` value are
 * skipped silently; the raw text stays available to the caller.
 *
 * @param text Log text, one candidate event per line.
 * @param eventName Required value of the `event` property.
 * @returns Matching parsed objects in their original order.
 */
function parseJsonLineEvents(text: string, eventName: string): Record<string, unknown>[] {
  const isNamedEvent = (value: unknown): value is Record<string, unknown> =>
    typeof value === "object"
    && value !== null
    && !Array.isArray(value)
    && (value as Record<string, unknown>).event === eventName;

  return text.split(/\r?\n/u).flatMap((rawLine) => {
    const candidate = rawLine.trim();
    if (!candidate.startsWith("{")) return [];
    try {
      const decoded: unknown = JSON.parse(candidate);
      return isNamedEvent(decoded) ? [decoded] : [];
    } catch {
      // Non-JSON lines are expected in stderr; they are simply not events.
      return [];
    }
  });
}
|
||||
|
||||
/**
 * Normalizes an untyped value to a trimmed, non-empty string.
 *
 * @param value Any value, typically a property of a parsed JSON event.
 * @returns The trimmed string, or `null` when the value is not a string or is blank.
 */
function stringField(value: unknown): string | null {
  if (typeof value !== "string") return null;
  const trimmed = value.trim();
  return trimmed.length === 0 ? null : trimmed;
}
|
||||
|
||||
/**
 * Returns the first capture group of `pattern` in `text`.
 *
 * @param text Text to search.
 * @param pattern Regular expression with at least one capture group.
 * @returns The captured text, or `null` when there is no match or the first group is missing or empty.
 */
function firstMatch(text: string, pattern: RegExp): string | null {
  const captured = pattern.exec(text)?.[1];
  return captured ? captured : null;
}
|
||||
|
||||
export interface JobListOptions {
|
||||
limit?: number;
|
||||
includeCommand?: boolean;
|
||||
@@ -182,6 +296,7 @@ export function listJobsSummary(options: JobListOptions = {}): unknown {
|
||||
id: job.id,
|
||||
name: job.name,
|
||||
status: job.status,
|
||||
progress: summarizeJobProgress(job, 32_000),
|
||||
runner: job.runner,
|
||||
runnerPid: job.runnerPid ?? null,
|
||||
runnerContainer: job.runnerContainer ?? null,
|
||||
|
||||
Reference in New Issue
Block a user