From 2a4f6d7791445e7dfcd8cd69a0b985e1da4c6f2c Mon Sep 17 00:00:00 2001 From: Codex Date: Tue, 2 Jun 2026 06:40:27 +0000 Subject: [PATCH] fix: improve agentrun status visibility --- docs/reference/agentrun.md | 4 +- docs/reference/cli.md | 4 +- scripts/src/agentrun.ts | 213 +++++++++++++++++++++++++++++++------ 3 files changed, 187 insertions(+), 34 deletions(-) diff --git a/docs/reference/agentrun.md b/docs/reference/agentrun.md index f43cb0dc..8d6c545a 100644 --- a/docs/reference/agentrun.md +++ b/docs/reference/agentrun.md @@ -73,7 +73,7 @@ bun scripts/cli.ts agentrun v01 control-plane refresh --dry-run bun scripts/cli.ts agentrun v01 control-plane refresh --confirm ``` -`status` 只读观察 `G14:/root/agentrun-v01` 当前 commit、对应 PipelineRun、GitOps latest、Argo Application 和 `agentrun-v01` workload 摘要,并报告 Argo revision 是否对齐 `v0.1-gitops` latest。`trigger-current` 会先把固定 source worktree 快进到 `origin/v0.1`,再以当前 commit 创建 commit-pinned PipelineRun;同名 PipelineRun 正在运行或已经成功时必须拒绝重复触发,只允许在失败态或不存在时创建。该命令只提交 CI/CD 工作,不等待完整 PipelineRun 或 rollout 完成,后续用 `status` 轮询。`refresh` 只对 `argocd/agentrun-g14-v01` 执行 hard refresh,用于 GitOps promotion 已完成但 Argo 仍停留旧 revision 时的受控同步入口;它不直接 patch runtime workload。 +`status` 只读观察 `G14:/root/agentrun-v01` 当前 commit、对应 PipelineRun、GitOps latest、Argo Application、`agentrun-v01` workload、manager source commit 和 git mirror 摘要,并报告 Argo revision 是否对齐 `v0.1-gitops` latest。默认输出是 compact commander 视图,只保留 `summary`、阶段耗时、对齐状态和 drill-down 命令;需要远端 stdout/stderr tail 时显式加 `--full`,需要原始 git mirror cache 输出时显式加 `--raw`。`status` 会向 stderr 输出 `agentrun.control-plane.status.progress` 阶段事件,覆盖 `source`、`runtime` 和 `git-mirror`,避免长时间聚合时无可见进展。`trigger-current` 会先把固定 source worktree 快进到 `origin/v0.1`,再以当前 commit 创建 commit-pinned PipelineRun;同名 PipelineRun 正在运行或已经成功时必须拒绝重复触发,只允许在失败态或不存在时创建。该命令只提交 CI/CD 工作,不等待完整 PipelineRun 或 rollout 完成,后续用 `status` 轮询。`refresh` 只对 `argocd/agentrun-g14-v01` 执行 hard refresh,用于 GitOps promotion 已完成但 Argo 仍停留旧 revision 时的受控同步入口;它不直接 patch runtime workload。 ## UniDesk 边界 @@ -111,7 +111,7 @@ HWLAB 负责自身产品和接入层,包括用户鉴权、Cloud Web/CLI 对外 HWLAB 通过 AgentRun 执行 Code Agent turn 时,失败归因必须以 AgentRun backend adapter 的结构化 failure kind 为准。AgentRun 负责把 provider、thread、runner、bundle 和 command lifecycle 的失败分类成稳定语义;HWLAB 负责原样消费并映射到用户可读分类。不得为了让 UI 或 issue 收口看起来更顺,把 AgentRun/provider 错误改写成 device-pod、gateway、Cloud API endpoint 或前端渲染问题。 -Codex thread resume 失败必须是单一路径失败。当 `thread/resume` 遇到旧 app-server rollout 缺失、返回 `no rollout found for thread id` 这类可判定的 stale thread 时,AgentRun 应终止当前 turn 并输出 `thread-resume-failed`,不得启动替代 `thread/start`、替换 session 指针或在同一轮混入第二条 thread 路径。HWLAB 收到该 failure kind 时,应显示为 AgentRun thread resume 失败和当前轮次终止;不要把它解释成硬件执行通道或 Cloud API 不可达,也不要通过清会话、隐藏错误或重开路径迁就。 +Codex thread resume 失败必须由 AgentRun 明确归因和处理。当 `thread/resume` 遇到旧 app-server rollout 缺失、返回 `no rollout found for thread id` 这类可判定的 stale thread 时,AgentRun 应输出 `thread/resume:non-resumable`,启动 replacement `thread/start` 继续当前 turn,并在成功后回写新的 `threadId` 到 sessionRef;不得让用户轮次直接失败,也不得要求 HWLAB 通过清会话、隐藏错误或重开路径迁就。只有不可恢复的 resume 协议错误或 replacement 也失败时,才输出 `thread-resume-failed`。HWLAB 收到该 failure kind 时,应显示为 AgentRun thread resume/replacement 层错误,不要把它解释成硬件执行通道或 Cloud API 不可达。 Codex app-server/provider 返回 tool-call 参数 JSON 错误时,AgentRun 应输出 `provider-invalid-tool-call`。HWLAB adapter/Web 应映射为 provider/tool-call 层错误,并保留 `providerTrace.failureKind` 与简明 failure message,明确这不是 device-pod、gateway 或 Cloud API endpoint 故障。后续修复应进入 AgentRun provider/backend adapter 或上游 provider 请求构造,不要在 HWLAB 设备侧增加兼容路径。 diff --git a/docs/reference/cli.md b/docs/reference/cli.md index 3bcddbd9..483177ae 100644 --- a/docs/reference/cli.md +++ b/docs/reference/cli.md @@ -43,8 +43,8 @@ CI/CD、GitOps、rollout、artifact 发布、PR 合并后的 DEV/PROD 滚动、P - `artifact-registry plan|render|status|health|install|deploy-backend-core|deploy-service` 管理 D601 host-managed CNCF Distribution registry 的声明、安装、只读检查和 pull-only artifact CD。该 registry 固定为 D601 loopback `127.0.0.1:5000`,由 systemd + Docker Compose 管理,位于 native k3s 故障域外;`deploy-service` 只拉取 CI 已发布的 commit-pinned 镜像、retag/recreate 或导入 native k3s,并做 live commit 验证,不构建 runtime source。`deploy-backend-core` 是 deprecated 兼容名,标准 backend-core prod CD 入口是 `deploy apply --env prod --service backend-core`。长期规则见 `docs/reference/artifact-registry.md`。 - `commander contract|plan --dry-run|smoke --dry-run|approval request --dry-run|prompt-lint --kind gpt55-pr` 是 host Codex 指挥官直管微服务 skeleton 入口。当前命令返回 `phase=source-contract`、service/API/state/bridge/prompt/trace/#20/#46/ClaudeQQ 审批边界、.state/commander/ 状态模型、dev 无 daemon smoke contract、dry-run 计划和 GPT-5.5 PR prompt 边界辅助 lint,不接 live bridge、不注入 prompt、不发送 ClaudeQQ。`approval request --dry-run` 会生成 200 字以内中文纯文本 ClaudeQQ 审批草案、`notification-path-unavailable` blocker 和授权后唯一可用的 `bun scripts/cli.ts microservice proxy claudeqq /api/push/text --method POST --body-json '' --raw` 命令;不得提示使用本机 ClaudeQQ skill、powershell 或本地 server。`prompt-lint` 支持 `--prompt-file` 与 `--stdin`,输出 `ok`、`missingClauses`、`riskLevel`、`suggestedPatchSnippet` 且不回显完整 prompt;它是 commander 辅助检查,不是业务 PR 门禁,也不改变 `codex submit` 默认行为。`plan`、`smoke` 与 `approval request` 必须带 `--dry-run`;缺少时返回 `error=dry-run-required`。长期规则见 `docs/reference/host-codex-commander.md`。 - `hwlab g14 monitor-prs [--once] [--dry-run] [--interval-seconds N] [--max-cycles N] [--timeout-seconds N]` 是当前 HWLAB G14 PR -> CI/CD -> DEV rollout 的一行式入口。普通调用创建 `.state/jobs/` 异步 job 并立刻返回 `job.id`、`statusCommand` 和 stdout/stderr 路径;后台 worker 每轮通过 UniDesk `gh pr list/preflight/merge` 监控 `pikasTech/HWLAB` base=`G14` 的 open PR,ready 时合并,然后通过 UniDesk `ssh G14:k3s` 观察 `hwlab-g14-ci-poll-`、Argo `hwlab-g14-dev` 和 DEV `/health/live`,直到 DEV `Synced/Healthy` 且 Deployment/StatefulSet ready;历史 `Completed` smoke/debug pod 不作为 rollout blocker。每次成功 DEV rollout 后,worker 会定位或创建 #7“指挥简报索引”中的北京日期每日简报 issue,并追加 CI/CD 耗时、CI/CD 关键指标、语义化上线 changelog、自动 diff 摘要、PipelineRun、GitOps revision 和 DEV 验证摘要;关键指标来自 G14 Tekton TaskRun results,固定包含 `lazy build reused: x/y`、reused services、rebuild services 和每个 service 的独立耗时/状态/backend,用于观察 lazy build 机制效果。语义化 changelog 优先从 PR body 的 `## 修改`/`## 变更`/`## Changelog` 等段落提取,diff 摘要只作为文件和统计证据保留,不替代 changelog。也可用 `hwlab g14 record-rollout --pr --source-commit ` 手动补记,手动补记同样会按 PipelineRun 采集 TaskRun 指标。状态指针按用途分离:长期监控只写 `.state/hwlab-g14/latest-monitor-job.json`,`--once` 写 `latest-once-job.json`,`--dry-run` 写 `latest-dry-run-job.json`,`--once --dry-run` 写 `latest-once-dry-run-job.json`,避免一次性收口覆盖持续监控入口。`--once --dry-run` 只做单轮监控和 merge plan,不写 GitHub、不等待 rollout。该命令禁止使用原生 `gh` 或手拼 GitHub 请求;如果 UniDesk `gh` 子命令字段或行为不够,必须先改进 `scripts/src/gh.ts` 后再使用。 -- `agentrun v01 control-plane status|trigger-current|refresh [--dry-run|--confirm]` 是 AgentRun `v0.1` 在 G14 k3s 的受控 Tekton/Argo 入口。`status` 只读汇总固定 source worktree commit、对应 commit-pinned PipelineRun、GitOps latest、Argo Application、`agentrun-v01` workload、`planArtifacts.summary`、env image result 和 git mirror 摘要,并报告 Argo revision 是否对齐 `v0.1-gitops` latest;`trigger-current` 先快进 `G14:/root/agentrun-v01` 到 `origin/v0.1`,检查 `devops-infra` mirror 的 `localV01` 是否等于目标 source commit,必要时先执行受控 mirror sync,再创建 `agentrun-v01-ci-` PipelineRun。confirmed trigger 只提交 CI/CD 工作并返回后续 `status` 命令,不等待完整 PipelineRun;同名 PipelineRun 运行中或已成功时拒绝重复触发,只允许失败态重建或首次创建。`refresh` 只对 `argocd/agentrun-g14-v01` 执行 hard refresh,用于 GitOps promotion 已完成但 Argo 仍停留旧 revision 时的受控同步入口;它不直接 patch runtime workload。AgentRun 运行时和 SPEC 事实来源仍在 AgentRun 仓库,UniDesk 只维护受控运维入口。 -- `agentrun v01 git-mirror status|sync|flush [--dry-run|--confirm]` 是 AgentRun `v0.1` 使用 `devops-infra` git mirror/relay 的受控维护入口。`status` 返回 read/write URL、`localV01`、`githubV01`、`localGitops`、`githubGitops`、`pendingFlush`、`githubInSync` 和 exact full-SHA shallow fetch 结果;`sync` 创建 manual Job,把 GitHub `v0.1` 和 `v0.1-gitops` refs 拉入 `/cache/pikasTech/agentrun.git`;`flush` 把本地 `v0.1-gitops` 快进推回 GitHub。confirmed `sync`/`flush` 默认创建 `.state/jobs/` 异步 job 并立刻返回 `job.id`、`statusCommand` 和日志路径;只有现场同步调试才显式加 `--wait`。该入口与 HWLAB v0.2 mirror 共用 `devops-infra` 服务和 cache PVC,但 repo path、refs、status 文件和 CLI 命令彼此独立。 +- `agentrun v01 control-plane status|trigger-current|refresh [--dry-run|--confirm]` 是 AgentRun `v0.1` 在 G14 k3s 的受控 Tekton/Argo 入口。`status` 只读汇总固定 source worktree commit、对应 commit-pinned PipelineRun、GitOps latest、Argo Application、`agentrun-v01` manager source commit、`planArtifacts.summary`、env image result 和 git mirror 摘要,并报告 manager/Argo/GitOps 是否对齐当前 source commit。默认输出是 compact commander 视图:`summary` 给出 source、PipelineRun、Argo、manager image、git mirror 和 `aligned` 结论;`timings` 给出 `sourceMs`、`runtimeMs`、`gitMirrorMs` 和 `totalMs`;远端 stdout/stderr tail 默认省略,失败时仍展开必要 tail,完整 tail 用 `--full`,原始 git mirror cache 用 `--raw`。`status` 聚合 source 后会并行读取 runtime 和 git mirror,并向 stderr 输出 `agentrun.control-plane.status.progress` JSON 事件,覆盖 `source`、`runtime`、`git-mirror` 的 started/succeeded/failed 和 elapsedMs,避免 10s 以上状态聚合期间无可见进展;`trigger-current` 先快进 `G14:/root/agentrun-v01` 到 `origin/v0.1`,检查 `devops-infra` mirror 的 `localV01` 是否等于目标 source commit,必要时先执行受控 mirror sync,再创建 `agentrun-v01-ci-` PipelineRun。confirmed trigger 只提交 CI/CD 工作并返回后续 `status` 命令,不等待完整 PipelineRun;同名 PipelineRun 运行中或已成功时拒绝重复触发,只允许失败态重建或首次创建。`refresh` 只对 `argocd/agentrun-g14-v01` 执行 hard refresh,用于 GitOps promotion 已完成但 Argo 仍停留旧 revision 时的受控同步入口;它不直接 patch runtime workload。AgentRun 运行时和 SPEC 事实来源仍在 AgentRun 仓库,UniDesk 只维护受控运维入口。 +- `agentrun v01 git-mirror status|sync|flush [--dry-run|--confirm]` 是 AgentRun `v0.1` 使用 `devops-infra` git mirror/relay 的受控维护入口。`status` 默认返回 read/write URL、`localV01`、`githubV01`、`localGitops`、`githubGitops`、`pendingFlush`、`githubInSync` 和 exact full-SHA shallow fetch 摘要,不默认展开完整 cache stdout;需要探测 tail 时用 `--full`,需要原始 cache 输出时用 `--raw`。`sync` 创建 manual Job,把 GitHub `v0.1` 和 `v0.1-gitops` refs 拉入 `/cache/pikasTech/agentrun.git`;`flush` 把本地 `v0.1-gitops` 快进推回 GitHub。confirmed `sync`/`flush` 默认创建 `.state/jobs/` 异步 job 并立刻返回 `job.id`、`statusCommand` 和日志路径;只有现场同步调试才显式加 `--wait`。该入口与 HWLAB v0.2 mirror 共用 `devops-infra` 服务和 cache PVC,但 repo path、refs、status 文件和 CLI 命令彼此独立。 - `hwlab g14 control-plane status|apply --lane v02 [--dry-run|--confirm]` 是 HWLAB `v0.2` 加法 lane 的受控 Tekton/Argo 控制面维护入口,source commit 只来自 G14 专用 bare repo `/root/hwlab-v02-cicd.git` 的 `refs/remotes/origin/v0.2`;`/root/hwlab-v02` 只作为人工开发和短连接源码工具 workspace 被观测,dirty/stale 状态必须输出为 isolated warning 而不能阻塞 CI/CD。该入口面向 branch `v0.2`、namespace `hwlab-ci` 和 Argo application `hwlab-g14-v02`;默认 `status` 只读汇总最新 source head 的 pipeline、RBAC/ServiceAccount、Argo、当前 commit PipelineRun、当前 PipelineRun 的 TaskRun 条件摘要、最近 PipelineRun 摘要、活跃 PipelineRun、遗留 v02 CronJob 清理状态、commit alignment,以及 19666/19667 的 Cloud Web 静态资源和 API live 探针。分支被后续提交推进后,要复查已完成 run 时使用 `status --lane v02 --pipeline-run hwlab-v02-ci-poll-`;已知完整 source SHA 但不想依赖最新 head 时使用 `status --lane v02 --source-commit `。定点 `status` 输出 `statusTarget.mode` 和 `targetValidation`,只检查指定 PipelineRun/source commit 的证据;`targetValidation.state=passed` 表示该目标已满足 PipelineRun succeeded、Argo `Synced/Healthy`、19666/19667 探针、Git mirror flushed,并且该 run 的 `planArtifacts.rolloutServices` 运行时 source commit 对齐;`planArtifacts.reusedServices` 作为 runtime/provenance 证据呈现,但不能被强制要求等于目标 source commit。`targetValidation.state=superseded` 表示该目标已成功且 runtime 已被同一分支后续成功 PipelineRun 取代,`falseGreenGuard` 在该状态下应标为 superseded/not-applicable。两种状态都不得因为 `origin/v0.2` 后续推进而把历史 run 判为失败;默认不带定点参数时仍严格判定最新 source head alignment。TaskRun 摘要的 `performance` 字段会把超过 120s 的 build TaskRun 标为慢任务、超过 180s 标为 critical warning,用于暴露 env reuse/git mirror 命中率回归,但不作为阻断门禁;CI/CD 性能验收应同时看 `planArtifacts.summary`、`taskRuns.performance.warningCount` 和 PipelineRun duration,纯 CLI/文档或无 runtime 重建需求的后续提交应稳定表现为 `build=0 reuse=` 且无 build TaskRun warning,首次引入或切换 env image 时允许只构建必要 env image 一次。`webAssets` 必须直接给出 `readonly-rpc` 删除、sidebar/workspace/event panel 关键 CSS、`/app.js` 是否可读取和字节数、`/health/live` 与 API revision;`apiRevision` 是 cloud-api 服务自身 revision,Cloud Web 静态资源变更时允许它与 source commit 不同,不能把这种差异误判成 Cloud Web 未发布。默认只读取必要字段,禁止把完整 PipelineRun spec、Tekton 内联脚本、历史大对象或整份 CSS/HTML/JS 展开到默认输出;`apply` 先自动 fetch `/root/hwlab-v02-cicd.git` 并从 commit-pinned detached worktree 执行 render check,再经 `G14:k3s` server-side apply `tekton-v02/rbac.yaml`、`pipeline.yaml`、`argocd/project.yaml` 和 `argocd/application-v02.yaml`,confirmed apply 会删除遗留 v02 CronJob,但不会应用 runtime-v02 workload、Secret 或数据迁移。 - `hwlab g14 control-plane trigger-current --lane v02 [--dry-run|--confirm]` 是 v02 标准手动触发入口:先自动 fetch `/root/hwlab-v02-cicd.git`,解析当前 `origin/v0.2` full SHA,创建 commit-pinned `hwlab-v02-ci-poll-` PipelineRun;读 Git 走 `git-mirror-http.devops-infra.svc.cluster.local`,GitOps promotion 写 `git-mirror-write.devops-infra.svc.cluster.local`;confirmed trigger 在删除/创建 PipelineRun 前会先按当前 source commit 在 G14 临时 detached worktree 中 render,再 server-side apply v02 Tekton RBAC、Pipeline 与 Argo Application,避免 CI/CD 脚本或 runtime-ready 逻辑已合并但集群仍执行旧 Pipeline 定义;该 render 不要求固定 `/root/hwlab-v02` 工作树 clean,也不得因 `.worktree/` 或其他并行未提交修改阻塞;同名 PipelineRun 成功或运行中时拒绝重复触发,失败或不存在时才删除旧对象并重新创建。 创建 PipelineRun 前会读取 `devops-infra` mirror refs,若 `localV02` 未等于当前 source commit,则自动执行一次受控 manual `git-mirror sync` Job 并复核 ref,复核失败时停止触发,避免 Tekton `prepare-source` 已知失败;services 参数只包含 v02 runtime service matrix,`hwlab-cli` 是固定 repo 短连接源码工具,不进入 PipelineRun service build。 diff --git a/scripts/src/agentrun.ts b/scripts/src/agentrun.ts index 6d1be301..91d69b6e 100644 --- a/scripts/src/agentrun.ts +++ b/scripts/src/agentrun.ts @@ -26,11 +26,13 @@ export function agentRunHelp(): unknown { output: "json", usage: [ "bun scripts/cli.ts agentrun v01 control-plane status", + "bun scripts/cli.ts agentrun v01 control-plane status --full", "bun scripts/cli.ts agentrun v01 control-plane trigger-current --dry-run", "bun scripts/cli.ts agentrun v01 control-plane trigger-current --confirm", "bun scripts/cli.ts agentrun v01 control-plane refresh --dry-run", "bun scripts/cli.ts agentrun v01 control-plane refresh --confirm", "bun scripts/cli.ts agentrun v01 git-mirror status", + "bun scripts/cli.ts agentrun v01 git-mirror status --full", "bun scripts/cli.ts agentrun v01 git-mirror sync --confirm", "bun scripts/cli.ts agentrun v01 git-mirror flush --confirm", ], @@ -42,12 +44,12 @@ export async function runAgentRunCommand(config: UniDeskConfig, args: string[]): const [lane, group, action] = args; if (lane !== "v01") return unsupported(args); if (group === "control-plane") { - if (action === "status") return await status(config); + if (action === "status") return await status(config, parseDisclosureOptions(args.slice(3))); if (action === "trigger-current") return await triggerCurrent(config, parseTriggerOptions(args.slice(3))); if (action === "refresh") return await refresh(config, parseConfirmOptions(args.slice(3))); } if (group === "git-mirror") { - if (action === "status") return await gitMirrorStatus(config); + if (action === "status") return await gitMirrorStatus(config, parseDisclosureOptions(args.slice(3))); if (action === "sync" || action === "flush") { const options = parseGitMirrorOptions(args.slice(3)); if (options.confirm && !options.wait) return startAsyncAgentRunJob(`agentrun_v01_git_mirror_${action}`, ["bun", "scripts/cli.ts", "agentrun", "v01", "git-mirror", action, "--confirm", "--wait", "--timeout-seconds", String(options.timeoutSeconds)], `Run AgentRun v0.1 git mirror ${action} on G14`); @@ -72,6 +74,24 @@ interface GitMirrorOptions extends ConfirmOptions { wait: boolean; } +interface DisclosureOptions { + full: boolean; + raw: boolean; +} + +interface TimedValue { + value: T; + elapsedMs: number; +} + +function parseDisclosureOptions(args: string[]): DisclosureOptions { + for (const arg of args) { + if (arg !== "--full" && arg !== "--raw") throw new Error(`unsupported status option: ${arg}`); + } + const raw = args.includes("--raw"); + return { full: raw || args.includes("--full"), raw }; +} + function parseTriggerOptions(args: string[]): TriggerOptions { return parseConfirmOptions(args); } @@ -92,8 +112,8 @@ function parseGitMirrorOptions(args: string[]): GitMirrorOptions { return { ...base, timeoutSeconds, wait: args.includes("--wait") }; } -async function status(config: UniDeskConfig): Promise> { - const source = await capture(config, g14SourceRoute, ["script", "--", [ +async function status(config: UniDeskConfig, options: DisclosureOptions): Promise> { + const sourceProbe = await timedStatusStage("source", () => capture(config, g14SourceRoute, ["script", "--", [ "cd /root/agentrun-v01", "git fetch origin v0.1 >/dev/null 2>&1 || true", "printf 'sourceCommit='", @@ -103,29 +123,79 @@ async function status(config: UniDeskConfig): Promise> { "printf 'gitopsLatest='", `git ls-remote origin ${gitopsBranch} 2>/dev/null | awk '{print $1}' || true`, "git status --short --branch", - ].join("\n")]); + ].join("\n")])); + const source = sourceProbe.value; const localSourceCommit = matchLine(source.stdout, "sourceCommit="); const originSourceCommit = matchLine(source.stdout, "originV01="); const sourceCommit = isGitSha(originSourceCommit ?? "") ? originSourceCommit : localSourceCommit; const gitopsLatest = matchLine(source.stdout, "gitopsLatest="); const pipelineRun = sourceCommit ? pipelineRunName(sourceCommit) : null; - const k3s = await capture(config, g14K3sRoute, ["script", "--", statusScript(pipelineRun)]); + const [runtimeProbe, mirrorProbe] = await Promise.all([ + timedStatusStage("runtime", () => capture(config, g14K3sRoute, ["script", "--", statusScript(pipelineRun)])), + timedStatusStage("git-mirror", () => readGitMirrorStatus(config)), + ]); + const k3s = runtimeProbe.value; + const mirror = mirrorProbe.value; const argo = parseArgoStatus(k3s.stdout); - const mirror = await gitMirrorStatus(config); const ciSummary = labeledJson(k3s.stdout, "ciSummary"); const pipelineRunCondition = labeledJson(k3s.stdout, "pipelineRunCondition"); + const managerImage = parseManagerImage(k3s.stdout); + const mirrorSummary = mirror.summary; + const runtimeAlignment = { + localHeadMatchesOrigin: Boolean(localSourceCommit && originSourceCommit && localSourceCommit === originSourceCommit), + argoRevision: argo.revision, + argoSyncStatus: argo.syncStatus, + argoHealthStatus: argo.healthStatus, + syncedToGitopsLatest: Boolean(gitopsLatest && argo.revision === gitopsLatest), + managerSourceCommit: managerImage.sourceCommit, + managerSourceMatchesExpected: Boolean(sourceCommit && managerImage.sourceCommit === sourceCommit), + }; + const summary = { + sourceCommit, + expectedPipelineRun: pipelineRun, + pipelineRun: { + status: pipelineRunCondition.status ?? null, + reason: pipelineRunCondition.reason ?? null, + completionTime: pipelineRunCondition.completionTime ?? null, + }, + argo, + managerImage, + gitMirror: { + localV01: mirrorSummary.localV01 ?? null, + githubV01: mirrorSummary.githubV01 ?? null, + localGitops: mirrorSummary.localGitops ?? null, + githubGitops: mirrorSummary.githubGitops ?? null, + pendingFlush: mirrorSummary.pendingFlush ?? null, + sourceInSync: mirrorSummary.sourceInSync ?? null, + gitopsInSync: mirrorSummary.gitopsInSync ?? null, + githubInSync: mirrorSummary.githubInSync ?? null, + }, + aligned: pipelineRunCondition.status === "True" && + runtimeAlignment.localHeadMatchesOrigin === true && + runtimeAlignment.syncedToGitopsLatest === true && + runtimeAlignment.managerSourceMatchesExpected === true && + mirrorSummary.githubInSync === true && + mirrorSummary.pendingFlush === false, + }; return { ok: source.exitCode === 0 && k3s.exitCode === 0 && mirror.ok === true, command: "agentrun v01 control-plane status", lane: "v0.1", + summary, sourceCommit, sourceCommitSource: sourceCommit === originSourceCommit ? "origin/v0.1" : "local-head", localSourceCommit, originSourceCommit, gitopsLatest, expectedPipelineRun: pipelineRun, - source: compactCapture(source), - runtime: compactCapture(k3s), + timings: { + sourceMs: sourceProbe.elapsedMs, + runtimeMs: runtimeProbe.elapsedMs, + gitMirrorMs: mirrorProbe.elapsedMs, + totalMs: sourceProbe.elapsedMs + Math.max(runtimeProbe.elapsedMs, mirrorProbe.elapsedMs), + }, + source: compactCapture(source, { full: options.full || options.raw, stdoutTailChars: 3000, stderrTailChars: 2000 }), + runtime: compactCapture(k3s, { full: options.full || options.raw, stdoutTailChars: 8000, stderrTailChars: 4000 }), pipelineRunCondition, ciSummary, gitMirror: { @@ -133,13 +203,18 @@ async function status(config: UniDeskConfig): Promise> { readUrl: mirror.readUrl, writeUrl: mirror.writeUrl, summary: mirror.summary, + probe: compactCapture(mirror.result, { full: options.full || options.raw, stdoutTailChars: 6000, stderrTailChars: 3000 }), + ...(options.raw ? { raw: mirror.raw } : {}), }, - runtimeAlignment: { - localHeadMatchesOrigin: Boolean(localSourceCommit && originSourceCommit && localSourceCommit === originSourceCommit), - argoRevision: argo.revision, - argoSyncStatus: argo.syncStatus, - argoHealthStatus: argo.healthStatus, - syncedToGitopsLatest: Boolean(gitopsLatest && argo.revision === gitopsLatest), + runtimeAlignment, + disclosure: { + defaultView: "compact-low-noise", + full: options.full, + raw: options.raw, + stdoutTailOmitted: !(options.full || options.raw), + rawGitMirrorOmitted: !options.raw, + expandWith: "bun scripts/cli.ts agentrun v01 control-plane status --full", + rawWith: "bun scripts/cli.ts agentrun v01 control-plane status --raw", }, next: { triggerCurrent: "bun scripts/cli.ts agentrun v01 control-plane trigger-current --confirm", @@ -198,8 +273,8 @@ async function triggerCurrent(config: UniDeskConfig, options: TriggerOptions): P writeUrl: gitMirrorWriteUrl, }, }; - const mirrorBefore = await gitMirrorStatus(config); - const mirrorRequirement = gitMirrorSyncRequirement(sourceCommit, String(mirrorBefore.raw ?? "")); + const mirrorBefore = await readGitMirrorStatus(config); + const mirrorRequirement = gitMirrorSyncRequirement(sourceCommit, mirrorBefore.raw); if (options.dryRun || !options.confirm) { return { ok: true, @@ -221,8 +296,8 @@ async function triggerCurrent(config: UniDeskConfig, options: TriggerOptions): P }; if (mirrorRequirement.required) { const synced = await runGitMirrorJob(config, "sync", { confirm: true, dryRun: false, timeoutSeconds: 300, wait: true }); - const after = await gitMirrorStatus(config); - const afterRequirement = gitMirrorSyncRequirement(sourceCommit, String(after.raw ?? "")); + const after = await readGitMirrorStatus(config); + const afterRequirement = gitMirrorSyncRequirement(sourceCommit, after.raw); gitMirrorPreSync = { ...gitMirrorPreSync, sync: synced, after: after.summary, ok: synced.ok === true && afterRequirement.required === false }; if (synced.ok !== true || afterRequirement.required !== false) { return { @@ -432,7 +507,35 @@ function triggerScript(sourceCommit: string, pipelineRun: string): string { ].join("\n"); } -async function gitMirrorStatus(config: UniDeskConfig): Promise> { +async function gitMirrorStatus(config: UniDeskConfig, options: DisclosureOptions = { full: false, raw: false }): Promise> { + const observation = await readGitMirrorStatus(config); + const summary = observation.summary; + return { + ok: observation.ok, + command: "agentrun v01 git-mirror status", + namespace: gitMirrorNamespace, + readUrl: gitMirrorReadUrl, + writeUrl: gitMirrorWriteUrl, + summary, + ...(options.raw ? { raw: observation.raw } : {}), + probe: compactCapture(observation.result, { full: options.full || options.raw, stdoutTailChars: 6000, stderrTailChars: 3000 }), + disclosure: { + defaultView: "compact-low-noise", + full: options.full, + raw: options.raw, + rawOmitted: !options.raw, + probeTailOmitted: !(options.full || options.raw), + expandWith: "bun scripts/cli.ts agentrun v01 git-mirror status --full", + rawWith: "bun scripts/cli.ts agentrun v01 git-mirror status --raw", + }, + next: { + sync: "bun scripts/cli.ts agentrun v01 git-mirror sync --confirm", + flush: summary.pendingFlush === true ? "bun scripts/cli.ts agentrun v01 git-mirror flush --confirm" : null, + }, + }; +} + +async function readGitMirrorStatus(config: UniDeskConfig): Promise & { result: SshCaptureResult; raw: string; summary: Record }> { const script = [ "set +e", "printf 'resources\\n'", @@ -447,17 +550,12 @@ async function gitMirrorStatus(config: UniDeskConfig): Promise { - return { +function compactCapture(result: SshCaptureResult, options: { full?: boolean; stdoutTailChars?: number; stderrTailChars?: number } = {}): Record { + const stdoutTailChars = options.stdoutTailChars ?? 8000; + const stderrTailChars = options.stderrTailChars ?? 4000; + const full = options.full ?? true; + const payload: Record = { exitCode: result.exitCode, - stdoutTail: tail(result.stdout, 8000), - stderrTail: tail(result.stderr, 4000), + stdoutBytes: Buffer.byteLength(result.stdout, "utf8"), + stderrBytes: Buffer.byteLength(result.stderr, "utf8"), + stdoutTailOmitted: !full && result.exitCode === 0, + stderrTailOmitted: !full && result.exitCode === 0, }; + if (full || result.exitCode !== 0) { + payload.stdoutTail = tail(result.stdout, stdoutTailChars); + payload.stderrTail = tail(result.stderr, stderrTailChars); + payload.stdoutTruncated = result.stdout.length > stdoutTailChars; + payload.stderrTruncated = result.stderr.length > stderrTailChars; + } + return payload; +} + +async function timedStatusStage(stage: string, action: () => Promise): Promise> { + const startedAtMs = Date.now(); + progressEvent("agentrun.control-plane.status.progress", { stage, status: "started" }); + try { + const value = await action(); + const exitCode = isCaptureResult(value) ? value.exitCode : typeof record(value).ok === "boolean" && record(value).ok !== true ? 1 : 0; + progressEvent("agentrun.control-plane.status.progress", { + stage, + status: exitCode === 0 ? "succeeded" : "failed", + exitCode, + elapsedMs: Date.now() - startedAtMs, + }); + return { value, elapsedMs: Date.now() - startedAtMs }; + } catch (error) { + progressEvent("agentrun.control-plane.status.progress", { + stage, + status: "failed", + elapsedMs: Date.now() - startedAtMs, + error: error instanceof Error ? error.message : String(error), + }); + throw error; + } +} + +function progressEvent(event: string, payload: Record): void { + process.stderr.write(`${JSON.stringify({ event, at: new Date().toISOString(), ...payload })}\n`); +} + +function isCaptureResult(value: unknown): value is SshCaptureResult { + return typeof value === "object" && value !== null && "exitCode" in value && typeof (value as { exitCode?: unknown }).exitCode === "number"; } function pipelineRunName(sourceCommit: string): string { @@ -847,6 +989,17 @@ function parseArgoStatus(text: string): { revision: string | null; syncStatus: s }; } +function parseManagerImage(text: string): { image: string | null; sourceCommit: string | null } { + const lines = text.split(/\r?\n/u); + const index = lines.findIndex((line) => line.trim() === "managerImage"); + if (index < 0) return { image: null, sourceCommit: null }; + const [image, sourceCommit] = (lines[index + 1] ?? "").split("\t"); + return { + image: image?.trim() || null, + sourceCommit: sourceCommit?.trim() || null, + }; +} + function isGitSha(value: string): boolean { return /^[0-9a-f]{40}$/u.test(value); }