feat: add AgentRun cancel lifecycle policy (#859)

Co-authored-by: Codex <codex@noreply.local>
This commit is contained in:
Lyon
2026-06-25 08:44:30 +08:00
committed by GitHub
parent 94d985d628
commit 790e5df281
10 changed files with 278 additions and 13 deletions
+3 -1
View File
@@ -72,7 +72,7 @@ AgentRun queue 生命周期不是一个单独的 `queue lifecycle` 命令,而
1. 默认总览用 `get tasks --queue commander --limit 20`,只看 task state、queue/lane、run/cmd/rjob/session ref、age 和 attention。
2. 单任务用 `describe task/<taskId>`,读取 `latestAttempt.runId``commandId``runnerJobId``sessionId/sessionPath` 和少量 `Next:`
3. Run 级状态用 `events run/<runId>``result run/<runId> --command <commandId>`,判断 terminalClassification、failureKind、provider interruption、timeoutBudget 和 recoveryActions。
4. Command 级状态用 `describe command/<commandId> --run <runId>``result command/<commandId> --run <runId>`,确认 command state、ack、terminal status 和结果摘要;确认为单个 active command 卡住时,用 `cancel command/<commandId> --run <runId> --reason <text>` 清理该 command,保留同一个 session 后再用 `send session/<sessionId>` 续跑。
4. Command 级状态用 `describe command/<commandId> --run <runId>`、`result command/<commandId> --run <runId>`,确认 command state、ack、terminal status 和结果摘要;确认为单个 active command 卡住时,先用 `cancel command/<commandId> --run <runId> --reason <text> --dry-run` 核对 `CancelLifecycle` 的 authority、cascade、runner abort 和 fencing,再去掉 `--dry-run` 清理该 command,保留同一个 session 后再用 `send session/<sessionId>` 续跑。
5. Runner job 只读状态用 `describe runnerjob/<runnerJobId> --run <runId>`,确认 env image reuse、jobName、namespace、phase、exitCode、retention 和 `valuesPrinted=false`。不要为了这些字段手动调用 `trans G14:k3s kubectl ...`
6. Runtime runner Job/Pod retention 或 operator 明确要求强杀 runner 时,不属于单个 task/session 资源原语;使用 `bun scripts/cli.ts agentrun control-plane cleanup-runners --node <node> --lane <lane> [--force-active] --dry-run|--confirm`。普通 cleanup 只删 inactive selected runner`--force-active` 会中断 active run/command/session,必须先 dry-run 确认 selection,并且仍应优先于裸 `kubectl delete pod/job`
7. Session trace/output 只在 `describe task` 或 result 里有实际 `sessionId` 时使用 `logs|ack|send|cancel session/<sessionId>``sessionRef=null` 时不要猜 session 命令。用户级 follow-up 一律使用 `send session/<sessionId>`,不要回到旧 `turn/steer``sessions ...` 兼容路径。
@@ -80,6 +80,8 @@ AgentRun queue 生命周期不是一个单独的 `queue lifecycle` 命令,而
默认视图必须低噪声且不是 JSON envelope`-o json|yaml` 才输出稳定机器结构,`--raw` 才保留直连 AgentRun REST envelope;命令返回里的下一步应优先是 `bun scripts/cli.ts agentrun ...` 资源原语,不得把人工 k8s 查询作为日常下一步。
AgentRun cancel 策略由 `config/agentrun.yaml` 的 lane 级 `deployment.runner.cancelLifecycle` 管理;操作 D601、G14 或其他非默认 lane 时必须带 `--node/--lane --dry-run` 先确认 YAML policy,不要依赖全局默认或手动 k8s 强杀来替代资源原语。
## HWLAB Code Agent 入口整合
HWLAB Code Agent / CaseRun follow-up 的日常派单也归入 AgentRun 资源原语:新任务用 `create task --aipod Artificer` 或包含 HWLAB gitbundle 的 `apply -f -`;运行中纠偏用 `send session/<sessionId> --aipod Artificer`。需要验证 HWLAB Web/Cloud API 原入口时,仍按 `$hwlab-code-agent` 使用 G14 `/root/hwlab-v02``hwlab-cli client agent ...` 拉取同一 trace/result/inspect;不要回到旧 `codex submit/resume/steer`
+30
View File
@@ -174,6 +174,21 @@ controlPlane:
ageBasedCleanup:
enabled: false
maxAgeHours: 48
cancelLifecycle:
deliveryMode: manager-epoch
gracefulAbortMs: 15000
killEscalationMs: 30000
staleHeartbeatFencingMs: 900000
lateWriteFencing:
enabled: true
eventStages:
- accepted
- persisted
- delivered
- aborting
- terminalized
- fenced
- late-write-rejected
localPostgres:
enabled: true
serviceName: agentrun-v01-postgres
@@ -361,6 +376,21 @@ controlPlane:
ageBasedCleanup:
enabled: false
maxAgeHours: 48
cancelLifecycle:
deliveryMode: manager-epoch
gracefulAbortMs: 15000
killEscalationMs: 30000
staleHeartbeatFencingMs: 900000
lateWriteFencing:
enabled: true
eventStages:
- accepted
- persisted
- delivered
- aborting
- terminalized
- fenced
- late-write-rejected
localPostgres:
enabled: false
gitMirror:
+1
View File
@@ -118,6 +118,7 @@ PipelineRun 失败或长时间未完成时,先按定点 `control-plane status
- `codex deploy <commitId>` 是旧 Code Queue 兼容部署入口,已禁用以防止维护通道直连 D601 部署 Code Queue;当前 dev 自动化只做 `ci run-dev-e2e` smoke,不提供 Code Queue CD,详细规则见 `docs/reference/codex-deploy.md`
- `agentrun get|describe|events|logs|result|ack|cancel|dispatch|create|apply|send` 是当前指挥官新任务和 AgentRun session 控制入口。UniDesk CLI 是 render-only client:客户端保留 k8s 风格命令解析、human 表格、生命周期摘要、下一步命令、分页、`-o json|yaml` 稳定客户端 schema 和错误展示;AgentRun 服务端只提供稳定 RESTful API、鉴权和业务事实,不承载 UniDesk CLI 渲染。日常查看用 `get tasks --queue commander``describe task/<taskId>``events run/<runId>``logs session/<sessionId>``result run/<runId> --command <commandId>`;日常写入用 `create task --aipod Artificer --prompt-stdin``apply -f -``dispatch task/<taskId>``send session/<sessionId>``ack/cancel task|session/<id>`。用户级 CLI 取消 `turn``steer` 路径;`send session/<sessionId>` 是唯一 session follow-up 写入口,AgentRun 服务端按 durable session/run/command 状态自动决定内部 `steer` 或新 `turn`,dry-run 必须真实返回这个 decision 且不写状态。兼容 group `queue|runs|commands|runner|sessions|aipod-specs` 也走同一 direct HTTP transport`--raw` 只披露直连 AgentRun REST envelope。
- `agentrun` 资源原语的默认 transport 是直连 AgentRun REST API,配置来源是 UniDesk 自有 YAML `config/agentrun.yaml`。不带 `--node`/`--lane` 时按 YAML 的默认 manager `baseUrl` 访问;显式 `--node <node> --lane <lane>` 时按同一 YAML 选中 runtime lane,经 `lane-k8s-service-proxy` 进入 manager `internalBaseUrl`,并用 manager pod env 中声明的 API key metadata 发起请求;输出只披露 node/lane/namespace/baseUrl/auth env metadata 和 `valuesPrinted=false`,不得打印 key value。该模式用于 D601 `agentrun-v02` 等非默认 lane 的资源原语操作与证据采集,尤其是 `get/describe/events/logs/result`,不替代 `agentrun control-plane ...` 发布或运维控制。鉴权可以复用 `HWLAB_API_KEY` 的环境变量/固定文件发现风格,但不得依赖 HWLAB runtime、HWLAB backend-core、HWLAB frontend 代理或 SSH official CLI;多一层转发会增加故障面,不能作为正式路径。`agentrun control-plane ...``git-mirror ...` 仍属于 G14 source/runtime 运维控制路径,可以继续使用 UniDesk SSH capture bridge;这些控制面路径不得反向成为 queue/session 资源原语的默认 transport。
- `agentrun cancel ... --dry-run` 必须显示 `CancelLifecycle` 摘要:transport/authority、YAML lane、cascade scope、runner abort 窗口、cancel epoch 与 late-write fencing。取消策略来自 `config/agentrun.yaml` 的 `controlPlane.lanes.<lane>.deployment.runner.cancelLifecycle`;字段缺失或 lane 选择错误应暴露为配置错误,不得在 CLI、manifest 或服务里补隐式默认。操作非默认 lane 时先加 `--node <node> --lane <lane> --dry-run` 核对 policy,再移除 `--dry-run` 发起真实取消。
- `agentrun control-plane expose --dry-run|--confirm``config/agentrun.yaml` 维护 AgentRun 公网 HTTPS 入口,模式与 Sub2API 暴露一致:G14 AgentRun runtime 通过 frpc 出到 master `127.0.0.1:<remotePort>`master Caddy 提供 `https://agentrun.74-48-78-17.nip.io/`。该命令只补 master `frps` allow port 和 Caddy vhostG14 frpc Deployment/ConfigMap 必须由 AgentRun `deploy/deploy.json` + GitOps render 管理,不能在 UniDesk 侧手写 Kubernetes manifest。
- `codex submit/enqueue``codex steer``codex resume``codex queue create``codex queue merge``codex move`、旧 Web 提交表单、旧队列管理和旧 workdir 管理是冻结的 legacy Code Queue 写入口。CLI 必须返回 `ok=false``frozen=true``degradedReason=legacy-code-queue-frozen` 和 AgentRun 替代命令;服务端旧 API 写入口必须返回 410。新任务、session follow-up、events/logs/result、ack 和 cancel 走 AgentRun 资源原语,其中 session follow-up 只用 `agentrun send session/<sessionId>`
- 旧 Code Queue 只保留历史归档、只读排障和残留任务停止。`codex task/tasks/output/read/unread/queues` 继续通过 backend-core 私有代理读取旧 PostgreSQL 历史;`codex interrupt|cancel <taskId>` 只用于停止旧运行面残留任务。旧 `steer-confirm` 只作为历史 trace confirmation 查询,不是新任务控制入口。
@@ -38,6 +38,7 @@ AgentRun核心负责把 HWLAB Agent 任务转化为可持久查询、可调度
- Postgres durable store 中 runs、commands、events、runner jobs、sessions、backends、leases、Queue 引用和 migration ledger 的事实持久化。
- command 终态、run 终态、failureKind、result envelope、event 分页和日志/trace 脱敏边界。
- runner job identity、attempt、logPath、pod identity、stale lease recovery 和 runner replacement 的核心执行语义。
- task、run、command、session 取消请求落到 AgentRun 核心后的 command/run 级取消状态机、cancel epoch、runner abort、terminal `canceled` 和迟到写回 fencing。
### 2.3 范围外
@@ -57,6 +58,9 @@ AgentRun核心负责把 HWLAB Agent 任务转化为可持久查询、可调度
| command | run 内的一次 turn、steer、interrupt 或 cancel 指令,具备独立状态和终态。 |
| event | 单 run 内 append-only、按 `seq` 单调递增的执行事实记录。 |
| terminal status | command 或 run 的权威终态,不由 partial output、stdout、transport close 或 idle timeout 推断。 |
| cancel request | 由 Queue、Session、run 或 command 控制入口提交的取消意图,必须持久化 requestId、targetRef、reason、requestedBy 和作用范围。 |
| cancel epoch | AgentRun 用于隔离取消前后写入的单调 fencing token;runner、terminal report 和 late write 必须携带或接受该 epoch 校验。 |
| canceled terminal | 取消成功后的权威终态,区别于 completed、failed、timeout 和 transport close。 |
| failureKind | AgentRun 对 schema、tenant policy、Secret、runner、backend、provider、infra 和 cancel 等失败的结构化分类。 |
| durable facts | Postgres 中可重启后查询的 run、command、event、runner、job、session、backend、lease 和 migration 事实。 |
@@ -84,6 +88,7 @@ AgentRun核心负责把 HWLAB Agent 任务转化为可持久查询、可调度
| PJ2026-01020105 | 控制面恢复 | 本规格 6.6 | manager boot/background reconciler、runner job observation 和 active command 收敛 | Durable事实、Kubernetes Job/Pod | HWLAB 接入、发布Lane、运维监控 |
| PJ2026-01020106 | 终态Outbox | 本规格 6.7 | runner terminal fact 的幂等提交、重试和可恢复 artifact | Runner执行、Backend Profile | 控制面恢复、HWLAB 投影 |
| PJ2026-01020107 | 清理安全 | 本规格 6.8 | cleanup、Job TTL、runner 上限与 active runner 保护 | 控制面恢复、发布Lane | 平台运维、运行面 GC |
| PJ2026-01020108 | 取消生命周期 | 本规格 6.9 | cancel request、cascade scope、epoch fencing、runner abort 和 canceled terminal | Queue会话、Manager API、Runner执行 | 客户端、运维监控、HWLAB接入 |
### 5.1 控制面恢复目标架构图
@@ -161,6 +166,28 @@ sequenceDiagram
| Job completed before terminal commit | Job/Pod phase、outbox/artifact、logs retention | reconciler 恢复 terminal fact 或写不可恢复 blocker | 把 Job completed 单独当 command completed |
| cleanup/TTL race | DB terminal state、runnerJob observation、TTL config | terminal durable 后再清理可恢复证据 | 只按 Pod 列表或年龄清理 active runner |
### 5.5 cancel lifecycle 关键时序图
```mermaid
sequenceDiagram
participant C as Queue / Session / CLI
participant M as agentrun-mgr authority
participant DB as durable ledger
participant R as runner
participant B as backend/tool/process
C->>M: cancel targetRef + reason
M->>DB: persist cancelRequest + next cancel epoch
M->>R: deliver cancel epoch for active command/run
R->>B: abort stream/tool/process
R->>DB: terminal report canceled with epoch
M->>DB: seal command/run canceled
R-->>DB: late write with old epoch
DB-->>R: reject fenced write
```
cancel lifecycle 必须把“接受取消请求”和“运行器已经中止”分成可观测阶段。用户或上游只能把 `canceled terminal` 作为取消完成事实;`cancel requested`、HTTP 连接关闭、runner pod 消失、timeout/watchdog 或缺少新输出都不能单独代表取消完成。
## 6. 原子需求
### 6.1 AR-CORE-REQ-001 Durable Resource 模型
@@ -244,3 +271,15 @@ terminal 上报必须幂等。同一 `runId + commandId + attemptId + runnerId`
AgentRun cleanup、Job TTL 和 runner 上限治理必须以 DB active facts 与 Kubernetes observation 双确认作为判断基础。默认 cleanup 不得杀 active runner;确需强制终止 active runner 时必须使用显式 force 语义,并写入原因、操作者、对象和 observed facts。
Job TTL 和日志/termination metadata 保留窗口不得早于 terminal facts durable commit 所需的最低恢复窗口。runner 上限治理应优先清理 idle、terminal、expired 或不可恢复对象;不得只根据 Pod 列表、Job age、进程内计数或旧 lease 单字段清理 runner。
### 6.9 AR-CORE-REQ-009 Cancel lifecycle 与 fencing
| 编号 | 短名 | 主责模块 | 关联模块 |
| --- | --- | --- | --- |
| AR-CORE-REQ-009 | 取消生命周期 | PJ2026-01020108 取消生命周期 | [队列会话](PJ2026-010203-queue-session.md)、[YAML运维](PJ2026-010603-yaml-first-ops.md)、[运维监控](PJ2026-010605-observability-monitoring.md) |
AgentRun核心应提供 task/run/command/session 控制入口落到核心执行面后的统一取消生命周期。Manager 接受取消请求时必须持久化 cancel request、targetRef、cascade scope、reason、requestedBy、requestId 和 cancel epoch;重复取消同一目标应幂等返回既有或更高 epoch 的取消事实。
取消必须通过 runner 可执行的 abort 信号兑现。runner 收到 cancel epoch 后应中止 provider stream、tool call、后台任务和子进程,并用 grace kill 与强制 kill 或等价机制完成资源释放;中止结果必须以 command/run terminal `canceled`、failureKind 或 cancellation classification 写回 durable store。partial output、transport close、idle timeout、missing terminal watchdog 或 runner pod 消失不得冒充 `completed` 或 `canceled`。
Manager 和 durable store 必须用 cancel epoch 对迟到写回做 fencing。取消请求后的旧 epoch event、terminal report、result envelope 或 runner heartbeat 不得覆盖 sealed canceled terminal;被拒绝的迟到写回应产生可查询的 low-noise event、diagnostic 或 span attribute,使 CLI 能判断 cancel 在 accepted、delivered、aborting、terminalized、fenced 或 late-write-rejected 哪个阶段。
@@ -37,6 +37,7 @@
- Session API/CLI 的输出、trace、命令流、debug/audit 详情、read 状态和会话控制。
- Queue task 到 Core run/command/runner job/session 的引用关系和 sessionPath 输出。
- `sessions send`、session continuation、unread/default/all 状态和 terminal projection。
- Queue task、Session、run 和 command 取消入口的用户语义、级联边界、同 session 续跑和 canceled terminal projection。
- 自动 scheduler 的 deferred 边界、future pending scan、capacity selection、runner assignment 和 stale lease recovery 方向。
### 2.3 范围外
@@ -58,6 +59,7 @@
| sessionPath | Queue task 返回的 Session API 路径,用于读取输出和 trace。 |
| read cursor | 按 readerId 记录的已读水位,用于默认列表只显示 running 或 unread session/task。 |
| commander | Queue 侧的聚合视图,用于展示队列状态、最新 attempt 和下一步操作摘要。 |
| cancel scope | 用户发起取消时选择的资源边界,包括 task、session、run 或 command。 |
| Scheduler | 后续自动调度器,负责 pending scan 和 runner assignment`v0.1` 第一阶段不作为发布前置。 |
## 4. 系统边界和接口
@@ -81,6 +83,7 @@
| PJ2026-01020302 | Session控制 | 本规格 6.2 | output、trace、send、read、cancel、default/unread/all 视图 | Core events、Runtime session | CLI、客户端、用户 |
| PJ2026-01020303 | 分层边界 | 本规格 6.3 | Queue、Session、Core 和 Scheduler 的职责分离 | Queue任务、Session控制 | 发布流水、客户端 |
| PJ2026-01020304 | Scheduler边界 | 本规格 6.4 | 自动调度 deferred、future scan/assignment/recovery | Queue pending facts、capacity | Runner job、AgentRun Core |
| PJ2026-01020305 | 取消控制 | 本规格 6.5 | task/session/run/command 取消语义、级联关系和 canceled projection | AgentRun核心取消生命周期 | CLI、客户端、commander |
## 6. 原子需求
@@ -123,3 +126,15 @@ Queue 不维护 OA、notification、GitHub action 或外部协作 sink,不从
队列会话应保留自动 scheduler 的规格边界,但 `v0.1` 第一阶段不得因为 scheduler 未实现而阻塞最小真实闭环。
在 scheduler 启用前,CLI/manual dispatch 必须能启动真实 runnermanager durable facts、runner claim、backend turn、events 和 terminal status 必须真实可用。未来 scheduler 只能通过 manager API 和 Kubernetes runner Job 改变调度状态,不得直接写 Postgres 或直接执行 backend。
### 6.5 AR-QUEUE-REQ-005 取消控制语义
| 编号 | 短名 | 主责模块 | 关联模块 |
| --- | --- | --- | --- |
| AR-QUEUE-REQ-005 | 取消控制 | PJ2026-01020305 取消控制 | [AgentRun核心](PJ2026-010201-agentrun-core.md)、[客户端](PJ2026-0104-client.md)、[运维监控](PJ2026-010605-observability-monitoring.md) |
队列会话应把用户可见取消语义表达为明确的资源 scope。`cancel command/<commandId>` 只取消当前 command/runner job,并保留同一个 session 可继续 `send session/<sessionId>``cancel run/<runId>` 取消该 run 下 active command 和 runner job,并由 Core terminalize run`cancel session/<sessionId>` 取消 session 当前 active work 和 session-scoped background work`cancel task/<taskId>` 取消当前 task attempt/run,并按 Queue retry/resume 语义更新 task projection。
Queue 和 Session 只能提交取消意图、展示取消阶段和投影 canceled terminal,不得从 HTTP 成功、timeout、stdout 停止、runner job 删除或 Web 轮询无新增内容推断取消完成。取消完成事实必须来自 [AgentRun核心](PJ2026-010201-agentrun-core.md) 的 durable command/run terminal、cancel epoch fencing 和 result/event 证据。
CLI、commander 和客户端默认输出应展示 cancel scope、targetRef、cascade scope、request id 或等价关联字段、当前阶段和下一步查询命令;完整审计字段通过 machine output 或详细视图获取。取消后的 follow-up 必须显式引用仍可续跑的 `sessionId`,不得回退到旧 Code Queue `resume/steer` 或通过复制历史 prompt 伪造会话连续性。
@@ -39,6 +39,7 @@ YAML运维负责 HWLAB/UniDesk 自有平台配置的真相源、解析、渲染
- target、lane、node、service、namespace、endpoint、publicExposure 和运行目标解析。
- Secret sourceRef、targetKey、providerCredential、manual binding source 和敏感输出约束。
- Sub2API、Codex pool、AgentRun control-plane、session policy 和平台基础设施配置的受控 CLI 读取、解释、计划和下发。
- AgentRun lane 的 runner retention、idle timeout、egress proxy 和 cancel lifecycle policy 等运行策略配置读取、解释和下发。
- FRP、Caddy、public URL、public health、Kubernetes Secret 和平台资源渲染所需的配置投递边界。
- 可复用 ops primitive,包括 YAML path 捕获、字段解析、fingerprint、摘要输出、Secret 引用和命令输出约束。
@@ -61,6 +62,7 @@ YAML运维负责 HWLAB/UniDesk 自有平台配置的真相源、解析、渲染
| sourceRef | YAML 中指向密钥来源的声明,输出时只能显示来源标识和摘要,不显示密钥值。 |
| targetKey | YAML 中声明运行面 Secret 或配置对象接收某项密钥的 key 名。 |
| providerCredential | AgentRun lane 中声明 provider profile 与运行面 Secret 绑定关系的配置项。 |
| cancel lifecycle policy | AgentRun lane 中声明取消投递、runner abort、kill escalation、stale fencing 和事件阶段输出的配置块;具体数值以 YAML 为准。 |
| publicExposure | YAML 中描述 FRP、Caddy、domain、TLS、public URL 和 health 目标的公开入口声明。 |
| ops primitive | 平台运维 CLI 共享的底层能力,例如字段解析、fingerprint、Secret 引用、摘要输出和 YAML path 捕获。 |
| 配置解释输出 | CLI 将 YAML 解析后的默认值、来源和目标以非敏感摘要展示给操作人员的输出。 |
@@ -143,9 +145,11 @@ YAML运维应从 publicExposure 和 target 声明渲染 FRP、Caddy、public URL
| --- | --- | --- | --- |
| OPS-YAML-REQ-005 | 执行策略 | PJ2026-01060305 执行策略 | [Agent编排](PJ2026-0102-agent-orchestration.md)、[用户管理](PJ2026-0105-user-management.md) |
YAML运维应为 AgentRun control-plane default、client sessionPolicy、lane secret providerCredential、workspaceexecution policy 提供配置读取与解释,使 AgentRun 运维入口不依赖代码内固定 profile、namespace 或执行策略。
YAML运维应为 AgentRun control-plane default、client sessionPolicy、lane secret providerCredential、workspaceexecution policy 和 cancel lifecycle policy 提供配置读取与解释,使 AgentRun 运维入口不依赖代码内固定 profile、namespace、取消超时或执行策略。
本需求只约束执行策略如何作为平台配置进入运行面。Agent run、command、session 状态机、任务恢复和 provider 业务语义由 Agent编排负责,用户身份和 API key 约束由用户管理负责
cancel lifecycle policy 至少应能声明取消信号投递方式、runner graceful abort、kill escalation、stale heartbeat fencing window、late write fencing 和默认事件阶段输出开关。CLI 只校验字段结构、类型、必填项和可渲染性;具体窗口、超时和开关值由 YAML 承载,不在代码或 SPEC 中写成第二真相。
本需求只约束执行策略如何作为平台配置进入运行面。Agent run、command、session 状态机、任务恢复、取消语义和 provider 业务语义由 Agent编排负责,用户身份和 API key 约束由用户管理负责。
### 6.6 OPS-YAML-REQ-006 公共 ops primitive
@@ -41,7 +41,7 @@
- OpenTelemetry Collector、trace backend、span 语义、trace context 传播和 trace 查询入口。
- Web/API/AgentRun/HWPOD/Harness/用户管理等服务的运行面健康、资源状态、公开入口健康和用户可感知性能观测。
- 发布后 runtime readiness、resource usage、error rate、queue depth、target availability 和 alert 状态摘要。
- AgentRun rolling recovery 的 active runner、stale lease、terminal report retry、reconciler backlog、projection lag 和 Job TTL cleanup 观测。
- AgentRun rolling recovery 与 cancel lifecycle 的 active runner、stale lease、cancel request、cancel delivery、terminal report retry、reconciler backlog、projection lag、late-write fencing 和 Job TTL cleanup 观测。
- 监控和 trace 数据的受控查询、低噪声摘要、失败归因和敏感输出约束。
### 2.3 范围外
@@ -251,11 +251,11 @@ Workbench 性能监控只记录低基数指标、阶段耗时、状态分类和
| --- | --- | --- | --- |
| OPS-MON-REQ-007 | Rolling恢复 | PJ2026-01060507 Rolling恢复观测 | [AgentRun核心](PJ2026-010201-agentrun-core.md)、[HWLAB接入](PJ2026-010205-hwlab-dispatch.md)、[Workbench唯一投影](PJ2026-0104010803-workbench-unique-projection.md)、[AgentRun发布Lane](PJ2026-01060105-agentrun-v01-release-lane.md) |
运维监控应为 AgentRun/HWLAB rolling recovery 提供可查询的低基数 metrics、trace span 和受控 CLI 摘要。最小观测对象包括 active runner 数、runner job observation phase、stale lease 数、terminal report retry/outbox backlog、manager reconciler backlog、reconciler last success/error、projection lag、projection blocker 数、Job TTL cleanup 数和不可恢复 blocker 数。
运维监控应为 AgentRun/HWLAB rolling recovery 和 cancel lifecycle 提供可查询的低基数 metrics、trace span 和受控 CLI 摘要。最小观测对象包括 active runner 数、runner job observation phase、stale lease 数、cancel request accepted 数、cancel delivered 数、cancel terminalized 数、late-write-fenced 数、terminal report retry/outbox backlog、manager reconciler backlog、reconciler last success/error、projection lag、projection blocker 数、Job TTL cleanup 数和不可恢复 blocker 数。
rolling recovery 相关 span 应能用 OTel trace id/request id 关联 `sessionId``traceId``runId``commandId``runnerJobId``jobName` 的 redacted attribute。高基数业务 ID 只能作为 trace/span attribute 或 CLI 单次查询参数,不进入 Prometheus labelprompt、assistant 正文、tool 参数、stdout/stderr、Secret、完整 token、完整 DSN 和 provider payload 不得进入默认 metrics、span 或 issue closeout。
发布/rollout 前后的受控 CLI 摘要应能回答:仍在运行的 runner 数、已恢复控制权数量、terminal report retry 数、projection lag 最大值、不可恢复 blocker 数和最近一次 reconciler 错误。该摘要只用于定位和发布判定,不替代 P6 原入口 rolling/chaos 验收,也不能把可观测性 green 当作业务任务完成。
发布/rollout 前后的受控 CLI 摘要应能回答:仍在运行的 runner 数、已恢复控制权数量、cancel 当前阶段分布、terminal report retry 数、projection lag 最大值、不可恢复 blocker 数和最近一次 reconciler 错误。该摘要只用于定位和发布判定,不替代 P6 原入口 rolling/chaos 验收,也不能把可观测性 green 当作业务任务完成。
## 7. 过程控制
+45
View File
@@ -1,3 +1,5 @@
// SPEC: PJ2026-01060305 AgentRun execution policy + PJ2026-01020108 cancel lifecycle draft-2026-06-25-p0.
// Parses AgentRun YAML lane policy, including cancel lifecycle values owned by config/agentrun.yaml.
import { rootPath } from "./config";
import {
asRecord,
@@ -113,6 +115,7 @@ export interface AgentRunLaneSpec {
readonly egressProxyUrl: string | null;
readonly noProxyExtra: readonly string[];
readonly retention: AgentRunRunnerRetentionSpec;
readonly cancelLifecycle: AgentRunCancelLifecycleSpec;
};
readonly localPostgres: {
readonly enabled: boolean;
@@ -195,6 +198,19 @@ export interface AgentRunRunnerRetentionSpec {
};
}
/** Stage names accepted in a lane's `cancelLifecycle.eventStages` list. */
export type AgentRunCancelLifecycleStage =
  | "accepted"
  | "persisted"
  | "delivered"
  | "aborting"
  | "terminalized"
  | "fenced"
  | "late-write-rejected";

/**
 * Parsed shape of a lane's `deployment.runner.cancelLifecycle` YAML block.
 * All values come from the YAML file; this type carries no defaults.
 */
export interface AgentRunCancelLifecycleSpec {
  /** Cancel delivery mode; `"manager-epoch"` is the only value the type admits. */
  readonly deliveryMode: "manager-epoch";
  /** Millisecond value read from `gracefulAbortMs`. */
  readonly gracefulAbortMs: number;
  /** Millisecond value read from `killEscalationMs`. */
  readonly killEscalationMs: number;
  /** Millisecond value read from `staleHeartbeatFencingMs`. */
  readonly staleHeartbeatFencingMs: number;
  /** Late-write fencing switch. */
  readonly lateWriteFencing: {
    readonly enabled: boolean;
  };
  /** Declared cancel stages, in the order they appear in the YAML list. */
  readonly eventStages: readonly AgentRunCancelLifecycleStage[];
}
export interface AgentRunLaneTarget {
readonly configPath: string;
readonly spec: AgentRunLaneSpec;
@@ -312,6 +328,7 @@ export function agentRunLaneSummary(spec: AgentRunLaneSpec): Record<string, unkn
egressProxyUrl: spec.deployment.runner.egressProxyUrl,
noProxyExtra: spec.deployment.runner.noProxyExtra,
retention: spec.deployment.runner.retention,
cancelLifecycle: spec.deployment.runner.cancelLifecycle,
},
localPostgres: spec.deployment.localPostgres,
},
@@ -551,11 +568,39 @@ function parseDeployment(input: Record<string, unknown>, path: string): AgentRun
egressProxyUrl: optionalStringField(runner, "egressProxyUrl", `${path}.runner`) ?? null,
noProxyExtra: optionalStringArrayField(runner, "noProxyExtra", `${path}.runner`),
retention: parseRunnerRetention(recordField(runner, "retention", `${path}.runner`), `${path}.runner.retention`),
cancelLifecycle: parseCancelLifecycle(recordField(runner, "cancelLifecycle", `${path}.runner`), `${path}.runner.cancelLifecycle`),
},
localPostgres: parseLocalPostgres(localPostgres, `${path}.localPostgres`),
};
}
/**
 * Parses one lane's `cancelLifecycle` record into an `AgentRunCancelLifecycleSpec`.
 *
 * Fields are read in a fixed order (lateWriteFencing record, deliveryMode, the three
 * millisecond fields, lateWriteFencing.enabled, eventStages), so the first invalid
 * field determines which helper reports the problem.
 *
 * @param input Raw `cancelLifecycle` record taken from the runner config.
 * @param path Config path of `input`, passed to the field helpers and used to build the nested paths.
 * @returns The parsed cancel lifecycle policy.
 */
function parseCancelLifecycle(input: Record<string, unknown>, path: string): AgentRunCancelLifecycleSpec {
  const fencingRecord = recordField(input, "lateWriteFencing", path);
  const deliveryMode: AgentRunCancelLifecycleSpec["deliveryMode"] = enumField(input, "deliveryMode", path, ["manager-epoch"]);
  const gracefulAbortMs = positiveIntegerField(input, "gracefulAbortMs", path);
  const killEscalationMs = positiveIntegerField(input, "killEscalationMs", path);
  const staleHeartbeatFencingMs = positiveIntegerField(input, "staleHeartbeatFencingMs", path);
  const fencingEnabled = booleanField(fencingRecord, "enabled", `${path}.lateWriteFencing`);
  const eventStages = parseCancelLifecycleStages(input.eventStages, `${path}.eventStages`);
  return {
    deliveryMode,
    gracefulAbortMs,
    killEscalationMs,
    staleHeartbeatFencingMs,
    lateWriteFencing: { enabled: fencingEnabled },
    eventStages,
  };
}
/**
 * Validates the `eventStages` list of a cancel lifecycle policy.
 *
 * @param input Raw value found under `eventStages`.
 * @param path Config path used as the prefix of every error message.
 * @returns The stages in their declared order.
 * @throws Error when `input` is not an array, is empty, holds an entry that is not a
 *   known stage name, or repeats a stage.
 */
function parseCancelLifecycleStages(input: unknown, path: string): readonly AgentRunCancelLifecycleStage[] {
  const allowed: readonly AgentRunCancelLifecycleStage[] = ["accepted", "persisted", "delivered", "aborting", "terminalized", "fenced", "late-write-rejected"];
  if (!Array.isArray(input)) {
    throw new Error(`${path} must be an array`);
  }
  if (input.length === 0) {
    throw new Error(`${path} must declare at least one stage`);
  }
  const toStage = (candidate: unknown, position: number): AgentRunCancelLifecycleStage => {
    const known = typeof candidate === "string" && allowed.includes(candidate as AgentRunCancelLifecycleStage);
    if (!known) {
      throw new Error(`${path}[${position}] must be one of ${allowed.join(", ")}`);
    }
    return candidate as AgentRunCancelLifecycleStage;
  };
  const stages = input.map(toStage);
  // Collect each repeated stage once, in the order its first repeat shows up.
  const seen = new Set<AgentRunCancelLifecycleStage>();
  const repeated = new Set<AgentRunCancelLifecycleStage>();
  stages.forEach((stage) => {
    if (seen.has(stage)) {
      repeated.add(stage);
    } else {
      seen.add(stage);
    }
  });
  if (repeated.size > 0) {
    throw new Error(`${path} must not contain duplicate stages: ${[...repeated].join(", ")}`);
  }
  return stages;
}
function parseRunnerRetention(input: Record<string, unknown>, path: string): AgentRunRunnerRetentionSpec {
const selectors = recordField(input, "selectors", path);
const ageBasedCleanup = recordField(input, "ageBasedCleanup", path);
+8
View File
@@ -1,3 +1,5 @@
// SPEC: PJ2026-01060305 AgentRun execution policy + PJ2026-01020108 cancel lifecycle draft-2026-06-25-p0.
// Renders AgentRun YAML lane policy into runtime manager environment.
import { createHash } from "node:crypto";
import type { AgentRunLaneSpec } from "./agentrun-lanes";
@@ -449,6 +451,12 @@ function managerEnv(spec: AgentRunLaneSpec, sourceCommit: string, imageRef: stri
{ name: "AGENTRUN_RUNNER_RETENTION_JOB_NAME_PREFIXES", value: spec.deployment.runner.retention.selectors.jobNamePrefixes.join(",") },
{ name: "AGENTRUN_RUNNER_RETENTION_AGE_BASED_CLEANUP_ENABLED", value: String(spec.deployment.runner.retention.ageBasedCleanup.enabled) },
...(spec.deployment.runner.retention.ageBasedCleanup.maxAgeHours === null ? [] : [{ name: "AGENTRUN_RUNNER_RETENTION_AGE_BASED_MAX_AGE_HOURS", value: String(spec.deployment.runner.retention.ageBasedCleanup.maxAgeHours) }]),
{ name: "AGENTRUN_CANCEL_DELIVERY_MODE", value: spec.deployment.runner.cancelLifecycle.deliveryMode },
{ name: "AGENTRUN_CANCEL_GRACEFUL_ABORT_MS", value: String(spec.deployment.runner.cancelLifecycle.gracefulAbortMs) },
{ name: "AGENTRUN_CANCEL_KILL_ESCALATION_MS", value: String(spec.deployment.runner.cancelLifecycle.killEscalationMs) },
{ name: "AGENTRUN_CANCEL_STALE_HEARTBEAT_FENCING_MS", value: String(spec.deployment.runner.cancelLifecycle.staleHeartbeatFencingMs) },
{ name: "AGENTRUN_CANCEL_LATE_WRITE_FENCING_ENABLED", value: String(spec.deployment.runner.cancelLifecycle.lateWriteFencing.enabled) },
{ name: "AGENTRUN_CANCEL_EVENT_STAGES", value: spec.deployment.runner.cancelLifecycle.eventStages.join(",") },
...(spec.deployment.runner.egressProxyUrl === null ? [] : [{ name: "AGENTRUN_RUNNER_EGRESS_PROXY_URL", value: spec.deployment.runner.egressProxyUrl }]),
...(spec.deployment.runner.noProxyExtra.length === 0 ? [] : [{ name: "AGENTRUN_RUNNER_NO_PROXY_EXTRA", value: spec.deployment.runner.noProxyExtra.join(",") }]),
{ name: "AGENTRUN_API_KEY", valueFrom: { secretKeyRef: spec.deployment.manager.apiKeySecretRef } },
+128 -7
View File
@@ -1,3 +1,5 @@
// SPEC: PJ2026-01020108 cancel lifecycle + PJ2026-01020305 cancel control + PJ2026-01060305 AgentRun execution policy draft-2026-06-25-p0.
// Exposes AgentRun cancel lifecycle policy and dry-run visibility in the UniDesk CLI.
import { chmodSync, copyFileSync, existsSync, readFileSync, statSync, writeFileSync } from "node:fs";
import { join } from "node:path";
import { spawnSync } from "node:child_process";
@@ -13,6 +15,7 @@ import {
agentRunPipelineRunName,
agentRunProviderCredentialRefs,
resolveAgentRunLaneTarget,
type AgentRunCancelLifecycleSpec,
type AgentRunLaneSpec,
} from "./agentrun-lanes";
import {
@@ -620,7 +623,7 @@ async function resourceCancel(config: UniDeskConfig | null, command: string, act
if (options.reason !== null) cancelArgs.push("--reason", options.reason);
if (ref.kind === "command") cancelArgs.push("--run-id", options.runId ?? requiredContext("command cancel", "--run <runId>"));
if (options.dryRun) {
const result = agentRunResourceCancelDryRunPlan(ref, options, rerunWithoutDryRun(command));
const result = agentRunResourceCancelDryRunPlan(config, ref, options, rerunWithoutDryRun(command));
return renderMutationSummary(command, result, options, `Planned cancel ${ref.kind}/${shortId(ref.name)}`, [rerunWithoutDryRun(command)]);
}
const result = ref.kind === "task"
@@ -636,21 +639,108 @@ async function resourceCancel(config: UniDeskConfig | null, command: string, act
return renderMutationSummary(command, result, options, `${options.dryRun ? "Planned cancel" : "Cancel requested"} ${ref.kind}/${shortId(ref.name)}`, options.dryRun ? [rerunWithoutDryRun(command)] : undefined);
}
function agentRunResourceCancelDryRunPlan(ref: AgentRunResourceRef, options: AgentRunResourceOptions, confirmCommand: string): Record<string, unknown> {
/**
 * Builds the dry-run plan for cancelling a task, session, run or command reference.
 *
 * The optional `--reason` is copied into the request body, the cancel lifecycle
 * disclosure is computed once via `agentRunCancelLifecycleDryRunDisclosure`, and the
 * plan itself is produced by `agentRunDryRunPlan` with the REST path for the ref kind.
 * Command cancels additionally carry a `commandRef` with the owning run id.
 *
 * NOTE(review): this span comes from a diff view that interleaves pre- and post-change
 * lines. The three `return agentRunDryRunPlan(...)` lines without a `cancelLifecycle`
 * argument look like the superseded versions of the three that follow them — confirm
 * against the real file before relying on this listing as source.
 *
 * @throws Error when `ref.kind` is not task, session, run or command.
 */
function agentRunResourceCancelDryRunPlan(config: UniDeskConfig | null, ref: AgentRunResourceRef, options: AgentRunResourceOptions, confirmCommand: string): Record<string, unknown> {
const body: Record<string, unknown> = {};
if (options.reason !== null) body.reason = options.reason;
if (ref.kind === "task") return agentRunDryRunPlan("task-cancel", `/api/v1/queue/tasks/${encodeURIComponent(ref.name)}/cancel`, body, confirmCommand);
if (ref.kind === "session") return agentRunDryRunPlan("session-cancel", `/api/v1/sessions/${encodeURIComponent(ref.name)}/control`, { action: "cancel", ...body }, confirmCommand);
if (ref.kind === "run") return agentRunDryRunPlan("run-cancel", `/api/v1/runs/${encodeURIComponent(ref.name)}/cancel`, body, confirmCommand);
const cancelLifecycle = agentRunCancelLifecycleDryRunDisclosure(config, ref, options);
if (ref.kind === "task") return agentRunDryRunPlan("task-cancel", `/api/v1/queue/tasks/${encodeURIComponent(ref.name)}/cancel`, body, confirmCommand, "POST", { cancelLifecycle });
if (ref.kind === "session") return agentRunDryRunPlan("session-cancel", `/api/v1/sessions/${encodeURIComponent(ref.name)}/control`, { action: "cancel", ...body }, confirmCommand, "POST", { cancelLifecycle });
if (ref.kind === "run") return agentRunDryRunPlan("run-cancel", `/api/v1/runs/${encodeURIComponent(ref.name)}/cancel`, body, confirmCommand, "POST", { cancelLifecycle });
if (ref.kind === "command") {
// presumably requiredContext raises when --run is missing — confirm against its definition
const runId = options.runId ?? requiredContext("command cancel", "--run <runId>");
return agentRunDryRunPlan("command-cancel", `/api/v1/commands/${encodeURIComponent(ref.name)}/cancel`, body, confirmCommand, "POST", {
commandRef: { runId, commandId: ref.name, valuesPrinted: false },
cancelLifecycle,
});
}
throw new Error("cancel supports task/<taskId>, session/<sessionId>, run/<runId>, or command/<commandId>");
}
/**
 * Builds the `cancelLifecycle` disclosure attached to a cancel dry-run plan.
 *
 * The lane policy is resolved through `resolveAgentRunCancelPolicyTarget`. When no
 * policy target is available, `expectedStages` is empty, `runnerAbort` is null and the
 * fencing section reports its policy source as unavailable.
 *
 * The `verification` section lists follow-up CLI commands. For a command target the
 * `describe` command includes `--run <runId>`, matching the `result command/... --run`
 * command emitted next to it; a command id alone does not identify the command.
 *
 * @param config Loaded UniDesk config, or null when none is available.
 * @param ref Resource being cancelled.
 * @param options Parsed CLI options; `runId`, `node` and `lane` are read here or downstream.
 * @returns A plain record for rendering; every nested section carries `valuesPrinted: false`.
 */
function agentRunCancelLifecycleDryRunDisclosure(config: UniDeskConfig | null, ref: AgentRunResourceRef, options: AgentRunResourceOptions): Record<string, unknown> {
  const target = resolveAgentRunCancelPolicyTarget(config, options);
  const policy = target?.spec.deployment.runner.cancelLifecycle ?? null;
  const runId = options.runId ?? null;
  // A run target is its own run id; any other target needs --run to locate events.
  const eventsRunId = ref.kind === "run" ? ref.name : runId;
  const cli = "bun scripts/cli.ts agentrun";
  const describeCommand = ref.kind === "command"
    ? `${cli} describe command/${ref.name} --run ${runId ?? "<runId>"}`
    : `${cli} describe ${ref.kind}/${ref.name}`;
  return {
    specRefs: ["PJ2026-01020108", "PJ2026-01020305", "PJ2026-01060305"],
    authority: agentRunCancelAuthorityDisclosure(target),
    targetRef: {
      kind: ref.kind,
      name: ref.name,
      runId,
      valuesPrinted: false,
    },
    cascadeScope: agentRunCancelCascadeScope(ref.kind),
    terminalAuthority: "AgentRun Core canceled terminal/result event",
    expectedStages: policy?.eventStages ?? [],
    runnerAbort: policy === null ? null : agentRunCancelRunnerAbortDisclosure(policy),
    fencing: agentRunCancelFencingDisclosure(policy),
    verification: {
      describe: describeCommand,
      events: eventsRunId !== null ? `${cli} events run/${eventsRunId} --after-seq 0` : null,
      logs: ref.kind === "session" ? `${cli} logs session/${ref.name} --tail 100` : null,
      result: ref.kind === "command" ? `${cli} result command/${ref.name} --run ${runId ?? "<runId>"}` : null,
      valuesPrinted: false,
    },
    valuesPrinted: false,
  };
}
/**
 * Picks the lane whose cancel lifecycle policy a dry-run should disclose.
 *
 * An already active REST lane target wins and is reported as `"selected-lane"`.
 * Otherwise the lane is resolved from `--node/--lane`; the source is `"default-lane"`
 * only when neither option was given.
 *
 * @returns The lane target plus its source, or null when there is no active lane
 *   target and `config` is null.
 */
function resolveAgentRunCancelPolicyTarget(config: UniDeskConfig | null, options: AgentRunResourceOptions): { configPath: string; spec: AgentRunLaneSpec; source: "selected-lane" | "default-lane" } | null {
  const activeTarget = activeAgentRunRestTarget;
  if (activeTarget !== null) {
    return { configPath: activeTarget.configPath, spec: activeTarget.spec, source: "selected-lane" };
  }
  if (config === null) {
    return null;
  }
  const resolved = resolveAgentRunLaneTarget({ node: options.node, lane: options.lane });
  const usesDefaultLane = options.node === null && options.lane === null;
  return {
    configPath: resolved.configPath,
    spec: resolved.spec,
    source: usesDefaultLane ? "default-lane" : "selected-lane",
  };
}
/**
 * Describes which manager a cancel would be sent to and where its policy came from.
 *
 * The transport is `"lane-k8s-service-proxy"` while a REST lane target is active and
 * `"direct-http"` otherwise. The direct manager base URL is only looked up on the
 * direct-http path. Lane fields are null when `target` is null.
 *
 * @param target Resolved policy target, or null when none could be resolved.
 * @returns Metadata record for rendering; carries `valuesPrinted: false`.
 */
function agentRunCancelAuthorityDisclosure(target: { configPath: string; spec: AgentRunLaneSpec; source: "selected-lane" | "default-lane" } | null): Record<string, unknown> {
  const viaLaneProxy = activeAgentRunRestTarget !== null;
  const transport = viaLaneProxy ? "lane-k8s-service-proxy" : "direct-http";
  const policySource = target?.source ?? "unavailable";
  const node = target?.spec.nodeId ?? null;
  const lane = target?.spec.lane ?? null;
  const namespace = target?.spec.runtime.namespace ?? null;
  const managerDeployment = target?.spec.runtime.managerDeployment ?? null;
  let baseUrl: unknown;
  if (viaLaneProxy) {
    baseUrl = target?.spec.runtime.internalBaseUrl ?? null;
  } else {
    baseUrl = agentRunDirectManagerBaseUrl();
  }
  const laneConfigPath = target?.configPath ?? null;
  return {
    transport,
    policySource,
    node,
    lane,
    namespace,
    managerDeployment,
    baseUrl,
    laneConfigPath,
    valuesPrinted: false,
  };
}
/**
 * Reads the manager base URL from the AgentRun client config.
 *
 * @returns The configured base URL, or null when reading the config throws.
 */
function agentRunDirectManagerBaseUrl(): string | null {
  let baseUrl: string | null = null;
  try {
    baseUrl = readAgentRunClientConfig().manager.baseUrl;
  } catch {
    // Any failure while reading the client config is reported as "no base URL".
  }
  return baseUrl;
}
/**
 * Projects the runner-abort fields of a cancel lifecycle policy into a plain
 * disclosure record.
 *
 * @param policy Cancel lifecycle policy to read from.
 * @returns `deliveryMode`, `gracefulAbortMs` and `killEscalationMs` copied
 *          from the policy, plus a constant `valuesPrinted: false` marker.
 */
function agentRunCancelRunnerAbortDisclosure(policy: AgentRunCancelLifecycleSpec): Record<string, unknown> {
  const { deliveryMode, gracefulAbortMs, killEscalationMs } = policy;
  return { deliveryMode, gracefulAbortMs, killEscalationMs, valuesPrinted: false };
}
/**
 * Builds the "fencing" section of a cancel disclosure.
 *
 * `cancelEpoch` is always reported as true. Without a policy the record only
 * notes that the policy source is unavailable; with a policy it carries the
 * stale-heartbeat fencing window and whether late-write fencing is enabled.
 *
 * @param policy Cancel lifecycle policy, or null when none is available.
 * @returns A plain disclosure record ending with `valuesPrinted: false`.
 */
function agentRunCancelFencingDisclosure(policy: AgentRunCancelLifecycleSpec | null): Record<string, unknown> {
  const disclosure: Record<string, unknown> = { cancelEpoch: true };
  if (policy === null) {
    disclosure.policySource = "unavailable";
  } else {
    disclosure.staleHeartbeatFencingMs = policy.staleHeartbeatFencingMs;
    disclosure.lateWriteFencing = policy.lateWriteFencing.enabled;
  }
  // Appended last so the key order matches in both branches.
  disclosure.valuesPrinted = false;
  return disclosure;
}
/**
 * Describes, as a human-readable chain, what a cancel on the given resource
 * kind cascades to.
 *
 * @param kind Resource kind being cancelled.
 * @returns The cascade description for task, session, run and command;
 *          "unsupported cancel target" for any other kind.
 */
function agentRunCancelCascadeScope(kind: AgentRunResourceKind): string {
  switch (kind) {
    case "task":
      return "current task attempt -> run -> active command -> runner job";
    case "session":
      return "session active work -> active run/command -> session-scoped background work";
    case "run":
      return "run active commands -> runner jobs -> run terminal";
    case "command":
      return "single command -> current runner job; session remains reusable";
    default:
      return "unsupported cancel target";
  }
}
async function resourceDispatch(config: UniDeskConfig | null, command: string, action: string | undefined, args: string[], options: AgentRunResourceOptions): Promise<RenderedCliResult> {
const ref = parseResourceRef(action, args, "task");
if (ref.kind !== "task") throw new Error("dispatch supports task/<taskId>");
@@ -775,16 +865,47 @@ function renderMutationSummary(command: string, raw: Record<string, unknown>, op
if (id !== null) lines.push(`Name: ${id}`);
const decision = stringOrNull(data.decision);
const internalCommandType = stringOrNull(data.internalCommandType);
if (data.dryRun !== undefined) lines.push(`DryRun: ${String(data.dryRun)}`);
if (data.mutation !== undefined) lines.push(`Mutation: ${String(data.mutation)}`);
const dryRun = data.dryRun !== undefined ? data.dryRun : raw.dryRun;
const mutation = data.mutation !== undefined ? data.mutation : raw.mutation;
if (dryRun !== undefined) lines.push(`DryRun: ${String(dryRun)}`);
if (mutation !== undefined) lines.push(`Mutation: ${String(mutation)}`);
if (decision !== null) lines.push(`Decision: ${decision}`);
if (internalCommandType !== null) lines.push(`InternalCommandType: ${internalCommandType}`);
lines.push(...renderCancelLifecycleMutationLines(record(data.cancelLifecycle ?? raw.cancelLifecycle)));
const next = record(raw.next ?? data.next);
const nextLines = (overrideNextLines ?? Object.values(next).map(String)).filter((line) => line.length > 0).slice(0, 5);
if (nextLines.length > 0) lines.push("", "Next:", ...nextLines.map((line) => ` ${line}`));
return renderedCliResult(raw.ok !== false, command, lines.join("\n"));
}
/**
 * Renders the `CancelLifecycle:` section of a mutation summary.
 *
 * An empty lifecycle record yields no lines at all. Otherwise the section
 * starts with a blank separator line and the header, always followed by the
 * Authority line. The Runtime, Cascade, RunnerAbort, Fencing, Stages and
 * Terminal lines are each emitted only when their data is present.
 *
 * @param lifecycle Cancel lifecycle disclosure record (possibly empty).
 * @returns The lines to append to the summary, in display order.
 */
function renderCancelLifecycleMutationLines(lifecycle: Record<string, unknown>): string[] {
  if (Object.keys(lifecycle).length === 0) {
    return [];
  }

  const authority = record(lifecycle.authority);
  const abort = record(lifecycle.runnerAbort);
  const fence = record(lifecycle.fencing);
  const stages = Array.isArray(lifecycle.expectedStages)
    ? lifecycle.expectedStages.map(String).filter((stage) => stage.length > 0)
    : [];

  // The lane label needs both halves; a partial node/lane pair is shown as "-".
  const nodeId = stringOrNull(authority.node);
  const laneId = stringOrNull(authority.lane);
  const laneLabel = nodeId === null || laneId === null ? "-" : `${nodeId}/${laneId}`;

  const out: string[] = ["", "CancelLifecycle:"];
  out.push(`  Authority: ${displayValue(authority.transport)} policy=${displayValue(authority.policySource)} lane=${laneLabel}`);

  const ns = stringOrNull(authority.namespace);
  const manager = stringOrNull(authority.managerDeployment);
  if (!(ns === null && manager === null)) {
    out.push(`  Runtime: ns=${displayValue(ns)} manager=${displayValue(manager)}`);
  }

  const cascade = stringOrNull(lifecycle.cascadeScope);
  if (cascade !== null) {
    out.push(`  Cascade: ${cascade}`);
  }

  if (Object.keys(abort).length > 0) {
    out.push(`  RunnerAbort: mode=${displayValue(abort.deliveryMode)} gracefulMs=${displayValue(abort.gracefulAbortMs)} killMs=${displayValue(abort.killEscalationMs)}`);
  }

  if (Object.keys(fence).length > 0) {
    out.push(`  Fencing: cancelEpoch=${displayValue(fence.cancelEpoch)} staleHeartbeatMs=${displayValue(fence.staleHeartbeatFencingMs)} lateWrite=${displayValue(fence.lateWriteFencing)}`);
  }

  if (stages.length > 0) {
    out.push(`  Stages: ${stages.join(", ")}`);
  }

  const terminal = stringOrNull(lifecycle.terminalAuthority);
  if (terminal !== null) {
    out.push(`  Terminal: ${terminal}`);
  }

  return out;
}
/**
 * Builds the "run it for real" command line from a dry-run invocation.
 *
 * Only the standalone `--dry-run` flag is removed, together with an optional
 * `=value` suffix. A word boundary is not enough to delimit the flag because
 * `-` and `=` both count as boundaries: `--dry-run-report` would be mangled
 * into `-report`, and `--dry-run=true` would leave `=true` glued onto the
 * preceding token. The flag must therefore be followed by whitespace or the
 * end of the string.
 *
 * @param command CLI arguments without the `bun scripts/cli.ts` prefix.
 * @returns The full command line with every standalone `--dry-run` removed.
 */
function rerunWithoutDryRun(command: string): string {
  const withoutDryRun = command.replace(/\s+--dry-run(?:=\S*)?(?=\s|$)/gu, "").trim();
  return `bun scripts/cli.ts ${withoutDryRun}`;
}