diff --git a/.agents/skills/unidesk-cicd/SKILL.md b/.agents/skills/unidesk-cicd/SKILL.md index a26a8b6d..5829e2f1 100644 --- a/.agents/skills/unidesk-cicd/SKILL.md +++ b/.agents/skills/unidesk-cicd/SKILL.md @@ -130,7 +130,7 @@ PipelineRun `gitops-promote` 如果报 git mirror 控制面漂移、refs 不一 D601 `v0.3` 固定 worktree 的 fetch remote 是 node-local git mirror。GitHub PR 合并后,如果 `D601:/home/ubuntu/workspace/hwlab-v03` 中 `git fetch origin v0.3` 仍看不到最新 merge commit,先执行 `hwlab nodes git-mirror sync --node D601 --lane v03 --confirm --wait`,再在固定 worktree `git fetch origin v0.3 && git pull --ff-only origin v0.3`。`trigger-current --lane v03` 会为 PipelineRun 做 mirror pre-sync,但不替代固定 worktree 的 fetch hygiene。promotion 后若 node-local `git-mirror status` 显示 `pendingFlush=true`,执行 node-local flush 并等到 `pendingFlush=false`、`githubInSync=true`。 -D601/node-scoped mirror status 的 `githubGitops` 来自本地 mirror cache 的 `refs/mirror-stage/...`。如果 `hwlab nodes git-mirror flush --node D601 --lane v03 --confirm --wait` 的日志已经显示 `v0.3-gitops -> v0.3-gitops` 推送成功,但随后因 GitHub SSH `kex_exchange_identification` 或 fetch 确认失败导致命令 exit 44、status 仍显示 `pendingFlush=true`,不要连续盲目 flush;先执行 `hwlab nodes git-mirror sync --node D601 --lane v03 --confirm --wait` 刷新 mirror-stage,再用 status 确认 `localGitops=githubGitops`、`pendingFlush=false`、`githubInSync=true`。 +D601/node-scoped mirror status 的 `githubGitops` 来自本地 mirror cache 的 `refs/mirror-stage/...`;`status` 输出应通过 `refSources.githubFieldsAreMirrorStageCache=true` 显示这一点。`hwlab nodes git-mirror flush --node D601 --lane v03 --confirm --wait` 如果已经显示 `v0.3-gitops -> v0.3-gitops` 推送成功,但随后因 GitHub SSH `kex_exchange_identification` 或 fetch 确认失败导致命令非零退出,会标记 `partialSuccess=push-succeeded-fetch-failed` 并把下一步指向 `hwlab nodes git-mirror sync --node D601 --lane v03 --confirm --wait`。不要连续盲目 flush;先刷新 mirror-stage,再用 status 确认 `localGitops=githubGitops`、`pendingFlush=false`、`githubInSync=true`。 --- diff --git a/docs/reference/cli.md b/docs/reference/cli.md index f6d92b3c..9f541e73 100644 
--- a/docs/reference/cli.md +++ b/docs/reference/cli.md @@ -28,6 +28,8 @@ G14/D601 v03 的 bootstrap admin password 是 HWLAB runtime Secret 生命周期 `hwlab nodes control-plane infra plan|status|apply --node D601 --lane v03` 是 D601 HWLAB v03 节点本地 CI/CD 与 git-mirror 前置控制面的 YAML 驱动入口,配置真相源是 `config/hwlab-node-control-plane.yaml`。`plan` 只读展示 YAML target 和将渲染的 control-plane 对象;`status` 只读观察 D601 Tekton、CI namespace、git-mirror、Argo、node-local registry 和 tools image readiness;`apply --dry-run` 只输出 manifest 摘要;`apply --confirm` 只收敛 D601 control-plane bootstrap 对象,不触发 HWLAB runtime rollout,不创建 PK01 DB,也不修改 Caddy/FRP。tools image 的 node-local registry 地址只能作为输出 artifact;输入 base image 必须由 YAML 声明为公开 registry 来源,缺少 output image 时应在 `status.next.blockers` 中体现,而不是把现有 node-local image 当成输入基础镜像。 +`hwlab nodes git-mirror status|sync|flush --node --lane ` 是 node-scoped runtime lane 的 Git mirror 维护入口。`status` 的 `githubSource` / `githubGitops` 来自本地 mirror cache 的 `refs/mirror-stage/...`,不是实时 GitHub API;输出中的 `refSources.githubFieldsAreMirrorStageCache=true` 和 `refSources.cacheRefresh` 给出这一来源和刷新命令。`flush --confirm --wait` 如果已经把 GitOps ref push 到 GitHub,但 post-push fetch/recheck 因 transient SSH 失败而无法刷新 mirror-stage,会返回 `partialSuccess=push-succeeded-fetch-failed`、`degradedReason=node-runtime-git-mirror-flush-post-push-fetch-failed`,并把下一步指向 `sync --confirm --wait` 后再查 `status`;不要把这种 partial success 解读为需要连续盲目 flush。 + `hwlab nodes control-plane infra tools-image status|build|logs --node D601 --lane v03` 是 D601 tools image 的受控入口。Dockerfile 必须由 `config/hwlab-node-control-plane.yaml` 的 `tekton.toolsImage.dockerfileInline` 声明,输入镜像必须列在 `publicBaseImages`,构建参数和网络模式也来自 YAML;confirmed build 只在 D601 后台异步构建并推送到 node-local registry,返回 status/logs 轮询命令。`hwlab nodes control-plane infra argo status|apply|logs --node D601 --lane v03` 是 D601 Argo CD 的声明式安装入口。Argo 版本、官方 manifest URL、镜像 rewrite/preload、field manager、imagePullPolicy、CRD 列表、期望 Deployment/StatefulSet 以及生成的 AppProject/Application 都必须来自同一个 YAML;`argo 
apply --confirm` 只执行可重复 server-side apply 和后台轮询,不把原生 `kubectl apply`、手工 Argo CLI 或临时 manifest 作为正式安装路径。 `hwlab nodes control-plane runtime-image status|preload|build --node G14 --lane v03` 是 G14 v0.3 runtime lane base image 的受控入口。输入公开来源来自 `config/hwlab-node-lanes.yaml` 的 `baseImageSource`,输出目标来自同一 lane 的 `baseImage`;`status` 只读检查 node-local registry tag 与 source/target image presence,`preload --confirm` 按 YAML 下载配置执行 source pull/tag/push,`build` 当前只是 preload 别名。这个入口用于 `trigger-current` 前置检查和 base image 缺失恢复;后续 service build 失败应按失败 TaskRun 单独分流。 @@ -163,7 +165,7 @@ PW 长时操作采用 Fire-and-Forget 模式:CLI 创建 `.state/jobs/{jobId}.json`,后台进程执行真实命令,并将 stdout、stderr 分别写入 `.state/jobs/{jobId}.stdout.log` 与 `.state/jobs/{jobId}.stderr.log`。调用者通过 `bun scripts/cli.ts job status ` 查询进度和尾部输出。 -异步 job 的返回值只表示控制动作已经排入后台执行,不表示目标运行面对象已经创建或收敛。所有带 `statusCommand` 的返回都必须先用 `job status ` 查看 `progress.stage`、`progress.stageStatus`、关键对象名和 `nextCommand`;只有 progress 已进入对应创建/完成阶段后,才进入更重的运行面 status。对于 `hwlab g14 control-plane trigger-current --lane v02 --confirm`,`progress.pipelineRun` 在 refresh 或 mirror pre-sync 阶段可能只是预期 PipelineRun 名称;在 `progress.stage=create-pipelinerun` 且 `progress.pipelineCreated=true` 前,`control-plane status --pipeline-run ` 返回 not found 只能说明 PipelineRun 尚未创建,不能当作 CI/CD 失败。对于 `git-mirror sync|flush --confirm`,先看 job progress 和 timing 摘要,再用对应 `git-mirror status` 确认 `pendingFlush`、local/github refs 和 `githubInSync`。 +异步 job 的返回值只表示控制动作已经排入后台执行,不表示目标运行面对象已经创建或收敛。所有带 `statusCommand` 的返回都必须先用 `job status ` 查看 `progress.stage`、`progress.stageStatus`、关键对象名和 `nextCommand`;只有 progress 已进入对应创建/完成阶段后,才进入更重的运行面 status。对于 `hwlab g14 control-plane trigger-current --lane v02 --confirm`,`progress.pipelineRun` 在 refresh 或 mirror pre-sync 阶段可能只是预期 PipelineRun 名称;在 `progress.stage=create-pipelinerun` 且 `progress.pipelineCreated=true` 前,`control-plane status --pipeline-run ` 返回 not found 只能说明 PipelineRun 尚未创建,不能当作 CI/CD 失败。对于 `git-mirror sync|flush --confirm`,先看 job progress 和 timing 摘要,再用对应 
`git-mirror status` 确认 `pendingFlush`、local/github refs 和 `githubInSync`;node-scoped `flush` 的 progress 若出现 `partialSuccess=push-succeeded-fetch-failed`,`nextCommand` 会直接指向同 node/lane 的 `sync --confirm --wait` 来刷新 mirror-stage cache。 定点状态查询优先使用后台 job 输出的稳定对象名:PipelineRun 用 `hwlab g14 control-plane status --lane v02 --pipeline-run `,已知 source commit 用 `--source-commit `。不要在 source branch 可能继续推进时用默认最新 head status 判定历史 run 成败;默认最新 head 口径只适合判断当前 lane 是否整体最新。`control-plane status` 会汇总 source、mirror、Tekton、Argo、runtime workload 和公网探针,可能比普通只读命令慢;高频轮询应先用 `job status` 或更窄的 status,完整 status 留给阶段收口和异常定位。 diff --git a/docs/reference/g14.md b/docs/reference/g14.md index f8604dd7..36927799 100644 --- a/docs/reference/g14.md +++ b/docs/reference/g14.md @@ -81,7 +81,7 @@ The `devops-infra` git mirror/relay remains manual and CLI-controlled, not CronJ After a `v0.2` PipelineRun completes, treat runtime rollout and remote GitOps persistence as two separate checks. `hwlab g14 control-plane status --lane v02` is the runtime check: it must show the expected source commit, PipelineRun completed, Argo `Synced/Healthy`, public 19666/19667 probes passing, and Cloud Web asset probes such as `/app.js` readable. `hwlab g14 git-mirror status` is the persistence check: `cache.summary.pendingFlush` must be false and `cache.summary.githubInSync` true before declaring GitOps fully flushed back to GitHub. The PR monitor performs this flush automatically for its own merged PRs and records the result in the PR comment. Manual operators should run `bun scripts/cli.ts hwlab g14 git-mirror flush --confirm` and poll the returned job with `bun scripts/cli.ts job status --tail-bytes 12000` only when they used lower-level manual trigger/status paths or when the monitor reports a flush failure; do not replace this with raw `kubectl`, native `git push`, or a long SSH wait. 
-For D601/node-scoped runtime lanes, `hwlab nodes git-mirror status --node <node> --lane <lane>` reports GitHub refs from the mirror cache's `refs/mirror-stage/...`, not from a live GitHub API request. A `flush --wait` run can push the GitOps ref successfully and still exit non-zero if the follow-up fetch/verification hits a transient GitHub SSH error such as `kex_exchange_identification`. When the flush log already shows the ref update was pushed but status still reports `pendingFlush=true`, do not keep repeating blind flush attempts; first run the corresponding controlled `hwlab nodes git-mirror sync --node <node> --lane <lane> --confirm --wait` to refresh `refs/mirror-stage/...`, then recheck status for `localGitops=githubGitops`, `pendingFlush=false`, and `githubInSync=true`. +For D601/node-scoped runtime lanes, `hwlab nodes git-mirror status --node <node> --lane <lane>` reports GitHub refs from the mirror cache's `refs/mirror-stage/...`, not from a live GitHub API request. The status output must keep that source visible through `refSources.githubFieldsAreMirrorStageCache=true`. A `flush --wait` run can push the GitOps ref successfully and still exit non-zero if the follow-up fetch/verification hits a transient GitHub SSH error such as `kex_exchange_identification`; this condition is reported as `partialSuccess=push-succeeded-fetch-failed` with the next step set to the corresponding controlled `hwlab nodes git-mirror sync --node <node> --lane <lane> --confirm --wait`. Do not keep repeating blind flush attempts; refresh `refs/mirror-stage/...` with `sync --confirm --wait`, then recheck status for `localGitops=githubGitops`, `pendingFlush=false`, and `githubInSync=true`. 
If `gitops-promote` fails because the git mirror control plane drifted, refs are inconsistent, or publish/flush did not complete, recover through the controlled mirror path: `hwlab g14 git-mirror apply --confirm` to reinstall the current hook/ConfigMap, `hwlab g14 git-mirror sync --confirm --wait` to realign source and GitOps refs, then a targeted `control-plane cleanup-runs --pipeline-run --confirm` before retriggering the same lane. The old branch/path allowlist gate has been removed; do not restore it, patch the hook inside the pod, delete PipelineRuns with raw kubectl, or bypass `git-mirror flush`. Closeout still requires the target PipelineRun status, Argo health, public probes, and `git-mirror status` with `pendingFlush=false`. diff --git a/scripts/src/hwlab-node-control-plane.ts b/scripts/src/hwlab-node-control-plane.ts index 84fc2e50..a35690d2 100644 --- a/scripts/src/hwlab-node-control-plane.ts +++ b/scripts/src/hwlab-node-control-plane.ts @@ -1085,6 +1085,13 @@ console.log(JSON.stringify({ githubSource: first.githubSource || null, localGitops: first.localGitops || null, githubGitops: first.githubGitops || null, + refSources: { + localSource: 'refs/heads/' + (first.sourceBranch || ''), + githubSource: 'refs/mirror-stage/heads/' + (first.sourceBranch || ''), + localGitops: 'refs/heads/' + (first.gitopsBranch || ''), + githubGitops: 'refs/mirror-stage/heads/' + (first.gitopsBranch || ''), + githubFieldsAreMirrorStageCache: true + }, pendingFlush, flushNeeded: pendingFlush, githubInSync: Object.values(items).every((item) => item.sourceInSync === true && item.gitopsInSync === true), @@ -1203,18 +1210,49 @@ function gitMirrorFlushShell(node: ControlPlaneNodeSpec, target: ControlPlaneTar "test -d \"$repo/objects\"", "git --git-dir=\"$repo\" remote set-url origin \"$remote\" || git --git-dir=\"$repo\" remote add origin \"$remote\"", "local_gitops=$(git --git-dir=\"$repo\" rev-parse --verify \"refs/heads/${gitops_branch}^{commit}\" 2>/dev/null || true)", + 
"push_status=skipped", + "push_exit=0", + "fetch_status=skipped", + "fetch_exit=0", "if [ -n \"$local_gitops\" ]; then", + " set +e", " git --git-dir=\"$repo\" -c remote.origin.mirror=false push origin \"refs/heads/${gitops_branch}:refs/heads/${gitops_branch}\"", - " git --git-dir=\"$repo\" fetch origin \"+refs/heads/${gitops_branch}:refs/mirror-stage/heads/${gitops_branch}\"", + " push_exit=$?", + " set -e", + " if [ \"$push_exit\" = \"0\" ]; then", + " push_status=succeeded", + " set +e", + " git --git-dir=\"$repo\" fetch origin \"+refs/heads/${gitops_branch}:refs/mirror-stage/heads/${gitops_branch}\"", + " fetch_exit=$?", + " set -e", + " if [ \"$fetch_exit\" = \"0\" ]; then fetch_status=succeeded; else fetch_status=failed; fi", + " else", + " push_status=failed", + " fi", "fi", "github_gitops=$(git --git-dir=\"$repo\" rev-parse --verify \"refs/mirror-stage/heads/${gitops_branch}^{commit}\" 2>/dev/null || true)", "pending=false; if [ -n \"$local_gitops\" ] && { [ -z \"$github_gitops\" ] || [ \"$local_gitops\" != \"$github_gitops\" ]; }; then pending=true; fi", - "export repository gitops_branch started_at local_gitops github_gitops pending", + "status=succeeded", + "partial_success=", + "degraded_reason=", + "exit_code=0", + "if [ \"$push_status\" = \"failed\" ]; then", + " status=failed", + " degraded_reason=git-mirror-push-failed", + " exit_code=$push_exit", + "elif [ \"$push_status\" = \"succeeded\" ] && [ \"$fetch_status\" = \"failed\" ]; then", + " status=partial-success", + " partial_success=push-succeeded-fetch-failed", + " degraded_reason=git-mirror-post-push-fetch-failed", + " exit_code=44", + "fi", + "export repository gitops_branch started_at local_gitops github_gitops pending push_status push_exit fetch_status fetch_exit status partial_success degraded_reason", "node <<'NODE' | tee /cache/HWLAB.last-flush.json", - "const payload = { event: 'git-mirror-flush', repo: process.env.repository, status: 'succeeded', startedAt: process.env.started_at, 
flushedAt: new Date().toISOString(), gitopsBranch: process.env.gitops_branch, localGitops: process.env.local_gitops || null, githubGitops: process.env.github_gitops || null, pendingFlush: process.env.pending === 'true' };", + "const payload = { event: 'git-mirror-flush', repo: process.env.repository, status: process.env.status || 'failed', partialSuccess: process.env.partial_success || null, degradedReason: process.env.degraded_reason || null, startedAt: process.env.started_at, flushedAt: new Date().toISOString(), gitopsBranch: process.env.gitops_branch, localGitops: process.env.local_gitops || null, githubGitops: process.env.github_gitops || null, pendingFlush: process.env.pending === 'true', stages: { push: process.env.push_status || null, pushExitCode: Number.parseInt(process.env.push_exit || '0', 10), postPushFetch: process.env.fetch_status || null, postPushFetchExitCode: Number.parseInt(process.env.fetch_exit || '0', 10) } };", "console.log(JSON.stringify(payload));", "NODE", "cat /cache/HWLAB.last-flush.json", + "if [ \"$exit_code\" != \"0\" ]; then exit \"$exit_code\"; fi", "", ].join("\n"); } diff --git a/scripts/src/hwlab-node.ts b/scripts/src/hwlab-node.ts index 75aee89c..ca485b63 100644 --- a/scripts/src/hwlab-node.ts +++ b/scripts/src/hwlab-node.ts @@ -495,6 +495,63 @@ function compactRuntimeCommand(result: CommandResult): Record { }; } +function parseLastJsonLineObject(text: string): Record { + for (const line of text.split(/\r?\n/u).reverse()) { + const trimmed = line.trim(); + if (!trimmed.startsWith("{") || !trimmed.endsWith("}")) continue; + try { + return record(JSON.parse(trimmed) as unknown); + } catch { + // Keep scanning: command tails can mix kubectl status, git output, and JSON payloads. 
+ } + } + return {}; +} + +function escapeRegExp(value: string): string { + return value.replace(/[.*+?^${}()|[\]\\]/gu, "\\$&"); +} + +function nodeRuntimeGitMirrorRefSources(scoped: ReturnType, mirror: NodeRuntimeGitMirrorTargetSpec): Record { + return { + localSource: `refs/heads/${mirror.sourceBranch}`, + githubSource: `refs/mirror-stage/heads/${mirror.sourceBranch}`, + localGitops: `refs/heads/${mirror.gitopsBranch}`, + githubGitops: `refs/mirror-stage/heads/${mirror.gitopsBranch}`, + githubFieldsAreMirrorStageCache: true, + cacheRefresh: `bun scripts/cli.ts hwlab nodes git-mirror sync --node ${scoped.node} --lane ${scoped.lane} --confirm --wait`, + }; +} + +function nodeRuntimeGitMirrorFlushPartialSuccess(scoped: ReturnType, mirror: NodeRuntimeGitMirrorTargetSpec, result: CommandResult): Record | null { + if (scoped.action !== "flush") return null; + const text = `${result.stdout}\n${result.stderr}`; + const payload = parseLastJsonLineObject(text); + const partialSuccess = typeof payload.partialSuccess === "string" && payload.partialSuccess.length > 0 + ? payload.partialSuccess + : null; + const payloadStatus = typeof payload.status === "string" && payload.status.length > 0 ? payload.status : null; + const branch = escapeRegExp(mirror.gitopsBranch); + const pushSucceededInGitOutput = new RegExp(`\\b${branch}\\s*->\\s*${branch}\\b`, "u").test(text); + const postPushFetchFailed = /kex_exchange_identification|Connection closed by remote host|Could not read from remote repository|fatal:.*fetch|fetch-pack|early EOF/iu.test(text); + if (partialSuccess !== "push-succeeded-fetch-failed" && !(result.exitCode !== 0 && pushSucceededInGitOutput && postPushFetchFailed)) { + return null; + } + return { + status: payloadStatus ?? 
"partial-success", + partialSuccess: "push-succeeded-fetch-failed", + degradedReason: "node-runtime-git-mirror-flush-post-push-fetch-failed", + retryable: true, + message: "GitOps push appears to have succeeded, but the post-push fetch/recheck failed. Refresh mirror-stage cache before deciding another flush is needed.", + payload: Object.keys(payload).length > 0 ? payload : null, + refSources: nodeRuntimeGitMirrorRefSources(scoped, mirror), + next: { + sync: `bun scripts/cli.ts hwlab nodes git-mirror sync --node ${scoped.node} --lane ${scoped.lane} --confirm --wait`, + status: `bun scripts/cli.ts hwlab nodes git-mirror status --node ${scoped.node} --lane ${scoped.lane}`, + }, + }; +} + function sshTcpPoolDiagnosticsFromCommand(spec: HwlabRuntimeLaneSpec, result: CommandResult): Record | null { if (isCommandSuccess(result)) return null; const failureKind = classifySshTcpPoolFailure(`${result.stderr}\n${result.stdout}`); @@ -1347,6 +1404,7 @@ function nodeRuntimeGitMirrorStatus(scoped: ReturnType