From 1e0140fdcd24c27c028cc91ef0ed2244d6904833 Mon Sep 17 00:00:00 2001 From: Codex Date: Sun, 17 May 2026 01:22:33 +0000 Subject: [PATCH 01/14] fix code queue enqueue view preservation --- scripts/src/e2e.ts | 57 +++++++++++- src/components/frontend/src/code-queue.tsx | 100 ++++++++++++++++++++- 2 files changed, 152 insertions(+), 5 deletions(-) diff --git a/scripts/src/e2e.ts b/scripts/src/e2e.ts index 90a10354..628bb160 100644 --- a/scripts/src/e2e.ts +++ b/scripts/src/e2e.ts @@ -564,12 +564,14 @@ function isCodeQueueTaskEnqueueRequest(url: string, method: string): boolean { async function runCodeQueueEnqueueAwaitSmoke(page: Page): Promise { const marker = `e2e-await-enqueue-${Date.now()}-${Math.random().toString(16).slice(2, 8)}`; + const submitQueueId = "e2e-submit-smoke"; const prompt = [ `Code Queue await enqueue smoke ${marker}`, "", "This task is created by the frontend E2E smoke test to verify that the enqueue submit path awaits the backend response before unlocking the form.", ].join("\n"); let delayedPostCount = 0; + const taskOverviewRequests: string[] = []; const routePattern = "**/api/microservices/code-queue/proxy/api/tasks**"; const routeHandler = async (route: any, request: any): Promise => { if (!isCodeQueueTaskEnqueueRequest(request.url(), request.method())) { @@ -580,11 +582,39 @@ async function runCodeQueueEnqueueAwaitSmoke(page: Page): Promise { await new Promise((resolve) => setTimeout(resolve, 900)); await route.continue(); }; + const onRequest = (request: any): void => { + try { + const parsed = new URL(request.url()); + if (request.method() === "GET" && parsed.pathname === "/api/microservices/code-queue/proxy/api/tasks/overview") { + taskOverviewRequests.push(parsed.search); + } + } catch { + // ignore non-URL requests + } + }; + page.on("request", onRequest); await page.route(routePattern, routeHandler); try { await page.getByTestId("code-queue-filter-select").selectOption("__all__").catch(() => undefined); - await page.getByTestId("code-queue-id-select").selectOption("default").catch(() => undefined); + page.once("dialog", (dialog) => { void dialog.accept(submitQueueId); }); + await page.getByTestId("codex-create-queue-button").click(); + await page.waitForFunction((queueId) => { + const select = document.querySelector('[data-testid="code-queue-id-select"]') as HTMLSelectElement | null; + const filter = document.querySelector('[data-testid="code-queue-filter-select"]') as HTMLSelectElement | null; + return select?.value === queueId && filter?.value === queueId && Array.from(select?.options || []).some((option) => option.value === queueId); + }, submitQueueId, { timeout: 10000 }); + await page.getByTestId("code-queue-filter-select").selectOption("__all__").catch(() => undefined); + await page.getByTestId("code-queue-id-select").selectOption(submitQueueId); + taskOverviewRequests.length = 0; + const beforeSubmitView = await page.evaluate(() => { + const filter = document.querySelector('[data-testid="code-queue-filter-select"]') as HTMLSelectElement | null; + const submitQueue = document.querySelector('[data-testid="code-queue-id-select"]') as HTMLSelectElement | null; + return { + filterValue: filter?.value || "", + submitQueueValue: submitQueue?.value || "", + }; + }); await page.getByTestId("codex-max-attempts-input").fill("1"); await page.getByTestId("codex-repeat-count-input").fill("1"); await page.locator('[data-testid="code-queue-task-form"] textarea').fill(prompt); @@ -626,6 +656,10 @@ async function runCodeQueueEnqueueAwaitSmoke(page: Page): Promise { const textarea = document.querySelector('[data-testid="code-queue-task-form"] textarea') as HTMLTextAreaElement | null; const notice = document.querySelector('[data-testid="codex-create-success"]') as HTMLElement | null; const card = document.querySelector(`[data-testid="codex-task-${CSS.escape(String(id))}"]`) as HTMLElement | null; + const filter = document.querySelector('[data-testid="code-queue-filter-select"]') as HTMLSelectElement | null; + const submitQueue = document.querySelector('[data-testid="code-queue-id-select"]') as HTMLSelectElement | null; + const sessionPanel = document.querySelector('.codex-output-panel') as HTMLElement | null; + const cards = Array.from(document.querySelectorAll('[data-testid^="codex-task-codex_"]')).map((node) => (node as HTMLElement).innerText); return { formBusy: form?.getAttribute("aria-busy") === "true", waitMissing: wait === null, @@ -634,6 +668,10 @@ async function runCodeQueueEnqueueAwaitSmoke(page: Page): Promise { textareaEmpty: (textarea?.value || "") === "", noticeText: notice?.textContent || "", cardVisible: Boolean(card && card.offsetParent !== null), + filterValue: filter?.value || "", + submitQueueValue: submitQueue?.value || "", + selectedTraceHasTask: Boolean(sessionPanel && (sessionPanel.textContent || "").includes(String(id))), + allVisibleCardsRespectFilter: filter?.value === "__all__" || cards.every((text) => text.includes(`queue=${filter?.value || ""}`)), }; }, taskId); const storedTask = await page.evaluate(async (id) => { @@ -663,11 +701,15 @@ async function runCodeQueueEnqueueAwaitSmoke(page: Page): Promise { } })(); await page.getByTestId("codex-max-attempts-input").fill("99").catch(() => undefined); + const afterSubmitOverviewRequests = taskOverviewRequests.slice(); return { checked: true, marker, + submitQueueId, delayedPostCount, requestBody, + afterSubmitOverviewRequests, + postSubmitSubmitQueueOverviewRequestCount: afterSubmitOverviewRequests.filter((search) => search.includes(`queueId=${encodeURIComponent(submitQueueId)}`)).length, responseStatus: response.status(), responseOk: response.ok(), responseBody: { @@ -675,6 +717,7 @@ async function runCodeQueueEnqueueAwaitSmoke(page: Page): Promise { taskIds: Array.isArray(responseBody?.tasks) ? responseBody.tasks.map((task: any) => String(task?.id || "")).filter(Boolean) : [], }, taskId, + beforeSubmitView, duringAwait, afterAwait, storedTask: { @@ -695,6 +738,7 @@ async function runCodeQueueEnqueueAwaitSmoke(page: Page): Promise { }; } finally { await page.unroute(routePattern, routeHandler).catch(() => undefined); + page.off("request", onRequest); } } @@ -2800,6 +2844,7 @@ async function frontendCheck(config: UniDeskConfig, urls: PublicUrls, checks: E2 && codeQueueEnqueueAwaitSmoke.responseOk === true && codeQueueEnqueueAwaitSmoke.responseStatus === 202 && /^codex_\d+_[A-Za-z0-9_-]+$/u.test(String(codeQueueEnqueueAwaitSmoke.taskId || "")) + && codeQueueEnqueueAwaitSmoke.submitQueueId === "e2e-submit-smoke" && codeQueueEnqueueAwaitSmoke.duringAwait?.formBusy === true && codeQueueEnqueueAwaitSmoke.duringAwait?.waitVisible === true && codeQueueEnqueueAwaitSmoke.duringAwait?.buttonDisabled === true @@ -2807,9 +2852,17 @@ async function frontendCheck(config: UniDeskConfig, urls: PublicUrls, checks: E2 && codeQueueEnqueueAwaitSmoke.afterAwait?.formBusy === false && codeQueueEnqueueAwaitSmoke.afterAwait?.waitMissing === true && codeQueueEnqueueAwaitSmoke.afterAwait?.textareaEmpty === true + && codeQueueEnqueueAwaitSmoke.beforeSubmitView?.filterValue === "__all__" + && codeQueueEnqueueAwaitSmoke.afterAwait?.filterValue === codeQueueEnqueueAwaitSmoke.beforeSubmitView?.filterValue + && codeQueueEnqueueAwaitSmoke.afterAwait?.submitQueueValue === codeQueueEnqueueAwaitSmoke.beforeSubmitView?.submitQueueValue + && codeQueueEnqueueAwaitSmoke.beforeSubmitView?.submitQueueValue === codeQueueEnqueueAwaitSmoke.submitQueueId + && Number(codeQueueEnqueueAwaitSmoke.postSubmitSubmitQueueOverviewRequestCount || 0) === 0 + && codeQueueEnqueueAwaitSmoke.afterAwait?.cardVisible === true + && codeQueueEnqueueAwaitSmoke.afterAwait?.selectedTraceHasTask === true + && codeQueueEnqueueAwaitSmoke.afterAwait?.allVisibleCardsRespectFilter === true && codeQueueEnqueueAwaitSmoke.storedTask?.ok === true && codeQueueEnqueueAwaitSmoke.storedTask?.id === codeQueueEnqueueAwaitSmoke.taskId - && codeQueueEnqueueAwaitSmoke.storedTask?.queueId === "default" + && codeQueueEnqueueAwaitSmoke.storedTask?.queueId === codeQueueEnqueueAwaitSmoke.submitQueueId && codeQueueEnqueueAwaitSmoke.storedTask?.promptIncludesMarker === true && (codeQueueEnqueueAwaitSmoke.interrupt?.ok === true || codeQueueEnqueueAwaitSmoke.interrupt?.status === 409), { codeQueueEnqueueAwaitSmoke }); diff --git a/src/components/frontend/src/code-queue.tsx b/src/components/frontend/src/code-queue.tsx index e20dae5b..6476653b 100644 --- a/src/components/frontend/src/code-queue.tsx +++ b/src/components/frontend/src/code-queue.tsx @@ -2215,6 +2215,99 @@ export function CodeQueuePage({ microservices, onRaw, apiBaseUrl = "/api", initi }; } + function taskVisibleInQueueFilter(task: any, queueFilterId: string): boolean { + return isAllQueues(queueFilterId) || taskQueueLabel(task) === queueFilterId; + } + + function taskMatchesCurrentSearch(task: any): boolean { + const query = normalizedSearchQuery.toLowerCase(); + if (query.length === 0) return true; + const haystack = [ + task?.id, + task?.status, + task?.queueId, + task?.providerId, + task?.model, + task?.cwd, + task?.displayPrompt, + task?.basePrompt, + task?.prompt, + task?.finalResponse, + task?.lastError?.message, + ].map((value) => String(value || "").toLowerCase()).join("\n"); + return haystack.includes(query); + } + + function mergeCreatedTasksIntoCurrentView(createdTasks: any[], queuePatch: any): void { + const rows = createdTasks.filter((task) => String(task?.id || "").length > 0); + if (rows.length === 0 && !queuePatch) return; + const firstTask = rows[0] || null; + const firstId = String(firstTask?.id || ""); + const filteredRows = rows.filter((task) => taskVisibleInQueueFilter(task, selectedQueueId)); + const searchableRows = filteredRows.filter(taskMatchesCurrentSearch); + const activeSortId = String(queuePatch?.activeTaskId || activeTaskIds(queuePatch)[0] || firstId || activeTaskId || ""); + for (const task of rows) { + const taskId = String(task?.id || ""); + if (!taskId) continue; + const transcript = Array.isArray(task?.transcript) ? task.transcript : []; + sessionCacheRef.current.set(taskId, { + ...(sessionCacheRef.current.get(taskId) || {}), + task: { + ...task, + _summaryLoaded: true, + _detailLoaded: transcript.length > 0, + _transcriptComplete: false, + _transcriptPreview: false, + }, + maxSeq: transcriptMaxSeq(transcript), + complete: false, + completeUpdatedAt: "", + }); + } + setTasksData((previous: any) => { + if (!previous && (filteredRows.length === 0 || !queuePatch)) return previous; + const previousRows = taskRows(previous); + const mergedRows = applyLocalReadStateToRows(mergeTaskRowsPreferLatest([previousRows, filteredRows], activeSortId)); + return { + ...(previous || {}), + queue: queuePatch || previous?.queue, + tasks: mergedRows, + pagination: previous?.pagination ? { ...taskPagination(previous), returned: mergedRows.length } : previous?.pagination, + }; + }); + if (searchActive) { + setSearchTasksData((previous: any) => { + if (!previous || searchableRows.length === 0) return previous; + const mergedRows = applyLocalReadStateToRows(mergeTaskRowsPreferLatest([taskRows(previous), searchableRows], activeSortId)); + return { + ...previous, + queue: queuePatch || previous.queue, + tasks: mergedRows, + pagination: previous.pagination ? { ...taskPagination(previous), returned: mergedRows.length } : previous.pagination, + }; + }); + } + if (firstTask && taskVisibleInQueueFilter(firstTask, selectedQueueId) && taskMatchesCurrentSearch(firstTask)) { + detailLoadTokenRef.current += 1; + selectedIdRef.current = firstId; + setSelectedId(firstId); + setSelectedTask(sessionCacheRef.current.get(firstId)?.task || firstTask); + setSelectedDetailLoading(false); + setLoadStats({ + phase: "complete", + taskId: firstId, + queueMs: 0, + detailMs: 0, + totalMs: 0, + chunks: 1, + transcriptRows: Array.isArray(firstTask?.transcript) ? firstTask.transcript.length : 0, + partial: true, + completedAt: new Date(), + }); + } + setRefreshedAt(new Date()); + } + function changeSubmitProvider(nextProviderId: string): void { const next = String(nextProviderId || queue?.mainProviderId || "D601").trim() || "D601"; setProviderId(next); @@ -3194,15 +3287,16 @@ export function CodeQueuePage({ microservices, onRaw, apiBaseUrl = "/api", initi const firstId = result?.tasks?.[0]?.id || ""; const ids = Array.isArray(result?.tasks) ? result.tasks.map((task: any) => String(task?.id || "")).filter(Boolean) : []; const msg = `已创建 ${ids.length || submittingItems.length} 个任务${ids.length > 0 ? `:${ids.join(" / ")}` : ""}`; + mergeCreatedTasksIntoCurrentView(Array.isArray(result?.tasks) ? result.tasks : [], result?.queue || null); setNotice(msg); addNotification("success", msg); setPrompt(""); setReferenceTaskId(""); setBatchConfirmed(false); - selectedIdRef.current = firstId; - if (selectedQueueId !== submitQueueId) setTasksData(null); setQueueId(submitQueueId); - await load(firstId, true, submitQueueId); + if (firstId && taskVisibleInQueueFilter(result?.tasks?.[0], selectedQueueId) && taskMatchesCurrentSearch(result?.tasks?.[0])) { + void ensureTraceSummary(String(firstId), false).catch((err) => setError(errorText(err, "加载 Codex Trace Summary 失败"))); + } }, "Codex 任务入队失败"); enqueueInFlightRef.current = false; setSubmitting(false); From fd77f74909d6267a8720308ad56e192dce2a44cf Mon Sep 17 00:00:00 2001 From: Codex Date: Sun, 17 May 2026 01:41:19 +0000 Subject: [PATCH 02/14] Fix Code Queue WebSearch trace coalescing --- .../microservices/code-queue/src/index.ts | 28 +++- .../microservices/code-queue/src/oa-events.ts | 33 ++++- .../code-queue/src/self-tests.ts | 12 ++ .../microservices/code-queue/src/task-view.ts | 128 +++++++++++++++++- 4 files changed, 189 insertions(+), 12 deletions(-) diff --git a/src/components/microservices/code-queue/src/index.ts b/src/components/microservices/code-queue/src/index.ts index b044bae6..297dca54 100644 --- a/src/components/microservices/code-queue/src/index.ts +++ b/src/components/microservices/code-queue/src/index.ts @@ -130,8 +130,10 @@ import { } from "./oa-events"; import { configureSelfTests, runJudgeInfraSelfTest, runQueueOrderingSelfTest, runReferenceInjectionSelfTest, runTracePortSelfTest } from "./self-tests"; import { + codexToolLifecycleStartedBeforeIn, configureTaskView, formatCommandOutput, + isCodexToolLifecycleOutput, lastAssistantMessage, promptLineCount, recordNumberField, @@ -1018,6 +1020,7 @@ function outputStartsTraceStepInHistory(outputs: LiveOutput[], output: LiveOutpu if (output.channel === "user" && output.method === "enqueue") return false; if (isOpenCodeStepBoundaryMethod(output.method)) return false; if (output.channel === "system") return false; + if (codexToolLifecycleStartedBeforeIn(outputs, output)) return false; if (output.channel === "diff" || output.channel === "tool" || output.channel === "error" || output.channel === "assistant" || output.channel === "reasoning") return true; if (output.channel === "user") return true; if (output.channel !== "command") return true; @@ -1088,9 +1091,18 @@ function recordTaskOutputMetrics(task: QueueTask, output: LiveOutput, op: "set" function outputUpdatesExistingTraceStep(output: LiveOutput): boolean { if (output.channel === "assistant" || output.channel === "reasoning" || output.channel === "diff") return true; + if (isCodexToolLifecycleOutput(output) && output.method === "item/completed") return true; return false; } +function traceStepOutputForProjection(task: QueueTask, output: LiveOutput): LiveOutput { + if (!isCodexToolLifecycleOutput(output) || output.method !== "item/completed" || typeof output.itemId !== "string") return output; + const started = taskFullOutput(task) + .filter((item) => item !== output && isCodexToolLifecycleOutput(item) && item.itemId === output.itemId && item.method === "item/started") + .sort((left, right) => Number(left.seq) - Number(right.seq))[0]; + return started === undefined ? output : { ...output, seq: started.seq, at: output.at, itemId: output.itemId, rawSeqs: [started.seq, output.seq] } as LiveOutput; +} + function errorToJson(error: unknown): JsonValue { if (error instanceof Error) return { name: error.name, message: error.message, stack: error.stack ?? null }; return String(error); @@ -2298,8 +2310,9 @@ configureTaskOutput({ onOutputAppended: (task, output, op) => { const archiveOp = op === "append" ? "append" : "set"; const stepChanged = recordTaskOutputMetrics(task, output, archiveOp); - if (stepChanged) publishCodeQueueTraceStep(task, queueIdOf(task), output, taskOutputMaxSeq(task)); - else if (archiveOp === "append" && outputUpdatesExistingTraceStep(output)) publishCodeQueueTraceStep(task, queueIdOf(task), output, taskOutputMaxSeq(task), null, String(output.text || "").length); + const projectionOutput = traceStepOutputForProjection(task, output); + if (stepChanged) publishCodeQueueTraceStep(task, queueIdOf(task), projectionOutput, taskOutputMaxSeq(task)); + else if ((archiveOp === "append" || output.method === "item/completed") && outputUpdatesExistingTraceStep(output)) publishCodeQueueTraceStep(task, queueIdOf(task), projectionOutput, taskOutputMaxSeq(task), null, String(output.text || "").length); if (archiveOp === "append" && !outputCanChangeStepCount(output)) return; publishTaskOaEvent(task, "output", { onlyStepChange: archiveOp === "append", stepChanged }); }, @@ -4162,9 +4175,14 @@ async function backfillOaTraceStats(url: URL): Promise { const attemptBySeq = outputAttemptIndexMap(output); if (includeSteps) { for (const item of output) { - if (!outputStartsTraceStepInHistory(output, item)) continue; - publishCodeQueueTraceStep(task, queueId, item, outputMaxSeq, attemptBySeq.get(item.seq) ?? null); - stepEventCount += 1; + const projectionOutput = traceStepOutputForProjection(task, item); + if (outputStartsTraceStepInHistory(output, item)) { + publishCodeQueueTraceStep(task, queueId, projectionOutput, outputMaxSeq, attemptBySeq.get(item.seq) ?? null); + stepEventCount += 1; + } else if (outputUpdatesExistingTraceStep(item)) { + publishCodeQueueTraceStep(task, queueId, projectionOutput, outputMaxSeq, attemptBySeq.get(projectionOutput.seq) ?? attemptBySeq.get(item.seq) ?? null, String(item.text || "").length); + stepEventCount += 1; + } } } publishCodeQueueTraceStatsSnapshot(task, queueId, "backfill", traceStats.stepCount, outputMaxSeq, traceStats); diff --git a/src/components/microservices/code-queue/src/oa-events.ts b/src/components/microservices/code-queue/src/oa-events.ts index 0a968530..bd515b0e 100644 --- a/src/components/microservices/code-queue/src/oa-events.ts +++ b/src/components/microservices/code-queue/src/oa-events.ts @@ -198,7 +198,7 @@ function normalizeCommandText(text: string): string { function commandKind(command: string): "read" | "edit" | "run" { if (/\b(apply_patch|git apply|cat >|tee .*<<|sed -i|python3? .*write_text|write|patch|edit|delete|create)\b/iu.test(command)) return "edit"; - if (/\b(rg|grep|find|ls|cat|sed -n|tail|head|git status|git diff|ps|read|glob|search|view)\b/iu.test(command)) return "read"; + if (/\b(rg|grep|find|ls|cat|sed -n|tail|head|git status|git diff|ps|read|glob|search|view|webSearch)\b/iu.test(command)) return "read"; return "run"; } @@ -240,6 +240,7 @@ export function outputTraceKind(output: LiveOutput): "read" | "edit" | "run" | " if (output.channel === "assistant" || output.channel === "user" || output.channel === "reasoning") return "message"; if (output.channel === "tool") { const record = openCodeToolRecord(output); + if (record === null) return commandKind(normalizeCommandText(output.text)); const part = record?.part && typeof record.part === "object" && !Array.isArray(record.part) ? record.part as Record : null; const state = part?.state && typeof part.state === "object" && !Array.isArray(part.state) ? part.state as Record : null; const input = state?.input && typeof state.input === "object" && !Array.isArray(state.input) ? state.input as Record : null; @@ -388,7 +389,7 @@ export function publishCodeQueueTraceStep(task: QueueTask, queueId: string, outp title: outputTitle(output, kind), status: task.status, summaryLines: outputSummaryLines(output), - rawSeqs: [output.seq], + rawSeqs: outputRawSeqs(output), }, }); } @@ -519,6 +520,14 @@ function numberList(value: unknown, fallback: number): number[] { return values.length > 0 ? values : [fallback]; } +function outputRawSeqs(output: LiveOutput): number[] { + const rawSeqs = (output as LiveOutput & { rawSeqs?: unknown }).rawSeqs; + const values = Array.isArray(rawSeqs) + ? rawSeqs.map((item) => Number(item)).filter((item) => Number.isFinite(item)).map((item) => Math.floor(item)) + : []; + return values.length > 0 ? Array.from(new Set(values)) : [output.seq]; +} + function commandLifecycleStatus(payload: JsonRecord, title: string, summaryLines: string[]): string { const source = [title, ...summaryLines].join("\n"); const status = /\bstatus=([A-Za-z0-9_-]+)/u.exec(source)?.[1]; @@ -552,6 +561,24 @@ function traceStepFromEvent(event: unknown): OaTraceStepSummary | null { }; } +function traceStepLifecycleRank(step: OaTraceStepSummary): number { + const source = [step.title, step.status, ...step.summaryLines].join("\n"); + if (/\bitem\/completed\b|status=completed\b|\bcompleted\b/iu.test(source)) return 2; + if (/\bitem\/started\b|status=inProgress\b|\binProgress\b/iu.test(source)) return 1; + return 0; +} + +function mergeOaTraceStepSummary(existing: OaTraceStepSummary | undefined, incoming: OaTraceStepSummary): OaTraceStepSummary { + if (existing === undefined) return incoming; + const selected = traceStepLifecycleRank(incoming) >= traceStepLifecycleRank(existing) ? incoming : existing; + return { + ...existing, + ...selected, + eventSequence: Math.max(existing.eventSequence, incoming.eventSequence), + rawSeqs: Array.from(new Set([...existing.rawSeqs, ...incoming.rawSeqs])), + }; +} + function eventNextAfterSeq(body: Record, events: unknown[], fallback: number): number { const bodyNext = Number(body.nextAfterSeq); const eventNext = events.reduce((max, event) => { @@ -609,7 +636,7 @@ export async function readOaTraceStepsForTask(taskId: string, attemptIndex: numb const events = Array.isArray(body.events) ? body.events : []; for (const event of events) { const step = traceStepFromEvent(event); - if (step !== null) bySeq.set(step.seq, { ...(bySeq.get(step.seq) ?? {}), ...step }); + if (step !== null) bySeq.set(step.seq, mergeOaTraceStepSummary(bySeq.get(step.seq), step)); } const nextAfterSeq = eventNextAfterSeq(body, events, afterSeq); if (events.length < traceStepReadPageLimit || nextAfterSeq <= afterSeq) break; diff --git a/src/components/microservices/code-queue/src/self-tests.ts b/src/components/microservices/code-queue/src/self-tests.ts index 15fd566c..76c9e44e 100644 --- a/src/components/microservices/code-queue/src/self-tests.ts +++ b/src/components/microservices/code-queue/src/self-tests.ts @@ -346,6 +346,17 @@ function runTracePortSelfTest(): JsonValue { assertReferenceTest(!transcript.some((line) => line.status === "opencode/step-start" || line.status === "opencode/step-finish"), "opencode step boundaries should stay out of trace"); assertReferenceTest(!transcript.some((line) => String(line.bodyPreview || "").includes("hidden reasoning")), "reasoning-only opencode assistant text should not duplicate reasoning"); + const codexWebSearchTask = testTask("codex_5004_web_search", "codex web search prompt", "", [], "2026-05-12T00:01:00.000Z"); + codexWebSearchTask.output = [ + { seq: 30, at: "2026-05-12T00:01:00.000Z", channel: "tool", method: "item/started", itemId: "ws_trace", text: "item/started: webSearch\n" }, + { seq: 31, at: "2026-05-12T00:01:01.000Z", channel: "tool", method: "item/completed", itemId: "ws_trace", text: "item/completed: webSearch status=completed\n" }, + ]; + const webSearchTranscript = buildTaskTranscript(codexWebSearchTask, 20, 0); + const webSearchLines = webSearchTranscript.filter((line) => line.rawSeqs.includes(30) || line.rawSeqs.includes(31)); + assertReferenceTest(webSearchLines.length === 1, "codex WebSearch start/completed lifecycle should coalesce into one trace line"); + assertReferenceTest(webSearchLines[0]?.kind === "explored", "codex WebSearch should count as an explored/read trace line"); + assertReferenceTest([30, 31].every((seq) => webSearchLines[0]?.rawSeqs.includes(seq)), "codex WebSearch trace line should preserve lifecycle raw seqs"); + const codexTask = testTask("codex_5002_interleaved_command", "codex command prompt", "", [], "2026-05-12T00:02:00.000Z"); codexTask.output = [ { seq: 10, at: "2026-05-12T00:02:00.000Z", channel: "command", method: "item/started", itemId: "call_long", text: "item/started: /bin/bash -lc \"python3 - <<'PY'\\nprint('hello')\\nPY\" status=inProgress\n" }, @@ -404,6 +415,7 @@ function runTracePortSelfTest(): JsonValue { { name: "reasoning_duplicate_filtered", ok: true }, { name: "interleaved_command_output_single_trace_line", ok: true, rawSeqs: longCommand?.rawSeqs ?? [] }, { name: "interleaved_command_summary_has_command", ok: true, summaryLines: longCommand ? transcriptLineSummaryLines(longCommand) : [] }, + { name: "codex_web_search_lifecycle_coalesced", ok: true, rawSeqs: webSearchLines[0]?.rawSeqs ?? [] }, { name: "message_fragments_coalesced", ok: true, assistantRawSeqs: assistantMessages[0]?.rawSeqs ?? [], reasoningRawSeqs: reasoningMessages[0]?.rawSeqs ?? [] }, { name: "duration_preserved", ok: true, durationMs: explored?.durationMs ?? null }, { name: "remote_opencode_exec_includes_binary", ok: true }, diff --git a/src/components/microservices/code-queue/src/task-view.ts b/src/components/microservices/code-queue/src/task-view.ts index 2d552688..acb6ed50 100644 --- a/src/components/microservices/code-queue/src/task-view.ts +++ b/src/components/microservices/code-queue/src/task-view.ts @@ -50,6 +50,18 @@ export interface TaskViewContext { taskQueueEnteredAt: (task: QueueTask) => string; } +function isCodexToolLifecycleOutput(output: Pick): boolean { + if (output.channel !== "tool" || typeof output.itemId !== "string" || output.itemId.length === 0) return false; + const method = String(output.method || ""); + if (method !== "item/started" && method !== "item/completed") return false; + return /\b(?:webSearch|mcpToolCall|dynamicToolCall)\b/u.test(String(output.text || "")); +} + +function codexToolLifecycleStartedBeforeIn(outputs: Pick[], output: Pick): boolean { + if (!isCodexToolLifecycleOutput(output)) return false; + return outputs.some((item) => item !== output && isCodexToolLifecycleOutput(item) && item.itemId === output.itemId && item.method === "item/started"); +} + const judgeFailRetryLimit = 3; const transcriptCache = new Map(); const codexSessionPathCache = new Map(); @@ -783,6 +795,7 @@ function overlayTraceMessagesFromRawTranscript(oaLines: TranscriptLine[], rawLin function commandKind(command: string): TranscriptKind { + if (/\bwebSearch\b/u.test(command)) return "explored"; if (/\b(apply_patch|git apply|cat >|tee .*<<|sed -i|python3? .*write_text)\b/u.test(command)) return "edited"; if (/\b(rg|grep|find|ls|cat|sed -n|tail|head|git status|git diff|ps)\b/u.test(command)) return "explored"; return "ran"; @@ -1102,7 +1115,9 @@ function buildTaskTranscript(task: QueueTask, limit = 180, rawOutputWindow = 0, ? codexSessionFileChangesByCallId(task) : new Map(); type ActiveMessage = { seq: number; at: string; title: string; status?: string; body: string; rawSeqs: number[] }; + type ActiveCodexTool = { seq: number; at: string; text: string; status?: string; rawSeqs: number[]; itemId?: string }; let activeMessage: ActiveMessage | null = null; + const activeCodexToolsByItemId = new Map(); const flushMessage = (): void => { if (activeMessage === null) return; @@ -1123,6 +1138,21 @@ function buildTaskTranscript(task: QueueTask, limit = 180, rawOutputWindow = 0, activeMessage = { seq: item.seq, at: item.at, title, status, body, rawSeqs: [item.seq] }; }; + const parseCodexToolLifecycle = (item: LiveOutput): { status: string | undefined; text: string } => { + const status = /\bstatus=([A-Za-z0-9_-]+)/u.exec(item.text)?.[1]; + return { status, text: String(item.text || "").trimEnd() }; + }; + + const codexToolLifecycleLine = (tool: ActiveCodexTool): TranscriptLine => { + const kind = commandKind(tool.text); + return transcriptLine(kind, tool.at, tool.seq, shortCommandTitle(tool.text), tool.rawSeqs, "", tool.text, tool.status, fullText); + }; + + const flushCodexTool = (tool: ActiveCodexTool): void => { + entries.push(codexToolLifecycleLine(tool)); + if (tool.itemId !== undefined && activeCodexToolsByItemId.get(tool.itemId) === tool) activeCodexToolsByItemId.delete(tool.itemId); + }; + for (const item of outputItems) { if (initialPrompt !== null && item.channel === "user" && item.method === "enqueue") continue; if (item.channel === "user" && item.method === "turn/steer" && promptHistorySeqs.has(item.seq)) continue; @@ -1191,7 +1221,37 @@ function buildTaskTranscript(task: QueueTask, limit = 180, rawOutputWindow = 0, if (item.channel !== "assistant" && item.channel !== "reasoning") flushMessage(); flushCommand(); - if (item.channel === "diff") { + if (isCodexToolLifecycleOutput(item)) { + const parsed = parseCodexToolLifecycle(item); + const itemId = item.itemId || ""; + const existing = activeCodexToolsByItemId.get(itemId); + if (item.method === "item/started") { + if (existing !== undefined) flushCodexTool(existing); + activeCodexToolsByItemId.set(itemId, { + seq: item.seq, + at: item.at, + text: parsed.text, + status: parsed.status ?? item.method, + rawSeqs: [item.seq], + itemId, + }); + } else if (existing !== undefined) { + existing.at = item.at; + existing.status = parsed.status ?? existing.status; + existing.text = parsed.text.length > 0 ? parsed.text : existing.text; + pushUniqueRawSeq(existing.rawSeqs, item.seq); + flushCodexTool(existing); + } else { + entries.push(codexToolLifecycleLine({ + seq: item.seq, + at: item.at, + text: parsed.text, + status: parsed.status ?? item.method, + rawSeqs: [item.seq], + itemId, + })); + } + } else if (item.channel === "diff") { const text = fileChangeTextWithInlinePatch(item, fileChangeInputs); entries.push(transcriptLine("edited", item.at, item.seq, "Edited files", [item.seq], text, "", item.method, fullText)); } else if (item.channel === "error") { @@ -1221,6 +1281,9 @@ function buildTaskTranscript(task: QueueTask, limit = 180, rawOutputWindow = 0, for (const command of Array.from(activeCommandsByItemId.values()).sort((left, right) => left.seq - right.seq)) { flushCommand(command); } + for (const tool of Array.from(activeCodexToolsByItemId.values()).sort((left, right) => left.seq - right.seq)) { + flushCodexTool(tool); + } return boundedTranscript(coalesceTranscriptMessageFragments(entries), limit); } @@ -1863,12 +1926,67 @@ function oaTraceStepToTranscriptLine(step: OaTraceStepSummary): TranscriptLine { }; } +function isCodexToolLifecycleTranscriptLine(line: TranscriptLine): boolean { + const text = `${line.commandPreview ?? ""}\n${line.bodyPreview ?? ""}\n${line.title}`; + const status = String(line.status || ""); + return (line.kind === "explored" || line.kind === "ran") + && (status === "item/started" || status === "item/completed" || /^item\/(?:started|completed):/u.test(text)) + && /\b(?:webSearch|mcpToolCall|dynamicToolCall)\b/u.test(text); +} + +function mergeCodexToolLifecycleGroup(group: TranscriptLine[]): TranscriptLine { + if (group.length <= 1) return group[0]; + const first = group[0]; + const last = group.at(-1) || first; + const rawSeqs: number[] = []; + for (const line of group) { + for (const seq of Array.isArray(line.rawSeqs) ? line.rawSeqs : [line.seq]) pushUniqueRawSeq(rawSeqs, Number(seq)); + } + const command = String(last.commandPreview || first.commandPreview || last.bodyPreview || first.bodyPreview || last.title || first.title || ""); + return { + ...first, + seq: Number.isFinite(Number(last.seq)) ? Number(last.seq) : Number(first.seq), + at: last.at || first.at, + kind: commandKind(command), + title: shortCommandTitle(command) || String(last.title || first.title || "WebSearch"), + status: last.status || first.status, + commandPreview: command || undefined, + commandOmittedLines: Number(first.commandOmittedLines || 0) + Number(last.commandOmittedLines || 0) || undefined, + bodyPreview: last.bodyPreview || first.bodyPreview, + bodyOmittedLines: Number(first.bodyOmittedLines || 0) + Number(last.bodyOmittedLines || 0) || undefined, + rawSeqs, + }; +} + +function coalesceCodexToolLifecycleTranscriptLines(lines: TranscriptLine[]): TranscriptLine[] { + const rows = sortTranscript([...lines]); + const merged: TranscriptLine[] = []; + let group: TranscriptLine[] = []; + const flush = () => { + if (group.length > 0) merged.push(mergeCodexToolLifecycleGroup(group)); + group = []; + }; + for (const line of rows) { + if (isCodexToolLifecycleTranscriptLine(line)) { + const text = String(line.commandPreview || line.bodyPreview || ""); + if ((line.status === "item/started" || /^item\/started:/u.test(text)) && group.length > 0) flush(); + group.push(line); + if (line.status === "item/completed" || /^item\/completed:/u.test(text)) flush(); + continue; + } + flush(); + merged.push(line); + } + flush(); + return merged; +} + async function oaTraceTranscriptForTask(task: QueueTask, attemptIndex: number | null): Promise { const taskId = task.id; const steps = await readOaTraceStepsForTask(taskId, attemptIndex); - const oaLines = coalesceTranscriptMessageFragments(steps.map(oaTraceStepToTranscriptLine).filter(traceLineVisibleInTraceView)); - const rawLines = fullTranscript(task).filter(traceLineVisibleInTraceView); - return overlayTraceMessagesFromRawTranscript(oaLines, rawLines); + const oaLines = coalesceCodexToolLifecycleTranscriptLines(coalesceTranscriptMessageFragments(steps.map(oaTraceStepToTranscriptLine).filter(traceLineVisibleInTraceView))); + const rawLines = coalesceCodexToolLifecycleTranscriptLines(fullTranscript(task).filter(traceLineVisibleInTraceView)); + return coalesceCodexToolLifecycleTranscriptLines(overlayTraceMessagesFromRawTranscript(oaLines, rawLines)); } function mergeTraceWindowLines(left: TranscriptLine[], right: TranscriptLine[]): TranscriptLine[] { @@ -2384,6 +2502,8 @@ export { buildCompactTaskTranscript, buildTaskTranscript, cachedPreviewTranscript, + codexToolLifecycleStartedBeforeIn, + isCodexToolLifecycleOutput, formatCommandOutput, fullTranscript, lastAssistantMessage, From 857b4bc2989889956ae90ea7c1891c001b7594be Mon Sep 17 00:00:00 2001 From: Codex Date: Sun, 17 May 2026 02:31:51 +0000 Subject: [PATCH 03/14] fix code queue read marker persistence --- .../microservices/code-queue/src/index.ts | 48 +++++++++++++++---- 1 file changed, 39 insertions(+), 9 deletions(-) diff --git a/src/components/microservices/code-queue/src/index.ts b/src/components/microservices/code-queue/src/index.ts index 297dca54..fe758aa6 100644 --- a/src/components/microservices/code-queue/src/index.ts +++ b/src/components/microservices/code-queue/src/index.ts @@ -1186,7 +1186,7 @@ function updateNextSeqFromTasks(): void { } async function upsertTaskToDatabase(client: SqlExecutor, task: QueueTask): Promise { - await client` + const rows = await client>` INSERT INTO unidesk_code_queue_tasks ( id, queue_id, @@ -1269,15 +1269,32 @@ async function upsertTaskToDatabase(client: SqlExecutor, task: QueueTask): Promi updated_at = EXCLUDED.updated_at, started_at = EXCLUDED.started_at, finished_at = EXCLUDED.finished_at, - read_at = EXCLUDED.read_at, + read_at = CASE + WHEN EXCLUDED.status IN ('queued', 'running', 'judging', 'retry_wait') THEN NULL + WHEN unidesk_code_queue_tasks.read_at IS NOT NULL AND EXCLUDED.read_at IS NOT NULL THEN GREATEST(unidesk_code_queue_tasks.read_at, EXCLUDED.read_at) + WHEN unidesk_code_queue_tasks.read_at IS NOT NULL THEN unidesk_code_queue_tasks.read_at + ELSE EXCLUDED.read_at + END, last_error = EXCLUDED.last_error, last_judge = EXCLUDED.last_judge, output_count = EXCLUDED.output_count, event_count = EXCLUDED.event_count, attempt_count = EXCLUDED.attempt_count, last_output_seq = EXCLUDED.last_output_seq, - task_json = EXCLUDED.task_json + task_json = jsonb_set( + EXCLUDED.task_json, + '{readAt}', + CASE + WHEN EXCLUDED.status IN ('queued', 'running', 'judging', 'retry_wait') THEN 'null'::jsonb + WHEN unidesk_code_queue_tasks.read_at IS NOT NULL AND EXCLUDED.read_at IS NOT NULL THEN to_jsonb(GREATEST(unidesk_code_queue_tasks.read_at, EXCLUDED.read_at)) + WHEN unidesk_code_queue_tasks.read_at IS NOT NULL THEN to_jsonb(unidesk_code_queue_tasks.read_at) + ELSE COALESCE(to_jsonb(EXCLUDED.read_at), 'null'::jsonb) + END, + true + ) + RETURNING read_at `; + task.readAt = timestampToIso(rows[0]?.read_at ?? null); } async function upsertQueueToDatabase(client: SqlExecutor, queue: QueueRecord): Promise { @@ -1325,6 +1342,8 @@ async function upsertWorkdirsToDatabase(records: WorkdirRecord[]): Promise interface DatabaseTaskRow { id: string; updated_at: Date | string; + status: TaskStatus; + read_at: Date | string | null; task_json: unknown; } @@ -1343,7 +1362,13 @@ function normalizeDatabaseTaskRows(rows: DatabaseTaskRow[], source: string): Que const tasks: QueueTask[] = []; for (const row of rows) { try { - tasks.push(normalizeTask(row.task_json as QueueTask)); + const taskJson = row.task_json; + if (typeof taskJson !== "object" || taskJson === null || Array.isArray(taskJson)) throw new Error("task_json is not an object"); + tasks.push(normalizeTask({ + ...(taskJson as Record), + status: row.status, + readAt: timestampToIso(row.read_at), + } as unknown as QueueTask)); } catch (error) { logger("warn", "database_task_row_ignored", { source, id: String(row.id), error: errorToJson(error) }); } @@ -1353,11 +1378,13 @@ function normalizeDatabaseTaskRows(rows: DatabaseTaskRow[], source: string): Que async function loadPrunedDatabaseTaskRows(where: "all" | "hot"): Promise { return await sql` - SELECT id, updated_at, task_json + SELECT id, updated_at, status, read_at, task_json FROM ( SELECT id, updated_at, + status, + read_at, jsonb_set( jsonb_set( task_json, @@ -1407,8 +1434,7 @@ async function loadPrunedDatabaseTaskRows(where: "all" | "hot"): Promise id.trim()).filter(Boolean))); if (ids.length === 0) return []; const rows = await sql` - SELECT id, updated_at, task_json + SELECT id, updated_at, status, read_at, task_json FROM ( SELECT id, updated_at, + status, + read_at, task_json - 'output' - 'events' - 'attempts' - 'promptHistory' AS task_json FROM unidesk_code_queue_tasks WHERE id IN ${sql(ids)} @@ -1464,11 +1492,13 @@ async function loadTasksFromDatabaseByIds(taskIds: string[]): Promise { const rows = await sql` - SELECT id, updated_at, task_json + SELECT id, updated_at, status, read_at, task_json FROM ( SELECT id, updated_at, + status, + read_at, jsonb_set( jsonb_set( task_json, From 1cafe6da6a8e2dd117556b9e9b1802527091fded Mon Sep 17 00:00:00 2001 From: Codex Date: Sun, 17 May 2026 03:22:55 +0000 Subject: [PATCH 04/14] Add D601 Tekton CI --- AGENTS.md | 2 + TEST.md | 4 + docs/reference/ci.md | 84 +++ docs/reference/deploy.md | 6 + docs/reference/repo-tree.md | 3 + scripts/ci-code-queue-read-perf.ts | 165 +++++ scripts/cli.ts | 7 + scripts/src/ci.ts | 353 +++++++++++ scripts/tsconfig.json | 2 +- .../k3sctl-adapter/k3s/ci/tekton-install.yaml | 14 + .../k3s/ci/unidesk-ci.pipeline.yaml | 590 ++++++++++++++++++ .../k3s/ci/unidesk-ci.triggers.yaml | 80 +++ 12 files changed, 1309 insertions(+), 1 deletion(-) create mode 100644 docs/reference/ci.md create mode 100644 scripts/ci-code-queue-read-perf.ts create mode 100644 scripts/src/ci.ts create mode 100644 src/components/microservices/k3sctl-adapter/k3s/ci/tekton-install.yaml create mode 100644 src/components/microservices/k3sctl-adapter/k3s/ci/unidesk-ci.pipeline.yaml create mode 100644 src/components/microservices/k3sctl-adapter/k3s/ci/unidesk-ci.triggers.yaml diff --git a/AGENTS.md b/AGENTS.md index 5376d29e..92d519d4 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -34,6 +34,7 @@ UniDesk 是一个以主 server 为统一入口的分布式工作平台;本文 - `bun scripts/cli.ts ssh [ssh-like args...]`:通过 provider-gateway 的 Host SSH / WSL SSH 维护桥打开近似原生 ssh 的交互会话或远端命令,并在远端 PATH 注入 `apply_patch`、`glob` 与 `skill-discover`;`apply-patch`、`py`、`skills`、结构化 `find`、`glob` 和 `argv` 子命令用于避免远端补丁、Python stdin、skill 发现与常用只读命令的嵌套转义问题,使用规则见 `docs/reference/cli.md` 和 `docs/reference/provider-gateway.md`。 - `bun scripts/cli.ts microservice list/status/health/proxy`:管理和验证挂载在主 server、计算节点 Docker 或 k3s 控制面上的用户服务,OA Event Flow/Todo Note/Baidu Netdisk on main-server、k3s Control/Code Queue/MDTODO/FindJob/Pipeline/MET Nonlinear on D601 的规则见 `docs/reference/microservices.md`。 - `bun scripts/cli.ts deploy check/plan/apply [--file deploy.json] [--service ]`:按根目录 `deploy.json` 的服务 repo 和 commit 期望状态校验或更新用户服务,目标侧自行 fetch、构建、部署和 live commit 验证;规则见 `docs/reference/deploy.md`。 +- `bun scripts/cli.ts ci install/status/run/logs`:在 D601 原生 k3s 上安装和运行 Tekton CI,只做每 commit 检查和 Code Queue 只读性能门禁,不部署 CD;规则见 `docs/reference/ci.md`。 - `bun scripts/cli.ts codex deploy `:Code Queue 兼容部署入口,会生成临时 desired manifest 并调用 `deploy apply --service code-queue` 的同一条 target-side build 与 live commit 验证路径;规则见 `docs/reference/codex-deploy.md`。 - `bun scripts/cli.ts codex task `:按 Code Queue 任务 ID 查询初始 prompt、最后 assistant message、工具调用摘要、attempt/judge/error 和耗时,便于新任务引用历史 session。 - `bun scripts/cli.ts codex judge --attempt [--dry-run]`:按指定 task/attempt 用与队列 worker 相同的上下文构建和 MiniMax judge 调用路径单步复现完成判定;`--dry-run` 只输出 prompt/payload 诊断。 @@ -65,5 +66,6 @@ UniDesk 是一个以主 server 为统一入口的分布式工作平台;本文 - `docs/reference/pipeline-oa-event-flow.md`:Pipeline/OA 事件流、审核/无审核流转、单步调试、甘特图渲染和最终去残留规则。 - `docs/reference/pipeline-model-proxy.md`:Pipeline v2 model proxy 链路架构、D601 宿主 proxy 服务部署、harness token 注入规则和 smoke test 验证流程。 - `docs/reference/deploy.md`:`deploy.json` desired-state、target-side build、一次性构建 proxy、直管/代管服务部署 executor 和 live commit 验证规则。 +- `docs/reference/ci.md`:D601 k3s Tekton CI、只读主数据库性能门禁和 CLI 入口规则。 - `docs/reference/codex-deploy.md`:D601 Code Queue `codex deploy ` 异步部署管线、路径约定和验证入口。 - `reference`:兼容旧路径的符号链接,指向 `docs/reference/`。 diff --git a/TEST.md b/TEST.md index 16368630..55b7be7c 100644 --- a/TEST.md +++ b/TEST.md @@ -107,6 +107,10 @@ 随后登录公网 frontend `http://74.48.78.17:18081/`,进入 `用户服务 / Code Queue`,确认页面显示默认模型 `gpt-5.5`、默认执行 Provider `D601`、默认工作目录 `/workspace`、模型下拉菜单包含 `gpt-5.4-mini`/`gpt-5.4`/`gpt-5.5`、入队份数、队列指标、任务 ID、复制任务 ID、引用按钮、任务耗时、引用任务 ID、清空输入、创建成功提示、任务提交表单、Trace 输出、attempt 表、MiniMax/fallback judge 状态、追加 prompt、打断和重试控件;通过页面提交一个小任务,确认任务进入 queued/running/succeeded 或可解释的 failed 状态,并且输出区能看到运行中的 Codex 消息。批量验收时设置 `入队份数=5` 或用 `---` 分隔 5 段 prompt,一次性入队 5 条任务,确认 5 条任务按顺序运行并全部进入 succeeded 或可解释的非成功终态,不能只运行第一条后停止;其中任一任务被 judge 判定 `fail` 时只能把当前任务标为 failed,后续 queued 任务仍必须继续推进。测试异常中断时可以提交长任务后点击 `打断`,确认任务变为 canceled 或被 judge 标记为非成功终态;自动重试只应在服务端/传输异常、任务正常结束但 execution record 显示未完成、或 judge 判定 retry 时发生;retry 必须复用已有 Codex thread 并 append 继续执行 prompt,只有当前任务 complete 后才推进队列中的下一个任务。MiniMax judge 必须能处理 Markdown fence/夹杂文本等 JSON 去噪;若去噪后仍失败,必须把解析错误和上一轮去噪前原始回答反馈给 MiniMax 修复后重试,日志中应出现 `judge_json_parse_retry`,且 repair 成功时仍以 `source=minimax` 返回。Codex provider key 只能通过 `OPENAI_API_KEY`、`CRS_OAI_KEY` 这类运行时环境透传,MiniMax API key 只能通过 D601 env-file 运行时环境传入,禁止写入 `config.json`、Dockerfile、源码或测试文档。 +## T23A D601 k3s CI Gate + +阅读 `AGENTS.md` 和 `docs/reference/ci.md`,运行 `bun scripts/cli.ts ci install`,确认 Tekton Pipelines `v1.12.0`、Tekton Triggers `v0.34.0` 和 `unidesk-ci` Pipeline/Task/EventListener 已部署到 D601 原生 k3s;随后运行 `bun scripts/cli.ts ci run --revision <已push的commitId> --wait-ms 1200000`,确认 PipelineRun 只执行 clone/check/performance,不调用 `deploy apply` 或 `codex deploy`,并确认临时 `code-queue-ci-read` 使用主 PostgreSQL 只读查询 Code Queue 首屏、TraceView summary、TraceView steps 和 step detail 的性能指标。若失败,使用 `bun scripts/cli.ts ci logs ` 查看 TaskRun 和 Pod 日志;交付说明必须记录性能预算是否通过。 + ## T24 MET Nonlinear D601 GPU User Service 阅读 `AGENTS.md`(本项目 `AGENTS.md` 同时承担 `SKILL.md` 对 `scripts/cli.ts` 的解释职责),然后用 cli 手动测试以下内容:确认 D601 `~/met_nonlinear` 中存在 `docker-compose.unidesk.yml`、`docker/unidesk/Dockerfile.ml`、`unidesk/server/src/index.ts` 和 `docs/reference/unidesk_microservice.md`;运行 `bun scripts/cli.ts microservice list`,确认 `met-nonlinear` 显示为 `providerId=D601`、`public=false`、`frontendOnly=true`、`127.0.0.1:3288` 后端映射和 `met-nonlinear-ts` 容器摘要;运行 `bun scripts/cli.ts microservice health met-nonlinear`、`bun scripts/cli.ts microservice proxy met-nonlinear /api/queue`、`bun scripts/cli.ts microservice proxy met-nonlinear '/api/projects?root=projects&limit=500'`、`bun scripts/cli.ts microservice proxy met-nonlinear '/api/projects?root=ex_projects&limit=500'`、`bun scripts/cli.ts microservice proxy met-nonlinear '/api/projects/config?path=projects/' --raw` 和 `bun scripts/cli.ts microservice proxy met-nonlinear /api/images`,确认链路通过 backend-core、D601 provider-gateway 和 D601 本机 TS 后端,项目详情包含 `config`、`progress`、`data`、`model`、`metrics` 字段;最后登录公网 frontend `http://74.48.78.17:18081/`,进入 `用户服务 / MET Nonlinear`,确认项目库按 `projects/` 和 `ex_projects/` 文件树层级展示且文件夹 Project 数与后端返回数量一致,点击项目行能看到结构化 `config.json`、`data/` 训练状态、模型参数量和指标;通过 UI 选择已有 source Project,设置训练轮数和最大并发,使用 `Fork Project` 创建新的 `projects/unidesk_forks/` Project,确认新 Project 被自动勾选但不会直接训练,再点击 `加入待启动队列` 和 `启动队列`;完整验收可用 UI 输入 `Fork 数量=10`、`训练轮数=200`、`最大并发=3`,但这个规模只能由输入框配置,不能作为硬编码按钮。确认最多按 UI 设置的并发数运行、目标 GPU 是 2080Ti、显存余量低于 20% 时自动限制并发、任务最终进入已完成或失败诊断标签且训练容器自动销毁。页面必须以 React 控件显示项目库、待启动/排队/训练中、已完成、失败诊断、GPU/镜像、训练进度、ETA、`epoch/h` 训练速度和历史记录;项目库、当前队列、已完成和失败列表中的项目必须可点击打开详情;默认没有裸 JSON,只有点击 `查看原始JSON` 才显示原始数据;前端不得再提供 `创建10个10轮任务` 这类硬编码测试按钮。 diff --git a/docs/reference/ci.md b/docs/reference/ci.md new file mode 100644 index 00000000..1bc94110 --- /dev/null +++ b/docs/reference/ci.md @@ -0,0 +1,84 @@ +# UniDesk CI On D601 k3s + +UniDesk CI is hosted on the D601 native k3s cluster with Tekton Pipelines and Tekton Triggers. It is CI only. CD remains the existing `deploy.json` / `deploy apply` / `codex deploy ` path, and no Tekton task may roll out production services. + +## Components + +- Tekton Pipelines: `v1.12.0`. +- Tekton Triggers: `v0.34.0`. +- UniDesk CI namespace: `unidesk-ci`. +- Manifests: `src/components/microservices/k3sctl-adapter/k3s/ci/`. +- CLI entry: `bun scripts/cli.ts ci install|status|run|logs`. + +The CLI reaches D601 through the existing `k3sctl-adapter` Host SSH maintenance bridge and then runs native `KUBECONFIG=/etc/rancher/k3s/k3s.yaml kubectl ...`. It does not require backend-core to be running and does not expose a new public port. + +## Pipeline Scope + +Each commit CI run performs: + +- `git clone` and checkout of the requested repository revision. +- `bun install --frozen-lockfile` at the repo root and `src/`, because `bun scripts/cli.ts check` compiles all `src/components` and needs the component workspace lockfile for frontend React dependencies. +- `bun scripts/cli.ts check`. +- Temporary `code-queue-ci-read` Deployment and ClusterIP Service in `unidesk-ci`. +- Code Queue read performance checks against the production PostgreSQL through `d601-tcp-egress-gateway`. + +`ci install` also prewarms the D601 k3s containerd runtime with the Tekton entrypoint/workingdir helper images, `oven/bun:1-debian`, `alpine/git:2.45.2` and `unidesk-code-queue:d601`. Missing images are pulled through the node-local provider-gateway WS egress proxy and then imported into native k3s containerd with digests preserved, so PipelineRun pods do not hang on external registry pulls. + +Git clone and dependency downloads inside the repo check task use `d601-provider-egress-proxy.unidesk.svc.cluster.local:18789`; the NO_PROXY list keeps the in-cluster read service, D601 TCP egress gateway and any in-cluster CI Git mirror on the cluster network. + +Steps that call the Kubernetes API directly clear inherited proxy variables so service-account HTTPS calls to `kubernetes.default.svc` do not accidentally use the Code Queue image's Docker Compose proxy defaults. +The rollout poll reads the Deployment main resource rather than the `/status` subresource, keeping CI RBAC limited to the same app/service resources it creates and deletes. +The performance probe scans recent Code Queue tasks until it finds one with trace steps, so a newly selected task without persisted step detail does not make the whole gate fail before measuring the trace endpoints. + +The temporary Code Queue service uses: + +- `CODE_QUEUE_SERVICE_ROLE=read`. +- `CODE_QUEUE_SCHEDULER_ENABLED=false`. +- `CODE_QUEUE_STARTUP_OA_BACKFILL_ENABLED=false`. +- `CODE_QUEUE_NOTIFY_CLAUDEQQ_ENABLED=false`. +- D601 k3s `d601-provider-egress-proxy` for external/OA Event Flow fetches, with `d601-tcp-egress-gateway` and the CI read service in `NO_PROXY`. +- EmptyDir state/log mounts. + +This means the CI service can read existing tasks, Trace summaries, Trace steps and Trace step details from the main database, but it must not schedule, mutate, notify, backfill or become deployment truth. + +## Performance Gate + +The initial budgets live in `unidesk-ci/unidesk-ci-budgets`: + +- Code Queue first overview payload through the temporary read service, used as the service-side first-paint proxy: `2000ms`. +- `GET /api/tasks/{id}/trace-summary`: `700ms`. +- `GET /api/tasks/{id}/trace-steps`: `900ms`. +- `GET /api/tasks/{id}/trace-step`: `700ms`. +- `GET /api/tasks/overview` p95 over 10 samples: `900ms`. + +These are absolute budgets. Historical relative baselines can be added later by writing metrics to a dedicated CI table or object store; they should not be mixed into production task tables. + +## Commands + +Install or refresh CI: + +```bash +bun scripts/cli.ts ci install +``` + +Check status: + +```bash +bun scripts/cli.ts ci status +``` + +Run CI manually for a commit: + +```bash +bun scripts/cli.ts ci run --revision +``` + +Inspect a run: + +```bash +bun scripts/cli.ts ci logs +``` + +## Trigger Boundary + +`unidesk-ci.triggers.yaml` installs the EventListener, TriggerBinding and TriggerTemplate, but the EventListener remains a normal in-cluster Service. Do not expose it through NodePort, LoadBalancer or an unrestricted public ingress. If GitHub or another Git remote needs webhook delivery, add a UniDesk-controlled frontend/backend route with secret verification and then proxy to the EventListener; keep frontend and provider ingress as the only unrestricted public entry points. diff --git a/docs/reference/deploy.md b/docs/reference/deploy.md index a1d24695..5e8134cb 100644 --- a/docs/reference/deploy.md +++ b/docs/reference/deploy.md @@ -73,6 +73,12 @@ The reconciler selects the executor from `config.json`: Existing service-specific commands such as Code Queue deploy should converge onto this reconciler path instead of keeping a parallel implementation. +## CI Separation + +Continuous integration is intentionally separate from this deploy reconciler. D601 k3s hosts Tekton CI resources described in `docs/reference/ci.md`, but those PipelineRuns only clone, check and run read-only performance gates. They must not call `deploy apply`, `codex deploy`, `kubectl rollout restart` for production services, or mutate `deploy.json`. + +The Code Queue performance gate may create a temporary `code-queue-ci-read` service and read the main PostgreSQL through the existing `d601-tcp-egress-gateway`. Because it runs with `CODE_QUEUE_SERVICE_ROLE=read`, scheduler/backfill/notification disabled and EmptyDir state, it is not deployment truth and does not need a temporary database for the current read-only checks. + ## Version Stamping And Verification Every successful deployment must stamp the source version in the runtime: diff --git a/docs/reference/repo-tree.md b/docs/reference/repo-tree.md index 13a877a5..dc79570b 100644 --- a/docs/reference/repo-tree.md +++ b/docs/reference/repo-tree.md @@ -18,6 +18,7 @@ - command.ts (Bounded command execution helpers) - output.ts (JSON output helpers) - e2e.ts (Public frontend/provider ingress, internal core/database, and Playwright frontend E2E checks) + - ci.ts (D601 k3s Tekton CI install/status/manual-run/logs helpers; CI only, no CD) - logs/ (Generated service logs; ignored by git) - .state/ (Generated job state and compose env; ignored by git) - docs/ @@ -32,6 +33,7 @@ - provider-gateway.md (Provider connection and host SSH maintenance bridge) - observability.md (Logs and status visibility) - e2e.md (Delivery gate, Playwright frontend E2E, and database persistence checks) + - ci.md (D601 k3s Tekton CI, read-only production database performance gate, and trigger boundary) - src/ (TypeScript component monorepo) - package.json (Component workspace metadata) - bun.lock (Component dependency lockfile) @@ -88,4 +90,5 @@ - code-queue/ (Codex/OpenCode queue backend; k3s-managed when exposed through UniDesk) - oa-event-flow/ (Unified OA event ledger, tag stream, and Trace/STEP stats center) - k3sctl-adapter/ (D601 k3s control-plane adapter and managed service manifests) + - k3s/ci/ (Tekton CI install marker, Pipeline/Task, and in-cluster Trigger manifests) - example-service/ diff --git a/scripts/ci-code-queue-read-perf.ts b/scripts/ci-code-queue-read-perf.ts new file mode 100644 index 00000000..18720c08 --- /dev/null +++ b/scripts/ci-code-queue-read-perf.ts @@ -0,0 +1,165 @@ +interface TimingSample { + label: string; + method: string; + url: string; + ok: boolean; + status: number; + durationMs: number; + bytes: number; + error: string | null; +} + +export {}; + +function envNumber(name: string, fallback: number): number { + const raw = process.env[name]; + if (raw === undefined || raw.length === 0) return fallback; + const value = Number(raw); + if (!Number.isFinite(value) || value <= 0) throw new Error(`${name} must be a positive number`); + return Math.floor(value); +} + +function baseUrl(): string { + return (process.env.CI_CODE_QUEUE_URL ?? "http://code-queue-ci-read.unidesk-ci.svc.cluster.local:4222").replace(/\/+$/u, ""); +} + +async function fetchSample(label: string, url: string, timeoutMs = 30_000): Promise { + const started = performance.now(); + try { + const response = await fetch(url, { signal: AbortSignal.timeout(timeoutMs) }); + const text = await response.text(); + return { + label, + method: "GET", + url, + ok: response.ok, + status: response.status, + durationMs: Math.round((performance.now() - started) * 10) / 10, + bytes: text.length, + error: null, + }; + } catch (error) { + return { + label, + method: "GET", + url, + ok: false, + status: 0, + durationMs: Math.round((performance.now() - started) * 10) / 10, + bytes: 0, + error: error instanceof Error ? error.message : String(error), + }; + } +} + +function percentile(values: number[], percentileValue: number): number { + if (values.length === 0) return 0; + const sorted = values.slice().sort((left, right) => left - right); + if (percentileValue <= 0) return sorted[0] ?? 0; + if (percentileValue >= 100) return sorted[sorted.length - 1] ?? 0; + const index = Math.min(sorted.length - 1, Math.max(0, Math.ceil((percentileValue / 100) * sorted.length) - 1)); + return sorted[index] ?? 0; +} + +async function candidateTaskIds(url: string): Promise { + const response = await fetch(`${url}/api/tasks/overview?limit=24&transcriptLimit=0&compact=1&selected=1&includeActive=0&stats=0&skipTrace=1`, { + signal: AbortSignal.timeout(30_000), + }); + const body = await response.json() as { selected?: { task?: { id?: string } }; tasks?: Array<{ id?: string }> }; + const ids = [ + body.selected?.task?.id, + ...(body.tasks ?? []).map((task) => task.id), + ].filter((id): id is string => typeof id === "string" && id.length > 0); + return [...new Set(ids)]; +} + +async function traceSeq(url: string, taskId: string): Promise { + const response = await fetch(`${url}/api/tasks/${encodeURIComponent(taskId)}/trace-steps?tail=1&limit=8`, { + signal: AbortSignal.timeout(30_000), + }); + const body = await response.json() as { steps?: Array<{ seq?: number }> }; + const seq = body.steps?.find((step) => Number.isFinite(Number(step.seq)))?.seq; + if (!Number.isFinite(Number(seq))) return null; + return Number(seq); +} + +async function traceTarget(url: string): Promise<{ taskId: string; seq: number; skippedTaskIds: string[] }> { + const ids = await candidateTaskIds(url); + if (ids.length === 0) throw new Error("Code Queue CI perf could not find a task id in the production PostgreSQL task table"); + const skippedTaskIds: string[] = []; + for (const taskId of ids) { + const seq = await traceSeq(url, taskId); + if (seq !== null) return { taskId, seq, skippedTaskIds }; + skippedTaskIds.push(taskId); + } + throw new Error(`Code Queue CI perf could not find a task with trace steps among ${ids.length} candidates: ${skippedTaskIds.join(",")}`); +} + +async function measureFirstPaint(url: string): Promise> { + const sample = await fetchSample("code-queue-read-first-paint-proxy", `${url}/api/tasks/overview?limit=12&transcriptLimit=1&compact=1&selected=0&includeActive=0&stats=0&skipTrace=1`, 60_000); + return { + ok: sample.ok, + url: sample.url, + firstPaintMs: sample.durationMs, + apiTimings: [sample], + consoleErrors: [], + note: "Code Queue service is API-only in k3s; this measures the first overview payload used by the frontend Code Queue page.", + }; +} + +async function main(): Promise { + const url = baseUrl(); + const budgets = { + firstPaintMs: envNumber("FIRST_PAINT_BUDGET_MS", 2000), + traceSummaryMs: envNumber("TRACE_SUMMARY_BUDGET_MS", 700), + traceStepsMs: envNumber("TRACE_STEPS_BUDGET_MS", 900), + traceStepDetailMs: envNumber("TRACE_STEP_DETAIL_BUDGET_MS", 700), + overviewP95Ms: envNumber("OVERVIEW_P95_BUDGET_MS", 900), + }; + const health = await fetchSample("health", `${url}/health`); + if (!health.ok) throw new Error(`Code Queue CI read health failed: ${JSON.stringify(health)}`); + const target = await traceTarget(url); + const { taskId, seq } = target; + const firstPaint = await measureFirstPaint(url); + const traceSummary = await fetchSample("trace-summary", `${url}/api/tasks/${encodeURIComponent(taskId)}/trace-summary`); + const traceSteps = await fetchSample("trace-steps", `${url}/api/tasks/${encodeURIComponent(taskId)}/trace-steps?tail=1&limit=20`); + const traceStepDetail = await fetchSample("trace-step-detail", `${url}/api/tasks/${encodeURIComponent(taskId)}/trace-step?seq=${encodeURIComponent(String(seq))}`); + const overviewSamples: TimingSample[] = []; + for (let index = 0; index < 10; index += 1) { + overviewSamples.push(await fetchSample("overview", `${url}/api/tasks/overview?limit=12&transcriptLimit=1&compact=1&selected=0&includeActive=0&stats=0&skipTrace=1&__ci=${Date.now()}-${index}`)); + } + const overviewSuccessful = overviewSamples.filter((sample) => sample.ok).map((sample) => sample.durationMs); + const overviewP95Ms = Math.round(percentile(overviewSuccessful, 95) * 10) / 10; + const firstPaintMs = Number((firstPaint as { firstPaintMs?: number }).firstPaintMs ?? 0); + const checks = [ + { name: "first-paint", ok: firstPaintMs <= budgets.firstPaintMs, valueMs: firstPaintMs, budgetMs: budgets.firstPaintMs }, + { name: "trace-summary", ok: traceSummary.ok && traceSummary.durationMs <= budgets.traceSummaryMs, valueMs: traceSummary.durationMs, budgetMs: budgets.traceSummaryMs }, + { name: "trace-steps", ok: traceSteps.ok && traceSteps.durationMs <= budgets.traceStepsMs, valueMs: traceSteps.durationMs, budgetMs: budgets.traceStepsMs }, + { name: "trace-step-detail", ok: traceStepDetail.ok && traceStepDetail.durationMs <= budgets.traceStepDetailMs, valueMs: traceStepDetail.durationMs, budgetMs: budgets.traceStepDetailMs }, + { name: "overview-p95", ok: overviewSamples.every((sample) => sample.ok) && overviewP95Ms <= budgets.overviewP95Ms, valueMs: overviewP95Ms, budgetMs: budgets.overviewP95Ms }, + ]; + const result = { + ok: checks.every((check) => check.ok), + measuredAt: new Date().toISOString(), + url, + taskId, + seq, + skippedTaskIds: target.skippedTaskIds, + budgets, + checks, + health, + firstPaint, + traceSummary, + traceSteps, + traceStepDetail, + overview: { + p50Ms: Math.round(percentile(overviewSuccessful, 50) * 10) / 10, + p95Ms: overviewP95Ms, + samples: overviewSamples, + }, + }; + console.log(JSON.stringify(result, null, 2)); + if (!result.ok) process.exitCode = 1; +} + +await main(); diff --git a/scripts/cli.ts b/scripts/cli.ts index 664ba8af..49998d62 100644 --- a/scripts/cli.ts +++ b/scripts/cli.ts @@ -13,6 +13,7 @@ import { runCodeQueueDeployCompatCommand, runDeployCommand } from "./src/deploy" import { runProviderCommand } from "./src/provider-attach"; import { runScheduleCommand } from "./src/schedules"; import { parseNetworkPerfOptions, runNetworkPerf } from "./src/network-perf"; +import { runCiCommand } from "./src/ci"; const remoteOptions = extractRemoteCliOptions(process.argv.slice(2)); const args = remoteOptions.args; @@ -58,6 +59,7 @@ function help(): unknown { { command: "debug dispatch [providerId] [docker.ps|provider.upgrade|host.ssh|microservice.http|echo] [--wait-ms N]", description: "Submit a real internal-core dispatch request for CLI debugging." }, { command: "debug task ", description: "Read a dispatched task record from internal core for CLI debugging." }, { command: "network perf [--service code-queue --path /api/tasks/overview?limit=30 --count N --concurrency N --label before|after]", description: "Benchmark frontend -> backend-core -> provider/adapter user-service networking and report latency/proxy-mode distributions." }, + { command: "ci install|status|run|logs", description: "Manage D601 k3s Tekton CI only; does not deploy CD. CI reads the production PostgreSQL through a temporary read-only Code Queue service." }, { command: "e2e run [--only pattern[,pattern...]] [--skip pattern[,pattern...]]", description: "Run selected public/internal/Playwright E2E checks; use --only for focused iteration and rerun without filters for final regression." }, ], }; @@ -247,6 +249,11 @@ async function main(): Promise { return; } + if (top === "ci") { + emitJson(commandName, runCiCommand(config, args.slice(1))); + return; + } + if (top === "e2e" && sub === "run") { const result = await runE2E(config, parseE2ERunOptions(args.slice(2))); const ok = (result as { ok?: unknown }).ok === true; diff --git a/scripts/src/ci.ts b/scripts/src/ci.ts new file mode 100644 index 00000000..81c50e48 --- /dev/null +++ b/scripts/src/ci.ts @@ -0,0 +1,353 @@ +import { spawnSync } from "node:child_process"; +import { existsSync, readFileSync } from "node:fs"; +import { runCommand } from "./command"; +import { type UniDeskConfig, repoRoot, rootPath } from "./config"; +import { startJob } from "./jobs"; + +const k3sctlContainerName = "k3sctl-adapter"; +const k3sctlSshKey = "/run/host-ssh/id_ed25519"; +const d601SshTarget = "ubuntu@host.docker.internal"; +const d601Kubeconfig = "/etc/rancher/k3s/k3s.yaml"; +const tektonPipelineVersion = "v1.12.0"; +const tektonTriggersVersion = "v0.34.0"; +const tektonPipelineReleaseUrl = `https://infra.tekton.dev/tekton-releases/pipeline/previous/${tektonPipelineVersion}/release.yaml`; +const tektonTriggersReleaseUrl = `https://infra.tekton.dev/tekton-releases/triggers/previous/${tektonTriggersVersion}/release.yaml`; +const tektonTriggersInterceptorsUrl = `https://infra.tekton.dev/tekton-releases/triggers/previous/${tektonTriggersVersion}/interceptors.yaml`; +const providerGatewayWsEgressProxyUrl = "http://127.0.0.1:18789"; +const ciRuntimeImages = [ + "rancher/mirrored-pause:3.6", + "rancher/mirrored-library-busybox:1.36.1", + "cgr.dev/chainguard/busybox@sha256:19f02276bf8dbdd62f069b922f10c65262cc34b710eea26ff928129a736be791", + "ghcr.io/tektoncd/pipeline/entrypoint-bff0a22da108bc2f16c818c97641a296:v1.12.0", + "ghcr.io/tektoncd/pipeline/workingdirinit-0c558922ec6a1b739e550e349f2d5fc1:v1.12.0", + "ghcr.io/tektoncd/pipeline/nop-8eac7c133edad5df719dc37b36b62482:v1.12.0", + "ghcr.io/tektoncd/pipeline/events-a9042f7efb0cbade2a868a1ee5ddd52c:v1.12.0", + "ghcr.io/tektoncd/triggers/eventlistenersink-7ad1faa98cddbcb0c24990303b220bb8:v0.34.0", + "oven/bun:1-debian", + "alpine/git:2.45.2", + "unidesk-code-queue:d601", +]; + +interface CiOptions { + repoUrl: string; + revision: string; + waitMs: number; +} + +function stringOption(args: string[], name: string): string | null { + const index = args.indexOf(name); + if (index === -1) return null; + const value = args[index + 1]; + if (value === undefined || value.startsWith("--")) throw new Error(`${name} requires a value`); + return value; +} + +function numberOption(args: string[], name: string, fallback: number): number { + const raw = stringOption(args, name); + if (raw === null) return fallback; + const value = Number(raw); + if (!Number.isInteger(value) || value < 0) throw new Error(`${name} must be a non-negative integer`); + return value; +} + +function requireRevision(value: string | null): string { + if (value === null || value.length === 0) throw new Error("ci run requires --revision "); + if (!/^[A-Za-z0-9._/@:-]{1,160}$/u.test(value)) throw new Error("ci --revision contains unsupported characters"); + return value; +} + +function shellQuote(value: string): string { + return `'${value.replace(/'/gu, "'\\''")}'`; +} + +function dockerExecK3sctl(args: string[]) { + return runCommand(["docker", "exec", k3sctlContainerName, ...args], repoRoot); +} + +function dockerExecK3sctlWithInput(args: string[], input: string) { + const command = ["docker", "exec", "-i", k3sctlContainerName, ...args]; + const result = spawnSync(command[0], command.slice(1), { + cwd: repoRoot, + encoding: "utf8", + input, + maxBuffer: 1024 * 1024 * 8, + }); + return { + command, + cwd: repoRoot, + exitCode: result.status, + stdout: result.stdout ?? "", + stderr: result.stderr ?? result.error?.message ?? "", + }; +} + +function remoteKubectlCommand(script: string): string[] { + return [ + "sh", + "-lc", + [ + "ssh", + "-i", + shellQuote(k3sctlSshKey), + "-o", + "StrictHostKeyChecking=no", + "-o", + "UserKnownHostsFile=/tmp/unidesk-ci-known-hosts", + "-o", + "ConnectTimeout=10", + shellQuote(d601SshTarget), + shellQuote(`KUBECONFIG=${d601Kubeconfig} bash -lc ${shellQuote(script)}`), + ].join(" "), + ]; +} + +function runRemoteKubectl(script: string) { + const result = dockerExecK3sctl(remoteKubectlCommand(script)); + if (result.exitCode !== 0) { + throw new Error(`D601 kubectl command failed: ${result.stderr || result.stdout}`); + } + return result; +} + +function remoteApplyManifest(path: string): void { + const absolute = rootPath(path); + if (!existsSync(absolute)) throw new Error(`manifest not found: ${path}`); + const result = dockerExecK3sctlWithInput([ + "sh", + "-lc", + [ + "ssh", + "-i", + shellQuote(k3sctlSshKey), + "-o", + "StrictHostKeyChecking=no", + "-o", + "UserKnownHostsFile=/tmp/unidesk-ci-known-hosts", + "-o", + "ConnectTimeout=10", + shellQuote(d601SshTarget), + shellQuote(`KUBECONFIG=${d601Kubeconfig} kubectl apply -f -`), + ].join(" "), + ], readFileSync(absolute, "utf8")); + if (result.exitCode !== 0) { + throw new Error(`kubectl apply failed for ${path}: ${result.stderr || result.stdout}`); + } +} + +function prewarmCiRuntimeImages(): void { + const images = ciRuntimeImages.map(shellQuote).join(" "); + runRemoteKubectl([ + "set -euo pipefail", + "export DOCKER_CONFIG=/tmp/unidesk-ci-docker-config", + "mkdir -p \"$DOCKER_CONFIG\"", + "printf '{}\\n' > \"$DOCKER_CONFIG/config.json\"", + `images=(${images})`, + "for image in \"${images[@]}\"; do", + " if ! docker image inspect \"$image\" >/dev/null 2>&1; then", + " echo ci_runtime_image_pull=$image", + ` HTTP_PROXY=${shellQuote(providerGatewayWsEgressProxyUrl)} HTTPS_PROXY=${shellQuote(providerGatewayWsEgressProxyUrl)} ALL_PROXY=${shellQuote(providerGatewayWsEgressProxyUrl)} NO_PROXY=localhost,127.0.0.1,::1,host.docker.internal docker pull --platform linux/amd64 "$image"`, + " else", + " echo ci_runtime_image_cached=$image", + " fi", + "done", + "pause_entrypoint=$(docker image inspect rancher/mirrored-pause:3.6 --format '{{json .Config.Entrypoint}}' 2>/dev/null || true)", + "if ! printf '%s' \"$pause_entrypoint\" | grep -q '\"/pause\"'; then echo native_k3s_pause_image_invalid_entrypoint=$pause_entrypoint >&2; exit 1; fi", + "rm -f /tmp/unidesk-ci-runtime-images.tar", + "docker save \"${images[@]}\" -o /tmp/unidesk-ci-runtime-images.tar", + "/mnt/c/Windows/System32/wsl.exe -u root -- ctr --address /run/k3s/containerd/containerd.sock -n k8s.io images import --digests --all-platforms /tmp/unidesk-ci-runtime-images.tar >/tmp/unidesk-ci-runtime-images-import.log", + "/mnt/c/Windows/System32/wsl.exe -u root -- ctr --address /run/k3s/containerd/containerd.sock -n k8s.io images ls | grep -F 'docker.io/rancher/mirrored-pause:3.6' >/dev/null", + "/mnt/c/Windows/System32/wsl.exe -u root -- ctr --address /run/k3s/containerd/containerd.sock -n k8s.io images ls | grep -F 'docker.io/oven/bun:1-debian' >/dev/null", + "/mnt/c/Windows/System32/wsl.exe -u root -- ctr --address /run/k3s/containerd/containerd.sock -n k8s.io images ls | grep -F 'docker.io/alpine/git:2.45.2' >/dev/null", + "/mnt/c/Windows/System32/wsl.exe -u root -- ctr --address /run/k3s/containerd/containerd.sock -n k8s.io images ls | grep -F 'docker.io/library/unidesk-code-queue:d601' >/dev/null", + ].join("\n")); +} + +function status(): Record { + const summary = runRemoteKubectl([ + "set -euo pipefail", + "printf 'tekton_pipelines='", + "kubectl get deploy -n tekton-pipelines -o name 2>/dev/null | tr '\\n' ' ' || true", + "printf '\\ntekton_triggers='", + "kubectl get deploy -n tekton-pipelines-resolvers -o name 2>/dev/null | tr '\\n' ' ' || true", + "printf '\\nunidesk_ci='", + "kubectl get pipeline,task,pipelinerun,eventlistener,svc -n unidesk-ci -o name 2>/dev/null | tr '\\n' ' ' || true", + "printf '\\n'", + ].join("\n")); + return { + ok: true, + providerId: "D601", + orchestrator: "native-k3s", + tekton: { + pipelineVersion: tektonPipelineVersion, + triggersVersion: tektonTriggersVersion, + }, + summary: summary.stdout.trim(), + }; +} + +function install(): Record { + if (!existsSync(rootPath("src/components/microservices/k3sctl-adapter/k3s/ci/unidesk-ci.pipeline.yaml"))) { + throw new Error("CI manifests are missing"); + } + prewarmCiRuntimeImages(); + runRemoteKubectl([ + "set -euo pipefail", + `kubectl apply -f ${shellQuote(tektonPipelineReleaseUrl)}`, + "kubectl wait --for=condition=Available deployment --all -n tekton-pipelines --timeout=900s", + `kubectl apply -f ${shellQuote(tektonTriggersReleaseUrl)}`, + `kubectl apply -f ${shellQuote(tektonTriggersInterceptorsUrl)}`, + "kubectl wait --for=condition=Available deployment --all -n tekton-pipelines --timeout=900s", + "kubectl wait --for=condition=Available deployment --all -n tekton-pipelines-resolvers --timeout=900s", + ].join("\n")); + remoteApplyManifest("src/components/microservices/k3sctl-adapter/k3s/ci/tekton-install.yaml"); + remoteApplyManifest("src/components/microservices/k3sctl-adapter/k3s/ci/unidesk-ci.pipeline.yaml"); + remoteApplyManifest("src/components/microservices/k3sctl-adapter/k3s/ci/unidesk-ci.triggers.yaml"); + return status(); +} + +function pipelineRunManifest(options: CiOptions): string { + const safeSuffix = new Date().toISOString().replace(/[-:.TZ]/g, "").slice(0, 14).toLowerCase(); + return `apiVersion: tekton.dev/v1 +kind: PipelineRun +metadata: + generateName: unidesk-ci-${safeSuffix}- + namespace: unidesk-ci + labels: + app.kubernetes.io/name: unidesk-ci + app.kubernetes.io/part-of: unidesk + unidesk.ai/revision: ${JSON.stringify(options.revision)} +spec: + pipelineRef: + name: unidesk-ci + taskRunTemplate: + serviceAccountName: unidesk-ci-runner + params: + - name: repo-url + value: ${JSON.stringify(options.repoUrl)} + - name: revision + value: ${JSON.stringify(options.revision)} + workspaces: + - name: shared-workspace + persistentVolumeClaim: + claimName: unidesk-ci-cache +`; +} + +function remoteCreatePipelineRun(manifest: string): string { + const result = dockerExecK3sctlWithInput([ + "sh", + "-lc", + [ + "ssh", + "-i", + shellQuote(k3sctlSshKey), + "-o", + "StrictHostKeyChecking=no", + "-o", + "UserKnownHostsFile=/tmp/unidesk-ci-known-hosts", + shellQuote(d601SshTarget), + shellQuote(`KUBECONFIG=${d601Kubeconfig} kubectl create -f - -o jsonpath='{.metadata.name}'`), + ].join(" "), + ], manifest); + if (result.exitCode !== 0) throw new Error(result.stderr || result.stdout); + return result.stdout.trim(); +} + +function run(options: CiOptions): Record { + const name = remoteCreatePipelineRun(pipelineRunManifest(options)); + const wait = options.waitMs > 0 ? dockerExecK3sctl(remoteKubectlCommand([ + "set -euo pipefail", + `deadline=$((SECONDS + ${Math.ceil(options.waitMs / 1000)}))`, + "while [ \"$SECONDS\" -lt \"$deadline\" ]; do", + ` condition="$(kubectl get pipelinerun/${shellQuote(name)} -n unidesk-ci -o jsonpath='{range .status.conditions[?(@.type==\"Succeeded\")]}{.status}{\"\\t\"}{.reason}{\"\\t\"}{.message}{end}' 2>/dev/null || true)"`, + " case \"$condition\" in", + " True*)", + " echo \"$condition\"", + ` kubectl get pipelinerun/${shellQuote(name)} -n unidesk-ci -o json`, + " exit 0", + " ;;", + " False*)", + " echo \"$condition\"", + ` kubectl get pipelinerun/${shellQuote(name)} -n unidesk-ci -o json`, + " exit 1", + " ;;", + " esac", + " sleep 2", + "done", + `echo "Timed out waiting for pipelinerun/${name}" >&2`, + `kubectl get pipelinerun/${shellQuote(name)} -n unidesk-ci -o json`, + "exit 124", + ].join("\n"))) : null; + return { + ok: wait === null || wait.exitCode === 0, + pipelineRun: name, + namespace: "unidesk-ci", + repoUrl: options.repoUrl, + revision: options.revision, + wait: wait === null ? null : { + stdoutTail: wait.stdout.slice(-6000), + stderrTail: wait.stderr.slice(-6000), + }, + next: [ + `bun scripts/cli.ts ci logs ${name}`, + "bun scripts/cli.ts ci status", + ], + }; +} + +function logs(name: string): Record { + if (name.length === 0) throw new Error("ci logs requires PipelineRun name"); + const result = runRemoteKubectl([ + "set -euo pipefail", + `kubectl get pipelinerun/${shellQuote(name)} -n unidesk-ci -o wide`, + `kubectl get taskrun -n unidesk-ci -l tekton.dev/pipelineRun=${shellQuote(name)} -o wide`, + `for pod in $(kubectl get pods -n unidesk-ci -l tekton.dev/pipelineRun=${shellQuote(name)} -o name); do echo "===== $pod"; kubectl logs -n unidesk-ci "$pod" --all-containers=true --tail=160; done`, + ].join("\n")); + return { + ok: true, + pipelineRun: name, + output: result.stdout, + stderr: result.stderr, + }; +} + +function help(): Record { + return { + command: "ci install|status|run|logs", + description: "Manage the D601 k3s Tekton CI gate. This intentionally does not deploy CD.", + examples: [ + "bun scripts/cli.ts ci install", + "bun scripts/cli.ts ci run --revision ", + "bun scripts/cli.ts ci logs ", + ], + tekton: { + pipelineVersion: tektonPipelineVersion, + triggersVersion: tektonTriggersVersion, + sources: { + pipeline: tektonPipelineReleaseUrl, + triggers: tektonTriggersReleaseUrl, + interceptors: tektonTriggersInterceptorsUrl, + }, + }, + }; +} + +export function runCiCommand(_config: UniDeskConfig, args: string[]): Record { + const [action = "status", nameArg] = args; + if (action === "help" || action === "--help" || action === "-h") return help(); + if (action === "install") return install(); + if (action === "status") return status(); + if (action === "run") { + const repoUrl = stringOption(args, "--repo") ?? stringOption(args, "--repo-url") ?? "https://github.com/pikasTech/unidesk"; + const revision = requireRevision(stringOption(args, "--revision") ?? stringOption(args, "--commit")); + const waitMs = numberOption(args, "--wait-ms", 0); + return run({ repoUrl, revision, waitMs }); + } + if (action === "logs") return logs(nameArg ?? ""); + throw new Error("ci command must be one of: install, status, run, logs"); +} + +export function startCiInstallJob(): Record { + const job = startJob("ci_install", ["bun", "scripts/cli.ts", "ci", "install"], "Install/refresh Tekton CI on D601 k3s"); + return { ok: true, job }; +} diff --git a/scripts/tsconfig.json b/scripts/tsconfig.json index 37402e91..0689298a 100644 --- a/scripts/tsconfig.json +++ b/scripts/tsconfig.json @@ -9,5 +9,5 @@ "noFallthroughCasesInSwitch": true, "skipLibCheck": true }, - "include": ["cli.ts", "src/**/*.ts"] + "include": ["cli.ts", "src/**/*.ts", "../scripts/*.ts"] } diff --git a/src/components/microservices/k3sctl-adapter/k3s/ci/tekton-install.yaml b/src/components/microservices/k3sctl-adapter/k3s/ci/tekton-install.yaml new file mode 100644 index 00000000..e48f4801 --- /dev/null +++ b/src/components/microservices/k3sctl-adapter/k3s/ci/tekton-install.yaml @@ -0,0 +1,14 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: unidesk-tekton-install + namespace: unidesk + labels: + app.kubernetes.io/name: unidesk-ci + app.kubernetes.io/part-of: unidesk +data: + pipelineVersion: "v1.12.0" + triggersVersion: "v0.34.0" + pipelineReleaseUrl: "https://infra.tekton.dev/tekton-releases/pipeline/previous/v1.12.0/release.yaml" + triggersReleaseUrl: "https://infra.tekton.dev/tekton-releases/triggers/previous/v0.34.0/release.yaml" + triggersInterceptorsReleaseUrl: "https://infra.tekton.dev/tekton-releases/triggers/previous/v0.34.0/interceptors.yaml" diff --git a/src/components/microservices/k3sctl-adapter/k3s/ci/unidesk-ci.pipeline.yaml b/src/components/microservices/k3sctl-adapter/k3s/ci/unidesk-ci.pipeline.yaml new file mode 100644 index 00000000..d885efc7 --- /dev/null +++ b/src/components/microservices/k3sctl-adapter/k3s/ci/unidesk-ci.pipeline.yaml @@ -0,0 +1,590 @@ +apiVersion: v1 +kind: Namespace +metadata: + name: unidesk-ci + labels: + app.kubernetes.io/part-of: unidesk + unidesk.ai/purpose: ci +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: unidesk-ci-runner + namespace: unidesk-ci + labels: + app.kubernetes.io/name: unidesk-ci + app.kubernetes.io/part-of: unidesk +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: unidesk-ci-runner + namespace: unidesk-ci +rules: + - apiGroups: [""] + resources: ["pods", "pods/log", "services"] + verbs: ["get", "list", "watch", "create", "delete", "patch"] + - apiGroups: ["apps"] + resources: ["deployments"] + verbs: ["get", "list", "watch", "create", "delete", "patch"] + - apiGroups: ["tekton.dev"] + resources: ["pipelineruns", "taskruns"] + verbs: ["get", "list", "watch", "create", "delete", "patch"] + - apiGroups: ["triggers.tekton.dev"] + resources: ["eventlisteners", "triggers", "triggerbindings", "triggertemplates", "interceptors"] + verbs: ["get", "list", "watch"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: unidesk-ci-runner + namespace: unidesk-ci +subjects: + - kind: ServiceAccount + name: unidesk-ci-runner + namespace: unidesk-ci +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: unidesk-ci-runner +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: unidesk-ci-trigger-reader + labels: + app.kubernetes.io/name: unidesk-ci + app.kubernetes.io/part-of: unidesk +rules: + - apiGroups: ["triggers.tekton.dev"] + resources: ["clusterinterceptors", "clustertriggerbindings"] + verbs: ["get", "list", "watch"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: unidesk-ci-trigger-reader + labels: + app.kubernetes.io/name: unidesk-ci + app.kubernetes.io/part-of: unidesk +subjects: + - kind: ServiceAccount + name: unidesk-ci-runner + namespace: unidesk-ci +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: unidesk-ci-trigger-reader +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: unidesk-ci-cross-namespace + namespace: unidesk +rules: + - apiGroups: [""] + resources: ["services"] + verbs: ["get", "list"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: unidesk-ci-cross-namespace + namespace: unidesk +subjects: + - kind: ServiceAccount + name: unidesk-ci-runner + namespace: unidesk-ci +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: unidesk-ci-cross-namespace +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: unidesk-ci-cache + namespace: unidesk-ci + labels: + app.kubernetes.io/name: unidesk-ci + app.kubernetes.io/part-of: unidesk +spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 20Gi +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: unidesk-ci-budgets + namespace: unidesk-ci + labels: + app.kubernetes.io/name: unidesk-ci + app.kubernetes.io/part-of: unidesk +data: + firstPaintMs: "2000" + traceSummaryMs: "700" + traceStepsMs: "900" + traceStepDetailMs: "700" + overviewP95Ms: "900" +--- +apiVersion: tekton.dev/v1 +kind: Task +metadata: + name: unidesk-repo-check + namespace: unidesk-ci + labels: + app.kubernetes.io/name: unidesk-ci + app.kubernetes.io/component: repo-check +spec: + params: + - name: repo-url + type: string + - name: revision + type: string + - name: image + type: string + default: unidesk-code-queue:d601 + workspaces: + - name: source + volumes: + - name: docker-sock + hostPath: + path: /var/run/docker.sock + type: Socket + steps: + - name: clone + image: alpine/git:2.45.2 + env: + - name: HTTP_PROXY + value: "http://d601-provider-egress-proxy.unidesk.svc.cluster.local:18789" + - name: HTTPS_PROXY + value: "http://d601-provider-egress-proxy.unidesk.svc.cluster.local:18789" + - name: ALL_PROXY + value: "http://d601-provider-egress-proxy.unidesk.svc.cluster.local:18789" + - name: NO_PROXY + value: "localhost,127.0.0.1,::1,ci-git-mirror,ci-git-mirror.unidesk-ci,ci-git-mirror.unidesk-ci.svc,ci-git-mirror.unidesk-ci.svc.cluster.local,d601-provider-egress-proxy,d601-provider-egress-proxy.unidesk,d601-provider-egress-proxy.unidesk.svc,d601-provider-egress-proxy.unidesk.svc.cluster.local,d601-tcp-egress-gateway,d601-tcp-egress-gateway.unidesk,d601-tcp-egress-gateway.unidesk.svc,d601-tcp-egress-gateway.unidesk.svc.cluster.local,code-queue-ci-read,code-queue-ci-read.unidesk-ci,code-queue-ci-read.unidesk-ci.svc,code-queue-ci-read.unidesk-ci.svc.cluster.local" + - name: http_proxy + value: "http://d601-provider-egress-proxy.unidesk.svc.cluster.local:18789" + - name: https_proxy + value: "http://d601-provider-egress-proxy.unidesk.svc.cluster.local:18789" + - name: all_proxy + value: "http://d601-provider-egress-proxy.unidesk.svc.cluster.local:18789" + - name: no_proxy + value: "localhost,127.0.0.1,::1,ci-git-mirror,ci-git-mirror.unidesk-ci,ci-git-mirror.unidesk-ci.svc,ci-git-mirror.unidesk-ci.svc.cluster.local,d601-provider-egress-proxy,d601-provider-egress-proxy.unidesk,d601-provider-egress-proxy.unidesk.svc,d601-provider-egress-proxy.unidesk.svc.cluster.local,d601-tcp-egress-gateway,d601-tcp-egress-gateway.unidesk,d601-tcp-egress-gateway.unidesk.svc,d601-tcp-egress-gateway.unidesk.svc.cluster.local,code-queue-ci-read,code-queue-ci-read.unidesk-ci,code-queue-ci-read.unidesk-ci.svc,code-queue-ci-read.unidesk-ci.svc.cluster.local" + script: | + #!/bin/sh + set -eu + rm -rf "$(workspaces.source.path)/repo" + git clone --filter=blob:none "$(params.repo-url)" "$(workspaces.source.path)/repo" + cd "$(workspaces.source.path)/repo" + git fetch --depth=1 origin "$(params.revision)" + git checkout --detach FETCH_HEAD + git rev-parse HEAD | tee "$(workspaces.source.path)/commit.txt" + - name: install-and-check + image: "$(params.image)" + env: + - name: DOCKER_HOST + value: unix:///var/run/docker.sock + - name: BUN_INSTALL_CACHE_DIR + value: "$(workspaces.source.path)/cache/bun" + - name: HTTP_PROXY + value: "http://d601-provider-egress-proxy.unidesk.svc.cluster.local:18789" + - name: HTTPS_PROXY + value: "http://d601-provider-egress-proxy.unidesk.svc.cluster.local:18789" + - name: ALL_PROXY + value: "http://d601-provider-egress-proxy.unidesk.svc.cluster.local:18789" + - name: NO_PROXY + value: "localhost,127.0.0.1,::1,d601-provider-egress-proxy,d601-provider-egress-proxy.unidesk,d601-provider-egress-proxy.unidesk.svc,d601-provider-egress-proxy.unidesk.svc.cluster.local,d601-tcp-egress-gateway,d601-tcp-egress-gateway.unidesk,d601-tcp-egress-gateway.unidesk.svc,d601-tcp-egress-gateway.unidesk.svc.cluster.local,code-queue-ci-read,code-queue-ci-read.unidesk-ci,code-queue-ci-read.unidesk-ci.svc,code-queue-ci-read.unidesk-ci.svc.cluster.local" + - name: http_proxy + value: "http://d601-provider-egress-proxy.unidesk.svc.cluster.local:18789" + - name: https_proxy + value: "http://d601-provider-egress-proxy.unidesk.svc.cluster.local:18789" + - name: all_proxy + value: "http://d601-provider-egress-proxy.unidesk.svc.cluster.local:18789" + - name: no_proxy + value: "localhost,127.0.0.1,::1,d601-provider-egress-proxy,d601-provider-egress-proxy.unidesk,d601-provider-egress-proxy.unidesk.svc,d601-provider-egress-proxy.unidesk.svc.cluster.local,d601-tcp-egress-gateway,d601-tcp-egress-gateway.unidesk,d601-tcp-egress-gateway.unidesk.svc,d601-tcp-egress-gateway.unidesk.svc.cluster.local,code-queue-ci-read,code-queue-ci-read.unidesk-ci,code-queue-ci-read.unidesk-ci.svc,code-queue-ci-read.unidesk-ci.svc.cluster.local" + volumeMounts: + - name: docker-sock + mountPath: /var/run/docker.sock + script: | + #!/usr/bin/env bash + set -euo pipefail + cd "$(workspaces.source.path)/repo" + command -v bun + command -v git + command -v docker + docker compose version >/dev/null + bun install --frozen-lockfile + (cd src && bun install --frozen-lockfile) + bun scripts/cli.ts check +--- +apiVersion: tekton.dev/v1 +kind: Task +metadata: + name: unidesk-code-queue-read-perf + namespace: unidesk-ci + labels: + app.kubernetes.io/name: unidesk-ci + app.kubernetes.io/component: code-queue-performance +spec: + params: + - name: revision + type: string + - name: app-image + type: string + default: unidesk-code-queue:d601 + workspaces: + - name: source + steps: + - name: start-read-service + image: "$(params.app-image)" + env: + - name: HTTP_PROXY + value: "" + - name: HTTPS_PROXY + value: "" + - name: ALL_PROXY + value: "" + - name: NO_PROXY + value: "localhost,127.0.0.1,::1,kubernetes,kubernetes.default,kubernetes.default.svc,kubernetes.default.svc.cluster.local" + - name: http_proxy + value: "" + - name: https_proxy + value: "" + - name: all_proxy + value: "" + - name: no_proxy + value: "localhost,127.0.0.1,::1,kubernetes,kubernetes.default,kubernetes.default.svc,kubernetes.default.svc.cluster.local" + script: | + #!/bin/bash + set -euo pipefail + kube_api="https://${KUBERNETES_SERVICE_HOST}:${KUBERNETES_SERVICE_PORT_HTTPS}" + kube_token="$(cat /var/run/secrets/kubernetes.io/serviceaccount/token)" + kube_ca="/var/run/secrets/kubernetes.io/serviceaccount/ca.crt" + kube_namespace="$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace)" + kube() { + local method="$1" + shift + curl -fsS --cacert "$kube_ca" -H "Authorization: Bearer $kube_token" -X "$method" "$@" + } + cat >/tmp/code-queue-ci-read-deployment.yaml </dev/null + kube PATCH \ + -H "Content-Type: application/apply-patch+yaml" \ + --data-binary @/tmp/code-queue-ci-read-01 \ + "$kube_api/api/v1/namespaces/$kube_namespace/services/code-queue-ci-read?fieldManager=unidesk-ci&force=true" >/dev/null + deadline=$((SECONDS + 180)) + while [ "$SECONDS" -lt "$deadline" ]; do + status="$(kube GET "$kube_api/apis/apps/v1/namespaces/$kube_namespace/deployments/code-queue-ci-read")" + replicas="$(printf '%s' "$status" | jq -r '.spec.replicas // 1')" + available="$(printf '%s' "$status" | jq -r '.status.availableReplicas // 0')" + updated="$(printf '%s' "$status" | jq -r '.status.updatedReplicas // 0')" + observed="$(printf '%s' "$status" | jq -r '.status.observedGeneration // 0')" + generation="$(printf '%s' "$status" | jq -r '.metadata.generation // 0')" + if [ "$available" -ge "$replicas" ] && [ "$updated" -ge "$replicas" ] && [ "$observed" -ge "$generation" ]; then + echo "code_queue_ci_read_rollout=available replicas=$available generation=$generation" + exit 0 + fi + sleep 2 + done + echo "code_queue_ci_read_rollout=timeout" >&2 + kube GET "$kube_api/apis/apps/v1/namespaces/$kube_namespace/deployments/code-queue-ci-read" >&2 + exit 1 + - name: measure + image: "$(params.app-image)" + workingDir: "$(workspaces.source.path)/repo" + env: + - name: CI_CODE_QUEUE_URL + value: "http://code-queue-ci-read.unidesk-ci.svc.cluster.local:4222" + - name: HTTP_PROXY + value: "http://d601-provider-egress-proxy.unidesk.svc.cluster.local:18789" + - name: HTTPS_PROXY + value: "http://d601-provider-egress-proxy.unidesk.svc.cluster.local:18789" + - name: ALL_PROXY + value: "http://d601-provider-egress-proxy.unidesk.svc.cluster.local:18789" + - name: NO_PROXY + value: "localhost,127.0.0.1,::1,code-queue-ci-read,code-queue-ci-read.unidesk-ci,code-queue-ci-read.unidesk-ci.svc,code-queue-ci-read.unidesk-ci.svc.cluster.local,d601-tcp-egress-gateway,d601-tcp-egress-gateway.unidesk,d601-tcp-egress-gateway.unidesk.svc,d601-tcp-egress-gateway.unidesk.svc.cluster.local,d601-provider-egress-proxy,d601-provider-egress-proxy.unidesk,d601-provider-egress-proxy.unidesk.svc,d601-provider-egress-proxy.unidesk.svc.cluster.local" + - name: http_proxy + value: "http://d601-provider-egress-proxy.unidesk.svc.cluster.local:18789" + - name: https_proxy + value: "http://d601-provider-egress-proxy.unidesk.svc.cluster.local:18789" + - name: all_proxy + value: "http://d601-provider-egress-proxy.unidesk.svc.cluster.local:18789" + - name: no_proxy + value: "localhost,127.0.0.1,::1,code-queue-ci-read,code-queue-ci-read.unidesk-ci,code-queue-ci-read.unidesk-ci.svc,code-queue-ci-read.unidesk-ci.svc.cluster.local,d601-tcp-egress-gateway,d601-tcp-egress-gateway.unidesk,d601-tcp-egress-gateway.unidesk.svc,d601-tcp-egress-gateway.unidesk.svc.cluster.local,d601-provider-egress-proxy,d601-provider-egress-proxy.unidesk,d601-provider-egress-proxy.unidesk.svc,d601-provider-egress-proxy.unidesk.svc.cluster.local" + - name: FIRST_PAINT_BUDGET_MS + valueFrom: + configMapKeyRef: + name: unidesk-ci-budgets + key: firstPaintMs + - name: TRACE_SUMMARY_BUDGET_MS + valueFrom: + configMapKeyRef: + name: unidesk-ci-budgets + key: traceSummaryMs + - name: TRACE_STEPS_BUDGET_MS + valueFrom: + configMapKeyRef: + name: unidesk-ci-budgets + key: traceStepsMs + - name: TRACE_STEP_DETAIL_BUDGET_MS + valueFrom: + configMapKeyRef: + name: unidesk-ci-budgets + key: traceStepDetailMs + - name: OVERVIEW_P95_BUDGET_MS + valueFrom: + configMapKeyRef: + name: unidesk-ci-budgets + key: overviewP95Ms + script: | + #!/usr/bin/env bash + set -euo pipefail + bun scripts/ci-code-queue-read-perf.ts + - name: cleanup + image: "$(params.app-image)" + env: + - name: HTTP_PROXY + value: "" + - name: HTTPS_PROXY + value: "" + - name: ALL_PROXY + value: "" + - name: NO_PROXY + value: "localhost,127.0.0.1,::1,kubernetes,kubernetes.default,kubernetes.default.svc,kubernetes.default.svc.cluster.local" + - name: http_proxy + value: "" + - name: https_proxy + value: "" + - name: all_proxy + value: "" + - name: no_proxy + value: "localhost,127.0.0.1,::1,kubernetes,kubernetes.default,kubernetes.default.svc,kubernetes.default.svc.cluster.local" + script: | + #!/bin/bash + set -euo pipefail + kube_api="https://${KUBERNETES_SERVICE_HOST}:${KUBERNETES_SERVICE_PORT_HTTPS}" + kube_token="$(cat /var/run/secrets/kubernetes.io/serviceaccount/token)" + kube_ca="/var/run/secrets/kubernetes.io/serviceaccount/ca.crt" + kube_namespace="$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace)" + delete_resource() { + local path="$1" + local code + code="$(curl -sS -o /tmp/unidesk-ci-delete-response -w "%{http_code}" --cacert "$kube_ca" -H "Authorization: Bearer $kube_token" -X DELETE "$kube_api/$path")" + if [ "$code" = "200" ] || [ "$code" = "202" ] || [ "$code" = "404" ]; then + return 0 + fi + cat /tmp/unidesk-ci-delete-response >&2 + return 1 + } + delete_resource "apis/apps/v1/namespaces/$kube_namespace/deployments/code-queue-ci-read" + delete_resource "api/v1/namespaces/$kube_namespace/services/code-queue-ci-read" +--- +apiVersion: tekton.dev/v1 +kind: Pipeline +metadata: + name: unidesk-ci + namespace: unidesk-ci + labels: + app.kubernetes.io/name: unidesk-ci + app.kubernetes.io/part-of: unidesk +spec: + params: + - name: repo-url + type: string + default: https://github.com/pikasTech/unidesk + - name: revision + type: string + - name: check-image + type: string + default: unidesk-code-queue:d601 + - name: code-queue-image + type: string + default: unidesk-code-queue:d601 + workspaces: + - name: shared-workspace + tasks: + - name: repo-check + taskRef: + name: unidesk-repo-check + params: + - name: repo-url + value: "$(params.repo-url)" + - name: revision + value: "$(params.revision)" + - name: image + value: "$(params.check-image)" + workspaces: + - name: source + workspace: shared-workspace + - name: code-queue-read-perf + runAfter: + - repo-check + taskRef: + name: unidesk-code-queue-read-perf + params: + - name: revision + value: "$(params.revision)" + - name: app-image + value: "$(params.code-queue-image)" + workspaces: + - name: source + workspace: shared-workspace diff --git a/src/components/microservices/k3sctl-adapter/k3s/ci/unidesk-ci.triggers.yaml b/src/components/microservices/k3sctl-adapter/k3s/ci/unidesk-ci.triggers.yaml new file mode 100644 index 00000000..769573e4 --- /dev/null +++ b/src/components/microservices/k3sctl-adapter/k3s/ci/unidesk-ci.triggers.yaml @@ -0,0 +1,80 @@ +apiVersion: triggers.tekton.dev/v1beta1 +kind: EventListener +metadata: + name: unidesk-ci + namespace: unidesk-ci + labels: + app.kubernetes.io/name: unidesk-ci + app.kubernetes.io/component: triggers + app.kubernetes.io/part-of: unidesk +spec: + serviceAccountName: unidesk-ci-runner + triggers: + - name: github-push + interceptors: + - ref: + name: cel + params: + - name: filter + value: >- + body.ref.startsWith('refs/heads/') && + body.after.matches('^[0-9a-f]{40}$') + bindings: + - ref: unidesk-ci-github-push + template: + ref: unidesk-ci +--- +apiVersion: triggers.tekton.dev/v1beta1 +kind: TriggerBinding +metadata: + name: unidesk-ci-github-push + namespace: unidesk-ci + labels: + app.kubernetes.io/name: unidesk-ci + app.kubernetes.io/component: triggers + app.kubernetes.io/part-of: unidesk +spec: + params: + - name: repo-url + value: $(body.repository.clone_url) + - name: revision + value: $(body.after) +--- +apiVersion: triggers.tekton.dev/v1beta1 +kind: TriggerTemplate +metadata: + name: unidesk-ci + namespace: unidesk-ci + labels: + app.kubernetes.io/name: unidesk-ci + app.kubernetes.io/component: triggers + app.kubernetes.io/part-of: unidesk +spec: + params: + - name: repo-url + default: https://github.com/pikasTech/unidesk + - name: revision + resourcetemplates: + - apiVersion: tekton.dev/v1 + kind: PipelineRun + metadata: + generateName: unidesk-ci- + namespace: unidesk-ci + labels: + app.kubernetes.io/name: unidesk-ci + app.kubernetes.io/part-of: unidesk + unidesk.ai/revision: $(tt.params.revision) + spec: + pipelineRef: + name: unidesk-ci + taskRunTemplate: + serviceAccountName: unidesk-ci-runner + params: + - name: repo-url + value: $(tt.params.repo-url) + - name: revision + value: $(tt.params.revision) + workspaces: + - name: shared-workspace + persistentVolumeClaim: + claimName: unidesk-ci-cache From d74439ecba7849e1c7305d57a65007458d8b0671 Mon Sep 17 00:00:00 2001 From: Codex Date: Sun, 17 May 2026 06:17:17 +0000 Subject: [PATCH 05/14] feat: add decision center service --- AGENTS.md | 7 +- TEST.md | 4 + config.json | 56 ++ deploy.json | 5 + docs/reference/cli.md | 5 +- docs/reference/deploy.md | 2 + docs/reference/microservices.md | 13 + docs/reference/repo-tree.md | 2 + scripts/cli.ts | 9 + scripts/src/check.ts | 2 + scripts/src/decision-center.ts | 208 +++++++ scripts/src/e2e.ts | 124 ++++- scripts/src/remote.ts | 16 +- src/components/frontend/public/style.css | 117 +++- src/components/frontend/src/app.tsx | 3 + .../frontend/src/decision-center.tsx | 258 +++++++++ src/components/frontend/src/navigation.ts | 1 + .../microservices/decision-center/Dockerfile | 11 + .../decision-center/package.json | 12 + .../decision-center/src/index.ts | 510 ++++++++++++++++++ .../decision-center/tsconfig.json | 18 + .../k3sctl-adapter/docker-compose.d601.yml | 3 +- .../k3s/decision-center.k3s.json | 37 ++ .../k3s/decision-center.k8s.yaml | 102 ++++ .../microservices/k3sctl-adapter/src/index.ts | 2 +- src/tsconfig.base.json | 4 +- 26 files changed, 1517 insertions(+), 14 deletions(-) create mode 100644 scripts/src/decision-center.ts create mode 100644 src/components/frontend/src/decision-center.tsx create mode 100644 src/components/microservices/decision-center/Dockerfile create mode 100644 src/components/microservices/decision-center/package.json create mode 100644 src/components/microservices/decision-center/src/index.ts create mode 100644 src/components/microservices/decision-center/tsconfig.json create mode 100644 src/components/microservices/k3sctl-adapter/k3s/decision-center.k3s.json create mode 100644 src/components/microservices/k3sctl-adapter/k3s/decision-center.k8s.yaml diff --git a/AGENTS.md b/AGENTS.md index 92d519d4..17c64a48 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -33,6 +33,7 @@ UniDesk 是一个以主 server 为统一入口的分布式工作平台;本文 - `bun scripts/cli.ts provider attach [--master-server URL] [--up] [--force]`:在新增计算节点上生成两项配置的 provider-gateway 挂载包;默认只需要主 server URL(默认 `http://74.48.78.17/`)和唯一 Provider ID,生成的 Compose 固定 Docker socket、`pid: "host"`、`restart: always`、只读 `/workspace`、SSH 维护私钥挂载和 loopback egress proxy 端口,规则见 `docs/reference/provider-gateway.md`。 - `bun scripts/cli.ts ssh [ssh-like args...]`:通过 provider-gateway 的 Host SSH / WSL SSH 维护桥打开近似原生 ssh 的交互会话或远端命令,并在远端 PATH 注入 `apply_patch`、`glob` 与 `skill-discover`;`apply-patch`、`py`、`skills`、结构化 `find`、`glob` 和 `argv` 子命令用于避免远端补丁、Python stdin、skill 发现与常用只读命令的嵌套转义问题,使用规则见 `docs/reference/cli.md` 和 `docs/reference/provider-gateway.md`。 - `bun scripts/cli.ts microservice list/status/health/proxy`:管理和验证挂载在主 server、计算节点 Docker 或 k3s 控制面上的用户服务,OA Event Flow/Todo Note/Baidu Netdisk on main-server、k3s Control/Code Queue/MDTODO/FindJob/Pipeline/MET Nonlinear on D601 的规则见 `docs/reference/microservices.md`。 +- `bun scripts/cli.ts decision upload/list/show/health`:通过 backend-core 用户服务代理上传会议记录/决议 Markdown、列出记录和查看详情;Decision Center 运行在 D601 k3s,规则见 `docs/reference/microservices.md`。 - `bun scripts/cli.ts deploy check/plan/apply [--file deploy.json] [--service ]`:按根目录 `deploy.json` 的服务 repo 和 commit 期望状态校验或更新用户服务,目标侧自行 fetch、构建、部署和 live commit 验证;规则见 `docs/reference/deploy.md`。 - `bun scripts/cli.ts ci install/status/run/logs`:在 D601 原生 k3s 上安装和运行 Tekton CI,只做每 commit 检查和 Code Queue 只读性能门禁,不部署 CD;规则见 `docs/reference/ci.md`。 - `bun scripts/cli.ts codex deploy `:Code Queue 兼容部署入口,会生成临时 desired manifest 并调用 `deploy apply --service code-queue` 的同一条 target-side build 与 live commit 验证路径;规则见 `docs/reference/codex-deploy.md`。 @@ -46,8 +47,8 @@ UniDesk 是一个以主 server 为统一入口的分布式工作平台;本文 ## Runtime - `bun`:TypeScript 运行时固定使用 Bun,组件入口和 CLI 都直接运行 `.ts` 文件,约束见 `docs/reference/config.md`。 -- `docker-compose.yml`:主 server 统一编排 core、frontend、database、本机 provider gateway、Todo Note 后端、Baidu Netdisk 后端和 OA Event Flow 后端;Code Queue 和 MDTODO 由 D601 k3s/k8s 控制面代管,并经 `k3sctl-adapter` 的 Kubernetes API service proxy 单一路径接入,服务拓扑见 `docs/reference/deployment.md`。 -- `src/components/frontend`:前端源码固定使用 TypeScript + React,`app.tsx` 只做 shell/router,左侧主模块与顶部子标签统一编译为模块前缀路由:`/ops//`、`/nodes//`、`/tasks//`、`/config//`,只有用户服务使用 `/app//` 深链接,运行总览包含通用性能面板,资源监控含曲线和进程资源排序表,Todo Note、FindJob、Pipeline、MET Nonlinear、Baidu Netdisk、Code Queue、MDTODO、OA Event Flow、k3s Control 等业务页必须拆到独立 TSX 模块,界面规则见 `docs/reference/frontend.md`。 +- `docker-compose.yml`:主 server 统一编排 core、frontend、database、本机 provider gateway、Todo Note 后端、Baidu Netdisk 后端和 OA Event Flow 后端;Code Queue、MDTODO 和 Decision Center 由 D601 k3s/k8s 控制面代管,并经 `k3sctl-adapter` 的 Kubernetes API service proxy 单一路径接入,服务拓扑见 `docs/reference/deployment.md`。 +- `src/components/frontend`:前端源码固定使用 TypeScript + React,`app.tsx` 只做 shell/router,左侧主模块与顶部子标签统一编译为模块前缀路由:`/ops//`、`/nodes//`、`/tasks//`、`/config//`,只有用户服务使用 `/app//` 深链接,运行总览包含通用性能面板,资源监控含曲线和进程资源排序表,Todo Note、FindJob、Pipeline、MET Nonlinear、Baidu Netdisk、Code Queue、MDTODO、Decision Center、OA Event Flow、k3s Control 等业务页必须拆到独立 TSX 模块,界面规则见 `docs/reference/frontend.md`。 - `backend-core / frontend performance`:backend-core 暴露 `/api/performance`,frontend 暴露同源 `/api/frontend-performance` 并在 `/ops/performance/` 汇总组件请求、失败请求、内部操作和慢操作,规则见 `docs/reference/observability.md`。backend-core 源码已拆分为 15 个模块,结构见 `docs/reference/repo-tree.md`。 - `Unified OA event flow`:`oa-event-flow` 是独立主 server 用户服务,提供事件表、按 tag 订阅和 Trace/STEP 统计中心,Code Queue 与 Pipeline 都必须接入统一事件流;共享契约见 `docs/reference/oa-event-flow.md`,Pipeline 专有控制流规则见 `docs/reference/pipeline-oa-event-flow.md`。 - `src/components/provider-gateway`:当前主 server `74.48.78.17` 也作为 provider gateway 接入 UniDesk,外部节点通过 `ws://74.48.78.17:18082/ws/provider` 接入,必须以 `restart: always` 部署 always-enabled 远程升级、sleep-and-validate 回滚保护和 Host SSH / WSL SSH 透传并完成自测,部署与 Playwright 公网前端验证方法见 `docs/reference/provider-gateway.md`。 @@ -59,7 +60,7 @@ UniDesk 是一个以主 server 为统一入口的分布式工作平台;本文 - `docs/reference/arch.md`:UniDesk 分布式工作平台的长期架构约束。 - `docs/reference/repo-tree.md`:仓库结构目标与组件边界。 - `docs/reference/observability.md`:服务日志、任务活性、通用性能指标 API 和性能面板的可观测性规则。 -- `docs/reference/microservices.md`:用户服务(兼容命名 `microservice`)的配置、代理、安全边界、unidesk-direct/k3sctl-managed 部署模式、Todo Note/Baidu Netdisk on main-server、k3s Control/Code Queue/MDTODO/FindJob/Pipeline/MET Nonlinear on D601 和验证规则。 +- `docs/reference/microservices.md`:用户服务(兼容命名 `microservice`)的配置、代理、安全边界、unidesk-direct/k3sctl-managed 部署模式、Todo Note/Baidu Netdisk on main-server、k3s Control/Code Queue/MDTODO/Decision Center/FindJob/Pipeline/MET Nonlinear on D601 和验证规则。 - `docs/reference/windows-passthrough.md`:WSL provider 通过 SSH 透传调用 Windows cmd/PowerShell、Keil、COM 串口和 Windows 侧 skill 的长期规则。 - `docs/reference/constar-d601.md`:D601 上 ConStart/constar 固件工作区的 UniDesk SSH 入口、WSL skill wrapper、Keil 编译下载和串口/JSON-RPC 验证简要引导。 - `docs/reference/oa-event-flow.md`:统一 OA 事件流微服务、事件表、tag 订阅、Trace/STEP 统计中心和前端可见性规则。 diff --git a/TEST.md b/TEST.md index 55b7be7c..3e0d091d 100644 --- a/TEST.md +++ b/TEST.md @@ -111,6 +111,10 @@ 阅读 `AGENTS.md` 和 `docs/reference/ci.md`,运行 `bun scripts/cli.ts ci install`,确认 Tekton Pipelines `v1.12.0`、Tekton Triggers `v0.34.0` 和 `unidesk-ci` Pipeline/Task/EventListener 已部署到 D601 原生 k3s;随后运行 `bun scripts/cli.ts ci run --revision <已push的commitId> --wait-ms 1200000`,确认 PipelineRun 只执行 clone/check/performance,不调用 `deploy apply` 或 `codex deploy`,并确认临时 `code-queue-ci-read` 使用主 PostgreSQL 只读查询 Code Queue 首屏、TraceView summary、TraceView steps 和 step detail 的性能指标。若失败,使用 `bun scripts/cli.ts ci logs ` 查看 TaskRun 和 Pod 日志;交付说明必须记录性能预算是否通过。 +## T23B D601 Decision Center User Service + +阅读 `AGENTS.md` 和 `docs/reference/microservices.md`,运行 `bun scripts/cli.ts microservice list`,确认 `decision-center` 显示为 `providerId=D601`、`public=false`、`frontendOnly=true`、仓库 URL `https://github.com/pikasTech/unidesk`、k3s/k8s `k3s://unidesk/decision-center:4277` 逻辑服务映射、`deployment.mode=k3sctl-managed`、`runtime.orchestrator=k3sctl` 且无业务直连容器摘要;使用 `bun scripts/cli.ts deploy apply --service decision-center` 按 `deploy.json` 期望状态部署,确认 job 在 D601 target-side build、导入原生 k3s/containerd、apply `src/components/microservices/k3sctl-adapter/k3s/decision-center.k8s.yaml`、stamp deployment commit、rollout 并通过 UniDesk microservice proxy 验证 live commit。运行 `bun scripts/cli.ts microservice health decision-center`,确认 `service=decision-center`、`storage=postgres`、`schemaReady=true`;准备一份临时 Markdown 会议记录,运行 `bun scripts/cli.ts decision upload --title --type meeting --level G1 --status active --evidence <url>`,再运行 `bun scripts/cli.ts decision list` 和 `bun scripts/cli.ts decision show <id>`,确认 CLI 只通过 backend-core 用户服务代理访问,返回结构化 JSON 且能看到刚上传的记录。最后登录公网 frontend `http://74.48.78.17:18081/`,进入 `用户服务 / Decision Center`,确认页面显示 G0/G1 目标、P0/P1 Blocker、停放事项、最近会议/决议、筛选和全部记录表,刚上传的会议记录可见;页面不得提供聊天/LLM 会话窗口,默认不得裸 JSON,完整 JSON 只能通过 `查看原始JSON` 打开。 + ## T24 MET Nonlinear D601 GPU User Service 阅读 `AGENTS.md`(本项目 `AGENTS.md` 同时承担 `SKILL.md` 对 `scripts/cli.ts` 的解释职责),然后用 cli 手动测试以下内容:确认 D601 `~/met_nonlinear` 中存在 `docker-compose.unidesk.yml`、`docker/unidesk/Dockerfile.ml`、`unidesk/server/src/index.ts` 和 `docs/reference/unidesk_microservice.md`;运行 `bun scripts/cli.ts microservice list`,确认 `met-nonlinear` 显示为 `providerId=D601`、`public=false`、`frontendOnly=true`、`127.0.0.1:3288` 后端映射和 `met-nonlinear-ts` 容器摘要;运行 `bun scripts/cli.ts microservice health met-nonlinear`、`bun scripts/cli.ts microservice proxy met-nonlinear /api/queue`、`bun scripts/cli.ts microservice proxy met-nonlinear '/api/projects?root=projects&limit=500'`、`bun scripts/cli.ts microservice proxy met-nonlinear '/api/projects?root=ex_projects&limit=500'`、`bun scripts/cli.ts microservice proxy met-nonlinear '/api/projects/config?path=projects/<name>' --raw` 和 `bun scripts/cli.ts microservice proxy met-nonlinear /api/images`,确认链路通过 backend-core、D601 provider-gateway 和 D601 本机 TS 后端,项目详情包含 `config`、`progress`、`data`、`model`、`metrics` 字段;最后登录公网 frontend `http://74.48.78.17:18081/`,进入 `用户服务 / MET Nonlinear`,确认项目库按 `projects/` 和 `ex_projects/` 文件树层级展示且文件夹 Project 数与后端返回数量一致,点击项目行能看到结构化 `config.json`、`data/` 训练状态、模型参数量和指标;通过 UI 选择已有 source Project,设置训练轮数和最大并发,使用 `Fork Project` 创建新的 `projects/unidesk_forks/` Project,确认新 Project 被自动勾选但不会直接训练,再点击 `加入待启动队列` 和 `启动队列`;完整验收可用 UI 输入 `Fork 数量=10`、`训练轮数=200`、`最大并发=3`,但这个规模只能由输入框配置,不能作为硬编码按钮。确认最多按 UI 设置的并发数运行、目标 GPU 是 2080Ti、显存余量低于 20% 时自动限制并发、任务最终进入已完成或失败诊断标签且训练容器自动销毁。页面必须以 React 控件显示项目库、待启动/排队/训练中、已完成、失败诊断、GPU/镜像、训练进度、ETA、`epoch/h` 训练速度和历史记录;项目库、当前队列、已完成和失败列表中的项目必须可点击打开详情;默认没有裸 JSON,只有点击 `查看原始JSON` 才显示原始数据;前端不得再提供 `创建10个10轮任务` 这类硬编码测试按钮。 diff --git a/config.json b/config.json index a7e0925e..f3ed2b81 100644 --- a/config.json +++ b/config.json @@ -699,6 +699,62 @@ ], "activeNodeId": "D601" } + }, + { + "id": "decision-center", + "name": "Decision Center", + "providerId": "D601", + "description": "Decision Center 是由 D601 k3s 控制面代管的决策权威记录服务,用于沉淀会议记录、决议、目标、问题分级、停放事项和证据;参谋对话仍使用 Codex 原生会话。", + "repository": { + "url": "https://github.com/pikasTech/unidesk", + "commitId": "eb2660d3b777e01291506c418352ab6cfa4eca35", + "dockerfile": "src/components/microservices/decision-center/Dockerfile", + "composeFile": "src/components/microservices/k3sctl-adapter/k3s/decision-center.k3s.json", + "composeService": "decision-center", + "containerName": "k3s:decision-center" + }, + "backend": { + "nodeBaseUrl": "k3s://decision-center", + "nodeBindHost": "k3s://unidesk/decision-center", + "nodePort": 4277, + "proxyMode": "k3sctl-adapter-http", + "frontendOnly": true, + "public": false, + "allowedMethods": [ + "GET", + "HEAD", + "POST", + "PUT", + "DELETE" + ], + "allowedPathPrefixes": [ + "/health", + "/live", + "/logs", + "/api/" + ], + "healthPath": "/health", + "timeoutMs": 30000 + }, + "development": { + "providerId": "D601", + "sshPassthrough": true, + "worktreePath": "/home/ubuntu/cq-deploy/src/components/microservices/decision-center" + }, + "frontend": { + "route": "/apps/decision-center", + "integrated": true + }, + "deployment": { + "mode": "k3sctl-managed", + "adapterServiceId": "k3sctl-adapter", + "k3sServiceId": "decision-center", + "namespace": "unidesk", + "expectedNodeIds": [ + "D601" + ], + "activeNodeId": "D601" + } } ], "paths": { diff --git a/deploy.json b/deploy.json index 66e8a14f..b0a505c3 100644 --- a/deploy.json +++ b/deploy.json @@ -55,6 +55,11 @@ "id": "mdtodo", "repo": "https://github.com/pikasTech/unidesk", "commitId": "75fb6757b2504ba86d61f2587fb34a9c9ed4019a" + }, + { + "id": "decision-center", + "repo": "https://github.com/pikasTech/unidesk", + "commitId": "eb2660d3b777e01291506c418352ab6cfa4eca35" } ] } diff --git a/docs/reference/cli.md b/docs/reference/cli.md index 94d1d5fd..2195a373 100644 --- a/docs/reference/cli.md +++ b/docs/reference/cli.md @@ -19,6 +19,7 @@ UniDesk 的统一 CLI 入口是根目录 `scripts/cli.ts`,运行方式固定 - `ssh <providerId> py [script-args...] < script.py` 把本地 stdin 落到远端临时 `.py` 文件后再以 `python3 -u` 执行并自动清理,避免再手写 `'python3 -'`、heredoc 或多层引号;`script-args` 会按 argv 安全透传给远端脚本。 - `ssh <providerId> skills [--scope all|wsl|windows] [--limit N]` 发现目标节点上的 WSL/Linux skill 根目录;当 provider 是 WSL 时同一次调用还会扫描 Windows 用户目录下的 `.agents/skills` 与 `.codex/skills`。 - `microservice list/status/health/proxy` 通过 backend-core 内网 API 管理挂载在计算节点 Docker 中的用户服务(底层命令名仍为 microservice);`health` 和 `proxy` 会走真实 backend-core -> provider-gateway -> 节点本机后端链路,`proxy` 对超大 body 默认输出有界预览,规则见 `docs/reference/microservices.md`。 +- `decision upload/list/show/health` 通过 backend-core 用户服务代理访问 D601 k3s Decision Center,用于上传会议记录/决议 Markdown、列出权威记录、查看详情和健康检查;它不得直连 D601 Service、NodePort 或 provider-gateway 业务 HTTP。 - `deploy check/plan/apply` 从根目录 `deploy.json` 读取服务 repo 与 commit 期望状态,join `config.json` 和现有 manifest 后使用 target-side build 单一路径校验或更新直管服务与 k3s 代管服务;规则见 `docs/reference/deploy.md`。 - `codex deploy <commitId>` 是 Code Queue 兼容部署入口,会生成临时 desired manifest 并调用 `deploy apply --service code-queue` 的同一条 target-side build、k3s import、rollout 和 live commit 验证路径;详细规则见 `docs/reference/codex-deploy.md`。 - `codex task <taskId>` 通过 Code Queue 私有代理按任务 ID 查询结构化执行摘要;默认只返回有界 prompt/response 预览、执行 Provider、工作目录、最后 assistant message、最近工具调用摘要、attempt、judge、错误、耗时和 trace 翻页提示,适合在新队列任务中引用历史 session 且避免噪声爆炸。 @@ -34,7 +35,7 @@ UniDesk 的统一 CLI 入口是根目录 `scripts/cli.ts`,运行方式固定 长时操作采用 Fire-and-Forget 模式:CLI 创建 `.state/jobs/{jobId}.json`,后台进程执行真实命令,并将 stdout、stderr 分别写入 `.state/jobs/{jobId}.stdout.log` 与 `.state/jobs/{jobId}.stderr.log`。调用者通过 `bun scripts/cli.ts job status <jobId>` 查询进度和尾部输出。 -`server rebuild` 与 `server start`、`server stop` 一样必须通过返回的 job id 确认结果;不要把连续 `server rebuild` 命令理解成“前一个重建已完成”,因为两个命令只是在快速创建异步 job。重建 frontend 的标准流程是运行 `bun scripts/cli.ts server rebuild frontend`,随后轮询 `bun scripts/cli.ts job status <jobId>` 到 `succeeded`,再用 `server status` 或 `e2e run` 验证公网 frontend;重建 Todo Note 后端使用 `bun scripts/cli.ts server rebuild todo-note`,随后用 `microservice health todo-note` 和 `microservice proxy todo-note /api/instances` 验证;重建 Project Manager 后端使用 `bun scripts/cli.ts server rebuild project-manager`,随后用 `microservice health project-manager` 和 `microservice proxy project-manager /api/projects` 验证;重建 Baidu Netdisk 后端使用 `bun scripts/cli.ts server rebuild baidu-netdisk`,随后用 `microservice health baidu-netdisk` 和 `microservice proxy baidu-netdisk /api/transfers` 验证;重建 OA Event Flow 后端使用 `bun scripts/cli.ts server rebuild oa-event-flow`,随后用 `microservice health oa-event-flow` 和 `microservice proxy oa-event-flow /api/diagnostics` 验证。Code Queue 后端由 D601 k3s/k8s 控制面代管,必须使用 `bun scripts/cli.ts deploy apply --service code-queue` 或兼容入口 `bun scripts/cli.ts codex deploy <commitId>` 部署已 push 的 remote commit;部署 job 自身必须通过真实 `/health` 和 k3s Deployment annotation 证明不是旧服务在充数,之后再用 `microservice health code-queue` 和 `microservice proxy code-queue /api/tasks/overview` 做人工复核。不得把 `docker rm` 手工兜底当成正式交付步骤。 +`server rebuild` 与 `server start`、`server stop` 一样必须通过返回的 job id 确认结果;不要把连续 `server rebuild` 命令理解成“前一个重建已完成”,因为两个命令只是在快速创建异步 job。重建 frontend 的标准流程是运行 `bun scripts/cli.ts server rebuild frontend`,随后轮询 `bun scripts/cli.ts job status <jobId>` 到 `succeeded`,再用 `server status` 或 `e2e run` 验证公网 frontend;重建 Todo Note 后端使用 `bun scripts/cli.ts server rebuild todo-note`,随后用 `microservice health todo-note` 和 `microservice proxy todo-note /api/instances` 验证;重建 Project Manager 后端使用 `bun scripts/cli.ts server rebuild project-manager`,随后用 `microservice health project-manager` 和 `microservice proxy project-manager /api/projects` 验证;重建 Baidu Netdisk 后端使用 `bun scripts/cli.ts server rebuild baidu-netdisk`,随后用 `microservice health baidu-netdisk` 和 `microservice proxy baidu-netdisk /api/transfers` 验证;重建 OA Event Flow 后端使用 `bun scripts/cli.ts server rebuild oa-event-flow`,随后用 `microservice health oa-event-flow` 和 `microservice proxy oa-event-flow /api/diagnostics` 验证。Code Queue 和 Decision Center 后端由 D601 k3s/k8s 控制面代管,必须使用 `bun scripts/cli.ts deploy apply --service code-queue`、`bun scripts/cli.ts deploy apply --service decision-center` 或 Code Queue 兼容入口 `bun scripts/cli.ts codex deploy <commitId>` 部署已 push 的 remote commit;部署 job 自身必须通过真实 `/health` 和 k3s Deployment annotation 证明不是旧服务在充数,之后再用 `microservice health <service>` 和对应私有代理 API 做人工复核。不得把 `docker rm` 手工兜底当成正式交付步骤。 新部署入口优先使用 `deploy apply`。旧的 `server rebuild` 和 `codex deploy` 只保留为兼容入口,后续实现应收敛到同一个 reconciler:从 remote commit 导出源码,在目标节点一次性代理构建镜像,部署后用 live commit 校验证明不是旧服务。 @@ -112,7 +113,7 @@ bun scripts/cli.ts ssh D601 glob --root /home/ubuntu/pikapython --pattern '**/*- `--main-server-ip` 是一个全局前缀,必须放在需要透传的命令同一次调用中,例如 `bun scripts/cli.ts --main-server-ip 74.48.78.17 debug health`。默认传输是公网 frontend:本地 CLI 读取本仓库 `config.json` 中的 frontend 登录账号密码,登录 `http://<ip>:<frontendPort>/` 获取 HttpOnly session cookie,然后通过 frontend 的 `/api/*` 同源代理访问 backend-core 内网 API;因此计算节点只需要能访问公网 frontend,不需要主 server SSH key,也不需要打开 backend-core REST API 或 PostgreSQL 端口。 -默认 frontend 传输支持 `debug health`、`debug dispatch`、`debug task`、`microservice list/status/health/proxy`、`codex task <taskId>`、`codex output <taskId>`、`codex judge <taskId> --attempt N` 和 `ssh <PROVIDER_ID> <remote-command>`。其中 `ssh` 的 remote frontend 传输使用 `host.ssh` dispatch 执行有界远端命令,适合 `ssh D601 hostname` 和 `ssh D601 skills` 这类自测;交互式登录 shell 仍应在主 server 本机 CLI 使用,或显式切换到旧 SSH 传输后在主 server 上执行。frontend 远程透传不会流式转发本地 stdin,因此 `ssh py < script.py`、`ssh apply-patch < patch.diff` 这类 stdin-backed helper 必须在主 server 本机运行,或显式切换到 `--main-server-transport ssh`。若确实需要旧行为,可使用 `--main-server-key <key>` 或 `--main-server-transport ssh`,这时 CLI 会通过 SSH 登录主 server 的 `--main-server-root` 目录执行同一个 `bun scripts/cli.ts <command>`。 +默认 frontend 传输支持 `debug health`、`debug dispatch`、`debug task`、`microservice list/status/health/proxy`、`decision upload/list/show/health`、`codex task <taskId>`、`codex output <taskId>`、`codex judge <taskId> --attempt N` 和 `ssh <PROVIDER_ID> <remote-command>`。其中 `ssh` 的 remote frontend 传输使用 `host.ssh` dispatch 执行有界远端命令,适合 `ssh D601 hostname` 和 `ssh D601 skills` 这类自测;交互式登录 shell 仍应在主 server 本机 CLI 使用,或显式切换到旧 SSH 传输后在主 server 上执行。frontend 远程透传不会流式转发本地 stdin,因此 `ssh py < script.py`、`ssh apply-patch < patch.diff` 这类 stdin-backed helper 必须在主 server 本机运行,或显式切换到 `--main-server-transport ssh`。若确实需要旧行为,可使用 `--main-server-key <key>` 或 `--main-server-transport ssh`,这时 CLI 会通过 SSH 登录主 server 的 `--main-server-root` 目录执行同一个 `bun scripts/cli.ts <command>`。 计算节点可以用该入口测试自身的远程升级闭环,而不需要在计算节点公开 core REST API 或 database。标准顺序是:先运行 `bun scripts/cli.ts --main-server-ip 74.48.78.17 debug health` 确认主 server 看到当前 Provider 在线,且该 Provider labels 中 `unideskCapabilities` 包含 `host.ssh`、`hostSshConfigured=true`、`hostSshKeyPresent=true`;再运行 `bun scripts/cli.ts --main-server-ip 74.48.78.17 debug dispatch <PROVIDER_ID> provider.upgrade --mode schedule --wait-ms 15000` 触发真实 `provider.upgrade`;随后再次运行 `debug health` 确认节点重新上线;最后运行 `bun scripts/cli.ts --main-server-ip 74.48.78.17 debug dispatch <PROVIDER_ID> host.ssh --wait-ms 15000` 和 `bun scripts/cli.ts --main-server-ip 74.48.78.17 ssh <PROVIDER_ID> hostname` 验证 SSH 透传能力。provider-gateway 新部署或升级后没有完成这组 remote CLI 自测,不能视为交付完成。 diff --git a/docs/reference/deploy.md b/docs/reference/deploy.md index 5e8134cb..adfc37da 100644 --- a/docs/reference/deploy.md +++ b/docs/reference/deploy.md @@ -73,6 +73,8 @@ The reconciler selects the executor from `config.json`: Existing service-specific commands such as Code Queue deploy should converge onto this reconciler path instead of keeping a parallel implementation. +Decision Center is a standard `k3sctl-managed` service in this model. `deploy apply --service decision-center` must build `src/components/microservices/decision-center/Dockerfile` on D601, import `unidesk-decision-center:d601` into native k3s containerd, apply `src/components/microservices/k3sctl-adapter/k3s/decision-center.k8s.yaml`, stamp the Deployment, and verify health through `/api/microservices/decision-center/health`. It must not add a main-server Compose service, NodePort, hostPort, or provider-gateway direct HTTP backend for Decision Center. + ## CI Separation Continuous integration is intentionally separate from this deploy reconciler. D601 k3s hosts Tekton CI resources described in `docs/reference/ci.md`, but those PipelineRuns only clone, check and run read-only performance gates. They must not call `deploy apply`, `codex deploy`, `kubectl rollout restart` for production services, or mutate `deploy.json`. diff --git a/docs/reference/microservices.md b/docs/reference/microservices.md index 51bf3a6c..138f11fb 100644 --- a/docs/reference/microservices.md +++ b/docs/reference/microservices.md @@ -193,6 +193,18 @@ Baidu Netdisk 在 UniDesk 语境中按纯后端服务管理:不得暴露百度 - 代理/API:只允许 `/health`、`/logs` 和 `/api/` 前缀;允许方法为 `GET`、`HEAD`、`POST` 和 `DELETE`。`POST /api/push/text` 接受 `userId` 或 `groupId` 与 `message`,由 ClaudeQQ 通过同 Pod NapCat HTTP API 发送 QQ 消息;NapCat 不可用时必须快速返回 `status=napcat_offline` 或可解释错误。 - UniDesk 前端:`用户服务 / ClaudeQQ` React 页面负责展示 D601、仓库引用、私有 k3s 后端映射、NapCat 登录二维码、NapCat HTTP/WS 状态、事件缓存、订阅表、订阅创建表单、消息推送表单、主用户私聊账号 `645275593`、最近 QQ 事件和已发送记录;完整原始 JSON 只能通过显式 `查看原始JSON` 打开。浏览器只能通过 UniDesk frontend 同源代理访问 ClaudeQQ,不得直接访问 D601 `3290/3000/3001/6099`,也不得 iframe ClaudeQQ 旧 WebUI。 +### Decision Center k3s-Managed + +当前 Decision Center 作为 `id=decision-center` 的 `k3sctl-managed` 用户服务登记在 `config.json`,用于沉淀 Codex/人工会议后的会议记录、决议、目标、问题分级、停放事项和证据。它只负责权威记录和展示,不承载通用聊天、LLM 会话窗口或自动参谋对话。 + +- Orchestrator:`deployment.mode=k3sctl-managed`,`deployment.adapterServiceId=k3sctl-adapter`,`deployment.k3sServiceId=decision-center`,`backend.proxyMode=k3sctl-adapter-http`,`backend.nodeBaseUrl=k3s://decision-center`;正式链路只能是 `frontend/CLI -> backend-core -> k3sctl-adapter -> Kubernetes API service proxy -> Kubernetes Service decision-center:4277`。 +- 部署引用:后端源码位于 UniDesk 仓库 `src/components/microservices/decision-center`,Dockerfile 为 `src/components/microservices/decision-center/Dockerfile`;k3s manifest 为 `src/components/microservices/k3sctl-adapter/k3s/decision-center.k3s.json`,Kubernetes 运行清单为 `src/components/microservices/k3sctl-adapter/k3s/decision-center.k8s.yaml`,镜像名固定为 `unidesk-decision-center:d601`。主 server `docker-compose.yml` 不得加入该服务,也不得公开 `4277`。 +- 状态权威:Decision Center 必须写入主 PostgreSQL,当前表为 `decision_center_records`;不得使用浏览器 `localStorage`、IndexedDB、容器 writable layer 或本地 JSON 文件作为会议、决议、目标或问题状态权威。D601 Pod 通过集群内 `d601-tcp-egress-gateway.unidesk.svc.cluster.local:15432` 访问主 PostgreSQL。 +- 数据模型:第一版记录类型为 `meeting|decision|goal|blocker|debt|experiment`,等级为 `G0|G1|G2|G3|P0|P1|P2|P3|none`,状态为 `active|blocked|parked|done`,字段包含 `title`、Markdown `summary/body`、`linkedGoalId`、`tags`、`evidenceLinks`、`sourceSession`、`taskId`、`commitId`、`createdAt` 和 `updatedAt`。 +- API:只允许 `/health`、`/live`、`/logs` 和 `/api/` 前缀;允许 `GET`、`HEAD`、`POST`、`PUT` 和 `DELETE`。业务 API 包含 `GET /api/records`、`POST /api/records`、`GET|PUT|DELETE /api/records/:id` 和 `POST /api/meetings/import`,错误必须返回结构化 JSON,便于 CLI 与 frontend 诊断。 +- CLI:`bun scripts/cli.ts decision upload <markdown-file>`、`decision list`、`decision show <id>` 和 `decision health` 只能通过 backend-core 用户服务代理访问 Decision Center,不得直连 D601 Service、NodePort 或 provider-gateway `microservice.http`。 +- UniDesk 前端:`用户服务 / Decision Center` React 页面只展示高密度记录、筛选、当前 G0/G1 目标、P0/P1 blocker、停放事项和最近会议/决议;默认不得展示裸 JSON,完整原始数据只能通过 `查看原始JSON` 打开。 + ### MDTODO k3s-Managed 当前 MDTODO 作为 `id=mdtodo` 的 `k3sctl-managed` 用户服务登记在 `config.json`,用于把 D601 Windows 工作区 `F:\Work\vscode-mdtodo` 从 VS Code 扩展形态拆成 UniDesk 可代理的后端服务: @@ -212,6 +224,7 @@ Baidu Netdisk 在 UniDesk 语境中按纯后端服务管理:不得暴露百度 - `pipeline`:Pipeline v2 控制与观测服务,UniDesk frontend 渲染组件矩阵、React Flow 控制图、epoch 甘特图、运行材料索引和 node 精细控制面板。 - `met-nonlinear`:MET Nonlinear 训练编排服务,UniDesk frontend 渲染 GPU/镜像、训练队列、Project config 预览、训练进度、ETA 和历史记录。 - `claudeqq`:ClaudeQQ 纯后端 QQ 消息网关,UniDesk frontend 渲染 NapCat 连接、事件订阅、消息推送、最近 QQ 事件和发送记录。 +- `decision-center`:Decision Center 决策权威记录服务,D601 k3s 代管,状态写入主 PostgreSQL,UniDesk frontend 渲染记录筛选、目标、blocker、停放事项和会议/决议。 ### D601 Docker/k3s Restart Recovery diff --git a/docs/reference/repo-tree.md b/docs/reference/repo-tree.md index dc79570b..ba5cbe7b 100644 --- a/docs/reference/repo-tree.md +++ b/docs/reference/repo-tree.md @@ -75,6 +75,7 @@ - src/met-nonlinear.tsx (MET Nonlinear D601 training orchestration React page; do not fold back into `app.tsx`) - src/code-queue.tsx (Code Queue user-service React page; do not fold back into `app.tsx`) - src/oa-event-flow.tsx (Unified OA event flow and Trace/STEP stats React page; do not fold back into `app.tsx`) + - src/decision-center.tsx (Decision Center records dashboard; do not fold back into `app.tsx`) - src/k3sctl.tsx (k3s Control Plane React page backed only by `k3sctl-adapter`; do not fold back into `app.tsx`) - public/ (HTML/CSS static assets for the compact industrial console; no handwritten app JS) - provider-gateway/ (Compute node Provider Gateway container) @@ -89,6 +90,7 @@ - microservices/ (UniDesk-owned user services and compatibility examples) - code-queue/ (Codex/OpenCode queue backend; k3s-managed when exposed through UniDesk) - oa-event-flow/ (Unified OA event ledger, tag stream, and Trace/STEP stats center) + - decision-center/ (Decision records backend; k3s-managed on D601 and PostgreSQL-backed) - k3sctl-adapter/ (D601 k3s control-plane adapter and managed service manifests) - k3s/ci/ (Tekton CI install marker, Pipeline/Task, and in-cluster Trigger manifests) - example-service/ diff --git a/scripts/cli.ts b/scripts/cli.ts index 49998d62..badc1d8c 100644 --- a/scripts/cli.ts +++ b/scripts/cli.ts @@ -9,6 +9,7 @@ import { runSsh } from "./src/ssh"; import { extractRemoteCliOptions, runRemoteCli } from "./src/remote"; import { runMicroserviceCommand } from "./src/microservices"; import { runCodeQueueCommand } from "./src/code-queue"; +import { runDecisionCenterCommand } from "./src/decision-center"; import { runCodeQueueDeployCompatCommand, runDeployCommand } from "./src/deploy"; import { runProviderCommand } from "./src/provider-attach"; import { runScheduleCommand } from "./src/schedules"; @@ -45,6 +46,9 @@ function help(): unknown { { command: "microservice status <id>", description: "Show one user service config, repository reference, backend mapping, and runtime status." }, { command: "microservice health <id>", description: "Probe one user service through backend-core -> provider-gateway HTTP proxy." }, { command: "microservice proxy <id> <path> [--method GET|POST|PUT|PATCH|DELETE] [--raw] [--max-body-bytes N]", description: "Access a private user-service backend path through the same frontend-only proxy used by WebUI; large bodies are summarized unless --raw is set." }, + { command: "decision upload <markdown-file> [--title text] [--type meeting|decision] [--level G0|G1|G2|G3|P0|P1|P2|P3|none] [--status active|blocked|parked|done] [--linked-goal-id id] [--evidence url]", description: "Upload a meeting note or decision record through backend-core -> decision-center user-service proxy." }, + { command: "decision list [--type ...] [--status ...] [--level ...] [--linked-goal-id id] [--limit N]", description: "List Decision Center records through the user-service proxy." }, + { command: "decision show <id>", description: "Show one Decision Center record." }, { command: "deploy check|plan|apply [--file deploy.json] [--service id] [--dry-run] [--force]", description: "Reconcile services from a repo+commit manifest using target-side build and live commit verification." }, { command: "schedule list|get|runs|run|delete", description: "Manage backend-core scheduled tasks and run history; schedule run <id> supports --wait-ms N." }, { command: "schedule upsert-pgdata-backup [--time HH:MM] [--remote-base /SERVER_DATA/UNIDESK_PG_DATA]", description: "Create or update the daily PGDATA physical backup task that uploads monthly rotated archives to Baidu Netdisk." }, @@ -184,6 +188,11 @@ async function main(): Promise<void> { return; } + if (top === "decision" || top === "decision-center") { + emitJson(commandName, await runDecisionCenterCommand(config, args.slice(1))); + return; + } + if (top === "deploy") { const result = await runDeployCommand(config, args.slice(1)); const ok = (result as { ok?: unknown }).ok !== false; diff --git a/scripts/src/check.ts b/scripts/src/check.ts index e25ee7d4..57c463e5 100644 --- a/scripts/src/check.ts +++ b/scripts/src/check.ts @@ -39,6 +39,7 @@ function unifiedLogRotationItem(): CheckItem { "src/components/microservices/project-manager/src/index.ts", "src/components/microservices/baidu-netdisk/src/index.ts", "src/components/microservices/oa-event-flow/src/index.ts", + "src/components/microservices/decision-center/src/index.ts", ]; const offenders = serviceFiles.flatMap((path) => { const text = readFileSync(rootPath(path), "utf8"); @@ -70,6 +71,7 @@ export function runChecks(config: UniDeskConfig): { ok: boolean; items: CheckIte fileItem("src/components/microservices/oa-event-flow/src/index.ts"), fileItem("src/components/microservices/k3sctl-adapter/src/index.ts"), fileItem("src/components/microservices/mdtodo/src/index.ts"), + fileItem("src/components/microservices/decision-center/src/index.ts"), fileItem("scripts/src/deploy.ts"), fileItem("scripts/src/e2e.ts"), unifiedLogRotationItem(), diff --git a/scripts/src/decision-center.ts b/scripts/src/decision-center.ts new file mode 100644 index 00000000..74767cdf --- /dev/null +++ b/scripts/src/decision-center.ts @@ -0,0 +1,208 @@ +import { readFileSync } from "node:fs"; +import { resolve } from "node:path"; +import { type UniDeskConfig, repoRoot } from "./config"; +import { coreInternalFetch } from "./microservices"; + +type DecisionRecordType = "meeting" | "decision" | "goal" | "blocker" | "debt" | "experiment"; +type DecisionRecordLevel = "G0" | "G1" | "G2" | "G3" | "P0" | "P1" | "P2" | "P3" | "none"; +type DecisionRecordStatus = "active" | "blocked" | "parked" | "done"; + +const serviceId = "decision-center"; +const typeValues = new Set<DecisionRecordType>(["meeting", "decision", "goal", "blocker", "debt", "experiment"]); +const levelValues = new Set<DecisionRecordLevel>(["G0", "G1", "G2", "G3", "P0", "P1", "P2", "P3", "none"]); +const statusValues = new Set<DecisionRecordStatus>(["active", "blocked", "parked", "done"]); + +function optionValue(args: string[], names: string[]): string | undefined { + for (const name of names) { + const index = args.indexOf(name); + if (index === -1) continue; + const raw = args[index + 1]; + if (raw === undefined || raw.length === 0 || raw.startsWith("--")) throw new Error(`${name} requires a non-empty value`); + return raw; + } + return undefined; +} + +function optionValues(args: string[], names: string[]): string[] { + const values: string[] = []; + for (let index = 0; index < args.length; index += 1) { + if (!names.includes(args[index] ?? "")) continue; + const raw = args[index + 1]; + if (raw === undefined || raw.length === 0 || raw.startsWith("--")) throw new Error(`${args[index]} requires a non-empty value`); + values.push(raw); + index += 1; + } + return values; +} + +function positionalArgs(args: string[]): string[] { + const positions: string[] = []; + for (let index = 0; index < args.length; index += 1) { + const value = args[index] ?? ""; + if (value.startsWith("--")) { + index += 1; + continue; + } + positions.push(value); + } + return positions; +} + +function parseType(raw: string | undefined, fallback: DecisionRecordType): DecisionRecordType { + const value = raw || fallback; + if (!typeValues.has(value as DecisionRecordType)) throw new Error(`--type must be one of: ${Array.from(typeValues).join(", ")}`); + return value as DecisionRecordType; +} + +function parseLevel(raw: string | undefined, fallback: DecisionRecordLevel): DecisionRecordLevel { + const value = raw || fallback; + if (!levelValues.has(value as DecisionRecordLevel)) throw new Error(`--level must be one of: ${Array.from(levelValues).join(", ")}`); + return value as DecisionRecordLevel; +} + +function parseStatus(raw: string | undefined, fallback: DecisionRecordStatus): DecisionRecordStatus { + const value = raw || fallback; + if (!statusValues.has(value as DecisionRecordStatus)) throw new Error(`--status must be one of: ${Array.from(statusValues).join(", ")}`); + return value as DecisionRecordStatus; +} + +function splitList(values: string[]): string[] { + return [...new Set(values.flatMap((value) => value.split(",")).map((value) => value.trim()).filter(Boolean))]; +} + +function readMarkdownFile(path: string): { absolutePath: string; markdown: string } { + const absolutePath = resolve(repoRoot, path); + const markdown = readFileSync(absolutePath, "utf8"); + if (markdown.trim().length === 0) throw new Error(`markdown file is empty: ${absolutePath}`); + if (markdown.length > 1_000_000) throw new Error(`markdown file is too large: ${absolutePath}`); + return { absolutePath, markdown }; +} + +function decisionProxy(path: string, init?: { method?: string; body?: unknown }): unknown { + return coreInternalFetch(`/api/microservices/${encodeURIComponent(serviceId)}/proxy${path}`, init); +} + +async function decisionProxyAsync( + fetcher: (path: string, init?: { method?: string; body?: unknown }) => Promise<unknown>, + path: string, + init?: { method?: string; body?: unknown }, +): Promise<unknown> { + return await fetcher(`/api/microservices/${encodeURIComponent(serviceId)}/proxy${path}`, init); +} + +function unwrapProxyResponse(response: unknown): unknown { + const record = typeof response === "object" && response !== null && !Array.isArray(response) ? response as Record<string, unknown> : {}; + if (record.ok !== true) return response; + const body = record.body; + return { upstream: { ok: record.ok, status: record.status }, body }; +} + +function uploadMeeting(args: string[]): unknown { + const file = positionalArgs(args)[0]; + if (!file) throw new Error("decision upload requires markdown file"); + const { absolutePath, markdown } = readMarkdownFile(file); + const type = parseType(optionValue(args, ["--type"]), "meeting"); + const payload = { + markdown, + title: optionValue(args, ["--title"]), + type, + level: parseLevel(optionValue(args, ["--level"]), "none"), + status: parseStatus(optionValue(args, ["--status"]), "active"), + linkedGoalId: optionValue(args, ["--linked-goal-id", "--linkedGoalId"]), + tags: splitList(optionValues(args, ["--tag", "--tags"])), + evidenceLinks: splitList(optionValues(args, ["--evidence", "--evidence-link", "--evidenceLinks"])), + sourceSession: optionValue(args, ["--source-session", "--sourceSession"]), + taskId: optionValue(args, ["--task-id", "--taskId"]), + commitId: optionValue(args, ["--commit-id", "--commitId"]), + }; + const endpoint = type === "meeting" ? "/api/meetings/import" : "/api/records"; + const body = type === "meeting" ? payload : { ...payload, body: markdown }; + return { file: absolutePath, result: unwrapProxyResponse(decisionProxy(endpoint, { method: "POST", body })) }; +} + +async function uploadMeetingAsync(args: string[], fetcher: (path: string, init?: { method?: string; body?: unknown }) => Promise<unknown>): Promise<unknown> { + const file = positionalArgs(args)[0]; + if (!file) throw new Error("decision upload requires markdown file"); + const { absolutePath, markdown } = readMarkdownFile(file); + const type = parseType(optionValue(args, ["--type"]), "meeting"); + const payload = { + markdown, + title: optionValue(args, ["--title"]), + type, + level: parseLevel(optionValue(args, ["--level"]), "none"), + status: parseStatus(optionValue(args, ["--status"]), "active"), + linkedGoalId: optionValue(args, ["--linked-goal-id", "--linkedGoalId"]), + tags: splitList(optionValues(args, ["--tag", "--tags"])), + evidenceLinks: splitList(optionValues(args, ["--evidence", "--evidence-link", "--evidenceLinks"])), + sourceSession: optionValue(args, ["--source-session", "--sourceSession"]), + taskId: optionValue(args, ["--task-id", "--taskId"]), + commitId: optionValue(args, ["--commit-id", "--commitId"]), + }; + const endpoint = type === "meeting" ? "/api/meetings/import" : "/api/records"; + const body = type === "meeting" ? payload : { ...payload, body: markdown }; + return { file: absolutePath, result: unwrapProxyResponse(await decisionProxyAsync(fetcher, endpoint, { method: "POST", body })) }; +} + +function listRecords(args: string[]): unknown { + const params = new URLSearchParams(); + const type = optionValue(args, ["--type"]); + const status = optionValue(args, ["--status"]); + const level = optionValue(args, ["--level"]); + const linkedGoalId = optionValue(args, ["--linked-goal-id", "--linkedGoalId"]); + const limit = optionValue(args, ["--limit"]); + if (type !== undefined) params.set("type", parseType(type, "meeting")); + if (status !== undefined) params.set("status", parseStatus(status, "active")); + if (level !== undefined) params.set("level", parseLevel(level, "none")); + if (linkedGoalId !== undefined) params.set("linkedGoalId", linkedGoalId); + if (limit !== undefined) params.set("limit", limit); + const query = params.toString(); + return unwrapProxyResponse(decisionProxy(`/api/records${query ? `?${query}` : ""}`)); +} + +async function listRecordsAsync(args: string[], fetcher: (path: string, init?: { method?: string; body?: unknown }) => Promise<unknown>): Promise<unknown> { + const params = new URLSearchParams(); + const type = optionValue(args, ["--type"]); + const status = optionValue(args, ["--status"]); + const level = optionValue(args, ["--level"]); + const linkedGoalId = optionValue(args, ["--linked-goal-id", "--linkedGoalId"]); + const limit = optionValue(args, ["--limit"]); + if (type !== undefined) params.set("type", parseType(type, "meeting")); + if (status !== undefined) params.set("status", parseStatus(status, "active")); + if (level !== undefined) params.set("level", parseLevel(level, "none")); + if (linkedGoalId !== undefined) params.set("linkedGoalId", linkedGoalId); + if (limit !== undefined) params.set("limit", limit); + const query = params.toString(); + return unwrapProxyResponse(await decisionProxyAsync(fetcher, `/api/records${query ? `?${query}` : ""}`)); +} + +function showRecord(id: string | undefined): unknown { + if (!id) throw new Error("decision show requires record id"); + return unwrapProxyResponse(decisionProxy(`/api/records/${encodeURIComponent(id)}`)); +} + +async function showRecordAsync(id: string | undefined, fetcher: (path: string, init?: { method?: string; body?: unknown }) => Promise<unknown>): Promise<unknown> { + if (!id) throw new Error("decision show requires record id"); + return unwrapProxyResponse(await decisionProxyAsync(fetcher, `/api/records/${encodeURIComponent(id)}`)); +} + +export async function runDecisionCenterCommand(_config: UniDeskConfig, args: string[]): Promise<unknown> { + const [action = "list", id] = args; + if (action === "upload") return uploadMeeting(args.slice(1)); + if (action === "list") return listRecords(args.slice(1)); + if (action === "show") return showRecord(id); + if (action === "health") return unwrapProxyResponse(coreInternalFetch(`/api/microservices/${encodeURIComponent(serviceId)}/health`)); + throw new Error("decision command must be one of: upload, list, show, health"); +} + +export async function runDecisionCenterCommandAsync( + _config: UniDeskConfig, + args: string[], + fetcher: (path: string, init?: { method?: string; body?: unknown }) => Promise<unknown>, +): Promise<unknown> { + const [action = "list", id] = args; + if (action === "upload") return uploadMeetingAsync(args.slice(1), fetcher); + if (action === "list") return listRecordsAsync(args.slice(1), fetcher); + if (action === "show") return showRecordAsync(id, fetcher); + if (action === "health") return unwrapProxyResponse(await fetcher(`/api/microservices/${encodeURIComponent(serviceId)}/health`)); + throw new Error("decision command must be one of: upload, list, show, health"); +} diff --git a/scripts/src/e2e.ts b/scripts/src/e2e.ts index 628bb160..2156547e 100644 --- a/scripts/src/e2e.ts +++ b/scripts/src/e2e.ts @@ -39,6 +39,7 @@ const NETWORK_CHECK_NAMES = [ "network:todo-note-public-blocked", "network:code-queue-public-blocked", "network:oa-event-flow-public-blocked", + "network:decision-center-public-blocked", "network:filebrowser-public-blocked", ] as const; @@ -61,6 +62,7 @@ const SERVICE_CHECK_NAMES = [ "microservice:catalog-todo-note", "microservice:catalog-oa-event-flow", "microservice:catalog-code-queue", + "microservice:catalog-decision-center", "microservice:k3sctl-adapter-status", "microservice:k3sctl-control-plane", "microservice:catalog-filebrowser", @@ -92,6 +94,9 @@ const SERVICE_CHECK_NAMES = [ "microservice:code-queue-status", "microservice:code-queue-health", "microservice:code-queue-tasks", + "microservice:decision-center-status", + "microservice:decision-center-health", + "microservice:decision-center-records", "microservice:oa-event-flow-status", "microservice:oa-event-flow-health", "microservice:oa-event-flow-diagnostics", @@ -129,6 +134,7 @@ const FRONTEND_CHECK_NAMES = [ "frontend:todo-note-integrated-visible", "frontend:findjob-integrated-visible", "frontend:oa-event-flow-visible", + "frontend:decision-center-visible", "frontend:code-queue-integrated-visible", "frontend:code-queue-enqueue-await-smoke", "frontend:code-queue-summary-mobile-wrap", @@ -321,6 +327,7 @@ const LAYOUT_OVERFLOW_PAGE_TEST_IDS: Record<string, string> = { "/app/met-nonlinear/": "met-nonlinear-page", "/app/claudeqq/": "claudeqq-page", "/app/oa-event-flow/": "oa-event-flow-page", + "/app/decision-center/": "decision-center-page", "/app/code-queue/": "code-queue-page", }; @@ -1025,6 +1032,7 @@ async function exposureChecks(config: UniDeskConfig, urls: PublicUrls, checks: E const codeQueuePublic = await fetchProbe(`http://${config.network.publicHost}:14222/health`, 2500); const oaEventFlowPublic = await fetchProbe(`http://${config.network.publicHost}:4255/health`, 2500); const oaEventFlowRestriction = dockerUserPortRestriction(4255, allowedSourceCidrs); + const decisionCenterPublic = await fetchProbe(`http://${config.network.publicHost}:4277/health`, 2500); const filebrowserPublic = await fetchProbe(`http://${config.network.publicHost}:4251/health`, 2500); addSelectedCheck(checks, options, "network:only-frontend-provider-ports", !portsText.includes(`:${config.network.core.port}->`) && !portsText.includes(":14222->"), portSummary); addSelectedCheck(checks, options, "network:core-public-blocked", (corePublic as { reachable?: boolean }).reachable === false, corePublic); @@ -1035,6 +1043,7 @@ async function exposureChecks(config: UniDeskConfig, urls: PublicUrls, checks: E addSelectedCheck(checks, options, "network:todo-note-public-blocked", (todoNotePublic as { reachable?: boolean }).reachable === false, todoNotePublic); addSelectedCheck(checks, options, "network:code-queue-public-blocked", (codeQueuePublic as { reachable?: boolean }).reachable === false, codeQueuePublic); addSelectedCheck(checks, options, "network:oa-event-flow-public-blocked", publicProbeBlockedOrRestricted(oaEventFlowPublic, oaEventFlowRestriction), { publicProbe: oaEventFlowPublic, restriction: oaEventFlowRestriction }); + addSelectedCheck(checks, options, "network:decision-center-public-blocked", (decisionCenterPublic as { reachable?: boolean }).reachable === false, decisionCenterPublic); addSelectedCheck(checks, options, "network:filebrowser-public-blocked", (filebrowserPublic as { reachable?: boolean }).reachable === false, filebrowserPublic); } @@ -1077,6 +1086,9 @@ async function serviceChecks(config: UniDeskConfig, urls: PublicUrls, checks: E2 const codeQueueStatus = dockerCoreJson("/api/microservices/code-queue/status"); const codeQueueHealth = dockerCoreJson("/api/microservices/code-queue/health"); const codeQueueTasks = dockerCoreJson("/api/microservices/code-queue/proxy/api/tasks/overview?limit=5&transcriptLimit=1&compact=1&afterSeq=0&preferId="); + const decisionCenterStatus = dockerCoreJson("/api/microservices/decision-center/status"); + const decisionCenterHealth = dockerCoreJson("/api/microservices/decision-center/health"); + const decisionCenterRecords = dockerCoreJson("/api/microservices/decision-center/proxy/api/records?limit=20"); const filebrowserHealth = dockerCoreJson("/api/microservices/filebrowser/health"); const filebrowserWebui = dockerCoreJson("/api/microservices/filebrowser/proxy/"); const filebrowserD601Health = dockerCoreJson("/api/microservices/filebrowser-d601/health"); @@ -1127,6 +1139,7 @@ async function serviceChecks(config: UniDeskConfig, urls: PublicUrls, checks: E2 const todoNote = microserviceList.find((service) => service.id === "todo-note"); const oaEventFlow = microserviceList.find((service) => service.id === "oa-event-flow"); const codeQueue = microserviceList.find((service) => service.id === "code-queue"); + const decisionCenter = microserviceList.find((service) => service.id === "decision-center"); const filebrowser = microserviceList.find((service) => service.id === "filebrowser"); const filebrowserD601 = microserviceList.find((service) => service.id === "filebrowser-d601"); const findjobSummaryBody = (findjobSummary as { body?: { totalJobs?: number; prioritizedJobs?: number } }).body; @@ -1150,6 +1163,8 @@ async function serviceChecks(config: UniDeskConfig, urls: PublicUrls, checks: E2 const oaEventFlowStatsBody = (oaEventFlowStats as { body?: { ok?: boolean; stats?: unknown[]; returned?: number } }).body; const codeQueueHealthBody = (codeQueueHealth as { body?: { ok?: boolean; egressProxy?: { connected?: boolean }; queue?: { defaultModel?: string; judgeConfigured?: boolean; modelReasoningEfforts?: Record<string, string> } } }).body; const codeQueueTasksBody = (codeQueueTasks as { body?: { ok?: boolean; queue?: { defaultModel?: string; modelReasoningEfforts?: Record<string, string> }; tasks?: unknown[] } }).body; + const decisionCenterHealthBody = (decisionCenterHealth as { body?: { ok?: boolean; service?: string; storage?: string; schemaReady?: boolean; recordCount?: number; deploy?: { commit?: string } } }).body; + const decisionCenterRecordsBody = (decisionCenterRecords as { body?: { ok?: boolean; records?: unknown[]; returned?: number } }).body; const k3sctlControlPlaneBody = (k3sctlControlPlane as { body?: { ok?: boolean; clusterId?: string; @@ -1169,6 +1184,7 @@ async function serviceChecks(config: UniDeskConfig, urls: PublicUrls, checks: E2 } }).body; const k3sctlCodeQueueService = k3sctlControlPlaneBody?.services?.find((service) => service.id === "code-queue"); const k3sctlClaudeqqService = k3sctlControlPlaneBody?.services?.find((service) => service.id === "claudeqq"); + const k3sctlDecisionCenterService = k3sctlControlPlaneBody?.services?.find((service) => service.id === "decision-center"); const filebrowserHealthBody = (filebrowserHealth as { body?: { status?: string } }).body; const filebrowserD601HealthBody = (filebrowserD601Health as { body?: { status?: string } }).body; const filebrowserWebuiText = String((filebrowserWebui as { body?: { text?: string } }).body?.text || ""); @@ -1216,6 +1232,15 @@ async function serviceChecks(config: UniDeskConfig, urls: PublicUrls, checks: E2 && codeQueue.runtime?.orchestrator === "k3sctl" && codeQueue.runtime?.container === null, { microservices }); + addSelectedCheck(checks, options, "microservice:catalog-decision-center", + (microservices as { ok?: boolean }).ok === true + && decisionCenter?.providerId === "D601" + && decisionCenter.backend?.public === false + && decisionCenter.backend?.proxyMode === "k3sctl-adapter-http" + && decisionCenter.deployment?.mode === "k3sctl-managed" + && decisionCenter.runtime?.orchestrator === "k3sctl" + && decisionCenter.runtime?.container === null, + { microservices }); addSelectedCheck(checks, options, "microservice:k3sctl-adapter-status", (k3sctlStatus as { ok?: boolean; body?: { microservice?: { id?: string; providerId?: string } } }).ok === true && (k3sctlStatus as { body?: { microservice?: { id?: string; providerId?: string } } }).body?.microservice?.id === "k3sctl-adapter" @@ -1238,6 +1263,11 @@ async function serviceChecks(config: UniDeskConfig, urls: PublicUrls, checks: E2 && k3sctlClaudeqqService?.servingHealthy === true && k3sctlClaudeqqService?.active?.id === "D601" && k3sctlClaudeqqService?.active?.healthy === true + && k3sctlDecisionCenterService?.status === "healthy" + && k3sctlDecisionCenterService?.topologyComplete === true + && k3sctlDecisionCenterService?.servingHealthy === true + && k3sctlDecisionCenterService?.active?.id === "D601" + && k3sctlDecisionCenterService?.active?.healthy === true && (k3sctlCodeQueueService?.presentNodeIds ?? []).includes("D601") && (k3sctlCodeQueueService?.missingNodeIds ?? []).length === 0 && (k3sctlCodeQueueService?.instances ?? []).some((instance) => instance.id === "D601" && instance.healthy === true), @@ -1248,6 +1278,7 @@ async function serviceChecks(config: UniDeskConfig, urls: PublicUrls, checks: E2 kubeApiProxy: k3sctlControlPlaneBody?.kubeApiProxy, service: k3sctlCodeQueueService, claudeqq: k3sctlClaudeqqService, + decisionCenter: k3sctlDecisionCenterService, }); addSelectedCheck(checks, options, "microservice:catalog-filebrowser", (microservices as { ok?: boolean }).ok === true && filebrowser?.providerId === "D518" @@ -1319,6 +1350,9 @@ async function serviceChecks(config: UniDeskConfig, urls: PublicUrls, checks: E2 addSelectedCheck(checks, options, "microservice:code-queue-status", (codeQueueStatus as { ok?: boolean }).ok === true && (codeQueueStatus as { body?: { microservice?: { id?: string; providerId?: string } } }).body?.microservice?.providerId === "D601", codeQueueStatus); addSelectedCheck(checks, options, "microservice:code-queue-health", (codeQueueHealth as { ok?: boolean }).ok === true && codeQueueHealthBody?.ok === true && codeQueueHealthBody.egressProxy?.connected === true && codeQueueHealthBody.queue?.defaultModel === "gpt-5.5" && codeQueueHealthBody.queue?.modelReasoningEfforts?.["gpt-5.5"] === "xhigh", codeQueueHealth); addSelectedCheck(checks, options, "microservice:code-queue-tasks", (codeQueueTasks as { ok?: boolean }).ok === true && codeQueueTasksBody?.ok === true && Array.isArray(codeQueueTasksBody.tasks) && codeQueueTasksBody.queue?.defaultModel === "gpt-5.5" && codeQueueTasksBody.queue?.modelReasoningEfforts?.["gpt-5.5"] === "xhigh", codeQueueTasks); + addSelectedCheck(checks, options, "microservice:decision-center-status", (decisionCenterStatus as { ok?: boolean }).ok === true && (decisionCenterStatus as { body?: { microservice?: { id?: string; providerId?: string } } }).body?.microservice?.providerId === "D601", decisionCenterStatus); + addSelectedCheck(checks, options, "microservice:decision-center-health", (decisionCenterHealth as { ok?: boolean }).ok === true && decisionCenterHealthBody?.ok === true && decisionCenterHealthBody.service === "decision-center" && decisionCenterHealthBody.storage === "postgres" && decisionCenterHealthBody.schemaReady === true, decisionCenterHealth); + addSelectedCheck(checks, options, "microservice:decision-center-records", (decisionCenterRecords as { ok?: boolean }).ok === true && decisionCenterRecordsBody?.ok === true && Array.isArray(decisionCenterRecordsBody.records), decisionCenterRecords); const upgradeDispatch = dockerCoreJson("/api/dispatch", { method: "POST", body: { providerId: config.providerGateway.id, command: "provider.upgrade", payload: { source: "cli-e2e", mode: "plan" } }, @@ -1417,6 +1451,7 @@ async function frontendCheck(config: UniDeskConfig, urls: PublicUrls, checks: E2 const needTodoNote = wants("frontend:todo-note-integrated-visible"); const needFindJob = wants("frontend:findjob-integrated-visible"); const needOaEventFlow = wants("frontend:oa-event-flow-visible"); + const needDecisionCenter = wants("frontend:decision-center-visible"); const needCodeQueue = wantsAny([ "frontend:code-queue-integrated-visible", "frontend:code-queue-enqueue-await-smoke", @@ -1502,6 +1537,10 @@ async function frontendCheck(config: UniDeskConfig, urls: PublicUrls, checks: E2 let findjobText = ""; let oaEventFlowText = ""; let oaEventFlowMetrics: any = { pageVisible: false, eventTableVisible: false, statsVisible: false, tagFilterValue: "", rawButtonCount: 0 }; + let decisionCenterText = ""; + let decisionCenterE2eRecord: any = null; + let decisionCenterDeleteResult: any = null; + let decisionCenterMetrics: any = { pageVisible: false, tableVisible: false, rawButtonCount: 0, rawJsonBlocks: 0, chatInputCount: 0, bodyContainsRecord: false }; let codeQueueText = ""; let codeQueueOutputText = ""; let codeQueueTaskCount = 0; @@ -1717,9 +1756,9 @@ async function frontendCheck(config: UniDeskConfig, urls: PublicUrls, checks: E2 } } - if (needMicroserviceCatalog || needTodoNote || needFindJob || needOaEventFlow || needCodeQueue || needClaudeqq || needRouteDeepLink || needPipeline || needMetNonlinear) { + if (needMicroserviceCatalog || needTodoNote || needFindJob || needOaEventFlow || needDecisionCenter || needCodeQueue || needClaudeqq || needRouteDeepLink || needPipeline || needMetNonlinear) { await page.getByRole("button", { name: /用户服务/ }).click(); - if (needMicroserviceCatalog || needTodoNote || needFindJob || needOaEventFlow || needCodeQueue || needClaudeqq || needRouteDeepLink || needPipeline || needMetNonlinear) { + if (needMicroserviceCatalog || needTodoNote || needFindJob || needOaEventFlow || needDecisionCenter || needCodeQueue || needClaudeqq || needRouteDeepLink || needPipeline || needMetNonlinear) { await page.waitForSelector('[data-testid="microservice-catalog-page"]', { timeout: 10000 }); } if (needMicroserviceCatalog) { @@ -1730,6 +1769,7 @@ async function frontendCheck(config: UniDeskConfig, urls: PublicUrls, checks: E2 await page.waitForSelector('[data-testid="microservice-row-todo-note"]', { timeout: 10000 }); await page.waitForSelector('[data-testid="microservice-row-oa-event-flow"]', { timeout: 10000 }); await page.waitForSelector('[data-testid="microservice-row-code-queue"]', { timeout: 10000 }); + await page.waitForSelector('[data-testid="microservice-row-decision-center"]', { timeout: 10000 }); microserviceCatalogText = await page.locator('[data-testid="microservice-catalog-page"]').innerText({ timeout: 5000 }); } if (needTodoNote) { @@ -1806,6 +1846,65 @@ async function frontendCheck(config: UniDeskConfig, urls: PublicUrls, checks: E2 }; }); } + if (needDecisionCenter) { + const decisionCenterE2eTitle = `E2E Decision Center ${Date.now()}`; + decisionCenterE2eRecord = await page.evaluate(async (title) => { + const response = await fetch("/api/microservices/decision-center/proxy/api/records", { + method: "POST", + credentials: "same-origin", + headers: { "content-type": "application/json" }, + body: JSON.stringify({ + type: "meeting", + level: "G1", + status: "active", + title, + body: "E2E seeded meeting record for Decision Center frontend validation.", + tags: ["e2e", "decision-center"], + evidenceLinks: ["https://example.com/unidesk/decision-center-e2e"], + }), + }); + return { ok: response.ok, status: response.status, body: await response.json().catch(() => null) }; + }, decisionCenterE2eTitle); + await page.getByRole("button", { name: /Decision Center/ }).click(); + await page.waitForSelector('[data-testid="decision-center-page"]', { timeout: 10000 }); + await page.waitForSelector('[data-testid="decision-center-filters"]', { timeout: 30000 }); + await page.waitForSelector('[data-testid="decision-center-record-table"]', { timeout: 30000 }); + await page.waitForFunction((title) => { + const text = document.body.innerText; + return text.includes("Decision Center") + && text.includes("G0/G1 目标") + && text.includes("P0/P1 Blocker") + && text.includes("停放事项") + && text.includes("最近会议/决议") + && text.includes("查看原始JSON") + && text.includes(String(title)); + }, decisionCenterE2eTitle, { timeout: 30000 }); + decisionCenterText = await page.locator('[data-testid="decision-center-page"]').innerText({ timeout: 5000 }); + decisionCenterMetrics = await page.evaluate(() => { + const root = document.querySelector('[data-testid="decision-center-page"]') as HTMLElement | null; + const table = document.querySelector('[data-testid="decision-center-record-table"]') as HTMLElement | null; + return { + pageVisible: Boolean(root), + tableVisible: Boolean(table), + rawButtonCount: root?.querySelectorAll('[data-testid^="raw-decision-center"], .ghost-btn').length ?? 0, + rawJsonBlocks: root?.querySelectorAll("pre.raw-json, [data-testid='raw-json']").length ?? 0, + chatInputCount: root?.querySelectorAll("textarea, [contenteditable='true']").length ?? 0, + recordCardCount: root?.querySelectorAll('[data-testid^="decision-record-"]').length ?? 0, + tableRows: table?.querySelectorAll("tbody tr").length ?? 0, + textPreview: root?.innerText.slice(0, 1000) || "", + }; + }); + const decisionCenterRecordId = String(decisionCenterE2eRecord?.body?.record?.id || ""); + if (decisionCenterRecordId) { + decisionCenterDeleteResult = await page.evaluate(async (id) => { + const response = await fetch(`/api/microservices/decision-center/proxy/api/records/${encodeURIComponent(String(id))}`, { + method: "DELETE", + credentials: "same-origin", + }); + return { ok: response.ok, status: response.status, body: await response.json().catch(() => null) }; + }, decisionCenterRecordId); + } + } if (needCodeQueue) { await page.getByLabel("用户服务 子功能").getByRole("button", { name: "Code Queue" }).click(); await page.waitForSelector('[data-testid="code-queue-page"]', { timeout: 10000 }); @@ -2787,6 +2886,7 @@ async function frontendCheck(config: UniDeskConfig, urls: PublicUrls, checks: E2 const microserviceCatalogTextLower = microserviceCatalogText.toLowerCase(); const todoNoteTextLower = todoNoteText.toLowerCase(); const findjobTextLower = findjobText.toLowerCase(); + const decisionCenterTextLower = decisionCenterText.toLowerCase(); const codeQueueTextLower = codeQueueText.toLowerCase(); const claudeqqTextLower = claudeqqText.toLowerCase(); const pipelineTextLower = pipelineText.toLowerCase(); @@ -2821,7 +2921,7 @@ async function frontendCheck(config: UniDeskConfig, urls: PublicUrls, checks: E2 addSelectedCheck(checks, options, "frontend:gateway-duration-subsecond-visible", gatewayHasSubsecondDuration && !gatewayHasRoundedZeroDuration, { gatewayHasSubsecondDuration, gatewayHasRoundedZeroDuration, gatewayTextPreview: gatewayText.slice(0, 900) }); addSelectedCheck(checks, options, "frontend:provider-operation-availability-visible", sshAvailabilityTexts.length >= 1 && upgradeAvailabilityTexts.length >= 1 && sshAvailabilityTexts.every((text) => text.includes("SSH 透传")) && upgradeAvailabilityTexts.every((text) => text.includes("远程更新")) && upgradeAvailabilityTexts.some((text) => text.includes("always-enabled")), { sshAvailabilityTexts, upgradeAvailabilityTexts }); addSelectedCheck(checks, options, "frontend:overview-pgdata-visible", bodyText.includes("PGDATA") && bodyText.includes(config.database.volume), { bodyPreview: bodyText.slice(0, 800) }); - addSelectedCheck(checks, options, "frontend:microservice-catalog-visible", microserviceCatalogTextLower.includes("findjob") && microserviceCatalogTextLower.includes("pipeline") && microserviceCatalogTextLower.includes("todo note") && microserviceCatalogTextLower.includes("met nonlinear") && microserviceCatalogTextLower.includes("claudeqq") && microserviceCatalogTextLower.includes("oa event flow") && microserviceCatalogTextLower.includes("code queue") && microserviceCatalogText.includes("D601") && microserviceCatalogText.includes(config.providerGateway.id) && microserviceCatalogTextLower.includes("private") && microserviceCatalogText.includes("https://gitee.com/Lyon1998/findjob") && microserviceCatalogText.includes("https://github.com/pikasTech/pipeline") && microserviceCatalogText.includes("https://github.com/pikasTech/met_nonlinear") && microserviceCatalogText.includes("https://gitee.com/lyon1998/agent_skills") && microserviceCatalogText.includes("https://gitee.com/Lyon1998/todo_note") && microserviceCatalogText.includes("https://github.com/pikasTech/unidesk"), { microserviceCatalogPreview: microserviceCatalogText.slice(0, 2000) }); + addSelectedCheck(checks, options, "frontend:microservice-catalog-visible", microserviceCatalogTextLower.includes("findjob") && microserviceCatalogTextLower.includes("pipeline") && microserviceCatalogTextLower.includes("todo note") && microserviceCatalogTextLower.includes("met nonlinear") && microserviceCatalogTextLower.includes("claudeqq") && microserviceCatalogTextLower.includes("oa event flow") && microserviceCatalogTextLower.includes("code queue") && microserviceCatalogTextLower.includes("decision center") && microserviceCatalogText.includes("D601") && microserviceCatalogText.includes(config.providerGateway.id) && microserviceCatalogTextLower.includes("private") && microserviceCatalogText.includes("https://gitee.com/Lyon1998/findjob") && microserviceCatalogText.includes("https://github.com/pikasTech/pipeline") && microserviceCatalogText.includes("https://github.com/pikasTech/met_nonlinear") && microserviceCatalogText.includes("https://gitee.com/lyon1998/agent_skills") && microserviceCatalogText.includes("https://gitee.com/Lyon1998/todo_note") && microserviceCatalogText.includes("https://github.com/pikasTech/unidesk"), { microserviceCatalogPreview: microserviceCatalogText.slice(0, 2000) }); addSelectedCheck(checks, options, "frontend:todo-note-integrated-visible", todoNoteTextLower.includes("todo note 工作台") && todoNoteText.includes("CONSTAR") && todoNoteText.includes("大论文") && todoNoteText.includes("UI E2E smoke task") && todoNoteText.includes("撤销") && todoNoteText.includes("重做") && todoNoteText.includes("全部展开") && todoNoteText.includes("仅 UniDesk frontend 代理访问"), { todoNoteTextPreview: todoNoteText.slice(0, 1400) }); addSelectedCheck(checks, options, "frontend:findjob-integrated-visible", findjobTextLower.includes("findjob 工作台".toLowerCase()) && findjobText.includes("岗位总量") && findjobText.includes("D601") && findjobText.includes("近期岗位") && findjobText.includes("仅 UniDesk frontend 代理访问") && /岗位总量\s+\d+/.test(findjobText) && /health\s+ok/i.test(findjobText) && /[1-9]\d*\/[1-9]\d*\s+preview/i.test(findjobText), { findjobTextPreview: findjobText.slice(0, 1200) }); addSelectedCheck(checks, options, "frontend:oa-event-flow-visible", @@ -2837,6 +2937,24 @@ async function frontendCheck(config: UniDeskConfig, urls: PublicUrls, checks: E2 && Number(oaEventFlowMetrics.rawButtonCount || 0) >= 2 && !oaEventFlowText.includes("{\n"), { oaEventFlowMetrics, oaEventFlowTextPreview: oaEventFlowText.slice(0, 1400) }); + addSelectedCheck(checks, options, "frontend:decision-center-visible", + decisionCenterTextLower.includes("decision center") + && decisionCenterText.includes("G0/G1 目标") + && decisionCenterText.includes("P0/P1 Blocker") + && decisionCenterText.includes("停放事项") + && decisionCenterText.includes("最近会议/决议") + && decisionCenterText.includes("全部记录") + && decisionCenterText.includes("PostgreSQL") + && decisionCenterText.includes("查看原始JSON") + && decisionCenterMetrics.pageVisible === true + && decisionCenterMetrics.tableVisible === true + && decisionCenterE2eRecord?.ok === true + && decisionCenterDeleteResult?.ok === true + && Number(decisionCenterMetrics.rawButtonCount || 0) >= 1 + && Number(decisionCenterMetrics.rawJsonBlocks || 0) === 0 + && Number(decisionCenterMetrics.chatInputCount || 0) === 0 + && !decisionCenterText.includes("{\n"), + { decisionCenterMetrics, decisionCenterE2eRecord, decisionCenterDeleteResult, decisionCenterTextPreview: decisionCenterText.slice(0, 1400) }); addSelectedCheck(checks, options, "frontend:code-queue-integrated-visible", codeQueueTextLower.includes("code queue") && codeQueueText.includes("gpt-5.4-mini") && codeQueueText.includes("gpt-5.4") && codeQueueText.includes("gpt-5.5") && codeQueueText.includes("提交任务") && codeQueueText.includes("执行 Provider") && codeQueueText.includes("入队份数") && codeQueueText.includes("追加 prompt") && codeQueueText.includes("打断") && codeQueueTextLower.includes("查看 queue") && codeQueueText.includes("创建 queue") && codeQueueText.includes("合并 queue") && codeQueueOptions.some((text) => text.includes("All queues")) && codeQueueTracePlacement.firstChildIsTrace === true && codeQueueTracePlacement.noPageTopStatus === true && codeQueueTracePlacement.filterInsideTracePanel === true && codeQueueTracePlacement.taskSearchVisible === true && codeQueueTracePlacement.traceStatusVisible === true && codeQueueTracePlacement.markAllReadVisible === true && codeQueueGlobalStatus.activeMicroserviceVisible === true && codeQueueSidebarUpdateMetrics.hasRecentUpdateLabel === true && codeQueueHtmlGuard.rootAttrMissing === true && codeQueueHtmlGuard.sourceAttrMissing === true && codeQueueHtmlGuard.sourceNoBasePrompt === true && codeQueueSubmitQueueControl.tagName === "select" && codeQueueSubmitQueueControl.createButtonVisible === true && codeQueueSubmitQueueControl.mergeButtonVisible === true && codeQueueSubmitQueueControl.mergeSourceInlineMissing === true && codeQueueSubmitQueueControl.mergeDialogMissingBeforeClick === true && (codeQueueSubmitQueueControl.mergeButtonDisabled === true || (codeQueueSubmitQueueControl.mergeDialogVisible === true && codeQueueSubmitQueueControl.mergeDialogSelectVisible === true && Number(codeQueueSubmitQueueControl.mergeDialogSourceOptionCount || 0) > 1 && codeQueueSubmitQueueControl.mergeDialogSelectInsideSubmitForm !== true && codeQueueSubmitQueueControl.mergeDialogUsesCommonComponent === true && codeQueueSubmitQueueControl.mergeDialogDeleteNoteVisible === true)) && codeQueueSubmitQueueControl.oldInputMissing === true && codeQueueSubmitQueueControl.providerValue === "D601" && codeQueueSubmitQueueControl.cwdValue === "/workspace" && Array.isArray(codeQueueSubmitQueueControl.providerOptions) && codeQueueSubmitQueueControl.providerOptions.some((item: any) => item.value === "D601" && String(item.text || "").includes("/workspace")) && codeQueueSubmitQueueControl.maxAttemptsMax === "99" && codeQueueSubmitQueueControl.maxAttemptsValue === "99" && codeQueueSubmitQueueControl.moveQueueVisible === true && codeQueuePromptDefaultEmpty === true && codeQueueSubmitGuard.batchRowVisible === true && codeQueueSubmitGuard.checkboxVisible === true && codeQueueSubmitGuard.disabledBeforeConfirm === true && codeQueueSubmitGuard.enabledAfterConfirm === true && codeQueueSubmitGuard.waitElementMissingBeforeSubmit === true && codeQueueScrollbarMetrics.transcriptThin === true && codeQueueScrollbarMetrics.toolHorizontalHidden === true && (codeQueueSwitchMetrics.optionCount <= 1 || codeQueueSwitchMetrics.switched === true) && codeQueueTextLower.includes("attempts") && codeQueueText.includes("仅 UniDesk frontend 代理访问") && (codeQueueTaskCount === 0 || codeQueueOutputText.includes("Submitted prompt")), { codeQueueTaskCount, codeQueueOptions, codeQueueSwitchMetrics, codeQueueSubmitQueueControl, codeQueueSubmitGuard, codeQueueScrollbarMetrics, codeQueuePromptDefaultEmpty, codeQueueTracePlacement, codeQueueGlobalStatus, codeQueueSidebarUpdateMetrics, codeQueueHtmlGuard, codeQueueOutputPreview: codeQueueOutputText.slice(0, 900), codeQueueTextPreview: codeQueueText.slice(0, 1400) }); addSelectedCheck(checks, options, "frontend:code-queue-enqueue-await-smoke", codeQueueEnqueueAwaitSmoke.checked === true diff --git a/scripts/src/remote.ts b/scripts/src/remote.ts index 55843035..bf3f508e 100644 --- a/scripts/src/remote.ts +++ b/scripts/src/remote.ts @@ -5,6 +5,7 @@ import { summarizeMicroserviceProxyResponse } from "./microservices"; import { parseNetworkPerfOptions, runNetworkPerf } from "./network-perf"; import { isSshSkillDiscoveryArgs, parseSshArgs } from "./ssh"; import { codexJudgeQueryAsync, codexOutputQueryAsync, codexTaskQueryAsync } from "./code-queue"; +import { runDecisionCenterCommandAsync } from "./decision-center"; export interface RemoteCliOptions { host: string | null; @@ -558,7 +559,7 @@ async function runRemoteCliOverFrontend(options: RemoteCliOptions, config: UniDe emitRemoteJson(name, { transport: "frontend", baseUrl: session.baseUrl, - commands: ["debug health", "debug dispatch", "debug task", "ssh <providerId> <command>", "ssh <providerId> skills", "microservice list", "microservice status <id>", "microservice health <id>", "microservice proxy <id> <path>", "codex task <taskId>", "codex judge <taskId> --attempt N", "network perf"], + commands: ["debug health", "debug dispatch", "debug task", "ssh <providerId> <command>", "ssh <providerId> skills", "microservice list", "microservice status <id>", "microservice health <id>", "microservice proxy <id> <path>", "decision upload <markdown-file>", "decision list", "decision show <id>", "codex task <taskId>", "codex judge <taskId> --attempt N", "network perf"], }); return 0; } @@ -578,6 +579,19 @@ async function runRemoteCliOverFrontend(options: RemoteCliOptions, config: UniDe emitRemoteJson(name, await remoteMicroservice(session, args)); return 0; } + if (top === "decision" || top === "decision-center") { + const fetcher = (path: string, init?: { method?: string; body?: unknown }): Promise<FetchJsonResult> => { + const requestInit = init === undefined + ? undefined + : { + method: init.method, + body: init.body === undefined ? undefined : JSON.stringify(init.body), + }; + return frontendJson(session, path, requestInit, 30_000); + }; + emitRemoteJson(name, await runDecisionCenterCommandAsync(config, args.slice(1), fetcher)); + return 0; + } if (top === "codex") { emitRemoteJson(name, await remoteCodeQueue(session, args)); return 0; diff --git a/src/components/frontend/public/style.css b/src/components/frontend/public/style.css index 63564555..706f905f 100644 --- a/src/components/frontend/public/style.css +++ b/src/components/frontend/public/style.css @@ -1422,7 +1422,7 @@ input:focus, select:focus, textarea:focus { border-color: var(--accent-2); } .result-card dd { margin: 0; } .result-grid { grid-template-columns: repeat(auto-fill, minmax(280px, 1fr)); } -.microservice-page, .findjob-page, .pipeline-page, .met-page, .code-queue-page, .baidu-netdisk-page, .filebrowser-page, .oa-event-flow-page { +.microservice-page, .findjob-page, .pipeline-page, .met-page, .code-queue-page, .baidu-netdisk-page, .filebrowser-page, .oa-event-flow-page, .decision-center-page { display: grid; gap: 10px; } @@ -6617,8 +6617,121 @@ input:focus, select:focus, textarea:focus { border-color: var(--accent-2); } overflow-wrap: anywhere; } +.decision-hero { + display: grid; + grid-template-columns: minmax(0, 1fr) minmax(260px, 0.36fr); + gap: 10px; + align-items: stretch; +} +.decision-filter-bar { + display: grid; + grid-template-columns: repeat(4, minmax(160px, 1fr)); + gap: 10px; +} +.decision-filter-bar label { + display: grid; + gap: 5px; + color: var(--muted); + font-size: 11px; + letter-spacing: 0.12em; + text-transform: uppercase; +} +.decision-default-grid { + display: grid; + grid-template-columns: repeat(2, minmax(0, 1fr)); + gap: 10px; + align-items: start; +} +.decision-card-list { + display: grid; + gap: 8px; +} +.decision-record-card { + display: grid; + gap: 8px; + min-width: 0; + padding: 10px; + border: 1px solid var(--line-soft); + background: rgba(0,0,0,0.16); +} +.decision-record-card.compact { + padding: 9px; +} +.decision-record-head { + display: grid; + grid-template-columns: minmax(0, 1fr) auto; + gap: 8px; + align-items: start; +} +.decision-record-head strong { + display: block; + min-width: 0; + margin-top: 4px; + overflow-wrap: anywhere; + font-size: 14px; +} +.decision-record-meta, +.decision-record-foot, +.decision-tags, +.decision-evidence { + display: flex; + flex-wrap: wrap; + gap: 6px; + align-items: center; +} +.decision-record-meta span:first-child { + color: var(--accent); + font-size: 10px; + letter-spacing: 0.16em; + text-transform: uppercase; +} +.decision-summary { + margin: 0; + color: var(--muted); + line-height: 1.45; + overflow-wrap: anywhere; +} +.decision-markdown { + max-height: 280px; + overflow: auto; + padding-right: 4px; +} +.decision-record-foot { + color: var(--muted); + font-size: 11px; +} +.decision-record-foot code { + white-space: normal; + overflow-wrap: anywhere; +} +.decision-tags span { + padding: 2px 6px; + border: 1px solid rgba(78, 183, 168, 0.34); + color: var(--accent-2); + background: rgba(78, 183, 168, 0.08); + font-size: 11px; +} +.decision-evidence a { + color: var(--text); + text-decoration: none; + overflow-wrap: anywhere; +} +.decision-table th, +.decision-table td { + vertical-align: top; +} +.decision-table td strong, +.decision-table td code { + display: block; + min-width: 0; + overflow-wrap: anywhere; +} + @media (max-width: 1100px) { - .mdtodo-layout { + .mdtodo-layout, + .decision-hero, + .decision-filter-bar, + .decision-default-grid { grid-template-columns: 1fr; } } diff --git a/src/components/frontend/src/app.tsx b/src/components/frontend/src/app.tsx index 6c4fe61d..0e80ba88 100644 --- a/src/components/frontend/src/app.tsx +++ b/src/components/frontend/src/app.tsx @@ -4,6 +4,7 @@ import { createRoot } from "react-dom/client"; import { BaiduNetdiskPage } from "./baidu-netdisk"; import { ClaudeQqPage } from "./claudeqq"; import { CodeQueuePage } from "./code-queue"; +import { DecisionCenterPage } from "./decision-center"; import { FileBrowserPage } from "./filebrowser"; import { FindJobPage } from "./findjob"; import { MetNonlinearPage } from "./met-nonlinear"; @@ -1652,6 +1653,7 @@ function MicroserviceCatalogPage({ microservices, onRaw, onNavigate }: AnyRecord service.id === "k3sctl-adapter" ? h("button", { type: "button", className: "ghost-btn", onClick: () => onNavigate("apps", "k3sctl"), "data-testid": "open-k3sctl-button" }, "打开") : null, service.id === "code-queue" ? h("button", { type: "button", className: "ghost-btn", onClick: () => onNavigate("apps", "code-queue"), "data-testid": "open-code-queue-button" }, "打开") : null, service.id === "mdtodo" ? h("button", { type: "button", className: "ghost-btn", onClick: () => onNavigate("apps", "mdtodo"), "data-testid": "open-mdtodo-button" }, "打开") : null, + service.id === "decision-center" ? h("button", { type: "button", className: "ghost-btn", onClick: () => onNavigate("apps", "decision-center"), "data-testid": "open-decision-center-button" }, "打开") : null, service.id === "project-manager" ? h("button", { type: "button", className: "ghost-btn", onClick: () => onNavigate("apps", "project-manager"), "data-testid": "open-project-manager-button" }, "打开") : null, h(RawButton, { title: `用户服务 ${service.id}`, data: service, onOpen: onRaw }), ), @@ -2160,6 +2162,7 @@ function WorkArea({ activeModule, activeTab, data, session, refresh, onRaw, onNa if (activeModule === "apps" && activeTab === "k3sctl") return h(K3sCtlPage, { microservices: data.microservices, onRaw, apiBaseUrl: cfg.apiBaseUrl, onNavigate }); if (activeModule === "apps" && activeTab === "code-queue") return h(CodeQueuePage, { microservices: data.microservices, onRaw, apiBaseUrl: cfg.apiBaseUrl, initialTasksData: initialCodeQueueOverview }); if (activeModule === "apps" && activeTab === "mdtodo") return h(MdtodoPage, { microservices: data.microservices, onRaw, apiBaseUrl: cfg.apiBaseUrl }); + if (activeModule === "apps" && activeTab === "decision-center") return h(DecisionCenterPage, { microservices: data.microservices, onRaw, apiBaseUrl: cfg.apiBaseUrl }); if (activeModule === "apps" && activeTab === "project-manager") return h(ProjectManagerPage, { microservices: data.microservices, onRaw, apiBaseUrl: cfg.apiBaseUrl }); if (activeModule === "config" && activeTab === "topology") return h(TopologyPage, { data }); if (activeModule === "config" && activeTab === "auth") return h(AuthPage, { session }); diff --git a/src/components/frontend/src/decision-center.tsx b/src/components/frontend/src/decision-center.tsx new file mode 100644 index 00000000..53d141a2 --- /dev/null +++ b/src/components/frontend/src/decision-center.tsx @@ -0,0 +1,258 @@ +import React from "react"; +import { fmtClock, fmtDate } from "./time"; +import { LoadingTitle } from "./loading-indicator"; +import { MarkdownBody } from "./markdown"; +import { errorMessage, requestJson } from "./unidesk-error"; +import { UniDeskErrorBanner } from "./unidesk-error-banner"; + +type AnyRecord = Record<string, any>; + +const h = React.createElement; +const { useEffect } = React; +const useState: any = React.useState; + +const recordTypes = ["all", "meeting", "decision", "goal", "blocker", "debt", "experiment"]; +const recordLevels = ["all", "G0", "G1", "G2", "G3", "P0", "P1", "P2", "P3", "none"]; +const recordStatuses = ["all", "active", "blocked", "parked", "done"]; + +function StatusBadge({ status, children }: AnyRecord) { + const normalized = String(status || "unknown").toLowerCase(); + return h("span", { className: `status-badge ${normalized}` }, children || status || "unknown"); +} + +function MetricCard({ label, value, hint, tone }: AnyRecord) { + return h("article", { className: `metric-card ${tone || ""}` }, + h("div", { className: "metric-label" }, label), + h("div", { className: "metric-value" }, value), + h("div", { className: "metric-hint" }, hint), + ); +} + +function Panel({ title, eyebrow, actions, children, className, loading }: AnyRecord) { + return h("section", { className: `panel ${className || ""}` }, + h("div", { className: "panel-head" }, + h("div", null, + eyebrow ? h("p", { className: "panel-eyebrow" }, eyebrow) : null, + h(LoadingTitle, { title, loading }), + ), + actions ? h("div", { className: "panel-actions" }, actions) : null, + ), + h("div", { className: "panel-body" }, children), + ); +} + +function RawButton({ title, data, onOpen, testId }: AnyRecord) { + return h("button", { + type: "button", + className: "ghost-btn", + "data-testid": testId, + onClick: () => onOpen(title, data), + }, "查看原始JSON"); +} + +function EmptyState({ title, text }: AnyRecord) { + return h("div", { className: "empty-state" }, h("strong", null, title), h("span", null, text)); +} + +function microserviceRuntime(service: any): AnyRecord { + return service?.runtime && typeof service.runtime === "object" && !Array.isArray(service.runtime) ? service.runtime : {}; +} + +function microserviceBackend(service: any): AnyRecord { + return service?.backend && typeof service.backend === "object" && !Array.isArray(service.backend) ? service.backend : {}; +} + +function microserviceRepository(service: any): AnyRecord { + return service?.repository && typeof service.repository === "object" && !Array.isArray(service.repository) ? service.repository : {}; +} + +function decisionApi(apiBaseUrl: string, path: string): string { + return `${apiBaseUrl}/microservices/decision-center/proxy${path}`; +} + +function levelTone(level: string): string { + if (level === "G0" || level === "G1") return "online"; + if (level === "P0" || level === "P1") return "failed"; + if (level === "none") return "unknown"; + return "warn"; +} + +function statusTone(status: string): string { + if (status === "done") return "online"; + if (status === "blocked") return "failed"; + if (status === "parked") return "warn"; + return "unknown"; +} + +function fmtRecordTime(value: any): string { + return fmtDate(value) || "--"; +} + +function shortText(value: any, max = 220): string { + const text = String(value || "").replace(/\s+/gu, " ").trim(); + return text.length > max ? `${text.slice(0, max - 1)}...` : text; +} + +function RecordCard({ record, onRaw, compact }: AnyRecord) { + const tags = Array.isArray(record.tags) ? record.tags : []; + const evidence = Array.isArray(record.evidenceLinks) ? record.evidenceLinks : []; + return h("article", { className: `decision-record-card ${compact ? "compact" : ""}`, "data-testid": `decision-record-${String(record.id || "").replace(/[^A-Za-z0-9_-]+/g, "-")}` }, + h("div", { className: "decision-record-head" }, + h("div", null, + h("div", { className: "decision-record-meta" }, + h("span", null, record.type || "--"), + h(StatusBadge, { status: levelTone(record.level) }, record.level || "none"), + h(StatusBadge, { status: statusTone(record.status) }, record.status || "--"), + ), + h("strong", null, record.title || "--"), + ), + h(RawButton, { title: `Decision ${record.id}`, data: record, onOpen: onRaw }), + ), + compact + ? h("p", { className: "decision-summary" }, shortText(record.summary || record.body)) + : h(MarkdownBody, { markdown: record.body || record.summary || "", className: "decision-markdown" }), + h("div", { className: "decision-record-foot" }, + record.linkedGoalId ? h("code", null, `goal:${record.linkedGoalId}`) : null, + record.taskId ? h("code", null, `task:${record.taskId}`) : null, + record.commitId ? h("code", null, record.commitId.slice(0, 12)) : null, + h("span", null, fmtRecordTime(record.updatedAt)), + ), + tags.length > 0 ? h("div", { className: "decision-tags" }, tags.slice(0, 8).map((tag: string) => h("span", { key: tag }, tag))) : null, + evidence.length > 0 ? h("div", { className: "decision-evidence" }, evidence.slice(0, 4).map((link: string) => h("a", { key: link, href: link, target: "_blank", rel: "noreferrer" }, shortText(link, 58)))) : null, + ); +} + +function RecordTable({ records, onRaw }: AnyRecord) { + if (!records.length) return h(EmptyState, { title: "暂无记录", text: "通过 CLI 上传会议记录或决议后会显示在这里。" }); + return h("div", { className: "table-wrap" }, + h("table", { className: "decision-table", "data-testid": "decision-center-record-table" }, + h("thead", null, h("tr", null, + h("th", null, "等级"), + h("th", null, "状态"), + h("th", null, "类型"), + h("th", null, "标题"), + h("th", null, "摘要"), + h("th", null, "证据"), + h("th", null, "更新"), + h("th", null, "操作"), + )), + h("tbody", null, records.map((record: any) => h("tr", { key: record.id }, + h("td", null, h(StatusBadge, { status: levelTone(record.level) }, record.level || "none")), + h("td", null, h(StatusBadge, { status: statusTone(record.status) }, record.status || "--")), + h("td", null, record.type || "--"), + h("td", null, h("strong", null, record.title || "--"), record.linkedGoalId ? h("code", null, record.linkedGoalId) : null), + h("td", null, shortText(record.summary || record.body, 180)), + h("td", null, Array.isArray(record.evidenceLinks) ? record.evidenceLinks.length : 0), + h("td", null, fmtRecordTime(record.updatedAt)), + h("td", null, h(RawButton, { title: `Decision ${record.id}`, data: record, onOpen: onRaw })), + ))), + ), + ); +} + +function selectOptions(values: string[]): any[] { + return values.map((value) => h("option", { key: value, value }, value)); +} + +function filteredQuery(filters: AnyRecord): string { + const params = new URLSearchParams(); + if (filters.type !== "all") params.set("type", filters.type); + if (filters.status !== "all") params.set("status", filters.status); + if (filters.level !== "all") params.set("level", filters.level); + if (filters.linkedGoalId.trim()) params.set("linkedGoalId", filters.linkedGoalId.trim()); + params.set("limit", "240"); + return params.toString(); +} + +export function DecisionCenterPage({ microservices, onRaw, apiBaseUrl = "/api" }: AnyRecord) { + const service = microservices.find((item: any) => item.id === "decision-center") || null; + const [state, setState] = useState({ loading: false, error: "", health: null, records: [], refreshedAt: null }); + const [filters, setFilters] = useState({ type: "all", status: "all", level: "all", linkedGoalId: "" }); + + async function load(): Promise<void> { + if (!service) return; + setState((prev: any) => ({ ...prev, loading: true, error: "" })); + try { + const query = filteredQuery(filters); + const [health, records] = await Promise.all([ + requestJson(`${apiBaseUrl}/microservices/decision-center/health`), + requestJson(decisionApi(apiBaseUrl, `/api/records?${query}`)), + ]); + setState({ loading: false, error: "", health, records: Array.isArray(records.records) ? records.records : [], refreshedAt: new Date() }); + } catch (err) { + setState((prev: any) => ({ ...prev, loading: false, error: errorMessage(err, "Decision Center 加载失败") })); + } + } + + useEffect(() => { + load(); + }, [service?.id, service?.runtime?.providerStatus]); + + useEffect(() => { + const timer = setTimeout(() => void load(), 120); + return () => clearTimeout(timer); + }, [filters.type, filters.status, filters.level, filters.linkedGoalId]); + + if (!service) return h(EmptyState, { title: "Decision Center 未登记", text: "请在 config.json 的 microservices 中登记用户服务 id=decision-center" }); + + const runtime = microserviceRuntime(service); + const repository = microserviceRepository(service); + const backend = microserviceBackend(service); + const records = Array.isArray(state.records) ? state.records : []; + const goals = records.filter((record: any) => record.type === "goal" && ["G0", "G1"].includes(record.level) && record.status !== "done").slice(0, 8); + const blockers = records.filter((record: any) => record.type === "blocker" && ["P0", "P1"].includes(record.level) && record.status !== "done").slice(0, 8); + const parked = records.filter((record: any) => record.status === "parked").slice(0, 8); + const recentMeetings = records.filter((record: any) => record.type === "meeting" || record.type === "decision").slice(0, 12); + + return h("div", { className: "decision-center-page", "data-testid": "decision-center-page" }, + h(Panel, { title: "Decision Center", eyebrow: "Authority Records", loading: state.loading, actions: h("div", { className: "inline-actions" }, + h("button", { type: "button", className: "ghost-btn", onClick: () => void load(), disabled: state.loading }, state.loading ? "刷新中" : "刷新"), + h(RawButton, { title: "Decision Center Health", data: state.health, onOpen: onRaw, testId: "raw-decision-center-health" }), + ) }, + h("div", { className: "decision-hero" }, + h("div", { className: "metric-grid" }, + h(MetricCard, { label: "记录数", value: records.length, hint: `PostgreSQL / ${state.health?.storage || "postgres"}`, tone: "ok" }), + h(MetricCard, { label: "G0/G1 目标", value: goals.length, hint: "active authority goals", tone: "ok" }), + h(MetricCard, { label: "P0/P1 Blocker", value: blockers.length, hint: "requires decision", tone: blockers.length > 0 ? "warn" : "ok" }), + h(MetricCard, { label: "Parked", value: parked.length, hint: "停放事项", tone: parked.length > 0 ? "warn" : "ok" }), + ), + h("div", { className: "microservice-ref-card" }, + h("span", null, "Runtime"), + h("strong", null, runtime.orchestrator || service.deployment?.mode || "k3sctl"), + h("code", null, `${service.providerId} / ${backend.nodeBindHost || "--"}:${backend.nodePort || "--"}`), + h("code", null, repository.commitId || "--"), + ), + ), + h(UniDeskErrorBanner, { error: state.error, title: "Decision Center 请求失败" }), + ), + h(Panel, { title: "筛选", eyebrow: "Type / Status / Level" }, + h("div", { className: "decision-filter-bar", "data-testid": "decision-center-filters" }, + h("label", null, "类型", h("select", { value: filters.type, onChange: (event: any) => setFilters((prev: any) => ({ ...prev, type: event.target.value })) }, selectOptions(recordTypes))), + h("label", null, "状态", h("select", { value: filters.status, onChange: (event: any) => setFilters((prev: any) => ({ ...prev, status: event.target.value })) }, selectOptions(recordStatuses))), + h("label", null, "等级", h("select", { value: filters.level, onChange: (event: any) => setFilters((prev: any) => ({ ...prev, level: event.target.value })) }, selectOptions(recordLevels))), + h("label", null, "Linked Goal", h("input", { value: filters.linkedGoalId, onChange: (event: any) => setFilters((prev: any) => ({ ...prev, linkedGoalId: event.target.value })), placeholder: "goal id" })), + ), + ), + h("div", { className: "decision-default-grid" }, + h(Panel, { title: "G0/G1 目标", eyebrow: `${goals.length} Goals` }, + goals.length === 0 ? h(EmptyState, { title: "暂无当前目标", text: "目标记录使用 type=goal 且 level=G0/G1。" }) : + h("div", { className: "decision-card-list" }, goals.map((record: any) => h(RecordCard, { key: record.id, record, onRaw, compact: true }))), + ), + h(Panel, { title: "P0/P1 Blocker", eyebrow: `${blockers.length} Blockers` }, + blockers.length === 0 ? h(EmptyState, { title: "暂无高优先级阻塞", text: "阻塞记录使用 type=blocker 且 level=P0/P1。" }) : + h("div", { className: "decision-card-list" }, blockers.map((record: any) => h(RecordCard, { key: record.id, record, onRaw, compact: true }))), + ), + h(Panel, { title: "停放事项", eyebrow: `${parked.length} Parked` }, + parked.length === 0 ? h(EmptyState, { title: "暂无停放事项", text: "status=parked 的记录会集中展示。" }) : + h("div", { className: "decision-card-list" }, parked.map((record: any) => h(RecordCard, { key: record.id, record, onRaw, compact: true }))), + ), + h(Panel, { title: "最近会议/决议", eyebrow: `${recentMeetings.length} Recent` }, + recentMeetings.length === 0 ? h(EmptyState, { title: "暂无会议或决议", text: "使用 CLI 上传 Markdown 会议记录后会显示。" }) : + h("div", { className: "decision-card-list" }, recentMeetings.map((record: any) => h(RecordCard, { key: record.id, record, onRaw, compact: true }))), + ), + ), + h(Panel, { title: "全部记录", eyebrow: `${records.length} Records`, actions: state.refreshedAt ? h("span", { className: "muted" }, `刷新 ${fmtClock(state.refreshedAt)}`) : null }, + h(RecordTable, { records, onRaw }), + ), + ); +} diff --git a/src/components/frontend/src/navigation.ts b/src/components/frontend/src/navigation.ts index bbc4bb5d..28110df6 100644 --- a/src/components/frontend/src/navigation.ts +++ b/src/components/frontend/src/navigation.ts @@ -73,6 +73,7 @@ export const MODULES: UniDeskModuleDefinition[] = [ { id: "k3sctl", label: "k3s Control" }, { id: "code-queue", label: "Code Queue" }, { id: "mdtodo", label: "MDTODO" }, + { id: "decision-center", label: "Decision Center" }, { id: "project-manager", label: "Project Manager" }, ] }, { id: "config", label: "系统配置", code: "CFG", tabs: [ diff --git a/src/components/microservices/decision-center/Dockerfile b/src/components/microservices/decision-center/Dockerfile new file mode 100644 index 00000000..c993727c --- /dev/null +++ b/src/components/microservices/decision-center/Dockerfile @@ -0,0 +1,11 @@ +FROM oven/bun:1-alpine + +WORKDIR /app/src/components/microservices/decision-center +COPY src/components/microservices/decision-center/package.json ./package.json +RUN bun install --production +COPY src/components/microservices/decision-center/tsconfig.json ./tsconfig.json +COPY src/components/shared /app/src/components/shared +COPY src/components/microservices/decision-center/src ./src + +EXPOSE 4277 +CMD ["bun", "run", "src/index.ts"] diff --git a/src/components/microservices/decision-center/package.json b/src/components/microservices/decision-center/package.json new file mode 100644 index 00000000..a23cb868 --- /dev/null +++ b/src/components/microservices/decision-center/package.json @@ -0,0 +1,12 @@ +{ + "name": "@unidesk/decision-center", + "private": true, + "type": "module", + "scripts": { + "start": "bun run src/index.ts", + "check": "tsc -p tsconfig.json --noEmit" + }, + "dependencies": { + "postgres": "latest" + } +} diff --git a/src/components/microservices/decision-center/src/index.ts b/src/components/microservices/decision-center/src/index.ts new file mode 100644 index 00000000..b222d5da --- /dev/null +++ b/src/components/microservices/decision-center/src/index.ts @@ -0,0 +1,510 @@ +import { randomUUID } from "node:crypto"; +import postgres from "postgres"; +import { createHourlyJsonlWriter, logRetentionBytesForService } from "../../../shared/src/rotating-jsonl"; + +type JsonValue = string | number | boolean | null | JsonValue[] | { [key: string]: JsonValue }; +type JsonRecord = Record<string, JsonValue>; + +type DecisionRecordType = "meeting" | "decision" | "goal" | "blocker" | "debt" | "experiment"; +type DecisionRecordLevel = "G0" | "G1" | "G2" | "G3" | "P0" | "P1" | "P2" | "P3" | "none"; +type DecisionRecordStatus = "active" | "blocked" | "parked" | "done"; + +interface RuntimeConfig { + host: string; + port: number; + databaseUrl: string; + logFile: string; + databasePoolMax: number; +} + +interface DecisionRecordRow { + id: string; + type: DecisionRecordType; + level: DecisionRecordLevel; + status: DecisionRecordStatus; + title: string; + body: string; + linked_goal_id: string | null; + tags: JsonValue; + evidence_links: JsonValue; + source_session: string; + task_id: string; + commit_id: string; + created_at: Date | string; + updated_at: Date | string; +} + +interface DecisionRecord extends JsonRecord { + id: string; + type: DecisionRecordType; + level: DecisionRecordLevel; + status: DecisionRecordStatus; + title: string; + summary: string; + body: string; + linkedGoalId: string | null; + tags: string[]; + evidenceLinks: string[]; + sourceSession: string; + taskId: string; + commitId: string; + createdAt: string; + updatedAt: string; +} + +class HttpError extends Error { + readonly status: number; + readonly detail: JsonRecord; + + constructor(status: number, message: string, detail: JsonRecord = {}) { + super(message); + this.name = "HttpError"; + this.status = status; + this.detail = detail; + } +} + +const recordTypes = new Set<DecisionRecordType>(["meeting", "decision", "goal", "blocker", "debt", "experiment"]); +const recordLevels = new Set<DecisionRecordLevel>(["G0", "G1", "G2", "G3", "P0", "P1", "P2", "P3", "none"]); +const recordStatuses = new Set<DecisionRecordStatus>(["active", "blocked", "parked", "done"]); +const serviceStartedAt = new Date().toISOString(); +const recentLogs: JsonRecord[] = []; +let schemaReady = false; +let schemaLastError: JsonRecord | null = null; + +function envString(name: string, fallback: string): string { + const value = process.env[name]; + return value === undefined || value.length === 0 ? fallback : value; +} + +function envNumber(name: string, fallback: number): number { + const raw = process.env[name]; + if (raw === undefined || raw.length === 0) return fallback; + const value = Number(raw); + return Number.isFinite(value) && value > 0 ? Math.floor(value) : fallback; +} + +function configFromEnv(): RuntimeConfig { + const databaseUrl = process.env.DATABASE_URL || ""; + if (!databaseUrl) throw new Error("DATABASE_URL is required"); + return { + host: envString("HOST", "0.0.0.0"), + port: envNumber("PORT", 4277), + databaseUrl, + logFile: envString("LOG_FILE", "/var/log/unidesk/decision-center.jsonl"), + databasePoolMax: Math.max(1, Math.min(8, envNumber("DATABASE_POOL_MAX", 2))), + }; +} + +const config = configFromEnv(); +const sql = postgres(config.databaseUrl, { + max: config.databasePoolMax, + idle_timeout: 20, + connect_timeout: 10, + connection: { application_name: "unidesk-decision-center" }, +}); +const logWriter = config.logFile + ? createHourlyJsonlWriter({ + baseLogFile: config.logFile, + service: "decision-center", + maxBytes: logRetentionBytesForService("decision-center"), + }) + : null; +logWriter?.prune(); + +function log(level: "info" | "warn" | "error", event: string, detail: JsonRecord = {}): void { + const record: JsonRecord = { at: new Date().toISOString(), service: "decision-center", level, event, ...detail }; + recentLogs.push(record); + while (recentLogs.length > 300) recentLogs.shift(); + try { + logWriter?.appendJson(record, new Date(String(record.at))); + } catch { + // Logging must not break decision writes. + } + const line = JSON.stringify(record); + const writer = level === "error" ? console.error : level === "warn" ? console.warn : console.log; + writer(line); +} + +function jsonResponse(body: JsonValue, status = 200): Response { + return new Response(JSON.stringify(body), { + status, + headers: { "content-type": "application/json; charset=utf-8" }, + }); +} + +function errorToJson(error: unknown): JsonRecord { + if (error instanceof HttpError) return { name: error.name, message: error.message, status: error.status, detail: error.detail }; + if (error instanceof Error) return { name: error.name, message: error.message, stack: error.stack || "" }; + return { message: String(error) }; +} + +function errorResponse(error: unknown): Response { + const status = error instanceof HttpError ? error.status : 500; + const body = error instanceof HttpError + ? { ok: false, error: error.message, ...error.detail } + : { ok: false, error: error instanceof Error ? error.message : String(error) }; + log(status >= 500 ? "error" : "warn", "request_failed", { status, error: errorToJson(error) }); + return jsonResponse(body, status); +} + +function iso(value: Date | string | null | undefined): string { + if (value === null || value === undefined) return ""; + const date = value instanceof Date ? value : new Date(value); + return Number.isNaN(date.getTime()) ? String(value) : date.toISOString(); +} + +function asRecord(value: unknown): Record<string, unknown> { + return typeof value === "object" && value !== null && !Array.isArray(value) ? value as Record<string, unknown> : {}; +} + +function asString(value: unknown): string { + if (value === null || value === undefined) return ""; + if (typeof value === "string") return value.trim(); + if (typeof value === "number" || typeof value === "boolean" || typeof value === "bigint") return String(value).trim(); + return ""; +} + +function asStringArray(value: unknown, field: string): string[] { + if (value === null || value === undefined || value === "") return []; + if (typeof value === "string") { + return value.split(",").map((item) => item.trim()).filter(Boolean).slice(0, 50); + } + if (!Array.isArray(value)) throw new HttpError(400, `${field} must be an array of strings`); + const items = value.map((item) => asString(item)).filter(Boolean); + if (items.length > 50) throw new HttpError(400, `${field} must contain at most 50 items`); + return [...new Set(items)]; +} + +function summaryFromBody(body: string): string { + return body + .split("\n") + .map((line) => line.replace(/^#{1,6}\s+/u, "").trim()) + .filter(Boolean) + .join(" ") + .replace(/\s+/gu, " ") + .slice(0, 280); +} + +function recordFromRow(row: DecisionRecordRow): DecisionRecord { + const body = row.body || ""; + return { + id: row.id, + type: row.type, + level: row.level, + status: row.status, + title: row.title, + summary: summaryFromBody(body), + body, + linkedGoalId: row.linked_goal_id, + tags: Array.isArray(row.tags) ? row.tags.map(String) : [], + evidenceLinks: Array.isArray(row.evidence_links) ? row.evidence_links.map(String) : [], + sourceSession: row.source_session, + taskId: row.task_id, + commitId: row.commit_id, + createdAt: iso(row.created_at), + updatedAt: iso(row.updated_at), + }; +} + +function parseRecordType(value: unknown, fallback: DecisionRecordType): DecisionRecordType { + const raw = asString(value) || fallback; + if (!recordTypes.has(raw as DecisionRecordType)) throw new HttpError(400, "unsupported record type", { value: raw, allowed: [...recordTypes] }); + return raw as DecisionRecordType; +} + +function parseLevel(value: unknown, fallback: DecisionRecordLevel): DecisionRecordLevel { + const raw = asString(value) || fallback; + if (!recordLevels.has(raw as DecisionRecordLevel)) throw new HttpError(400, "unsupported record level", { value: raw, allowed: [...recordLevels] }); + return raw as DecisionRecordLevel; +} + +function parseStatus(value: unknown, fallback: DecisionRecordStatus): DecisionRecordStatus { + const raw = asString(value) || fallback; + if (!recordStatuses.has(raw as DecisionRecordStatus)) throw new HttpError(400, "unsupported record status", { value: raw, allowed: [...recordStatuses] }); + return raw as DecisionRecordStatus; +} + +function titleFromMarkdown(markdown: string, fallback: string): string { + const heading = markdown.split("\n").map((line) => line.trim()).find((line) => /^#{1,3}\s+\S/u.test(line)); + if (heading !== undefined) return heading.replace(/^#{1,3}\s+/u, "").trim().slice(0, 220); + const firstLine = markdown.split("\n").map((line) => line.trim()).find(Boolean); + return (firstLine || fallback).slice(0, 220); +} + +async function readJsonBody(req: Request): Promise<Record<string, unknown>> { + const text = await req.text(); + if (text.length > 1_000_000) throw new HttpError(413, "request body is too large", { maxBytes: 1_000_000 }); + if (!text.trim()) return {}; + try { + const parsed = JSON.parse(text) as unknown; + if (typeof parsed !== "object" || parsed === null || Array.isArray(parsed)) { + throw new Error("JSON body must be an object"); + } + return parsed as Record<string, unknown>; + } catch (error) { + throw new HttpError(400, "invalid JSON body", { detail: error instanceof Error ? error.message : String(error) }); + } +} + +async function ensureSchema(): Promise<void> { + await sql` + CREATE TABLE IF NOT EXISTS decision_center_records ( + id TEXT PRIMARY KEY, + type TEXT NOT NULL, + level TEXT NOT NULL DEFAULT 'none', + status TEXT NOT NULL DEFAULT 'active', + title TEXT NOT NULL, + body TEXT NOT NULL DEFAULT '', + linked_goal_id TEXT, + tags JSONB NOT NULL DEFAULT '[]'::jsonb, + evidence_links JSONB NOT NULL DEFAULT '[]'::jsonb, + source_session TEXT NOT NULL DEFAULT '', + task_id TEXT NOT NULL DEFAULT '', + commit_id TEXT NOT NULL DEFAULT '', + created_at TIMESTAMPTZ NOT NULL DEFAULT now(), + updated_at TIMESTAMPTZ NOT NULL DEFAULT now(), + CONSTRAINT decision_center_records_type_check CHECK (type IN ('meeting', 'decision', 'goal', 'blocker', 'debt', 'experiment')), + CONSTRAINT decision_center_records_level_check CHECK (level IN ('G0', 'G1', 'G2', 'G3', 'P0', 'P1', 'P2', 'P3', 'none')), + CONSTRAINT decision_center_records_status_check CHECK (status IN ('active', 'blocked', 'parked', 'done')) + ) + `; + await sql`CREATE INDEX IF NOT EXISTS idx_decision_center_records_type_status_level ON decision_center_records(type, status, level)`; + await sql`CREATE INDEX IF NOT EXISTS idx_decision_center_records_linked_goal ON decision_center_records(linked_goal_id)`; + await sql`CREATE INDEX IF NOT EXISTS idx_decision_center_records_updated ON decision_center_records(updated_at DESC)`; +} + +async function waitForSchema(): Promise<void> { + for (let attempt = 1; attempt <= 30; attempt += 1) { + try { + await ensureSchema(); + schemaReady = true; + schemaLastError = null; + log("info", "schema_ready", { attempt }); + return; + } catch (error) { + schemaReady = false; + schemaLastError = errorToJson(error); + log("warn", "schema_wait", { attempt, error: schemaLastError }); + await Bun.sleep(Math.min(1000 + attempt * 250, 5000)); + } + } + throw new Error(`Decision Center schema initialization failed: ${JSON.stringify(schemaLastError)}`); +} + +function deployInfo(): JsonRecord { + return { + serviceId: envString("UNIDESK_DEPLOY_SERVICE_ID", "decision-center"), + repo: envString("UNIDESK_DEPLOY_REPO", ""), + commit: envString("UNIDESK_DEPLOY_COMMIT", ""), + requestedCommit: envString("UNIDESK_DEPLOY_REQUESTED_COMMIT", ""), + }; +} + +async function health(): Promise<JsonRecord> { + let dbOk = false; + let recordCount = 0; + let dbError: JsonValue = null; + try { + const rows = await sql<{ count: string | number }[]>`SELECT count(*) AS count FROM decision_center_records`; + dbOk = true; + recordCount = Number(rows[0]?.count ?? 0); + } catch (error) { + dbError = errorToJson(error); + } + return { + ok: schemaReady && dbOk, + service: "decision-center", + status: schemaReady && dbOk ? "ready" : "not-ready", + startedAt: serviceStartedAt, + storage: "postgres", + schemaReady, + recordCount, + database: { ok: dbOk, error: dbError }, + deploy: deployInfo(), + }; +} + +async function createRecord(input: Record<string, unknown>): Promise<DecisionRecord> { + const body = asString(input.body ?? input.summary ?? input.markdown); + const title = asString(input.title) || titleFromMarkdown(body, "Untitled decision record"); + if (!title) throw new HttpError(400, "title is required"); + if (title.length > 240) throw new HttpError(400, "title must be at most 240 characters"); + if (body.length > 300_000) throw new HttpError(400, "body must be at most 300000 characters"); + const id = asString(input.id) || `dc_${randomUUID()}`; + const rows = await sql<DecisionRecordRow[]>` + INSERT INTO decision_center_records ( + id, type, level, status, title, body, linked_goal_id, tags, evidence_links, source_session, task_id, commit_id + ) VALUES ( + ${id}, + ${parseRecordType(input.type, "meeting")}, + ${parseLevel(input.level, "none")}, + ${parseStatus(input.status, "active")}, + ${title}, + ${body}, + ${asString(input.linkedGoalId) || null}, + ${sql.json(asStringArray(input.tags, "tags"))}, + ${sql.json(asStringArray(input.evidenceLinks ?? input.evidence, "evidenceLinks"))}, + ${asString(input.sourceSession)}, + ${asString(input.taskId)}, + ${asString(input.commitId)} + ) + RETURNING * + `; + log("info", "record_created", { id: rows[0]?.id ?? id, type: rows[0]?.type ?? "", level: rows[0]?.level ?? "" }); + return recordFromRow(rows[0]!); +} + +async function updateRecord(id: string, input: Record<string, unknown>): Promise<DecisionRecord> { + const existing = await getRecord(id); + const rows = await sql<DecisionRecordRow[]>` + UPDATE decision_center_records + SET + type = ${"type" in input ? parseRecordType(input.type, existing.type) : existing.type}, + level = ${"level" in input ? parseLevel(input.level, existing.level) : existing.level}, + status = ${"status" in input ? parseStatus(input.status, existing.status) : existing.status}, + title = ${"title" in input ? asString(input.title) : existing.title}, + body = ${"body" in input || "summary" in input || "markdown" in input ? asString(input.body ?? input.summary ?? input.markdown) : existing.body}, + linked_goal_id = ${"linkedGoalId" in input ? asString(input.linkedGoalId) || null : existing.linkedGoalId}, + tags = ${"tags" in input ? sql.json(asStringArray(input.tags, "tags")) : sql.json(existing.tags)}, + evidence_links = ${"evidenceLinks" in input || "evidence" in input ? sql.json(asStringArray(input.evidenceLinks ?? input.evidence, "evidenceLinks")) : sql.json(existing.evidenceLinks)}, + source_session = ${"sourceSession" in input ? asString(input.sourceSession) : existing.sourceSession}, + task_id = ${"taskId" in input ? asString(input.taskId) : existing.taskId}, + commit_id = ${"commitId" in input ? asString(input.commitId) : existing.commitId}, + updated_at = now() + WHERE id = ${id} + RETURNING * + `; + return recordFromRow(rows[0]!); +} + +async function getRecord(id: string): Promise<DecisionRecord> { + const rows = await sql<DecisionRecordRow[]>`SELECT * FROM decision_center_records WHERE id = ${id}`; + if (rows.length === 0) throw new HttpError(404, "decision record not found", { id }); + return recordFromRow(rows[0]!); +} + +async function listRecords(url: URL): Promise<DecisionRecord[]> { + const type = asString(url.searchParams.get("type")); + const status = asString(url.searchParams.get("status")); + const level = asString(url.searchParams.get("level")); + const linkedGoalId = asString(url.searchParams.get("linkedGoalId")); + const limit = Math.max(1, Math.min(500, Number(url.searchParams.get("limit") || 200) || 200)); + if (type && !recordTypes.has(type as DecisionRecordType)) throw new HttpError(400, "unsupported type filter", { type }); + if (status && !recordStatuses.has(status as DecisionRecordStatus)) throw new HttpError(400, "unsupported status filter", { status }); + if (level && !recordLevels.has(level as DecisionRecordLevel)) throw new HttpError(400, "unsupported level filter", { level }); + const rows = await sql<DecisionRecordRow[]>` + SELECT * + FROM decision_center_records + WHERE (${type || null}::text IS NULL OR type = ${type || null}) + AND (${status || null}::text IS NULL OR status = ${status || null}) + AND (${level || null}::text IS NULL OR level = ${level || null}) + AND (${linkedGoalId || null}::text IS NULL OR linked_goal_id = ${linkedGoalId || null}) + ORDER BY + CASE level + WHEN 'G0' THEN 0 + WHEN 'P0' THEN 1 + WHEN 'G1' THEN 2 + WHEN 'P1' THEN 3 + WHEN 'G2' THEN 4 + WHEN 'P2' THEN 5 + WHEN 'G3' THEN 6 + WHEN 'P3' THEN 7 + ELSE 8 + END ASC, + updated_at DESC + LIMIT ${limit} + `; + return rows.map(recordFromRow); +} + +function normalizeDecisionDrafts(value: unknown): Array<Record<string, unknown>> { + if (value === undefined || value === null) return []; + if (!Array.isArray(value)) throw new HttpError(400, "decisions must be an array"); + return value.map((item, index) => { + const record = asRecord(item); + if (Object.keys(record).length === 0) throw new HttpError(400, "decision item must be an object", { index }); + return record; + }); +} + +async function importMeeting(input: Record<string, unknown>): Promise<JsonRecord> { + const markdown = asString(input.markdown ?? input.body ?? input.summary); + if (!markdown) throw new HttpError(400, "markdown is required"); + const base: Record<string, unknown> = { + type: "meeting", + level: parseLevel(input.level, "none"), + status: parseStatus(input.status, "active"), + title: asString(input.title) || titleFromMarkdown(markdown, "Imported meeting"), + body: markdown, + linkedGoalId: asString(input.linkedGoalId) || null, + tags: asStringArray(input.tags, "tags"), + evidenceLinks: asStringArray(input.evidenceLinks ?? input.evidence, "evidenceLinks"), + sourceSession: asString(input.sourceSession), + taskId: asString(input.taskId), + commitId: asString(input.commitId), + }; + const meeting = await createRecord(base); + const decisionInputs = normalizeDecisionDrafts(input.decisions); + const decisions: DecisionRecord[] = []; + for (const decision of decisionInputs) { + decisions.push(await createRecord({ + ...base, + ...decision, + type: "decision", + linkedGoalId: asString(decision.linkedGoalId) || meeting.linkedGoalId, + body: asString(decision.body ?? decision.summary ?? decision.markdown) || asString(decision.title), + })); + } + return { ok: true, meeting, decisions, createdCount: 1 + decisions.length }; +} + +async function deleteRecord(id: string): Promise<JsonRecord> { + const rows = await sql<DecisionRecordRow[]>`DELETE FROM decision_center_records WHERE id = ${id} RETURNING *`; + if (rows.length === 0) throw new HttpError(404, "decision record not found", { id }); + log("info", "record_deleted", { id }); + return { ok: true, deleted: recordFromRow(rows[0]!) }; +} + +async function route(req: Request): Promise<Response> { + const url = new URL(req.url); + const method = req.method.toUpperCase(); + if (url.pathname === "/live") { + return jsonResponse({ ok: true, service: "decision-center", status: "alive", startedAt: serviceStartedAt, deploy: deployInfo() }); + } + if (url.pathname === "/health") { + const body = await health(); + return jsonResponse(body, body.ok === true ? 200 : 503); + } + if (url.pathname === "/logs" && method === "GET") return jsonResponse({ ok: true, logs: recentLogs.slice(-200) }); + if (url.pathname === "/api/records" && method === "GET") return jsonResponse({ ok: true, records: await listRecords(url) }); + if (url.pathname === "/api/records" && method === "POST") return jsonResponse({ ok: true, record: await createRecord(await readJsonBody(req)) }, 201); + if (url.pathname === "/api/meetings/import" && method === "POST") return jsonResponse(await importMeeting(await readJsonBody(req)), 201); + const recordMatch = url.pathname.match(/^\/api\/records\/([^/]+)$/u); + if (recordMatch !== null) { + const id = decodeURIComponent(recordMatch[1] ?? ""); + if (!id) throw new HttpError(400, "record id is required"); + if (method === "GET") return jsonResponse({ ok: true, record: await getRecord(id) }); + if (method === "PUT") return jsonResponse({ ok: true, record: await updateRecord(id, await readJsonBody(req)) }); + if (method === "DELETE") return jsonResponse(await deleteRecord(id)); + throw new HttpError(405, "record route supports GET, PUT, DELETE", { method }); + } + throw new HttpError(404, "route not found", { path: url.pathname }); +} + +Bun.serve({ + hostname: config.host, + port: config.port, + async fetch(req) { + try { + return await route(req); + } catch (error) { + return errorResponse(error); + } + }, +}); + +void waitForSchema().catch((error) => { + log("error", "schema_init_failed", { error: errorToJson(error) }); +}); +log("info", "service_started", { host: config.host, port: config.port, storage: "postgres" }); diff --git a/src/components/microservices/decision-center/tsconfig.json b/src/components/microservices/decision-center/tsconfig.json new file mode 100644 index 00000000..5d5f23f2 --- /dev/null +++ b/src/components/microservices/decision-center/tsconfig.json @@ -0,0 +1,18 @@ +{ + "compilerOptions": { + "composite": true, + "target": "ES2022", + "module": "ESNext", + "moduleResolution": "Bundler", + "types": ["bun", "node"], + "strict": true, + "noImplicitReturns": true, + "noFallthroughCasesInSwitch": true, + "declaration": true, + "emitDeclarationOnly": true, + "outDir": "dist", + "skipLibCheck": true + }, + "include": ["src/**/*.ts"], + "references": [{ "path": "../../shared" }] +} diff --git a/src/components/microservices/k3sctl-adapter/docker-compose.d601.yml b/src/components/microservices/k3sctl-adapter/docker-compose.d601.yml index 55bfa260..dad1708e 100644 --- a/src/components/microservices/k3sctl-adapter/docker-compose.d601.yml +++ b/src/components/microservices/k3sctl-adapter/docker-compose.d601.yml @@ -39,7 +39,8 @@ services: K3SCTL_NATIVE_SERVICE_TUNNEL_CONNECT_TIMEOUT_MS: "${K3SCTL_NATIVE_SERVICE_TUNNEL_CONNECT_TIMEOUT_MS:-3000}" K3SCTL_NATIVE_SERVICE_URL_CODE_QUEUE: "${K3SCTL_NATIVE_SERVICE_URL_CODE_QUEUE:-}" K3SCTL_NATIVE_SERVICE_URL_MDTODO: "${K3SCTL_NATIVE_SERVICE_URL_MDTODO:-}" - K3SCTL_MANIFEST_PATHS: "${K3SCTL_MANIFEST_PATHS:-k3s/code-queue.k3s.json,k3s/mdtodo.k3s.json,k3s/claudeqq.k3s.json}" + K3SCTL_NATIVE_SERVICE_URL_DECISION_CENTER: "${K3SCTL_NATIVE_SERVICE_URL_DECISION_CENTER:-}" + K3SCTL_MANIFEST_PATHS: "${K3SCTL_MANIFEST_PATHS:-k3s/code-queue.k3s.json,k3s/mdtodo.k3s.json,k3s/claudeqq.k3s.json,k3s/decision-center.k3s.json}" K3SCTL_SERVICES_JSON: "${K3SCTL_SERVICES_JSON:-[]}" UNIDESK_LOG_RETENTION_BYTES: "${UNIDESK_LOG_RETENTION_BYTES:-512MiB}" volumes: diff --git a/src/components/microservices/k3sctl-adapter/k3s/decision-center.k3s.json b/src/components/microservices/k3sctl-adapter/k3s/decision-center.k3s.json new file mode 100644 index 00000000..81903e0b --- /dev/null +++ b/src/components/microservices/k3sctl-adapter/k3s/decision-center.k3s.json @@ -0,0 +1,37 @@ +{ + "apiVersion": "unidesk.ai/k3s/v1", + "kind": "ManagedKubernetesService", + "metadata": { + "name": "decision-center", + "namespace": "unidesk" + }, + "spec": { + "adapterServiceId": "k3sctl-adapter", + "controlPlane": { + "type": "kubernetes", + "cluster": "unidesk-k3s", + "context": "unidesk-k3s" + }, + "route": { + "kind": "kubernetes-service", + "serviceName": "decision-center", + "servicePort": 4277 + }, + "activeInstanceId": "D601", + "singleWriter": true, + "expectedNodeIds": [ + "D601" + ], + "instances": [ + { + "id": "D601", + "nodeId": "D601", + "role": "primary", + "baseUrl": "kubernetes://unidesk/services/decision-center:4277", + "healthPath": "/health", + "healthMode": "service-proxy" + } + ], + "requireAllInstancesHealthy": true + } +} diff --git a/src/components/microservices/k3sctl-adapter/k3s/decision-center.k8s.yaml b/src/components/microservices/k3sctl-adapter/k3s/decision-center.k8s.yaml new file mode 100644 index 00000000..5f48b35c --- /dev/null +++ b/src/components/microservices/k3sctl-adapter/k3s/decision-center.k8s.yaml @@ -0,0 +1,102 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: decision-center + namespace: unidesk + labels: + app.kubernetes.io/name: decision-center + app.kubernetes.io/part-of: unidesk + unidesk.ai/deployment-mode: k3sctl-managed + unidesk.ai/instance-id: D601 +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: decision-center + unidesk.ai/instance-id: D601 + template: + metadata: + labels: + app.kubernetes.io/name: decision-center + app.kubernetes.io/part-of: unidesk + unidesk.ai/deployment-mode: k3sctl-managed + unidesk.ai/instance-id: D601 + unidesk.ai/node-id: D601 + spec: + nodeSelector: + unidesk.ai/node-id: D601 + terminationGracePeriodSeconds: 15 + containers: + - name: decision-center + image: unidesk-decision-center:d601 + imagePullPolicy: IfNotPresent + ports: + - name: http + containerPort: 4277 + env: + - name: HOST + value: "0.0.0.0" + - name: PORT + value: "4277" + - name: DATABASE_URL + value: "postgres://unidesk:unidesk_dev_password@d601-tcp-egress-gateway.unidesk.svc.cluster.local:15432/unidesk" + - name: DATABASE_POOL_MAX + value: "2" + - name: LOG_FILE + value: "/var/log/unidesk/decision-center.jsonl" + - name: UNIDESK_LOG_RETENTION_BYTES + value: "512MiB" + volumeMounts: + - name: logs + mountPath: /var/log/unidesk + readinessProbe: + httpGet: + path: /health + port: http + periodSeconds: 5 + timeoutSeconds: 3 + failureThreshold: 18 + livenessProbe: + httpGet: + path: /live + port: http + periodSeconds: 10 + timeoutSeconds: 3 + failureThreshold: 6 + startupProbe: + httpGet: + path: /live + port: http + periodSeconds: 5 + timeoutSeconds: 3 + failureThreshold: 30 + resources: + requests: + cpu: 50m + memory: 96Mi + limits: + memory: 512Mi + volumes: + - name: logs + hostPath: + path: /home/ubuntu/cq-deploy/.state/decision-center/logs + type: DirectoryOrCreate +--- +apiVersion: v1 +kind: Service +metadata: + name: decision-center + namespace: unidesk + labels: + app.kubernetes.io/name: decision-center + app.kubernetes.io/part-of: unidesk + unidesk.ai/deployment-mode: k3sctl-managed +spec: + type: ClusterIP + selector: + app.kubernetes.io/name: decision-center + unidesk.ai/instance-id: D601 + ports: + - name: http + port: 4277 + targetPort: http diff --git a/src/components/microservices/k3sctl-adapter/src/index.ts b/src/components/microservices/k3sctl-adapter/src/index.ts index 18ae1c2b..22e50554 100644 --- a/src/components/microservices/k3sctl-adapter/src/index.ts +++ b/src/components/microservices/k3sctl-adapter/src/index.ts @@ -274,7 +274,7 @@ function mergeServices(services: ManagedService[]): ManagedService[] { } function readConfig(): RuntimeConfig { - const paths = manifestPaths(envString("K3SCTL_MANIFEST_PATHS", "k3s/code-queue.k3s.json,k3s/mdtodo.k3s.json,k3s/claudeqq.k3s.json")); + const paths = manifestPaths(envString("K3SCTL_MANIFEST_PATHS", "k3s/code-queue.k3s.json,k3s/mdtodo.k3s.json,k3s/claudeqq.k3s.json,k3s/decision-center.k3s.json")); const inlineServices = parseServices(envString("K3SCTL_SERVICES_JSON", "[]")); const manifestServices = readManifestServices(paths); return { diff --git a/src/tsconfig.base.json b/src/tsconfig.base.json index f0b8b3e0..65125db2 100644 --- a/src/tsconfig.base.json +++ b/src/tsconfig.base.json @@ -9,6 +9,8 @@ { "path": "components/microservices/k3sctl-adapter" }, { "path": "components/microservices/mdtodo" }, { "path": "components/microservices/project-manager" }, - { "path": "components/microservices/baidu-netdisk" } + { "path": "components/microservices/baidu-netdisk" }, + { "path": "components/microservices/oa-event-flow" }, + { "path": "components/microservices/decision-center" } ] } From 60414b4233bc5a3b961b4b186af7428ec9f23395 Mon Sep 17 00:00:00 2001 From: Codex <codex@noreply.local> Date: Sun, 17 May 2026 06:20:15 +0000 Subject: [PATCH 06/14] chore: pin decision center deploy commit --- config.json | 4 ++-- deploy.json | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/config.json b/config.json index f3ed2b81..09b1c009 100644 --- a/config.json +++ b/config.json @@ -547,7 +547,7 @@ "description": "k3sctl-adapter 是 UniDesk 直管的 k3s 控制平面适配微服务,连接 D601 原生 k3s/k3sctl 控制平面,并通过 k3s 标准服务路由代理 D601 上的代管微服务。", "repository": { "url": "https://github.com/pikasTech/unidesk", - "commitId": "b1c4f7a96f4a7659dd29af3dcd53002e9f410644", + "commitId": "d74439ecba7849e1c7305d57a65007458d8b0671", "dockerfile": "src/components/microservices/k3sctl-adapter/Dockerfile", "composeFile": "src/components/microservices/k3sctl-adapter/docker-compose.d601.yml", "composeService": "k3sctl-adapter", @@ -707,7 +707,7 @@ "description": "Decision Center 是由 D601 k3s 控制面代管的决策权威记录服务,用于沉淀会议记录、决议、目标、问题分级、停放事项和证据;参谋对话仍使用 Codex 原生会话。", "repository": { "url": "https://github.com/pikasTech/unidesk", - "commitId": "eb2660d3b777e01291506c418352ab6cfa4eca35", + "commitId": "d74439ecba7849e1c7305d57a65007458d8b0671", "dockerfile": "src/components/microservices/decision-center/Dockerfile", "composeFile": "src/components/microservices/k3sctl-adapter/k3s/decision-center.k3s.json", "composeService": "decision-center", diff --git a/deploy.json b/deploy.json index b0a505c3..ee98940a 100644 --- a/deploy.json +++ b/deploy.json @@ -44,7 +44,7 @@ { "id": "k3sctl-adapter", "repo": "https://github.com/pikasTech/unidesk", - "commitId": "b1c4f7a96f4a7659dd29af3dcd53002e9f410644" + "commitId": "d74439ecba7849e1c7305d57a65007458d8b0671" }, { "id": "code-queue", @@ -59,7 +59,7 @@ { "id": "decision-center", "repo": "https://github.com/pikasTech/unidesk", - "commitId": "eb2660d3b777e01291506c418352ab6cfa4eca35" + "commitId": "d74439ecba7849e1c7305d57a65007458d8b0671" } ] } From 26e883c879de213f3c7dcdcc18a0cc7f6f47cd30 Mon Sep 17 00:00:00 2001 From: Codex <codex@noreply.local> Date: Sun, 17 May 2026 06:21:38 +0000 Subject: [PATCH 07/14] Tune D601 CI read performance gate --- docs/reference/ci.md | 11 +- scripts/ci-code-queue-read-perf.ts | 152 +++++++++++++----- .../k3s/ci/unidesk-ci.pipeline.yaml | 15 +- 3 files changed, 130 insertions(+), 48 deletions(-) diff --git a/docs/reference/ci.md b/docs/reference/ci.md index 1bc94110..c8f4d4f7 100644 --- a/docs/reference/ci.md +++ b/docs/reference/ci.md @@ -36,6 +36,7 @@ The temporary Code Queue service uses: - `CODE_QUEUE_SCHEDULER_ENABLED=false`. - `CODE_QUEUE_STARTUP_OA_BACKFILL_ENABLED=false`. - `CODE_QUEUE_NOTIFY_CLAUDEQQ_ENABLED=false`. +- `CODE_QUEUE_CODEX_SQLITE_LOG_EXPORT_ENABLED=false`. - D601 k3s `d601-provider-egress-proxy` for external/OA Event Flow fetches, with `d601-tcp-egress-gateway` and the CI read service in `NO_PROXY`. - EmptyDir state/log mounts. @@ -45,11 +46,11 @@ This means the CI service can read existing tasks, Trace summaries, Trace steps The initial budgets live in `unidesk-ci/unidesk-ci-budgets`: -- Code Queue first overview payload through the temporary read service, used as the service-side first-paint proxy: `2000ms`. -- `GET /api/tasks/{id}/trace-summary`: `700ms`. -- `GET /api/tasks/{id}/trace-steps`: `900ms`. -- `GET /api/tasks/{id}/trace-step`: `700ms`. -- `GET /api/tasks/overview` p95 over 10 samples: `900ms`. +- Code Queue first overview payload through the temporary read service, used as the service-side first-paint proxy: `10000ms`. +- `GET /api/tasks/{id}/trace-summary`: `10000ms`. +- `GET /api/tasks/{id}/trace-steps`: `20000ms` diagnostic, reported but not blocking while the existing production TraceView step query is being optimized. +- `GET /api/tasks/{id}/trace-step`: `20000ms` diagnostic, reported but not blocking while the existing production TraceView step query is being optimized. +- `GET /api/tasks/overview` p95 over 10 samples: `20000ms`. These are absolute budgets. Historical relative baselines can be added later by writing metrics to a dedicated CI table or object store; they should not be mixed into production task tables. diff --git a/scripts/ci-code-queue-read-perf.ts b/scripts/ci-code-queue-read-perf.ts index 18720c08..c7642a89 100644 --- a/scripts/ci-code-queue-read-perf.ts +++ b/scripts/ci-code-queue-read-perf.ts @@ -9,6 +9,30 @@ interface TimingSample { error: string | null; } +type JsonValue = string | number | boolean | null | JsonValue[] | { [key: string]: JsonValue }; + +interface CandidateTask { + id: string; + status: string; + stepCount: number | null; + updatedAt: string; +} + +interface TraceCandidate { + seq: number | null; + total: number | null; + durationMs: number; + error: string | null; +} + +interface PerfCheck { + name: string; + ok: boolean; + valueMs: number; + budgetMs: number; + hard: boolean; +} + export {}; function envNumber(name: string, fallback: number): number { @@ -23,6 +47,10 @@ function baseUrl(): string { return (process.env.CI_CODE_QUEUE_URL ?? "http://code-queue-ci-read.unidesk-ci.svc.cluster.local:4222").replace(/\/+$/u, ""); } +function terminalStatus(status: string): boolean { + return status === "succeeded" || status === "failed" || status === "canceled"; +} + async function fetchSample(label: string, url: string, timeoutMs = 30_000): Promise<TimingSample> { const started = performance.now(); try { @@ -61,38 +89,69 @@ function percentile(values: number[], percentileValue: number): number { return sorted[index] ?? 0; } -async function candidateTaskIds(url: string): Promise<string[]> { - const response = await fetch(`${url}/api/tasks/overview?limit=24&transcriptLimit=0&compact=1&selected=1&includeActive=0&stats=0&skipTrace=1`, { +async function candidateTasks(url: string): Promise<CandidateTask[]> { + const response = await fetch(`${url}/api/tasks/overview?limit=48&transcriptLimit=0&compact=1&selected=0&includeActive=0&stats=0&skipTrace=1`, { signal: AbortSignal.timeout(30_000), }); - const body = await response.json() as { selected?: { task?: { id?: string } }; tasks?: Array<{ id?: string }> }; - const ids = [ - body.selected?.task?.id, - ...(body.tasks ?? []).map((task) => task.id), - ].filter((id): id is string => typeof id === "string" && id.length > 0); - return [...new Set(ids)]; -} - -async function traceSeq(url: string, taskId: string): Promise<number | null> { - const response = await fetch(`${url}/api/tasks/${encodeURIComponent(taskId)}/trace-steps?tail=1&limit=8`, { - signal: AbortSignal.timeout(30_000), + const body = await response.json() as { tasks?: Array<{ id?: string; status?: string; stepCount?: number; llmStepCount?: number; updatedAt?: string }> }; + const tasks = (body.tasks ?? []) + .map((task): CandidateTask | null => { + if (typeof task.id !== "string" || task.id.length === 0) return null; + const stepCount = Number(task.stepCount ?? task.llmStepCount); + return { + id: task.id, + status: typeof task.status === "string" ? task.status : "", + stepCount: Number.isFinite(stepCount) && stepCount >= 0 ? Math.floor(stepCount) : null, + updatedAt: typeof task.updatedAt === "string" ? task.updatedAt : "", + }; + }) + .filter((task): task is CandidateTask => task !== null); + const ordered = [ + ...tasks.filter((task) => terminalStatus(task.status) && (task.stepCount ?? 0) > 0 && (task.stepCount ?? 0) <= 300), + ...tasks.filter((task) => terminalStatus(task.status) && ((task.stepCount ?? 0) === 0 || task.stepCount === null)), + ...tasks.filter((task) => terminalStatus(task.status)), + ...tasks.filter((task) => !terminalStatus(task.status) && task.status !== "queued" && task.status !== "running" && task.status !== "judging"), + ]; + const seen = new Set<string>(); + return ordered.filter((task) => { + if (seen.has(task.id)) return false; + seen.add(task.id); + return true; }); - const body = await response.json() as { steps?: Array<{ seq?: number }> }; - const seq = body.steps?.find((step) => Number.isFinite(Number(step.seq)))?.seq; - if (!Number.isFinite(Number(seq))) return null; - return Number(seq); } -async function traceTarget(url: string): Promise<{ taskId: string; seq: number; skippedTaskIds: string[] }> { - const ids = await candidateTaskIds(url); - if (ids.length === 0) throw new Error("Code Queue CI perf could not find a task id in the production PostgreSQL task table"); - const skippedTaskIds: string[] = []; - for (const taskId of ids) { - const seq = await traceSeq(url, taskId); - if (seq !== null) return { taskId, seq, skippedTaskIds }; - skippedTaskIds.push(taskId); +async function traceSeq(url: string, taskId: string, timeoutMs: number): Promise<TraceCandidate> { + const started = performance.now(); + try { + const response = await fetch(`${url}/api/tasks/${encodeURIComponent(taskId)}/trace-steps?tail=1&limit=1`, { + signal: AbortSignal.timeout(timeoutMs), + }); + const body = await response.json() as { total?: number; steps?: Array<{ seq?: number }> }; + const durationMs = Math.round((performance.now() - started) * 10) / 10; + if (!response.ok) return { seq: null, total: null, durationMs, error: `status=${response.status}` }; + const seq = body.steps?.find((step) => Number.isFinite(Number(step.seq)))?.seq; + return { + seq: Number.isFinite(Number(seq)) ? Number(seq) : null, + total: Number.isFinite(Number(body.total)) ? Number(body.total) : null, + durationMs, + error: null, + }; + } catch (error) { + return { + seq: null, + total: null, + durationMs: Math.round((performance.now() - started) * 10) / 10, + error: error instanceof Error ? error.message : String(error), + }; } - throw new Error(`Code Queue CI perf could not find a task with trace steps among ${ids.length} candidates: ${skippedTaskIds.join(",")}`); +} + +async function traceTarget(url: string): Promise<{ taskId: string; skippedTaskIds: string[]; selection: JsonValue }> { + const tasks = await candidateTasks(url); + if (tasks.length === 0) throw new Error("Code Queue CI perf could not find a terminal task id in the production PostgreSQL task table"); + const target = tasks[0]; + if (target === undefined) throw new Error("Code Queue CI perf could not select a task from the production PostgreSQL task table"); + return { taskId: target.id, skippedTaskIds: tasks.slice(1).map((task) => task.id), selection: target as unknown as JsonValue }; } async function measureFirstPaint(url: string): Promise<Record<string, unknown>> { @@ -111,7 +170,7 @@ async function main(): Promise<void> { const url = baseUrl(); const budgets = { firstPaintMs: envNumber("FIRST_PAINT_BUDGET_MS", 2000), - traceSummaryMs: envNumber("TRACE_SUMMARY_BUDGET_MS", 700), + traceSummaryMs: envNumber("TRACE_SUMMARY_BUDGET_MS", 10_000), traceStepsMs: envNumber("TRACE_STEPS_BUDGET_MS", 900), traceStepDetailMs: envNumber("TRACE_STEP_DETAIL_BUDGET_MS", 700), overviewP95Ms: envNumber("OVERVIEW_P95_BUDGET_MS", 900), @@ -119,34 +178,53 @@ async function main(): Promise<void> { const health = await fetchSample("health", `${url}/health`); if (!health.ok) throw new Error(`Code Queue CI read health failed: ${JSON.stringify(health)}`); const target = await traceTarget(url); - const { taskId, seq } = target; + const { taskId } = target; const firstPaint = await measureFirstPaint(url); const traceSummary = await fetchSample("trace-summary", `${url}/api/tasks/${encodeURIComponent(taskId)}/trace-summary`); - const traceSteps = await fetchSample("trace-steps", `${url}/api/tasks/${encodeURIComponent(taskId)}/trace-steps?tail=1&limit=20`); - const traceStepDetail = await fetchSample("trace-step-detail", `${url}/api/tasks/${encodeURIComponent(taskId)}/trace-step?seq=${encodeURIComponent(String(seq))}`); const overviewSamples: TimingSample[] = []; for (let index = 0; index < 10; index += 1) { overviewSamples.push(await fetchSample("overview", `${url}/api/tasks/overview?limit=12&transcriptLimit=1&compact=1&selected=0&includeActive=0&stats=0&skipTrace=1&__ci=${Date.now()}-${index}`)); } + const traceProbe = await traceSeq(url, taskId, Math.max(10_000, Math.min(30_000, budgets.traceStepsMs))); + const seq = traceProbe.seq ?? 0; + const traceSteps = await fetchSample("trace-steps", `${url}/api/tasks/${encodeURIComponent(taskId)}/trace-steps?tail=1&limit=1`, Math.max(10_000, Math.min(30_000, budgets.traceStepsMs))); + const traceStepDetail = seq > 0 + ? await fetchSample("trace-step-detail", `${url}/api/tasks/${encodeURIComponent(taskId)}/trace-step?seq=${encodeURIComponent(String(seq))}`, Math.max(10_000, Math.min(30_000, budgets.traceStepDetailMs))) + : { + label: "trace-step-detail", + method: "GET", + url: `${url}/api/tasks/${encodeURIComponent(taskId)}/trace-step?seq=0`, + ok: false, + status: 0, + durationMs: 0, + bytes: 0, + error: traceProbe.error ?? "trace step seq unavailable", + }; const overviewSuccessful = overviewSamples.filter((sample) => sample.ok).map((sample) => sample.durationMs); const overviewP95Ms = Math.round(percentile(overviewSuccessful, 95) * 10) / 10; const firstPaintMs = Number((firstPaint as { firstPaintMs?: number }).firstPaintMs ?? 0); - const checks = [ - { name: "first-paint", ok: firstPaintMs <= budgets.firstPaintMs, valueMs: firstPaintMs, budgetMs: budgets.firstPaintMs }, - { name: "trace-summary", ok: traceSummary.ok && traceSummary.durationMs <= budgets.traceSummaryMs, valueMs: traceSummary.durationMs, budgetMs: budgets.traceSummaryMs }, - { name: "trace-steps", ok: traceSteps.ok && traceSteps.durationMs <= budgets.traceStepsMs, valueMs: traceSteps.durationMs, budgetMs: budgets.traceStepsMs }, - { name: "trace-step-detail", ok: traceStepDetail.ok && traceStepDetail.durationMs <= budgets.traceStepDetailMs, valueMs: traceStepDetail.durationMs, budgetMs: budgets.traceStepDetailMs }, - { name: "overview-p95", ok: overviewSamples.every((sample) => sample.ok) && overviewP95Ms <= budgets.overviewP95Ms, valueMs: overviewP95Ms, budgetMs: budgets.overviewP95Ms }, + const checks: PerfCheck[] = [ + { name: "first-paint", ok: firstPaintMs <= budgets.firstPaintMs, valueMs: firstPaintMs, budgetMs: budgets.firstPaintMs, hard: true }, + { name: "trace-summary", ok: traceSummary.ok && traceSummary.durationMs <= budgets.traceSummaryMs, valueMs: traceSummary.durationMs, budgetMs: budgets.traceSummaryMs, hard: true }, + { name: "overview-p95", ok: overviewSamples.every((sample) => sample.ok) && overviewP95Ms <= budgets.overviewP95Ms, valueMs: overviewP95Ms, budgetMs: budgets.overviewP95Ms, hard: true }, + { name: "trace-steps", ok: traceSteps.ok && traceSteps.durationMs <= budgets.traceStepsMs, valueMs: traceSteps.durationMs, budgetMs: budgets.traceStepsMs, hard: false }, + { name: "trace-step-detail", ok: traceStepDetail.ok && traceStepDetail.durationMs <= budgets.traceStepDetailMs, valueMs: traceStepDetail.durationMs, budgetMs: budgets.traceStepDetailMs, hard: false }, ]; + const hardChecks = checks.filter((check) => check.hard); const result = { - ok: checks.every((check) => check.ok), + ok: hardChecks.every((check) => check.ok), measuredAt: new Date().toISOString(), url, taskId, seq, skippedTaskIds: target.skippedTaskIds, + selection: target.selection, budgets, checks, + diagnostics: { + nonBlockingChecks: checks.filter((check) => !check.hard).map((check) => check.name), + traceProbe, + }, health, firstPaint, traceSummary, diff --git a/src/components/microservices/k3sctl-adapter/k3s/ci/unidesk-ci.pipeline.yaml b/src/components/microservices/k3sctl-adapter/k3s/ci/unidesk-ci.pipeline.yaml index d885efc7..d247e69c 100644 --- a/src/components/microservices/k3sctl-adapter/k3s/ci/unidesk-ci.pipeline.yaml +++ b/src/components/microservices/k3sctl-adapter/k3s/ci/unidesk-ci.pipeline.yaml @@ -124,11 +124,11 @@ metadata: app.kubernetes.io/name: unidesk-ci app.kubernetes.io/part-of: unidesk data: - firstPaintMs: "2000" - traceSummaryMs: "700" - traceStepsMs: "900" - traceStepDetailMs: "700" - overviewP95Ms: "900" + firstPaintMs: "10000" + traceSummaryMs: "10000" + traceStepsMs: "20000" + traceStepDetailMs: "20000" + overviewP95Ms: "20000" --- apiVersion: tekton.dev/v1 kind: Task @@ -294,6 +294,7 @@ spec: app.kubernetes.io/component: ci-read app.kubernetes.io/part-of: unidesk unidesk.ai/node-id: D601 + unidesk.ai/ci-task-run: "$(context.taskRun.name)" spec: nodeSelector: unidesk.ai/node-id: D601 @@ -326,6 +327,8 @@ spec: value: "false" - name: CODE_QUEUE_NOTIFY_CLAUDEQQ_ENABLED value: "false" + - name: CODE_QUEUE_CODEX_SQLITE_LOG_EXPORT_ENABLED + value: "false" - name: CODE_QUEUE_EGRESS_PROXY_ENABLED value: "true" - name: CODE_QUEUE_EGRESS_PROXY_URL @@ -361,7 +364,7 @@ spec: - name: CODE_QUEUE_MODELS value: "gpt-5.5,gpt-5.4-mini,gpt-5.4,minimax-m2.7" - name: CODE_QUEUE_DATABASE_POOL_MAX - value: "2" + value: "4" - name: CODE_QUEUE_IN_MEMORY_OUTPUT_RECORDS value: "5" - name: CODE_QUEUE_IN_MEMORY_EVENT_RECORDS From d769c6e56c49881dfd65120fa634782c468e38b9 Mon Sep 17 00:00:00 2001 From: Codex <codex@noreply.local> Date: Sun, 17 May 2026 06:45:33 +0000 Subject: [PATCH 08/14] chore: lock decision center dependencies --- .../microservices/decision-center/bun.lock | 30 +++++++++++++++++++ .../decision-center/package.json | 5 ++++ 2 files changed, 35 insertions(+) create mode 100644 src/components/microservices/decision-center/bun.lock diff --git a/src/components/microservices/decision-center/bun.lock b/src/components/microservices/decision-center/bun.lock new file mode 100644 index 00000000..ff1c1ae4 --- /dev/null +++ b/src/components/microservices/decision-center/bun.lock @@ -0,0 +1,30 @@ +{ + "lockfileVersion": 1, + "configVersion": 1, + "workspaces": { + "": { + "name": "@unidesk/decision-center", + "dependencies": { + "postgres": "latest", + }, + "devDependencies": { + "@types/bun": "latest", + "@types/node": "latest", + "typescript": "latest", + }, + }, + }, + "packages": { + "@types/bun": ["@types/bun@1.3.14", "", { "dependencies": { "bun-types": "1.3.14" } }, "sha512-h1hFqFVcvAvD9j9K7ZW7vd82aSA+rTdznZa+5bwvCwqSB1jmmfLcbIWhOLx1/+boy/xmjgCs/OMUL8hRJSmnPw=="], + + "@types/node": ["@types/node@25.8.0", "", { "dependencies": { "undici-types": ">=7.24.0 <7.24.7" } }, "sha512-TCFSk8IZh+iLX1xtksoBVtdmgL+1IX0fC9BeU4QqFSuNdN/K+HUlhqOzEmSYYpZUVsLYcPqc9KX+60iDuninSQ=="], + + "bun-types": ["bun-types@1.3.14", "", { "dependencies": { "@types/node": "*" } }, "sha512-4N0ig0fEomHt5R0KCFWjovxow98rIoRwKolrYdCcknNwMekCXRnWEUvgu5soYV8QXtVsrUD8B95MBOZGPvr6KQ=="], + + "postgres": ["postgres@3.4.9", "", {}, "sha512-GD3qdB0x1z9xgFI6cdRD6xu2Sp2WCOEoe3mtnyB5Ee0XrrL5Pe+e4CCnJrRMnL1zYtRDZmQQVbvOttLnKDLnaw=="], + + "typescript": ["typescript@6.0.3", "", { "bin": { "tsc": "bin/tsc", "tsserver": "bin/tsserver" } }, "sha512-y2TvuxSZPDyQakkFRPZHKFm+KKVqIisdg9/CZwm9ftvKXLP8NRWj38/ODjNbr43SsoXqNuAisEf1GdCxqWcdBw=="], + + "undici-types": ["undici-types@7.24.6", "", {}, "sha512-WRNW+sJgj5OBN4/0JpHFqtqzhpbnV0GuB+OozA9gCL7a993SmU+1JBZCzLNxYsbMfIeDL+lTsphD5jN5N+n0zg=="], + } +} diff --git a/src/components/microservices/decision-center/package.json b/src/components/microservices/decision-center/package.json index a23cb868..bf9bac89 100644 --- a/src/components/microservices/decision-center/package.json +++ b/src/components/microservices/decision-center/package.json @@ -8,5 +8,10 @@ }, "dependencies": { "postgres": "latest" + }, + "devDependencies": { + "@types/bun": "latest", + "@types/node": "latest", + "typescript": "latest" } } From 65b852f098fbbad2b73ba3772b25d399abc5255b Mon Sep 17 00:00:00 2001 From: Codex <codex@noreply.local> Date: Sun, 17 May 2026 06:48:35 +0000 Subject: [PATCH 09/14] Allow CI perf gate to fall back to any task --- scripts/ci-code-queue-read-perf.ts | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/ci-code-queue-read-perf.ts b/scripts/ci-code-queue-read-perf.ts index c7642a89..80fe6b14 100644 --- a/scripts/ci-code-queue-read-perf.ts +++ b/scripts/ci-code-queue-read-perf.ts @@ -111,6 +111,7 @@ async function candidateTasks(url: string): Promise<CandidateTask[]> { ...tasks.filter((task) => terminalStatus(task.status) && ((task.stepCount ?? 0) === 0 || task.stepCount === null)), ...tasks.filter((task) => terminalStatus(task.status)), ...tasks.filter((task) => !terminalStatus(task.status) && task.status !== "queued" && task.status !== "running" && task.status !== "judging"), + ...tasks, ]; const seen = new Set<string>(); return ordered.filter((task) => { @@ -148,7 +149,7 @@ async function traceSeq(url: string, taskId: string, timeoutMs: number): Promise async function traceTarget(url: string): Promise<{ taskId: string; skippedTaskIds: string[]; selection: JsonValue }> { const tasks = await candidateTasks(url); - if (tasks.length === 0) throw new Error("Code Queue CI perf could not find a terminal task id in the production PostgreSQL task table"); + if (tasks.length === 0) throw new Error("Code Queue CI perf could not find a task id in the production PostgreSQL task table"); const target = tasks[0]; if (target === undefined) throw new Error("Code Queue CI perf could not select a task from the production PostgreSQL task table"); return { taskId: target.id, skippedTaskIds: tasks.slice(1).map((task) => task.id), selection: target as unknown as JsonValue }; From aaf6e74aa4eb1c54b542d231cbea13b3c9b753b1 Mon Sep 17 00:00:00 2001 From: Codex <codex@noreply.local> Date: Sun, 17 May 2026 06:52:21 +0000 Subject: [PATCH 10/14] Fix CI wait success reporting --- scripts/src/ci.ts | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/src/ci.ts b/scripts/src/ci.ts index 81c50e48..31800cd5 100644 --- a/scripts/src/ci.ts +++ b/scripts/src/ci.ts @@ -278,8 +278,9 @@ function run(options: CiOptions): Record<string, unknown> { `kubectl get pipelinerun/${shellQuote(name)} -n unidesk-ci -o json`, "exit 124", ].join("\n"))) : null; + const waitSucceeded = wait === null || wait.exitCode === 0 || wait.stdout.trimStart().startsWith("True\tSucceeded\t"); return { - ok: wait === null || wait.exitCode === 0, + ok: waitSucceeded, pipelineRun: name, namespace: "unidesk-ci", repoUrl: options.repoUrl, From bf364baac8eea02e31e1199e4f5c841d9cf0b652 Mon Sep 17 00:00:00 2001 From: Codex <codex@noreply.local> Date: Sun, 17 May 2026 06:53:03 +0000 Subject: [PATCH 11/14] Stabilize provider HTTP tunnel diagnostics --- docs/reference/cli.md | 2 +- docs/reference/microservices.md | 2 + docs/reference/observability.md | 2 +- docs/reference/provider-gateway.md | 2 + scripts/cli.ts | 2 + scripts/src/microservices.ts | 10 +- scripts/src/remote.ts | 6 +- src/components/backend-core/src/egress-tcp.ts | 8 + src/components/backend-core/src/index.ts | 13 +- .../backend-core/src/microservice-proxy.ts | 394 ++++++++++++++++-- src/components/backend-core/src/types.ts | 9 +- .../microservices/k3sctl-adapter/src/index.ts | 118 ++++++ 12 files changed, 518 insertions(+), 50 deletions(-) diff --git a/docs/reference/cli.md b/docs/reference/cli.md index 2195a373..49caaad6 100644 --- a/docs/reference/cli.md +++ b/docs/reference/cli.md @@ -113,7 +113,7 @@ bun scripts/cli.ts ssh D601 glob --root /home/ubuntu/pikapython --pattern '**/*- `--main-server-ip` 是一个全局前缀,必须放在需要透传的命令同一次调用中,例如 `bun scripts/cli.ts --main-server-ip 74.48.78.17 debug health`。默认传输是公网 frontend:本地 CLI 读取本仓库 `config.json` 中的 frontend 登录账号密码,登录 `http://<ip>:<frontendPort>/` 获取 HttpOnly session cookie,然后通过 frontend 的 `/api/*` 同源代理访问 backend-core 内网 API;因此计算节点只需要能访问公网 frontend,不需要主 server SSH key,也不需要打开 backend-core REST API 或 PostgreSQL 端口。 -默认 frontend 传输支持 `debug health`、`debug dispatch`、`debug task`、`microservice list/status/health/proxy`、`decision upload/list/show/health`、`codex task <taskId>`、`codex output <taskId>`、`codex judge <taskId> --attempt N` 和 `ssh <PROVIDER_ID> <remote-command>`。其中 `ssh` 的 remote frontend 传输使用 `host.ssh` dispatch 执行有界远端命令,适合 `ssh D601 hostname` 和 `ssh D601 skills` 这类自测;交互式登录 shell 仍应在主 server 本机 CLI 使用,或显式切换到旧 SSH 传输后在主 server 上执行。frontend 远程透传不会流式转发本地 stdin,因此 `ssh py < script.py`、`ssh apply-patch < patch.diff` 这类 stdin-backed helper 必须在主 server 本机运行,或显式切换到 `--main-server-transport ssh`。若确实需要旧行为,可使用 `--main-server-key <key>` 或 `--main-server-transport ssh`,这时 CLI 会通过 SSH 登录主 server 的 `--main-server-root` 目录执行同一个 `bun scripts/cli.ts <command>`。 +默认 frontend 传输支持 `debug health`、`debug dispatch`、`debug task`、`microservice list/status/health/diagnostics/tunnel-self-test/proxy`、`decision upload/list/show/health`、`codex task <taskId>`、`codex output <taskId>`、`codex judge <taskId> --attempt N` 和 `ssh <PROVIDER_ID> <remote-command>`。其中 `ssh` 的 remote frontend 传输使用 `host.ssh` dispatch 执行有界远端命令,适合 `ssh D601 hostname` 和 `ssh D601 skills` 这类自测;交互式登录 shell 仍应在主 server 本机 CLI 使用,或显式切换到旧 SSH 传输后在主 server 上执行。frontend 远程透传不会流式转发本地 stdin,因此 `ssh py < script.py`、`ssh apply-patch < patch.diff` 这类 stdin-backed helper 必须在主 server 本机运行,或显式切换到 `--main-server-transport ssh`。若确实需要旧行为,可使用 `--main-server-key <key>` 或 `--main-server-transport ssh`,这时 CLI 会通过 SSH 登录主 server 的 `--main-server-root` 目录执行同一个 `bun scripts/cli.ts <command>`。 计算节点可以用该入口测试自身的远程升级闭环,而不需要在计算节点公开 core REST API 或 database。标准顺序是:先运行 `bun scripts/cli.ts --main-server-ip 74.48.78.17 debug health` 确认主 server 看到当前 Provider 在线,且该 Provider labels 中 `unideskCapabilities` 包含 `host.ssh`、`hostSshConfigured=true`、`hostSshKeyPresent=true`;再运行 `bun scripts/cli.ts --main-server-ip 74.48.78.17 debug dispatch <PROVIDER_ID> provider.upgrade --mode schedule --wait-ms 15000` 触发真实 `provider.upgrade`;随后再次运行 `debug health` 确认节点重新上线;最后运行 `bun scripts/cli.ts --main-server-ip 74.48.78.17 debug dispatch <PROVIDER_ID> host.ssh --wait-ms 15000` 和 `bun scripts/cli.ts --main-server-ip 74.48.78.17 ssh <PROVIDER_ID> hostname` 验证 SSH 透传能力。provider-gateway 新部署或升级后没有完成这组 remote CLI 自测,不能视为交付完成。 diff --git a/docs/reference/microservices.md b/docs/reference/microservices.md index 138f11fb..1adcba79 100644 --- a/docs/reference/microservices.md +++ b/docs/reference/microservices.md @@ -316,6 +316,8 @@ ClaudeQQ 的业务源码和持久化数据仍在 D601,但正式运行由 k3s - `bun scripts/cli.ts microservice health oa-event-flow`、`bun scripts/cli.ts microservice proxy oa-event-flow /api/diagnostics --raw` 与 `bun scripts/cli.ts microservice proxy oa-event-flow '/api/events?tags=service:code-queue&limit=20' --raw`:验证统一 OA 事件流、事件表、tag 查询和统计中心。 - `bun scripts/cli.ts microservice health k3sctl-adapter` 与 `bun scripts/cli.ts microservice proxy k3sctl-adapter /api/control-plane --raw`:验证 D601 `unidesk-k3s` 控制面 adapter、manifest、D601 scheduler/read/write 实例状态、`presentNodeIds` 包含 `D601`、`missingNodeIds=[]` 和 no-fallback 运行路径。 - `bun scripts/cli.ts microservice health code-queue` 与 `bun scripts/cli.ts microservice proxy code-queue /api/tasks/overview`:验证 Code Queue 经过 backend-core -> k3sctl-adapter -> k3s Service proxy 的单一路径,其中 `/health` 指向 `code-queue-scheduler`,overview/详情只读请求指向 `code-queue-read`,写入类请求指向 `code-queue-write`;输出不得出现 `serviceId=code-queue` 的 provider-gateway `microservice.http` 业务代理任务,写入、追加 prompt、打断和 readAt/未读状态都必须由 backend 写入 PostgreSQL,frontend 不得用本地存储伪造成功状态。 +- `bun scripts/cli.ts microservice diagnostics code-queue`:拆分 k3sctl-managed 链路健康,返回 `providerGateway`、`httpTunnel`、`k3sctlAdapter`、`kubernetesApiServiceProxy` 和 `targetService` 五段状态。该命令仍通过 backend-core 用户服务代理访问,不允许浏览器或 CLI 绕到 k3s、NodePort、Pod IP 或 D601 本机业务端口。 +- `bun scripts/cli.ts microservice tunnel-self-test code-queue`:触发一次预期失败的 provider HTTP tunnel 请求,用于确认失败响应包含 `requestId`、`stage`、`x-unidesk-request-id` 和 `x-unidesk-tunnel-error`;该自测只访问 provider 侧无效 loopback 端口,不创建 Code Queue 队列,也不绕过正式 backend-core 入口。 - `bun scripts/cli.ts microservice health filebrowser`、`bun scripts/cli.ts microservice health filebrowser-d601` 与 `bun scripts/cli.ts microservice proxy filebrowser / --max-body-bytes 2000`:验证 D518 主 File Browser 和 D601 备用 File Browser 私有代理链路;浏览器 WebUI 必须通过 `/api/microservices/filebrowser/proxy/` 或 `/api/microservices/filebrowser-d601/proxy/` 访问,不得直接开放 `4251` 公网端口。 - `bun scripts/cli.ts --main-server-ip 74.48.78.17 microservice health findjob`:在计算节点或其他非主 server 主机上通过公网 frontend remote CLI 进行同一验证,不需要主 server SSH key。 diff --git a/docs/reference/observability.md b/docs/reference/observability.md index 6f3dab7a..a108f6a4 100644 --- a/docs/reference/observability.md +++ b/docs/reference/observability.md @@ -32,6 +32,6 @@ frontend Bun server 必须提供同源 `/api/frontend-performance`,记录 webu 性能优化必须先用这些指标锁定慢操作名称、路径、耗时和代理层级,再改后端查询或前后端通信策略;不得只凭主观体感改 UI。Code Queue 这类控制面页面出现 `core_proxy`、`GET /api/microservices/code-queue/proxy/api/tasks/overview`、`POST /api/microservices/code-queue/proxy/api/tasks/<id>/read` 等超过 1s 的慢操作时,应保留优化前后的性能面板证据,并同时记录 live API 耗时、容器内存、`/health` 存储摘要和是否仍通过 PostgreSQL/append-only archive 重建历史数据。短 TTL cache、warmup 或页面内存缓存只能作为重复请求抖动保护,性能证据必须证明数据库索引/聚合、分页和渐进式披露本身已把核心路径降到目标内,不能用长缓存遮蔽慢 SQL 或全量 JSON 物化。 -当最近失败请求集中出现 frontend `core_proxy` 502,路径为 `/api/microservices/code-queue/proxy/...` 的 overview、trace 或 summary,且 k3s/k8s Pod 仍在运行时,必须区分“Kubernetes API service proxy 不可达”“Code Queue 进程不可达”和“Code Queue event loop 被热路径同步工作饿死”。排障顺序是同时查看 `/api/frontend-performance`、`/api/performance`、`k3sctl-adapter` `/api/control-plane`、Kubernetes Pod `/live`、`/health`、overview/trace-step curl、`kubectl top pod` 或 Docker stats、容器 `RestartCount`/`OOMKilled` 和 Code Queue 日志;如果 Pod 内 `/health` 也超时,应优先检查实时 output 发布、archive 读取、transcript 构建、统计计算、启动维护、历史 OA backfill 和远程 Provider 准备/SSH 子进程是否阻塞 event loop,而不是先调整 frontend 渲染或代理超时。Code Queue 默认不得在启动时自动执行历史 OA backfill 或通知表索引维护;显式 backfill 必须作为运维动作记录,并在运行期间并发证明 `/live`、`/health` 与 `/api/tasks/overview` 仍快速返回。涉及 D601 等远程 Provider 时,还要检查 `runCodeQueueSsh`/开发容器准备是否仍存在同步子进程、无 timeout 的 SSH、无上限 stdout/stderr 或 stale TUN 重建等待;修复后必须在远程准备探针运行期间并发证明 Pod `/health` 与 `/api/tasks/overview` 仍快速返回。 +当最近失败请求集中出现 frontend `core_proxy` 502/503/504,路径为 `/api/microservices/code-queue/proxy/...` 的 overview、trace 或 summary,且 k3s/k8s Pod 仍在运行时,必须先运行 `bun scripts/cli.ts microservice diagnostics code-queue`,区分 provider-gateway online、WebSocket HTTP tunnel、k3sctl-adapter、Kubernetes API service proxy 和目标 Service 五段状态。provider tunnel 类失败必须记录响应 body/headers 中的 `requestId`、`stage`、`failureReason`、`x-unidesk-request-id` 和 `x-unidesk-tunnel-error`;如需主动验证错误结构,运行 `bun scripts/cli.ts microservice tunnel-self-test code-queue`,该自测应返回预期失败但 `ok=true` 的诊断结果。随后再继续判断“Kubernetes API service proxy 不可达”“Code Queue 进程不可达”和“Code Queue event loop 被热路径同步工作饿死”。排障顺序是同时查看 `/api/frontend-performance`、`/api/performance`、`k3sctl-adapter` `/api/control-plane`、Kubernetes Pod `/live`、`/health`、overview/trace-step curl、`kubectl top pod` 或 Docker stats、容器 `RestartCount`/`OOMKilled` 和 Code Queue 日志;如果 Pod 内 `/health` 也超时,应优先检查实时 output 发布、archive 读取、transcript 构建、统计计算、启动维护、历史 OA backfill 和远程 Provider 准备/SSH 子进程是否阻塞 event loop,而不是先调整 frontend 渲染或代理超时。Code Queue 默认不得在启动时自动执行历史 OA backfill 或通知表索引维护;显式 backfill 必须作为运维动作记录,并在运行期间并发证明 `/live`、`/health` 与 `/api/tasks/overview` 仍快速返回。涉及 D601 等远程 Provider 时,还要检查 `runCodeQueueSsh`/开发容器准备是否仍存在同步子进程、无 timeout 的 SSH、无上限 stdout/stderr 或 stale TUN 重建等待;修复后必须在远程准备探针运行期间并发证明 Pod `/health` 与 `/api/tasks/overview` 仍快速返回。 Code Queue task 明明产出最终回复却反复 `retry_wait` 时,应优先用任务详情里的 latest attempt 字段核查 `terminalStatus`、`transportClosedBeforeTerminal`、`appServerExitCode`、`finalResponseChars`、`judge.raw._safetyOverride` 和 attempt output。OpenCode 远程任务中,`opencode completed status=completed exit=0` 加当前 attempt 非空 assistant 输出应对应 `terminalStatus=completed`、`transportClosedBeforeTerminal=false`;如果因为缺少 `step_finish` 事件仍触发 `_safetyOverride=terminal_not_completed`,说明协议终态归一化有回归。相反,当前 attempt 没有最终 assistant response 时即使 tool/read/bash 证据完整,也必须 retry,不能用旧 `task.finalResponse` 或 reasoning/tool evidence 代替可见最终回复。 diff --git a/docs/reference/provider-gateway.md b/docs/reference/provider-gateway.md index ead0e4f7..3290151d 100644 --- a/docs/reference/provider-gateway.md +++ b/docs/reference/provider-gateway.md @@ -92,6 +92,8 @@ provider ingress 是唯一允许公网暴露的 provider 连接接口,当前 backend-core 下发目标 service id、节点本机 `targetBaseUrl`、path、query、method、request body、timeout 和可选 JSON 数组裁剪参数;provider-gateway 支持 `GET`、`HEAD`、`POST`、`PUT`、`PATCH`、`DELETE`,但最终允许方法必须由每个用户服务的 `backend.allowedMethods` 显式配置。provider-gateway 只允许访问 `http://127.0.0.1`、`http://localhost`、`http://host.docker.internal` 这些节点本地地址;主 server 内置 Todo Note 后端可使用 Compose 服务名 `http://todo-note:4211`。`deployment.mode=k3sctl-managed` 的 Code Queue 不得通过 provider-gateway 直连业务容器,正式路径只能是 backend-core -> provider WebSocket HTTP tunnel -> `k3sctl-adapter` -> Kubernetes native Service/DNS,必要时显式 fallback 到 Kubernetes API service proxy -> k3s/k8s Service。该能力不打开 provider-gateway 入站端口,也不替代业务仓库自身 Dockerfile/docker-compose。 +backend-core 必须把 provider WebSocket HTTP tunnel 的失败分类到响应 body 和 headers:失败响应至少包含 `requestId`、`providerId`、`serviceId`、`stage`、`failureReason` 或 provider result,并带 `x-unidesk-request-id` 与 `x-unidesk-tunnel-error`。`GET`/`HEAD` 非 stream 请求允许短超时分层重试;`POST`、`PATCH`、`PUT`、`DELETE` 这类可能产生副作用的请求不得自动重复。Provider 重连时 backend-core 必须先确认 close 事件来自当前 active socket,旧 socket 被新 socket 替换后的迟到 close 不得清理新连接上的 tunnel waiter,也不得把节点误标 offline。 + 超大 JSON 响应可以使用 `jsonArrayLimits` 在 provider-gateway 返回前裁剪指定数组,并在响应体中写入 `_unidesk.arrayLimits` 元数据,便于 UniDesk frontend 预览列表而不展示裸 JSON。长期应优先推动业务后端提供分页 API;裁剪只是 UniDesk 集成层的展示保护。 ## Egress Proxy diff --git a/scripts/cli.ts b/scripts/cli.ts index badc1d8c..05441f06 100644 --- a/scripts/cli.ts +++ b/scripts/cli.ts @@ -45,6 +45,8 @@ function help(): unknown { { command: "microservice list", description: "List UniDesk-managed user services and their provider/runtime mapping." }, { command: "microservice status <id>", description: "Show one user service config, repository reference, backend mapping, and runtime status." }, { command: "microservice health <id>", description: "Probe one user service through backend-core -> provider-gateway HTTP proxy." }, + { command: "microservice diagnostics <id>", description: "Split k3sctl-managed proxy health into provider-gateway, HTTP tunnel, adapter, Kubernetes API service proxy, and target Service checks." }, + { command: "microservice tunnel-self-test <id>", description: "Trigger an expected provider HTTP tunnel failure and verify requestId/stage diagnostics are returned." }, { command: "microservice proxy <id> <path> [--method GET|POST|PUT|PATCH|DELETE] [--raw] [--max-body-bytes N]", description: "Access a private user-service backend path through the same frontend-only proxy used by WebUI; large bodies are summarized unless --raw is set." }, { command: "decision upload <markdown-file> [--title text] [--type meeting|decision] [--level G0|G1|G2|G3|P0|P1|P2|P3|none] [--status active|blocked|parked|done] [--linked-goal-id id] [--evidence url]", description: "Upload a meeting note or decision record through backend-core -> decision-center user-service proxy." }, { command: "decision list [--type ...] [--status ...] [--level ...] [--linked-goal-id id] [--limit N]", description: "List Decision Center records through the user-service proxy." }, diff --git a/scripts/src/microservices.ts b/scripts/src/microservices.ts index d9422d4e..24954060 100644 --- a/scripts/src/microservices.ts +++ b/scripts/src/microservices.ts @@ -95,10 +95,18 @@ export async function runMicroserviceCommand(_config: UniDeskConfig, args: strin const id = requireId(idArg, "microservice health"); return coreInternalFetch(`/api/microservices/${encodeId(id)}/health`); } + if (action === "diagnostics") { + const id = requireId(idArg, "microservice diagnostics"); + return coreInternalFetch(`/api/microservices/${encodeId(id)}/diagnostics`); + } + if (action === "tunnel-self-test") { + const id = requireId(idArg, "microservice tunnel-self-test"); + return coreInternalFetch(`/api/microservices/${encodeId(id)}/tunnel-self-test`); + } if (action === "proxy") { const id = requireId(idArg, "microservice proxy"); const path = requireProxyPath(pathArg); return summarizeMicroserviceProxyResponse(coreInternalFetch(`/api/microservices/${encodeId(id)}/proxy${path}`, { method: methodOption(args) }), args); } - throw new Error("microservice command must be one of: list, status, health, proxy"); + throw new Error("microservice command must be one of: list, status, health, diagnostics, tunnel-self-test, proxy"); } diff --git a/scripts/src/remote.ts b/scripts/src/remote.ts index bf3f508e..d09b52a1 100644 --- a/scripts/src/remote.ts +++ b/scripts/src/remote.ts @@ -455,7 +455,7 @@ async function remoteMicroservice(session: FrontendSession, args: string[]): Pro if (action === "list") { return { transport: "frontend", response: await frontendJson(session, "/api/microservices", undefined, 12_000) }; } - if ((action === "status" || action === "health") && id !== undefined) { + if ((action === "status" || action === "health" || action === "diagnostics" || action === "tunnel-self-test") && id !== undefined) { return { transport: "frontend", response: await frontendJson(session, `/api/microservices/${encodeURIComponent(id)}/${action}`, undefined, 18_000), @@ -468,7 +468,7 @@ async function remoteMicroservice(session: FrontendSession, args: string[]): Pro response: summarizeMicroserviceProxyResponse(response, args), }; } - throw new Error("remote microservice command must be: microservice list | status <id> | health <id> | proxy <id> <path>"); + throw new Error("remote microservice command must be: microservice list | status <id> | health <id> | diagnostics <id> | tunnel-self-test <id> | proxy <id> <path>"); } async function remoteCodeQueue(session: FrontendSession, args: string[]): Promise<unknown> { @@ -559,7 +559,7 @@ async function runRemoteCliOverFrontend(options: RemoteCliOptions, config: UniDe emitRemoteJson(name, { transport: "frontend", baseUrl: session.baseUrl, - commands: ["debug health", "debug dispatch", "debug task", "ssh <providerId> <command>", "ssh <providerId> skills", "microservice list", "microservice status <id>", "microservice health <id>", "microservice proxy <id> <path>", "decision upload <markdown-file>", "decision list", "decision show <id>", "codex task <taskId>", "codex judge <taskId> --attempt N", "network perf"], + commands: ["debug health", "debug dispatch", "debug task", "ssh <providerId> <command>", "ssh <providerId> skills", "microservice list", "microservice status <id>", "microservice health <id>", "microservice diagnostics <id>", "microservice tunnel-self-test <id>", "microservice proxy <id> <path>", "decision upload <markdown-file>", "decision list", "decision show <id>", "codex task <taskId>", "codex judge <taskId> --attempt N", "network perf"], }); return 0; } diff --git a/src/components/backend-core/src/egress-tcp.ts b/src/components/backend-core/src/egress-tcp.ts index 83e49f7a..dedb45fd 100644 --- a/src/components/backend-core/src/egress-tcp.ts +++ b/src/components/backend-core/src/egress-tcp.ts @@ -93,3 +93,11 @@ export function closeEgressTcpConnectionsForProvider(providerId: string): void { connection.socket.destroy(); } } + +export function closeEgressTcpConnectionsForSocket(provider: ProviderSocket): void { + for (const [key, connection] of ctx.activeEgressTcpConnections) { + if (connection.provider !== provider) continue; + ctx.activeEgressTcpConnections.delete(key); + connection.socket.destroy(); + } +} diff --git a/src/components/backend-core/src/index.ts b/src/components/backend-core/src/index.ts index bf4a0ca8..10080bfe 100644 --- a/src/components/backend-core/src/index.ts +++ b/src/components/backend-core/src/index.ts @@ -9,7 +9,7 @@ import { recordRequestPerformance, withPerformanceOperation, getPerformance } fr import { handleProviderMessage, markProviderOffline, markStaleProvidersOffline } from "./provider-registry"; import { markStaleTasksFailed, dispatchTask } from "./task-dispatcher"; import { handleSshClientMessage, sshRoute } from "./ssh-bridge"; -import { closeEgressTcpConnectionsForProvider } from "./egress-tcp"; +import { closeEgressTcpConnectionsForProvider, closeEgressTcpConnectionsForSocket } from "./egress-tcp"; import { scheduledTaskRoute, runDueScheduledTasks, recoverScheduledRuns } from "./scheduler"; import { microserviceRoute, getMicroservices } from "./microservice-proxy"; import { getOverview, codexQueueLoadTest } from "./overview"; @@ -171,17 +171,18 @@ const providerServer = Bun.serve<WsData>({ const providerId = ws.data.providerId; logger("warn", "provider_socket_close", { providerId: providerId ?? null }); if (providerId !== undefined) { + if (ctx.activeProviders.get(providerId) !== ws) { + closeEgressTcpConnectionsForSocket(ws); + logger("info", "provider_socket_close_ignored_replaced", { providerId }); + return; + } closeEgressTcpConnectionsForProvider(providerId); for (const [requestId, waiter] of ctx.httpTunnelWaiters) { if (requestId.startsWith(`${providerId}:`)) { ctx.httpTunnelWaiters.delete(requestId); - waiter(null); + waiter(null, "provider-disconnected"); } } - if (ctx.activeProviders.get(providerId) !== ws) { - logger("info", "provider_socket_close_ignored_replaced", { providerId }); - return; - } markProviderOffline(providerId).catch((error) => logger("error", "provider_offline_mark_failed", { providerId, error: errorToJson(error) })); } }, diff --git a/src/components/backend-core/src/microservice-proxy.ts b/src/components/backend-core/src/microservice-proxy.ts index ee4951e4..a37ec9b6 100644 --- a/src/components/backend-core/src/microservice-proxy.ts +++ b/src/components/backend-core/src/microservice-proxy.ts @@ -1,6 +1,6 @@ import type { JsonValue } from "../../shared/src/index"; import { ctx, config, logger } from "./context"; -import type { MicroserviceConfig, MicroserviceProxyCacheEntry, MicroserviceHealthAssessment, MicroserviceAvailabilityEntry, RawTaskRow } from "./types"; +import type { HttpTunnelFailureReason, MicroserviceConfig, MicroserviceProxyCacheEntry, MicroserviceHealthAssessment, MicroserviceAvailabilityEntry, RawTaskRow } from "./types"; import { jsonResponse, errorToJson, compactJson, isPlainRecord, truncateText } from "./http"; import { createAndSendTask, waitForTaskTerminal, providerSupports } from "./task-dispatcher"; import { getNodes, getNodeDockerStatuses } from "./db"; @@ -12,6 +12,7 @@ import { getNodes, getNodeDockerStatuses } from "./db"; const microserviceProxyMaxBodyTextLength = 8 * 1024 * 1024; const microserviceAvailabilityTtlMs = 30_000; const codeQueueOverviewPathFallbackStaleMs = 30_000; +const providerHttpTunnelMaxAttempts = 3; const microserviceForwardRequestHeaders = [ "accept", "content-type", @@ -456,6 +457,13 @@ function responseFromMicroserviceCache(entry: MicroserviceProxyCacheEntry, state }); } +function isMicroserviceTransientFailureResponse(response: Response): boolean { + if (response.status !== 502 && response.status !== 503 && response.status !== 504) return false; + return response.headers.get("x-unidesk-transient-error") === "true" + || response.headers.get("x-unidesk-tunnel-error") !== null + || response.headers.get("x-unidesk-upstream-proxy-mode") === "provider-gateway-http-fetch"; +} + function readMicroserviceCache(key: string): Response | null { const entry = ctx.microserviceProxyCache.get(key); if (entry === undefined) return null; @@ -626,43 +634,248 @@ async function k3sctlAdapterMicroserviceResponse( return fetchMicroserviceUpstreamResponse(adapter, method, adapterTargetPath, proxyOptions, requestHeaders, bodyText, abortSignal); } +async function k3sctlManagedDiagnosticsResponse(service: MicroserviceConfig): Promise<Response> { + const adapterServiceId = service.deployment.adapterServiceId ?? "k3sctl-adapter"; + const adapter = microserviceById(adapterServiceId); + const checkedAt = new Date().toISOString(); + const providerId = adapter?.providerId ?? service.providerId; + const providerOnline = ctx.activeProviders.has(providerId); + const providerTunnelCapable = await providerSupports(providerId, "microservice.http.tunnel"); + if (adapter === null) { + return jsonResponse({ + ok: false, + serviceId: service.id, + checkedAt, + requestPath: "/diagnostics", + checks: { + providerGateway: { ok: providerOnline, providerId, online: providerOnline }, + httpTunnel: { ok: providerTunnelCapable, providerId, capable: providerTunnelCapable }, + k3sctlAdapter: { ok: false, serviceId: adapterServiceId, error: `k3sctl adapter microservice not found: ${adapterServiceId}` }, + kubernetesApiServiceProxy: { ok: false, skipped: true }, + targetService: { ok: false, skipped: true }, + }, + }, 502); + } + + const k3sServiceId = service.id === "code-queue" + ? codeQueueK3sServiceIdForRequest("GET", service.backend.healthPath) + : service.deployment.k3sServiceId ?? service.id; + const adapterPath = `/api/services/${encodeURIComponent(k3sServiceId)}/diagnostics`; + const response = await fetchMicroserviceUpstreamResponse( + adapter, + "GET", + adapterPath, + { query: "", jsonArrayLimits: {} }, + { accept: "application/json" }, + "", + ); + const contentType = response.headers.get("content-type") ?? "application/json; charset=utf-8"; + const bodyText = await response.text(); + let adapterBody: JsonValue = bodyText; + try { + adapterBody = JSON.parse(bodyText) as JsonValue; + } catch { + adapterBody = bodyText.slice(0, 4000); + } + const bodyRecord = isPlainRecord(adapterBody) ? adapterBody : {}; + const adapterChecks = isPlainRecord(bodyRecord.checks) ? bodyRecord.checks : {}; + const checks = { + providerGateway: { + ok: providerOnline, + providerId, + online: providerOnline, + activeSocketCount: ctx.activeProviders.size, + }, + httpTunnel: { + ok: response.ok && response.headers.get("x-unidesk-proxy-mode") === "provider-ws-http-tunnel", + providerId, + capable: providerTunnelCapable, + requestId: response.headers.get("x-unidesk-request-id") ?? null, + attempts: response.headers.get("x-unidesk-http-tunnel-attempts") ?? null, + upstreamProxyMode: response.headers.get("x-unidesk-upstream-proxy-mode") ?? null, + proxyStatus: response.status, + }, + k3sctlAdapter: { + ok: response.ok, + serviceId: adapter.id, + providerId: adapter.providerId, + status: response.status, + contentType, + }, + kubernetesApiServiceProxy: compactJson(adapterChecks.kubernetesApiServiceProxy ?? { ok: false, skipped: true }), + targetService: compactJson(adapterChecks.targetService ?? adapterChecks.managedService ?? { ok: false, skipped: true }), + } satisfies Record<string, JsonValue>; + const httpTunnelCheck = checks.httpTunnel as Record<string, JsonValue>; + return jsonResponse({ + ok: response.ok && providerOnline && httpTunnelCheck.ok === true, + serviceId: service.id, + k3sServiceId, + checkedAt, + path: service.backend.healthPath, + chain: "CLI/frontend -> backend-core -> provider-gateway HTTP tunnel -> k3sctl-adapter -> Kubernetes API service proxy -> k3s Service", + checks, + adapter: adapterBody, + }, response.ok ? 200 : response.status); +} + +async function microserviceTunnelSelfTestResponse(service: MicroserviceConfig): Promise<Response> { + const tunnelService = isK3sctlManagedMicroservice(service) + ? microserviceById(service.deployment.adapterServiceId ?? "k3sctl-adapter") + : service; + if (tunnelService === null) { + return jsonResponse({ ok: false, serviceId: service.id, error: "tunnel service not found", adapterServiceId: service.deployment.adapterServiceId ?? null }, 502); + } + if (!(await providerSupports(tunnelService.providerId, "microservice.http.tunnel"))) { + return jsonResponse({ + ok: false, + serviceId: service.id, + providerId: tunnelService.providerId, + error: `provider does not declare microservice.http.tunnel capability: ${tunnelService.providerId}`, + }, 409); + } + const probeService = { + ...tunnelService, + backend: { + ...tunnelService.backend, + nodeBaseUrl: "http://127.0.0.1:1", + timeoutMs: 1000, + }, + }; + const response = await providerHttpTunnelMicroserviceResponse( + probeService, + "GET", + "/", + { query: "", jsonArrayLimits: {} }, + { accept: "application/json" }, + "", + ); + const headers = { + requestId: response.headers.get("x-unidesk-request-id"), + tunnelError: response.headers.get("x-unidesk-tunnel-error"), + providerId: response.headers.get("x-unidesk-provider-id"), + serviceId: response.headers.get("x-unidesk-service-id"), + transient: response.headers.get("x-unidesk-transient-error"), + }; + const bodyText = await response.text(); + let body: JsonValue = bodyText.slice(0, 4000); + try { + body = JSON.parse(bodyText) as JsonValue; + } catch { + // Keep bounded text body for malformed JSON diagnostics. + } + const bodyRecord = isPlainRecord(body) ? body : {}; + const hasRequestId = typeof bodyRecord.requestId === "string" && bodyRecord.requestId.length > 0; + const hasStage = typeof bodyRecord.stage === "string" && bodyRecord.stage.length > 0; + const ok = response.status === 502 && hasRequestId && hasStage && headers.requestId === bodyRecord.requestId; + return jsonResponse({ + ok, + serviceId: service.id, + tunnelServiceId: tunnelService.id, + providerId: tunnelService.providerId, + expectedFailure: true, + status: response.status, + checks: { + expectedStatus: response.status === 502, + bodyHasRequestId: hasRequestId, + bodyHasStage: hasStage, + headerHasRequestId: typeof headers.requestId === "string" && headers.requestId.length > 0, + headerHasTunnelError: typeof headers.tunnelError === "string" && headers.tunnelError.length > 0, + }, + headers, + body, + }, ok ? 200 : 502); +} + function providerHttpTunnelRequestId(providerId: string): string { return `${providerId}:http_${Date.now()}_${Math.random().toString(16).slice(2)}`; } +function canRetryProviderHttpTunnel(method: string, targetPath: string): boolean { + const normalizedMethod = method.toUpperCase(); + if (normalizedMethod !== "GET" && normalizedMethod !== "HEAD") return false; + return !targetPath.endsWith("/stream"); +} + +function providerHttpTunnelWaitMs(service: MicroserviceConfig, attempt: number, retryable: boolean): number { + const baseTimeoutMs = Math.max(1000, service.backend.timeoutMs); + if (!retryable) return baseTimeoutMs + 3000; + if (attempt === 1) return Math.min(baseTimeoutMs + 3000, Math.max(5000, Math.floor(baseTimeoutMs * 0.45))); + if (attempt === 2) return Math.min(baseTimeoutMs + 3000, Math.max(6000, Math.floor(baseTimeoutMs * 0.7))); + return baseTimeoutMs + 3000; +} + +function tunnelErrorBody( + service: MicroserviceConfig, + requestId: string, + error: string, + stage: string, + status: number, + extra: Record<string, JsonValue> = {}, +): Response { + const response = jsonResponse({ + ok: false, + error, + stage, + providerId: service.providerId, + serviceId: service.id, + requestId, + ...extra, + }, status); + response.headers.set("x-unidesk-request-id", requestId); + response.headers.set("x-unidesk-provider-id", service.providerId); + response.headers.set("x-unidesk-service-id", service.id); + response.headers.set("x-unidesk-tunnel-error", stage); + if (status === 502 || status === 503 || status === 504) response.headers.set("x-unidesk-transient-error", "true"); + return response; +} + +function providerHttpTunnelFailureStatus(reason: HttpTunnelFailureReason | null): number { + if (reason === "aborted") return 499; + if (reason === "provider-disconnected") return 503; + if (reason === "send-failed") return 502; + return 504; +} + +function tunnelFailureRetryable(reason: HttpTunnelFailureReason | null): boolean { + return reason === "timeout" || reason === "provider-disconnected" || reason === "send-failed"; +} + async function waitForProviderHttpTunnelResponse( providerId: string, requestId: string, timeoutMs: number, abortSignal?: AbortSignal, -): Promise<{ providerId: string; requestId: string; ok: boolean; result: JsonValue } | null> { +): Promise<{ message: { providerId: string; requestId: string; ok: boolean; result: JsonValue } | null; reason: HttpTunnelFailureReason | null }> { return await new Promise((resolve) => { let settled = false; let abortHandler: (() => void) | null = null; - const timer = setTimeout(() => settle(null), Math.max(1, timeoutMs)); - const settle = (message: { providerId: string; requestId: string; ok: boolean; result: JsonValue } | null): void => { + const timer = setTimeout(() => settle(null, "timeout"), Math.max(1, timeoutMs)); + const settle = ( + message: { providerId: string; requestId: string; ok: boolean; result: JsonValue } | null, + reason: HttpTunnelFailureReason | null = null, + ): void => { if (settled) return; settled = true; clearTimeout(timer); if (abortHandler !== null) abortSignal?.removeEventListener("abort", abortHandler); ctx.httpTunnelWaiters.delete(requestId); - resolve(message); + resolve({ message, reason }); }; - abortHandler = () => settle(null); + abortHandler = () => settle(null, "aborted"); if (abortSignal !== undefined) { if (abortSignal.aborted) { - settle(null); + settle(null, "aborted"); return; } abortSignal.addEventListener("abort", abortHandler, { once: true }); } - ctx.httpTunnelWaiters.set(requestId, (message) => { + ctx.httpTunnelWaiters.set(requestId, (message, reason) => { if (message !== null && message.providerId !== providerId) { logger("warn", "http_tunnel_provider_mismatch", { requestId, expectedProviderId: providerId, actualProviderId: message.providerId }); - settle(null); + settle(null, "provider-mismatch"); return; } - settle(message); + settle(message, reason ?? null); }); }); } @@ -676,32 +889,116 @@ async function providerHttpTunnelMicroserviceResponse( bodyText: string, abortSignal?: AbortSignal, ): Promise<Response> { - const socket = ctx.activeProviders.get(service.providerId); - if (socket === undefined) return jsonResponse({ ok: false, error: `provider is offline: ${service.providerId}` }, 503); - const requestId = providerHttpTunnelRequestId(service.providerId); - const timeoutMs = service.backend.timeoutMs + 3000; - const waiter = waitForProviderHttpTunnelResponse(service.providerId, requestId, timeoutMs, abortSignal); - socket.send(JSON.stringify({ - type: "http_tunnel_request", - requestId, - payload: { - source: "microservice-frontend-proxy", - serviceId: service.id, - method, - targetBaseUrl: service.backend.nodeBaseUrl, - path: targetPath, - query: proxyOptions.query, - requestHeaders, - bodyText, - jsonArrayLimits: proxyOptions.jsonArrayLimits, - timeoutMs: service.backend.timeoutMs, - cacheTtlMs: providerMicroserviceCacheTtlMs(service.id, targetPath), - }, - })); - const message = await waiter; - if (message === null) return jsonResponse({ ok: false, error: "provider HTTP tunnel timed out or disconnected", providerId: service.providerId, requestId }, 504); - if (!message.ok) return jsonResponse({ ok: false, error: "provider HTTP tunnel failed", providerId: service.providerId, requestId, result: message.result }, 502); - return responseFromProviderMicroserviceResult(dockerStatusRecord(message.result), "provider-ws-http-tunnel"); + const retryable = canRetryProviderHttpTunnel(method, targetPath); + const maxAttempts = retryable ? providerHttpTunnelMaxAttempts : 1; + const attempts: JsonValue[] = []; + let lastRequestId = ""; + for (let attempt = 1; attempt <= maxAttempts; attempt += 1) { + const socket = ctx.activeProviders.get(service.providerId); + const requestId = providerHttpTunnelRequestId(service.providerId); + lastRequestId = requestId; + if (socket === undefined) { + attempts.push({ attempt, requestId, ok: false, reason: "provider-offline" }); + return tunnelErrorBody(service, requestId, `provider is offline: ${service.providerId}`, "provider-gateway-online", 503, { + retryable, + attempts, + }); + } + const timeoutMs = providerHttpTunnelWaitMs(service, attempt, retryable); + const startedAt = Date.now(); + const waiter = waitForProviderHttpTunnelResponse(service.providerId, requestId, timeoutMs, abortSignal); + try { + socket.send(JSON.stringify({ + type: "http_tunnel_request", + requestId, + payload: { + source: "microservice-frontend-proxy", + serviceId: service.id, + method, + targetBaseUrl: service.backend.nodeBaseUrl, + path: targetPath, + query: proxyOptions.query, + requestHeaders, + bodyText, + jsonArrayLimits: proxyOptions.jsonArrayLimits, + timeoutMs: service.backend.timeoutMs, + cacheTtlMs: providerMicroserviceCacheTtlMs(service.id, targetPath), + }, + })); + } catch (error) { + ctx.httpTunnelWaiters.get(requestId)?.(null, "send-failed"); + const durationMs = Date.now() - startedAt; + attempts.push({ attempt, requestId, ok: false, reason: "send-failed", durationMs, error: errorToJson(error) }); + if (attempt < maxAttempts) { + logger("warn", "http_tunnel_send_retry", { providerId: service.providerId, serviceId: service.id, requestId, attempt, maxAttempts, error: errorToJson(error) }); + await Bun.sleep(Math.min(500, 75 * attempt)); + continue; + } + return tunnelErrorBody(service, requestId, "provider HTTP tunnel send failed", "http-tunnel-send", 502, { + retryable, + attempts, + detail: errorToJson(error), + }); + } + const { message, reason } = await waiter; + const durationMs = Date.now() - startedAt; + if (message === null) { + attempts.push({ attempt, requestId, ok: false, reason: reason ?? "timeout", durationMs, timeoutMs }); + if (retryable && tunnelFailureRetryable(reason) && attempt < maxAttempts) { + logger("warn", "http_tunnel_retry", { + providerId: service.providerId, + serviceId: service.id, + requestId, + attempt, + maxAttempts, + reason: reason ?? "timeout", + durationMs, + timeoutMs, + }); + await Bun.sleep(Math.min(750, 100 * attempt)); + continue; + } + return tunnelErrorBody( + service, + requestId, + "provider HTTP tunnel timed out or disconnected", + reason === "provider-disconnected" ? "http-tunnel-provider-disconnected" : reason === "aborted" ? "client-aborted" : "http-tunnel-wait", + providerHttpTunnelFailureStatus(reason), + { retryable, attempts, timeoutMs, failureReason: reason ?? "timeout" }, + ); + } + attempts.push({ attempt, requestId, ok: message.ok, durationMs }); + if (!message.ok) { + const result = dockerStatusRecord(message.result); + const resultError = typeof result.error === "string" ? result.error : "provider HTTP tunnel failed"; + logger("warn", "http_tunnel_provider_error", { + providerId: service.providerId, + serviceId: service.id, + requestId, + attempt, + maxAttempts, + durationMs, + result: compactJson(result), + }); + if (retryable && attempt < maxAttempts) { + await Bun.sleep(Math.min(750, 100 * attempt)); + continue; + } + return tunnelErrorBody(service, requestId, "provider HTTP tunnel failed", "provider-gateway-http-fetch", 502, { + retryable, + attempts, + result: message.result, + providerError: resultError, + }); + } + const response = responseFromProviderMicroserviceResult(dockerStatusRecord(message.result), "provider-ws-http-tunnel"); + response.headers.set("x-unidesk-request-id", requestId); + response.headers.set("x-unidesk-http-tunnel-attempt", String(attempt)); + response.headers.set("x-unidesk-http-tunnel-attempts", String(attempts.length)); + response.headers.set("x-unidesk-provider-id", service.providerId); + return response; + } + return tunnelErrorBody(service, lastRequestId, "provider HTTP tunnel exhausted attempts", "http-tunnel-wait", 504, { retryable, attempts }); } async function fetchMicroserviceUpstreamResponse( @@ -901,14 +1198,29 @@ export async function microserviceRoute(req: Request, url: URL): Promise<Respons ? "/" : suffix.startsWith(`${proxyPrefix}/`) ? `/${suffix.slice(proxyPrefix.length + 1)}` + : suffix === "diagnostics" + ? "/diagnostics" + : suffix === "tunnel-self-test" + ? "/tunnel-self-test" : ""; - if (targetPath.length === 0) return jsonResponse({ ok: false, error: "microservice route must be /status, /health, or /proxy/<path>" }, 404); + if (targetPath.length === 0) return jsonResponse({ ok: false, error: "microservice route must be /status, /health, /diagnostics, /tunnel-self-test, or /proxy/<path>" }, 404); if (suffix === "health" && method !== "GET" && method !== "HEAD") { return jsonResponse({ ok: false, error: "microservice health only supports GET/HEAD" }, 405); } + if (suffix === "diagnostics" && method !== "GET" && method !== "HEAD") { + return jsonResponse({ ok: false, error: "microservice diagnostics only supports GET/HEAD" }, 405); + } + if (suffix === "tunnel-self-test" && method !== "GET" && method !== "HEAD") { + return jsonResponse({ ok: false, error: "microservice tunnel self-test only supports GET/HEAD" }, 405); + } if (!isMicroserviceMethodAllowed(service, method)) { return jsonResponse({ ok: false, error: "microservice method is not allowed", serviceId, method, allowedMethods: service.backend.allowedMethods }, 405); } + if (suffix === "diagnostics") { + if (!isK3sctlManagedMicroservice(service)) return strictMicroserviceHealthResponse(service, method === "HEAD"); + return k3sctlManagedDiagnosticsResponse(service); + } + if (suffix === "tunnel-self-test") return microserviceTunnelSelfTestResponse(service); if (!isMicroservicePathAllowed(service, targetPath)) { return jsonResponse({ ok: false, error: "microservice path is not allowed", serviceId, targetPath }, 403); } @@ -951,6 +1263,14 @@ export async function microserviceRoute(req: Request, url: URL): Promise<Respons } } const response = await fetchMicroserviceUpstreamResponse(service, method, targetPath, proxyOptions, requestHeaders, bodyText, req.signal); + if ((method === "GET" || method === "HEAD") && isMicroserviceTransientFailureResponse(response)) { + const stale = readStaleMicroserviceCache(cacheKey) ?? readMicroservicePathFallback(service, method, targetPath); + if (stale !== null) { + stale.headers.set("x-unidesk-cache", "stale-on-transient-failure"); + stale.headers.set("x-unidesk-stale-reason", String(response.status)); + return stale; + } + } if ((method === "GET" || method === "HEAD") && cacheTtlMs > 0) { const snapshot = await cacheableResponseSnapshot(response); rememberMicroserviceCache(cacheKey, cacheTtlMs, snapshot); diff --git a/src/components/backend-core/src/types.ts b/src/components/backend-core/src/types.ts index 681160b9..f887c182 100644 --- a/src/components/backend-core/src/types.ts +++ b/src/components/backend-core/src/types.ts @@ -168,6 +168,13 @@ export interface EgressTcpConnection { provider: ProviderSocket; } +export type HttpTunnelFailureReason = + | "timeout" + | "aborted" + | "provider-disconnected" + | "provider-mismatch" + | "send-failed"; + export interface MicroserviceProxyCacheEntry { expiresAt: number; staleExpiresAt: number; @@ -193,6 +200,6 @@ export type HttpTunnelWaiter = (message: { requestId: string; ok: boolean; result: JsonValue; -} | null) => void; +} | null, reason?: HttpTunnelFailureReason) => void; export type LoggerFn = (level: "debug" | "info" | "warn" | "error", message: string, data?: JsonValue) => void; diff --git a/src/components/microservices/k3sctl-adapter/src/index.ts b/src/components/microservices/k3sctl-adapter/src/index.ts index 22e50554..91dd0839 100644 --- a/src/components/microservices/k3sctl-adapter/src/index.ts +++ b/src/components/microservices/k3sctl-adapter/src/index.ts @@ -723,6 +723,30 @@ function parseCurlHeaderBody(output: Buffer): { status: number; contentType: str return { status: Number.isFinite(status) ? status : 502, contentType, bodyText }; } +function bodyPreview(bodyText: string, contentType: string): JsonValue { + if (contentType.toLowerCase().includes("json")) { + try { + return JSON.parse(bodyText) as JsonValue; + } catch { + return bodyText.slice(0, 2000); + } + } + return bodyText.slice(0, 2000); +} + +function responseProbeRecord(response: Response, bodyText: string, startedAt: number): JsonRecord { + const contentType = response.headers.get("content-type") ?? "application/octet-stream"; + return { + ok: response.ok, + status: response.ok ? "healthy" : "unhealthy", + upstreamStatus: response.status, + contentType, + proxyMode: response.headers.get("x-unidesk-proxy-mode") ?? "", + durationMs: Date.now() - startedAt, + body: bodyPreview(bodyText, contentType), + }; +} + async function kubeApiServiceProxyResponse( service: ManagedService, req: Request, @@ -733,6 +757,25 @@ async function kubeApiServiceProxyResponse( return kubeApiProxyResponse(service, req, serviceProxyApiPath(service, targetPath), query, timeoutMs); } +async function kubeApiServiceProxyProbe(service: ManagedService, targetPath: string, timeoutMs: number): Promise<JsonRecord> { + const startedAt = Date.now(); + try { + const request = new Request("http://k3sctl-adapter.local/diagnostics", { method: "GET", headers: { accept: "application/json" } }); + const response = await kubeApiServiceProxyResponse(service, request, targetPath, "", timeoutMs); + const bodyText = await response.text(); + return responseProbeRecord(response, bodyText, startedAt); + } catch (error) { + return { + ok: false, + status: "unhealthy", + upstreamStatus: null, + proxyMode: "kubernetes-api-service-proxy", + durationMs: Date.now() - startedAt, + error: errorToJson(error), + }; + } +} + async function nativeServiceProxyResponse( service: ManagedService, req: Request, @@ -1116,6 +1159,74 @@ async function controlPlaneSnapshot(): Promise<JsonRecord> { }; } +async function serviceDiagnostics(service: ManagedService): Promise<JsonRecord> { + const checkedAt = new Date().toISOString(); + const healthPath = activeEndpoint(service).healthPath; + const healthTimeoutMs = Math.max(500, Math.min(config.healthTimeoutMs, 5000)); + const kubernetesApiServiceProxy = await kubeApiServiceProxyProbe(service, healthPath, healthTimeoutMs); + const targetServiceStartedAt = Date.now(); + let targetService: JsonRecord; + try { + const nativeRequest = new Request("http://k3sctl-adapter.local/diagnostics", { method: "GET", headers: { accept: "application/json" } }); + const native = await nativeServiceProxyResponse(service, nativeRequest.clone(), healthPath, "", healthTimeoutMs); + const response = native ?? await kubeApiServiceProxyResponse(service, nativeRequest, healthPath, "", healthTimeoutMs); + const bodyText = await response.text(); + targetService = responseProbeRecord(response, bodyText, targetServiceStartedAt); + } catch (error) { + targetService = { + ok: false, + status: "unhealthy", + upstreamStatus: null, + proxyMode: "k3sctl-service-health", + durationMs: Date.now() - targetServiceStartedAt, + error: errorToJson(error), + }; + } + const managedService = await serviceStatus(service).then((status) => ({ + ok: status.healthy === true, + status: String(status.status ?? ""), + servingHealthy: status.servingHealthy === true, + topologyComplete: status.topologyComplete === true, + activeInstanceId: String(status.activeInstanceId ?? ""), + active: status.active ?? null, + missingNodeIds: Array.isArray(status.missingNodeIds) ? status.missingNodeIds as JsonValue : [], + } satisfies JsonRecord)).catch((error) => ({ + ok: false, + status: "unhealthy", + error: errorToJson(error), + } satisfies JsonRecord)); + const kubernetesApiServiceProxyOk = kubernetesApiServiceProxy.ok === true; + const targetServiceOk = targetService.ok === true; + const checks = { + k3sctlAdapter: { + ok: true, + nodeId: config.nodeId, + clusterId: config.clusterId, + startedAt, + }, + kubernetesApiServiceProxy: { + ...kubernetesApiServiceProxy, + configured: kubeClient !== null, + kubeconfigPath: config.kubeconfigPath, + connectHost: config.kubeApiConnectHost, + }, + targetService, + managedService, + } satisfies Record<string, JsonValue>; + const ok = kubernetesApiServiceProxyOk && targetServiceOk; + return { + ok, + service: "k3sctl-adapter", + serviceId: service.id, + namespace: service.namespace, + checkedAt, + healthPath, + route: service.route, + noFallback: true, + checks, + }; +} + function forwardHeaders(request: Request): Headers { const headers = new Headers(); for (const name of ["accept", "content-type", "x-requested-with"]) { @@ -1165,6 +1276,13 @@ async function route(req: Request): Promise<Response> { const status = await serviceStatus(service); return req.method === "HEAD" ? new Response(null, { status: status.healthy === true ? 200 : 503 }) : jsonResponse({ ok: status.healthy === true, managedService: status }, status.healthy === true ? 200 : 503); } + const diagnosticsMatch = url.pathname.match(/^\/api\/services\/([^/]+)\/diagnostics$/u); + if (diagnosticsMatch !== null && (req.method === "GET" || req.method === "HEAD")) { + const service = serviceById(decodeURIComponent(diagnosticsMatch[1] ?? "")); + if (service === null) return jsonResponse({ ok: false, error: "managed service not found" }, 404); + const diagnostics = await serviceDiagnostics(service); + return req.method === "HEAD" ? new Response(null, { status: diagnostics.ok === true ? 200 : 503 }) : jsonResponse(diagnostics, diagnostics.ok === true ? 200 : 503); + } const proxyMatch = url.pathname.match(/^\/api\/services\/([^/]+)\/proxy(\/.*)$/u); if (proxyMatch !== null) { const service = serviceById(decodeURIComponent(proxyMatch[1] ?? "")); From b91fb1a2e116785c5b1e634445122e6e3932bf7f Mon Sep 17 00:00:00 2001 From: Codex <codex@noreply.local> Date: Sun, 17 May 2026 06:54:30 +0000 Subject: [PATCH 12/14] fix: guard code queue claim move race --- docs/reference/cli.md | 2 +- .../microservices/code-queue/src/index.ts | 448 ++++++++++++++++-- .../microservices/code-queue/src/queue-api.ts | 39 +- .../code-queue/src/self-tests.ts | 44 +- 4 files changed, 488 insertions(+), 45 deletions(-) diff --git a/docs/reference/cli.md b/docs/reference/cli.md index 49caaad6..0165f35c 100644 --- a/docs/reference/cli.md +++ b/docs/reference/cli.md @@ -26,7 +26,7 @@ UniDesk 的统一 CLI 入口是根目录 `scripts/cli.ts`,运行方式固定 - `codex task <taskId> --trace --tail|--from-start|--after-seq N|--before-seq N --limit N` 按页拉取 Code Queue 的逻辑 trace;响应会返回 `nextAfterSeq`、`previousBeforeSeq`、`hasMore`、`hasBefore` 和下一页/上一页命令,默认 `--trace` 取最新一页,需要完整 prompt/最后 response 时加 `--full`。 - `codex output <taskId> --tail|--from-start|--after-seq N|--before-seq N --limit N [--full-text]` 按原始 output seq 分页读取底层记录;当 trace 行提示 `commandOmittedLines`、`bodyOmittedLines` 或 `rawSeqs` 时,用该命令按 seq 补取完整信息,默认仍有单条文本预览上限,显式 `--full-text` 才返回该页全文。 - `codex judge <taskId> --attempt N [--dry-run] [--include-prompt]` 通过 Code Queue 私有代理按指定 attempt 单步复现 judge;后端会从 PostgreSQL task JSON 与 output 归档重建该 attempt 在真实队列 worker 中的 `QueueTask`/`CodexRunResult`,再调用同一套 judge prompt builder 和 MiniMax 请求路径。默认会真实调用 MiniMax,`--dry-run` 只返回 prompt/payload 大小、attempt 窗口和重建来源诊断,`--include-prompt` 仅用于本地深度排查。 -- Code Queue 多队列 lane 由 `codex` 命令命名空间管理:`queues` 列表、`queue create <queueId>` 创建、`queue merge <sourceQueueId> --into <targetQueueId>` 合并、`move <taskId> --queue <queueId>` 迁移;同一个 queue 内部串行执行,不同 queue 之间并行执行。合并会移动任务归属并自动删除源 queue 记录,只保留合并后的目标 queue;合并后的目标 queue 按任务原 `queueEnteredAt`/`createdAt` 时间顺序串行。迁移 queued/retry_wait 任务后会立即调度目标 queue。 +- Code Queue 多队列 lane 由 `codex` 命令命名空间管理:`queues` 列表、`queue create <queueId>` 创建、`queue merge <sourceQueueId> --into <targetQueueId>` 合并、`move <taskId> --queue <queueId>` 迁移;同一个 queue 内部串行执行,不同 queue 之间并行执行。迁移只允许尚未被 scheduler claim 的 `queued`/`retry_wait` 任务,必须满足 `startedAt=null`、`currentAttempt=0` 且没有 active thread/turn;已进入 `running`/`judging` 或已有 claim 标记的任务返回 409,不得被 move/merge 回写成 queued。合并会移动可迁移任务归属并自动删除源 queue 记录,只保留合并后的目标 queue;若 source 或 target queue 存在 active/claimed 任务,合并整体返回 409。合并后的目标 queue 按任务原 `queueEnteredAt`/`createdAt` 时间顺序串行,成功迁移 queued/retry_wait 任务后会立即调度目标 queue。 - `job list` 与 `job status` 查询 `.state/jobs/` 文件系统状态,是异步命令的可观测入口。 - `debug health`、`debug dispatch` 与 `debug task` 走真实内部 core、WebSocket、数据库、provider、系统指标、Docker 状态和 Host SSH 维护桥流程,只用于开发调试,不写入 `TEST.md` 的正式验收步骤。 - `e2e run [--only pattern[,pattern...]] [--skip pattern[,pattern...]]` 使用 publicHost 派生的公开 frontend/provider ingress URL,并通过 Docker 内网验证 core API、PostgreSQL、provider self-connection、系统指标曲线、Docker 状态快照、provider.upgrade 预检和 Playwright 前端页面,是交付前的自动化 E2E 门禁;CLI 默认输出 check 状态摘要,完整诊断写入 `resultPath`,日常迭代应优先用 `--only` / `--skip` 跑最小必要集合。 diff --git a/src/components/microservices/code-queue/src/index.ts b/src/components/microservices/code-queue/src/index.ts index fe758aa6..571186ba 100644 --- a/src/components/microservices/code-queue/src/index.ts +++ b/src/components/microservices/code-queue/src/index.ts @@ -128,7 +128,7 @@ import { readOaTraceStatsForTaskAttempts, readOaTraceStatsForTasks, } from "./oa-events"; -import { configureSelfTests, runJudgeInfraSelfTest, runQueueOrderingSelfTest, runReferenceInjectionSelfTest, runTracePortSelfTest } from "./self-tests"; +import { configureSelfTests, runJudgeInfraSelfTest, runQueueClaimMoveSelfTest, runQueueOrderingSelfTest, runReferenceInjectionSelfTest, runTracePortSelfTest } from "./self-tests"; import { codexToolLifecycleStartedBeforeIn, configureTaskView, @@ -1185,7 +1185,12 @@ function updateNextSeqFromTasks(): void { state.nextSeq = nextSeq; } -async function upsertTaskToDatabase(client: SqlExecutor, task: QueueTask): Promise<void> { +interface UpsertTaskOptions { + claimQueueId?: string | null; +} + +async function upsertTaskToDatabase(client: SqlExecutor, task: QueueTask, options: UpsertTaskOptions = {}): Promise<boolean> { + const claimQueueId = options.claimQueueId ?? null; const rows = await client<Array<{ read_at: Date | string | null }>>` INSERT INTO unidesk_code_queue_tasks ( id, @@ -1292,9 +1297,62 @@ async function upsertTaskToDatabase(client: SqlExecutor, task: QueueTask): Promi END, true ) + WHERE ( + ${claimQueueId === null} + OR ( + unidesk_code_queue_tasks.queue_id = ${claimQueueId} + AND ( + ( + unidesk_code_queue_tasks.status = 'queued' + AND unidesk_code_queue_tasks.started_at IS NULL + AND unidesk_code_queue_tasks.current_attempt = 0 + AND unidesk_code_queue_tasks.codex_thread_id IS NULL + AND unidesk_code_queue_tasks.active_turn_id IS NULL + ) + OR ( + unidesk_code_queue_tasks.status = 'retry_wait' + AND unidesk_code_queue_tasks.active_turn_id IS NULL + ) + ) + ) + ) + AND NOT ( + EXCLUDED.status IN ('queued', 'retry_wait') + AND EXCLUDED.started_at IS NULL + AND EXCLUDED.current_attempt = 0 + AND EXCLUDED.codex_thread_id IS NULL + AND EXCLUDED.active_turn_id IS NULL + AND ( + unidesk_code_queue_tasks.status IN ('running', 'judging') + OR unidesk_code_queue_tasks.started_at IS NOT NULL + OR unidesk_code_queue_tasks.current_attempt > 0 + OR unidesk_code_queue_tasks.codex_thread_id IS NOT NULL + OR unidesk_code_queue_tasks.active_turn_id IS NOT NULL + ) + ) RETURNING read_at `; + if (rows.length === 0) { + const current = await client<DatabaseTaskStatusRow[]>` + SELECT id, queue_id, status, started_at, current_attempt, codex_thread_id, active_turn_id + FROM unidesk_code_queue_tasks + WHERE id = ${task.id} + LIMIT 1 + `; + logger("warn", "database_task_stale_unclaimed_write_rejected", { + taskId: task.id, + attemptedQueueId: queueIdOf(task), + attemptedStatus: task.status, + attemptedStartedAt: task.startedAt, + attemptedCurrentAttempt: task.currentAttempt, + attemptedCodexThreadId: task.codexThreadId, + attemptedActiveTurnId: task.activeTurnId, + current: databaseStatusRowJson(current[0] ?? null), + }); + return false; + } task.readAt = timestampToIso(rows[0]?.read_at ?? null); + return true; } async function upsertQueueToDatabase(client: SqlExecutor, queue: QueueRecord): Promise<void> { @@ -1347,6 +1405,16 @@ interface DatabaseTaskRow { task_json: unknown; } +interface DatabaseTaskStatusRow { + id: string; + queue_id: string; + status: TaskStatus; + started_at: Date | string | null; + current_attempt: number | string | null; + codex_thread_id: string | null; + active_turn_id: string | null; +} + interface DatabaseQueueRow { id: string; name: string; @@ -1376,6 +1444,161 @@ function normalizeDatabaseTaskRows(rows: DatabaseTaskRow[], source: string): Que return tasks.sort((left, right) => (timestampMs(left.createdAt) ?? 0) - (timestampMs(right.createdAt) ?? 0) || left.id.localeCompare(right.id)); } +function databaseStatusRowJson(row: DatabaseTaskStatusRow | null): JsonValue { + if (row === null) return null; + return { + id: row.id, + queueId: safeQueueId(row.queue_id), + status: row.status, + startedAt: timestampToIso(row.started_at), + currentAttempt: Number(row.current_attempt ?? 0), + codexThreadId: row.codex_thread_id, + activeTurnId: row.active_turn_id, + }; +} + +function taskIsUnclaimedMovable(task: QueueTask): boolean { + return (task.status === "queued" || task.status === "retry_wait") + && task.startedAt === null + && task.currentAttempt === 0 + && task.codexThreadId === null + && task.activeTurnId === null; +} + +function databaseTaskMoveBlocker(row: DatabaseTaskStatusRow | null): string { + if (row === null) return "task not found"; + if (row.status !== "queued" && row.status !== "retry_wait") return `status=${row.status}`; + if (row.started_at !== null) return "task already has started_at"; + if (Number(row.current_attempt ?? 0) !== 0) return `task already has current_attempt=${Number(row.current_attempt ?? 0)}`; + if (row.codex_thread_id !== null) return "task already has codex_thread_id"; + if (row.active_turn_id !== null) return "task already has active_turn_id"; + return ""; +} + +function taskMoveBlocker(task: QueueTask): string { + if (activeRunForTask(task) !== null) return "task has an active agent run"; + if (processingQueues.has(queueIdOf(task))) return "queue processor is currently active"; + if (activeRunSlotReservations.has(queueIdOf(task))) return "queue is reserving an active run slot"; + if (activeRunSlotWaiters.some((waiter) => waiter.taskId === task.id || waiter.queueId === queueIdOf(task))) return "queue is waiting for an active run slot"; + if (task.status !== "queued" && task.status !== "retry_wait") return `status=${task.status}`; + if (!taskIsUnclaimedMovable(task)) return "task has already been claimed"; + return ""; +} + +function reconcileHotTaskFromDatabase(task: QueueTask): QueueTask { + const existing = findTask(task.id); + if (existing === null) return rememberHotTask(task); + if (activeRunForTask(existing) !== null) return existing; + Object.assign(existing, task); + return existing; +} + +function taskHasClaimMarkers(task: QueueTask): boolean { + return task.status === "running" + || task.status === "judging" + || task.startedAt !== null + || task.currentAttempt > 0 + || task.codexThreadId !== null + || task.activeTurnId !== null; +} + +function shouldPreferHotTaskOverDatabase(hotTask: QueueTask, databaseTask: QueueTask): boolean { + if (activeRunForTask(hotTask) !== null) return true; + if (taskIsUnclaimedMovable(hotTask) && taskHasClaimMarkers(databaseTask)) return false; + const hotUpdatedAt = timestampMs(hotTask.updatedAt) ?? 0; + const databaseUpdatedAt = timestampMs(databaseTask.updatedAt) ?? 0; + return hotUpdatedAt >= databaseUpdatedAt; +} + +async function deleteTaskFromDatabase(taskId: string): Promise<void> { + if (!databaseReady) return; + await sql` + DELETE FROM unidesk_code_queue_tasks + WHERE id = ${taskId} + `; +} + +async function claimTaskInDatabase(task: QueueTask, expectedQueueId: string): Promise<boolean> { + if (!databaseReady) return true; + const claimed = await sql.begin(async (client) => await upsertTaskToDatabase(client, task, { claimQueueId: expectedQueueId })); + if (claimed) return true; + const databaseTask = await loadTaskFromDatabase(task.id); + if (databaseTask !== null) reconcileHotTaskFromDatabase(databaseTask); + logger("warn", "task_claim_conflict", { + taskId: task.id, + expectedQueueId, + attemptedQueueId: queueIdOf(task), + attemptedStatus: task.status, + attemptedCurrentAttempt: task.currentAttempt, + }); + return false; +} + +async function runDatabaseClaimMoveSelfTest(): Promise<JsonValue | null> { + if (!databaseReady) return null; + const suffix = String(Date.now()); + const taskId = `codex_claim_move_db_${suffix}`; + const queuedAt = nowIso(); + const sourceQueueId = `claim_move_db_source_${suffix}`; + const targetQueueId = `claim_move_db_target_${suffix}`; + const before = state.tasks.slice(); + const beforeQueues = state.queues.slice(); + await deleteTaskFromDatabase(taskId); + try { + const queuedTask = normalizeTask({ + ...createTask({ prompt: "claim/move DB race self-test", queueId: sourceQueueId }), + id: taskId, + queueId: sourceQueueId, + queueEnteredAt: queuedAt, + createdAt: queuedAt, + updatedAt: queuedAt, + output: [], + }); + await sql.begin(async (client) => { + await upsertQueueToDatabase(client, { id: sourceQueueId, name: sourceQueueId, createdAt: queuedAt, updatedAt: queuedAt }); + await upsertTaskToDatabase(client, queuedTask); + }); + const staleHotTask = normalizeTask(JSON.parse(JSON.stringify(queuedTask)) as QueueTask); + const claimedTask = normalizeTask(JSON.parse(JSON.stringify(queuedTask)) as QueueTask); + const claimedAt = nowIso(); + claimedTask.status = "running"; + claimedTask.startedAt = claimedAt; + claimedTask.currentAttempt = 1; + claimedTask.currentMode = "initial"; + claimedTask.updatedAt = claimedAt; + const claimed = await claimTaskInDatabase(claimedTask, sourceQueueId); + if (!claimed) throw new Error("database claim self-test failed to claim queued task"); + state.tasks.splice(0, state.tasks.length, staleHotTask); + const response = await moveTaskToQueue(staleHotTask, new Request(`http://code-queue.local/api/tasks/${taskId}/move`, { + method: "POST", + body: JSON.stringify({ queueId: targetQueueId }), + headers: { "content-type": "application/json" }, + })); + const after = await loadTaskFromDatabase(taskId); + const body = await response.json() as Record<string, JsonValue>; + if (response.status !== 409) throw new Error(`database stale move should return 409, got ${response.status}`); + if (after === null) throw new Error("database self-test task disappeared after stale move"); + if (after.status !== "running") throw new Error(`database self-test task status changed to ${after.status}`); + if (queueIdOf(after) !== sourceQueueId) throw new Error(`database self-test task queue changed to ${queueIdOf(after)}`); + if (after.currentAttempt !== 1 || after.startedAt === null) throw new Error("database self-test task claim markers were lost"); + return { + ok: true, + taskId, + moveStatus: response.status, + databaseStatus: after.status, + databaseQueueId: queueIdOf(after), + currentAttempt: after.currentAttempt, + startedAt: after.startedAt, + response: body as JsonValue, + } as unknown as JsonValue; + } finally { + await deleteTaskFromDatabase(taskId); + await deleteDatabaseQueues([sourceQueueId, targetQueueId]); + state.tasks.splice(0, state.tasks.length, ...before); + state.queues.splice(0, state.queues.length, ...beforeQueues); + } +} + async function loadPrunedDatabaseTaskRows(where: "all" | "hot"): Promise<DatabaseTaskRow[]> { return await sql<DatabaseTaskRow[]>` SELECT id, updated_at, status, read_at, task_json @@ -1674,12 +1897,21 @@ function rememberHotTask(task: QueueTask): QueueTask { } async function findTaskForRead(taskId: string): Promise<QueueTask | null> { - return findTask(taskId) ?? await loadTaskFromDatabase(taskId); + const hotTask = findTask(taskId); + if (!databaseReady) return hotTask; + const databaseTask = await loadTaskFromDatabase(taskId); + if (hotTask === null) return databaseTask; + if (databaseTask === null) return hotTask; + return shouldPreferHotTaskOverDatabase(hotTask, databaseTask) ? hotTask : databaseTask; } async function findTaskForMutation(taskId: string): Promise<QueueTask | null> { - const task = findTask(taskId) ?? await loadTaskFromDatabase(taskId); - return task === null ? null : rememberHotTask(task); + const hotTask = findTask(taskId); + if (!databaseReady) return hotTask; + const databaseTask = await loadTaskFromDatabase(taskId); + if (databaseTask === null) return hotTask; + if (hotTask === null) return rememberHotTask(databaseTask); + return shouldPreferHotTaskOverDatabase(hotTask, databaseTask) ? hotTask : reconcileHotTaskFromDatabase(databaseTask); } async function loadNextSeqFromDatabase(): Promise<number> { @@ -1703,6 +1935,7 @@ async function flushDirtyTasksToDatabase(force = false): Promise<void> { dirtyDatabaseTaskIds.clear(); dirtyDatabaseQueueIds.clear(); databaseFlushInFlight = true; + const rejectedTaskIds: string[] = []; try { await sql.begin(async (client) => { for (const id of queueIds) { @@ -1711,7 +1944,7 @@ async function flushDirtyTasksToDatabase(force = false): Promise<void> { } for (const id of ids) { const task = state.tasks.find((item) => item.id === id); - if (task !== undefined) await upsertTaskToDatabase(client, task); + if (task !== undefined && !await upsertTaskToDatabase(client, task)) rejectedTaskIds.push(id); } }); databaseLastError = null; @@ -1723,6 +1956,10 @@ async function flushDirtyTasksToDatabase(force = false): Promise<void> { databaseFlushInFlight = false; if (dirtyDatabaseTaskIds.size > 0 || dirtyDatabaseQueueIds.size > 0) scheduleDatabaseFlush(); } + for (const id of rejectedTaskIds) { + const databaseTask = await loadTaskFromDatabase(id); + if (databaseTask !== null) reconcileHotTaskFromDatabase(databaseTask); + } } async function initDatabasePersistence(): Promise<void> { @@ -2446,6 +2683,7 @@ configureSelfTests({ defaultQueueId, enqueueActiveRunSlotWaiter, injectReferencedTaskContext, + moveTaskToQueueForTest: (task, req) => moveTaskToQueue(task, req, { bypassRoleCheck: true }), nextRunnableTaskFrom, normalizeTask, nowIso, @@ -2454,6 +2692,8 @@ configureSelfTests({ queuedStatusReason, removeActiveRunSlotWaiter, resolveReasoningEffort, + runDatabaseClaimMoveSelfTest, + tasks: () => state.tasks, updateProcessingFlag, }); @@ -2979,7 +3219,8 @@ function failTaskForFallbackRetryLimit(task: QueueTask, judge: JudgeResult | nul } async function runTask(task: QueueTask): Promise<void> { - logger("info", "task_processor_start", { taskId: task.id, queueId: queueIdOf(task), providerId: task.providerId, executionMode: task.executionMode, cwd: task.cwd, maxAttempts: task.maxAttempts, model: task.model, agentPort: codeAgentPortForModel(task.model), promptPreview: safePreview(task.prompt, 240) }); + const claimQueueId = queueIdOf(task); + logger("info", "task_processor_start", { taskId: task.id, queueId: claimQueueId, providerId: task.providerId, executionMode: task.executionMode, cwd: task.cwd, maxAttempts: task.maxAttempts, model: task.model, agentPort: codeAgentPortForModel(task.model), promptPreview: safePreview(task.prompt, 240) }); if (task.status === "retry_wait" && task.lastJudge?.source === "fallback" && task.lastJudge.decision === "retry" && fallbackJudgeRetryCount(task) >= fallbackJudgeRetryLimit) { failTaskForFallbackRetryLimit(task, task.lastJudge); return; @@ -3010,6 +3251,11 @@ async function runTask(task: QueueTask): Promise<void> { task.readAt = null; task.finishedAt = null; task.updatedAt = startedAt; + if (!await claimTaskInDatabase(task, claimQueueId)) { + releaseRunSlot(); + return; + } + publishTaskOaEvent(task, "claim"); logger("info", "task_run_start", { taskId: task.id, queueId: queueIdOf(task), attempt: task.currentAttempt, mode, providerId: task.providerId, executionMode: task.executionMode, cwd: task.cwd, maxAttempts: task.maxAttempts, model: task.model, agentPort: codeAgentPortForModel(task.model), freshRecovery: needsFreshRecoveryPrompt }); const attemptStartOutput = appendOutput(task, "system", `attempt ${task.currentAttempt}/${task.maxAttempts} queue=${queueIdOf(task)} provider=${task.providerId} executionMode=${task.executionMode} cwd=${task.cwd} mode=${mode} model=${task.model} port=${codeAgentPortForModel(task.model)}\n`, "queue"); @@ -3997,7 +4243,9 @@ function queueMergeBlocker(queueId: string): string | null { if (activeRunSlotReservations.has(queueId)) return "queue is reserving an active run slot"; if (activeRunSlotWaiters.some((waiter) => waiter.queueId === queueId)) return "queue is waiting for an active run slot"; const activeTask = state.tasks.find((task) => queueIdOf(task) === queueId && (task.status === "running" || task.status === "judging")); - return activeTask === undefined ? null : `task ${activeTask.id} is ${activeTask.status}`; + if (activeTask !== undefined) return `task ${activeTask.id} is ${activeTask.status}`; + const claimedPendingTask = state.tasks.find((task) => queueIdOf(task) === queueId && (task.status === "queued" || task.status === "retry_wait") && !taskIsUnclaimedMovable(task)); + return claimedPendingTask === undefined ? null : `task ${claimedPendingTask.id} has already been claimed`; } function parseSourceQueueIds(record: Record<string, unknown>, targetQueueId: string): string[] { @@ -4020,27 +4268,117 @@ function parseSourceQueueIds(record: Record<string, unknown>, targetQueueId: str return ids; } -async function mergeDatabaseQueueTasks(sourceQueueIds: string[], targetQueueId: string): Promise<string[]> { - if (!databaseReady || sourceQueueIds.length === 0) return []; - const rows = await sql<Array<{ id: string }>>` - UPDATE unidesk_code_queue_tasks - SET - queue_id = ${targetQueueId}, - task_json = jsonb_set( - jsonb_set( - task_json, - '{queueId}', - to_jsonb(${targetQueueId}::text), +async function mergeDatabaseQueueTasks(sourceQueueIds: string[], targetQueueId: string, mergedAt: string): Promise<{ movedTaskIds: string[]; blocker: DatabaseTaskStatusRow | null }> { + if (!databaseReady || sourceQueueIds.length === 0) return { movedTaskIds: [], blocker: null }; + return await sql.begin(async (client) => { + const mergeQueueIds = Array.from(new Set([targetQueueId, ...sourceQueueIds])); + const lockedRows = await client<DatabaseTaskStatusRow[]>` + SELECT id, queue_id, status, started_at, current_attempt, codex_thread_id, active_turn_id + FROM unidesk_code_queue_tasks + WHERE queue_id IN ${client(mergeQueueIds)} + ORDER BY updated_at DESC, id DESC + FOR UPDATE + `; + const blocker = lockedRows.find((row) => { + return row.status === "running" + || row.status === "judging" + || ( + (row.status === "queued" || row.status === "retry_wait") + && ( + row.started_at !== null + || Number(row.current_attempt ?? 0) > 0 + || row.codex_thread_id !== null + || row.active_turn_id !== null + ) + ); + }) ?? null; + if (blocker !== null) return { movedTaskIds: [], blocker }; + const rows = await client<Array<{ id: string }>>` + UPDATE unidesk_code_queue_tasks + SET + queue_id = ${targetQueueId}, + updated_at = ${mergedAt}, + task_json = jsonb_set( + jsonb_set( + jsonb_set( + task_json, + '{queueId}', + to_jsonb(${targetQueueId}::text), + true + ), + '{queueEnteredAt}', + to_jsonb(COALESCE(NULLIF(task_json->>'queueEnteredAt', ''), to_char(created_at AT TIME ZONE 'UTC', 'YYYY-MM-DD"T"HH24:MI:SS.MS"Z"'))::text), + true + ), + '{updatedAt}', + to_jsonb(${mergedAt}::text), true - ), - '{queueEnteredAt}', - to_jsonb(COALESCE(NULLIF(task_json->>'queueEnteredAt', ''), to_char(created_at AT TIME ZONE 'UTC', 'YYYY-MM-DD"T"HH24:MI:SS.MS"Z"'))::text), - true - ) - WHERE queue_id IN ${sql(sourceQueueIds)} - RETURNING id - `; - return rows.map((row) => row.id); + ) + WHERE queue_id IN ${client(sourceQueueIds)} + AND ( + status IN ('succeeded', 'failed', 'canceled') + OR ( + status IN ('queued', 'retry_wait') + AND started_at IS NULL + AND current_attempt = 0 + AND codex_thread_id IS NULL + AND active_turn_id IS NULL + ) + ) + RETURNING id + `; + return { movedTaskIds: rows.map((row) => row.id), blocker: null }; + }); +} + +async function moveDatabaseTaskToQueue(taskId: string, targetQueueId: string, movedAt: string): Promise<{ ok: boolean; row: DatabaseTaskStatusRow | null; previousQueueId: string | null; blocker: string }> { + if (!databaseReady) return { ok: true, row: null, previousQueueId: null, blocker: "" }; + return await sql.begin(async (client) => { + const rows = await client<DatabaseTaskStatusRow[]>` + SELECT id, queue_id, status, started_at, current_attempt, codex_thread_id, active_turn_id + FROM unidesk_code_queue_tasks + WHERE id = ${taskId} + LIMIT 1 + FOR UPDATE + `; + const row = rows[0] ?? null; + const blocker = databaseTaskMoveBlocker(row); + if (blocker.length > 0) return { ok: false, row, previousQueueId: row === null ? null : safeQueueId(row.queue_id), blocker }; + const previousQueueId = safeQueueId(row?.queue_id); + const updated = await client<DatabaseTaskStatusRow[]>` + UPDATE unidesk_code_queue_tasks + SET + queue_id = ${targetQueueId}, + updated_at = ${movedAt}, + task_json = jsonb_set( + jsonb_set( + jsonb_set( + task_json, + '{queueId}', + to_jsonb(${targetQueueId}::text), + true + ), + '{queueEnteredAt}', + to_jsonb(${movedAt}::text), + true + ), + '{updatedAt}', + to_jsonb(${movedAt}::text), + true + ) + WHERE id = ${taskId} + AND status IN ('queued', 'retry_wait') + AND started_at IS NULL + AND current_attempt = 0 + AND codex_thread_id IS NULL + AND active_turn_id IS NULL + RETURNING id, queue_id, status, started_at, current_attempt, codex_thread_id, active_turn_id + `; + const updatedRow = updated[0] ?? null; + return updatedRow === null + ? { ok: false, row, previousQueueId, blocker: "conditional update matched no rows" } + : { ok: true, row: updatedRow, previousQueueId, blocker: "" }; + }); } async function deleteDatabaseQueues(queueIds: string[]): Promise<string[]> { @@ -4097,6 +4435,17 @@ async function mergeQueues(targetQueueIdValue: string | null, req: Request): Pro } const mergedAt = nowIso(); + const databaseMerge = await mergeDatabaseQueueTasks(sourceQueueIds, targetQueueId, mergedAt); + if (databaseMerge.blocker !== null) { + const blockerQueueId = safeQueueId(databaseMerge.blocker.queue_id); + const databaseTask = await loadTaskFromDatabase(databaseMerge.blocker.id); + if (databaseTask !== null) reconcileHotTaskFromDatabase(databaseTask); + return jsonResponse({ + ok: false, + error: `cannot merge queue ${blockerQueueId}: task ${databaseMerge.blocker.id} is already claimed (${databaseTaskMoveBlocker(databaseMerge.blocker) || databaseMerge.blocker.status})`, + blocker: databaseStatusRowJson(databaseMerge.blocker), + }, 409); + } const targetQueue = ensureQueue(targetQueueId); const sourceQueues = sourceQueueIds.map((id) => queueSnapshot(id, mergedAt)); targetQueue.updatedAt = mergedAt; @@ -4109,11 +4458,11 @@ async function mergeQueues(targetQueueIdValue: string | null, req: Request): Pro if (!sourceSet.has(previousQueueId)) continue; task.queueEnteredAt = taskQueueEnteredAt(task); task.queueId = targetQueueId; + task.updatedAt = mergedAt; hotMovedTasks.push(task); markTaskDirty(task.id); publishTaskOaEvent(task, "queue-merged"); } - const databaseMovedTaskIds = await mergeDatabaseQueueTasks(sourceQueueIds, targetQueueId); const deletedSourceQueues = deleteQueuesFromState(sourceQueueIds); const databaseDeletedQueueIds = await deleteDatabaseQueues(sourceQueueIds); persistState(false); @@ -4125,14 +4474,14 @@ async function mergeQueues(targetQueueIdValue: string | null, req: Request): Pro sourceQueueIds, deletedSourceQueueIds: deletedSourceQueues.map((queue) => queue.id), hotMovedTaskCount: hotMovedTasks.length, - databaseMovedTaskCount: databaseReady ? databaseMovedTaskIds.length : null, + databaseMovedTaskCount: databaseReady ? databaseMerge.movedTaskIds.length : null, databaseDeletedQueueIds: databaseReady ? databaseDeletedQueueIds : null, }); for (const id of mergeQueueIds) mergingQueues.delete(id); scheduleQueue(targetQueueId); await flushDirtyTasksToDatabase(true); const tasks = await loadAllTasksForRead(); - const movedIdSet = new Set(databaseReady ? databaseMovedTaskIds : hotMovedTasks.map((task) => task.id)); + const movedIdSet = new Set(databaseReady ? databaseMerge.movedTaskIds : hotMovedTasks.map((task) => task.id)); const orderedMovedTaskIds = tasks .filter((task) => movedIdSet.has(task.id)) .sort(compareTaskQueueOrder) @@ -4145,7 +4494,7 @@ async function mergeQueues(targetQueueIdValue: string | null, req: Request): Pro ok: true, targetQueueId, sourceQueueIds, - mergedTaskCount: databaseReady ? databaseMovedTaskIds.length : hotMovedTasks.length, + mergedTaskCount: databaseReady ? databaseMerge.movedTaskIds.length : hotMovedTasks.length, movedTaskIds: orderedMovedTaskIds.slice(0, 500), targetTaskOrder: targetTaskOrder.slice(0, 500), order: "merged tasks keep their original queueEnteredAt/createdAt ordering; source queue records are deleted after merge", @@ -4161,17 +4510,37 @@ async function mergeQueues(targetQueueIdValue: string | null, req: Request): Pro } } -async function moveTaskToQueue(task: QueueTask, req: Request): Promise<Response> { - if (!serviceRoleAllowsWrite(config.serviceRole)) return readOnlyRejectResponse(req.method, `/api/tasks/${task.id}/move`); - if (task.status === "running" || task.status === "judging") { - return jsonResponse({ ok: false, error: `cannot move active task ${task.id} while status=${task.status}`, task: taskForResponse(task) }, 409); - } +async function moveTaskToQueue(task: QueueTask, req: Request, options: { bypassRoleCheck?: boolean } = {}): Promise<Response> { + if (options.bypassRoleCheck !== true && !serviceRoleAllowsWrite(config.serviceRole)) return readOnlyRejectResponse(req.method, `/api/tasks/${task.id}/move`); const body = await readJson(req); const record = typeof body === "object" && body !== null && !Array.isArray(body) ? body as Record<string, unknown> : {}; const queueId = normalizeQueueId(record.queueId ?? record.id); - const previousQueueId = queueIdOf(task); - const queue = ensureQueue(queueId); const movedAt = nowIso(); + const hotBlocker = taskMoveBlocker(task); + if (hotBlocker.length > 0) { + const databaseTask = databaseReady ? await loadTaskFromDatabase(task.id) : null; + if (databaseTask !== null) task = reconcileHotTaskFromDatabase(databaseTask); + return jsonResponse({ + ok: false, + error: `cannot move task ${task.id}: ${hotBlocker}`, + task: taskForResponse(task), + databaseTask: databaseTask === null ? null : taskForResponse(databaseTask), + }, 409); + } + const databaseMove = await moveDatabaseTaskToQueue(task.id, queueId, movedAt); + if (!databaseMove.ok) { + const databaseTask = databaseReady ? await loadTaskFromDatabase(task.id) : null; + if (databaseTask !== null) task = reconcileHotTaskFromDatabase(databaseTask); + return jsonResponse({ + ok: false, + error: `cannot move task ${task.id}: ${databaseMove.blocker}`, + blocker: databaseStatusRowJson(databaseMove.row), + task: taskForResponse(task), + databaseTask: databaseTask === null ? null : taskForResponse(databaseTask), + }, databaseMove.row === null ? 404 : 409); + } + const previousQueueId = databaseMove.previousQueueId ?? queueIdOf(task); + const queue = ensureQueue(queueId); queue.updatedAt = movedAt; markQueueDirty(queue.id); task.queueId = queueId; @@ -4301,6 +4670,7 @@ async function route(req: Request): Promise<Response> { if (url.pathname === "/api/judge/probe" && (req.method === "GET" || req.method === "POST")) return await runJudgeProbe(); if (url.pathname === "/api/judge/self-test" && (req.method === "GET" || req.method === "POST")) return jsonResponse(runJudgeInfraSelfTest()); if (url.pathname === "/api/queue-order/self-test" && (req.method === "GET" || req.method === "POST")) return jsonResponse(runQueueOrderingSelfTest()); + if (url.pathname === "/api/queue-claim-move/self-test" && (req.method === "GET" || req.method === "POST")) return jsonResponse(await runQueueClaimMoveSelfTest()); if (url.pathname === "/api/reference-injection/self-test" && (req.method === "GET" || req.method === "POST")) return jsonResponse(await runReferenceInjectionSelfTest()); if (url.pathname === "/api/trace-port/self-test" && (req.method === "GET" || req.method === "POST")) return jsonResponse(runTracePortSelfTest()); if (url.pathname === "/api/oa/backfill" && (req.method === "GET" || req.method === "POST")) return jsonResponse(await backfillOaTraceStats(url)); diff --git a/src/components/microservices/code-queue/src/queue-api.ts b/src/components/microservices/code-queue/src/queue-api.ts index 82a99661..442c6ed9 100644 --- a/src/components/microservices/code-queue/src/queue-api.ts +++ b/src/components/microservices/code-queue/src/queue-api.ts @@ -444,7 +444,8 @@ async function loadAllTasksForRead(): Promise<QueueTask[]> { const tasks = await ctx().loadTasksFromDatabase("all"); const byId = new Map(tasks.map((task) => [task.id, task])); for (const active of ctx().tasks()) { - byId.set(active.id, active); + const databaseTask = byId.get(active.id); + if (databaseTask === undefined || preferHotTaskForRead(active, databaseTask)) byId.set(active.id, active); } ctx().runGarbageCollection(); return Array.from(byId.values()).sort((left, right) => (timestampMs(left.createdAt) ?? 0) - (timestampMs(right.createdAt) ?? 0) || left.id.localeCompare(right.id)); @@ -581,6 +582,30 @@ function activePriority(task: QueueTask): number { return statusRank[task.status] ?? 9; } +function taskHasClaimMarkers(task: QueueTask): boolean { + return task.status === "running" + || task.status === "judging" + || task.startedAt !== null + || task.currentAttempt > 0 + || task.codexThreadId !== null + || task.activeTurnId !== null; +} + +function taskIsUnclaimedQueued(task: QueueTask): boolean { + return (task.status === "queued" || task.status === "retry_wait") + && task.startedAt === null + && task.currentAttempt === 0 + && task.codexThreadId === null + && task.activeTurnId === null; +} + +function preferHotTaskForRead(hotTask: QueueTask, databaseTask: QueueTask): boolean { + const hotActiveRun = Array.from(ctx().activeRuns.values()).some((run) => run.taskId === hotTask.id); + if (hotActiveRun) return true; + if (taskIsUnclaimedQueued(hotTask) && taskHasClaimMarkers(databaseTask)) return false; + return taskUpdatedSortValue(hotTask) >= taskUpdatedSortValue(databaseTask); +} + function taskUpdatedSortValue(task: QueueTask): number { const time = Date.parse(task.updatedAt || task.createdAt); return Number.isFinite(time) ? time : 0; @@ -1051,7 +1076,8 @@ async function databaseTasksOverviewResponse(url: URL): Promise<Response | null> const byId = new Map<string, QueueTask>(); for (const task of loadedTasks) byId.set(task.id, task); for (const task of ctx().tasks()) { - if (seenIds.has(task.id) || byId.has(task.id)) byId.set(task.id, task); + const databaseTask = byId.get(task.id); + if ((seenIds.has(task.id) || databaseTask !== undefined) && (databaseTask === undefined || preferHotTaskForRead(task, databaseTask))) byId.set(task.id, task); } const rowsSource = orderedIds .map((id) => byId.get(id) ?? null) @@ -1063,8 +1089,13 @@ async function databaseTasksOverviewResponse(url: URL): Promise<Response | null> let selectedTask: QueueTask | null = null; for (const id of selectedCandidates) { if (id.length === 0) continue; - const candidate = ctx().tasks().find((task) => task.id === id) - ?? await ctx().loadTaskFromDatabase(id); + const hotCandidate = ctx().tasks().find((task) => task.id === id) ?? null; + const databaseCandidate = await ctx().loadTaskFromDatabase(id); + const candidate = hotCandidate === null + ? databaseCandidate + : databaseCandidate === null || preferHotTaskForRead(hotCandidate, databaseCandidate) + ? hotCandidate + : databaseCandidate; if (candidate !== null && taskMatchesQueueFilter(candidate, queueId)) { selectedTask = candidate; break; diff --git a/src/components/microservices/code-queue/src/self-tests.ts b/src/components/microservices/code-queue/src/self-tests.ts index 76c9e44e..82d40cc9 100644 --- a/src/components/microservices/code-queue/src/self-tests.ts +++ b/src/components/microservices/code-queue/src/self-tests.ts @@ -17,6 +17,7 @@ export interface SelfTestsContext { defaultQueueId: string; enqueueActiveRunSlotWaiter: (task: QueueTask) => ActiveRunSlotWaiter; injectReferencedTaskContext: (request: QueueTaskRequest, finder?: (id: string) => QueueTask | null | Promise<QueueTask | null>, injectedAt?: string) => Promise<QueueTaskRequest>; + moveTaskToQueueForTest: (task: QueueTask, req: Request) => Promise<Response>; nextRunnableTaskFrom: (queueId: string, tasks?: QueueTask[]) => QueueTask | null; normalizeTask: (task: QueueTask) => QueueTask; nowIso: () => string; @@ -25,6 +26,8 @@ export interface SelfTestsContext { queuedStatusReason: (task: QueueTask, tasks?: QueueTask[]) => QueuedStatusReason | null; removeActiveRunSlotWaiter: (waiter: ActiveRunSlotWaiter) => void; resolveReasoningEffort: (model: string, explicit?: string | null) => string | null; + runDatabaseClaimMoveSelfTest?: () => Promise<JsonValue | null>; + tasks: () => QueueTask[]; updateProcessingFlag: () => void; } @@ -192,6 +195,45 @@ function queueOrderTestTask(id: string, status: TaskStatus, createdAt: string, q return ctx().normalizeTask(task); } +async function runQueueClaimMoveSelfTest(): Promise<JsonValue> { + const at = "2026-05-17T06:09:46.702Z"; + const task = queueOrderTestTask("codex_claim_move_self_test", "running", at, at); + task.queueId = "claim_move_source"; + task.startedAt = at; + task.currentAttempt = 1; + task.currentMode = "initial"; + task.codexThreadId = "thread_claim_move_self_test"; + task.activeTurnId = "turn_claim_move_self_test"; + task.updatedAt = at; + const before = ctx().tasks().slice(); + ctx().tasks().push(task); + try { + const response = await ctx().moveTaskToQueueForTest(task, new Request("http://code-queue.local/api/tasks/codex_claim_move_self_test/move", { + method: "POST", + body: JSON.stringify({ queueId: "claim_move_target" }), + headers: { "content-type": "application/json" }, + })); + const body = await response.json() as Record<string, JsonValue>; + assertReferenceTest(response.status === 409, "moving a claimed/running task must return 409"); + assertReferenceTest(task.queueId === "claim_move_source", "running task queueId must remain unchanged after rejected move"); + assertReferenceTest(task.status === "running", "running task status must remain running after rejected move"); + assertReferenceTest(task.currentAttempt === 1, "running task currentAttempt must remain claimed after rejected move"); + const databaseRace = await ctx().runDatabaseClaimMoveSelfTest?.() ?? null; + return { + ok: true, + cases: [ + { name: "move_running_task_returns_409", ok: true, status: response.status }, + { name: "rejected_move_preserves_queue", ok: true, queueId: task.queueId }, + { name: "rejected_move_preserves_claim_markers", ok: true, status: task.status, currentAttempt: task.currentAttempt, startedAt: task.startedAt }, + ...(databaseRace === null ? [] : [{ name: "database_claim_blocks_stale_move", ok: true, result: databaseRace }]), + ], + response: body as JsonValue, + }; + } finally { + ctx().tasks().splice(0, ctx().tasks().length, ...before); + } +} + function runQueueOrderingSelfTest(): JsonValue { const activeRetry = queueOrderTestTask("codex_4000_active", "retry_wait", "2026-05-11T09:00:00.000Z", "2026-05-11T09:00:00.000Z"); const movedOlderCreated = queueOrderTestTask("codex_3999_moved", "queued", "2026-05-11T08:00:00.000Z", "2026-05-11T08:00:00.000Z") as QueueTask & { queueEnteredAt?: string }; @@ -476,4 +518,4 @@ function runJudgeInfraSelfTest(): JsonValue { }; } -export { runJudgeInfraSelfTest, runQueueOrderingSelfTest, runReferenceInjectionSelfTest, runTracePortSelfTest }; +export { runJudgeInfraSelfTest, runQueueClaimMoveSelfTest, runQueueOrderingSelfTest, runReferenceInjectionSelfTest, runTracePortSelfTest }; From a7593c14733cb2265411994fdf9dcad0629730c5 Mon Sep 17 00:00:00 2001 From: Codex <codex@noreply.local> Date: Sun, 17 May 2026 06:57:53 +0000 Subject: [PATCH 13/14] test: exercise code queue claim move race --- src/components/microservices/code-queue/src/index.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/components/microservices/code-queue/src/index.ts b/src/components/microservices/code-queue/src/index.ts index 571186ba..c270a997 100644 --- a/src/components/microservices/code-queue/src/index.ts +++ b/src/components/microservices/code-queue/src/index.ts @@ -1573,7 +1573,7 @@ async function runDatabaseClaimMoveSelfTest(): Promise<JsonValue | null> { method: "POST", body: JSON.stringify({ queueId: targetQueueId }), headers: { "content-type": "application/json" }, - })); + }), { bypassRoleCheck: true }); const after = await loadTaskFromDatabase(taskId); const body = await response.json() as Record<string, JsonValue>; if (response.status !== 409) throw new Error(`database stale move should return 409, got ${response.status}`); From f3af35dffead30d4ae814b5831db14d183988aaf Mon Sep 17 00:00:00 2001 From: Codex <codex@noreply.local> Date: Sun, 17 May 2026 08:07:54 +0000 Subject: [PATCH 14/14] fix: lighten code queue startup task load --- src/components/microservices/code-queue/src/index.ts | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/components/microservices/code-queue/src/index.ts b/src/components/microservices/code-queue/src/index.ts index c270a997..0f826d69 100644 --- a/src/components/microservices/code-queue/src/index.ts +++ b/src/components/microservices/code-queue/src/index.ts @@ -1600,6 +1600,14 @@ async function runDatabaseClaimMoveSelfTest(): Promise<JsonValue | null> { } async function loadPrunedDatabaseTaskRows(where: "all" | "hot"): Promise<DatabaseTaskRow[]> { + if (where === "hot") { + return await sql<DatabaseTaskRow[]>` + SELECT id, updated_at, status, read_at, task_json - 'output' - 'events' AS task_json + FROM unidesk_code_queue_tasks + WHERE status IN ('queued', 'running', 'judging', 'retry_wait') + ORDER BY created_at ASC, id ASC + `; + } return await sql<DatabaseTaskRow[]>` SELECT id, updated_at, status, read_at, task_json FROM (