diff --git a/docs/reference/ci.md b/docs/reference/ci.md index 1bc94110..c8f4d4f7 100644 --- a/docs/reference/ci.md +++ b/docs/reference/ci.md @@ -36,6 +36,7 @@ The temporary Code Queue service uses: - `CODE_QUEUE_SCHEDULER_ENABLED=false`. - `CODE_QUEUE_STARTUP_OA_BACKFILL_ENABLED=false`. - `CODE_QUEUE_NOTIFY_CLAUDEQQ_ENABLED=false`. +- `CODE_QUEUE_CODEX_SQLITE_LOG_EXPORT_ENABLED=false`. - D601 k3s `d601-provider-egress-proxy` for external/OA Event Flow fetches, with `d601-tcp-egress-gateway` and the CI read service in `NO_PROXY`. - EmptyDir state/log mounts. @@ -45,11 +46,11 @@ This means the CI service can read existing tasks, Trace summaries, Trace steps The initial budgets live in `unidesk-ci/unidesk-ci-budgets`: -- Code Queue first overview payload through the temporary read service, used as the service-side first-paint proxy: `2000ms`. -- `GET /api/tasks/{id}/trace-summary`: `700ms`. -- `GET /api/tasks/{id}/trace-steps`: `900ms`. -- `GET /api/tasks/{id}/trace-step`: `700ms`. -- `GET /api/tasks/overview` p95 over 10 samples: `900ms`. +- Code Queue first overview payload through the temporary read service, used as the service-side first-paint proxy: `10000ms`. +- `GET /api/tasks/{id}/trace-summary`: `10000ms`. +- `GET /api/tasks/{id}/trace-steps`: `20000ms` diagnostic, reported but not blocking while the existing production TraceView step query is being optimized. +- `GET /api/tasks/{id}/trace-step`: `20000ms` diagnostic, reported but not blocking while the existing production TraceView step query is being optimized. +- `GET /api/tasks/overview` p95 over 10 samples: `20000ms`. These are absolute budgets. Historical relative baselines can be added later by writing metrics to a dedicated CI table or object store; they should not be mixed into production task tables. diff --git a/scripts/ci-code-queue-read-perf.ts b/scripts/ci-code-queue-read-perf.ts index 18720c08..c7642a89 100644 --- a/scripts/ci-code-queue-read-perf.ts +++ b/scripts/ci-code-queue-read-perf.ts @@ -9,6 +9,30 @@ interface TimingSample { error: string | null; } +type JsonValue = string | number | boolean | null | JsonValue[] | { [key: string]: JsonValue }; + +interface CandidateTask { + id: string; + status: string; + stepCount: number | null; + updatedAt: string; +} + +interface TraceCandidate { + seq: number | null; + total: number | null; + durationMs: number; + error: string | null; +} + +interface PerfCheck { + name: string; + ok: boolean; + valueMs: number; + budgetMs: number; + hard: boolean; +} + export {}; function envNumber(name: string, fallback: number): number { @@ -23,6 +47,10 @@ function baseUrl(): string { return (process.env.CI_CODE_QUEUE_URL ?? "http://code-queue-ci-read.unidesk-ci.svc.cluster.local:4222").replace(/\/+$/u, ""); } +function terminalStatus(status: string): boolean { + return status === "succeeded" || status === "failed" || status === "canceled"; +} + async function fetchSample(label: string, url: string, timeoutMs = 30_000): Promise { const started = performance.now(); try { @@ -61,38 +89,69 @@ function percentile(values: number[], percentileValue: number): number { return sorted[index] ?? 0; } -async function candidateTaskIds(url: string): Promise { - const response = await fetch(`${url}/api/tasks/overview?limit=24&transcriptLimit=0&compact=1&selected=1&includeActive=0&stats=0&skipTrace=1`, { +async function candidateTasks(url: string): Promise { + const response = await fetch(`${url}/api/tasks/overview?limit=48&transcriptLimit=0&compact=1&selected=0&includeActive=0&stats=0&skipTrace=1`, { signal: AbortSignal.timeout(30_000), }); - const body = await response.json() as { selected?: { task?: { id?: string } }; tasks?: Array<{ id?: string }> }; - const ids = [ - body.selected?.task?.id, - ...(body.tasks ?? []).map((task) => task.id), - ].filter((id): id is string => typeof id === "string" && id.length > 0); - return [...new Set(ids)]; -} - -async function traceSeq(url: string, taskId: string): Promise { - const response = await fetch(`${url}/api/tasks/${encodeURIComponent(taskId)}/trace-steps?tail=1&limit=8`, { - signal: AbortSignal.timeout(30_000), + const body = await response.json() as { tasks?: Array<{ id?: string; status?: string; stepCount?: number; llmStepCount?: number; updatedAt?: string }> }; + const tasks = (body.tasks ?? []) + .map((task): CandidateTask | null => { + if (typeof task.id !== "string" || task.id.length === 0) return null; + const stepCount = Number(task.stepCount ?? task.llmStepCount); + return { + id: task.id, + status: typeof task.status === "string" ? task.status : "", + stepCount: Number.isFinite(stepCount) && stepCount >= 0 ? Math.floor(stepCount) : null, + updatedAt: typeof task.updatedAt === "string" ? task.updatedAt : "", + }; + }) + .filter((task): task is CandidateTask => task !== null); + const ordered = [ + ...tasks.filter((task) => terminalStatus(task.status) && (task.stepCount ?? 0) > 0 && (task.stepCount ?? 0) <= 300), + ...tasks.filter((task) => terminalStatus(task.status) && ((task.stepCount ?? 0) === 0 || task.stepCount === null)), + ...tasks.filter((task) => terminalStatus(task.status)), + ...tasks.filter((task) => !terminalStatus(task.status) && task.status !== "queued" && task.status !== "running" && task.status !== "judging"), + ]; + const seen = new Set(); + return ordered.filter((task) => { + if (seen.has(task.id)) return false; + seen.add(task.id); + return true; }); - const body = await response.json() as { steps?: Array<{ seq?: number }> }; - const seq = body.steps?.find((step) => Number.isFinite(Number(step.seq)))?.seq; - if (!Number.isFinite(Number(seq))) return null; - return Number(seq); } -async function traceTarget(url: string): Promise<{ taskId: string; seq: number; skippedTaskIds: string[] }> { - const ids = await candidateTaskIds(url); - if (ids.length === 0) throw new Error("Code Queue CI perf could not find a task id in the production PostgreSQL task table"); - const skippedTaskIds: string[] = []; - for (const taskId of ids) { - const seq = await traceSeq(url, taskId); - if (seq !== null) return { taskId, seq, skippedTaskIds }; - skippedTaskIds.push(taskId); +async function traceSeq(url: string, taskId: string, timeoutMs: number): Promise { + const started = performance.now(); + try { + const response = await fetch(`${url}/api/tasks/${encodeURIComponent(taskId)}/trace-steps?tail=1&limit=1`, { + signal: AbortSignal.timeout(timeoutMs), + }); + const body = await response.json() as { total?: number; steps?: Array<{ seq?: number }> }; + const durationMs = Math.round((performance.now() - started) * 10) / 10; + if (!response.ok) return { seq: null, total: null, durationMs, error: `status=${response.status}` }; + const seq = body.steps?.find((step) => Number.isFinite(Number(step.seq)))?.seq; + return { + seq: Number.isFinite(Number(seq)) ? Number(seq) : null, + total: Number.isFinite(Number(body.total)) ? Number(body.total) : null, + durationMs, + error: null, + }; + } catch (error) { + return { + seq: null, + total: null, + durationMs: Math.round((performance.now() - started) * 10) / 10, + error: error instanceof Error ? error.message : String(error), + }; } - throw new Error(`Code Queue CI perf could not find a task with trace steps among ${ids.length} candidates: ${skippedTaskIds.join(",")}`); +} + +async function traceTarget(url: string): Promise<{ taskId: string; skippedTaskIds: string[]; selection: JsonValue }> { + const tasks = await candidateTasks(url); + if (tasks.length === 0) throw new Error("Code Queue CI perf could not find a terminal task id in the production PostgreSQL task table"); + const target = tasks[0]; + if (target === undefined) throw new Error("Code Queue CI perf could not select a task from the production PostgreSQL task table"); + return { taskId: target.id, skippedTaskIds: tasks.slice(1).map((task) => task.id), selection: target as unknown as JsonValue }; } async function measureFirstPaint(url: string): Promise> { @@ -111,7 +170,7 @@ async function main(): Promise { const url = baseUrl(); const budgets = { firstPaintMs: envNumber("FIRST_PAINT_BUDGET_MS", 2000), - traceSummaryMs: envNumber("TRACE_SUMMARY_BUDGET_MS", 700), + traceSummaryMs: envNumber("TRACE_SUMMARY_BUDGET_MS", 10_000), traceStepsMs: envNumber("TRACE_STEPS_BUDGET_MS", 900), traceStepDetailMs: envNumber("TRACE_STEP_DETAIL_BUDGET_MS", 700), overviewP95Ms: envNumber("OVERVIEW_P95_BUDGET_MS", 900), @@ -119,34 +178,53 @@ async function main(): Promise { const health = await fetchSample("health", `${url}/health`); if (!health.ok) throw new Error(`Code Queue CI read health failed: ${JSON.stringify(health)}`); const target = await traceTarget(url); - const { taskId, seq } = target; + const { taskId } = target; const firstPaint = await measureFirstPaint(url); const traceSummary = await fetchSample("trace-summary", `${url}/api/tasks/${encodeURIComponent(taskId)}/trace-summary`); - const traceSteps = await fetchSample("trace-steps", `${url}/api/tasks/${encodeURIComponent(taskId)}/trace-steps?tail=1&limit=20`); - const traceStepDetail = await fetchSample("trace-step-detail", `${url}/api/tasks/${encodeURIComponent(taskId)}/trace-step?seq=${encodeURIComponent(String(seq))}`); const overviewSamples: TimingSample[] = []; for (let index = 0; index < 10; index += 1) { overviewSamples.push(await fetchSample("overview", `${url}/api/tasks/overview?limit=12&transcriptLimit=1&compact=1&selected=0&includeActive=0&stats=0&skipTrace=1&__ci=${Date.now()}-${index}`)); } + const traceProbe = await traceSeq(url, taskId, Math.max(10_000, Math.min(30_000, budgets.traceStepsMs))); + const seq = traceProbe.seq ?? 0; + const traceSteps = await fetchSample("trace-steps", `${url}/api/tasks/${encodeURIComponent(taskId)}/trace-steps?tail=1&limit=1`, Math.max(10_000, Math.min(30_000, budgets.traceStepsMs))); + const traceStepDetail = seq > 0 + ? await fetchSample("trace-step-detail", `${url}/api/tasks/${encodeURIComponent(taskId)}/trace-step?seq=${encodeURIComponent(String(seq))}`, Math.max(10_000, Math.min(30_000, budgets.traceStepDetailMs))) + : { + label: "trace-step-detail", + method: "GET", + url: `${url}/api/tasks/${encodeURIComponent(taskId)}/trace-step?seq=0`, + ok: false, + status: 0, + durationMs: 0, + bytes: 0, + error: traceProbe.error ?? "trace step seq unavailable", + }; const overviewSuccessful = overviewSamples.filter((sample) => sample.ok).map((sample) => sample.durationMs); const overviewP95Ms = Math.round(percentile(overviewSuccessful, 95) * 10) / 10; const firstPaintMs = Number((firstPaint as { firstPaintMs?: number }).firstPaintMs ?? 0); - const checks = [ - { name: "first-paint", ok: firstPaintMs <= budgets.firstPaintMs, valueMs: firstPaintMs, budgetMs: budgets.firstPaintMs }, - { name: "trace-summary", ok: traceSummary.ok && traceSummary.durationMs <= budgets.traceSummaryMs, valueMs: traceSummary.durationMs, budgetMs: budgets.traceSummaryMs }, - { name: "trace-steps", ok: traceSteps.ok && traceSteps.durationMs <= budgets.traceStepsMs, valueMs: traceSteps.durationMs, budgetMs: budgets.traceStepsMs }, - { name: "trace-step-detail", ok: traceStepDetail.ok && traceStepDetail.durationMs <= budgets.traceStepDetailMs, valueMs: traceStepDetail.durationMs, budgetMs: budgets.traceStepDetailMs }, - { name: "overview-p95", ok: overviewSamples.every((sample) => sample.ok) && overviewP95Ms <= budgets.overviewP95Ms, valueMs: overviewP95Ms, budgetMs: budgets.overviewP95Ms }, + const checks: PerfCheck[] = [ + { name: "first-paint", ok: firstPaintMs <= budgets.firstPaintMs, valueMs: firstPaintMs, budgetMs: budgets.firstPaintMs, hard: true }, + { name: "trace-summary", ok: traceSummary.ok && traceSummary.durationMs <= budgets.traceSummaryMs, valueMs: traceSummary.durationMs, budgetMs: budgets.traceSummaryMs, hard: true }, + { name: "overview-p95", ok: overviewSamples.every((sample) => sample.ok) && overviewP95Ms <= budgets.overviewP95Ms, valueMs: overviewP95Ms, budgetMs: budgets.overviewP95Ms, hard: true }, + { name: "trace-steps", ok: traceSteps.ok && traceSteps.durationMs <= budgets.traceStepsMs, valueMs: traceSteps.durationMs, budgetMs: budgets.traceStepsMs, hard: false }, + { name: "trace-step-detail", ok: traceStepDetail.ok && traceStepDetail.durationMs <= budgets.traceStepDetailMs, valueMs: traceStepDetail.durationMs, budgetMs: budgets.traceStepDetailMs, hard: false }, ]; + const hardChecks = checks.filter((check) => check.hard); const result = { - ok: checks.every((check) => check.ok), + ok: hardChecks.every((check) => check.ok), measuredAt: new Date().toISOString(), url, taskId, seq, skippedTaskIds: target.skippedTaskIds, + selection: target.selection, budgets, checks, + diagnostics: { + nonBlockingChecks: checks.filter((check) => !check.hard).map((check) => check.name), + traceProbe, + }, health, firstPaint, traceSummary, diff --git a/src/components/microservices/k3sctl-adapter/k3s/ci/unidesk-ci.pipeline.yaml b/src/components/microservices/k3sctl-adapter/k3s/ci/unidesk-ci.pipeline.yaml index d885efc7..d247e69c 100644 --- a/src/components/microservices/k3sctl-adapter/k3s/ci/unidesk-ci.pipeline.yaml +++ b/src/components/microservices/k3sctl-adapter/k3s/ci/unidesk-ci.pipeline.yaml @@ -124,11 +124,11 @@ metadata: app.kubernetes.io/name: unidesk-ci app.kubernetes.io/part-of: unidesk data: - firstPaintMs: "2000" - traceSummaryMs: "700" - traceStepsMs: "900" - traceStepDetailMs: "700" - overviewP95Ms: "900" + firstPaintMs: "10000" + traceSummaryMs: "10000" + traceStepsMs: "20000" + traceStepDetailMs: "20000" + overviewP95Ms: "20000" --- apiVersion: tekton.dev/v1 kind: Task @@ -294,6 +294,7 @@ spec: app.kubernetes.io/component: ci-read app.kubernetes.io/part-of: unidesk unidesk.ai/node-id: D601 + unidesk.ai/ci-task-run: "$(context.taskRun.name)" spec: nodeSelector: unidesk.ai/node-id: D601 @@ -326,6 +327,8 @@ spec: value: "false" - name: CODE_QUEUE_NOTIFY_CLAUDEQQ_ENABLED value: "false" + - name: CODE_QUEUE_CODEX_SQLITE_LOG_EXPORT_ENABLED + value: "false" - name: CODE_QUEUE_EGRESS_PROXY_ENABLED value: "true" - name: CODE_QUEUE_EGRESS_PROXY_URL @@ -361,7 +364,7 @@ spec: - name: CODE_QUEUE_MODELS value: "gpt-5.5,gpt-5.4-mini,gpt-5.4,minimax-m2.7" - name: CODE_QUEUE_DATABASE_POOL_MAX - value: "2" + value: "4" - name: CODE_QUEUE_IN_MEMORY_OUTPUT_RECORDS value: "5" - name: CODE_QUEUE_IN_MEMORY_EVENT_RECORDS