fix: clarify live split brain liveness
This commit is contained in:
+341
-1
@@ -157,6 +157,8 @@ const FRONTEND_CHECK_NAMES = [
|
||||
"frontend:code-queue-error-red-markers",
|
||||
"frontend:code-queue-stats-visible",
|
||||
"frontend:code-queue-stats-degraded-visible",
|
||||
"frontend:code-queue-split-brain-live-not-failed",
|
||||
"frontend:code-queue-split-brain-stale-failed",
|
||||
"frontend:code-queue-retry-attempt-trace-current",
|
||||
"frontend:code-queue-step-missing-diagnostic",
|
||||
"frontend:code-queue-judge-feedback-attempt-order",
|
||||
@@ -190,6 +192,10 @@ const CODE_QUEUE_STATS_FIXTURE_CHECK_NAMES = [
|
||||
"frontend:code-queue-stats-visible",
|
||||
"frontend:code-queue-stats-degraded-visible",
|
||||
] as const;
|
||||
/**
 * Names of the code-queue liveness UI checks (split-brain "live" and "stale").
 *
 * `runE2E` reads this list to decide whether the liveness browser fixtures
 * must be executed.
 */
const CODE_QUEUE_LIVENESS_UI_FIXTURE_CHECK_NAMES = [
  "frontend:code-queue-split-brain-live-not-failed",
  "frontend:code-queue-split-brain-stale-failed",
] as const;
|
||||
|
||||
const ALL_E2E_CHECK_NAMES = [
|
||||
...NETWORK_CHECK_NAMES,
|
||||
@@ -1462,6 +1468,280 @@ async function runCodeQueueStatsFixture(page: Page, frontendUrl: string, mode: "
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Builds the canned task-overview payload used by the split-brain liveness
 * browser fixture.
 *
 * Both modes describe one database-active task with no scheduler-local run
 * slot (`state: "split-brain"`). They differ only in heartbeat/trace freshness:
 *
 * - `"live"`  — heartbeat and trace timestamps are 8 s older than `now`; the
 *   task is listed as heartbeat-fresh and the recommended action is
 *   `continue-supervision`.
 * - `"stale"` — heartbeat and trace timestamps are 18 min older than `now`
 *   (stale threshold is 300000 ms); the task is listed as expired, at risk and
 *   a stale-recovery candidate, and the recommended action is
 *   `investigate-heartbeat-risk`.
 *
 * @param mode Which split-brain variant to generate.
 * @returns A plain JSON-serialisable object with `ok`, `queue`, `statistics`,
 *   `tasks`, `pagination` and `selected`. `selected.task` is the same object
 *   as `tasks[0]`.
 */
function codeQueueSplitBrainFixtureOverview(mode: "live" | "stale"): any {
  const isLive = mode === "live";
  const taskId = isLive ? "codex_issue14_split_brain_live" : "codex_issue14_split_brain_stale";
  const now = "2026-05-20T06:00:00.000Z";
  const freshAt = "2026-05-20T05:59:52.000Z";
  const staleAt = "2026-05-20T05:42:00.000Z";
  const activeAt = isLive ? freshAt : staleAt;

  // Returns a fresh `[taskId]` when the task belongs to the list, otherwise a fresh `[]`.
  const idsWhen = (present: boolean): string[] => (present ? [taskId] : []);

  // A single literal keeps the live and stale variants from drifting apart;
  // only the freshness-related fields depend on the mode.
  const diagnostics = {
    state: "split-brain",
    health: "split-brain",
    degraded: true,
    splitBrain: true,
    splitBrainLive: isLive,
    effectiveLiveness: isLive ? "live" : "at-risk",
    recommendedAction: isLive ? "continue-supervision" : "investigate-heartbeat-risk",
    livenessSummary: isLive
      ? "PostgreSQL and the local control-plane view are split, but scheduler-owned heartbeat and trace are fresh; treat the task as live and continue supervision."
      : "Heartbeat is expired for a database-active task; investigate before assuming the task is still live.",
    executionStateSource: "postgres-control-plane",
    controlPlane: "master-code-queue-mgr",
    databaseActiveTaskIds: [taskId],
    databaseActiveTaskCount: 1,
    schedulerActiveTaskIds: [] as string[],
    schedulerActiveTaskCount: 0,
    schedulerActiveRunSlotCount: 0,
    schedulerActiveQueueIds: [] as string[],
    schedulerProcessingQueueIds: [] as string[],
    schedulerOrphanedActiveTaskIds: [taskId],
    schedulerOrphanedActiveTaskCount: 1,
    activeHeartbeatTaskIds: [taskId],
    activeHeartbeatCount: 1,
    heartbeatFreshTaskIds: idsWhen(isLive),
    heartbeatExpiredTaskIds: idsWhen(!isLive),
    heartbeatMissingTaskIds: [] as string[],
    staleRecoveryCandidateTaskIds: idsWhen(!isLive),
    heartbeatRiskTaskIds: idsWhen(!isLive),
    traceGapTaskIds: idsWhen(!isLive),
    traceGapNotStaleTaskIds: idsWhen(isLive),
    schedulerHeartbeatStaleMs: 300000,
    now,
    lastSchedulerHeartbeatAt: activeAt,
    lastObservedAgentEventAt: activeAt,
    lastPersistedTraceAt: activeAt,
    oaPublisher: null,
    reasons: [
      isLive
        ? "postgres control-plane has database-active tasks while its local active slots are empty, but scheduler heartbeat is fresh"
        : "owner heartbeat is expired and scheduler has no local active run for at least one database-active task",
    ],
    guidance: [
      isLive
        ? "Continue supervision; do not restart solely because master activeRunSlotCount is zero."
        : "Investigate heartbeat risk before retry or restart.",
    ],
  };

  const promptText = `Issue #14 ${mode} split-brain fixture`;
  const task = {
    id: taskId,
    queueId: "issue14-liveness-fixture",
    queueEnteredAt: "2026-05-20T05:50:00.000Z",
    displayPrompt: promptText,
    basePrompt: promptText,
    displayPromptPreview: promptText,
    providerId: "D601-dev",
    executionMode: "default",
    cwd: "/workspace-dev",
    model: "gpt-5.5",
    maxAttempts: 99,
    status: "running",
    createdAt: "2026-05-20T05:50:00.000Z",
    updatedAt: activeAt,
    startedAt: "2026-05-20T05:50:10.000Z",
    finishedAt: null,
    currentAttempt: 1,
    currentMode: "initial",
    codexThreadId: "thread_issue14_fixture",
    activeTurnId: "turn_issue14_fixture",
    finalResponse: "",
    stepCount: isLive ? 4 : 0,
    llmStepCount: isLive ? 2 : 0,
    outputCount: isLive ? 8 : 1,
    eventCount: 1,
    attemptCount: 0,
    attempts: [] as unknown[],
    summaryOnly: true,
    promptEditable: false,
    schedulerHeartbeat: {
      taskId,
      queueId: "issue14-liveness-fixture",
      attempt: 1,
      activeTurnId: "turn_issue14_fixture",
      codexThreadId: "thread_issue14_fixture",
      owner: "D601-dev",
      schedulerInstance: "code-queue-scheduler-fixture",
      executionPlane: "scheduler-execution-plane",
      agentPort: "codex",
      status: "running",
      lastLocalHeartbeatAt: activeAt,
      lastObservedAgentEventAt: activeAt,
      lastPersistedTraceAt: activeAt,
      outputMaxSeq: isLive ? 8 : 1,
      source: "scheduler",
    },
    timing: { durationMs: 600000, totalElapsedMs: 600000 },
  };

  const queue = {
    activeTaskId: taskId,
    activeTaskIds: [taskId],
    total: 1,
    unreadTerminal: 0,
    counts: { running: 1, queued: 0, retry_wait: 0, judging: 0 },
    queues: [{ id: "issue14-liveness-fixture", name: "Issue 14 Liveness Fixture", total: 1, activeTaskId: taskId, counts: { running: 1 } }],
    mainProviderId: "D601-dev",
    defaultProviderId: "D601-dev",
    defaultWorkdir: "/workspace-dev",
    defaultWorkdirByProvider: { "D601-dev": "/workspace-dev" },
    codeModels: ["gpt-5.5", "gpt-5.4-mini", "gpt-5.4"],
    executionProviders: [{ id: "D601-dev", label: "D601 dev", defaultWorkdir: "/workspace-dev" }],
    executionModes: [{ id: "default", label: "默认容器/本机" }],
    executionDiagnostics: diagnostics,
  };

  return {
    ok: true,
    queue,
    statistics: { timezone: "Asia/Shanghai", range: { startDate: "2026-05-20", endDate: "2026-05-20" }, totals: {}, daily: [] },
    tasks: [task],
    pagination: { returned: 1, total: 1, hasMore: false },
    selected: { task, transcript: [], hasMore: false, preview: true },
  };
}
|
||||
|
||||
/**
 * Loads the code-queue page against a mocked split-brain overview and reports
 * what the liveness panel rendered.
 *
 * While the fixture runs, GET requests matching the code-queue proxy route are
 * answered with canned JSON for the overview, workdirs, queues and the task's
 * trace-summary endpoints. Non-GET requests and unmatched paths are passed
 * through with `route.continue()`. The route handler is removed in `finally`,
 * ignoring any unroute failure.
 *
 * @param page Playwright page used to drive the UI.
 * @param frontendUrl Base URL of the frontend; `/app/code-queue/` is appended.
 * @param mode `"live"` expects a non-failed "split-brain live" panel; `"stale"`
 *   expects a failed, at-risk panel.
 * @returns An object containing `checked: true`, the mode, the task id, a
 *   subset of the fixture diagnostics, the intercepted request paths, the
 *   scraped panel metrics and an overall `ok` flag.
 */
async function runCodeQueueSplitBrainFixture(page: Page, frontendUrl: string, mode: "live" | "stale"): Promise<any> {
  const overview = codeQueueSplitBrainFixtureOverview(mode);
  const diagnostics = overview.queue.executionDiagnostics;
  const taskId = String(overview.tasks[0]?.id || "");
  const routePattern = "**/api/microservices/code-queue/proxy/api/**";
  const requests: string[] = [];

  // Maps a proxied API path to its fixture payload; `undefined` means "not mocked".
  const fixturePayloadFor = (apiPath: string): unknown => {
    if (apiPath === "/api/tasks/overview") return overview;
    if (apiPath === "/api/workdirs") {
      return { ok: true, workdirs: [{ providerId: "D601-dev", executionMode: "default", path: "/workspace-dev", source: "fixture" }] };
    }
    if (apiPath === "/api/queues") {
      return { ok: true, queues: overview.queue.queues, summary: overview.queue };
    }
    const traceSummaryPaths = [`/api/tasks/${encodeURIComponent(taskId)}/trace-summary`, `/api/tasks/${taskId}/trace-summary`];
    if (traceSummaryPaths.includes(apiPath)) {
      return {
        ok: true,
        summary: {
          ...overview.tasks[0],
          finalResponse: "",
          attempts: [],
          prompt: {},
          execution: { durationMs: 600000, traceLineCount: 2, stepCount: mode === "live" ? 2 : 0 },
        },
      };
    }
    return undefined;
  };

  const handler = async (route: any, request: any): Promise<void> => {
    if (request.method() !== "GET") {
      await route.continue();
      return;
    }
    const url = new URL(request.url());
    const path = url.pathname.replace(/^\/api\/microservices\/code-queue\/proxy/u, "");
    // Every intercepted GET is recorded, whether or not it is mocked.
    requests.push(`${path}${url.search}`);
    const payload = fixturePayloadFor(path);
    if (payload === undefined) {
      await route.continue();
      return;
    }
    await route.fulfill({ status: 200, contentType: "application/json", body: JSON.stringify(payload) });
  };

  await page.route(routePattern, handler);
  try {
    await page.goto(`${frontendUrl}/app/code-queue/`, { waitUntil: "domcontentloaded", timeout: 15000 });
    await page.waitForSelector('[data-testid="code-queue-page"]', { timeout: 15000, state: "attached" });
    await page.waitForSelector('[data-testid="codex-liveness-diagnostics"]', { timeout: 15000 });
    // Wait until the panel shows the fixture's recommended action, i.e. the mocked data has rendered.
    await page.waitForFunction((expectedAction) => {
      const livenessPanel = document.querySelector(".codex-liveness-panel") as HTMLElement | null;
      if (!livenessPanel) return false;
      return (livenessPanel.textContent || "").includes(String(expectedAction));
    }, diagnostics.recommendedAction, { timeout: 15000 });

    // Runs in the browser context: must not reference anything from the enclosing scope.
    const metrics = await page.evaluate(() => {
      const panel = document.querySelector(".codex-liveness-panel") as HTMLElement | null;
      const grid = document.querySelector('[data-testid="codex-liveness-diagnostics"]') as HTMLElement | null;
      const metricRows = Array.from(grid?.querySelectorAll(".codex-liveness-metric") || []) as HTMLElement[];
      const readMetric = (label: string): any => {
        const row = metricRows.find((item) => (item.querySelector("span")?.textContent || "").trim() === label) || null;
        return {
          className: String(row?.className || ""),
          label: row?.querySelector("span")?.textContent || "",
          value: row?.querySelector("strong")?.textContent || "",
          hint: row?.querySelector("code")?.textContent || "",
        };
      };
      const health = readMetric("健康状态");
      const effective = readMetric("Effective liveness");
      const heartbeatRisk = readMetric("Heartbeat risk");
      const staleCandidates = readMetric("Stale candidates");
      const chips = Array.from(panel?.querySelectorAll(".codex-trace-status-chip.liveness") || []) as HTMLElement[];
      const advisory = panel?.querySelector(".codex-liveness-advisory") as HTMLElement | null;
      const panelText = panel?.innerText || panel?.textContent || "";
      return {
        panelText,
        health,
        effective,
        heartbeatRisk,
        staleCandidates,
        livenessChipClasses: chips.map((chip) => String(chip.className || "")),
        advisoryClass: String(advisory?.className || ""),
        advisoryText: advisory?.textContent || "",
        failedMetricCount: metricRows.filter((row) => row.classList.contains("failed")).length,
        healthFailed: health.className.includes("failed"),
        effectiveFailed: effective.className.includes("failed"),
        warns: metricRows.filter((row) => row.classList.contains("warn") || row.classList.contains("degraded-live")).length,
      };
    });

    // Live: the panel is degraded-but-live, nothing is marked failed, and no alarming wording appears.
    const liveExpectations: boolean[] = [
      metrics.health.value === "split-brain live",
      metrics.health.className.includes("degraded-live"),
      !metrics.health.className.includes("failed"),
      metrics.effective.value === "live",
      metrics.heartbeatRisk.value === "0",
      metrics.failedMetricCount === 0,
      metrics.panelText.includes("continue-supervision"),
      metrics.panelText.includes("控制面/执行面观测分裂"),
      metrics.panelText.includes("heartbeat/trace 新鲜"),
      !/任务死亡|dead task|task death|必须重启|must restart/i.test(metrics.panelText),
    ];
    // Stale: health, heartbeat risk and stale candidates must all surface as failures.
    const staleExpectations: boolean[] = [
      metrics.health.value === "split-brain",
      metrics.health.className.includes("failed"),
      metrics.effective.value === "at-risk",
      metrics.heartbeatRisk.value === "1",
      metrics.heartbeatRisk.className.includes("failed"),
      metrics.staleCandidates.value === "1",
      metrics.panelText.includes("investigate-heartbeat-risk"),
      metrics.panelText.includes(taskId),
      metrics.failedMetricCount >= 3,
    ];
    const ok = (mode === "live" ? liveExpectations : staleExpectations).every((passed) => passed);

    const fixtureDiagnostics = {
      state: diagnostics.state,
      splitBrainLive: diagnostics.splitBrainLive,
      effectiveLiveness: diagnostics.effectiveLiveness,
      recommendedAction: diagnostics.recommendedAction,
      heartbeatFreshTaskIds: diagnostics.heartbeatFreshTaskIds,
      heartbeatRiskTaskIds: diagnostics.heartbeatRiskTaskIds,
      traceGapNotStaleTaskIds: diagnostics.traceGapNotStaleTaskIds,
      staleRecoveryCandidateTaskIds: diagnostics.staleRecoveryCandidateTaskIds,
    };
    return { checked: true, mode, taskId, fixtureDiagnostics, requests, ...metrics, ok };
  } finally {
    // Best-effort cleanup: a failed unroute must not mask the fixture result or error.
    await page.unroute(routePattern, handler).catch(() => undefined);
  }
}
|
||||
|
||||
function runPsql(config: UniDeskConfig, sql: string): { ok: boolean; stdout: string; stderr: string; exitCode: number | null } {
|
||||
const result = runCommand([
|
||||
"docker",
|
||||
@@ -2371,6 +2651,8 @@ async function frontendCheck(config: UniDeskConfig, urls: PublicUrls, checks: E2
|
||||
"frontend:code-queue-error-red-markers",
|
||||
"frontend:code-queue-stats-visible",
|
||||
"frontend:code-queue-stats-degraded-visible",
|
||||
"frontend:code-queue-split-brain-live-not-failed",
|
||||
"frontend:code-queue-split-brain-stale-failed",
|
||||
"frontend:code-queue-retry-attempt-trace-current",
|
||||
"frontend:code-queue-step-missing-diagnostic",
|
||||
"frontend:code-queue-judge-feedback-attempt-order",
|
||||
@@ -2502,6 +2784,8 @@ async function frontendCheck(config: UniDeskConfig, urls: PublicUrls, checks: E2
|
||||
let codeQueueErrorHighlightMetrics: any = { checked: false, candidateFound: false };
|
||||
let codeQueueStatsVisibleMetrics: any = { checked: false };
|
||||
let codeQueueStatsDegradedMetrics: any = { checked: false };
|
||||
let codeQueueSplitBrainLiveMetrics: any = { checked: false };
|
||||
let codeQueueSplitBrainStaleMetrics: any = { checked: false };
|
||||
let codeQueueRetryTraceFixtureMetrics: any = { checked: false };
|
||||
let claudeqqText = "";
|
||||
let routeDeepLinkText = "";
|
||||
@@ -3423,6 +3707,12 @@ async function frontendCheck(config: UniDeskConfig, urls: PublicUrls, checks: E2
|
||||
if (wants("frontend:code-queue-stats-degraded-visible")) {
|
||||
codeQueueStatsDegradedMetrics = await runCodeQueueStatsFixture(page, urls.frontendUrl, "degraded");
|
||||
}
|
||||
if (wants("frontend:code-queue-split-brain-live-not-failed")) {
|
||||
codeQueueSplitBrainLiveMetrics = await runCodeQueueSplitBrainFixture(page, urls.frontendUrl, "live");
|
||||
}
|
||||
if (wants("frontend:code-queue-split-brain-stale-failed")) {
|
||||
codeQueueSplitBrainStaleMetrics = await runCodeQueueSplitBrainFixture(page, urls.frontendUrl, "stale");
|
||||
}
|
||||
codeQueueOutputText = needCodeQueueFullSurface ? await page.locator('[data-testid="codex-output"]').innerText({ timeout: 5000 }) : "";
|
||||
codeQueueText = await page.locator('[data-testid="code-queue-page"]').innerText({ timeout: 5000 });
|
||||
codeQueueHtmlGuard = await page.evaluate(async () => {
|
||||
@@ -4180,6 +4470,29 @@ async function frontendCheck(config: UniDeskConfig, urls: PublicUrls, checks: E2
|
||||
codeQueueStatsDegradedMetrics.checked === true
|
||||
&& codeQueueStatsDegradedMetrics.ok === true,
|
||||
{ codeQueueStatsDegradedMetrics });
|
||||
addSelectedCheck(checks, options, "frontend:code-queue-split-brain-live-not-failed",
|
||||
codeQueueSplitBrainLiveMetrics.checked === true
|
||||
&& codeQueueSplitBrainLiveMetrics.ok === true
|
||||
&& codeQueueSplitBrainLiveMetrics.healthFailed === false
|
||||
&& codeQueueSplitBrainLiveMetrics.health?.value === "split-brain live"
|
||||
&& codeQueueSplitBrainLiveMetrics.health?.className.includes("degraded-live")
|
||||
&& codeQueueSplitBrainLiveMetrics.effective?.value === "live"
|
||||
&& codeQueueSplitBrainLiveMetrics.heartbeatRisk?.value === "0"
|
||||
&& Number(codeQueueSplitBrainLiveMetrics.failedMetricCount || 0) === 0
|
||||
&& String(codeQueueSplitBrainLiveMetrics.panelText || "").includes("continue-supervision")
|
||||
&& String(codeQueueSplitBrainLiveMetrics.panelText || "").includes("控制面/执行面观测分裂")
|
||||
&& !/任务死亡|dead task|task death|必须重启|must restart/i.test(String(codeQueueSplitBrainLiveMetrics.panelText || "")),
|
||||
{ codeQueueSplitBrainLiveMetrics });
|
||||
addSelectedCheck(checks, options, "frontend:code-queue-split-brain-stale-failed",
|
||||
codeQueueSplitBrainStaleMetrics.checked === true
|
||||
&& codeQueueSplitBrainStaleMetrics.ok === true
|
||||
&& codeQueueSplitBrainStaleMetrics.healthFailed === true
|
||||
&& codeQueueSplitBrainStaleMetrics.effective?.value === "at-risk"
|
||||
&& codeQueueSplitBrainStaleMetrics.heartbeatRisk?.value === "1"
|
||||
&& codeQueueSplitBrainStaleMetrics.heartbeatRisk?.className.includes("failed")
|
||||
&& codeQueueSplitBrainStaleMetrics.staleCandidates?.value === "1"
|
||||
&& String(codeQueueSplitBrainStaleMetrics.panelText || "").includes("investigate-heartbeat-risk"),
|
||||
{ codeQueueSplitBrainStaleMetrics });
|
||||
addSelectedCheck(checks, options, "frontend:code-queue-retry-attempt-trace-current",
|
||||
codeQueueRetryTraceFixtureMetrics.checked === true
|
||||
&& codeQueueRetryTraceFixtureMetrics.ok === true
|
||||
@@ -4345,10 +4658,12 @@ export async function runE2E(
|
||||
&& selectedChecks.every((name) => [
|
||||
...(CODE_QUEUE_RETRY_TRACE_FIXTURE_CHECK_NAMES as readonly string[]),
|
||||
...(CODE_QUEUE_STATS_FIXTURE_CHECK_NAMES as readonly string[]),
|
||||
...(CODE_QUEUE_LIVENESS_UI_FIXTURE_CHECK_NAMES as readonly string[]),
|
||||
].includes(name));
|
||||
const needFrontend = wantsPrefix(options, "frontend") && !onlyRetryTraceFixture && !onlyCodeQueueBrowserFixtures;
|
||||
const needRetryTraceFixture = wantsAnyCheck(options, [...CODE_QUEUE_RETRY_TRACE_FIXTURE_CHECK_NAMES]);
|
||||
const needStatsFixture = wantsAnyCheck(options, [...CODE_QUEUE_STATS_FIXTURE_CHECK_NAMES]);
|
||||
const needLivenessUiFixture = wantsAnyCheck(options, [...CODE_QUEUE_LIVENESS_UI_FIXTURE_CHECK_NAMES]);
|
||||
const needCodeQueueFixtures = wantsAnyCheck(options, [...CODE_QUEUE_FIXTURE_CHECK_NAMES]);
|
||||
const executedSections: string[] = [];
|
||||
|
||||
@@ -4376,7 +4691,7 @@ export async function runE2E(
|
||||
if (needFrontend) {
|
||||
executedSections.push("frontend");
|
||||
frontend = await frontendCheck(config, urls, checks, options);
|
||||
} else if (needRetryTraceFixture || needStatsFixture) {
|
||||
} else if (needRetryTraceFixture || needStatsFixture || needLivenessUiFixture) {
|
||||
executedSections.push("frontend-code-queue-browser-fixtures");
|
||||
const browser = await chromium.launch({ headless: true });
|
||||
try {
|
||||
@@ -4394,6 +4709,8 @@ export async function runE2E(
|
||||
const codeQueueRetryTraceFixtureMetrics = needRetryTraceFixture ? await runCodeQueueRetryTraceFixture(page, urls.frontendUrl) : { checked: false };
|
||||
const codeQueueStatsVisibleMetrics = wantsCheck(options, "frontend:code-queue-stats-visible") ? await runCodeQueueStatsFixture(page, urls.frontendUrl, "visible") : { checked: false };
|
||||
const codeQueueStatsDegradedMetrics = wantsCheck(options, "frontend:code-queue-stats-degraded-visible") ? await runCodeQueueStatsFixture(page, urls.frontendUrl, "degraded") : { checked: false };
|
||||
const codeQueueSplitBrainLiveMetrics = wantsCheck(options, "frontend:code-queue-split-brain-live-not-failed") ? await runCodeQueueSplitBrainFixture(page, urls.frontendUrl, "live") : { checked: false };
|
||||
const codeQueueSplitBrainStaleMetrics = wantsCheck(options, "frontend:code-queue-split-brain-stale-failed") ? await runCodeQueueSplitBrainFixture(page, urls.frontendUrl, "stale") : { checked: false };
|
||||
addSelectedCheck(checks, options, "frontend:code-queue-stats-visible",
|
||||
codeQueueStatsVisibleMetrics.checked === true
|
||||
&& codeQueueStatsVisibleMetrics.ok === true,
|
||||
@@ -4402,6 +4719,29 @@ export async function runE2E(
|
||||
codeQueueStatsDegradedMetrics.checked === true
|
||||
&& codeQueueStatsDegradedMetrics.ok === true,
|
||||
{ codeQueueStatsDegradedMetrics });
|
||||
addSelectedCheck(checks, options, "frontend:code-queue-split-brain-live-not-failed",
|
||||
codeQueueSplitBrainLiveMetrics.checked === true
|
||||
&& codeQueueSplitBrainLiveMetrics.ok === true
|
||||
&& codeQueueSplitBrainLiveMetrics.healthFailed === false
|
||||
&& codeQueueSplitBrainLiveMetrics.health?.value === "split-brain live"
|
||||
&& codeQueueSplitBrainLiveMetrics.health?.className.includes("degraded-live")
|
||||
&& codeQueueSplitBrainLiveMetrics.effective?.value === "live"
|
||||
&& codeQueueSplitBrainLiveMetrics.heartbeatRisk?.value === "0"
|
||||
&& Number(codeQueueSplitBrainLiveMetrics.failedMetricCount || 0) === 0
|
||||
&& String(codeQueueSplitBrainLiveMetrics.panelText || "").includes("continue-supervision")
|
||||
&& String(codeQueueSplitBrainLiveMetrics.panelText || "").includes("控制面/执行面观测分裂")
|
||||
&& !/任务死亡|dead task|task death|必须重启|must restart/i.test(String(codeQueueSplitBrainLiveMetrics.panelText || "")),
|
||||
{ codeQueueSplitBrainLiveMetrics });
|
||||
addSelectedCheck(checks, options, "frontend:code-queue-split-brain-stale-failed",
|
||||
codeQueueSplitBrainStaleMetrics.checked === true
|
||||
&& codeQueueSplitBrainStaleMetrics.ok === true
|
||||
&& codeQueueSplitBrainStaleMetrics.healthFailed === true
|
||||
&& codeQueueSplitBrainStaleMetrics.effective?.value === "at-risk"
|
||||
&& codeQueueSplitBrainStaleMetrics.heartbeatRisk?.value === "1"
|
||||
&& codeQueueSplitBrainStaleMetrics.heartbeatRisk?.className.includes("failed")
|
||||
&& codeQueueSplitBrainStaleMetrics.staleCandidates?.value === "1"
|
||||
&& String(codeQueueSplitBrainStaleMetrics.panelText || "").includes("investigate-heartbeat-risk"),
|
||||
{ codeQueueSplitBrainStaleMetrics });
|
||||
addSelectedCheck(checks, options, "frontend:code-queue-retry-attempt-trace-current",
|
||||
codeQueueRetryTraceFixtureMetrics.checked === true
|
||||
&& codeQueueRetryTraceFixtureMetrics.ok === true
|
||||
|
||||
File diff suppressed because one or more lines are too long
@@ -2173,13 +2173,18 @@ input:focus, select:focus, textarea:focus { border-color: var(--accent-2); }
|
||||
border-color: rgba(78, 183, 168, 0.50);
|
||||
color: var(--accent-2);
|
||||
}
|
||||
.codex-trace-status-chip.liveness.warn {
|
||||
.codex-trace-status-chip.liveness.warn,
|
||||
.codex-trace-status-chip.liveness.degraded-live {
|
||||
border-color: rgba(215, 161, 58, 0.55);
|
||||
color: #ffe0a2;
|
||||
background:
|
||||
linear-gradient(135deg, rgba(215, 161, 58, 0.13), rgba(78, 183, 168, 0.06)),
|
||||
rgba(0,0,0,0.18);
|
||||
}
|
||||
.codex-trace-status-chip.liveness.degraded-live {
|
||||
border-color: rgba(215, 161, 58, 0.62);
|
||||
box-shadow: inset 0 0 0 1px rgba(78, 183, 168, 0.10);
|
||||
}
|
||||
.codex-trace-status-chip.liveness.failed {
|
||||
border-color: rgba(255, 98, 98, 0.58);
|
||||
color: #ffb2b2;
|
||||
@@ -2204,9 +2209,15 @@ input:focus, select:focus, textarea:focus { border-color: var(--accent-2); }
|
||||
linear-gradient(135deg, rgba(78, 183, 168, 0.08), rgba(255,255,255,0.015)),
|
||||
rgba(0,0,0,0.16);
|
||||
}
|
||||
.codex-liveness-metric.warn {
|
||||
.codex-liveness-metric.warn,
|
||||
.codex-liveness-metric.degraded-live {
|
||||
border-color: rgba(215, 161, 58, 0.44);
|
||||
}
|
||||
.codex-liveness-metric.degraded-live {
|
||||
background:
|
||||
linear-gradient(135deg, rgba(215, 161, 58, 0.12), rgba(78, 183, 168, 0.05)),
|
||||
rgba(0,0,0,0.16);
|
||||
}
|
||||
.codex-liveness-metric.failed {
|
||||
border-color: rgba(255, 98, 98, 0.46);
|
||||
}
|
||||
@@ -2252,16 +2263,23 @@ input:focus, select:focus, textarea:focus { border-color: var(--accent-2); }
|
||||
min-width: 0;
|
||||
overflow-wrap: anywhere;
|
||||
}
|
||||
.codex-liveness-advisory.warn {
|
||||
.codex-liveness-advisory.warn,
|
||||
.codex-liveness-advisory.degraded-live {
|
||||
border-color: rgba(215, 161, 58, 0.42);
|
||||
background: rgba(215, 161, 58, 0.07);
|
||||
}
|
||||
.codex-liveness-advisory.warn b { color: var(--warn); }
|
||||
.codex-liveness-advisory.warn b,
|
||||
.codex-liveness-advisory.degraded-live b { color: var(--warn); }
|
||||
.codex-liveness-advisory.failed {
|
||||
border-color: rgba(255, 98, 98, 0.46);
|
||||
background: rgba(207, 106, 84, 0.09);
|
||||
}
|
||||
.codex-liveness-advisory.failed b { color: #ffb2b2; }
|
||||
.codex-liveness-advisory code {
|
||||
max-width: 100%;
|
||||
color: var(--text);
|
||||
overflow-wrap: anywhere;
|
||||
}
|
||||
.codex-liveness-reasons {
|
||||
display: flex;
|
||||
flex-wrap: wrap;
|
||||
|
||||
@@ -249,11 +249,12 @@ function diagnosticsHeartbeatRiskTaskIds(diagnostics: any): string[] {
|
||||
}
|
||||
|
||||
/**
 * Decides whether a split-brain diagnostics snapshot should be presented as
 * "split-brain live".
 *
 * Order of precedence:
 * 1. Any heartbeat-risk task id means the snapshot is never "live".
 * 2. An explicit boolean `splitBrainLive` on the diagnostics is used as-is.
 * 3. Otherwise the state (or `health` when `state` is empty) must be
 *    `"split-brain"`, and either the effective liveness is `"live"` or at
 *    least one task is listed as heartbeat-fresh.
 *
 * @param diagnostics Execution diagnostics object; may be null/undefined.
 * @returns `true` when the snapshot is split-brain but still live.
 */
function splitBrainLiveDiagnostics(diagnostics: any): boolean {
  // Heartbeat risk overrides everything, including an explicit `splitBrainLive: true`.
  if (diagnosticsHeartbeatRiskTaskIds(diagnostics).length > 0) return false;
  if (typeof diagnostics?.splitBrainLive === "boolean") return diagnostics.splitBrainLive;
  const state = String(diagnostics?.state || diagnostics?.health || "").toLowerCase();
  const effective = String(diagnostics?.effectiveLiveness || "").toLowerCase();
  return state === "split-brain"
    && (effective === "live" || stringArray(diagnostics?.heartbeatFreshTaskIds).length > 0);
}
|
||||
|
||||
function diagnosticsEffectiveLiveness(diagnostics: any): string {
|
||||
@@ -267,11 +268,15 @@ function diagnosticsEffectiveLiveness(diagnostics: any): string {
|
||||
|
||||
/**
 * Maps execution diagnostics to a tone name.
 *
 * Evaluation order (first match wins):
 * 1. `"failed"` — any heartbeat-risk task, effective liveness `"at-risk"`, or
 *    state `"stale-active"`.
 * 2. `"degraded-live"` — `splitBrainLiveDiagnostics` reports a live split-brain.
 * 3. `"warn"` — effective liveness is `"live"` or `"degraded"`.
 * 4. `"failed"` — effective liveness is `"dead"`, `"failed"`, `"stale"` or
 *    `"unhealthy"`.
 * 5. `"ok"` for state `"healthy"`; `"warn"` for state `"split-brain"` or
 *    `"degraded"`.
 * 6. `"unknown"` otherwise.
 *
 * The state is `diagnostics.state`, falling back to `diagnostics.health`, then
 * `"unknown"`, lower-cased.
 *
 * @param diagnostics Execution diagnostics object; may be null/undefined.
 * @returns One of `"failed"`, `"degraded-live"`, `"warn"`, `"ok"`, `"unknown"`.
 */
function diagnosticsTone(diagnostics: any): string {
  const effective = diagnosticsEffectiveLiveness(diagnostics);
  const heartbeatRiskTaskIds = diagnosticsHeartbeatRiskTaskIds(diagnostics);
  const state = String(diagnostics?.state || diagnostics?.health || "unknown").toLowerCase();
  // Risk signals take priority over every softer classification below.
  if (heartbeatRiskTaskIds.length > 0 || effective === "at-risk" || state === "stale-active") return "failed";
  if (splitBrainLiveDiagnostics(diagnostics)) return "degraded-live";
  if (effective === "live" || effective === "degraded") return "warn";
  if (["dead", "failed", "stale", "unhealthy"].includes(effective)) return "failed";
  if (state === "healthy") return "ok";
  if (state === "split-brain" || state === "degraded") return "warn";
  return "unknown";
}
|
||||
@@ -1878,7 +1883,7 @@ function CodeQueueLivenessPanel({ diagnostics, queue, onRaw }: AnyRecord) {
|
||||
const effectiveLiveness = diagnosticsEffectiveLiveness(diagnostics);
|
||||
const recommendedAction = String(diagnostics?.recommendedAction || (heartbeatRiskTaskIds.length > 0 ? "investigate-heartbeat-risk" : splitBrainLive ? "continue-supervision" : effectiveLiveness === "degraded" ? "observe-degraded" : "none"));
|
||||
const livenessText = String(splitBrainLive
|
||||
? "执行面 heartbeat 新鲜,任务仍应继续监督。"
|
||||
? "控制面/执行面观测分裂,但 heartbeat/trace 新鲜,继续监督。"
|
||||
: heartbeatRiskTaskIds.length > 0
|
||||
? "存在 expired/missing/stale heartbeat 风险,请先确认执行面状态。"
|
||||
: diagnostics?.livenessSummary || (effectiveLiveness === "degraded"
|
||||
@@ -1906,7 +1911,7 @@ function CodeQueueLivenessPanel({ diagnostics, queue, onRaw }: AnyRecord) {
|
||||
h(LivenessMetric, { label: "PostgreSQL active", value: String(diagnostics?.databaseActiveTaskCount ?? queue?.databaseActiveTaskCount ?? 0), hint: compactIdList(diagnostics?.databaseActiveTaskIds ?? queue?.databaseActiveTaskIds) }),
|
||||
h(LivenessMetric, { label: "Scheduler active", value: String(diagnostics?.schedulerActiveRunSlotCount ?? queue?.activeRunSlotCount ?? 0), hint: compactIdList(diagnostics?.schedulerActiveTaskIds ?? queue?.activeTaskIds) }),
|
||||
h(LivenessMetric, { label: "Fresh heartbeat", value: String(stringArray(diagnostics?.heartbeatFreshTaskIds).length), hint: compactIdList(diagnostics?.heartbeatFreshTaskIds) }),
|
||||
h(LivenessMetric, { tone: heartbeatRiskTaskIds.length > 0 ? "failed" : splitBrainLive ? "warn" : "", label: "Heartbeat risk", value: String(heartbeatRiskTaskIds.length), hint: heartbeatRiskTaskIds.length > 0 ? compactIdList(heartbeatRiskTaskIds) : splitBrainLive ? "fresh heartbeat: keep supervising" : "--" }),
|
||||
h(LivenessMetric, { tone: heartbeatRiskTaskIds.length > 0 ? "failed" : splitBrainLive ? "degraded-live" : "", label: "Heartbeat risk", value: String(heartbeatRiskTaskIds.length), hint: heartbeatRiskTaskIds.length > 0 ? compactIdList(heartbeatRiskTaskIds) : splitBrainLive ? "fresh heartbeat/trace: keep supervising" : "--" }),
|
||||
h(LivenessMetric, { tone: stringArray(diagnostics?.traceGapNotStaleTaskIds).length > 0 ? "warn" : "", label: "Trace gap", value: String(stringArray(diagnostics?.traceGapTaskIds).length), hint: compactIdList(diagnostics?.traceGapNotStaleTaskIds) }),
|
||||
h(LivenessMetric, { tone: stringArray(diagnostics?.staleRecoveryCandidateTaskIds).length > 0 ? "failed" : "", label: "Stale candidates", value: String(stringArray(diagnostics?.staleRecoveryCandidateTaskIds).length), hint: compactIdList(diagnostics?.staleRecoveryCandidateTaskIds) }),
|
||||
h(LivenessMetric, { label: "Last scheduler heartbeat", value: fmtRelativeAge(diagnostics?.lastSchedulerHeartbeatAt), hint: String(diagnostics?.lastSchedulerHeartbeatAt || "--") }),
|
||||
@@ -1917,6 +1922,7 @@ function CodeQueueLivenessPanel({ diagnostics, queue, onRaw }: AnyRecord) {
|
||||
h("div", { className: `codex-liveness-advisory ${tone}` },
|
||||
h("b", null, splitBrainLive ? "Observing split" : heartbeatRiskTaskIds.length > 0 ? "Heartbeat risk" : "Liveness note"),
|
||||
h("span", null, livenessText),
|
||||
h("code", null, recommendedAction),
|
||||
),
|
||||
reasons.length > 0 ? h("div", { className: "codex-liveness-reasons" }, reasons.map((reason: string) => h("span", { key: reason }, reason))) : null,
|
||||
);
|
||||
|
||||
@@ -55,6 +55,9 @@ COPY package.json /app/package.json
|
||||
COPY bun.lock /app/bun.lock
|
||||
RUN bun install
|
||||
COPY src/components/shared /app/src/components/shared
|
||||
COPY src/components/frontend/package.json /app/src/components/frontend/package.json
|
||||
WORKDIR /app/src/components/frontend
|
||||
RUN test -d node_modules/react || bun install
|
||||
WORKDIR /app/src/components/microservices/code-queue
|
||||
COPY src/components/microservices/code-queue/tsconfig.json ./tsconfig.json
|
||||
COPY src/components/microservices/code-queue/src ./src
|
||||
|
||||
Reference in New Issue
Block a user