fix: clarify live split brain liveness

This commit is contained in:
Codex
2026-05-20 06:27:34 +00:00
parent d2eef85aad
commit d7c043ad5c
5 changed files with 416 additions and 49 deletions
+341 -1
View File
@@ -157,6 +157,8 @@ const FRONTEND_CHECK_NAMES = [
"frontend:code-queue-error-red-markers",
"frontend:code-queue-stats-visible",
"frontend:code-queue-stats-degraded-visible",
"frontend:code-queue-split-brain-live-not-failed",
"frontend:code-queue-split-brain-stale-failed",
"frontend:code-queue-retry-attempt-trace-current",
"frontend:code-queue-step-missing-diagnostic",
"frontend:code-queue-judge-feedback-attempt-order",
@@ -190,6 +192,10 @@ const CODE_QUEUE_STATS_FIXTURE_CHECK_NAMES = [
"frontend:code-queue-stats-visible",
"frontend:code-queue-stats-degraded-visible",
] as const;
// Check names for the split-brain liveness panel fixtures ("live" and "stale").
// Both are served from a canned overview payload by runCodeQueueSplitBrainFixture;
// runE2E consults this list when deciding whether a selection can be satisfied by
// the browser-fixture-only path.
const CODE_QUEUE_LIVENESS_UI_FIXTURE_CHECK_NAMES = [
"frontend:code-queue-split-brain-live-not-failed",
"frontend:code-queue-split-brain-stale-failed",
] as const;
const ALL_E2E_CHECK_NAMES = [
...NETWORK_CHECK_NAMES,
@@ -1462,6 +1468,280 @@ async function runCodeQueueStatsFixture(page: Page, frontendUrl: string, mode: "
}
}
/**
 * Builds the canned task-overview payload for the Issue #14 split-brain
 * liveness fixtures.
 *
 * Both modes describe the same situation on the control plane: one task is
 * active in the database while the scheduler reports no local run slot
 * ("split-brain"). They differ only in how recent the heartbeat/trace
 * timestamps are relative to the fixture's `now`:
 *
 * - `"live"`: timestamps are 8 seconds old, the task is listed as
 *   heartbeat-fresh, and the recommended action is `continue-supervision`.
 * - `"stale"`: timestamps are 18 minutes old (the fixture's stale threshold is
 *   300000 ms), the task is listed as expired / at risk / stale-recovery
 *   candidate, and the recommended action is `investigate-heartbeat-risk`.
 *
 * @param mode Which variant of the fixture to build.
 * @returns An object with `ok`, `queue` (including `executionDiagnostics`),
 *   `statistics`, `tasks`, `pagination` and `selected`; `tasks[0]` and
 *   `selected.task` are the same task object.
 */
function codeQueueSplitBrainFixtureOverview(mode: "live" | "stale"): any {
  const isLive = mode === "live";
  const taskId = isLive ? "codex_issue14_split_brain_live" : "codex_issue14_split_brain_stale";
  const now = "2026-05-20T06:00:00.000Z";
  // Fresh = 8s before `now`; stale = 18min before `now`.
  const activeAt = isLive ? "2026-05-20T05:59:52.000Z" : "2026-05-20T05:42:00.000Z";
  const promptLabel = `Issue #14 ${mode} split-brain fixture`;

  // Each call returns a new array so no two diagnostics fields share a reference.
  const whenLive = (): string[] => (isLive ? [taskId] : []);
  const whenStale = (): string[] => (isLive ? [] : [taskId]);

  // Key order is kept identical for both modes so the serialized payload is stable.
  const diagnostics = {
    state: "split-brain",
    health: "split-brain",
    degraded: true,
    splitBrain: true,
    splitBrainLive: isLive,
    effectiveLiveness: isLive ? "live" : "at-risk",
    recommendedAction: isLive ? "continue-supervision" : "investigate-heartbeat-risk",
    livenessSummary: isLive
      ? "PostgreSQL and the local control-plane view are split, but scheduler-owned heartbeat and trace are fresh; treat the task as live and continue supervision."
      : "Heartbeat is expired for a database-active task; investigate before assuming the task is still live.",
    executionStateSource: "postgres-control-plane",
    controlPlane: "master-code-queue-mgr",
    databaseActiveTaskIds: [taskId],
    databaseActiveTaskCount: 1,
    schedulerActiveTaskIds: [],
    schedulerActiveTaskCount: 0,
    schedulerActiveRunSlotCount: 0,
    schedulerActiveQueueIds: [],
    schedulerProcessingQueueIds: [],
    schedulerOrphanedActiveTaskIds: [taskId],
    schedulerOrphanedActiveTaskCount: 1,
    activeHeartbeatTaskIds: [taskId],
    activeHeartbeatCount: 1,
    heartbeatFreshTaskIds: whenLive(),
    heartbeatExpiredTaskIds: whenStale(),
    heartbeatMissingTaskIds: [],
    staleRecoveryCandidateTaskIds: whenStale(),
    heartbeatRiskTaskIds: whenStale(),
    traceGapTaskIds: whenStale(),
    traceGapNotStaleTaskIds: whenLive(),
    schedulerHeartbeatStaleMs: 300000,
    now,
    lastSchedulerHeartbeatAt: activeAt,
    lastObservedAgentEventAt: activeAt,
    lastPersistedTraceAt: activeAt,
    oaPublisher: null,
    reasons: [
      isLive
        ? "postgres control-plane has database-active tasks while its local active slots are empty, but scheduler heartbeat is fresh"
        : "owner heartbeat is expired and scheduler has no local active run for at least one database-active task",
    ],
    guidance: [
      isLive
        ? "Continue supervision; do not restart solely because master activeRunSlotCount is zero."
        : "Investigate heartbeat risk before retry or restart.",
    ],
  };

  // The scheduler-owned heartbeat record attached to the running task.
  const schedulerHeartbeat = {
    taskId,
    queueId: "issue14-liveness-fixture",
    attempt: 1,
    activeTurnId: "turn_issue14_fixture",
    codexThreadId: "thread_issue14_fixture",
    owner: "D601-dev",
    schedulerInstance: "code-queue-scheduler-fixture",
    executionPlane: "scheduler-execution-plane",
    agentPort: "codex",
    status: "running",
    lastLocalHeartbeatAt: activeAt,
    lastObservedAgentEventAt: activeAt,
    lastPersistedTraceAt: activeAt,
    outputMaxSeq: isLive ? 8 : 1,
    source: "scheduler",
  };

  const task = {
    id: taskId,
    queueId: "issue14-liveness-fixture",
    queueEnteredAt: "2026-05-20T05:50:00.000Z",
    displayPrompt: promptLabel,
    basePrompt: promptLabel,
    displayPromptPreview: promptLabel,
    providerId: "D601-dev",
    executionMode: "default",
    cwd: "/workspace-dev",
    model: "gpt-5.5",
    maxAttempts: 99,
    status: "running",
    createdAt: "2026-05-20T05:50:00.000Z",
    updatedAt: activeAt,
    startedAt: "2026-05-20T05:50:10.000Z",
    finishedAt: null,
    currentAttempt: 1,
    currentMode: "initial",
    codexThreadId: "thread_issue14_fixture",
    activeTurnId: "turn_issue14_fixture",
    finalResponse: "",
    stepCount: isLive ? 4 : 0,
    llmStepCount: isLive ? 2 : 0,
    outputCount: isLive ? 8 : 1,
    eventCount: 1,
    attemptCount: 0,
    attempts: [],
    summaryOnly: true,
    promptEditable: false,
    schedulerHeartbeat,
    timing: { durationMs: 600000, totalElapsedMs: 600000 },
  };

  const queue = {
    activeTaskId: taskId,
    activeTaskIds: [taskId],
    total: 1,
    unreadTerminal: 0,
    counts: { running: 1, queued: 0, retry_wait: 0, judging: 0 },
    queues: [
      {
        id: "issue14-liveness-fixture",
        name: "Issue 14 Liveness Fixture",
        total: 1,
        activeTaskId: taskId,
        counts: { running: 1 },
      },
    ],
    mainProviderId: "D601-dev",
    defaultProviderId: "D601-dev",
    defaultWorkdir: "/workspace-dev",
    defaultWorkdirByProvider: { "D601-dev": "/workspace-dev" },
    codeModels: ["gpt-5.5", "gpt-5.4-mini", "gpt-5.4"],
    executionProviders: [{ id: "D601-dev", label: "D601 dev", defaultWorkdir: "/workspace-dev" }],
    executionModes: [{ id: "default", label: "默认容器/本机" }],
    executionDiagnostics: diagnostics,
  };

  return {
    ok: true,
    queue,
    statistics: {
      timezone: "Asia/Shanghai",
      range: { startDate: "2026-05-20", endDate: "2026-05-20" },
      totals: {},
      daily: [],
    },
    tasks: [task],
    pagination: { returned: 1, total: 1, hasMore: false },
    selected: { task, transcript: [], hasMore: false, preview: true },
  };
}
/**
 * Renders the Code Queue page against the canned split-brain overview and
 * reports how the liveness panel presents it.
 *
 * While the fixture runs, GET requests under the code-queue proxy API are
 * intercepted: the overview, workdirs, queues and the fixture task's
 * trace-summary endpoints are answered from the fixture; every other request
 * (including all non-GET requests) is passed through. The route is always
 * removed again before returning, even when a wait times out.
 *
 * @param page Browser page used to load the frontend.
 * @param frontendUrl Base URL of the frontend; `/app/code-queue/` is appended.
 * @param mode `"live"` expects a non-failed "split-brain live" presentation;
 *   `"stale"` expects failed health / heartbeat-risk / stale-candidate metrics.
 * @returns `{ checked: true, mode, taskId, fixtureDiagnostics, requests,
 *   ...panel metrics, ok }`, where `ok` is the mode-specific verdict.
 */
async function runCodeQueueSplitBrainFixture(page: Page, frontendUrl: string, mode: "live" | "stale"): Promise<any> {
  const overview = codeQueueSplitBrainFixtureOverview(mode);
  const fixtureTask = overview.tasks[0];
  const taskId = String(fixtureTask?.id || "");
  const diagnostics = overview.queue.executionDiagnostics;
  const routePattern = "**/api/microservices/code-queue/proxy/api/**";
  const proxyPrefix = /^\/api\/microservices\/code-queue\/proxy/u;
  // Accept both the URL-encoded and the raw task id in the trace-summary path.
  const traceSummaryPaths = [
    `/api/tasks/${encodeURIComponent(taskId)}/trace-summary`,
    `/api/tasks/${taskId}/trace-summary`,
  ];
  const requests: string[] = [];

  // Maps a proxied API path to its fixture payload; undefined means "not mocked".
  const fixturePayloadFor = (path: string): unknown => {
    if (path === "/api/tasks/overview") {
      return overview;
    }
    if (path === "/api/workdirs") {
      return { ok: true, workdirs: [{ providerId: "D601-dev", executionMode: "default", path: "/workspace-dev", source: "fixture" }] };
    }
    if (path === "/api/queues") {
      return { ok: true, queues: overview.queue.queues, summary: overview.queue };
    }
    if (traceSummaryPaths.includes(path)) {
      return {
        ok: true,
        summary: {
          ...fixtureTask,
          finalResponse: "",
          attempts: [],
          prompt: {},
          execution: { durationMs: 600000, traceLineCount: 2, stepCount: mode === "live" ? 2 : 0 },
        },
      };
    }
    return undefined;
  };

  const handler = async (route: any, request: any): Promise<void> => {
    if (request.method() === "GET") {
      const url = new URL(request.url());
      const path = url.pathname.replace(proxyPrefix, "");
      // Every intercepted GET is recorded, whether or not it ends up mocked.
      requests.push(`${path}${url.search}`);
      const payload = fixturePayloadFor(path);
      if (payload !== undefined) {
        await route.fulfill({ status: 200, contentType: "application/json", body: JSON.stringify(payload) });
        return;
      }
    }
    await route.continue();
  };

  await page.route(routePattern, handler);
  try {
    await page.goto(`${frontendUrl}/app/code-queue/`, { waitUntil: "domcontentloaded", timeout: 15000 });
    await page.waitForSelector('[data-testid="code-queue-page"]', { timeout: 15000, state: "attached" });
    await page.waitForSelector('[data-testid="codex-liveness-diagnostics"]', { timeout: 15000 });
    // Wait until the panel shows the fixture's recommended action, i.e. the
    // mocked diagnostics have actually been rendered.
    await page.waitForFunction((expectedAction) => {
      const panel = document.querySelector(".codex-liveness-panel") as HTMLElement | null;
      return Boolean(panel && (panel.textContent || "").includes(String(expectedAction)));
    }, diagnostics.recommendedAction, { timeout: 15000 });

    // Runs inside the page: must not reference anything from the outer scope.
    const metrics = await page.evaluate(() => {
      const panelEl = document.querySelector(".codex-liveness-panel") as HTMLElement | null;
      const gridEl = document.querySelector('[data-testid="codex-liveness-diagnostics"]') as HTMLElement | null;
      const rows = Array.from(gridEl?.querySelectorAll(".codex-liveness-metric") || []) as HTMLElement[];
      const readMetric = (label: string): any => {
        const match = rows.find((candidate) => (candidate.querySelector("span")?.textContent || "").trim() === label) || null;
        return {
          className: String(match?.className || ""),
          label: match?.querySelector("span")?.textContent || "",
          value: match?.querySelector("strong")?.textContent || "",
          hint: match?.querySelector("code")?.textContent || "",
        };
      };
      const health = readMetric("健康状态");
      const effective = readMetric("Effective liveness");
      const livenessChips = Array.from(panelEl?.querySelectorAll(".codex-trace-status-chip.liveness") || []) as HTMLElement[];
      const advisoryEl = panelEl?.querySelector(".codex-liveness-advisory") as HTMLElement | null;
      const failedRows = rows.filter((candidate) => candidate.classList.contains("failed"));
      const warnRows = rows.filter((candidate) => candidate.classList.contains("warn") || candidate.classList.contains("degraded-live"));
      return {
        panelText: panelEl?.innerText || panelEl?.textContent || "",
        health,
        effective,
        heartbeatRisk: readMetric("Heartbeat risk"),
        staleCandidates: readMetric("Stale candidates"),
        livenessChipClasses: livenessChips.map((chip) => String(chip.className || "")),
        advisoryClass: String(advisoryEl?.className || ""),
        advisoryText: advisoryEl?.textContent || "",
        failedMetricCount: failedRows.length,
        healthFailed: health.className.includes("failed"),
        effectiveFailed: effective.className.includes("failed"),
        warns: warnRows.length,
      };
    });

    const panelMentions = (needle: string): boolean => metrics.panelText.includes(needle);
    let expectations: boolean[];
    if (mode === "live") {
      // Split-brain with fresh heartbeat must read as degraded-but-live, never failed.
      expectations = [
        metrics.health.value === "split-brain live",
        metrics.health.className.includes("degraded-live"),
        !metrics.health.className.includes("failed"),
        metrics.effective.value === "live",
        metrics.heartbeatRisk.value === "0",
        metrics.failedMetricCount === 0,
        panelMentions("continue-supervision"),
        panelMentions("控制面/执行面观测分裂"),
        panelMentions("heartbeat/trace 新鲜"),
        !/任务死亡|dead task|task death|必须重启|must restart/i.test(metrics.panelText),
      ];
    } else {
      // Split-brain with an expired heartbeat must surface as failed and name the task.
      expectations = [
        metrics.health.value === "split-brain",
        metrics.health.className.includes("failed"),
        metrics.effective.value === "at-risk",
        metrics.heartbeatRisk.value === "1",
        metrics.heartbeatRisk.className.includes("failed"),
        metrics.staleCandidates.value === "1",
        panelMentions("investigate-heartbeat-risk"),
        panelMentions(taskId),
        metrics.failedMetricCount >= 3,
      ];
    }
    const ok = expectations.every(Boolean);

    const fixtureDiagnostics = {
      state: diagnostics.state,
      splitBrainLive: diagnostics.splitBrainLive,
      effectiveLiveness: diagnostics.effectiveLiveness,
      recommendedAction: diagnostics.recommendedAction,
      heartbeatFreshTaskIds: diagnostics.heartbeatFreshTaskIds,
      heartbeatRiskTaskIds: diagnostics.heartbeatRiskTaskIds,
      traceGapNotStaleTaskIds: diagnostics.traceGapNotStaleTaskIds,
      staleRecoveryCandidateTaskIds: diagnostics.staleRecoveryCandidateTaskIds,
    };
    return {
      checked: true,
      mode,
      taskId,
      fixtureDiagnostics,
      requests,
      ...metrics,
      ok,
    };
  } finally {
    // Best-effort cleanup: a failure to unroute must not mask the real outcome.
    await page.unroute(routePattern, handler).catch(() => undefined);
  }
}
function runPsql(config: UniDeskConfig, sql: string): { ok: boolean; stdout: string; stderr: string; exitCode: number | null } {
const result = runCommand([
"docker",
@@ -2371,6 +2651,8 @@ async function frontendCheck(config: UniDeskConfig, urls: PublicUrls, checks: E2
"frontend:code-queue-error-red-markers",
"frontend:code-queue-stats-visible",
"frontend:code-queue-stats-degraded-visible",
"frontend:code-queue-split-brain-live-not-failed",
"frontend:code-queue-split-brain-stale-failed",
"frontend:code-queue-retry-attempt-trace-current",
"frontend:code-queue-step-missing-diagnostic",
"frontend:code-queue-judge-feedback-attempt-order",
@@ -2502,6 +2784,8 @@ async function frontendCheck(config: UniDeskConfig, urls: PublicUrls, checks: E2
let codeQueueErrorHighlightMetrics: any = { checked: false, candidateFound: false };
let codeQueueStatsVisibleMetrics: any = { checked: false };
let codeQueueStatsDegradedMetrics: any = { checked: false };
let codeQueueSplitBrainLiveMetrics: any = { checked: false };
let codeQueueSplitBrainStaleMetrics: any = { checked: false };
let codeQueueRetryTraceFixtureMetrics: any = { checked: false };
let claudeqqText = "";
let routeDeepLinkText = "";
@@ -3423,6 +3707,12 @@ async function frontendCheck(config: UniDeskConfig, urls: PublicUrls, checks: E2
if (wants("frontend:code-queue-stats-degraded-visible")) {
codeQueueStatsDegradedMetrics = await runCodeQueueStatsFixture(page, urls.frontendUrl, "degraded");
}
if (wants("frontend:code-queue-split-brain-live-not-failed")) {
codeQueueSplitBrainLiveMetrics = await runCodeQueueSplitBrainFixture(page, urls.frontendUrl, "live");
}
if (wants("frontend:code-queue-split-brain-stale-failed")) {
codeQueueSplitBrainStaleMetrics = await runCodeQueueSplitBrainFixture(page, urls.frontendUrl, "stale");
}
codeQueueOutputText = needCodeQueueFullSurface ? await page.locator('[data-testid="codex-output"]').innerText({ timeout: 5000 }) : "";
codeQueueText = await page.locator('[data-testid="code-queue-page"]').innerText({ timeout: 5000 });
codeQueueHtmlGuard = await page.evaluate(async () => {
@@ -4180,6 +4470,29 @@ async function frontendCheck(config: UniDeskConfig, urls: PublicUrls, checks: E2
codeQueueStatsDegradedMetrics.checked === true
&& codeQueueStatsDegradedMetrics.ok === true,
{ codeQueueStatsDegradedMetrics });
addSelectedCheck(checks, options, "frontend:code-queue-split-brain-live-not-failed",
codeQueueSplitBrainLiveMetrics.checked === true
&& codeQueueSplitBrainLiveMetrics.ok === true
&& codeQueueSplitBrainLiveMetrics.healthFailed === false
&& codeQueueSplitBrainLiveMetrics.health?.value === "split-brain live"
&& codeQueueSplitBrainLiveMetrics.health?.className.includes("degraded-live")
&& codeQueueSplitBrainLiveMetrics.effective?.value === "live"
&& codeQueueSplitBrainLiveMetrics.heartbeatRisk?.value === "0"
&& Number(codeQueueSplitBrainLiveMetrics.failedMetricCount || 0) === 0
&& String(codeQueueSplitBrainLiveMetrics.panelText || "").includes("continue-supervision")
&& String(codeQueueSplitBrainLiveMetrics.panelText || "").includes("控制面/执行面观测分裂")
&& !/任务死亡|dead task|task death|必须重启|must restart/i.test(String(codeQueueSplitBrainLiveMetrics.panelText || "")),
{ codeQueueSplitBrainLiveMetrics });
addSelectedCheck(checks, options, "frontend:code-queue-split-brain-stale-failed",
codeQueueSplitBrainStaleMetrics.checked === true
&& codeQueueSplitBrainStaleMetrics.ok === true
&& codeQueueSplitBrainStaleMetrics.healthFailed === true
&& codeQueueSplitBrainStaleMetrics.effective?.value === "at-risk"
&& codeQueueSplitBrainStaleMetrics.heartbeatRisk?.value === "1"
&& codeQueueSplitBrainStaleMetrics.heartbeatRisk?.className.includes("failed")
&& codeQueueSplitBrainStaleMetrics.staleCandidates?.value === "1"
&& String(codeQueueSplitBrainStaleMetrics.panelText || "").includes("investigate-heartbeat-risk"),
{ codeQueueSplitBrainStaleMetrics });
addSelectedCheck(checks, options, "frontend:code-queue-retry-attempt-trace-current",
codeQueueRetryTraceFixtureMetrics.checked === true
&& codeQueueRetryTraceFixtureMetrics.ok === true
@@ -4345,10 +4658,12 @@ export async function runE2E(
&& selectedChecks.every((name) => [
...(CODE_QUEUE_RETRY_TRACE_FIXTURE_CHECK_NAMES as readonly string[]),
...(CODE_QUEUE_STATS_FIXTURE_CHECK_NAMES as readonly string[]),
...(CODE_QUEUE_LIVENESS_UI_FIXTURE_CHECK_NAMES as readonly string[]),
].includes(name));
const needFrontend = wantsPrefix(options, "frontend") && !onlyRetryTraceFixture && !onlyCodeQueueBrowserFixtures;
const needRetryTraceFixture = wantsAnyCheck(options, [...CODE_QUEUE_RETRY_TRACE_FIXTURE_CHECK_NAMES]);
const needStatsFixture = wantsAnyCheck(options, [...CODE_QUEUE_STATS_FIXTURE_CHECK_NAMES]);
const needLivenessUiFixture = wantsAnyCheck(options, [...CODE_QUEUE_LIVENESS_UI_FIXTURE_CHECK_NAMES]);
const needCodeQueueFixtures = wantsAnyCheck(options, [...CODE_QUEUE_FIXTURE_CHECK_NAMES]);
const executedSections: string[] = [];
@@ -4376,7 +4691,7 @@ export async function runE2E(
if (needFrontend) {
executedSections.push("frontend");
frontend = await frontendCheck(config, urls, checks, options);
} else if (needRetryTraceFixture || needStatsFixture) {
} else if (needRetryTraceFixture || needStatsFixture || needLivenessUiFixture) {
executedSections.push("frontend-code-queue-browser-fixtures");
const browser = await chromium.launch({ headless: true });
try {
@@ -4394,6 +4709,8 @@ export async function runE2E(
const codeQueueRetryTraceFixtureMetrics = needRetryTraceFixture ? await runCodeQueueRetryTraceFixture(page, urls.frontendUrl) : { checked: false };
const codeQueueStatsVisibleMetrics = wantsCheck(options, "frontend:code-queue-stats-visible") ? await runCodeQueueStatsFixture(page, urls.frontendUrl, "visible") : { checked: false };
const codeQueueStatsDegradedMetrics = wantsCheck(options, "frontend:code-queue-stats-degraded-visible") ? await runCodeQueueStatsFixture(page, urls.frontendUrl, "degraded") : { checked: false };
const codeQueueSplitBrainLiveMetrics = wantsCheck(options, "frontend:code-queue-split-brain-live-not-failed") ? await runCodeQueueSplitBrainFixture(page, urls.frontendUrl, "live") : { checked: false };
const codeQueueSplitBrainStaleMetrics = wantsCheck(options, "frontend:code-queue-split-brain-stale-failed") ? await runCodeQueueSplitBrainFixture(page, urls.frontendUrl, "stale") : { checked: false };
addSelectedCheck(checks, options, "frontend:code-queue-stats-visible",
codeQueueStatsVisibleMetrics.checked === true
&& codeQueueStatsVisibleMetrics.ok === true,
@@ -4402,6 +4719,29 @@ export async function runE2E(
codeQueueStatsDegradedMetrics.checked === true
&& codeQueueStatsDegradedMetrics.ok === true,
{ codeQueueStatsDegradedMetrics });
addSelectedCheck(checks, options, "frontend:code-queue-split-brain-live-not-failed",
codeQueueSplitBrainLiveMetrics.checked === true
&& codeQueueSplitBrainLiveMetrics.ok === true
&& codeQueueSplitBrainLiveMetrics.healthFailed === false
&& codeQueueSplitBrainLiveMetrics.health?.value === "split-brain live"
&& codeQueueSplitBrainLiveMetrics.health?.className.includes("degraded-live")
&& codeQueueSplitBrainLiveMetrics.effective?.value === "live"
&& codeQueueSplitBrainLiveMetrics.heartbeatRisk?.value === "0"
&& Number(codeQueueSplitBrainLiveMetrics.failedMetricCount || 0) === 0
&& String(codeQueueSplitBrainLiveMetrics.panelText || "").includes("continue-supervision")
&& String(codeQueueSplitBrainLiveMetrics.panelText || "").includes("控制面/执行面观测分裂")
&& !/任务死亡|dead task|task death|必须重启|must restart/i.test(String(codeQueueSplitBrainLiveMetrics.panelText || "")),
{ codeQueueSplitBrainLiveMetrics });
addSelectedCheck(checks, options, "frontend:code-queue-split-brain-stale-failed",
codeQueueSplitBrainStaleMetrics.checked === true
&& codeQueueSplitBrainStaleMetrics.ok === true
&& codeQueueSplitBrainStaleMetrics.healthFailed === true
&& codeQueueSplitBrainStaleMetrics.effective?.value === "at-risk"
&& codeQueueSplitBrainStaleMetrics.heartbeatRisk?.value === "1"
&& codeQueueSplitBrainStaleMetrics.heartbeatRisk?.className.includes("failed")
&& codeQueueSplitBrainStaleMetrics.staleCandidates?.value === "1"
&& String(codeQueueSplitBrainStaleMetrics.panelText || "").includes("investigate-heartbeat-risk"),
{ codeQueueSplitBrainStaleMetrics });
addSelectedCheck(checks, options, "frontend:code-queue-retry-attempt-trace-current",
codeQueueRetryTraceFixtureMetrics.checked === true
&& codeQueueRetryTraceFixtureMetrics.ok === true
File diff suppressed because one or more lines are too long
+22 -4
View File
@@ -2173,13 +2173,18 @@ input:focus, select:focus, textarea:focus { border-color: var(--accent-2); }
border-color: rgba(78, 183, 168, 0.50);
color: var(--accent-2);
}
.codex-trace-status-chip.liveness.warn {
.codex-trace-status-chip.liveness.warn,
.codex-trace-status-chip.liveness.degraded-live {
border-color: rgba(215, 161, 58, 0.55);
color: #ffe0a2;
background:
linear-gradient(135deg, rgba(215, 161, 58, 0.13), rgba(78, 183, 168, 0.06)),
rgba(0,0,0,0.18);
}
.codex-trace-status-chip.liveness.degraded-live {
border-color: rgba(215, 161, 58, 0.62);
box-shadow: inset 0 0 0 1px rgba(78, 183, 168, 0.10);
}
.codex-trace-status-chip.liveness.failed {
border-color: rgba(255, 98, 98, 0.58);
color: #ffb2b2;
@@ -2204,9 +2209,15 @@ input:focus, select:focus, textarea:focus { border-color: var(--accent-2); }
linear-gradient(135deg, rgba(78, 183, 168, 0.08), rgba(255,255,255,0.015)),
rgba(0,0,0,0.16);
}
.codex-liveness-metric.warn {
.codex-liveness-metric.warn,
.codex-liveness-metric.degraded-live {
border-color: rgba(215, 161, 58, 0.44);
}
.codex-liveness-metric.degraded-live {
background:
linear-gradient(135deg, rgba(215, 161, 58, 0.12), rgba(78, 183, 168, 0.05)),
rgba(0,0,0,0.16);
}
.codex-liveness-metric.failed {
border-color: rgba(255, 98, 98, 0.46);
}
@@ -2252,16 +2263,23 @@ input:focus, select:focus, textarea:focus { border-color: var(--accent-2); }
min-width: 0;
overflow-wrap: anywhere;
}
.codex-liveness-advisory.warn {
.codex-liveness-advisory.warn,
.codex-liveness-advisory.degraded-live {
border-color: rgba(215, 161, 58, 0.42);
background: rgba(215, 161, 58, 0.07);
}
.codex-liveness-advisory.warn b { color: var(--warn); }
.codex-liveness-advisory.warn b,
.codex-liveness-advisory.degraded-live b { color: var(--warn); }
.codex-liveness-advisory.failed {
border-color: rgba(255, 98, 98, 0.46);
background: rgba(207, 106, 84, 0.09);
}
.codex-liveness-advisory.failed b { color: #ffb2b2; }
.codex-liveness-advisory code {
max-width: 100%;
color: var(--text);
overflow-wrap: anywhere;
}
.codex-liveness-reasons {
display: flex;
flex-wrap: wrap;
+12 -6
View File
@@ -249,11 +249,12 @@ function diagnosticsHeartbeatRiskTaskIds(diagnostics: any): string[] {
}
function splitBrainLiveDiagnostics(diagnostics: any): boolean {
if (diagnosticsHeartbeatRiskTaskIds(diagnostics).length > 0) return false;
if (typeof diagnostics?.splitBrainLive === "boolean") return diagnostics.splitBrainLive;
const state = String(diagnostics?.state || diagnostics?.health || "").toLowerCase();
const effective = String(diagnostics?.effectiveLiveness || "").toLowerCase();
return state === "split-brain"
&& stringArray(diagnostics?.heartbeatFreshTaskIds).length > 0
&& diagnosticsHeartbeatRiskTaskIds(diagnostics).length === 0;
&& (effective === "live" || stringArray(diagnostics?.heartbeatFreshTaskIds).length > 0);
}
function diagnosticsEffectiveLiveness(diagnostics: any): string {
@@ -267,11 +268,15 @@ function diagnosticsEffectiveLiveness(diagnostics: any): string {
function diagnosticsTone(diagnostics: any): string {
const effective = diagnosticsEffectiveLiveness(diagnostics);
const heartbeatRiskTaskIds = diagnosticsHeartbeatRiskTaskIds(diagnostics);
const state = String(diagnostics?.state || diagnostics?.health || "unknown").toLowerCase();
if (heartbeatRiskTaskIds.length > 0 || effective === "at-risk" || state === "stale-active") return "failed";
if (splitBrainLiveDiagnostics(diagnostics)) return "degraded-live";
if (effective === "live" || effective === "degraded") return "warn";
if (effective === "at-risk") return "failed";
if (["dead", "failed", "stale", "unhealthy"].includes(effective)) return "failed";
const value = String(diagnostics?.state || diagnostics?.health || "unknown").toLowerCase();
if (value === "healthy") return "ok";
if (value === "stale-active") return "failed";
if (value === "split-brain") return "warn";
if (value === "degraded") return "warn";
return "unknown";
}
@@ -1878,7 +1883,7 @@ function CodeQueueLivenessPanel({ diagnostics, queue, onRaw }: AnyRecord) {
const effectiveLiveness = diagnosticsEffectiveLiveness(diagnostics);
const recommendedAction = String(diagnostics?.recommendedAction || (heartbeatRiskTaskIds.length > 0 ? "investigate-heartbeat-risk" : splitBrainLive ? "continue-supervision" : effectiveLiveness === "degraded" ? "observe-degraded" : "none"));
const livenessText = String(splitBrainLive
? "执行面 heartbeat 新鲜,任务仍应继续监督。"
? "控制面/执行面观测分裂,但 heartbeat/trace 新鲜,继续监督。"
: heartbeatRiskTaskIds.length > 0
? "存在 expired/missing/stale heartbeat 风险,请先确认执行面状态。"
: diagnostics?.livenessSummary || (effectiveLiveness === "degraded"
@@ -1906,7 +1911,7 @@ function CodeQueueLivenessPanel({ diagnostics, queue, onRaw }: AnyRecord) {
h(LivenessMetric, { label: "PostgreSQL active", value: String(diagnostics?.databaseActiveTaskCount ?? queue?.databaseActiveTaskCount ?? 0), hint: compactIdList(diagnostics?.databaseActiveTaskIds ?? queue?.databaseActiveTaskIds) }),
h(LivenessMetric, { label: "Scheduler active", value: String(diagnostics?.schedulerActiveRunSlotCount ?? queue?.activeRunSlotCount ?? 0), hint: compactIdList(diagnostics?.schedulerActiveTaskIds ?? queue?.activeTaskIds) }),
h(LivenessMetric, { label: "Fresh heartbeat", value: String(stringArray(diagnostics?.heartbeatFreshTaskIds).length), hint: compactIdList(diagnostics?.heartbeatFreshTaskIds) }),
h(LivenessMetric, { tone: heartbeatRiskTaskIds.length > 0 ? "failed" : splitBrainLive ? "warn" : "", label: "Heartbeat risk", value: String(heartbeatRiskTaskIds.length), hint: heartbeatRiskTaskIds.length > 0 ? compactIdList(heartbeatRiskTaskIds) : splitBrainLive ? "fresh heartbeat: keep supervising" : "--" }),
h(LivenessMetric, { tone: heartbeatRiskTaskIds.length > 0 ? "failed" : splitBrainLive ? "degraded-live" : "", label: "Heartbeat risk", value: String(heartbeatRiskTaskIds.length), hint: heartbeatRiskTaskIds.length > 0 ? compactIdList(heartbeatRiskTaskIds) : splitBrainLive ? "fresh heartbeat/trace: keep supervising" : "--" }),
h(LivenessMetric, { tone: stringArray(diagnostics?.traceGapNotStaleTaskIds).length > 0 ? "warn" : "", label: "Trace gap", value: String(stringArray(diagnostics?.traceGapTaskIds).length), hint: compactIdList(diagnostics?.traceGapNotStaleTaskIds) }),
h(LivenessMetric, { tone: stringArray(diagnostics?.staleRecoveryCandidateTaskIds).length > 0 ? "failed" : "", label: "Stale candidates", value: String(stringArray(diagnostics?.staleRecoveryCandidateTaskIds).length), hint: compactIdList(diagnostics?.staleRecoveryCandidateTaskIds) }),
h(LivenessMetric, { label: "Last scheduler heartbeat", value: fmtRelativeAge(diagnostics?.lastSchedulerHeartbeatAt), hint: String(diagnostics?.lastSchedulerHeartbeatAt || "--") }),
@@ -1917,6 +1922,7 @@ function CodeQueueLivenessPanel({ diagnostics, queue, onRaw }: AnyRecord) {
h("div", { className: `codex-liveness-advisory ${tone}` },
h("b", null, splitBrainLive ? "Observing split" : heartbeatRiskTaskIds.length > 0 ? "Heartbeat risk" : "Liveness note"),
h("span", null, livenessText),
h("code", null, recommendedAction),
),
reasons.length > 0 ? h("div", { className: "codex-liveness-reasons" }, reasons.map((reason: string) => h("span", { key: reason }, reason))) : null,
);
@@ -55,6 +55,9 @@ COPY package.json /app/package.json
COPY bun.lock /app/bun.lock
RUN bun install
COPY src/components/shared /app/src/components/shared
COPY src/components/frontend/package.json /app/src/components/frontend/package.json
WORKDIR /app/src/components/frontend
RUN test -d node_modules/react || bun install
WORKDIR /app/src/components/microservices/code-queue
COPY src/components/microservices/code-queue/tsconfig.json ./tsconfig.json
COPY src/components/microservices/code-queue/src ./src