// Listing metadata: 659 lines, 30 KiB, TypeScript.
import { type UniDeskConfig } from "./config";
import { coreInternalFetch } from "./microservices";
import { debugDispatch, debugHealth } from "./debug";
import { runArtifactRegistryCommand } from "./artifact-registry";
import { runCodeQueueCommand } from "./code-queue";
import { classifyRunnerError } from "../../src/components/microservices/code-queue/src/runner-error-classifier";

/** Subsystem / observation path that a triage signal is attributed to. */
export type ProviderSignalScope =
  | "runner-local"
  | "provider-gateway"
  | "ssh"
  | "registry"
  | "k3s"
  | "scheduler"
  | "external-provider"
  | "service-proxy"
  | "microservice"
  | "unknown";

/** Health verdict carried by a single triage signal. */
export type ProviderSignalStatus = "ok" | "degraded" | "failed" | "unknown";

/** How strongly the combined signals block further work; only "global-blocker" is non-retryable. */
export type ProviderBlockingDisposition =
  | "transient"
  | "runner-local-observation-gap"
  | "external-provider-backoff"
  | "provider-degraded"
  | "service-degraded"
  | "global-blocker";

/** Top-level triage outcome derived from the blocking disposition and the observed signals. */
export type ProviderTriageDecision =
  | "healthy"
  | "retryable-transient"
  | "service-degraded"
  | "global-offline";

/** One observation about a provider, produced by a single probe or by a user-supplied error. */
export interface ProviderTriageSignal {
  /** Stable identifier of the probe that produced the signal (e.g. "host-ssh-probe"). */
  id: string;
  /** Subsystem the observation is attributed to. */
  scope: ProviderSignalScope;
  /** Health verdict of this observation. */
  status: ProviderSignalStatus;
  /** Whether the signal counts as independent evidence when scopes are aggregated. */
  independentPath: boolean;
  /** ISO-8601 timestamp of when the signal object was created. */
  observedAt: string;
  /** One-line human-readable description. */
  summary: string;
  /** Optional probe-specific payload backing the summary. */
  evidence?: unknown;
}

/** Aggregated verdict computed from a set of triage signals. */
export interface ProviderTriageClassification {
  /** Primary scope the verdict is attributed to. */
  scope: ProviderSignalScope;
  /** Top-level outcome. */
  decision: ProviderTriageDecision;
  /** ISO-8601 timestamp of the triage run. */
  observedAt: string;
  /** False only when the disposition is "global-blocker". */
  retryable: boolean;
  /** CLI commands suggested for manual confirmation. */
  recommendedCrossChecks: string[];
  /** How strongly the signals block further work. */
  blockingDisposition: ProviderBlockingDisposition;
  /** Human-readable reasons behind the disposition. */
  rationale: string[];
  /** Scopes with at least one failed signal (independent or not). */
  failedScopes: ProviderSignalScope[];
  /** Scopes with at least one degraded signal (independent or not). */
  degradedScopes: ProviderSignalScope[];
  /** Scopes with at least one ok signal on an independent path. */
  healthyScopes: ProviderSignalScope[];
  /** Failed scopes on independent paths, excluding "runner-local". */
  failedIndependentScopes: ProviderSignalScope[];
  /** Scopes with at least one ok signal on an independent path. */
  healthyIndependentScopes: ProviderSignalScope[];
}

/** Full triage output: the classification plus the raw signals and the fixed decision contract. */
export interface ProviderTriageResult extends ProviderTriageClassification {
  /** False only when the disposition is "global-blocker". */
  ok: boolean;
  /** Provider the triage was run for. */
  providerId: string;
  /** Every signal that fed the classification, in collection order. */
  signals: ProviderTriageSignal[];
  /** Invariants of the classifier, exposed as literal flags for consumers. */
  contract: {
    singlePathProviderOfflineIsGlobalBlocker: false;
    globalBlockerRequiresIndependentCriticalFailures: true;
  };
}

/** Plain object with string keys and not-yet-validated values. */
type JsonRecord = Record<string, unknown>;

/** Scopes whose independent failures count toward a "global-blocker" disposition. */
const criticalScopes = new Set<ProviderSignalScope>(["provider-gateway", "ssh", "scheduler", "k3s"]);
/** Prefix of every CLI command string suggested in triage output. */
const commandPrefix = "bun scripts/cli.ts";

/**
 * Narrows an unknown value to a plain record.
 * @returns the value when it is a non-null, non-array object; otherwise `null`.
 */
function asRecord(value: unknown): JsonRecord | null {
  return typeof value === "object" && value !== null && !Array.isArray(value) ? value as JsonRecord : null;
}

/** Returns the value itself when it is an array, otherwise a new empty array. */
function asArray(value: unknown): unknown[] {
  return Array.isArray(value) ? value : [];
}

/** Returns the value when it is a string, otherwise the empty string. */
function text(value: unknown): string {
  return typeof value === "string" ? value : "";
}

/** True only for the literal boolean `true`; truthy non-boolean values yield false. */
function bool(value: unknown): boolean {
  return value === true;
}

/** Whether `name` appears verbatim anywhere in the argument list. */
function hasFlag(args: string[], name: string): boolean {
  return args.includes(name);
}

/**
 * Wraps a value in single quotes for a POSIX shell.
 * Each embedded single quote is emitted as `'\''` (close quote, escaped quote, reopen quote).
 */
function shellQuote(value: string): string {
  return `'${value.replace(/'/g, `'\\''`)}'`;
}

/** Stringifies a value (null/undefined become "") and lower-cases the result. */
function lower(value: unknown): string {
  return String(value ?? "").toLowerCase();
}

/** Current time as an ISO-8601 UTC string (`Date.prototype.toISOString`). */
function isoNow(): string {
  return new Date().toISOString();
}

/**
 * Builds a triage signal stamped with the current time.
 * @param id probe identifier
 * @param scope subsystem the observation belongs to
 * @param status health verdict
 * @param summary one-line description
 * @param evidence optional payload backing the summary
 * @param independentPath whether the signal counts as independent evidence (default true)
 */
function signal(
  id: string,
  scope: ProviderSignalScope,
  status: ProviderSignalStatus,
  summary: string,
  evidence?: unknown,
  independentPath = true,
): ProviderTriageSignal {
  return { id, scope, status, independentPath, observedAt: isoNow(), summary, evidence };
}

/** True when the value is a plain record whose `ok` property is exactly `true`. */
function isOkEnvelope(value: unknown): boolean {
  return asRecord(value)?.ok === true;
}

/** Returns `value.body` as a record, or `null` when either level is not a plain object. */
function bodyOf(value: unknown): JsonRecord | null {
  return asRecord(asRecord(value)?.body);
}

/**
 * Finds the first record in `items` whose `providerId` equals the given id.
 * Non-array input and non-record entries are ignored.
 * @returns the matching record, or `null` when there is none.
 */
function findByProvider(items: unknown, providerId: string): JsonRecord | null {
  return asArray(items)
    .map(asRecord)
    .find((item): item is JsonRecord => item !== null && item.providerId === providerId) ?? null;
}

/**
 * Derives the "backend-core-node" signal from the debug-health payload.
 *
 * Reads `debug.nodesInternal.body.nodes` and looks up the node for `providerId`:
 * - no node found -> "unknown"
 * - status "online" with a non-empty string `lastHeartbeat` -> "ok"
 * - status "online" without a heartbeat -> "degraded"
 * - any other status -> "failed"
 */
function providerGatewaySignal(debug: unknown, providerId: string): ProviderTriageSignal {
  const nodesInternal = asRecord(debug)?.nodesInternal;
  const node = findByProvider(bodyOf(nodesInternal)?.nodes, providerId);
  if (node === null) {
    return signal("backend-core-node", "provider-gateway", "unknown", `backend-core node view has no provider ${providerId}`, {
      nodesInternal,
    });
  }
  const labels = asRecord(node.labels) ?? {};
  const capabilities = asArray(labels.unideskCapabilities).map((item) => String(item));
  const online = node.status === "online";
  const hasHeartbeat = typeof node.lastHeartbeat === "string" && node.lastHeartbeat.length > 0;
  let status: ProviderSignalStatus;
  if (!online) status = "failed";
  else status = hasHeartbeat ? "ok" : "degraded";
  return signal("backend-core-node", "provider-gateway", status, `backend-core node status=${node.status ?? "unknown"} lastHeartbeat=${node.lastHeartbeat ?? "null"}`, {
    providerId: node.providerId,
    name: node.name,
    status: node.status,
    connectedAt: node.connectedAt,
    lastHeartbeat: node.lastHeartbeat,
    providerGatewayVersion: labels.providerGatewayVersion ?? null,
    hostSshConfigured: labels.hostSshConfigured ?? null,
    hostSshKeyPresent: labels.hostSshKeyPresent ?? null,
    sshDataPool: {
      transport: labels.providerGatewaySshDataTransport ?? null,
      desired: labels.providerGatewaySshDataPoolDesired ?? null,
      ready: labels.providerGatewaySshDataPoolReady ?? null,
      claimed: labels.providerGatewaySshDataPoolClaimed ?? null,
    },
    egressProxy: {
      enabled: labels.providerGatewayEgressProxy ?? null,
      connected: labels.providerGatewayEgressProxyConnected ?? null,
      activeTunnels: labels.providerGatewayEgressProxyActiveTunnels ?? null,
      pendingTunnels: labels.providerGatewayEgressProxyPendingTunnels ?? null,
      staleTunnels: labels.providerGatewayEgressProxyStaleTunnels ?? null,
      oldestTunnelAgeMs: labels.providerGatewayEgressProxyOldestTunnelAgeMs ?? null,
    },
    capabilities,
  });
}

/**
 * Derives the "backend-core-system-status" signal from the debug-health payload.
 *
 * Reads `debug.systemStatusInternal.body.systemStatuses` and looks up the entry for `providerId`:
 * - no entry -> "unknown"
 * - no `current` sample -> "degraded" when the entry is stale, else "unknown"
 * - `current.ok === false` -> "degraded"
 * - otherwise -> "ok"
 */
function systemStatusSignal(debug: unknown, providerId: string): ProviderTriageSignal {
  const items = bodyOf(asRecord(debug)?.systemStatusInternal)?.systemStatuses;
  const item = findByProvider(items, providerId);
  if (item === null) return signal("backend-core-system-status", "provider-gateway", "unknown", `no system status sample for ${providerId}`);
  const current = asRecord(item.current);
  const currentOk = current === null ? null : current.ok;
  const stale = item.stale === true;
  let status: ProviderSignalStatus;
  if (current === null) status = stale ? "degraded" : "unknown";
  else status = currentOk === false ? "degraded" : "ok";
  return signal("backend-core-system-status", "provider-gateway", status, `system status current.ok=${String(currentOk)} stale=${String(stale)} updatedAt=${item.updatedAt ?? "null"}`, {
    providerId: item.providerId,
    nodeStatus: item.nodeStatus,
    updatedAt: item.updatedAt,
    currentCollectedAt: item.currentCollectedAt ?? null,
    stale,
    staleSeconds: item.staleSeconds ?? null,
    current: current === null ? null : {
      ok: current.ok,
      collectedAt: current.collectedAt,
      cpu: current.cpu,
      memory: current.memory,
      disk: current.disk,
    },
    historyCount: item.historyCount ?? null,
  });
}

/**
 * Derives the "host-ssh-probe" signal from a debug-dispatch result.
 *
 * - `wait.task.status === "succeeded"` with exit code 0 or no exit code -> "ok"
 * - dispatch accepted (ok envelope carrying a taskId) but `wait.ok === false` -> "unknown"
 * - anything else -> "failed"
 *
 * @param result value returned by the host.ssh dispatch
 * @param providerId provider under triage; echoed into non-ok evidence
 */
function sshSignal(result: unknown, providerId: string): ProviderTriageSignal {
  const record = asRecord(result);
  const dispatchBody = bodyOf(record?.dispatch);
  const dispatchOk = isOkEnvelope(record?.dispatch) && dispatchBody?.taskId !== undefined;
  const wait = asRecord(record?.wait);
  const task = asRecord(wait?.task);
  const taskResult = asRecord(task?.result);
  const taskStatus = text(task?.status);
  // A result object without an exitCode field is treated like a missing result (null),
  // so a succeeded task is not reported as failed only because exitCode is undefined.
  const exitCode = taskResult?.exitCode ?? null;
  if (taskStatus === "succeeded" && (exitCode === 0 || exitCode === null)) {
    return signal("host-ssh-probe", "ssh", "ok", "host.ssh short probe succeeded", {
      taskId: dispatchBody?.taskId ?? null,
      taskStatus,
      exitCode,
      probeLine: taskResult?.probeLine ?? null,
      stdoutPreview: text(taskResult?.stdout).slice(0, 500),
    });
  }
  if (dispatchOk && wait?.ok === false) {
    return signal("host-ssh-probe", "ssh", "unknown", "host.ssh dispatch accepted but wait did not reach terminal state", {
      providerId,
      taskId: dispatchBody?.taskId ?? null,
      wait,
    });
  }
  return signal("host-ssh-probe", "ssh", "failed", "host.ssh short probe failed", {
    providerId,
    result,
  });
}

/**
 * Derives the "artifact-registry-health" signal from the registry health command result.
 *
 * - non-object result -> "unknown"
 * - `ok === true` and `healthy !== false` -> "ok"
 * - container running, loopback-only and v2 endpoint ok -> "degraded"
 * - `ok === false` -> "failed"
 * - otherwise -> "degraded"
 */
function registrySignal(result: unknown): ProviderTriageSignal {
  const record = asRecord(result);
  if (record === null) return signal("artifact-registry-health", "registry", "unknown", "artifact registry health returned non-object", result);
  const checks = asRecord(record.checks) ?? {};
  const runtimeApiHealthy = checks.containerRunning === true && checks.loopbackOnly === true && checks.v2Ok === true;
  let status: ProviderSignalStatus;
  if (record.ok === true && record.healthy !== false) status = "ok";
  else if (runtimeApiHealthy) status = "degraded";
  else if (record.ok === false) status = "failed";
  else status = "degraded";
  const summary = [
    `artifact registry health ok=${String(record.ok)}`,
    `healthy=${String(record.healthy)}`,
    `unitActive=${String(checks.unitActive)}`,
    `containerRunning=${String(checks.containerRunning)}`,
    `loopbackOnly=${String(checks.loopbackOnly)}`,
    `v2Ok=${String(checks.v2Ok)}`,
  ].join(" ");
  return signal("artifact-registry-health", "registry", status, summary, {
    ok: record.ok,
    installed: record.installed ?? null,
    healthy: record.healthy ?? null,
    checks: record.checks ?? null,
    observed: record.observed ?? null,
    command: record.command ?? null,
  });
}

/**
 * Derives a `<serviceId>-health` signal from a microservice health response envelope.
 *
 * - `response.ok === true` and `response.body.ok !== false` -> "ok"
 * - `response.ok === false` -> "failed"
 * - otherwise (including a non-object response) -> "degraded"
 *
 * @param serviceId microservice identifier, used in the signal id and summary
 * @param scope scope the signal is attributed to
 * @param response health response envelope
 */
function microserviceHealthSignal(serviceId: string, scope: ProviderSignalScope, response: unknown): ProviderTriageSignal {
  const record = asRecord(response);
  const body = bodyOf(response);
  let status: ProviderSignalStatus;
  if (record?.ok === true && body?.ok !== false) status = "ok";
  else status = record?.ok === false ? "failed" : "degraded";
  const upstreamStatus = record?.status ?? null;
  return signal(`${serviceId}-health`, scope, status, `${serviceId} health upstream ok=${String(record?.ok)} status=${String(upstreamStatus)} body.ok=${String(body?.ok)}`, {
    upstream: { ok: record?.ok ?? null, status: upstreamStatus },
    body,
    fallback: {
      exitCode: record?.exitCode ?? null,
      stderrTail: record?.stderrTail ?? null,
      stdoutTail: record?.stdoutTail ?? null,
    },
  });
}

/**
 * Derives the "code-queue-health" signal from the Code Queue `dev-ready` command result.
 *
 * - non-object response -> "unknown"
 * - `upstream` present and `devReady.ok !== false` -> "ok"
 * - `devReady.ok === false` -> "degraded"
 * - otherwise -> "unknown"
 */
function codeQueueSchedulerSignal(response: unknown): ProviderTriageSignal {
  const record = asRecord(response);
  if (record === null) return signal("code-queue-health", "scheduler", "unknown", "Code Queue health returned non-object", response);
  const devReady = asRecord(record.devReady);
  const devReadyFailed = devReady?.ok === false;
  let status: ProviderSignalStatus;
  if (record.upstream !== undefined && !devReadyFailed) status = "ok";
  else status = devReadyFailed ? "degraded" : "unknown";
  return signal("code-queue-health", "scheduler", status, `code-queue dev-ready ok=${String(devReady?.ok)} missingTools=${JSON.stringify(devReady?.missingTools ?? [])}`, {
    upstream: record.upstream ?? null,
    devReady,
    commands: record.commands ?? null,
  });
}

/**
 * Derives the "code-queue-task-heartbeat" signal from the Code Queue supervisor task view.
 *
 * Reads `response.supervisor.executionDiagnostics.effectiveLiveness`:
 * - diagnostics missing -> "unknown"
 * - "healthy" or "live" -> "ok"
 * - "at-risk" -> "degraded"
 * - anything else -> "unknown"
 */
function codeQueueTasksSignal(response: unknown): ProviderTriageSignal {
  const supervisor = asRecord(asRecord(response)?.supervisor);
  const diagnostics = asRecord(supervisor?.executionDiagnostics);
  if (diagnostics === null) return signal("code-queue-task-heartbeat", "scheduler", "unknown", "Code Queue task heartbeat diagnostics unavailable", response);
  const effectiveLiveness = text(diagnostics.effectiveLiveness);
  let status: ProviderSignalStatus;
  if (effectiveLiveness === "healthy" || effectiveLiveness === "live") status = "ok";
  else status = effectiveLiveness === "at-risk" ? "degraded" : "unknown";
  return signal("code-queue-task-heartbeat", "scheduler", status, `Code Queue executionDiagnostics effectiveLiveness=${effectiveLiveness || "unknown"}`, {
    executionDiagnostics: diagnostics,
    commands: asRecord(supervisor?.commands) ?? null,
  });
}

/**
 * Maps a free-form error message to a signal scope.
 *
 * The runner error classifier is consulted first; its scope is used as-is when it is one of
 * the pass-through scopes below. Otherwise the lower-cased message is matched against keyword
 * patterns in order, the first match wins, and "unknown" is the fallback.
 */
function classifyErrorMessage(message: string): ProviderSignalScope {
  const runnerScope = classifyRunnerError(message).scope;
  const passThroughScopes: readonly ProviderSignalScope[] = ["external-provider", "provider-gateway", "registry", "k3s", "scheduler", "runner-local"];
  const passThrough = passThroughScopes.find((scope) => scope === runnerScope);
  if (passThrough !== undefined) return passThrough;

  const normalized = message.toLowerCase();
  // Order matters: a message that only says a provider is offline is a runner-local
  // observation and is checked before the more specific subsystem keywords.
  const keywordRules: ReadonlyArray<readonly [RegExp, ProviderSignalScope]> = [
    [/provider is not online|provider .*offline|provider .*not online/u, "runner-local"],
    [/ssh|host\.ssh/u, "ssh"],
    [/registry|artifact/u, "registry"],
    [/k3s|kubectl|kubernetes/u, "k3s"],
    [/scheduler|code queue|codex/u, "scheduler"],
    [/proxy|tunnel|microservice\.http/u, "service-proxy"],
    [/microservice|service health/u, "microservice"],
  ];
  for (const [pattern, scope] of keywordRules) {
    if (pattern.test(normalized)) return scope;
  }
  return "unknown";
}

/**
 * Wraps a caller-supplied error message as a failed "observed-error" signal.
 * The signal is marked non-independent when its scope is "runner-local".
 */
function observedErrorSignal(message: string, scope: ProviderSignalScope): ProviderTriageSignal {
  const evidence = { message, runnerErrorClassification: classifyRunnerError(message) };
  return signal("observed-error", scope, "failed", message, evidence, scope !== "runner-local");
}

/**
 * Summarises a list as de-duplicated, non-empty strings capped at `limit` entries.
 * @param value expected to be an array; anything else is treated as empty
 * @param limit maximum number of items returned (default 6)
 * @returns `{ items, count, truncated, omitted }`, where `count` is the de-duplicated total
 */
function compactStringList(value: unknown, limit = 6): Record<string, unknown> {
  const all = Array.from(new Set(asArray(value).map((item) => String(item ?? "")).filter(Boolean)));
  return {
    items: all.slice(0, limit),
    count: all.length,
    truncated: all.length > limit,
    omitted: Math.max(0, all.length - limit),
  };
}

/**
 * Reduces a signal's evidence payload to a bounded, fixed-shape summary.
 *
 * Non-record values are returned unchanged. For records, a fixed set of fields is picked from
 * the record itself, falling back to `record.body` for some of them; `missingTools` and the
 * heartbeat task-id lists are compacted with `compactStringList`.
 */
function compactEvidence(value: unknown): unknown {
  const record = asRecord(value);
  if (record === null) return value;
  const body = asRecord(record.body);
  const devReady = asRecord(record.devReady) ?? asRecord(body?.devReady);
  const diagnostics = asRecord(record.executionDiagnostics) ?? asRecord(body?.executionDiagnostics) ?? asRecord(record.diagnostics);
  // When there is no explicit upstream but a body exists, the record is itself the envelope.
  const envelopeUpstream = body === null ? null : { ok: record.ok ?? null, status: record.status ?? null };
  return {
    upstream: record.upstream ?? envelopeUpstream,
    status: record.status ?? body?.status ?? null,
    ok: record.ok ?? body?.ok ?? null,
    serviceId: record.serviceId ?? body?.serviceId ?? null,
    providerGatewayVersion: record.providerGatewayVersion ?? null,
    hostSshConfigured: record.hostSshConfigured ?? null,
    taskId: record.taskId ?? null,
    taskStatus: record.taskStatus ?? null,
    exitCode: record.exitCode ?? null,
    devReady: devReady === null ? null : {
      ok: devReady.ok ?? null,
      missingTools: compactStringList(devReady.missingTools),
    },
    executionDiagnostics: diagnostics === null ? null : {
      state: diagnostics.state ?? null,
      effectiveLiveness: diagnostics.effectiveLiveness ?? null,
      recommendedAction: diagnostics.recommendedAction ?? null,
      splitBrainLive: diagnostics.splitBrainLive ?? null,
      heartbeatFreshTaskIds: compactStringList(diagnostics.heartbeatFreshTaskIds),
      heartbeatRiskTaskIds: compactStringList(diagnostics.heartbeatRiskTaskIds),
    },
    fallback: record.fallback ?? null,
    error: record.error ?? null,
  };
}

/**
 * Rebuilds the CLI invocation for the same triage with a different output mode.
 *
 * Any existing `--full` / `--raw` is dropped and `mode` is appended. Values of value-taking
 * options are always single-quoted. The provider id and other tokens are single-quoted only
 * when they contain characters outside a conservative shell-safe set, so the printed command
 * can be pasted into a shell as-is. A value-taking option with no following value is omitted.
 */
function providerTriageCommand(providerId: string, args: string[], mode: "--full" | "--raw"): string {
  const valueOptions = new Set(["--observed-error", "--observed-scope", "--microservice", "--service", "--microservices"]);
  const shellSafeToken = /^[A-Za-z0-9_.,:\/=@%+-]+$/u;
  const quoteIfNeeded = (token: string): string => (shellSafeToken.test(token) ? token : shellQuote(token));
  const kept: string[] = [];
  for (let index = 0; index < args.length; index += 1) {
    const arg = args[index] ?? "";
    if (arg === "" || arg === "--full" || arg === "--raw") continue;
    if (valueOptions.has(arg)) {
      const value = args[index + 1];
      if (value !== undefined) {
        kept.push(arg, shellQuote(value));
        index += 1;
      }
      continue;
    }
    kept.push(quoteIfNeeded(arg));
  }
  return [`${commandPrefix} provider triage ${quoteIfNeeded(providerId)}`, ...kept, mode].join(" ");
}

/**
 * Produces the default compact CLI output for a triage result.
 *
 * Only failed / degraded / unknown signals are listed, ordered failed -> degraded -> unknown
 * (original order preserved within a status) and capped at 8. Evidence is reduced with
 * `compactEvidence`, and at most 8 cross-check commands are returned.
 *
 * @param result full triage result
 * @param args original CLI arguments, used to build the `--full` / `--raw` command hints
 */
export function compactProviderTriageResult(result: ProviderTriageResult, args: string[] = []): Record<string, unknown> {
  const rank: Record<ProviderSignalStatus, number> = { failed: 0, degraded: 1, unknown: 2, ok: 3 };
  const countByStatus = (status: ProviderSignalStatus): number => result.signals.filter((item) => item.status === status).length;
  const issueSignals = result.signals
    .filter((item) => item.status === "failed" || item.status === "degraded" || item.status === "unknown")
    .sort((left, right) => rank[left.status] - rank[right.status]);
  const signalLimit = issueSignals.length > 0 ? 8 : 0;
  const visibleSignals = issueSignals.slice(0, signalLimit);
  const okSignalCount = countByStatus("ok");
  const omittedIssueCount = Math.max(0, issueSignals.length - visibleSignals.length);
  return {
    ok: result.ok,
    providerId: result.providerId,
    decision: result.decision,
    scope: result.scope,
    retryable: result.retryable,
    blockingDisposition: result.blockingDisposition,
    observedAt: result.observedAt,
    failedScopes: result.failedScopes,
    degradedScopes: result.degradedScopes,
    healthyScopes: result.healthyScopes,
    failedIndependentScopes: result.failedIndependentScopes,
    healthyIndependentScopes: result.healthyIndependentScopes,
    rationale: result.rationale,
    signalCounts: {
      total: result.signals.length,
      returned: visibleSignals.length,
      limit: signalLimit,
      ok: okSignalCount,
      degraded: countByStatus("degraded"),
      failed: countByStatus("failed"),
      unknown: countByStatus("unknown"),
      // ok signals are never listed in compact output, so all of them are omitted.
      omittedOkSignals: okSignalCount,
      omittedIssueSignals: omittedIssueCount,
      omittedSignals: omittedIssueCount,
    },
    signals: visibleSignals.map((item) => ({
      id: item.id,
      scope: item.scope,
      status: item.status,
      independentPath: item.independentPath,
      observedAt: item.observedAt,
      summary: item.summary,
      evidenceSummary: compactEvidence(item.evidence),
    })),
    recommendedCrossChecks: result.recommendedCrossChecks.slice(0, 8),
    outputPolicy: {
      default: "compact-triage-summary",
      signalLimit,
      full: providerTriageCommand(result.providerId, args, "--full"),
      raw: providerTriageCommand(result.providerId, args, "--raw"),
      note: "Default output returns prioritized failed/degraded/unknown signals plus bounded evidence. Use --full or --raw only when complete evidence is required.",
    },
    contract: result.contract,
  };
}

/**
 * Lists the commands suggested for manually cross-checking a provider.
 * @param providerId provider id interpolated into the provider-specific commands
 */
export function providerTriageRecommendedCrossChecks(providerId: string): string[] {
  return [
    `${commandPrefix} provider triage ${providerId}`,
    `${commandPrefix} debug health`,
    `${commandPrefix} debug dispatch ${providerId} host.ssh --wait-ms 15000`,
    `trans ${providerId} argv true`,
    `${commandPrefix} artifact-registry health --provider-id ${providerId}`,
    `${commandPrefix} microservice health k3sctl-adapter`,
    `${commandPrefix} microservice health code-queue`,
    `${commandPrefix} codex tasks --view supervisor --limit 20`,
  ];
}

/**
 * Collects the distinct scopes of all signals whose status is in `statuses`, sorted.
 * @param signals signals to scan
 * @param statuses statuses to include
 * @param independentOnly when true (default), signals with `independentPath === false` are skipped
 */
function uniqueScopes(signals: ProviderTriageSignal[], statuses: ProviderSignalStatus[], independentOnly = true): ProviderSignalScope[] {
  const scopes = new Set<ProviderSignalScope>();
  for (const item of signals) {
    if (independentOnly && !item.independentPath) continue;
    if (statuses.includes(item.status)) scopes.add(item.scope);
  }
  return Array.from(scopes).sort();
}

/**
 * Picks the scope a triage verdict is attributed to, considering independent signals only.
 *
 * Failed scopes take precedence over degraded ones. Among failed scopes the first critical
 * scope wins, otherwise the first in sorted order. With no failed scope, the first degraded
 * scope is used. Returns "unknown" when nothing is failed or degraded.
 */
function primaryScope(signals: ProviderTriageSignal[]): ProviderSignalScope {
  const failed = uniqueScopes(signals, ["failed"]);
  if (failed.length > 0) return failed.find((scope) => criticalScopes.has(scope)) ?? failed[0] ?? "unknown";
  const degraded = uniqueScopes(signals, ["degraded"]);
  return degraded[0] ?? "unknown";
}

/**
 * Aggregates triage signals into a blocking disposition and a decision.
 *
 * Disposition rules are evaluated in order and the first match wins:
 * 1. an external-provider failure with no failed critical scope -> "external-provider-backoff"
 * 2. a runner-local failure with no other independent failure -> "runner-local-observation-gap"
 * 3. two or more failed critical scopes and no healthy independent scope -> "global-blocker"
 * 4. only service-scoped independent failures plus a healthy independent scope -> "service-degraded"
 * 5. any failed critical scope or degraded critical scope -> "provider-degraded" when a healthy
 *    independent scope exists, else "transient"
 * 6. any other failed or degraded scope -> "service-degraded"
 * 7. otherwise -> "transient"
 *
 * The decision is "global-offline" for a global blocker, "service-degraded" for that
 * disposition, "retryable-transient" when any signal is failed or degraded, else "healthy".
 * NOTE(review): signals with status "unknown" do not count as issues here, so a run where
 * every probe is "unknown" is reported as "healthy" — confirm this is intended.
 *
 * @param providerId provider under triage; used in cross-check commands and rationale text
 * @param signals collected signals
 * @param observedAt timestamp recorded on the classification (defaults to now)
 */
export function classifyProviderTriage(providerId: string, signals: ProviderTriageSignal[], observedAt = isoNow()): ProviderTriageClassification {
  const failedScopes = uniqueScopes(signals, ["failed"], false);
  const degradedScopes = uniqueScopes(signals, ["degraded"], false);
  const healthyScopes = uniqueScopes(signals, ["ok"]);
  const independentFailedScopes = uniqueScopes(signals, ["failed"]).filter((scope) => scope !== "runner-local");
  const independentDegradedScopes = uniqueScopes(signals, ["degraded"]);
  const failedCriticalScopes = independentFailedScopes.filter((scope) => criticalScopes.has(scope));
  // Lambda parameters are named `item` so they do not shadow the module-level signal() helper.
  const runnerLocalObservedFailure = signals.some((item) => item.scope === "runner-local" && item.status === "failed");
  const externalProviderObservedFailure = signals.some((item) => item.scope === "external-provider" && item.status === "failed");
  const serviceOnlyFailure = independentFailedScopes.length > 0 && independentFailedScopes.every((scope) => scope === "registry" || scope === "service-proxy" || scope === "microservice" || scope === "k3s" || scope === "external-provider");
  const hasIndependentHealthy = healthyScopes.length > 0;
  const rationale: string[] = [];
  let blockingDisposition: ProviderBlockingDisposition;

  if (externalProviderObservedFailure && failedCriticalScopes.length === 0) {
    blockingDisposition = "external-provider-backoff";
    rationale.push("external model provider 429/rate-limit should stay in Code Queue retry_wait with conservative backoff while scheduler heartbeat remains healthy");
  } else if (runnerLocalObservedFailure && independentFailedScopes.length === 0) {
    blockingDisposition = "runner-local-observation-gap";
    // Uses the provider actually under triage instead of a hard-coded provider id.
    rationale.push(`single runner-local provider offline observation is not sufficient evidence for global ${providerId} outage`);
  } else if (failedCriticalScopes.length >= 2 && healthyScopes.length === 0) {
    blockingDisposition = "global-blocker";
    rationale.push("multiple independent critical provider paths failed and no independent healthy path was observed");
  } else if (serviceOnlyFailure && hasIndependentHealthy) {
    blockingDisposition = "service-degraded";
    rationale.push("service-scoped path failed while at least one provider-level path remains healthy");
  } else if (failedCriticalScopes.length > 0 || independentDegradedScopes.some((scope) => criticalScopes.has(scope))) {
    blockingDisposition = hasIndependentHealthy ? "provider-degraded" : "transient";
    rationale.push(hasIndependentHealthy
      ? "provider-critical path is degraded but cross-checks still show independent healthy evidence"
      : "critical path issue lacks enough independent failed evidence for global blocker");
  } else if (failedScopes.length > 0 || degradedScopes.length > 0) {
    blockingDisposition = "service-degraded";
    rationale.push("only non-provider-global service paths are failed or degraded");
  } else {
    blockingDisposition = "transient";
    rationale.push("no failed independent path was observed");
  }

  if (runnerLocalObservedFailure) rationale.push("runner-local observation failed but is not counted as an independent global blocker by contract");
  if (hasIndependentHealthy) rationale.push(`healthy independent scopes: ${healthyScopes.join(", ")}`);
  if (failedScopes.length > 0) rationale.push(`failed scopes: ${failedScopes.join(", ")}`);
  const hasAnyIssueSignal = signals.some((item) => item.status === "failed" || item.status === "degraded");
  let decision: ProviderTriageDecision;
  if (blockingDisposition === "global-blocker") decision = "global-offline";
  else if (blockingDisposition === "service-degraded") decision = "service-degraded";
  else decision = hasAnyIssueSignal ? "retryable-transient" : "healthy";

  const runnerLocalOnly = runnerLocalObservedFailure && independentFailedScopes.length === 0 && independentDegradedScopes.length === 0;
  return {
    scope: runnerLocalOnly ? "runner-local" : primaryScope(signals),
    decision,
    observedAt,
    retryable: blockingDisposition !== "global-blocker",
    recommendedCrossChecks: providerTriageRecommendedCrossChecks(providerId),
    blockingDisposition,
    rationale,
    failedScopes,
    degradedScopes,
    healthyScopes,
    failedIndependentScopes: independentFailedScopes,
    healthyIndependentScopes: healthyScopes,
  };
}

/**
 * Classifies the signals and assembles the full triage result.
 * `ok` is false only when the disposition is "global-blocker".
 *
 * @param providerId provider under triage
 * @param signals collected signals, returned verbatim on the result
 * @param observedAt timestamp recorded on the result (defaults to now)
 */
export function buildProviderTriageResult(providerId: string, signals: ProviderTriageSignal[], observedAt = isoNow()): ProviderTriageResult {
  const classification = classifyProviderTriage(providerId, signals, observedAt);
  return {
    ok: classification.blockingDisposition !== "global-blocker",
    providerId,
    ...classification,
    signals,
    contract: {
      singlePathProviderOfflineIsGlobalBlocker: false,
      globalBlockerRequiresIndependentCriticalFailures: true,
    },
  };
}

/**
 * Extracts the extra microservice ids requested on the command line.
 *
 * `--microservice <id>` and `--service <id>` each add one id. `--microservices <a,b,...>` adds
 * every non-empty, trimmed entry of a comma-separated list. Duplicates are removed and
 * first-seen order is kept.
 *
 * @throws Error when one of these options has no value or an empty value
 */
function parseServiceList(args: string[]): string[] {
  const services: string[] = [];
  for (let index = 0; index < args.length; index += 1) {
    const arg = args[index] ?? "";
    if (arg === "--microservice" || arg === "--service") {
      const value = args[index + 1];
      if (value === undefined || value.length === 0) throw new Error(`${arg} requires a service id`);
      services.push(value);
      index += 1; // skip the consumed value
    } else if (arg === "--microservices") {
      const value = args[index + 1];
      if (value === undefined || value.length === 0) throw new Error(`${arg} requires a comma-separated service list`);
      services.push(...value.split(",").map((item) => item.trim()).filter(Boolean));
      index += 1; // skip the consumed value
    }
  }
  return Array.from(new Set(services));
}

/**
 * Returns the token following the first occurrence of `name` in `args`.
 * @returns the value, or `undefined` when the option is absent
 * @throws Error when the option is present but has no value or an empty value
 */
function optionValue(args: string[], name: string): string | undefined {
  const index = args.indexOf(name);
  if (index === -1) return undefined;
  const raw = args[index + 1];
  if (raw === undefined || raw.length === 0) throw new Error(`${name} requires a non-empty value`);
  return raw;
}

/**
 * Validates the option tokens of a provider triage invocation.
 *
 * Tokens that do not start with `--` are ignored. `--full` and `--raw` are accepted as flags.
 * Value-taking options must be followed by a token that does not start with `--`.
 *
 * @throws Error on an unsupported `--option` or on a value-taking option without a value
 */
function assertKnownOptions(args: string[]): void {
  const flags = new Set(["--full", "--raw"]);
  const valueOptions = new Set(["--observed-error", "--observed-scope", "--microservice", "--service", "--microservices"]);
  for (let index = 0; index < args.length; index += 1) {
    const arg = args[index] ?? "";
    if (!arg.startsWith("--") || flags.has(arg)) continue;
    if (!valueOptions.has(arg)) throw new Error(`unsupported provider triage option: ${arg}`);
    const value = args[index + 1];
    if (value === undefined || value.startsWith("--")) throw new Error(`${arg} requires a value`);
    index += 1; // skip the consumed value so it is not re-checked as an option
  }
}

/** Every value accepted by `--observed-scope`; mirrors the ProviderSignalScope union. */
const observedScopeValues: readonly ProviderSignalScope[] = [
  "runner-local",
  "provider-gateway",
  "ssh",
  "registry",
  "k3s",
  "scheduler",
  "external-provider",
  "service-proxy",
  "microservice",
  "unknown",
];

/** Validates a raw `--observed-scope` value; throws on a value outside the known scopes. */
function parseObservedScope(raw: string | undefined): ProviderSignalScope | undefined {
  if (raw === undefined) return undefined;
  const matched = observedScopeValues.find((scope) => scope === raw);
  if (matched === undefined) throw new Error(`--observed-scope must be one of: ${observedScopeValues.join(", ")}`);
  return matched;
}

/** Runs one probe and turns any thrown or rejected error into a signal with `statusOnError`. */
async function probeToSignal(
  id: string,
  scope: ProviderSignalScope,
  statusOnError: ProviderSignalStatus,
  probe: () => Promise<ProviderTriageSignal>,
): Promise<ProviderTriageSignal> {
  try {
    return await probe();
  } catch (error) {
    return signal(id, scope, statusOnError, error instanceof Error ? error.message : String(error), { error: String(error) });
  }
}

/**
 * Runs the full provider triage and returns its result.
 *
 * Probes run sequentially in a fixed order: optional observed error, backend-core node view,
 * system status, host.ssh probe, artifact registry health, k3sctl-adapter health, code-queue
 * microservice health, Code Queue dev-ready, Code Queue supervisor tasks, then any extra
 * microservices requested via `--microservice` / `--service` / `--microservices`.
 * A failing probe becomes a signal rather than aborting the run. The debug-health call is the
 * exception: an error from it propagates to the caller.
 *
 * @param config CLI configuration passed through to the debug and Code Queue helpers
 * @param providerId provider to triage; must match `[A-Za-z0-9_.-]{1,64}`
 * @param args CLI options
 * @returns the full result with `--full` or `--raw`, otherwise the compact summary
 * @throws Error on an unsafe provider id, an unsupported option, a missing option value, or an
 *   unknown `--observed-scope`
 */
export async function runProviderTriage(config: UniDeskConfig, providerId: string, args: string[] = []): Promise<unknown> {
  if (!/^[A-Za-z0-9_.-]{1,64}$/u.test(providerId)) throw new Error("provider triage requires a safe provider id such as D601");
  assertKnownOptions(args);
  const observedAt = isoNow();
  const observedError = optionValue(args, "--observed-error");
  // Validated instead of cast so an arbitrary string cannot leak into the scope lists.
  const observedScope = parseObservedScope(optionValue(args, "--observed-scope"));
  // Parsed before any probe runs so a malformed service option fails fast.
  const extraServices = parseServiceList(args);

  const signals: ProviderTriageSignal[] = [];
  if (observedError !== undefined) signals.push(observedErrorSignal(observedError, observedScope ?? classifyErrorMessage(observedError)));

  const debug = await debugHealth(config);
  signals.push(providerGatewaySignal(debug, providerId));
  signals.push(systemStatusSignal(debug, providerId));

  signals.push(await probeToSignal("host-ssh-probe", "ssh", "failed", async () =>
    sshSignal(await debugDispatch(config, providerId, "host.ssh", { source: "provider-triage", mode: "probe", timeoutMs: 8000 }, 15_000), providerId)));

  signals.push(await probeToSignal("artifact-registry-health", "registry", "failed", async () =>
    registrySignal(await runArtifactRegistryCommand(["health", "--provider-id", providerId]))));

  // coreInternalFetch results are awaited so a promise is never passed to the signal builder
  // and a rejection is caught by the probe wrapper. NOTE(review): if coreInternalFetch is
  // synchronous the await is a no-op — confirm against ./microservices.
  signals.push(await probeToSignal("k3sctl-adapter-health", "k3s", "failed", async () =>
    microserviceHealthSignal("k3sctl-adapter", "k3s", await coreInternalFetch("/api/microservices/k3sctl-adapter/health"))));

  signals.push(await probeToSignal("code-queue-microservice-health", "scheduler", "failed", async () =>
    microserviceHealthSignal("code-queue", "scheduler", await coreInternalFetch("/api/microservices/code-queue/health"))));

  // Code Queue CLI errors are reported as "unknown" rather than "failed".
  signals.push(await probeToSignal("code-queue-health", "scheduler", "unknown", async () =>
    codeQueueSchedulerSignal(await runCodeQueueCommand(config, ["dev-ready"]))));

  signals.push(await probeToSignal("code-queue-task-heartbeat", "scheduler", "unknown", async () =>
    codeQueueTasksSignal(await runCodeQueueCommand(config, ["tasks", "--view", "supervisor", "--limit", "20"]))));

  for (const serviceId of extraServices) {
    signals.push(await probeToSignal(`${serviceId}-health`, "microservice", "failed", async () =>
      microserviceHealthSignal(serviceId, "microservice", await coreInternalFetch(`/api/microservices/${encodeURIComponent(serviceId)}/health`))));
  }

  const result = buildProviderTriageResult(providerId, signals, observedAt);
  return hasFlag(args, "--full") || hasFlag(args, "--raw") ? result : compactProviderTriageResult(result, args);
}