Files
pikasTech-unidesk/scripts/src/provider-triage.ts
T

659 lines
30 KiB
TypeScript

import { type UniDeskConfig } from "./config";
import { coreInternalFetch } from "./microservices";
import { debugDispatch, debugHealth } from "./debug";
import { runArtifactRegistryCommand } from "./artifact-registry";
import { runCodeQueueCommand } from "./code-queue";
import { classifyRunnerError } from "../../src/components/microservices/code-queue/src/runner-error-classifier";
/**
 * Subsystem or path that a triage signal is attributed to.
 *
 * Used as the `scope` of every ProviderTriageSignal and as the element type of
 * the failed / degraded / healthy scope lists in a classification.
 */
export type ProviderSignalScope =
| "runner-local"
| "provider-gateway"
| "ssh"
| "registry"
| "k3s"
| "scheduler"
| "external-provider"
| "service-proxy"
| "microservice"
| "unknown";
/** Health outcome recorded for a single triage signal. */
export type ProviderSignalStatus = "ok" | "degraded" | "failed" | "unknown";
/**
 * Blocking disposition selected by classifyProviderTriage.
 *
 * "global-blocker" is the only value that makes a classification non-retryable
 * and makes a built triage result report `ok: false`.
 */
export type ProviderBlockingDisposition =
| "transient"
| "runner-local-observation-gap"
| "external-provider-backoff"
| "provider-degraded"
| "service-degraded"
| "global-blocker";
/**
 * Coarse decision derived from the blocking disposition in classifyProviderTriage:
 * "global-blocker" maps to "global-offline", "service-degraded" maps to itself, and
 * every other disposition maps to "retryable-transient" when any failed or degraded
 * signal exists, otherwise "healthy".
 */
export type ProviderTriageDecision =
| "healthy"
| "retryable-transient"
| "service-degraded"
| "global-offline";
/** A single observation gathered during provider triage. */
export interface ProviderTriageSignal {
/** Identifier of the probe or observation that produced the signal (for example "host-ssh-probe"). */
id: string;
/** Subsystem or path the observation is attributed to. */
scope: ProviderSignalScope;
/** Health outcome of the observation. */
status: ProviderSignalStatus;
/** When false, the signal is skipped by independent-only scope aggregation (the default mode of uniqueScopes). */
independentPath: boolean;
/** ISO-8601 timestamp; the signal() factory fills it from isoNow(). */
observedAt: string;
/** One-line human-readable description of what was observed. */
summary: string;
/** Optional supporting data of arbitrary shape attached by the probe. */
evidence?: unknown;
}
/** Outcome of classifying a set of triage signals; produced by classifyProviderTriage. */
export interface ProviderTriageClassification {
/** Scope the classification is primarily attributed to. */
scope: ProviderSignalScope;
/** Coarse decision derived from `blockingDisposition` and the presence of failed/degraded signals. */
decision: ProviderTriageDecision;
/** Timestamp supplied by the caller of the classifier (defaults to the time of classification). */
observedAt: string;
/** False only when `blockingDisposition` is "global-blocker". */
retryable: boolean;
/** Suggested follow-up commands, as built by providerTriageRecommendedCrossChecks. */
recommendedCrossChecks: string[];
/** Disposition selected by the classifier's rule chain. */
blockingDisposition: ProviderBlockingDisposition;
/** Human-readable sentences explaining why the disposition was chosen. */
rationale: string[];
/** Sorted, de-duplicated scopes with at least one failed signal. */
failedScopes: ProviderSignalScope[];
/** Sorted, de-duplicated scopes with at least one degraded signal. */
degradedScopes: ProviderSignalScope[];
/** Sorted, de-duplicated scopes with at least one ok signal. */
healthyScopes: ProviderSignalScope[];
/** Failed scopes restricted to independent-path signals, with "runner-local" excluded. */
failedIndependentScopes: ProviderSignalScope[];
/** Healthy scopes restricted to independent-path signals. */
healthyIndependentScopes: ProviderSignalScope[];
}
/** Full triage result: the classification plus the raw signals it was derived from. */
export interface ProviderTriageResult extends ProviderTriageClassification {
/** False only when the blocking disposition is "global-blocker" (see buildProviderTriageResult). */
ok: boolean;
/** Provider the triage was run for. */
providerId: string;
/** Every signal that was fed into the classifier, in collection order. */
signals: ProviderTriageSignal[];
/** Fixed literal flags describing the classification contract; both values are constants in the type. */
contract: {
/** Always false: a single-path "provider offline" observation is not treated as a global blocker. */
singlePathProviderOfflineIsGlobalBlocker: false;
/** Always true: a global blocker requires failures on independent critical paths. */
globalBlockerRequiresIndependentCriticalFailures: true;
};
}
/** Loosely typed JSON object used when walking untyped responses. */
type JsonRecord = Record<string, unknown>;
/**
 * Scopes treated as provider-critical by the classifier: independent failures in these
 * scopes count toward the "global-blocker" disposition, and primaryScope prefers them
 * when several scopes failed.
 */
const criticalScopes = new Set<ProviderSignalScope>(["provider-gateway", "ssh", "scheduler", "k3s"]);
/** Command prefix used when rendering suggested CLI invocations. */
const commandPrefix = "bun scripts/cli.ts";
/**
 * Narrows an arbitrary value to a plain object record.
 *
 * @param value anything, typically a parsed JSON fragment.
 * @returns the same reference typed as a record, or `null` for primitives, `null`, `undefined`, and arrays.
 */
function asRecord(value: unknown): JsonRecord | null {
  if (value === null || typeof value !== "object") return null;
  if (Array.isArray(value)) return null;
  return value as JsonRecord;
}
/**
 * Returns the value itself when it is an array, otherwise a fresh empty array.
 *
 * @param value anything, typically a parsed JSON fragment.
 */
function asArray(value: unknown): unknown[] {
  if (!Array.isArray(value)) return [];
  return value;
}
/**
 * Returns the value when it is a string, otherwise the empty string.
 * Non-string values are never coerced.
 */
function text(value: unknown): string {
  if (typeof value === "string") return value;
  return "";
}
/**
 * Strict boolean check: true only for the literal `true`; truthy non-boolean values yield false.
 */
function bool(value: unknown): boolean {
  return typeof value === "boolean" && value;
}
/**
 * Reports whether `name` appears verbatim anywhere in the argument list.
 *
 * @param args raw CLI arguments.
 * @param name exact flag text to look for (for example "--full").
 */
function hasFlag(args: string[], name: string): boolean {
  return args.some((candidate) => candidate === name);
}
/**
 * Wraps a value in single quotes for use in a POSIX shell command line.
 * Each embedded single quote is rewritten as `'\''` (close quote, escaped quote, reopen quote).
 *
 * @param value raw text to quote; may be empty.
 */
function shellQuote(value: string): string {
  const escaped = value.split("'").join(`'\\''`);
  return `'${escaped}'`;
}
/**
 * Stringifies a value and lower-cases it; `null` and `undefined` become the empty string.
 */
function lower(value: unknown): string {
  const raw = value ?? "";
  return String(raw).toLowerCase();
}
/** Current wall-clock time rendered as an ISO-8601 UTC string. */
function isoNow(): string {
  const now = new Date();
  return now.toISOString();
}
/**
 * Factory for triage signals; stamps the signal with the current time.
 *
 * @param id identifier of the probe or observation.
 * @param scope subsystem the observation is attributed to.
 * @param status health outcome.
 * @param summary one-line description.
 * @param evidence optional supporting data.
 * @param independentPath whether the signal counts as an independent path (defaults to true).
 */
function signal(
  id: string,
  scope: ProviderSignalScope,
  status: ProviderSignalStatus,
  summary: string,
  evidence?: unknown,
  independentPath = true,
): ProviderTriageSignal {
  const observedAt = isoNow();
  const built: ProviderTriageSignal = { id, scope, status, independentPath, observedAt, summary, evidence };
  return built;
}
/**
 * True when the value is an object record whose `ok` property is exactly `true`.
 */
function isOkEnvelope(value: unknown): boolean {
  return asRecord(value)?.ok === true;
}
/**
 * Extracts the `body` property of an envelope as a record.
 *
 * @returns the body record, or `null` when the envelope or its body is not a plain object.
 */
function bodyOf(value: unknown): JsonRecord | null {
  const envelope = asRecord(value);
  return envelope === null ? null : asRecord(envelope.body);
}
/**
 * Finds the first object entry whose `providerId` strictly equals the requested id.
 *
 * @param items expected to be an array; anything else is treated as empty.
 * @param providerId provider id to match.
 * @returns the matching record, or `null` when none matches.
 */
function findByProvider(items: unknown, providerId: string): JsonRecord | null {
  for (const candidate of asArray(items)) {
    const entry = asRecord(candidate);
    if (entry !== null && entry.providerId === providerId) return entry;
  }
  return null;
}
/**
 * Builds the "backend-core-node" signal from the `nodesInternal` section of a debug-health payload.
 *
 * - `unknown` when no node entry matches the provider id.
 * - `ok` when the node status is "online" and `lastHeartbeat` is a non-empty string.
 * - `degraded` when online without a heartbeat, `failed` otherwise.
 *
 * @param debug debug-health payload (untyped).
 * @param providerId provider whose node entry is inspected.
 */
function providerGatewaySignal(debug: unknown, providerId: string): ProviderTriageSignal {
  const nodesInternal = asRecord(debug)?.nodesInternal;
  const node = findByProvider(bodyOf(nodesInternal)?.nodes, providerId);
  if (node === null) {
    return signal("backend-core-node", "provider-gateway", "unknown", `backend-core node view has no provider ${providerId}`, {
      nodesInternal,
    });
  }

  const labels = asRecord(node.labels) ?? {};
  // Missing labels are reported as explicit nulls in the evidence.
  const label = (key: string): unknown => labels[key] ?? null;

  const isOnline = node.status === "online";
  const heartbeatSeen = typeof node.lastHeartbeat === "string" && node.lastHeartbeat.length > 0;
  let status: ProviderSignalStatus = "failed";
  if (isOnline) status = heartbeatSeen ? "ok" : "degraded";

  const summary = `backend-core node status=${node.status ?? "unknown"} lastHeartbeat=${node.lastHeartbeat ?? "null"}`;
  const evidence = {
    providerId: node.providerId,
    name: node.name,
    status: node.status,
    connectedAt: node.connectedAt,
    lastHeartbeat: node.lastHeartbeat,
    providerGatewayVersion: label("providerGatewayVersion"),
    hostSshConfigured: label("hostSshConfigured"),
    hostSshKeyPresent: label("hostSshKeyPresent"),
    sshDataPool: {
      transport: label("providerGatewaySshDataTransport"),
      desired: label("providerGatewaySshDataPoolDesired"),
      ready: label("providerGatewaySshDataPoolReady"),
      claimed: label("providerGatewaySshDataPoolClaimed"),
    },
    egressProxy: {
      enabled: label("providerGatewayEgressProxy"),
      connected: label("providerGatewayEgressProxyConnected"),
      activeTunnels: label("providerGatewayEgressProxyActiveTunnels"),
      pendingTunnels: label("providerGatewayEgressProxyPendingTunnels"),
      staleTunnels: label("providerGatewayEgressProxyStaleTunnels"),
      oldestTunnelAgeMs: label("providerGatewayEgressProxyOldestTunnelAgeMs"),
    },
    capabilities: asArray(labels.unideskCapabilities).map(String),
  };
  return signal("backend-core-node", "provider-gateway", status, summary, evidence);
}
/**
 * Builds the "backend-core-system-status" signal from the `systemStatusInternal` section
 * of a debug-health payload.
 *
 * - `unknown` when there is no sample for the provider, or no `current` record and the sample is not stale.
 * - `degraded` when there is no `current` record and the sample is stale, or when `current.ok` is `false`.
 * - `ok` otherwise.
 *
 * @param debug debug-health payload (untyped).
 * @param providerId provider whose system status sample is inspected.
 */
function systemStatusSignal(debug: unknown, providerId: string): ProviderTriageSignal {
  const samples = bodyOf(asRecord(debug)?.systemStatusInternal)?.systemStatuses;
  const sample = findByProvider(samples, providerId);
  if (sample === null) {
    return signal("backend-core-system-status", "provider-gateway", "unknown", `no system status sample for ${providerId}`);
  }

  const current = asRecord(sample.current);
  const currentOk = current === null ? null : current.ok;
  const stale = sample.stale === true;

  let status: ProviderSignalStatus;
  if (current === null) {
    status = stale ? "degraded" : "unknown";
  } else {
    status = currentOk === false ? "degraded" : "ok";
  }

  const summary = `system status current.ok=${String(currentOk)} stale=${String(stale)} updatedAt=${sample.updatedAt ?? "null"}`;
  // Only a bounded subset of the current sample is carried into the evidence.
  const currentEvidence = current === null
    ? null
    : {
        ok: current.ok,
        collectedAt: current.collectedAt,
        cpu: current.cpu,
        memory: current.memory,
        disk: current.disk,
      };
  return signal("backend-core-system-status", "provider-gateway", status, summary, {
    providerId: sample.providerId,
    nodeStatus: sample.nodeStatus,
    updatedAt: sample.updatedAt,
    currentCollectedAt: sample.currentCollectedAt ?? null,
    stale,
    staleSeconds: sample.staleSeconds ?? null,
    current: currentEvidence,
    historyCount: sample.historyCount ?? null,
  });
}
/**
 * Converts the result of a `host.ssh` debug dispatch into the "host-ssh-probe" signal.
 *
 * - `ok` when the waited task has status "succeeded" and its exit code is `0` or absent.
 * - `unknown` when the dispatch was accepted (ok envelope carrying a task id) but the wait
 *   record reports `ok: false`, i.e. no terminal state was reached.
 * - `failed` in every other case.
 *
 * @param result raw dispatch result; read for its `dispatch` and `wait` records.
 * @param providerId provider the probe targeted; echoed into non-ok evidence.
 */
function sshSignal(result: unknown, providerId: string): ProviderTriageSignal {
  const record = asRecord(result);
  const dispatchBody = bodyOf(record?.dispatch);
  const dispatchOk = isOkEnvelope(record?.dispatch) && dispatchBody?.taskId !== undefined;
  const wait = asRecord(record?.wait);
  const task = asRecord(wait?.task);
  const taskResult = asRecord(task?.result);
  const taskStatus = text(task?.status);
  // A result record without an `exitCode` key yields undefined; normalise it to null so a
  // succeeded task is not misreported as failed merely because the key is missing. This
  // matches how a missing result record was already handled.
  const exitCode = taskResult?.exitCode ?? null;

  if (taskStatus === "succeeded" && (exitCode === 0 || exitCode === null)) {
    return signal("host-ssh-probe", "ssh", "ok", "host.ssh short probe succeeded", {
      taskId: dispatchBody?.taskId ?? null,
      taskStatus,
      exitCode,
      probeLine: taskResult?.probeLine ?? null,
      // Bound the evidence size; only the head of stdout is kept.
      stdoutPreview: text(taskResult?.stdout).slice(0, 500),
    });
  }

  if (dispatchOk && wait?.ok === false) {
    return signal("host-ssh-probe", "ssh", "unknown", "host.ssh dispatch accepted but wait did not reach terminal state", {
      providerId,
      taskId: dispatchBody?.taskId ?? null,
      wait,
    });
  }

  return signal("host-ssh-probe", "ssh", "failed", "host.ssh short probe failed", {
    providerId,
    result,
  });
}
/**
 * Builds the "artifact-registry-health" signal from an artifact registry health result.
 *
 * Status rules, evaluated in order:
 * 1. `ok` when `ok === true` and `healthy` is not `false`.
 * 2. `degraded` when the `containerRunning`, `loopbackOnly`, and `v2Ok` checks are all `true`.
 * 3. `failed` when `ok === false`.
 * 4. `degraded` otherwise.
 *
 * A non-object result produces an `unknown` signal carrying the raw value as evidence.
 */
function registrySignal(result: unknown): ProviderTriageSignal {
  const health = asRecord(result);
  if (health === null) {
    return signal("artifact-registry-health", "registry", "unknown", "artifact registry health returned non-object", result);
  }

  const checks = asRecord(health.checks) ?? {};
  const runtimeApiHealthy = checks.containerRunning === true && checks.loopbackOnly === true && checks.v2Ok === true;

  let status: ProviderSignalStatus;
  if (health.ok === true && health.healthy !== false) {
    status = "ok";
  } else if (runtimeApiHealthy) {
    status = "degraded";
  } else if (health.ok === false) {
    status = "failed";
  } else {
    status = "degraded";
  }

  const summaryFields: Array<[string, unknown]> = [
    ["artifact registry health ok", health.ok],
    ["healthy", health.healthy],
    ["unitActive", checks.unitActive],
    ["containerRunning", checks.containerRunning],
    ["loopbackOnly", checks.loopbackOnly],
    ["v2Ok", checks.v2Ok],
  ];
  const summary = summaryFields.map(([label, value]) => `${label}=${String(value)}`).join(" ");

  return signal("artifact-registry-health", "registry", status, summary, {
    ok: health.ok,
    installed: health.installed ?? null,
    healthy: health.healthy ?? null,
    checks: health.checks ?? null,
    observed: health.observed ?? null,
    command: health.command ?? null,
  });
}
/**
 * Builds a `<serviceId>-health` signal from a microservice health response envelope.
 *
 * - `ok` when the envelope's `ok` is `true` and the body's `ok` is not `false`.
 * - `failed` when the envelope's `ok` is `false`.
 * - `degraded` otherwise (including a non-object response).
 *
 * @param serviceId microservice identifier used in the signal id and summary.
 * @param scope scope to attribute the signal to.
 * @param response health response envelope (untyped).
 */
function microserviceHealthSignal(serviceId: string, scope: ProviderSignalScope, response: unknown): ProviderTriageSignal {
  const envelope = asRecord(response);
  const body = bodyOf(response);
  const upstreamOk = envelope?.ok;
  const upstreamStatus = envelope?.status ?? null;

  let status: ProviderSignalStatus = "degraded";
  if (upstreamOk === true && body?.ok !== false) {
    status = "ok";
  } else if (upstreamOk === false) {
    status = "failed";
  }

  const summary = `${serviceId} health upstream ok=${String(upstreamOk)} status=${String(upstreamStatus)} body.ok=${String(body?.ok)}`;
  return signal(`${serviceId}-health`, scope, status, summary, {
    upstream: { ok: upstreamOk ?? null, status: upstreamStatus },
    body,
    fallback: {
      exitCode: envelope?.exitCode ?? null,
      stderrTail: envelope?.stderrTail ?? null,
      stdoutTail: envelope?.stdoutTail ?? null,
    },
  });
}
/**
 * Builds the "code-queue-health" signal from a Code Queue dev-ready response.
 *
 * - `degraded` when `devReady.ok` is `false`.
 * - `ok` when an `upstream` property is present and `devReady.ok` is not `false`.
 * - `unknown` otherwise, including a non-object response.
 */
function codeQueueSchedulerSignal(response: unknown): ProviderTriageSignal {
  const payload = asRecord(response);
  if (payload === null) {
    return signal("code-queue-health", "scheduler", "unknown", "Code Queue health returned non-object", response);
  }

  const devReady = asRecord(payload.devReady);
  let status: ProviderSignalStatus;
  if (devReady?.ok === false) {
    status = "degraded";
  } else if (payload.upstream !== undefined) {
    status = "ok";
  } else {
    status = "unknown";
  }

  const summary = `code-queue dev-ready ok=${String(devReady?.ok)} missingTools=${JSON.stringify(devReady?.missingTools ?? [])}`;
  return signal("code-queue-health", "scheduler", status, summary, {
    upstream: payload.upstream ?? null,
    devReady,
    commands: payload.commands ?? null,
  });
}
/**
 * Builds the "code-queue-task-heartbeat" signal from the supervisor view of Code Queue tasks.
 *
 * Reads `supervisor.executionDiagnostics.effectiveLiveness`:
 * "healthy" or "live" map to `ok`, "at-risk" maps to `degraded`, anything else to `unknown`.
 * Missing diagnostics yield an `unknown` signal carrying the raw response as evidence.
 */
function codeQueueTasksSignal(response: unknown): ProviderTriageSignal {
  const supervisor = asRecord(asRecord(response)?.supervisor);
  const diagnostics = asRecord(supervisor?.executionDiagnostics);
  if (diagnostics === null) {
    return signal("code-queue-task-heartbeat", "scheduler", "unknown", "Code Queue task heartbeat diagnostics unavailable", response);
  }

  const liveness = text(diagnostics.effectiveLiveness);
  let status: ProviderSignalStatus;
  switch (liveness) {
    case "healthy":
    case "live":
      status = "ok";
      break;
    case "at-risk":
      status = "degraded";
      break;
    default:
      status = "unknown";
  }

  const summary = `Code Queue executionDiagnostics effectiveLiveness=${liveness || "unknown"}`;
  return signal("code-queue-task-heartbeat", "scheduler", status, summary, {
    executionDiagnostics: diagnostics,
    commands: asRecord(supervisor?.commands),
  });
}
/**
 * Maps an observed error message to a triage scope.
 *
 * The runner error classifier is consulted first; its scope is used directly when it is one
 * of the scopes listed below. Otherwise the lower-cased message is matched against keyword
 * patterns in a fixed order, and the first match wins. Falls back to "unknown".
 *
 * @param message raw error text.
 */
function classifyErrorMessage(message: string): ProviderSignalScope {
  const runnerScopes: readonly ProviderSignalScope[] = [
    "external-provider",
    "provider-gateway",
    "registry",
    "k3s",
    "scheduler",
    "runner-local",
  ];
  const runnerScope = classifyRunnerError(message).scope;
  const direct = runnerScopes.find((candidate) => candidate === runnerScope);
  if (direct !== undefined) return direct;

  // Order matters: earlier patterns take precedence over later ones.
  const keywordRules: ReadonlyArray<readonly [RegExp, ProviderSignalScope]> = [
    [/provider is not online|provider .*offline|provider .*not online/u, "runner-local"],
    [/ssh|host\.ssh/u, "ssh"],
    [/registry|artifact/u, "registry"],
    [/k3s|kubectl|kubernetes/u, "k3s"],
    [/scheduler|code queue|codex/u, "scheduler"],
    [/proxy|tunnel|microservice\.http/u, "service-proxy"],
    [/microservice|service health/u, "microservice"],
  ];
  const haystack = message.toLowerCase();
  for (const [pattern, scope] of keywordRules) {
    if (pattern.test(haystack)) return scope;
  }
  return "unknown";
}
/**
 * Wraps a caller-supplied error message as a failed "observed-error" signal.
 * The signal is marked as a non-independent path when the scope is "runner-local".
 *
 * @param message observed error text, used as both summary and evidence.
 * @param scope scope the error is attributed to.
 */
function observedErrorSignal(message: string, scope: ProviderSignalScope): ProviderTriageSignal {
  const evidence = { message, runnerErrorClassification: classifyRunnerError(message) };
  const countsAsIndependent = scope !== "runner-local";
  return signal("observed-error", scope, "failed", message, evidence, countsAsIndependent);
}
/**
 * Summarises a list as de-duplicated, non-empty strings capped at `limit` entries.
 *
 * @param value expected to be an array; anything else is treated as empty.
 * @param limit maximum number of items returned (defaults to 6).
 * @returns `items` (first `limit` unique strings), the total unique `count`, whether the list
 *          was `truncated`, and how many entries were `omitted`.
 */
function compactStringList(value: unknown, limit = 6): Record<string, unknown> {
  const unique = new Set<string>();
  for (const entry of asArray(value)) {
    const label = String(entry ?? "");
    if (label) unique.add(label);
  }
  const all = [...unique];
  const overflow = all.length - limit;
  return {
    items: all.slice(0, limit),
    count: all.length,
    truncated: overflow > 0,
    omitted: Math.max(0, overflow),
  };
}
/**
 * Reduces a signal's evidence to a bounded, fixed-shape summary.
 *
 * Non-object evidence is returned unchanged. For objects, selected fields are read from the
 * record itself and, for some fields, from its `body` record as a fallback; list-valued
 * diagnostics are compacted with compactStringList.
 */
function compactEvidence(value: unknown): unknown {
  const source = asRecord(value);
  if (source === null) return value;

  const body = asRecord(source.body);
  const devReady = asRecord(source.devReady) ?? asRecord(body?.devReady);
  const diagnostics = asRecord(source.executionDiagnostics)
    ?? asRecord(body?.executionDiagnostics)
    ?? asRecord(source.diagnostics);

  // When the evidence has no explicit upstream but does carry a body, synthesise one from ok/status.
  const derivedUpstream = body === null ? null : { ok: source.ok ?? null, status: source.status ?? null };

  const devReadySummary = devReady === null
    ? null
    : {
        ok: devReady.ok ?? null,
        missingTools: compactStringList(devReady.missingTools),
      };
  const diagnosticsSummary = diagnostics === null
    ? null
    : {
        state: diagnostics.state ?? null,
        effectiveLiveness: diagnostics.effectiveLiveness ?? null,
        recommendedAction: diagnostics.recommendedAction ?? null,
        splitBrainLive: diagnostics.splitBrainLive ?? null,
        heartbeatFreshTaskIds: compactStringList(diagnostics.heartbeatFreshTaskIds),
        heartbeatRiskTaskIds: compactStringList(diagnostics.heartbeatRiskTaskIds),
      };

  return {
    upstream: source.upstream ?? derivedUpstream,
    status: source.status ?? body?.status ?? null,
    ok: source.ok ?? body?.ok ?? null,
    serviceId: source.serviceId ?? body?.serviceId ?? null,
    providerGatewayVersion: source.providerGatewayVersion ?? null,
    hostSshConfigured: source.hostSshConfigured ?? null,
    taskId: source.taskId ?? null,
    taskStatus: source.taskStatus ?? null,
    exitCode: source.exitCode ?? null,
    devReady: devReadySummary,
    executionDiagnostics: diagnosticsSummary,
    fallback: source.fallback === undefined ? null : source.fallback,
    error: source.error ?? null,
  };
}
/**
 * Renders a copy-pasteable `provider triage` command line that repeats the caller's options
 * with the given output mode appended.
 *
 * Existing `--full` / `--raw` flags and empty arguments are dropped. Values of known value
 * options are always single-quoted. A value option with no following value is omitted.
 * The provider id and any other pass-through argument are quoted only when they contain
 * characters outside a conservative shell-safe set, so ordinary ids and flags render
 * unchanged while unusual input cannot alter the meaning of the rendered command.
 *
 * @param providerId provider the command targets.
 * @param args original CLI arguments.
 * @param mode output mode flag to append.
 */
function providerTriageCommand(providerId: string, args: string[], mode: "--full" | "--raw"): string {
  const valueOptions = new Set(["--observed-error", "--observed-scope", "--microservice", "--service", "--microservices"]);
  // Tokens made only of these characters need no quoting in a POSIX shell argument position.
  const shellSafe = /^[A-Za-z0-9_.,:=@%+\/-]+$/u;
  const quoteIfNeeded = (token: string): string => (shellSafe.test(token) ? token : shellQuote(token));

  const kept: string[] = [];
  for (let index = 0; index < args.length; index += 1) {
    const arg = args[index] ?? "";
    if (arg === "" || arg === "--full" || arg === "--raw") continue;
    if (valueOptions.has(arg)) {
      const value = args[index + 1];
      if (value !== undefined) {
        kept.push(arg, shellQuote(value));
        index += 1; // skip the value that was just consumed
      }
      continue;
    }
    kept.push(quoteIfNeeded(arg));
  }
  return [`${commandPrefix} provider triage ${quoteIfNeeded(providerId)}`, ...kept, mode].join(" ");
}
/**
 * Produces the default compact view of a triage result.
 *
 * Only failed, degraded, and unknown signals are returned, ordered failed → degraded → unknown
 * (original order preserved within a status) and capped at 8. Ok signals are counted but never
 * listed. Each returned signal carries a bounded evidence summary instead of the raw evidence.
 *
 * @param result full triage result.
 * @param args original CLI arguments, used to render the `--full` / `--raw` follow-up commands.
 */
export function compactProviderTriageResult(result: ProviderTriageResult, args: string[] = []): Record<string, unknown> {
  const severity: Record<ProviderSignalStatus, number> = { failed: 0, degraded: 1, unknown: 2, ok: 3 };
  const countOf = (status: ProviderSignalStatus): number =>
    result.signals.filter((item) => item.status === status).length;
  const isIssue = (item: ProviderTriageSignal): boolean =>
    item.status === "failed" || item.status === "degraded" || item.status === "unknown";

  // filter() yields a fresh array, so sorting it leaves result.signals untouched.
  const issues = result.signals.filter(isIssue);
  issues.sort((left, right) => severity[left.status] - severity[right.status]);

  const signalLimit = issues.length > 0 ? 8 : 0;
  const visible = issues.slice(0, signalLimit);
  const hiddenIssues = issues.length - visible.length;
  const okTotal = countOf("ok");

  return {
    ok: result.ok,
    providerId: result.providerId,
    decision: result.decision,
    scope: result.scope,
    retryable: result.retryable,
    blockingDisposition: result.blockingDisposition,
    observedAt: result.observedAt,
    failedScopes: result.failedScopes,
    degradedScopes: result.degradedScopes,
    healthyScopes: result.healthyScopes,
    failedIndependentScopes: result.failedIndependentScopes,
    healthyIndependentScopes: result.healthyIndependentScopes,
    rationale: result.rationale,
    signalCounts: {
      total: result.signals.length,
      returned: visible.length,
      limit: signalLimit,
      ok: okTotal,
      degraded: countOf("degraded"),
      failed: countOf("failed"),
      unknown: countOf("unknown"),
      // Ok signals are never listed, so every one of them is omitted.
      omittedOkSignals: okTotal,
      omittedIssueSignals: hiddenIssues,
      omittedSignals: hiddenIssues,
    },
    signals: visible.map((item) => ({
      id: item.id,
      scope: item.scope,
      status: item.status,
      independentPath: item.independentPath,
      observedAt: item.observedAt,
      summary: item.summary,
      evidenceSummary: compactEvidence(item.evidence),
    })),
    recommendedCrossChecks: result.recommendedCrossChecks.slice(0, 8),
    outputPolicy: {
      default: "compact-triage-summary",
      signalLimit,
      full: providerTriageCommand(result.providerId, args, "--full"),
      raw: providerTriageCommand(result.providerId, args, "--raw"),
      note: "Default output returns prioritized failed/degraded/unknown signals plus bounded evidence. Use --full or --raw only when complete evidence is required.",
    },
    contract: result.contract,
  };
}
/**
 * Lists the follow-up commands suggested for cross-checking a provider's state.
 *
 * @param providerId provider id interpolated into the provider-specific commands.
 * @returns eight command strings in a fixed order.
 */
export function providerTriageRecommendedCrossChecks(providerId: string): string[] {
  const cli = (tail: string): string => `${commandPrefix} ${tail}`;
  return [
    cli(`provider triage ${providerId}`),
    cli("debug health"),
    cli(`debug dispatch ${providerId} host.ssh --wait-ms 15000`),
    `trans ${providerId} argv true`,
    cli(`artifact-registry health --provider-id ${providerId}`),
    cli("microservice health k3sctl-adapter"),
    cli("microservice health code-queue"),
    cli("codex tasks --view supervisor --limit 20"),
  ];
}
/**
 * Collects the distinct scopes of signals whose status is in `statuses`, sorted with the
 * default string ordering.
 *
 * @param signals signals to scan.
 * @param statuses statuses to include.
 * @param independentOnly when true (the default), signals with `independentPath === false` are ignored.
 */
function uniqueScopes(signals: ProviderTriageSignal[], statuses: ProviderSignalStatus[], independentOnly = true): ProviderSignalScope[] {
  const collected = new Set<ProviderSignalScope>();
  for (const item of signals) {
    if (independentOnly && !item.independentPath) continue;
    if (!statuses.includes(item.status)) continue;
    collected.add(item.scope);
  }
  return [...collected].sort();
}
/**
 * Picks the scope a classification is primarily attributed to, using independent-path signals only.
 *
 * Failed scopes take priority: the first critical failed scope if any, otherwise the first
 * failed scope. With no failures, the first degraded scope is used. "First" refers to the
 * sorted order produced by uniqueScopes. Returns "unknown" when nothing failed or degraded.
 */
function primaryScope(signals: ProviderTriageSignal[]): ProviderSignalScope {
  const failed = uniqueScopes(signals, ["failed"]);
  if (failed.length > 0) {
    const criticalFailure = failed.find((scope) => criticalScopes.has(scope));
    return criticalFailure ?? failed[0] ?? "unknown";
  }
  const degraded = uniqueScopes(signals, ["degraded"]);
  return degraded[0] ?? "unknown";
}
/**
 * Classifies a set of triage signals into a blocking disposition and decision.
 *
 * Disposition rules, evaluated in order (first match wins):
 * 1. "external-provider-backoff": an external-provider signal failed and no independent critical scope failed.
 * 2. "runner-local-observation-gap": a runner-local signal failed and no independent scope failed.
 * 3. "global-blocker": at least two independent critical scopes failed and no independent scope is healthy.
 * 4. "service-degraded": every independent failure is service-scoped and some independent scope is healthy.
 * 5. "provider-degraded" / "transient": a critical scope failed or is degraded; "provider-degraded"
 *    when some independent scope is healthy, "transient" otherwise.
 * 6. "service-degraded": anything else failed or degraded.
 * 7. "transient": nothing failed or degraded.
 *
 * `healthyScopes` covers every ok signal, matching how `failedScopes` and `degradedScopes` are
 * computed, while `healthyIndependentScopes` is restricted to independent-path signals. All
 * decision logic uses the independent list.
 *
 * @param providerId provider id, used only to render the recommended cross-check commands.
 * @param signals signals to classify.
 * @param observedAt timestamp recorded on the classification; defaults to the current time.
 */
export function classifyProviderTriage(providerId: string, signals: ProviderTriageSignal[], observedAt = isoNow()): ProviderTriageClassification {
  const failedScopes = uniqueScopes(signals, ["failed"], false);
  const degradedScopes = uniqueScopes(signals, ["degraded"], false);
  const healthyScopes = uniqueScopes(signals, ["ok"], false);
  const healthyIndependentScopes = uniqueScopes(signals, ["ok"]);
  // A runner-local failure never counts as independent failed evidence.
  const independentFailedScopes = uniqueScopes(signals, ["failed"]).filter((scope) => scope !== "runner-local");
  const independentDegradedScopes = uniqueScopes(signals, ["degraded"]);
  const failedCriticalScopes = independentFailedScopes.filter((scope) => criticalScopes.has(scope));
  const runnerLocalObservedFailure = signals.some((item) => item.scope === "runner-local" && item.status === "failed");
  const externalProviderObservedFailure = signals.some((item) => item.scope === "external-provider" && item.status === "failed");
  const serviceOnlyFailure = independentFailedScopes.length > 0 && independentFailedScopes.every((scope) => scope === "registry" || scope === "service-proxy" || scope === "microservice" || scope === "k3s" || scope === "external-provider");
  const hasIndependentHealthy = healthyIndependentScopes.length > 0;

  const rationale: string[] = [];
  let blockingDisposition: ProviderBlockingDisposition;
  if (externalProviderObservedFailure && failedCriticalScopes.length === 0) {
    blockingDisposition = "external-provider-backoff";
    rationale.push("external model provider 429/rate-limit should stay in Code Queue retry_wait with conservative backoff while scheduler heartbeat remains healthy");
  } else if (runnerLocalObservedFailure && independentFailedScopes.length === 0) {
    blockingDisposition = "runner-local-observation-gap";
    rationale.push("single runner-local provider offline observation is not sufficient evidence for global D601 outage");
  } else if (failedCriticalScopes.length >= 2 && !hasIndependentHealthy) {
    blockingDisposition = "global-blocker";
    rationale.push("multiple independent critical provider paths failed and no independent healthy path was observed");
  } else if (serviceOnlyFailure && hasIndependentHealthy) {
    blockingDisposition = "service-degraded";
    rationale.push("service-scoped path failed while at least one provider-level path remains healthy");
  } else if (failedCriticalScopes.length > 0 || independentDegradedScopes.some((scope) => criticalScopes.has(scope))) {
    blockingDisposition = hasIndependentHealthy ? "provider-degraded" : "transient";
    rationale.push(hasIndependentHealthy
      ? "provider-critical path is degraded but cross-checks still show independent healthy evidence"
      : "critical path issue lacks enough independent failed evidence for global blocker");
  } else if (failedScopes.length > 0 || degradedScopes.length > 0) {
    blockingDisposition = "service-degraded";
    rationale.push("only non-provider-global service paths are failed or degraded");
  } else {
    blockingDisposition = "transient";
    rationale.push("no failed independent path was observed");
  }

  if (runnerLocalObservedFailure) rationale.push("runner-local observation failed but is not counted as an independent global blocker by contract");
  if (hasIndependentHealthy) rationale.push(`healthy independent scopes: ${healthyIndependentScopes.join(", ")}`);
  if (failedScopes.length > 0) rationale.push(`failed scopes: ${failedScopes.join(", ")}`);

  // "unknown" signals do not count as issues here; only failed or degraded ones do.
  const hasAnyIssueSignal = signals.some((item) => item.status === "failed" || item.status === "degraded");
  let decision: ProviderTriageDecision;
  if (blockingDisposition === "global-blocker") {
    decision = "global-offline";
  } else if (blockingDisposition === "service-degraded") {
    decision = "service-degraded";
  } else {
    decision = hasAnyIssueSignal ? "retryable-transient" : "healthy";
  }

  // When the only evidence is a runner-local failure, attribute the result to runner-local.
  const runnerLocalOnly = runnerLocalObservedFailure && independentFailedScopes.length === 0 && independentDegradedScopes.length === 0;
  return {
    scope: runnerLocalOnly ? "runner-local" : primaryScope(signals),
    decision,
    observedAt,
    retryable: blockingDisposition !== "global-blocker",
    recommendedCrossChecks: providerTriageRecommendedCrossChecks(providerId),
    blockingDisposition,
    rationale,
    failedScopes,
    degradedScopes,
    healthyScopes,
    failedIndependentScopes: independentFailedScopes,
    healthyIndependentScopes,
  };
}
/**
 * Classifies the signals and assembles the full triage result.
 *
 * `ok` is false only when the classification's blocking disposition is "global-blocker".
 *
 * @param providerId provider the triage was run for.
 * @param signals collected signals, included verbatim in the result.
 * @param observedAt timestamp forwarded to the classifier; defaults to the current time.
 */
export function buildProviderTriageResult(providerId: string, signals: ProviderTriageSignal[], observedAt = isoNow()): ProviderTriageResult {
  const classification = classifyProviderTriage(providerId, signals, observedAt);
  const isGlobalBlocker = classification.blockingDisposition === "global-blocker";
  return {
    ok: !isGlobalBlocker,
    providerId,
    ...classification,
    signals,
    contract: {
      singlePathProviderOfflineIsGlobalBlocker: false,
      globalBlockerRequiresIndependentCriticalFailures: true,
    },
  };
}
/**
 * Collects the service ids requested via `--microservice` / `--service` (one id each) and
 * `--microservices` (comma-separated list), de-duplicated in first-seen order.
 *
 * @param args raw CLI arguments.
 * @throws Error when one of these options is last or is followed by an empty string.
 */
function parseServiceList(args: string[]): string[] {
  const collected = new Set<string>();
  let cursor = 0;
  while (cursor < args.length) {
    const option = args[cursor] ?? "";
    const isSingle = option === "--microservice" || option === "--service";
    const isList = option === "--microservices";
    if (!isSingle && !isList) {
      cursor += 1;
      continue;
    }

    const operand = args[cursor + 1];
    if (operand === undefined || operand.length === 0) {
      throw new Error(isSingle ? `${option} requires a service id` : `${option} requires a comma-separated service list`);
    }

    if (isSingle) {
      collected.add(operand);
    } else {
      for (const piece of operand.split(",")) {
        const trimmed = piece.trim();
        if (trimmed) collected.add(trimmed);
      }
    }
    cursor += 2; // option plus its consumed value
  }
  return [...collected];
}
/**
 * Returns the argument that follows the first occurrence of `name`.
 *
 * @param args raw CLI arguments.
 * @param name option to look up (for example "--observed-error").
 * @returns the value, or `undefined` when the option is absent.
 * @throws Error when the option is present but has no following argument or an empty one.
 */
function optionValue(args: string[], name: string): string | undefined {
  const position = args.indexOf(name);
  if (position < 0) return undefined;
  const candidate = args[position + 1];
  if (candidate !== undefined && candidate.length > 0) return candidate;
  throw new Error(`${name} requires a non-empty value`);
}
/**
 * Validates the option syntax of a provider triage invocation.
 *
 * Arguments that do not start with `--` are ignored. `--full` and `--raw` are bare flags.
 * Every other `--` argument must be a known value option followed by a value that does not
 * itself start with `--`.
 *
 * @param args raw CLI arguments.
 * @throws Error for an unsupported option or a value option without a value.
 */
function assertKnownOptions(args: string[]): void {
  const bareFlags = ["--full", "--raw"];
  const optionsWithValue = ["--observed-error", "--observed-scope", "--microservice", "--service", "--microservices"];
  let cursor = 0;
  while (cursor < args.length) {
    const token = args[cursor] ?? "";
    cursor += 1;
    if (!token.startsWith("--") || bareFlags.includes(token)) continue;
    if (!optionsWithValue.includes(token)) throw new Error(`unsupported provider triage option: ${token}`);
    const operand = args[cursor];
    if (operand === undefined || operand.startsWith("--")) throw new Error(`${token} requires a value`);
    cursor += 1; // the value has been validated; skip over it
  }
}
/** Runtime list of the scopes accepted by `--observed-scope`; mirrors the ProviderSignalScope union. */
const triageObservedScopes: readonly ProviderSignalScope[] = [
  "runner-local",
  "provider-gateway",
  "ssh",
  "registry",
  "k3s",
  "scheduler",
  "external-provider",
  "service-proxy",
  "microservice",
  "unknown",
];

/**
 * Validates a raw `--observed-scope` value against the known scopes.
 *
 * @returns the scope, or `undefined` when the option was not supplied.
 * @throws Error when the value is not a known scope.
 */
function parseTriageObservedScope(raw: string | undefined): ProviderSignalScope | undefined {
  if (raw === undefined) return undefined;
  const scope = triageObservedScopes.find((candidate) => candidate === raw);
  if (scope === undefined) throw new Error(`--observed-scope must be one of: ${triageObservedScopes.join(", ")}`);
  return scope;
}

/**
 * Runs the provider triage probes and returns either the full result (`--full` / `--raw`)
 * or the compact summary (default).
 *
 * Probes run sequentially in a fixed order: backend-core node view and system status,
 * host.ssh probe, artifact registry health, k3sctl-adapter health, code-queue health,
 * Code Queue dev-ready, Code Queue supervisor tasks, then any extra services requested via
 * `--microservice` / `--service` / `--microservices`. Apart from the initial debug-health
 * call, a probe that throws is recorded as a signal instead of aborting the triage.
 *
 * @param config UniDesk configuration forwarded to the probes.
 * @param providerId provider to triage; must match `[A-Za-z0-9_.-]{1,64}`.
 * @param args CLI options.
 * @throws Error for an unsafe provider id, unsupported or malformed options, or an unknown
 *         `--observed-scope` value.
 */
export async function runProviderTriage(config: UniDeskConfig, providerId: string, args: string[] = []): Promise<unknown> {
  if (!/^[A-Za-z0-9_.-]{1,64}$/u.test(providerId)) throw new Error("provider triage requires a safe provider id such as D601");
  assertKnownOptions(args);
  const observedAt = isoNow();
  const signals: ProviderTriageSignal[] = [];

  // Records a probe that threw, using the error message as the signal summary.
  const recordProbeError = (id: string, scope: ProviderSignalScope, status: ProviderSignalStatus, error: unknown): void => {
    signals.push(signal(id, scope, status, error instanceof Error ? error.message : String(error), { error: String(error) }));
  };

  const observedError = optionValue(args, "--observed-error");
  // Validate instead of casting so an arbitrary string cannot flow into the classification as a scope.
  const observedScope = parseTriageObservedScope(optionValue(args, "--observed-scope"));
  if (observedError !== undefined) signals.push(observedErrorSignal(observedError, observedScope ?? classifyErrorMessage(observedError)));

  const debug = await debugHealth(config);
  signals.push(providerGatewaySignal(debug, providerId));
  signals.push(systemStatusSignal(debug, providerId));

  try {
    signals.push(sshSignal(await debugDispatch(config, providerId, "host.ssh", { source: "provider-triage", mode: "probe", timeoutMs: 8000 }, 15_000), providerId));
  } catch (error) {
    recordProbeError("host-ssh-probe", "ssh", "failed", error);
  }

  try {
    signals.push(registrySignal(await runArtifactRegistryCommand(["health", "--provider-id", providerId])));
  } catch (error) {
    recordProbeError("artifact-registry-health", "registry", "failed", error);
  }

  // NOTE(review): coreInternalFetch results are used without `await`, as in the original code.
  // This presumes it is synchronous — confirm; if it returns a Promise these three probes need `await`.
  try {
    signals.push(microserviceHealthSignal("k3sctl-adapter", "k3s", coreInternalFetch("/api/microservices/k3sctl-adapter/health")));
  } catch (error) {
    recordProbeError("k3sctl-adapter-health", "k3s", "failed", error);
  }

  try {
    signals.push(microserviceHealthSignal("code-queue", "scheduler", coreInternalFetch("/api/microservices/code-queue/health")));
  } catch (error) {
    recordProbeError("code-queue-microservice-health", "scheduler", "failed", error);
  }

  // Code Queue command failures are recorded as "unknown" rather than "failed".
  try {
    signals.push(codeQueueSchedulerSignal(await runCodeQueueCommand(config, ["dev-ready"])));
  } catch (error) {
    recordProbeError("code-queue-health", "scheduler", "unknown", error);
  }

  try {
    signals.push(codeQueueTasksSignal(await runCodeQueueCommand(config, ["tasks", "--view", "supervisor", "--limit", "20"])));
  } catch (error) {
    recordProbeError("code-queue-task-heartbeat", "scheduler", "unknown", error);
  }

  for (const serviceId of parseServiceList(args)) {
    try {
      signals.push(microserviceHealthSignal(serviceId, "microservice", coreInternalFetch(`/api/microservices/${encodeURIComponent(serviceId)}/health`)));
    } catch (error) {
      recordProbeError(`${serviceId}-health`, "microservice", "failed", error);
    }
  }

  const result = buildProviderTriageResult(providerId, signals, observedAt);
  return hasFlag(args, "--full") || hasFlag(args, "--raw") ? result : compactProviderTriageResult(result, args);
}