diff --git a/src/backend/codex-stdio.ts b/src/backend/codex-stdio.ts
index 8f4eca0..0abd9e7 100644
--- a/src/backend/codex-stdio.ts
+++ b/src/backend/codex-stdio.ts
@@ -911,7 +911,7 @@ function normalizeCodexNotification(message: JsonRecord, suppressed: SuppressedN
     const messageText = typeof error.message === "string" ? error.message : "Codex app-server error";
     const failureKind = classifyCodexErrorRecord(error, "backend-failed");
     const terminal = params.willRetry === true ? undefined : { status: "failed" as const, failureKind, message: redactText(messageText) };
-    return { events: [{ type: "error", payload: { failureKind, error: redactJson(error), willRetry: params.willRetry === true } }], ...(terminal ? { terminal } : {}) };
+    return { events: [{ type: "error", payload: { failureKind, message: redactText(messageText), error: redactJson(error), willRetry: params.willRetry === true, ...providerRetryDiagnostics(error, params) } }], ...(terminal ? { terminal } : {}) };
   }
   if (method === "turn/completed") {
     const turn = asRecordAt(params, "turn");
@@ -1435,6 +1435,7 @@ function notificationOtelAttributes(message: JsonRecord): JsonRecord {
       : method === "turn/completed" && turnStatus !== "completed"
         ? classifyCodexErrorRecord(Object.keys(error).length > 0 ? error : { message: turnStatus ?? "unknown" }, "backend-failed")
         : null;
+  const retryDiagnostics = method === "error" ? providerRetryDiagnostics(error, params) : {};
   return {
     method,
     itemId: stringAt(item, "id") ?? stringAt(params, "itemId"),
@@ -1444,11 +1445,123 @@ function notificationOtelAttributes(message: JsonRecord): JsonRecord {
     turnId: stringAt(turn, "id"),
     failureKind,
     willRetry: typeof params.willRetry === "boolean" ? params.willRetry : null,
+    ...retryDiagnostics,
     deltaChars: typeof params.delta === "string" ? params.delta.length : null,
     valuesPrinted: false,
   };
 }
 
+/**
+ * Derives retry and upstream diagnostics from a Codex `error` notification.
+ *
+ * Structured numeric fields are read directly; everything else is pattern-matched
+ * from `error.message` and `error.additionalDetails`. The free-text summary is
+ * passed through `redactText` before it is returned.
+ *
+ * @param error - The notification's error record.
+ * @param params - The notification params; consulted only for backoff fields.
+ * @returns A flat diagnostics record; fields that could not be determined are `null`.
+ */
+function providerRetryDiagnostics(error: JsonRecord, params: JsonRecord): JsonRecord {
+  const message = typeof error.message === "string" ? error.message : "";
+  const additionalDetails = typeof error.additionalDetails === "string" ? error.additionalDetails : "";
+  const text = `${message}\n${additionalDetails}`;
+  const retry = parseRetryCounter(text);
+  const responseStreamDisconnected = asRecordAt(asRecordAt(error, "codexErrorInfo"), "responseStreamDisconnected");
+  // Prefer the structured status code; only scrape the text when it is absent.
+  const upstreamHttpStatus = numberAt(responseStreamDisconnected, "httpStatusCode") ?? parseHttpStatus(text);
+  const retryBackoffMs = numberAt(error, "retryBackoffMs") ?? numberAt(error, "backoffMs") ?? numberAt(params, "retryBackoffMs") ?? numberAt(params, "backoffMs");
+  const providerErrorClass = providerErrorClassForText(text, upstreamHttpStatus);
+  return {
+    retryAttempt: retry?.attempt ?? null,
+    retryMax: retry?.max ?? null,
+    retryExhausted: retry ? retry.attempt >= retry.max : null,
+    retryBackoffMs,
+    upstreamHttpStatus,
+    upstreamHost: parseUrlHost(text),
+    providerErrorClass,
+    errorSummary: redactText(message || additionalDetails || providerErrorClass),
+    valuesPrinted: false,
+  };
+}
+
+/**
+ * Extracts an `attempt/max` counter such as "Reconnecting... 2/5" from error text.
+ *
+ * The counter must follow a reconnect/retry/attempt keyword within 24 non-digit
+ * characters, so unrelated `N/M` fragments (dates, fractions) are ignored.
+ *
+ * @param text - Error text to scan.
+ * @returns The counter, or `null` when none is found or it fails the range check.
+ */
+function parseRetryCounter(text: string): { attempt: number; max: number } | null {
+  const match = /(?:reconnect(?:ing)?|retr(?:y(?:ing)?|ies)|attempt)[^0-9]{0,24}(\d+)\s*\/\s*(\d+)/iu.exec(text);
+  if (!match) return null;
+  const attempt = Number(match[1]);
+  const max = Number(match[2]);
+  if (!Number.isFinite(attempt) || !Number.isFinite(max) || attempt < 0 || max <= 0) return null;
+  return { attempt: Math.floor(attempt), max: Math.floor(max) };
+}
+
+/**
+ * Extracts an HTTP status code (100-599) introduced by a `status` or `HTTP` keyword,
+ * e.g. "unexpected status 503", "HTTP/1.1 429" or "HTTP status server error (502 ...)".
+ *
+ * Bare three-digit numbers are deliberately not accepted: IP octets ("127.0.0.1")
+ * and OS error numbers ("os error 111") would otherwise be reported as statuses and
+ * mask the connection-level classification in `providerErrorClassForText`.
+ *
+ * @param text - Error text to scan.
+ * @returns The status code, or `null` when no keyword-anchored code is present.
+ */
+function parseHttpStatus(text: string): number | null {
+  const match = /\b(?:status(?:\s*code)?|http(?:\/\d(?:\.\d)?)?)\b[\s:=(]*(?:(?:client|server)\s+error\s*\(\s*)?([1-5]\d{2})\b(?!\.\d)/iu.exec(text);
+  if (!match) return null;
+  const status = Number(match[1]);
+  return Number.isFinite(status) ? status : null;
+}
+
+/**
+ * Extracts the lower-cased `host[:port]` of the first http(s) URL in the text.
+ *
+ * Userinfo (`user:password@`), query strings, fragments and trailing punctuation
+ * are excluded so that credentials embedded in a URL never reach telemetry.
+ *
+ * @param text - Error text to scan.
+ * @returns The host, or `null` when the text contains no http(s) URL.
+ */
+function parseUrlHost(text: string): string | null {
+  const match = /https?:\/\/(?:[^\s/?#]*@)?([^\s/?#@]+)/iu.exec(text);
+  if (!match) return null;
+  const host = match[1].toLowerCase().replace(/[.,;:!'"()<>{}]+$/u, "");
+  return host.length > 0 ? host : null;
+}
+
+/**
+ * Maps an upstream failure to a coarse provider error class.
+ *
+ * A known HTTP status takes precedence; otherwise the lower-cased text is matched
+ * against connection-refused, timeout, reset and disconnect/stream keywords, in that order.
+ *
+ * @param text - Error text to classify.
+ * @param httpStatus - Upstream HTTP status, or `null` when unknown.
+ * @returns One of the `provider-*` class names; `"provider-unavailable"` when nothing matches.
+ */
+function providerErrorClassForText(text: string, httpStatus: number | null): string {
+  const lower = text.toLowerCase();
+  if (httpStatus !== null) {
+    if (httpStatus >= 500) return "provider-http-5xx";
+    if (httpStatus === 429) return "provider-rate-limit";
+    if (httpStatus === 401 || httpStatus === 403) return "provider-auth";
+    return "provider-http-status";
+  }
+  if (/connection refused|econnrefused|os error 111/u.test(lower)) return "provider-connection-refused";
+  if (/timed out|timeout|etimedout/u.test(lower)) return "provider-timeout";
+  if (/reset|econnreset/u.test(lower)) return "provider-connection-reset";
+  if (/disconnect|stream/u.test(lower)) return "provider-stream-disconnected";
+  return "provider-unavailable";
+}
+
 function toolCallSummaryFromNotification(message: JsonRecord): JsonRecord | null {
   const method = typeof message.method === "string" ? message.method : "";
   if (method !== "item/started" && method !== "item/completed") return null;
@@ -1609,6 +1722,16 @@ function stringAt(value: JsonRecord, key: string): string | null {
   return typeof value[key] === "string" && String(value[key]).length > 0 ? String(value[key]) : null;
 }
 
+/**
+ * Reads `value[key]` as a finite number.
+ *
+ * @returns The number, or `null` when the property is missing, not a number, or not finite.
+ */
+function numberAt(value: JsonRecord, key: string): number | null {
+  const next = value[key];
+  return typeof next === "number" && Number.isFinite(next) ? next : null;
+}
+
 function shortHash(value: string): string {
   return createHash("sha256").update(value).digest("hex").slice(0, 12);
 }
diff --git a/src/mgr/server.ts b/src/mgr/server.ts
index a881853..494bf05 100644
--- a/src/mgr/server.ts
+++ b/src/mgr/server.ts
@@ -1024,6 +1024,14 @@ function emitRunEventOtelSpan(type: RunEvent["type"], payload: JsonRecord, run:
       threadId: stringJsonValue(payload.threadId),
       turnId: stringJsonValue(payload.turnId),
       willRetry: typeof payload.willRetry === "boolean" ? payload.willRetry : null,
+      retryAttempt: numberJsonValue(payload.retryAttempt),
+      retryMax: numberJsonValue(payload.retryMax),
+      retryExhausted: typeof payload.retryExhausted === "boolean" ? payload.retryExhausted : null,
+      retryBackoffMs: numberJsonValue(payload.retryBackoffMs),
+      upstreamHttpStatus: numberJsonValue(payload.upstreamHttpStatus),
+      upstreamHost: stringJsonValue(payload.upstreamHost),
+      providerErrorClass: stringJsonValue(payload.providerErrorClass),
+      errorSummary: boundedJsonString(payload.errorSummary, 300),
       message: boundedJsonString(payload.message, 300),
     },
   });