fix: expose provider retry diagnostics in otel (#215)
* fix: expose provider retry diagnostics in otel * fix: place retry diagnostics in notification attributes
This commit is contained in:
@@ -911,7 +911,7 @@ function normalizeCodexNotification(message: JsonRecord, suppressed: SuppressedN
|
||||
const messageText = typeof error.message === "string" ? error.message : "Codex app-server error";
|
||||
const failureKind = classifyCodexErrorRecord(error, "backend-failed");
|
||||
const terminal = params.willRetry === true ? undefined : { status: "failed" as const, failureKind, message: redactText(messageText) };
|
||||
return { events: [{ type: "error", payload: { failureKind, error: redactJson(error), willRetry: params.willRetry === true } }], ...(terminal ? { terminal } : {}) };
|
||||
return { events: [{ type: "error", payload: { failureKind, message: redactText(messageText), error: redactJson(error), willRetry: params.willRetry === true, ...providerRetryDiagnostics(error, params) } }], ...(terminal ? { terminal } : {}) };
|
||||
}
|
||||
if (method === "turn/completed") {
|
||||
const turn = asRecordAt(params, "turn");
|
||||
@@ -1435,6 +1435,7 @@ function notificationOtelAttributes(message: JsonRecord): JsonRecord {
|
||||
: method === "turn/completed" && turnStatus !== "completed"
|
||||
? classifyCodexErrorRecord(Object.keys(error).length > 0 ? error : { message: turnStatus ?? "unknown" }, "backend-failed")
|
||||
: null;
|
||||
const retryDiagnostics = method === "error" ? providerRetryDiagnostics(error, params) : {};
|
||||
return {
|
||||
method,
|
||||
itemId: stringAt(item, "id") ?? stringAt(params, "itemId"),
|
||||
@@ -1444,11 +1445,65 @@ function notificationOtelAttributes(message: JsonRecord): JsonRecord {
|
||||
turnId: stringAt(turn, "id"),
|
||||
failureKind,
|
||||
willRetry: typeof params.willRetry === "boolean" ? params.willRetry : null,
|
||||
...retryDiagnostics,
|
||||
deltaChars: typeof params.delta === "string" ? params.delta.length : null,
|
||||
valuesPrinted: false,
|
||||
};
|
||||
}
|
||||
|
||||
/**
 * Builds the flat set of retry/transport diagnostics for a provider error.
 *
 * `error.message` and `error.additionalDetails` are joined into one text blob
 * that is searched for a retry counter, an HTTP status and a URL host. Where a
 * structured value exists it wins over text parsing: the HTTP status is read
 * from `codexErrorInfo.responseStreamDisconnected.httpStatusCode` first, and
 * the backoff is only ever read from numeric fields on `error` or `params`.
 *
 * @param error - Error record carried by the notification.
 * @param params - Notification params; consulted only for backoff values.
 * @returns A record of diagnostic attributes; values that could not be
 *   determined are `null`.
 */
function providerRetryDiagnostics(error: JsonRecord, params: JsonRecord): JsonRecord {
  const primaryText = typeof error.message === "string" ? error.message : "";
  const detailText = typeof error.additionalDetails === "string" ? error.additionalDetails : "";
  const combined = `${primaryText}\n${detailText}`;

  const counter = parseRetryCounter(combined);

  const errorInfo = asRecordAt(error, "codexErrorInfo");
  const streamInfo = asRecordAt(errorInfo, "responseStreamDisconnected");
  const upstreamHttpStatus = numberAt(streamInfo, "httpStatusCode") ?? parseHttpStatus(combined);

  // Error-level keys take precedence over params-level keys.
  const retryBackoffMs =
    numberAt(error, "retryBackoffMs") ??
    numberAt(error, "backoffMs") ??
    numberAt(params, "retryBackoffMs") ??
    numberAt(params, "backoffMs");

  const providerErrorClass = providerErrorClassForText(combined, upstreamHttpStatus);

  let retryAttempt: number | null = null;
  let retryMax: number | null = null;
  let retryExhausted: boolean | null = null;
  if (counter) {
    retryAttempt = counter.attempt ?? null;
    retryMax = counter.max ?? null;
    retryExhausted = counter.attempt >= counter.max;
  }

  const upstreamHost = parseUrlHost(combined);
  // Summary falls back from message to details to the derived class label.
  const errorSummary = redactText(primaryText || detailText || providerErrorClass);

  return {
    retryAttempt,
    retryMax,
    retryExhausted,
    retryBackoffMs,
    upstreamHttpStatus,
    upstreamHost,
    providerErrorClass,
    errorSummary,
    valuesPrinted: false,
  };
}
|
||||
|
||||
/**
 * Extracts an `attempt/max` retry counter (e.g. "Reconnecting... 2/5") from
 * free-form error text.
 *
 * A counter that follows a reconnect/retry keyword (within 24 non-digit
 * characters) is preferred. Only when no such counter exists does the first
 * bare `N/M` pair in the text get used, so unrelated slashed numbers such as
 * a `2024/05/01` date no longer shadow a real "retrying 3/5" later in the
 * same message.
 *
 * @param text - Error text to scan.
 * @returns The parsed counter, or `null` when no counter is present, `max`
 *   is zero, or either number is too large to represent exactly.
 */
function parseRetryCounter(text: string): { attempt: number; max: number } | null {
  const anchored = /(?:reconnect(?:ing)?|retry(?:ing)?)[^0-9]{0,24}(\d+)\s*\/\s*(\d+)/iu.exec(text);
  // Fallback keeps matching messages that carry a counter without a keyword.
  const match = anchored ?? /(\d+)\s*\/\s*(\d+)/u.exec(text);
  if (!match) return null;

  const attempt = Number(match[1]);
  const max = Number(match[2]);
  // `\d+` guarantees non-negative integers; guard only against overflow and a zero maximum.
  if (!Number.isSafeInteger(attempt) || !Number.isSafeInteger(max) || max <= 0) return null;

  return { attempt, max };
}
|
||||
|
||||
/**
 * Extracts an HTTP status code (100-599) from free-form error text.
 *
 * A number introduced by an `http` / `status` / `status code` keyword (for
 * example "unexpected status 503", "HTTP/1.1 429", "status code: 500") is
 * preferred. Otherwise the first standalone three-digit number is used, but
 * numbers that are clearly something else are skipped: OS error codes
 * ("os error 111"), IPv4 octets, ports and path/counter segments (anything
 * glued to `.`, `:` or `/`). Without these guards a connection-refused
 * message would be reported as HTTP status 111.
 *
 * @param text - Error text to scan.
 * @returns The status code, or `null` when none is found.
 */
function parseHttpStatus(text: string): number | null {
  const anchored = /\b(?:http(?:\/\d(?:\.\d)?)?|status(?:\s+code)?)\s*[:=]?\s*([1-5][0-9]{2})\b/iu.exec(text);
  // Bare fallback: a three-digit token not attached to an address, port, path or OS errno.
  const match = anchored ?? /(?<![\w.:\/])(?<!os error )([1-5][0-9]{2})(?!\w|[.:\/]\d)/iu.exec(text);
  return match ? Number(match[1]) : null;
}
|
||||
|
||||
/**
 * Extracts the host (and port, when present) of the first http(s) URL found
 * in free-form error text.
 *
 * Only the host portion is returned: any `user:password@` userinfo is skipped
 * so credentials never reach telemetry, and the capture stops at the first
 * character that cannot belong to a host, so trailing punctuation such as the
 * `)` in "for url (https://api.example.com)" or a `?query` is not included.
 *
 * @param text - Error text to scan.
 * @returns The lower-cased `host` or `host:port`, or `null` when the text
 *   contains no http(s) URL with a recognisable host.
 */
function parseUrlHost(text: string): string | null {
  const match = /https?:\/\/(?:[^\s\/?#@]*@)?(\[[0-9a-f:.]+\]|[a-z0-9._-]+)(:\d+)?/iu.exec(text);
  if (!match) return null;

  // A sentence-ending period directly after the URL is not part of the host.
  const hostname = match[1].replace(/\.+$/u, "");
  if (hostname.length === 0) return null;

  const port = match[2] ?? "";
  return `${hostname}${port}`.toLowerCase();
}
|
||||
|
||||
/**
 * Maps a provider failure onto a coarse error-class label.
 *
 * When an HTTP status is known it alone decides the class. Otherwise the
 * lower-cased text is tested against keyword patterns in a fixed order and
 * the first hit wins; text matching nothing is "provider-unavailable".
 *
 * @param text - Error text to classify when no status is available.
 * @param httpStatus - Upstream HTTP status, or `null` if unknown.
 * @returns One of the `provider-*` class labels.
 */
function providerErrorClassForText(text: string, httpStatus: number | null): string {
  if (httpStatus !== null) {
    if (httpStatus >= 500) return "provider-http-5xx";
    if (httpStatus === 429) return "provider-rate-limit";
    if (httpStatus === 401 || httpStatus === 403) return "provider-auth";
    return "provider-http-status";
  }

  const haystack = text.toLowerCase();
  // Order matters: earlier, more specific rules take precedence over later ones.
  const rules: ReadonlyArray<readonly [RegExp, string]> = [
    [/connection refused|econnrefused|os error 111/u, "provider-connection-refused"],
    [/timed out|timeout|etimedout/u, "provider-timeout"],
    [/reset|econnreset/u, "provider-connection-reset"],
    [/disconnect|stream/u, "provider-stream-disconnected"],
  ];
  for (const [pattern, label] of rules) {
    if (pattern.test(haystack)) return label;
  }

  return "provider-unavailable";
}
|
||||
|
||||
function toolCallSummaryFromNotification(message: JsonRecord): JsonRecord | null {
|
||||
const method = typeof message.method === "string" ? message.method : "";
|
||||
if (method !== "item/started" && method !== "item/completed") return null;
|
||||
@@ -1609,6 +1664,11 @@ function stringAt(value: JsonRecord, key: string): string | null {
|
||||
return typeof value[key] === "string" && String(value[key]).length > 0 ? String(value[key]) : null;
|
||||
}
|
||||
|
||||
/**
 * Reads `value[key]` as a finite number.
 *
 * @param value - Record to read from.
 * @param key - Property name to look up.
 * @returns The property when it is a finite number; `null` for missing
 *   properties, non-numbers, `NaN` and infinities.
 */
function numberAt(value: JsonRecord, key: string): number | null {
  const candidate = value[key];
  if (typeof candidate !== "number") return null;
  return Number.isFinite(candidate) ? candidate : null;
}
|
||||
|
||||
/**
 * Produces a short, stable fingerprint of a string.
 *
 * @param value - Text to fingerprint.
 * @returns The first 12 hexadecimal characters of the SHA-256 digest of `value`.
 */
function shortHash(value: string): string {
  const hexDigest = createHash("sha256").update(value).digest("hex");
  return hexDigest.substring(0, 12);
}
|
||||
|
||||
@@ -1024,6 +1024,14 @@ function emitRunEventOtelSpan(type: RunEvent["type"], payload: JsonRecord, run:
|
||||
threadId: stringJsonValue(payload.threadId),
|
||||
turnId: stringJsonValue(payload.turnId),
|
||||
willRetry: typeof payload.willRetry === "boolean" ? payload.willRetry : null,
|
||||
retryAttempt: numberJsonValue(payload.retryAttempt),
|
||||
retryMax: numberJsonValue(payload.retryMax),
|
||||
retryExhausted: typeof payload.retryExhausted === "boolean" ? payload.retryExhausted : null,
|
||||
retryBackoffMs: numberJsonValue(payload.retryBackoffMs),
|
||||
upstreamHttpStatus: numberJsonValue(payload.upstreamHttpStatus),
|
||||
upstreamHost: stringJsonValue(payload.upstreamHost),
|
||||
providerErrorClass: stringJsonValue(payload.providerErrorClass),
|
||||
errorSummary: boundedJsonString(payload.errorSummary, 300),
|
||||
message: boundedJsonString(payload.message, 300),
|
||||
},
|
||||
});
|
||||
|
||||
Reference in New Issue
Block a user