From 01f8da23ed731d0c198b98c7037419007547bd06 Mon Sep 17 00:00:00 2001 From: Codex Date: Fri, 26 Jun 2026 16:04:58 +0000 Subject: [PATCH] fix(web-probe): recover observe startup after transport timeout --- config/hwlab-node-lanes.yaml | 5 + scripts/src/hwlab-node-lanes.ts | 29 +++++ .../hwlab-node-web-observe-runner-source.ts | 100 ++++++++++++++++-- scripts/src/hwlab-node/web-probe-observe.ts | 55 ++++++++-- 4 files changed, 173 insertions(+), 16 deletions(-) diff --git a/config/hwlab-node-lanes.yaml b/config/hwlab-node-lanes.yaml index 65b1dfb7..747e50a2 100644 --- a/config/hwlab-node-lanes.yaml +++ b/config/hwlab-node-lanes.yaml @@ -179,6 +179,11 @@ lanes: defaultOrigin: mode: public baseUrl: https://hwlab.pikapython.com + authLogin: + maxAttempts: 6 + requestTimeoutMs: 30000 + initialDelayMs: 500 + maxDelayMs: 10000 alertThresholds: sameOriginApiSlowMs: 10000 partialApiSlowMs: 10000 diff --git a/scripts/src/hwlab-node-lanes.ts b/scripts/src/hwlab-node-lanes.ts index 7cb27116..fc47a735 100644 --- a/scripts/src/hwlab-node-lanes.ts +++ b/scripts/src/hwlab-node-lanes.ts @@ -137,10 +137,18 @@ export type HwlabRuntimeWebProbeOriginSpec = HwlabRuntimeWebProbeServiceOriginSp export interface HwlabRuntimeWebProbeSpec { readonly browserProxyMode?: "auto" | "direct"; readonly defaultOrigin?: HwlabRuntimeWebProbeOriginSpec; + readonly authLogin?: HwlabRuntimeWebProbeAuthLoginSpec; readonly alertThresholds?: HwlabRuntimeWebProbeAlertThresholdsSpec; readonly projectManagement?: HwlabRuntimeWebProbeProjectManagementSpec; } +export interface HwlabRuntimeWebProbeAuthLoginSpec { + readonly maxAttempts: number; + readonly requestTimeoutMs: number; + readonly initialDelayMs: number; + readonly maxDelayMs: number; +} + export type HwlabRuntimeWebProbeSentinelConfigRefKey = "runtime" | "scenarios" | "promptSet" | "reportViews" | "publicExposure" | "cicd" | "secrets"; export const HWLAB_WEB_PROBE_SENTINEL_CONFIG_REF_KEYS = ["runtime", "scenarios", "promptSet", "reportViews", "publicExposure", "cicd", "secrets"] as const satisfies readonly HwlabRuntimeWebProbeSentinelConfigRefKey[]; @@ -447,6 +455,14 @@ function positiveNumberField(obj: Record, key: string, path: st return value; } +function boundedIntegerField(obj: Record, key: string, path: string, min: number, max: number): number { + const value = obj[key]; + if (typeof value !== "number" || !Number.isInteger(value) || value < min || value > max) { + throw new Error(`${path}.${key} must be an integer between ${min} and ${max}`); + } + return value; +} + function sortedRecordEntries(value: unknown, path: string): Array<[string, Record]> { return Object.entries(asRecord(value, path)).map(([key, item]) => [key, asRecord(item, `${path}.${key}`)]); } @@ -774,11 +790,24 @@ function webProbeConfig(value: unknown, path: string): HwlabRuntimeWebProbeSpec return { ...(browserProxyMode === undefined ? {} : { browserProxyMode }), ...(raw.defaultOrigin === undefined ? {} : { defaultOrigin: webProbeOriginConfig(raw.defaultOrigin, `${path}.defaultOrigin`) }), + ...(raw.authLogin === undefined ? {} : { authLogin: webProbeAuthLoginConfig(raw.authLogin, `${path}.authLogin`) }), ...(raw.alertThresholds === undefined ? {} : { alertThresholds: webProbeAlertThresholdsConfig(raw.alertThresholds, `${path}.alertThresholds`) }), ...(raw.projectManagement === undefined ? {} : { projectManagement: webProbeProjectManagementConfig(raw.projectManagement, `${path}.projectManagement`) }), }; } +function webProbeAuthLoginConfig(value: unknown, path: string): HwlabRuntimeWebProbeAuthLoginSpec { + const raw = asRecord(value, path); + const maxAttempts = positiveNumberField(raw, "maxAttempts", path); + if (!Number.isInteger(maxAttempts) || maxAttempts < 1 || maxAttempts > 20) throw new Error(`${path}.maxAttempts must be an integer between 1 and 20`); + return { + maxAttempts, + requestTimeoutMs: boundedIntegerField(raw, "requestTimeoutMs", path, 1000, 120000), + initialDelayMs: boundedIntegerField(raw, "initialDelayMs", path, 0, 60000), + maxDelayMs: boundedIntegerField(raw, "maxDelayMs", path, 0, 120000), + }; +} + function webProbeSentinelConfig(value: unknown, path: string): HwlabRuntimeWebProbeSentinelSpec { const raw = asRecord(value, path); const allowed = new Set(["enabled", "configRefs"]); diff --git a/scripts/src/hwlab-node-web-observe-runner-source.ts b/scripts/src/hwlab-node-web-observe-runner-source.ts index 2fd586ca..9039dd95 100644 --- a/scripts/src/hwlab-node-web-observe-runner-source.ts +++ b/scripts/src/hwlab-node-web-observe-runner-source.ts @@ -25,6 +25,10 @@ const maxSamples = positiveInteger(process.env.UNIDESK_WEB_OBSERVE_MAX_SAMPLES, const observerRefreshIntervalMs = positiveInteger(process.env.UNIDESK_WEB_OBSERVE_OBSERVER_REFRESH_INTERVAL_MS, 180000); const viewport = parseViewport(process.env.UNIDESK_WEB_OBSERVE_VIEWPORT || "1440x900"); const browserProxyMode = parseBrowserProxyMode(process.env.UNIDESK_WEB_OBSERVE_BROWSER_PROXY_MODE || "auto"); +const authLoginMaxAttempts = boundedInteger(process.env.UNIDESK_WEB_OBSERVE_AUTH_LOGIN_MAX_ATTEMPTS, 6, 1, 20); +const authLoginRequestTimeoutMs = boundedInteger(process.env.UNIDESK_WEB_OBSERVE_AUTH_LOGIN_REQUEST_TIMEOUT_MS, 30000, 1000, 120000); +const authLoginInitialDelayMs = boundedInteger(process.env.UNIDESK_WEB_OBSERVE_AUTH_LOGIN_INITIAL_DELAY_MS, 500, 0, 60000); +const authLoginMaxDelayMs = boundedInteger(process.env.UNIDESK_WEB_OBSERVE_AUTH_LOGIN_MAX_DELAY_MS, 10000, 0, 120000); const alertThresholds = parseAlertThresholds(process.env.UNIDESK_WEB_OBSERVE_ALERT_THRESHOLDS_JSON); const projectManagement = parseProjectManagementConfig(process.env.UNIDESK_WEB_OBSERVE_PROJECT_MANAGEMENT_JSON); const playwrightProxy = proxyConfigFromEnv(baseUrl); @@ -493,11 +497,9 @@ async function runControlCommand(command, fn) { async function authenticate(browserContext) { const loginUrl = new URL("/auth/login", baseUrl).toString(); const attempts = []; - const maxAttempts = 5; - const initialDelayMs = 250; - const maxDelayMs = 5000; + const maxAttempts = authLoginMaxAttempts; for (let attempt = 1; attempt <= maxAttempts; attempt += 1) { - const retryDelayMs = attempt < maxAttempts ? Math.min(maxDelayMs, initialDelayMs * (2 ** (attempt - 1))) : 0; + const retryDelayMs = authRetryDelayMs(attempt, maxAttempts); const retryLabel = attempt + "/" + maxAttempts; await writeHeartbeat({ status: terminalStatus, auth: { phase: "api-login", retryAttempt: attempt, retryMaxAttempts: maxAttempts, lastRetryLabel: retryLabel, retryDelayMs: 0, retryExhausted: false, valuesRedacted: true } }).catch(() => {}); try { @@ -510,6 +512,7 @@ async function authenticate(browserContext) { retryMaxAttempts: maxAttempts, retryLabel, retryDelayMs: retryable && attempt < maxAttempts ? retryDelayMs : 0, + requestTimeoutMs: authLoginRequestTimeoutMs, method: "api", status: response.status, statusText: response.statusText, @@ -547,6 +550,7 @@ async function authenticate(browserContext) { retryMaxAttempts: maxAttempts, retryLabel, retryDelayMs: retryable && attempt < maxAttempts ? retryDelayMs : 0, + requestTimeoutMs: authLoginRequestTimeoutMs, method: "api", status: 0, statusText: "request-error", @@ -593,7 +597,7 @@ async function pageAuthLogin(browserContext, loginUrl, credential = { username, const response = await browserContext.request.post(loginUrl, { data: { username: credential.username, password: credential.password }, headers: { accept: "application/json", "content-type": "application/json" }, - timeout: 12000, + timeout: authLoginRequestTimeoutMs, }); await response.text().catch(() => ""); return { @@ -608,11 +612,79 @@ async function loginAccount(command) { const credential = credentialForAccount(accountId); const loginUrl = new URL("/auth/login", baseUrl).toString(); const before = await accountSessionSnapshot(); - const response = await pageAuthLogin(context, loginUrl, credential); - const cookieState = await readAuthCookieState(context); + const attempts = []; + let response = null; + let cookieState = null; + const maxAttempts = authLoginMaxAttempts; + for (let attempt = 1; attempt <= maxAttempts; attempt += 1) { + const retryDelayMs = authRetryDelayMs(attempt, maxAttempts); + const retryLabel = attempt + "/" + maxAttempts; + try { + response = await pageAuthLogin(context, loginUrl, credential); + cookieState = await readAuthCookieState(context); + const retryable = isRetryableAuthStatus(response.status); + attempts.push({ + attempt, + retryAttempt: attempt, + retryMaxAttempts: maxAttempts, + retryLabel, + retryDelayMs: retryable && attempt < maxAttempts ? retryDelayMs : 0, + requestTimeoutMs: authLoginRequestTimeoutMs, + method: "api", + status: response.status, + statusText: response.statusText, + retryable, + cookiePresent: cookieState.cookiePresent, + cookieNames: cookieState.cookieNames, + credentialSource: credential.source, + valuesRedacted: true, + }); + if (response.ok && cookieState.cookiePresent) break; + if (!retryable) break; + } catch (error) { + const retryable = isRetryableAuthError(error); + attempts.push({ + attempt, + retryAttempt: attempt, + retryMaxAttempts: maxAttempts, + retryLabel, + retryDelayMs: retryable && attempt < maxAttempts ? retryDelayMs : 0, + requestTimeoutMs: authLoginRequestTimeoutMs, + method: "api", + status: 0, + statusText: "request-error", + retryable, + error: error && error.message ? truncate(error.message, 500) : truncate(String(error), 500), + cookiePresent: false, + cookieNames: [], + credentialSource: credential.source, + valuesRedacted: true, + }); + response = { ok: false, status: 0, statusText: "request-error" }; + cookieState = await readAuthCookieState(context).catch(() => ({ cookiePresent: false, cookieNames: [] })); + if (!retryable) break; + } + if (attempt < maxAttempts && attempts[attempts.length - 1]?.retryable === true) await sleep(retryDelayMs); + } + response = response ?? { ok: false, status: 0, statusText: "api-login-failed" }; + cookieState = cookieState ?? await readAuthCookieState(context); if (!response.ok || !cookieState.cookiePresent) { const error = new Error("loginAccount failed for accountId=" + accountId + " status=" + response.status + " " + (response.statusText || "")); - error.details = { accountId, status: response.status, statusText: response.statusText, cookiePresent: cookieState.cookiePresent, credentialSource: credential.source, valuesRedacted: true }; + const retryable = attempts.some((item) => item && item.retryable === true); + error.details = { + accountId, + status: response.status, + statusText: response.statusText, + cookiePresent: cookieState.cookiePresent, + credentialSource: credential.source, + attempts, + retryCount: Math.max(0, attempts.length - 1), + retryMaxAttempts: maxAttempts, + lastRetryLabel: attempts[attempts.length - 1]?.retryLabel || null, + retryExhausted: retryable && attempts.length >= maxAttempts, + retryable, + valuesRedacted: true, + }; throw error; } const target = isWorkbenchPathname(safeUrlPath(currentPageUrl()) || "") ? safeUrlPath(currentPageUrl()) : targetPath; @@ -786,6 +858,10 @@ function isRetryableAuthError(error) { return /AbortError|EAI_AGAIN|ETIMEDOUT|ECONNRESET|ECONNREFUSED|ECONNABORTED|socket hang up|ERR_NETWORK_CHANGED|fetch failed|failed to fetch|network|timeout|aborted/iu.test(message); } +function authRetryDelayMs(attempt, maxAttempts) { + return attempt < maxAttempts ? Math.min(authLoginMaxDelayMs, authLoginInitialDelayMs * (2 ** (attempt - 1))) : 0; +} + function authFailureMessage(failure) { const last = Array.isArray(failure.attempts) && failure.attempts.length > 0 ? failure.attempts[failure.attempts.length - 1] : null; const retry = failure.lastRetryLabel ? " retry=" + failure.lastRetryLabel : ""; @@ -3767,6 +3843,14 @@ function positiveInteger(value, fallback) { return Number.isFinite(parsed) && parsed >= 0 ? Math.floor(parsed) : fallback; } +function boundedInteger(value, fallback, min, max) { + const parsed = Number(value); + if (!Number.isFinite(parsed)) return fallback; + const integer = Math.floor(parsed); + if (integer < min || integer > max) return fallback; + return integer; +} + function positiveNumber(value, fallback) { const parsed = Number(value); return Number.isFinite(parsed) && parsed > 0 ? parsed : fallback; diff --git a/scripts/src/hwlab-node/web-probe-observe.ts b/scripts/src/hwlab-node/web-probe-observe.ts index 503eaa66..1cf479ff 100644 --- a/scripts/src/hwlab-node/web-probe-observe.ts +++ b/scripts/src/hwlab-node/web-probe-observe.ts @@ -14,7 +14,7 @@ import { runCommand, type CommandResult } from "../command"; import { startJob } from "../jobs"; import { classifySshTcpPoolFailure } from "../ssh"; import { HWLAB_NODE_CONTROL_PLANE_CONFIG_PATH, hwlabNodeControlPlaneInfraHelp, runHwlabNodeControlPlaneInfra } from "../hwlab-node-control-plane"; -import { hwlabRuntimeLaneConfigPath, hwlabRuntimeLaneIds, hwlabRuntimeLaneSpec, hwlabRuntimeLaneSpecForNode, hwlabRuntimeNodeIds, isHwlabRuntimeLane, type HwlabRuntimeLane, type HwlabRuntimeLaneSpec, type HwlabRuntimeObservabilityRecordingRuleSpec, type HwlabRuntimeObservabilitySpec, type HwlabRuntimeObservabilityWarningAlertSpec, type HwlabRuntimePublicExposureSpec, type HwlabRuntimeWebProbeAlertThresholdsSpec, type HwlabRuntimeWebProbeProjectManagementSpec } from "../hwlab-node-lanes"; +import { hwlabRuntimeLaneConfigPath, hwlabRuntimeLaneIds, hwlabRuntimeLaneSpec, hwlabRuntimeLaneSpecForNode, hwlabRuntimeNodeIds, isHwlabRuntimeLane, type HwlabRuntimeLane, type HwlabRuntimeLaneSpec, type HwlabRuntimeObservabilityRecordingRuleSpec, type HwlabRuntimeObservabilitySpec, type HwlabRuntimeObservabilityWarningAlertSpec, type HwlabRuntimePublicExposureSpec, type HwlabRuntimeWebProbeAlertThresholdsSpec, type HwlabRuntimeWebProbeAuthLoginSpec, type HwlabRuntimeWebProbeProjectManagementSpec } from "../hwlab-node-lanes"; import { nodeWebProbeScriptRunnerSource } from "../hwlab-node-web-probe-runner-source"; import { nodeWebObserveAnalyzerSource } from "../hwlab-node-web-observe-analyzer-source"; import { nodeWebObserveRunnerSource } from "../hwlab-node-web-observe-runner-source"; @@ -1089,6 +1089,10 @@ export function nodeWebProbeProjectManagementConfig(spec: HwlabRuntimeLaneSpec): return spec.webProbe?.projectManagement ?? null; } +export function nodeWebProbeAuthLoginConfig(spec: HwlabRuntimeLaneSpec): HwlabRuntimeWebProbeAuthLoginSpec | null { + return spec.webProbe?.authLogin ?? null; +} + export interface NodeWebProbeHostProxyEnv { readonly envAssignments: string[]; readonly summary: Record; @@ -1252,6 +1256,7 @@ export function runNodeWebProbeObserveStart( const webProbeProxy = nodeWebProbeHostProxyEnv(spec, options.browserProxyMode); const alertThresholds = nodeWebProbeAlertThresholds(spec); const projectManagement = nodeWebProbeProjectManagementConfig(spec); + const authLogin = nodeWebProbeAuthLoginConfig(spec); const runnerEnvAssignments = [ ...webProbeProxy.envAssignments, ...webProbeAccountEnvAssignments(), @@ -1269,6 +1274,14 @@ export function runNodeWebProbeObserveStart( `UNIDESK_WEB_OBSERVE_BROWSER_PROXY_MODE=${shellQuote(options.browserProxyMode)}`, `UNIDESK_WEB_OBSERVE_ALERT_THRESHOLDS_JSON=${shellQuote(JSON.stringify(alertThresholds))}`, `UNIDESK_WEB_OBSERVE_PROJECT_MANAGEMENT_JSON=${shellQuote(JSON.stringify(projectManagement))}`, + ...(authLogin === null + ? [] + : [ + `UNIDESK_WEB_OBSERVE_AUTH_LOGIN_MAX_ATTEMPTS=${shellQuote(String(authLogin.maxAttempts))}`, + `UNIDESK_WEB_OBSERVE_AUTH_LOGIN_REQUEST_TIMEOUT_MS=${shellQuote(String(authLogin.requestTimeoutMs))}`, + `UNIDESK_WEB_OBSERVE_AUTH_LOGIN_INITIAL_DELAY_MS=${shellQuote(String(authLogin.initialDelayMs))}`, + `UNIDESK_WEB_OBSERVE_AUTH_LOGIN_MAX_DELAY_MS=${shellQuote(String(authLogin.maxDelayMs))}`, + ]), ].join(" "); const script = [ "set -eu", @@ -1291,9 +1304,27 @@ export function runNodeWebProbeObserveStart( ].join("\n"); const result = runTransWorkspaceStdinScript(options.node, spec.workspace, script, options.commandTimeoutSeconds); const started = parseJsonObject(result.stdout); - const observerId = typeof started?.jobId === "string" ? started.jobId : jobId; - const index = result.exitCode === 0 && started?.ok === true - ? upsertWebObserveIndexEntry({ + const startOk = result.exitCode === 0 && started?.ok === true; + const recovery = startOk + ? null + : readNodeWebProbeObserveRemoteStatus({ ...options, id: jobId, stateDir }, spec, 1, Math.min(options.commandTimeoutSeconds, 30)); + const recoveredStatus = !startOk && recovery !== null && recovery.result.exitCode === 0 && recovery.status !== null && recovery.status.ok !== false + ? recovery.status + : null; + const effectiveOk = startOk || recoveredStatus !== null; + const observer = startOk ? started : recoveredStatus; + const observerId = typeof started?.jobId === "string" + ? started.jobId + : webObserveIdFromStatus(recoveredStatus, { ...options, id: jobId }) ?? jobId; + const degradedReason = !startOk && recoveredStatus !== null + ? "web-probe-start-transport-timeout-recovered" + : result.timedOut + ? "web-probe-command-timeout" + : result.exitCode !== 0 + ? "web-probe-observe-start-failed" + : null; + const index = effectiveOk + ? upsertWebObserveIndexEntry(startOk ? { id: observerId, node: options.node, lane: options.lane, @@ -1305,11 +1336,11 @@ export function runNodeWebProbeObserveStart( pid: typeof started.pid === "number" ? started.pid : null, startedAt: new Date().toISOString(), updatedAt: new Date().toISOString(), - }) + } : webObserveIndexEntryFromOptions({ ...options, id: observerId, stateDir }, spec, observerId, recoveredStatus ?? {})) : null; return renderWebObserveStartResult({ - ok: result.exitCode === 0 && started?.ok === true, - status: result.exitCode === 0 && started?.ok === true ? "started" : "blocked", + ok: effectiveOk, + status: effectiveOk ? "started" : "blocked", command: `web-probe observe start --node ${options.node} --lane ${options.lane}`, node: options.node, lane: options.lane, @@ -1320,12 +1351,20 @@ export function runNodeWebProbeObserveStart( projectManagement, targetPath: options.targetPath, id: observerId, + degradedReason, credential, - observer: withWebObserveShortcuts(started, observerId), + observer: withWebObserveShortcuts(observer, observerId), wrapper: buildWebObserveWrapperForObserveOptions("start", options, spec.workspace, { id: observerId, jobId: observerId, stateDir }), index, next: webObserveNextCommands(observerId), result: compactCommandResultRedacted(result, [material.password ?? ""]), + recovery: recovery === null ? null : { + attempted: true, + ok: recoveredStatus !== null, + reason: degradedReason, + result: compactCommandResultWithStdoutTail(recovery.result), + valuesRedacted: true, + }, valuesRedacted: true, }); }