fix(web-probe): recover observe startup after transport timeout

This commit is contained in:
Codex
2026-06-26 16:04:58 +00:00
parent 2d953cc911
commit 01f8da23ed
4 changed files with 173 additions and 16 deletions
+5
View File
@@ -179,6 +179,11 @@ lanes:
defaultOrigin:
mode: public
baseUrl: https://hwlab.pikapython.com
# Retry policy for the web-probe auth login request; every *Ms value is in milliseconds.
authLogin:
maxAttempts: 6
requestTimeoutMs: 30000
initialDelayMs: 500
maxDelayMs: 10000
alertThresholds:
sameOriginApiSlowMs: 10000
partialApiSlowMs: 10000
+29
View File
@@ -137,10 +137,18 @@ export type HwlabRuntimeWebProbeOriginSpec = HwlabRuntimeWebProbeServiceOriginSp
/**
 * Web-probe settings of a runtime lane.
 * Every section is optional; webProbeConfig only includes a section in the parsed
 * result when the raw configuration defines it.
 */
export interface HwlabRuntimeWebProbeSpec {
/** Browser proxy selection, either "auto" or "direct". */
readonly browserProxyMode?: "auto" | "direct";
/** Default origin the probe targets. */
readonly defaultOrigin?: HwlabRuntimeWebProbeOriginSpec;
/** Retry and timeout policy for the auth login request. */
readonly authLogin?: HwlabRuntimeWebProbeAuthLoginSpec;
/** Alert thresholds section. */
readonly alertThresholds?: HwlabRuntimeWebProbeAlertThresholdsSpec;
/** Project-management section. */
readonly projectManagement?: HwlabRuntimeWebProbeProjectManagementSpec;
}
/**
 * Retry and timeout policy for the web-probe auth login request.
 * The ranges quoted on each field are the ones enforced by webProbeAuthLoginConfig.
 */
export interface HwlabRuntimeWebProbeAuthLoginSpec {
/** Maximum number of login attempts; integer between 1 and 20. */
readonly maxAttempts: number;
/** Timeout of a single login request in milliseconds; integer between 1000 and 120000. */
readonly requestTimeoutMs: number;
/** Base retry delay in milliseconds; integer between 0 and 60000. */
readonly initialDelayMs: number;
/** Upper bound for a single retry delay in milliseconds; integer between 0 and 120000. */
readonly maxDelayMs: number;
}
export type HwlabRuntimeWebProbeSentinelConfigRefKey = "runtime" | "scenarios" | "promptSet" | "reportViews" | "publicExposure" | "cicd" | "secrets";
export const HWLAB_WEB_PROBE_SENTINEL_CONFIG_REF_KEYS = ["runtime", "scenarios", "promptSet", "reportViews", "publicExposure", "cicd", "secrets"] as const satisfies readonly HwlabRuntimeWebProbeSentinelConfigRefKey[];
@@ -447,6 +455,14 @@ function positiveNumberField(obj: Record<string, unknown>, key: string, path: st
return value;
}
/**
 * Reads `obj[key]` and returns it when it is an integer within the inclusive
 * range [min, max].
 *
 * @param obj - Record holding the field.
 * @param key - Field name to read.
 * @param path - Dotted location of `obj`, used as the prefix of the error message.
 * @param min - Smallest accepted value (inclusive).
 * @param max - Largest accepted value (inclusive).
 * @returns The validated integer.
 * @throws Error when the field is missing, not a number, not an integer, or out of range.
 */
function boundedIntegerField(obj: Record<string, unknown>, key: string, path: string, min: number, max: number): number {
  const candidate = obj[key];
  // Accept first, reject by default: anything that is not a number in range falls through to the throw.
  if (typeof candidate === "number" && Number.isInteger(candidate) && candidate >= min && candidate <= max) {
    return candidate;
  }
  throw new Error(`${path}.${key} must be an integer between ${min} and ${max}`);
}
/**
 * Turns a record-of-records value into `[key, record]` pairs.
 * The outer value and every entry are passed through asRecord; each entry is
 * checked under the path `${path}.${key}`. Pairs are returned in
 * Object.entries order.
 * NOTE(review): despite the name, no explicit sort is applied here — confirm
 * whether callers depend on a particular ordering.
 */
function sortedRecordEntries(value: unknown, path: string): Array<[string, Record<string, unknown>]> {
  const pairs: Array<[string, Record<string, unknown>]> = [];
  for (const [name, child] of Object.entries(asRecord(value, path))) {
    pairs.push([name, asRecord(child, `${path}.${name}`)]);
  }
  return pairs;
}
@@ -774,11 +790,24 @@ function webProbeConfig(value: unknown, path: string): HwlabRuntimeWebProbeSpec
return {
...(browserProxyMode === undefined ? {} : { browserProxyMode }),
...(raw.defaultOrigin === undefined ? {} : { defaultOrigin: webProbeOriginConfig(raw.defaultOrigin, `${path}.defaultOrigin`) }),
...(raw.authLogin === undefined ? {} : { authLogin: webProbeAuthLoginConfig(raw.authLogin, `${path}.authLogin`) }),
...(raw.alertThresholds === undefined ? {} : { alertThresholds: webProbeAlertThresholdsConfig(raw.alertThresholds, `${path}.alertThresholds`) }),
...(raw.projectManagement === undefined ? {} : { projectManagement: webProbeProjectManagementConfig(raw.projectManagement, `${path}.projectManagement`) }),
};
}
/**
 * Parses and validates the `authLogin` section of a web-probe configuration.
 *
 * All four fields are required and go through the same bounded-integer check,
 * so every violation reports the uniform "must be an integer between min and max"
 * message (maxAttempts previously used a separate positive-number check first).
 *
 * @param value - Raw `authLogin` value from the lane configuration.
 * @param path - Dotted location of the section, used as the prefix of error messages.
 * @returns The validated login retry policy.
 * @throws Error when a field is missing, not an integer, or outside its allowed range
 *   (and whatever asRecord raises for a value it does not accept).
 */
function webProbeAuthLoginConfig(value: unknown, path: string): HwlabRuntimeWebProbeAuthLoginSpec {
  const raw = asRecord(value, path);
  return {
    maxAttempts: boundedIntegerField(raw, "maxAttempts", path, 1, 20),
    requestTimeoutMs: boundedIntegerField(raw, "requestTimeoutMs", path, 1000, 120000),
    initialDelayMs: boundedIntegerField(raw, "initialDelayMs", path, 0, 60000),
    maxDelayMs: boundedIntegerField(raw, "maxDelayMs", path, 0, 120000),
  };
}
function webProbeSentinelConfig(value: unknown, path: string): HwlabRuntimeWebProbeSentinelSpec {
const raw = asRecord(value, path);
const allowed = new Set(["enabled", "configRefs"]);
@@ -25,6 +25,10 @@ const maxSamples = positiveInteger(process.env.UNIDESK_WEB_OBSERVE_MAX_SAMPLES,
const observerRefreshIntervalMs = positiveInteger(process.env.UNIDESK_WEB_OBSERVE_OBSERVER_REFRESH_INTERVAL_MS, 180000);
const viewport = parseViewport(process.env.UNIDESK_WEB_OBSERVE_VIEWPORT || "1440x900");
const browserProxyMode = parseBrowserProxyMode(process.env.UNIDESK_WEB_OBSERVE_BROWSER_PROXY_MODE || "auto");
// Auth login retry policy read from the environment. Arguments to boundedInteger are
// (raw value, default, min, max): the default is used when the raw value does not parse
// to a finite number or lies outside the inclusive [min, max] range. Delays and the
// request timeout are in milliseconds.
const authLoginMaxAttempts = boundedInteger(process.env.UNIDESK_WEB_OBSERVE_AUTH_LOGIN_MAX_ATTEMPTS, 6, 1, 20);
const authLoginRequestTimeoutMs = boundedInteger(process.env.UNIDESK_WEB_OBSERVE_AUTH_LOGIN_REQUEST_TIMEOUT_MS, 30000, 1000, 120000);
const authLoginInitialDelayMs = boundedInteger(process.env.UNIDESK_WEB_OBSERVE_AUTH_LOGIN_INITIAL_DELAY_MS, 500, 0, 60000);
const authLoginMaxDelayMs = boundedInteger(process.env.UNIDESK_WEB_OBSERVE_AUTH_LOGIN_MAX_DELAY_MS, 10000, 0, 120000);
const alertThresholds = parseAlertThresholds(process.env.UNIDESK_WEB_OBSERVE_ALERT_THRESHOLDS_JSON);
const projectManagement = parseProjectManagementConfig(process.env.UNIDESK_WEB_OBSERVE_PROJECT_MANAGEMENT_JSON);
const playwrightProxy = proxyConfigFromEnv(baseUrl);
@@ -493,11 +497,9 @@ async function runControlCommand(command, fn) {
async function authenticate(browserContext) {
const loginUrl = new URL("/auth/login", baseUrl).toString();
const attempts = [];
const maxAttempts = 5;
const initialDelayMs = 250;
const maxDelayMs = 5000;
const maxAttempts = authLoginMaxAttempts;
for (let attempt = 1; attempt <= maxAttempts; attempt += 1) {
const retryDelayMs = attempt < maxAttempts ? Math.min(maxDelayMs, initialDelayMs * (2 ** (attempt - 1))) : 0;
const retryDelayMs = authRetryDelayMs(attempt, maxAttempts);
const retryLabel = attempt + "/" + maxAttempts;
await writeHeartbeat({ status: terminalStatus, auth: { phase: "api-login", retryAttempt: attempt, retryMaxAttempts: maxAttempts, lastRetryLabel: retryLabel, retryDelayMs: 0, retryExhausted: false, valuesRedacted: true } }).catch(() => {});
try {
@@ -510,6 +512,7 @@ async function authenticate(browserContext) {
retryMaxAttempts: maxAttempts,
retryLabel,
retryDelayMs: retryable && attempt < maxAttempts ? retryDelayMs : 0,
requestTimeoutMs: authLoginRequestTimeoutMs,
method: "api",
status: response.status,
statusText: response.statusText,
@@ -547,6 +550,7 @@ async function authenticate(browserContext) {
retryMaxAttempts: maxAttempts,
retryLabel,
retryDelayMs: retryable && attempt < maxAttempts ? retryDelayMs : 0,
requestTimeoutMs: authLoginRequestTimeoutMs,
method: "api",
status: 0,
statusText: "request-error",
@@ -593,7 +597,7 @@ async function pageAuthLogin(browserContext, loginUrl, credential = { username,
const response = await browserContext.request.post(loginUrl, {
data: { username: credential.username, password: credential.password },
headers: { accept: "application/json", "content-type": "application/json" },
timeout: 12000,
timeout: authLoginRequestTimeoutMs,
});
await response.text().catch(() => "");
return {
@@ -608,11 +612,79 @@ async function loginAccount(command) {
const credential = credentialForAccount(accountId);
const loginUrl = new URL("/auth/login", baseUrl).toString();
const before = await accountSessionSnapshot();
const response = await pageAuthLogin(context, loginUrl, credential);
const cookieState = await readAuthCookieState(context);
const attempts = [];
let response = null;
let cookieState = null;
const maxAttempts = authLoginMaxAttempts;
for (let attempt = 1; attempt <= maxAttempts; attempt += 1) {
const retryDelayMs = authRetryDelayMs(attempt, maxAttempts);
const retryLabel = attempt + "/" + maxAttempts;
try {
response = await pageAuthLogin(context, loginUrl, credential);
cookieState = await readAuthCookieState(context);
const retryable = isRetryableAuthStatus(response.status);
attempts.push({
attempt,
retryAttempt: attempt,
retryMaxAttempts: maxAttempts,
retryLabel,
retryDelayMs: retryable && attempt < maxAttempts ? retryDelayMs : 0,
requestTimeoutMs: authLoginRequestTimeoutMs,
method: "api",
status: response.status,
statusText: response.statusText,
retryable,
cookiePresent: cookieState.cookiePresent,
cookieNames: cookieState.cookieNames,
credentialSource: credential.source,
valuesRedacted: true,
});
if (response.ok && cookieState.cookiePresent) break;
if (!retryable) break;
} catch (error) {
const retryable = isRetryableAuthError(error);
attempts.push({
attempt,
retryAttempt: attempt,
retryMaxAttempts: maxAttempts,
retryLabel,
retryDelayMs: retryable && attempt < maxAttempts ? retryDelayMs : 0,
requestTimeoutMs: authLoginRequestTimeoutMs,
method: "api",
status: 0,
statusText: "request-error",
retryable,
error: error && error.message ? truncate(error.message, 500) : truncate(String(error), 500),
cookiePresent: false,
cookieNames: [],
credentialSource: credential.source,
valuesRedacted: true,
});
response = { ok: false, status: 0, statusText: "request-error" };
cookieState = await readAuthCookieState(context).catch(() => ({ cookiePresent: false, cookieNames: [] }));
if (!retryable) break;
}
if (attempt < maxAttempts && attempts[attempts.length - 1]?.retryable === true) await sleep(retryDelayMs);
}
response = response ?? { ok: false, status: 0, statusText: "api-login-failed" };
cookieState = cookieState ?? await readAuthCookieState(context);
if (!response.ok || !cookieState.cookiePresent) {
const error = new Error("loginAccount failed for accountId=" + accountId + " status=" + response.status + " " + (response.statusText || ""));
error.details = { accountId, status: response.status, statusText: response.statusText, cookiePresent: cookieState.cookiePresent, credentialSource: credential.source, valuesRedacted: true };
const retryable = attempts.some((item) => item && item.retryable === true);
error.details = {
accountId,
status: response.status,
statusText: response.statusText,
cookiePresent: cookieState.cookiePresent,
credentialSource: credential.source,
attempts,
retryCount: Math.max(0, attempts.length - 1),
retryMaxAttempts: maxAttempts,
lastRetryLabel: attempts[attempts.length - 1]?.retryLabel || null,
retryExhausted: retryable && attempts.length >= maxAttempts,
retryable,
valuesRedacted: true,
};
throw error;
}
const target = isWorkbenchPathname(safeUrlPath(currentPageUrl()) || "") ? safeUrlPath(currentPageUrl()) : targetPath;
@@ -786,6 +858,10 @@ function isRetryableAuthError(error) {
return /AbortError|EAI_AGAIN|ETIMEDOUT|ECONNRESET|ECONNREFUSED|ECONNABORTED|socket hang up|ERR_NETWORK_CHANGED|fetch failed|failed to fetch|network|timeout|aborted/iu.test(message);
}
function authRetryDelayMs(attempt, maxAttempts) {
return attempt < maxAttempts ? Math.min(authLoginMaxDelayMs, authLoginInitialDelayMs * (2 ** (attempt - 1))) : 0;
}
function authFailureMessage(failure) {
const last = Array.isArray(failure.attempts) && failure.attempts.length > 0 ? failure.attempts[failure.attempts.length - 1] : null;
const retry = failure.lastRetryLabel ? " retry=" + failure.lastRetryLabel : "";
@@ -3767,6 +3843,14 @@ function positiveInteger(value, fallback) {
return Number.isFinite(parsed) && parsed >= 0 ? Math.floor(parsed) : fallback;
}
// Parses an integer setting (typically an environment variable string) restricted to the
// inclusive range [min, max].
// Returns the fallback when the value is absent, blank, not a finite number, or outside
// the range after flooring; otherwise returns the floored value.
function boundedInteger(value, fallback, min, max) {
  // Number(null), Number("") and Number("   ") all evaluate to 0, which would be taken as an
  // explicit zero whenever min <= 0. Treat them as "not provided" so the default applies.
  if (value === undefined || value === null) return fallback;
  if (typeof value === "string" && value.trim() === "") return fallback;
  const parsed = Number(value);
  if (!Number.isFinite(parsed)) return fallback;
  const integer = Math.floor(parsed);
  if (integer < min || integer > max) return fallback;
  return integer;
}
function positiveNumber(value, fallback) {
const parsed = Number(value);
return Number.isFinite(parsed) && parsed > 0 ? parsed : fallback;
+47 -8
View File
@@ -14,7 +14,7 @@ import { runCommand, type CommandResult } from "../command";
import { startJob } from "../jobs";
import { classifySshTcpPoolFailure } from "../ssh";
import { HWLAB_NODE_CONTROL_PLANE_CONFIG_PATH, hwlabNodeControlPlaneInfraHelp, runHwlabNodeControlPlaneInfra } from "../hwlab-node-control-plane";
import { hwlabRuntimeLaneConfigPath, hwlabRuntimeLaneIds, hwlabRuntimeLaneSpec, hwlabRuntimeLaneSpecForNode, hwlabRuntimeNodeIds, isHwlabRuntimeLane, type HwlabRuntimeLane, type HwlabRuntimeLaneSpec, type HwlabRuntimeObservabilityRecordingRuleSpec, type HwlabRuntimeObservabilitySpec, type HwlabRuntimeObservabilityWarningAlertSpec, type HwlabRuntimePublicExposureSpec, type HwlabRuntimeWebProbeAlertThresholdsSpec, type HwlabRuntimeWebProbeProjectManagementSpec } from "../hwlab-node-lanes";
import { hwlabRuntimeLaneConfigPath, hwlabRuntimeLaneIds, hwlabRuntimeLaneSpec, hwlabRuntimeLaneSpecForNode, hwlabRuntimeNodeIds, isHwlabRuntimeLane, type HwlabRuntimeLane, type HwlabRuntimeLaneSpec, type HwlabRuntimeObservabilityRecordingRuleSpec, type HwlabRuntimeObservabilitySpec, type HwlabRuntimeObservabilityWarningAlertSpec, type HwlabRuntimePublicExposureSpec, type HwlabRuntimeWebProbeAlertThresholdsSpec, type HwlabRuntimeWebProbeAuthLoginSpec, type HwlabRuntimeWebProbeProjectManagementSpec } from "../hwlab-node-lanes";
import { nodeWebProbeScriptRunnerSource } from "../hwlab-node-web-probe-runner-source";
import { nodeWebObserveAnalyzerSource } from "../hwlab-node-web-observe-analyzer-source";
import { nodeWebObserveRunnerSource } from "../hwlab-node-web-observe-runner-source";
@@ -1089,6 +1089,10 @@ export function nodeWebProbeProjectManagementConfig(spec: HwlabRuntimeLaneSpec):
return spec.webProbe?.projectManagement ?? null;
}
/**
 * Looks up the auth login retry policy of a lane.
 *
 * @param spec - Lane specification to read `webProbe.authLogin` from.
 * @returns The configured policy, or null when the lane has no `webProbe` section or
 *   that section defines no `authLogin`.
 */
export function nodeWebProbeAuthLoginConfig(spec: HwlabRuntimeLaneSpec): HwlabRuntimeWebProbeAuthLoginSpec | null {
  const configured = spec.webProbe?.authLogin;
  if (configured === undefined || configured === null) {
    return null;
  }
  return configured;
}
export interface NodeWebProbeHostProxyEnv {
readonly envAssignments: string[];
readonly summary: Record<string, unknown>;
@@ -1252,6 +1256,7 @@ export function runNodeWebProbeObserveStart(
const webProbeProxy = nodeWebProbeHostProxyEnv(spec, options.browserProxyMode);
const alertThresholds = nodeWebProbeAlertThresholds(spec);
const projectManagement = nodeWebProbeProjectManagementConfig(spec);
const authLogin = nodeWebProbeAuthLoginConfig(spec);
const runnerEnvAssignments = [
...webProbeProxy.envAssignments,
...webProbeAccountEnvAssignments(),
@@ -1269,6 +1274,14 @@ export function runNodeWebProbeObserveStart(
`UNIDESK_WEB_OBSERVE_BROWSER_PROXY_MODE=${shellQuote(options.browserProxyMode)}`,
`UNIDESK_WEB_OBSERVE_ALERT_THRESHOLDS_JSON=${shellQuote(JSON.stringify(alertThresholds))}`,
`UNIDESK_WEB_OBSERVE_PROJECT_MANAGEMENT_JSON=${shellQuote(JSON.stringify(projectManagement))}`,
...(authLogin === null
? []
: [
`UNIDESK_WEB_OBSERVE_AUTH_LOGIN_MAX_ATTEMPTS=${shellQuote(String(authLogin.maxAttempts))}`,
`UNIDESK_WEB_OBSERVE_AUTH_LOGIN_REQUEST_TIMEOUT_MS=${shellQuote(String(authLogin.requestTimeoutMs))}`,
`UNIDESK_WEB_OBSERVE_AUTH_LOGIN_INITIAL_DELAY_MS=${shellQuote(String(authLogin.initialDelayMs))}`,
`UNIDESK_WEB_OBSERVE_AUTH_LOGIN_MAX_DELAY_MS=${shellQuote(String(authLogin.maxDelayMs))}`,
]),
].join(" ");
const script = [
"set -eu",
@@ -1291,9 +1304,27 @@ export function runNodeWebProbeObserveStart(
].join("\n");
const result = runTransWorkspaceStdinScript(options.node, spec.workspace, script, options.commandTimeoutSeconds);
const started = parseJsonObject(result.stdout);
const observerId = typeof started?.jobId === "string" ? started.jobId : jobId;
const index = result.exitCode === 0 && started?.ok === true
? upsertWebObserveIndexEntry({
const startOk = result.exitCode === 0 && started?.ok === true;
const recovery = startOk
? null
: readNodeWebProbeObserveRemoteStatus({ ...options, id: jobId, stateDir }, spec, 1, Math.min(options.commandTimeoutSeconds, 30));
const recoveredStatus = !startOk && recovery !== null && recovery.result.exitCode === 0 && recovery.status !== null && recovery.status.ok !== false
? recovery.status
: null;
const effectiveOk = startOk || recoveredStatus !== null;
const observer = startOk ? started : recoveredStatus;
const observerId = typeof started?.jobId === "string"
? started.jobId
: webObserveIdFromStatus(recoveredStatus, { ...options, id: jobId }) ?? jobId;
const degradedReason = !startOk && recoveredStatus !== null
? "web-probe-start-transport-timeout-recovered"
: result.timedOut
? "web-probe-command-timeout"
: result.exitCode !== 0
? "web-probe-observe-start-failed"
: null;
const index = effectiveOk
? upsertWebObserveIndexEntry(startOk ? {
id: observerId,
node: options.node,
lane: options.lane,
@@ -1305,11 +1336,11 @@ export function runNodeWebProbeObserveStart(
pid: typeof started.pid === "number" ? started.pid : null,
startedAt: new Date().toISOString(),
updatedAt: new Date().toISOString(),
})
} : webObserveIndexEntryFromOptions({ ...options, id: observerId, stateDir }, spec, observerId, recoveredStatus ?? {}))
: null;
return renderWebObserveStartResult({
ok: result.exitCode === 0 && started?.ok === true,
status: result.exitCode === 0 && started?.ok === true ? "started" : "blocked",
ok: effectiveOk,
status: effectiveOk ? "started" : "blocked",
command: `web-probe observe start --node ${options.node} --lane ${options.lane}`,
node: options.node,
lane: options.lane,
@@ -1320,12 +1351,20 @@ export function runNodeWebProbeObserveStart(
projectManagement,
targetPath: options.targetPath,
id: observerId,
degradedReason,
credential,
observer: withWebObserveShortcuts(started, observerId),
observer: withWebObserveShortcuts(observer, observerId),
wrapper: buildWebObserveWrapperForObserveOptions("start", options, spec.workspace, { id: observerId, jobId: observerId, stateDir }),
index,
next: webObserveNextCommands(observerId),
result: compactCommandResultRedacted(result, [material.password ?? ""]),
recovery: recovery === null ? null : {
attempted: true,
ok: recoveredStatus !== null,
reason: degradedReason,
result: compactCommandResultWithStdoutTail(recovery.result),
valuesRedacted: true,
},
valuesRedacted: true,
});
}