fix(web-probe): recover observe startup after transport timeout
This commit is contained in:
@@ -179,6 +179,11 @@ lanes:
|
||||
defaultOrigin:
|
||||
mode: public
|
||||
baseUrl: https://hwlab.pikapython.com
|
||||
authLogin:
|
||||
maxAttempts: 6
|
||||
requestTimeoutMs: 30000
|
||||
initialDelayMs: 500
|
||||
maxDelayMs: 10000
|
||||
alertThresholds:
|
||||
sameOriginApiSlowMs: 10000
|
||||
partialApiSlowMs: 10000
|
||||
|
||||
@@ -137,10 +137,18 @@ export type HwlabRuntimeWebProbeOriginSpec = HwlabRuntimeWebProbeServiceOriginSp
|
||||
/**
 * Web-probe settings attached to a runtime lane.
 *
 * Every member is optional: the config parser only sets a key when the raw
 * lane config provides the corresponding section.
 */
export interface HwlabRuntimeWebProbeSpec {
  /** Browser proxy selection; only "auto" and "direct" are representable. */
  readonly browserProxyMode?: "auto" | "direct";
  /** Origin the probe targets when none is given explicitly (NOTE(review): usage not visible here — confirm). */
  readonly defaultOrigin?: HwlabRuntimeWebProbeOriginSpec;
  /** Retry and timeout tuning for the probe's auth login request. */
  readonly authLogin?: HwlabRuntimeWebProbeAuthLoginSpec;
  /** Alert thresholds forwarded to the observer. */
  readonly alertThresholds?: HwlabRuntimeWebProbeAlertThresholdsSpec;
  /** Project-management settings forwarded to the observer. */
  readonly projectManagement?: HwlabRuntimeWebProbeProjectManagementSpec;
}
|
||||
|
||||
/**
 * Retry and timeout tuning for the web-probe auth login request.
 *
 * The ranges quoted below are the ones the config parser enforces; values
 * outside them are rejected when the lane config is loaded.
 */
export interface HwlabRuntimeWebProbeAuthLoginSpec {
  /** Maximum number of login attempts; integer in 1..20. */
  readonly maxAttempts: number;
  /** Per-request timeout for the login call, in milliseconds; integer in 1000..120000. */
  readonly requestTimeoutMs: number;
  /** Delay before the first retry, in milliseconds; integer in 0..60000. Doubles on each later retry. */
  readonly initialDelayMs: number;
  /** Upper bound on any single retry delay, in milliseconds; integer in 0..120000. */
  readonly maxDelayMs: number;
}
|
||||
|
||||
export type HwlabRuntimeWebProbeSentinelConfigRefKey = "runtime" | "scenarios" | "promptSet" | "reportViews" | "publicExposure" | "cicd" | "secrets";
|
||||
|
||||
export const HWLAB_WEB_PROBE_SENTINEL_CONFIG_REF_KEYS = ["runtime", "scenarios", "promptSet", "reportViews", "publicExposure", "cicd", "secrets"] as const satisfies readonly HwlabRuntimeWebProbeSentinelConfigRefKey[];
|
||||
@@ -447,6 +455,14 @@ function positiveNumberField(obj: Record<string, unknown>, key: string, path: st
|
||||
return value;
|
||||
}
|
||||
|
||||
/**
 * Reads `obj[key]` and returns it when it is an integer inside `[min, max]`
 * (both bounds inclusive).
 *
 * @param obj  Record holding the raw config values.
 * @param key  Property to read from `obj`.
 * @param path Dotted config path of `obj`, used only to build the error message.
 * @param min  Smallest accepted value.
 * @param max  Largest accepted value.
 * @returns The validated integer.
 * @throws Error when the value is missing, not a number, not an integer, or out of range.
 */
function boundedIntegerField(obj: Record<string, unknown>, key: string, path: string, min: number, max: number): number {
  const candidate = obj[key];
  if (typeof candidate === "number" && Number.isInteger(candidate)) {
    const outOfRange = candidate < min || candidate > max;
    if (!outOfRange) return candidate;
  }
  throw new Error(`${path}.${key} must be an integer between ${min} and ${max}`);
}
|
||||
|
||||
/**
 * Turns a record-of-records into a list of `[key, record]` pairs.
 *
 * The outer value and every entry are passed through `asRecord`; each entry
 * is reported under the path `${path}.${key}`.
 *
 * NOTE(review): despite the name, no explicit sort happens here. Pairs are
 * returned in `Object.entries` order; confirm whether callers rely on that.
 *
 * @param value Raw value expected to be an object whose values are objects.
 * @param path  Dotted config path of `value`, forwarded to `asRecord`.
 */
function sortedRecordEntries(value: unknown, path: string): Array<[string, Record<string, unknown>]> {
  const pairs: Array<[string, Record<string, unknown>]> = [];
  for (const [key, item] of Object.entries(asRecord(value, path))) {
    pairs.push([key, asRecord(item, `${path}.${key}`)]);
  }
  return pairs;
}
|
||||
@@ -774,11 +790,24 @@ function webProbeConfig(value: unknown, path: string): HwlabRuntimeWebProbeSpec
|
||||
return {
|
||||
...(browserProxyMode === undefined ? {} : { browserProxyMode }),
|
||||
...(raw.defaultOrigin === undefined ? {} : { defaultOrigin: webProbeOriginConfig(raw.defaultOrigin, `${path}.defaultOrigin`) }),
|
||||
...(raw.authLogin === undefined ? {} : { authLogin: webProbeAuthLoginConfig(raw.authLogin, `${path}.authLogin`) }),
|
||||
...(raw.alertThresholds === undefined ? {} : { alertThresholds: webProbeAlertThresholdsConfig(raw.alertThresholds, `${path}.alertThresholds`) }),
|
||||
...(raw.projectManagement === undefined ? {} : { projectManagement: webProbeProjectManagementConfig(raw.projectManagement, `${path}.projectManagement`) }),
|
||||
};
|
||||
}
|
||||
|
||||
/**
 * Parses and validates the `authLogin` section of a web-probe config.
 *
 * All four fields are required and go through `boundedIntegerField`, so a
 * bad value always fails with the same
 * "<path>.<key> must be an integer between <min> and <max>" message.
 *
 * Accepted ranges (inclusive):
 * - maxAttempts:      1..20
 * - requestTimeoutMs: 1000..120000
 * - initialDelayMs:   0..60000
 * - maxDelayMs:       0..120000
 *
 * @param value Raw `authLogin` value from the lane config.
 * @param path  Dotted config path of `value`, used in error messages.
 * @returns The validated auth login spec.
 * @throws Error when a field is missing, not an integer, or outside its range
 *         (and whatever `asRecord` throws for a non-record `value`).
 */
function webProbeAuthLoginConfig(value: unknown, path: string): HwlabRuntimeWebProbeAuthLoginSpec {
  const raw = asRecord(value, path);
  return {
    // Previously validated in two steps with a hand-copied range check; the
    // shared helper enforces the same 1..20 integer rule.
    maxAttempts: boundedIntegerField(raw, "maxAttempts", path, 1, 20),
    requestTimeoutMs: boundedIntegerField(raw, "requestTimeoutMs", path, 1000, 120000),
    initialDelayMs: boundedIntegerField(raw, "initialDelayMs", path, 0, 60000),
    maxDelayMs: boundedIntegerField(raw, "maxDelayMs", path, 0, 120000),
  };
}
|
||||
|
||||
function webProbeSentinelConfig(value: unknown, path: string): HwlabRuntimeWebProbeSentinelSpec {
|
||||
const raw = asRecord(value, path);
|
||||
const allowed = new Set(["enabled", "configRefs"]);
|
||||
|
||||
@@ -25,6 +25,10 @@ const maxSamples = positiveInteger(process.env.UNIDESK_WEB_OBSERVE_MAX_SAMPLES,
|
||||
const observerRefreshIntervalMs = positiveInteger(process.env.UNIDESK_WEB_OBSERVE_OBSERVER_REFRESH_INTERVAL_MS, 180000);
|
||||
const viewport = parseViewport(process.env.UNIDESK_WEB_OBSERVE_VIEWPORT || "1440x900");
|
||||
const browserProxyMode = parseBrowserProxyMode(process.env.UNIDESK_WEB_OBSERVE_BROWSER_PROXY_MODE || "auto");
|
||||
const authLoginMaxAttempts = boundedInteger(process.env.UNIDESK_WEB_OBSERVE_AUTH_LOGIN_MAX_ATTEMPTS, 6, 1, 20);
|
||||
const authLoginRequestTimeoutMs = boundedInteger(process.env.UNIDESK_WEB_OBSERVE_AUTH_LOGIN_REQUEST_TIMEOUT_MS, 30000, 1000, 120000);
|
||||
const authLoginInitialDelayMs = boundedInteger(process.env.UNIDESK_WEB_OBSERVE_AUTH_LOGIN_INITIAL_DELAY_MS, 500, 0, 60000);
|
||||
const authLoginMaxDelayMs = boundedInteger(process.env.UNIDESK_WEB_OBSERVE_AUTH_LOGIN_MAX_DELAY_MS, 10000, 0, 120000);
|
||||
const alertThresholds = parseAlertThresholds(process.env.UNIDESK_WEB_OBSERVE_ALERT_THRESHOLDS_JSON);
|
||||
const projectManagement = parseProjectManagementConfig(process.env.UNIDESK_WEB_OBSERVE_PROJECT_MANAGEMENT_JSON);
|
||||
const playwrightProxy = proxyConfigFromEnv(baseUrl);
|
||||
@@ -493,11 +497,9 @@ async function runControlCommand(command, fn) {
|
||||
async function authenticate(browserContext) {
|
||||
const loginUrl = new URL("/auth/login", baseUrl).toString();
|
||||
const attempts = [];
|
||||
const maxAttempts = 5;
|
||||
const initialDelayMs = 250;
|
||||
const maxDelayMs = 5000;
|
||||
const maxAttempts = authLoginMaxAttempts;
|
||||
for (let attempt = 1; attempt <= maxAttempts; attempt += 1) {
|
||||
const retryDelayMs = attempt < maxAttempts ? Math.min(maxDelayMs, initialDelayMs * (2 ** (attempt - 1))) : 0;
|
||||
const retryDelayMs = authRetryDelayMs(attempt, maxAttempts);
|
||||
const retryLabel = attempt + "/" + maxAttempts;
|
||||
await writeHeartbeat({ status: terminalStatus, auth: { phase: "api-login", retryAttempt: attempt, retryMaxAttempts: maxAttempts, lastRetryLabel: retryLabel, retryDelayMs: 0, retryExhausted: false, valuesRedacted: true } }).catch(() => {});
|
||||
try {
|
||||
@@ -510,6 +512,7 @@ async function authenticate(browserContext) {
|
||||
retryMaxAttempts: maxAttempts,
|
||||
retryLabel,
|
||||
retryDelayMs: retryable && attempt < maxAttempts ? retryDelayMs : 0,
|
||||
requestTimeoutMs: authLoginRequestTimeoutMs,
|
||||
method: "api",
|
||||
status: response.status,
|
||||
statusText: response.statusText,
|
||||
@@ -547,6 +550,7 @@ async function authenticate(browserContext) {
|
||||
retryMaxAttempts: maxAttempts,
|
||||
retryLabel,
|
||||
retryDelayMs: retryable && attempt < maxAttempts ? retryDelayMs : 0,
|
||||
requestTimeoutMs: authLoginRequestTimeoutMs,
|
||||
method: "api",
|
||||
status: 0,
|
||||
statusText: "request-error",
|
||||
@@ -593,7 +597,7 @@ async function pageAuthLogin(browserContext, loginUrl, credential = { username,
|
||||
const response = await browserContext.request.post(loginUrl, {
|
||||
data: { username: credential.username, password: credential.password },
|
||||
headers: { accept: "application/json", "content-type": "application/json" },
|
||||
timeout: 12000,
|
||||
timeout: authLoginRequestTimeoutMs,
|
||||
});
|
||||
await response.text().catch(() => "");
|
||||
return {
|
||||
@@ -608,11 +612,79 @@ async function loginAccount(command) {
|
||||
const credential = credentialForAccount(accountId);
|
||||
const loginUrl = new URL("/auth/login", baseUrl).toString();
|
||||
const before = await accountSessionSnapshot();
|
||||
const response = await pageAuthLogin(context, loginUrl, credential);
|
||||
const cookieState = await readAuthCookieState(context);
|
||||
const attempts = [];
|
||||
let response = null;
|
||||
let cookieState = null;
|
||||
const maxAttempts = authLoginMaxAttempts;
|
||||
for (let attempt = 1; attempt <= maxAttempts; attempt += 1) {
|
||||
const retryDelayMs = authRetryDelayMs(attempt, maxAttempts);
|
||||
const retryLabel = attempt + "/" + maxAttempts;
|
||||
try {
|
||||
response = await pageAuthLogin(context, loginUrl, credential);
|
||||
cookieState = await readAuthCookieState(context);
|
||||
const retryable = isRetryableAuthStatus(response.status);
|
||||
attempts.push({
|
||||
attempt,
|
||||
retryAttempt: attempt,
|
||||
retryMaxAttempts: maxAttempts,
|
||||
retryLabel,
|
||||
retryDelayMs: retryable && attempt < maxAttempts ? retryDelayMs : 0,
|
||||
requestTimeoutMs: authLoginRequestTimeoutMs,
|
||||
method: "api",
|
||||
status: response.status,
|
||||
statusText: response.statusText,
|
||||
retryable,
|
||||
cookiePresent: cookieState.cookiePresent,
|
||||
cookieNames: cookieState.cookieNames,
|
||||
credentialSource: credential.source,
|
||||
valuesRedacted: true,
|
||||
});
|
||||
if (response.ok && cookieState.cookiePresent) break;
|
||||
if (!retryable) break;
|
||||
} catch (error) {
|
||||
const retryable = isRetryableAuthError(error);
|
||||
attempts.push({
|
||||
attempt,
|
||||
retryAttempt: attempt,
|
||||
retryMaxAttempts: maxAttempts,
|
||||
retryLabel,
|
||||
retryDelayMs: retryable && attempt < maxAttempts ? retryDelayMs : 0,
|
||||
requestTimeoutMs: authLoginRequestTimeoutMs,
|
||||
method: "api",
|
||||
status: 0,
|
||||
statusText: "request-error",
|
||||
retryable,
|
||||
error: error && error.message ? truncate(error.message, 500) : truncate(String(error), 500),
|
||||
cookiePresent: false,
|
||||
cookieNames: [],
|
||||
credentialSource: credential.source,
|
||||
valuesRedacted: true,
|
||||
});
|
||||
response = { ok: false, status: 0, statusText: "request-error" };
|
||||
cookieState = await readAuthCookieState(context).catch(() => ({ cookiePresent: false, cookieNames: [] }));
|
||||
if (!retryable) break;
|
||||
}
|
||||
if (attempt < maxAttempts && attempts[attempts.length - 1]?.retryable === true) await sleep(retryDelayMs);
|
||||
}
|
||||
response = response ?? { ok: false, status: 0, statusText: "api-login-failed" };
|
||||
cookieState = cookieState ?? await readAuthCookieState(context);
|
||||
if (!response.ok || !cookieState.cookiePresent) {
|
||||
const error = new Error("loginAccount failed for accountId=" + accountId + " status=" + response.status + " " + (response.statusText || ""));
|
||||
error.details = { accountId, status: response.status, statusText: response.statusText, cookiePresent: cookieState.cookiePresent, credentialSource: credential.source, valuesRedacted: true };
|
||||
const retryable = attempts.some((item) => item && item.retryable === true);
|
||||
error.details = {
|
||||
accountId,
|
||||
status: response.status,
|
||||
statusText: response.statusText,
|
||||
cookiePresent: cookieState.cookiePresent,
|
||||
credentialSource: credential.source,
|
||||
attempts,
|
||||
retryCount: Math.max(0, attempts.length - 1),
|
||||
retryMaxAttempts: maxAttempts,
|
||||
lastRetryLabel: attempts[attempts.length - 1]?.retryLabel || null,
|
||||
retryExhausted: retryable && attempts.length >= maxAttempts,
|
||||
retryable,
|
||||
valuesRedacted: true,
|
||||
};
|
||||
throw error;
|
||||
}
|
||||
const target = isWorkbenchPathname(safeUrlPath(currentPageUrl()) || "") ? safeUrlPath(currentPageUrl()) : targetPath;
|
||||
@@ -786,6 +858,10 @@ function isRetryableAuthError(error) {
|
||||
return /AbortError|EAI_AGAIN|ETIMEDOUT|ECONNRESET|ECONNREFUSED|ECONNABORTED|socket hang up|ERR_NETWORK_CHANGED|fetch failed|failed to fetch|network|timeout|aborted/iu.test(message);
|
||||
}
|
||||
|
||||
// Delay to wait after a failed auth login attempt before trying again.
// The configured initial delay doubles with every attempt and is capped at
// authLoginMaxDelayMs. The last attempt gets 0 because no retry follows it.
function authRetryDelayMs(attempt, maxAttempts) {
  const hasRetryLeft = attempt < maxAttempts;
  if (!hasRetryLeft) return 0;
  const exponentialDelayMs = authLoginInitialDelayMs * Math.pow(2, attempt - 1);
  return Math.min(authLoginMaxDelayMs, exponentialDelayMs);
}
|
||||
|
||||
function authFailureMessage(failure) {
|
||||
const last = Array.isArray(failure.attempts) && failure.attempts.length > 0 ? failure.attempts[failure.attempts.length - 1] : null;
|
||||
const retry = failure.lastRetryLabel ? " retry=" + failure.lastRetryLabel : "";
|
||||
@@ -3767,6 +3843,14 @@ function positiveInteger(value, fallback) {
|
||||
return Number.isFinite(parsed) && parsed >= 0 ? Math.floor(parsed) : fallback;
|
||||
}
|
||||
|
||||
// Parses an integer setting (typically an environment variable string) and
// returns it when it lies inside [min, max], both bounds inclusive.
//
// The fallback is returned when the value is:
//   - undefined or null (setting not provided),
//   - an empty or whitespace-only string (variable set but blank),
//   - not a finite number,
//   - outside [min, max] after flooring.
//
// Blank input is rejected explicitly because Number("") and Number(null) are
// both 0, which would otherwise be accepted for settings whose minimum is 0
// and silently override the default.
function boundedInteger(value, fallback, min, max) {
  if (value === undefined || value === null) return fallback;
  if (typeof value === "string" && value.trim() === "") return fallback;
  const parsed = Number(value);
  if (!Number.isFinite(parsed)) return fallback;
  // Fractional input is floored before the range check.
  const integer = Math.floor(parsed);
  if (integer < min || integer > max) return fallback;
  return integer;
}
|
||||
|
||||
function positiveNumber(value, fallback) {
|
||||
const parsed = Number(value);
|
||||
return Number.isFinite(parsed) && parsed > 0 ? parsed : fallback;
|
||||
|
||||
@@ -14,7 +14,7 @@ import { runCommand, type CommandResult } from "../command";
|
||||
import { startJob } from "../jobs";
|
||||
import { classifySshTcpPoolFailure } from "../ssh";
|
||||
import { HWLAB_NODE_CONTROL_PLANE_CONFIG_PATH, hwlabNodeControlPlaneInfraHelp, runHwlabNodeControlPlaneInfra } from "../hwlab-node-control-plane";
|
||||
import { hwlabRuntimeLaneConfigPath, hwlabRuntimeLaneIds, hwlabRuntimeLaneSpec, hwlabRuntimeLaneSpecForNode, hwlabRuntimeNodeIds, isHwlabRuntimeLane, type HwlabRuntimeLane, type HwlabRuntimeLaneSpec, type HwlabRuntimeObservabilityRecordingRuleSpec, type HwlabRuntimeObservabilitySpec, type HwlabRuntimeObservabilityWarningAlertSpec, type HwlabRuntimePublicExposureSpec, type HwlabRuntimeWebProbeAlertThresholdsSpec, type HwlabRuntimeWebProbeProjectManagementSpec } from "../hwlab-node-lanes";
|
||||
import { hwlabRuntimeLaneConfigPath, hwlabRuntimeLaneIds, hwlabRuntimeLaneSpec, hwlabRuntimeLaneSpecForNode, hwlabRuntimeNodeIds, isHwlabRuntimeLane, type HwlabRuntimeLane, type HwlabRuntimeLaneSpec, type HwlabRuntimeObservabilityRecordingRuleSpec, type HwlabRuntimeObservabilitySpec, type HwlabRuntimeObservabilityWarningAlertSpec, type HwlabRuntimePublicExposureSpec, type HwlabRuntimeWebProbeAlertThresholdsSpec, type HwlabRuntimeWebProbeAuthLoginSpec, type HwlabRuntimeWebProbeProjectManagementSpec } from "../hwlab-node-lanes";
|
||||
import { nodeWebProbeScriptRunnerSource } from "../hwlab-node-web-probe-runner-source";
|
||||
import { nodeWebObserveAnalyzerSource } from "../hwlab-node-web-observe-analyzer-source";
|
||||
import { nodeWebObserveRunnerSource } from "../hwlab-node-web-observe-runner-source";
|
||||
@@ -1089,6 +1089,10 @@ export function nodeWebProbeProjectManagementConfig(spec: HwlabRuntimeLaneSpec):
|
||||
return spec.webProbe?.projectManagement ?? null;
|
||||
}
|
||||
|
||||
/**
 * Returns the auth login tuning configured for a lane's web probe.
 *
 * @param spec Runtime lane spec to read from.
 * @returns `spec.webProbe.authLogin` when present, otherwise `null`
 *          (also when the lane has no `webProbe` section at all).
 */
export function nodeWebProbeAuthLoginConfig(spec: HwlabRuntimeLaneSpec): HwlabRuntimeWebProbeAuthLoginSpec | null {
  const configured = spec.webProbe?.authLogin;
  if (configured === undefined || configured === null) return null;
  return configured;
}
|
||||
|
||||
export interface NodeWebProbeHostProxyEnv {
|
||||
readonly envAssignments: string[];
|
||||
readonly summary: Record<string, unknown>;
|
||||
@@ -1252,6 +1256,7 @@ export function runNodeWebProbeObserveStart(
|
||||
const webProbeProxy = nodeWebProbeHostProxyEnv(spec, options.browserProxyMode);
|
||||
const alertThresholds = nodeWebProbeAlertThresholds(spec);
|
||||
const projectManagement = nodeWebProbeProjectManagementConfig(spec);
|
||||
const authLogin = nodeWebProbeAuthLoginConfig(spec);
|
||||
const runnerEnvAssignments = [
|
||||
...webProbeProxy.envAssignments,
|
||||
...webProbeAccountEnvAssignments(),
|
||||
@@ -1269,6 +1274,14 @@ export function runNodeWebProbeObserveStart(
|
||||
`UNIDESK_WEB_OBSERVE_BROWSER_PROXY_MODE=${shellQuote(options.browserProxyMode)}`,
|
||||
`UNIDESK_WEB_OBSERVE_ALERT_THRESHOLDS_JSON=${shellQuote(JSON.stringify(alertThresholds))}`,
|
||||
`UNIDESK_WEB_OBSERVE_PROJECT_MANAGEMENT_JSON=${shellQuote(JSON.stringify(projectManagement))}`,
|
||||
...(authLogin === null
|
||||
? []
|
||||
: [
|
||||
`UNIDESK_WEB_OBSERVE_AUTH_LOGIN_MAX_ATTEMPTS=${shellQuote(String(authLogin.maxAttempts))}`,
|
||||
`UNIDESK_WEB_OBSERVE_AUTH_LOGIN_REQUEST_TIMEOUT_MS=${shellQuote(String(authLogin.requestTimeoutMs))}`,
|
||||
`UNIDESK_WEB_OBSERVE_AUTH_LOGIN_INITIAL_DELAY_MS=${shellQuote(String(authLogin.initialDelayMs))}`,
|
||||
`UNIDESK_WEB_OBSERVE_AUTH_LOGIN_MAX_DELAY_MS=${shellQuote(String(authLogin.maxDelayMs))}`,
|
||||
]),
|
||||
].join(" ");
|
||||
const script = [
|
||||
"set -eu",
|
||||
@@ -1291,9 +1304,27 @@ export function runNodeWebProbeObserveStart(
|
||||
].join("\n");
|
||||
const result = runTransWorkspaceStdinScript(options.node, spec.workspace, script, options.commandTimeoutSeconds);
|
||||
const started = parseJsonObject(result.stdout);
|
||||
const observerId = typeof started?.jobId === "string" ? started.jobId : jobId;
|
||||
const index = result.exitCode === 0 && started?.ok === true
|
||||
? upsertWebObserveIndexEntry({
|
||||
const startOk = result.exitCode === 0 && started?.ok === true;
|
||||
const recovery = startOk
|
||||
? null
|
||||
: readNodeWebProbeObserveRemoteStatus({ ...options, id: jobId, stateDir }, spec, 1, Math.min(options.commandTimeoutSeconds, 30));
|
||||
const recoveredStatus = !startOk && recovery !== null && recovery.result.exitCode === 0 && recovery.status !== null && recovery.status.ok !== false
|
||||
? recovery.status
|
||||
: null;
|
||||
const effectiveOk = startOk || recoveredStatus !== null;
|
||||
const observer = startOk ? started : recoveredStatus;
|
||||
const observerId = typeof started?.jobId === "string"
|
||||
? started.jobId
|
||||
: webObserveIdFromStatus(recoveredStatus, { ...options, id: jobId }) ?? jobId;
|
||||
const degradedReason = !startOk && recoveredStatus !== null
|
||||
? "web-probe-start-transport-timeout-recovered"
|
||||
: result.timedOut
|
||||
? "web-probe-command-timeout"
|
||||
: result.exitCode !== 0
|
||||
? "web-probe-observe-start-failed"
|
||||
: null;
|
||||
const index = effectiveOk
|
||||
? upsertWebObserveIndexEntry(startOk ? {
|
||||
id: observerId,
|
||||
node: options.node,
|
||||
lane: options.lane,
|
||||
@@ -1305,11 +1336,11 @@ export function runNodeWebProbeObserveStart(
|
||||
pid: typeof started.pid === "number" ? started.pid : null,
|
||||
startedAt: new Date().toISOString(),
|
||||
updatedAt: new Date().toISOString(),
|
||||
})
|
||||
} : webObserveIndexEntryFromOptions({ ...options, id: observerId, stateDir }, spec, observerId, recoveredStatus ?? {}))
|
||||
: null;
|
||||
return renderWebObserveStartResult({
|
||||
ok: result.exitCode === 0 && started?.ok === true,
|
||||
status: result.exitCode === 0 && started?.ok === true ? "started" : "blocked",
|
||||
ok: effectiveOk,
|
||||
status: effectiveOk ? "started" : "blocked",
|
||||
command: `web-probe observe start --node ${options.node} --lane ${options.lane}`,
|
||||
node: options.node,
|
||||
lane: options.lane,
|
||||
@@ -1320,12 +1351,20 @@ export function runNodeWebProbeObserveStart(
|
||||
projectManagement,
|
||||
targetPath: options.targetPath,
|
||||
id: observerId,
|
||||
degradedReason,
|
||||
credential,
|
||||
observer: withWebObserveShortcuts(started, observerId),
|
||||
observer: withWebObserveShortcuts(observer, observerId),
|
||||
wrapper: buildWebObserveWrapperForObserveOptions("start", options, spec.workspace, { id: observerId, jobId: observerId, stateDir }),
|
||||
index,
|
||||
next: webObserveNextCommands(observerId),
|
||||
result: compactCommandResultRedacted(result, [material.password ?? ""]),
|
||||
recovery: recovery === null ? null : {
|
||||
attempted: true,
|
||||
ok: recoveredStatus !== null,
|
||||
reason: degradedReason,
|
||||
result: compactCommandResultWithStdoutTail(recovery.result),
|
||||
valuesRedacted: true,
|
||||
},
|
||||
valuesRedacted: true,
|
||||
});
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user