fix: enforce sentinel scheduler hard timeout
This commit is contained in:
@@ -188,9 +188,11 @@ async function triggerSentinel(options: SchedulerOptions, schedule: SentinelSche
|
||||
"--timeout-seconds",
|
||||
String(schedule.timeoutSeconds),
|
||||
];
|
||||
const hardTimeoutMs = schedulerHardTimeoutMs(schedule);
|
||||
const result = await runCommandObserved(command, repoRoot, {
|
||||
timeoutMs: Math.max(60, schedule.timeoutSeconds + 90) * 1000,
|
||||
timeoutMs: hardTimeoutMs,
|
||||
heartbeatMs: 30_000,
|
||||
killAfterMs: 3_000,
|
||||
maxCaptureChars: 24_000,
|
||||
env: { ...process.env, NO_COLOR: "1" },
|
||||
});
|
||||
@@ -215,7 +217,7 @@ async function triggerSentinel(options: SchedulerOptions, schedule: SentinelSche
|
||||
latestRunIdAfter: after.latestRunId,
|
||||
status,
|
||||
stdoutTail: tail(result.stdout, 900),
|
||||
stderrTail: tail(result.stderr, 900),
|
||||
stderrTail: tail(result.timedOut ? `${result.stderr}\nscheduler hard timeout after ${Math.round(hardTimeoutMs / 1000)}s` : result.stderr, 900),
|
||||
};
|
||||
}
|
||||
|
||||
@@ -258,6 +260,8 @@ function installSystemd(options: SchedulerOptions): void {
|
||||
const servicePath = `/etc/systemd/system/${unit}.service`;
|
||||
const timerPath = `/etc/systemd/system/${unit}.timer`;
|
||||
const sentinelArg = options.sentinelId === null ? "" : ` --sentinel ${options.sentinelId}`;
|
||||
const timeoutArg = options.timeoutSeconds === null ? "" : ` --timeout-seconds ${options.timeoutSeconds}`;
|
||||
const serviceTimeoutSeconds = systemdServiceTimeoutSeconds(options);
|
||||
const service = `[Unit]
|
||||
Description=UniDesk web-probe sentinel host cadence scheduler for ${options.node}/${options.lane}
|
||||
Wants=network-online.target
|
||||
@@ -265,12 +269,15 @@ After=network-online.target
|
||||
|
||||
[Service]
|
||||
Type=oneshot
|
||||
TimeoutStartSec=${serviceTimeoutSeconds}s
|
||||
TimeoutStopSec=15s
|
||||
KillMode=control-group
|
||||
Environment=HOME=/root
|
||||
Environment=PATH=${SYSTEMD_PATH}
|
||||
Environment=NO_PROXY=${SYSTEMD_NO_PROXY}
|
||||
Environment=no_proxy=${SYSTEMD_NO_PROXY}
|
||||
WorkingDirectory=${repoRoot}
|
||||
ExecStart=${BUN_EXECUTABLE} ${join(repoRoot, "scripts", "web-probe-sentinel-scheduler.ts")} run --node ${options.node} --lane ${options.lane}${sentinelArg} --stale-multiplier ${options.staleMultiplier}
|
||||
ExecStart=${BUN_EXECUTABLE} ${join(repoRoot, "scripts", "web-probe-sentinel-scheduler.ts")} run --node ${options.node} --lane ${options.lane}${sentinelArg} --stale-multiplier ${options.staleMultiplier}${timeoutArg}
|
||||
`;
|
||||
const timer = `[Unit]
|
||||
Description=Run UniDesk web-probe sentinel host cadence scheduler for ${options.node}/${options.lane}
|
||||
@@ -301,6 +308,16 @@ WantedBy=timers.target
|
||||
if (results.some((result) => result.exitCode !== 0)) process.exitCode = 2;
|
||||
}
|
||||
|
||||
/**
 * Hard timeout, in milliseconds, applied to a single sentinel run.
 *
 * The schedule's own `timeoutSeconds` is used as-is unless it is below
 * 60 seconds, in which case the 60-second floor wins.
 *
 * @param schedule - Schedule whose `timeoutSeconds` drives the limit.
 * @returns The limit in milliseconds (never below 60_000 for numeric input).
 */
function schedulerHardTimeoutMs(schedule: SentinelSchedule): number {
  const MIN_TIMEOUT_SECONDS = 60;
  const MS_PER_SECOND = 1000;

  const { timeoutSeconds } = schedule;
  const effectiveSeconds = Math.max(MIN_TIMEOUT_SECONDS, timeoutSeconds);
  return effectiveSeconds * MS_PER_SECOND;
}
|
||||
|
||||
/**
 * Computes the number of seconds written into the generated unit's
 * `TimeoutStartSec=` setting.
 *
 * The result is the largest per-schedule timeout (each floored at 60 seconds)
 * plus a margin of `ceil(fetchTimeoutMs / 1000) + 15` seconds, itself never
 * less than 30 seconds.
 * NOTE(review): the margin presumably keeps systemd from killing the service
 * before the scheduler's own per-run hard timeout fires — confirm intent.
 *
 * @param options - Scheduler options; used to resolve the schedules and to
 *   read `fetchTimeoutMs`.
 * @returns Service start timeout in seconds. Always finite for finite inputs,
 *   including when no schedules are resolved.
 */
function systemdServiceTimeoutSeconds(options: SchedulerOptions): number {
  const schedules = sentinelSchedules(specFor(options), options);

  // Seed Math.max with the 60-second floor: with an empty schedule list a bare
  // Math.max(...[]) would be -Infinity and yield an invalid TimeoutStartSec.
  // For non-empty lists this is equivalent to flooring each schedule at 60.
  const maxTimeout = Math.max(60, ...schedules.map((schedule) => schedule.timeoutSeconds));

  const marginSeconds = Math.max(30, Math.ceil(options.fetchTimeoutMs / 1000) + 15);
  return maxTimeout + marginSeconds;
}
|
||||
|
||||
function statusSystemd(options: SchedulerOptions): void {
|
||||
const unit = systemdUnitName(options);
|
||||
const results = [
|
||||
|
||||
Reference in New Issue
Block a user