diff --git a/scripts/web-probe-sentinel-scheduler.ts b/scripts/web-probe-sentinel-scheduler.ts
index 5e8faea1..7bc38aed 100644
--- a/scripts/web-probe-sentinel-scheduler.ts
+++ b/scripts/web-probe-sentinel-scheduler.ts
@@ -188,9 +188,11 @@ async function triggerSentinel(options: SchedulerOptions, schedule: SentinelSche
     "--timeout-seconds",
     String(schedule.timeoutSeconds),
   ];
+  const hardTimeoutMs = schedulerHardTimeoutMs(schedule);
   const result = await runCommandObserved(command, repoRoot, {
-    timeoutMs: Math.max(60, schedule.timeoutSeconds + 90) * 1000,
+    timeoutMs: hardTimeoutMs,
     heartbeatMs: 30_000,
+    killAfterMs: 3_000,
     maxCaptureChars: 24_000,
     env: { ...process.env, NO_COLOR: "1" },
   });
@@ -215,7 +217,7 @@ async function triggerSentinel(options: SchedulerOptions, schedule: SentinelSche
     latestRunIdAfter: after.latestRunId,
     status,
     stdoutTail: tail(result.stdout, 900),
-    stderrTail: tail(result.stderr, 900),
+    stderrTail: tail(result.timedOut ? `${result.stderr}\nscheduler hard timeout after ${Math.round(hardTimeoutMs / 1000)}s` : result.stderr, 900),
   };
 }
 
@@ -258,6 +260,8 @@ function installSystemd(options: SchedulerOptions): void {
   const servicePath = `/etc/systemd/system/${unit}.service`;
   const timerPath = `/etc/systemd/system/${unit}.timer`;
   const sentinelArg = options.sentinelId === null ? "" : ` --sentinel ${options.sentinelId}`;
+  const timeoutArg = options.timeoutSeconds === null ? "" : ` --timeout-seconds ${options.timeoutSeconds}`;
+  const serviceTimeoutSeconds = systemdServiceTimeoutSeconds(options);
   const service = `[Unit]
 Description=UniDesk web-probe sentinel host cadence scheduler for ${options.node}/${options.lane}
 Wants=network-online.target
@@ -265,12 +269,15 @@ After=network-online.target
 
 [Service]
 Type=oneshot
+TimeoutStartSec=${serviceTimeoutSeconds}s
+TimeoutStopSec=15s
+KillMode=control-group
 Environment=HOME=/root
 Environment=PATH=${SYSTEMD_PATH}
 Environment=NO_PROXY=${SYSTEMD_NO_PROXY}
 Environment=no_proxy=${SYSTEMD_NO_PROXY}
 WorkingDirectory=${repoRoot}
-ExecStart=${BUN_EXECUTABLE} ${join(repoRoot, "scripts", "web-probe-sentinel-scheduler.ts")} run --node ${options.node} --lane ${options.lane}${sentinelArg} --stale-multiplier ${options.staleMultiplier}
+ExecStart=${BUN_EXECUTABLE} ${join(repoRoot, "scripts", "web-probe-sentinel-scheduler.ts")} run --node ${options.node} --lane ${options.lane}${sentinelArg} --stale-multiplier ${options.staleMultiplier}${timeoutArg}
 `;
   const timer = `[Unit]
 Description=Run UniDesk web-probe sentinel host cadence scheduler for ${options.node}/${options.lane}
@@ -301,6 +308,33 @@ WantedBy=timers.target
   if (results.some((result) => result.exitCode !== 0)) process.exitCode = 2;
 }
 
+/**
+ * Hard wall-clock limit, in milliseconds, for one sentinel child command.
+ * The schedule's own timeout is used, floored at 60 seconds.
+ * NOTE(review): the expression this replaces added a 90 second margin on top of
+ * the timeout handed to the child via --timeout-seconds; confirm that dropping it is intended.
+ */
+function schedulerHardTimeoutMs(schedule: SentinelSchedule): number {
+  return Math.max(60, schedule.timeoutSeconds) * 1000;
+}
+
+/**
+ * Seconds to write into TimeoutStartSec of the generated service unit: the
+ * largest schedule timeout (never below 60) plus a margin of at least 30
+ * seconds derived from the fetch timeout.
+ *
+ * The 60 second floor is passed to Math.max directly, so an empty schedule
+ * list yields 60 rather than -Infinity (which would render as
+ * "TimeoutStartSec=-Infinitys").
+ * NOTE(review): this takes the maximum, not the sum, of the schedule timeouts;
+ * confirm one service run never executes several sentinels back to back.
+ */
+function systemdServiceTimeoutSeconds(options: SchedulerOptions): number {
+  const schedules = sentinelSchedules(specFor(options), options);
+  const maxTimeout = Math.max(60, ...schedules.map((schedule) => schedule.timeoutSeconds));
+  return maxTimeout + Math.max(30, Math.ceil(options.fetchTimeoutMs / 1000) + 15);
+}
+
 function statusSystemd(options: SchedulerOptions): void {
   const unit = systemdUnitName(options);
   const results = [