fix(heartbeat): prevent false process_lost failures on queued and non-child-process runs

- reapOrphanedRuns() now only scans running runs; queued runs are
  legitimately absent from runningProcesses (waiting on concurrency
  limits or issue locks) so including them caused false process_lost
  failures (closes #90)
- Add module-level activeRunExecutions set so non-child-process adapters
  (http, openclaw) are protected from the reaper during execution
- Add resumeQueuedRuns() to restart persisted queued runs after a server
  restart, called at startup and each periodic tick
- Add outer catch in executeRun() so setup failures (ensureRuntimeState,
  resolveWorkspaceForRun, etc.) are recorded as failed runs instead of
  leaving them stuck in running state
- Guard resumeQueuedRuns() against paused/terminated/pending_approval agents
- Increase opencode models discovery timeout from 20s to 45s
This commit is contained in:
Dotta
2026-03-07 12:37:15 -05:00
committed by Michael Averto
parent d14e656ec1
commit f81d37fbf7
4 changed files with 52 additions and 14 deletions

View File

@@ -7,6 +7,7 @@ import {
} from "@paperclipai/adapter-utils/server-utils";
const MODELS_CACHE_TTL_MS = 60_000;
const MODELS_DISCOVERY_TIMEOUT_MS = 20_000;
function resolveOpenCodeCommand(input: unknown): string {
const envOverride =
@@ -115,14 +116,14 @@ export async function discoverOpenCodeModels(input: {
{
cwd,
env: runtimeEnv,
timeoutSec: 20,
timeoutSec: MODELS_DISCOVERY_TIMEOUT_MS / 1000,
graceSec: 3,
onLog: async () => {},
},
);
if (result.timedOut) {
throw new Error("`opencode models` timed out.");
throw new Error(`\`opencode models\` timed out after ${MODELS_DISCOVERY_TIMEOUT_MS / 1000}s.`);
}
if ((result.exitCode ?? 1) !== 0) {
const detail = firstNonEmptyLine(result.stderr) || firstNonEmptyLine(result.stdout);