fix(heartbeat): prevent false process_lost failures on queued and non-child-process runs

- reapOrphanedRuns() now only scans running runs; queued runs are legitimately absent from runningProcesses (waiting on concurrency limits or issue locks) so including them caused false process_lost failures (closes #90) - Add module-level activeRunExecutions set so non-child-process adapters (http, openclaw) are protected from the reaper during execution - Add resumeQueuedRuns() to restart persisted queued runs after a server restart, called at startup and each periodic tick - Add outer catch in executeRun() so setup failures (ensureRuntimeState, resolveWorkspaceForRun, etc.) are recorded as failed runs instead of leaving them stuck in running state - Guard resumeQueuedRuns() against paused/terminated/pending_approval agents - Increase opencode models discovery timeout from 20s to 45s
2026-03-07 12:37:15 -05:00
parent d14e656ec1
commit f81d37fbf7
4 changed files with 52 additions and 14 deletions
--- a/packages/adapters/opencode-local/src/server/models.ts
+++ b/packages/adapters/opencode-local/src/server/models.ts
@@ -7,6 +7,7 @@ import {
 } from "@paperclipai/adapter-utils/server-utils";

 const MODELS_CACHE_TTL_MS = 60_000;
+const MODELS_DISCOVERY_TIMEOUT_MS = 20_000;

 function resolveOpenCodeCommand(input: unknown): string {
  const envOverride =
@@ -115,14 +116,14 @@ export async function discoverOpenCodeModels(input: {
    {
      cwd,
      env: runtimeEnv,
-      timeoutSec: 20,
+      timeoutSec: MODELS_DISCOVERY_TIMEOUT_MS / 1000,
      graceSec: 3,
      onLog: async () => {},
    },
  );

  if (result.timedOut) {
-    throw new Error("`opencode models` timed out.");
+    throw new Error(`\`opencode models\` timed out after ${MODELS_DISCOVERY_TIMEOUT_MS / 1000}s.`);
  }
  if ((result.exitCode ?? 1) !== 0) {
    const detail = firstNonEmptyLine(result.stderr) || firstNonEmptyLine(result.stdout);
--- a/server/package.json
+++ b/server/package.json
@@ -38,9 +38,9 @@
    "@paperclipai/adapter-codex-local": "workspace:*",
    "@paperclipai/adapter-cursor-local": "workspace:*",
    "@paperclipai/adapter-gemini-local": "workspace:*",
+    "@paperclipai/adapter-openclaw-gateway": "workspace:*",
    "@paperclipai/adapter-opencode-local": "workspace:*",
    "@paperclipai/adapter-pi-local": "workspace:*",
-    "@paperclipai/adapter-openclaw-gateway": "workspace:*",
    "@paperclipai/adapter-utils": "workspace:*",
    "@paperclipai/db": "workspace:*",
    "@paperclipai/shared": "workspace:*",
--- a/server/src/index.ts
+++ b/server/src/index.ts
@@ -513,11 +513,14 @@ export async function startServer(): Promise<StartedServer> {
  if (config.heartbeatSchedulerEnabled) {
    const heartbeat = heartbeatService(db as any);
  
-    // Reap orphaned runs at startup (no threshold -- runningProcesses is empty)
-    void heartbeat.reapOrphanedRuns().catch((err) => {
-      logger.error({ err }, "startup reap of orphaned heartbeat runs failed");
-    });
-
+    // Reap orphaned running runs at startup while in-memory execution state is empty,
+    // then resume any persisted queued runs that were waiting on the previous process.
+    void heartbeat
+      .reapOrphanedRuns()
+      .then(() => heartbeat.resumeQueuedRuns())
+      .catch((err) => {
+        logger.error({ err }, "startup heartbeat recovery failed");
+      });
    setInterval(() => {
      void heartbeat
        .tickTimers(new Date())
@@ -530,11 +533,13 @@ export async function startServer(): Promise<StartedServer> {
          logger.error({ err }, "heartbeat timer tick failed");
        });
  
-      // Periodically reap orphaned runs (5-min staleness threshold)
+      // Periodically reap orphaned runs (5-min staleness threshold) and make sure
+      // persisted queued work is still being driven forward.
      void heartbeat
        .reapOrphanedRuns({ staleThresholdMs: 5 * 60 * 1000 })
+        .then(() => heartbeat.resumeQueuedRuns())
        .catch((err) => {
-          logger.error({ err }, "periodic reap of orphaned heartbeat runs failed");
+          logger.error({ err }, "periodic heartbeat recovery failed");
        });
    }, config.heartbeatSchedulerIntervalMs);
  }
--- a/server/src/services/heartbeat.ts
+++ b/server/src/services/heartbeat.ts
@@ -1089,6 +1089,9 @@ export function heartbeatService(db: Db) {
      run = claimed;
    }

+    activeRunExecutions.add(run.id);
+
+    try {
    const agent = await getAgent(run.agentId);
    if (!agent) {
      await setRunStatus(runId, "failed", {
@@ -1676,10 +1679,39 @@ export function heartbeatService(db: Db) {
      }

      await finalizeAgentStatus(agent.id, "failed");
-    } finally {
-      await releaseRuntimeServicesForRun(run.id);
-      await startNextQueuedRunForAgent(agent.id);
-    }
+        } catch (outerErr) {
+          // Setup code before adapter.execute threw (e.g. ensureRuntimeState, resolveWorkspaceForRun).
+          // The inner catch did not fire, so we must record the failure here.
+          const message = outerErr instanceof Error ? outerErr.message : "Unknown setup failure";
+          logger.error({ err: outerErr, runId }, "heartbeat execution setup failed");
+          await setRunStatus(runId, "failed", {
+            error: message,
+            errorCode: "adapter_failed",
+            finishedAt: new Date(),
+          }).catch(() => undefined);
+          await setWakeupStatus(run.wakeupRequestId, "failed", {
+            finishedAt: new Date(),
+            error: message,
+          }).catch(() => undefined);
+          const failedRun = await getRun(runId).catch(() => null);
+          if (failedRun) {
+            // Emit a run-log event so the failure is visible in the run timeline,
+            // consistent with what the inner catch block does for adapter failures.
+            await appendRunEvent(failedRun, 1, {
+              eventType: "error",
+              stream: "system",
+              level: "error",
+              message,
+            }).catch(() => undefined);
+            await releaseIssueExecutionAndPromote(failedRun).catch(() => undefined);
+          }
+          // Ensure the agent is not left stuck in "running" if the inner catch handler's
+          // DB calls threw (e.g. a transient DB error in finalizeAgentStatus).
+          await finalizeAgentStatus(run.agentId, "failed").catch(() => undefined);
+        } finally {
+          await releaseRuntimeServicesForRun(run.id).catch(() => undefined);
+      activeRunExecutions.delete(run.id);
+          await startNextQueuedRunForAgent(run.agentId);
  }

  async function releaseIssueExecutionAndPromote(run: typeof heartbeatRuns.$inferSelect) {