fix(heartbeat): prevent false process_lost failures on queued and non-child-process runs
- reapOrphanedRuns() now only scans running runs; queued runs are legitimately absent from runningProcesses (waiting on concurrency limits or issue locks) so including them caused false process_lost failures (closes #90)
- Add module-level activeRunExecutions set so non-child-process adapters (http, openclaw) are protected from the reaper during execution
- Add resumeQueuedRuns() to restart persisted queued runs after a server restart, called at startup and each periodic tick
- Add outer catch in executeRun() so setup failures (ensureRuntimeState, resolveWorkspaceForRun, etc.) are recorded as failed runs instead of leaving them stuck in running state
- Guard resumeQueuedRuns() against paused/terminated/pending_approval agents
- Increase opencode models discovery timeout from 20s to 45s
This commit is contained in:
@@ -7,6 +7,7 @@ import {
|
||||
} from "@paperclipai/adapter-utils/server-utils";
|
||||
|
||||
const MODELS_CACHE_TTL_MS = 60_000;
|
||||
+ const MODELS_DISCOVERY_TIMEOUT_MS = 20_000;
|
||||
|
||||
function resolveOpenCodeCommand(input: unknown): string {
|
||||
const envOverride =
|
||||
@@ -115,14 +116,14 @@ export async function discoverOpenCodeModels(input: {
|
||||
{
|
||||
cwd,
|
||||
env: runtimeEnv,
|
||||
- timeoutSec: 20,
|
||||
+ timeoutSec: MODELS_DISCOVERY_TIMEOUT_MS / 1000,
|
||||
graceSec: 3,
|
||||
onLog: async () => {},
|
||||
},
|
||||
);
|
||||
|
||||
if (result.timedOut) {
|
||||
- throw new Error("`opencode models` timed out.");
|
||||
+ throw new Error(`\`opencode models\` timed out after ${MODELS_DISCOVERY_TIMEOUT_MS / 1000}s.`);
|
||||
}
|
||||
if ((result.exitCode ?? 1) !== 0) {
|
||||
const detail = firstNonEmptyLine(result.stderr) || firstNonEmptyLine(result.stdout);
|
||||
|
||||
Reference in New Issue
Block a user