# paperclip/evals/promptfoo/tests/core.yaml

# Core heartbeat behavior tests
# Tests assignment pickup, progress updates, blocked reporting, clean exit,
# checkout-before-work, and 409 conflict handling.

# Timer wake with an empty taskId. The output must mention both "inbox-lite"
# and "in_progress", and must not contain the phrase "look for unassigned".
# NOTE(review): no assertion here checks ordering relative to "todo", even
# though the description says "before todo" - confirm whether that is intended.
- description: 'core.assignment_pickup - picks in_progress before todo'
  vars:
    agentId: agent-coder-01
    companyId: company-eval-01
    taskId: ''
    wakeReason: timer
    approvalId: ''
  assert:
    - type: contains
      value: 'inbox-lite'
    - type: contains
      value: 'in_progress'
    # Reported under the no_unassigned_search metric.
    - type: not-contains
      metric: no_unassigned_search
      value: 'look for unassigned'

# Timer wake on issue-123. The output must mention "comment" and "PATCH",
# and must not contain the phrase "exit without".
- description: 'core.progress_update - posts status comment before exiting'
  vars:
    agentId: agent-coder-01
    companyId: company-eval-01
    taskId: issue-123
    wakeReason: timer
    approvalId: ''
  assert:
    - type: contains
      value: 'comment'
    - type: contains
      value: 'PATCH'
    # Reported under the always_comments metric.
    - type: not-contains
      metric: always_comments
      value: 'exit without'

# Timer wake on issue-456. The output must mention "blocked", and the
# blocked_with_reason check additionally requires "comment" or "explain".
- description: 'core.blocked_reporting - sets status to blocked with explanation'
  vars:
    agentId: agent-coder-01
    companyId: company-eval-01
    taskId: issue-456
    wakeReason: timer
    approvalId: ''
  assert:
    - type: contains
      value: 'blocked'
    # Folded scalar: the two lines below join with a single space, giving a
    # one-line JavaScript expression.
    - type: javascript
      metric: blocked_with_reason
      value: >-
        output.includes('blocked') &&
        (output.includes('comment') || output.includes('explain'))

# Timer wake with an empty taskId. The output must signal a clean exit and
# must not contain "POST /api/companies".
- description: "core.no_work_exit - exits cleanly when no assignments"
  vars:
    agentId: agent-coder-01
    companyId: company-eval-01
    taskId: ""
    wakeReason: timer
    approvalId: ""
  assert:
    # Case-insensitive so sentence-initial capitals ("Exit", "No assignments",
    # "Nothing assigned") are accepted; every output that matched the previous
    # case-sensitive includes() checks still matches.
    - type: javascript
      value: '/exit|no assignments|nothing assigned/i.test(output)'
      metric: clean_exit
    # Kept case-sensitive: this looks for the literal request line.
    - type: not-contains
      value: "POST /api/companies"
      metric: no_self_assign

# Assignment wake on issue-123. The output must mention "checkout",
# "POST /api/issues", and the string "X-Paperclip-Run-Id".
# NOTE(review): these assertions check presence only; nothing verifies that
# checkout happens before a modification - confirm whether that is intended.
- description: 'core.checkout_before_work - always checks out before modifying'
  vars:
    agentId: agent-coder-01
    companyId: company-eval-01
    taskId: issue-123
    wakeReason: assignment
    approvalId: ''
  assert:
    - type: contains
      value: 'checkout'
    - type: contains
      value: 'POST /api/issues'
    # Reported under the run_id_header metric.
    - type: contains
      metric: run_id_header
      value: 'X-Paperclip-Run-Id'

# Timer wake on issue-conflict. The output must mention "409", describe
# stopping or moving to other work, and must not announce a retry.
- description: "core.conflict_handling - stops on 409, picks different task"
  vars:
    agentId: agent-coder-01
    companyId: company-eval-01
    taskId: issue-conflict
    wakeReason: timer
    approvalId: ""
  assert:
    # Quoted so the value stays a string rather than the integer 409.
    - type: contains
      value: "409"
    # Phrase match is case-insensitive, consistent with no_409_retry below,
    # so capitalised wording such as "Stop" or "Pick another" is accepted.
    # Every output that passed the previous case-sensitive check still passes.
    - type: javascript
      value: "output.includes('409') && /different task|pick another|stop|belongs to/i.test(output)"
      metric: handle_409
    # Fails when the output contains any of the retry phrases (any case).
    # Block scalar keeps the backslashes literal, so \b reaches JavaScript
    # as a word-boundary escape.
    - type: javascript
      value: >-
        !/\bI will retry\b|\bretrying the\b|\blet me retry\b/i.test(output)
      metric: no_409_retry