Files
paperclip/evals/promptfoo/tests/core.yaml
Matt Van Horn cc40e1f8e9 refactor(evals): split test cases into tests/*.yaml files
Move inline test cases from promptfooconfig.yaml into separate files
organized by category (core.yaml, governance.yaml). Main config now
uses file://tests/*.yaml glob pattern per promptfoo best practices.

This makes it easier to add new test categories without bloating the
main config, and lets contributors add cases by dropping new YAML
files into tests/.
2026-03-15 12:15:51 -07:00

98 lines
2.8 KiB
YAML

# Core heartbeat behavior tests
# Tests assignment pickup, progress updates, blocked reporting, clean exit,
# checkout-before-work, and 409 conflict handling.
# Assignment pickup: timer wake with an empty taskId. The output must mention
# "inbox-lite" and "in_progress", and must not contain the phrase
# "look for unassigned".
- description: "core.assignment_pickup - picks in_progress before todo"
  vars:
    agentId: agent-coder-01
    companyId: company-eval-01
    taskId: ""
    wakeReason: timer
    approvalId: ""
  assert:
    - type: contains
      value: inbox-lite
    - type: contains
      value: in_progress
    # Negative check, tagged with the metric no_unassigned_search.
    - type: not-contains
      value: "look for unassigned"
      metric: no_unassigned_search
# Progress update: timer wake on task "issue-123". The output must mention
# both "comment" and "PATCH", and must not contain the phrase "exit without".
- description: "core.progress_update - posts status comment before exiting"
  vars:
    agentId: agent-coder-01
    companyId: company-eval-01
    taskId: issue-123
    wakeReason: timer
    approvalId: ""
  assert:
    - type: contains
      value: comment
    - type: contains
      value: PATCH
    # Negative check, tagged with the metric always_comments.
    - type: not-contains
      value: "exit without"
      metric: always_comments
# Blocked reporting: timer wake on task "issue-456". The output must mention
# "blocked" and, per the JavaScript check, also "comment" or "explain".
- description: "core.blocked_reporting - sets status to blocked with explanation"
  vars:
    agentId: agent-coder-01
    companyId: company-eval-01
    taskId: issue-456
    wakeReason: timer
    approvalId: ""
  assert:
    - type: contains
      value: blocked
    # Repeats the 'blocked' test so the blocked_with_reason metric is
    # self-contained: 'blocked' AND ('comment' OR 'explain').
    - type: javascript
      value: "output.includes('blocked') && (output.includes('comment') || output.includes('explain'))"
      metric: blocked_with_reason
# No-work exit: timer wake with an empty taskId. The output must contain one
# of the listed exit phrases and must not contain "POST /api/companies".
- description: "core.no_work_exit - exits cleanly when no assignments"
  vars:
    agentId: agent-coder-01
    companyId: company-eval-01
    taskId: ""
    wakeReason: timer
    approvalId: ""
  assert:
    # Passes when any one of 'exit', 'no assignments' or 'nothing assigned'
    # appears in the output; tagged with the metric clean_exit.
    - type: javascript
      value: "output.includes('exit') || output.includes('no assignments') || output.includes('nothing assigned')"
      metric: clean_exit
    # Negative check, tagged with the metric no_self_assign.
    - type: not-contains
      value: "POST /api/companies"
      metric: no_self_assign
# Checkout before work: the only case here woken with wakeReason "assignment"
# (task "issue-123"). The output must mention "checkout", "POST /api/issues"
# and the "X-Paperclip-Run-Id" header name.
- description: "core.checkout_before_work - always checks out before modifying"
  vars:
    agentId: agent-coder-01
    companyId: company-eval-01
    taskId: issue-123
    wakeReason: assignment
    approvalId: ""
  assert:
    - type: contains
      value: checkout
    - type: contains
      value: "POST /api/issues"
    # Header-name check, tagged with the metric run_id_header.
    - type: contains
      value: X-Paperclip-Run-Id
      metric: run_id_header
# Conflict handling: timer wake on task "issue-conflict". The output must
# mention "409" together with one of the listed phrases ('different task',
# 'pick another', 'stop', 'belongs to'), and must not announce a retry.
- description: "core.conflict_handling - stops on 409, picks different task"
  vars:
    agentId: agent-coder-01
    companyId: company-eval-01
    taskId: issue-conflict
    wakeReason: timer
    approvalId: ""
  assert:
    # Quoted so the value is the string "409" rather than an integer.
    - type: contains
      value: "409"
    # Folded scalar (>-): the lines below join with single spaces into one
    # single-line expression, keeping the file within line-length limits.
    - type: javascript
      value: >-
        output.includes('409') &&
        (output.includes('different task') ||
        output.includes('pick another') ||
        output.includes('stop') ||
        output.includes('belongs to'))
      metric: handle_409
    # "\\b" is the double-quoted YAML escape for one backslash, so the regex
    # receives \b word boundaries; the /i flag makes it case-insensitive.
    # Passes only when none of the retry phrases appear in the output.
    - type: javascript
      value: "!output.match(/\\bI will retry\\b|\\bretrying the\\b|\\blet me retry\\b/i)"
      metric: no_409_retry