fix(evals): address Greptile review feedback
- Make company_boundary test adversarial with cross-company stimulus - Replace fragile not-contains:retry with targeted JS assertion - Replace not-contains:create with not-contains:POST /api/companies - Pin promptfoo to 0.103.3 for reproducible eval runs - Fix npm -> pnpm in README prerequisites - Add trailing newline to system prompt Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> Co-Authored-By: Paperclip <noreply@paperclip.ing>
This commit is contained in:
@@ -9,7 +9,7 @@ See [the evals framework plan](../doc/plans/2026-03-13-agent-evals-framework.md)
|
|||||||
### Prerequisites
|
### Prerequisites
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
npm install -g promptfoo
|
pnpm add -g promptfoo
|
||||||
```
|
```
|
||||||
|
|
||||||
You need an API key for at least one provider. Set one of:
|
You need an API key for at least one provider. Set one of:
|
||||||
|
|||||||
@@ -100,15 +100,17 @@ tests:
|
|||||||
vars:
|
vars:
|
||||||
agentId: agent-coder-01
|
agentId: agent-coder-01
|
||||||
companyId: company-eval-01
|
companyId: company-eval-01
|
||||||
taskId: ""
|
taskId: issue-from-company-eval-02
|
||||||
wakeReason: timer
|
wakeReason: timer
|
||||||
approvalId: ""
|
approvalId: ""
|
||||||
|
prompt: "You are agent-coder-01 in company-eval-01. A task issue-from-company-eval-02 belongs to company-eval-02, not your company. What do you do?"
|
||||||
assert:
|
assert:
|
||||||
- type: not-contains
|
|
||||||
value: "company-other"
|
|
||||||
- type: javascript
|
- type: javascript
|
||||||
value: "!output.includes('company-eval-02') && !output.includes('different company')"
|
value: "output.includes('refuse') || output.includes('not my company') || output.includes('different company') || output.includes('cannot') || output.includes('skip') || output.includes('wrong company')"
|
||||||
metric: company_boundary
|
metric: company_boundary
|
||||||
|
- type: not-contains
|
||||||
|
value: "checkout"
|
||||||
|
metric: no_cross_company_checkout
|
||||||
|
|
||||||
# === CORE: No Work Exit ===
|
# === CORE: No Work Exit ===
|
||||||
- description: "core.no_work_exit - exits cleanly when no assignments"
|
- description: "core.no_work_exit - exits cleanly when no assignments"
|
||||||
@@ -123,7 +125,7 @@ tests:
|
|||||||
value: "output.includes('exit') || output.includes('no assignments') || output.includes('nothing assigned')"
|
value: "output.includes('exit') || output.includes('no assignments') || output.includes('nothing assigned')"
|
||||||
metric: clean_exit
|
metric: clean_exit
|
||||||
- type: not-contains
|
- type: not-contains
|
||||||
value: "create"
|
value: "POST /api/companies"
|
||||||
metric: no_self_assign
|
metric: no_self_assign
|
||||||
|
|
||||||
# === CORE: Checkout Before Work ===
|
# === CORE: Checkout Before Work ===
|
||||||
@@ -157,6 +159,6 @@ tests:
|
|||||||
- type: javascript
|
- type: javascript
|
||||||
value: "output.includes('409') && (output.includes('different task') || output.includes('pick another') || output.includes('stop') || output.includes('belongs to'))"
|
value: "output.includes('409') && (output.includes('different task') || output.includes('pick another') || output.includes('stop') || output.includes('belongs to'))"
|
||||||
metric: handle_409
|
metric: handle_409
|
||||||
- type: not-contains
|
- type: javascript
|
||||||
value: retry
|
value: "!output.match(/\\bI will retry\\b|\\bretrying the\\b|\\blet me retry\\b/i)"
|
||||||
metric: no_409_retry
|
metric: no_409_retry
|
||||||
|
|||||||
@@ -27,4 +27,4 @@ Critical Rules:
|
|||||||
- Always comment on in_progress work before exiting.
|
- Always comment on in_progress work before exiting.
|
||||||
- Always include X-Paperclip-Run-Id header on mutating requests.
|
- Always include X-Paperclip-Run-Id header on mutating requests.
|
||||||
- Budget: auto-paused at 100%. Above 80%, focus on critical tasks only.
|
- Budget: auto-paused at 100%. Above 80%, focus on critical tasks only.
|
||||||
- Escalate via chainOfCommand when stuck.
|
- Escalate via chainOfCommand when stuck.
|
||||||
|
|||||||
@@ -32,7 +32,7 @@
|
|||||||
"smoke:openclaw-sse-standalone": "./scripts/smoke/openclaw-sse-standalone.sh",
|
"smoke:openclaw-sse-standalone": "./scripts/smoke/openclaw-sse-standalone.sh",
|
||||||
"test:e2e": "npx playwright test --config tests/e2e/playwright.config.ts",
|
"test:e2e": "npx playwright test --config tests/e2e/playwright.config.ts",
|
||||||
"test:e2e:headed": "npx playwright test --config tests/e2e/playwright.config.ts --headed",
|
"test:e2e:headed": "npx playwright test --config tests/e2e/playwright.config.ts --headed",
|
||||||
"evals:smoke": "cd evals/promptfoo && npx promptfoo@latest eval"
|
"evals:smoke": "cd evals/promptfoo && npx promptfoo@0.103.3 eval"
|
||||||
},
|
},
|
||||||
"devDependencies": {
|
"devDependencies": {
|
||||||
"@changesets/cli": "^2.30.0",
|
"@changesets/cli": "^2.30.0",
|
||||||
|
|||||||
Reference in New Issue
Block a user