fix(openclaw-gateway): add diagnostics capture and two-lane validation to e2e

Capture run events, run logs, issue state, and OpenClaw container logs on failures or timeouts so they can be debugged afterwards. Write the claimed API key file with compatibility JSON keys (`token` and `apiKey`). Add the two-lane validation requirement to the test plan.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-07 09:22:40 -06:00
parent b91820afd3
commit 4bd6961020
2 changed files with 127 additions and 6 deletions
--- a/packages/adapters/openclaw-gateway/doc/ONBOARDING_AND_TEST_PLAN.md
+++ b/packages/adapters/openclaw-gateway/doc/ONBOARDING_AND_TEST_PLAN.md
@@ -21,6 +21,10 @@ These are mandatory for onboarding and smoke testing:
 - If a kick is needed, allow at most one follow-up message (for example: “how is it going?”).
 - Required OpenClaw configuration (transport enablement, auth loading, skill usage) must be embedded in prompt instructions, not manual hidden steps.

+3. **Two-lane validation is required**
+- Lane A (stock pass lane): unmodified/clean OpenClaw image and config flow. This lane is the release gate.
+- Lane B (instrumentation lane): temporary test instrumentation is allowed only to diagnose failures; it cannot be the final passing path.
+
 ## External Protocol Constraints
 OpenClaw docs to anchor behavior:
 - Webhook mode requires `hooks.enabled=true` and exposes `/hooks/wake` + `/hooks/agent`: https://docs.openclaw.ai/automation/webhook
@@ -233,6 +237,7 @@ POST /api/companies/$CLA_COMPANY_ID/invites
 3. Approve join request.
 4. Claim API key with `claimSecret`.
 5. Save claimed token to OpenClaw expected file path (`~/.openclaw/workspace/paperclip-claimed-api-key.json`) and ensure `PAPERCLIP_API_KEY` + `PAPERCLIP_API_URL` are available for OpenClaw skill execution context.
+  - Write both compatibility JSON keys (`token` and `apiKey`, carrying the same value) so the file parses whichever key name the OpenClaw runtime looks for.
 6. Ensure Paperclip skill is installed for OpenClaw runtime.
 7. Send one bootstrap prompt to OpenClaw containing all setup instructions needed for this run (auth file usage, heartbeat procedure, required tools). If needed, send one follow-up nudge only.

--- a/scripts/smoke/openclaw-gateway-e2e.sh
+++ b/scripts/smoke/openclaw-gateway-e2e.sh
@@ -50,6 +50,10 @@ CASE_TIMEOUT_SEC="${CASE_TIMEOUT_SEC:-420}"
 RUN_TIMEOUT_SEC="${RUN_TIMEOUT_SEC:-300}"
 STRICT_CASES="${STRICT_CASES:-1}"
 AUTO_INSTALL_SKILL="${AUTO_INSTALL_SKILL:-1}"
+OPENCLAW_DIAG_DIR="${OPENCLAW_DIAG_DIR:-/tmp/openclaw-gateway-e2e-diag-$(date +%Y%m%d-%H%M%S)}"
+OPENCLAW_ADAPTER_TIMEOUT_SEC="${OPENCLAW_ADAPTER_TIMEOUT_SEC:-120}"
+OPENCLAW_ADAPTER_WAIT_TIMEOUT_MS="${OPENCLAW_ADAPTER_WAIT_TIMEOUT_MS:-120000}"
+PAYLOAD_TEMPLATE_MESSAGE_APPEND="${PAYLOAD_TEMPLATE_MESSAGE_APPEND:-}"

 AUTH_HEADERS=()
 if [[ -n "${PAPERCLIP_AUTH_HEADER:-}" ]]; then
@@ -109,6 +113,57 @@ api_request() {
  rm -f "$tmp"
 }

+# Save the event list and log of one heartbeat run under OPENCLAW_DIAG_DIR.
+# Arguments: $1 - run id; an empty id makes the call a silent no-op.
+#            $2 - file-name prefix for the artifacts (default: "run").
+# Globals:   reads OPENCLAW_DIAG_DIR, RESPONSE_CODE and RESPONSE_BODY
+#            (the latter two are presumably refreshed by each api_request call).
+# Outputs:   <prefix>-<run id>-events.json, <prefix>-<run id>-log.json and
+#            <prefix>-<run id>-log.txt; a warning for every fetch that is not HTTP 200.
+capture_run_diagnostics() {
+  local run_id="$1"
+  local label="${2:-run}"
+  if [[ -z "$run_id" ]]; then
+    return 0
+  fi
+
+  mkdir -p "$OPENCLAW_DIAG_DIR"
+  local artifact_base="${OPENCLAW_DIAG_DIR}/${label}-${run_id}"
+
+  # Run events, capped at 1000 entries by the query string.
+  api_request "GET" "/heartbeat-runs/${run_id}/events?limit=1000"
+  if [[ "$RESPONSE_CODE" != "200" ]]; then
+    warn "could not fetch events for run ${run_id} (HTTP ${RESPONSE_CODE})"
+  else
+    printf "%s\n" "$RESPONSE_BODY" > "${artifact_base}-events.json"
+  fi
+
+  # Run log, capped at 524288 bytes (512 KiB) by the query string.
+  api_request "GET" "/heartbeat-runs/${run_id}/log?limitBytes=524288"
+  if [[ "$RESPONSE_CODE" != "200" ]]; then
+    warn "could not fetch log for run ${run_id} (HTTP ${RESPONSE_CODE})"
+  else
+    printf "%s\n" "$RESPONSE_BODY" > "${artifact_base}-log.json"
+    # Plain-text copy of the .content field; a jq failure is deliberately ignored
+    # because the raw JSON has already been saved.
+    jq -r '.content // ""' <<<"$RESPONSE_BODY" > "${artifact_base}-log.txt" 2>/dev/null || true
+  fi
+}
+
+# Save an issue and its comments under OPENCLAW_DIAG_DIR.
+# Arguments: $1 - issue id; an empty id makes the call a silent no-op.
+#            $2 - file-name prefix for the artifacts (default: "issue").
+# Globals:   reads OPENCLAW_DIAG_DIR, RESPONSE_CODE and RESPONSE_BODY
+#            (the latter two are presumably refreshed by each api_request call).
+# Outputs:   <prefix>-<issue id>.json and <prefix>-<issue id>-comments.json for
+#            every fetch that returns HTTP 200; a warning for every fetch that does not.
+capture_issue_diagnostics() {
+  local issue_id="$1"
+  local label="${2:-issue}"
+  [[ -n "$issue_id" ]] || return 0
+  mkdir -p "$OPENCLAW_DIAG_DIR"
+
+  api_request "GET" "/issues/${issue_id}"
+  if [[ "$RESPONSE_CODE" == "200" ]]; then
+    printf "%s\n" "$RESPONSE_BODY" > "${OPENCLAW_DIAG_DIR}/${label}-${issue_id}.json"
+  else
+    # Warn the same way capture_run_diagnostics does, so a missing artifact
+    # in the diagnostics directory can be explained from the script output.
+    warn "could not fetch issue ${issue_id} (HTTP ${RESPONSE_CODE})"
+  fi
+
+  api_request "GET" "/issues/${issue_id}/comments"
+  if [[ "$RESPONSE_CODE" == "200" ]]; then
+    printf "%s\n" "$RESPONSE_BODY" > "${OPENCLAW_DIAG_DIR}/${label}-${issue_id}-comments.json"
+  else
+    warn "could not fetch comments for issue ${issue_id} (HTTP ${RESPONSE_CODE})"
+  fi
+}
+
+# Best-effort snapshot of the OpenClaw container's recent output.
+# Globals:   reads OPENCLAW_DIAG_DIR.
+# Outputs:   the last 1200 lines of `docker logs` (stdout and stderr combined) in
+#            ${OPENCLAW_DIAG_DIR}/openclaw-container.log, rewritten on every call.
+# Returns:   0 whether or not a container was detected or docker succeeded.
+capture_openclaw_container_logs() {
+  local container
+  mkdir -p "$OPENCLAW_DIAG_DIR"
+
+  # A failing detector must not abort the caller; an empty result is handled below.
+  container="$(detect_openclaw_container || true)"
+  if [[ -n "$container" ]]; then
+    docker logs --tail=1200 "$container" > "${OPENCLAW_DIAG_DIR}/openclaw-container.log" 2>&1 || true
+    return 0
+  fi
+
+  warn "could not detect OpenClaw container for diagnostics"
+  return 0
+}
+
 assert_status() {
  local expected="$1"
  if [[ "$RESPONSE_CODE" != "$expected" ]]; then
@@ -351,6 +406,8 @@ create_and_approve_gateway_join() {
    --arg url "$OPENCLAW_GATEWAY_URL" \
    --arg token "$gateway_token" \
    --arg paperclipApiUrl "$PAPERCLIP_API_URL_FOR_OPENCLAW" \
+    --argjson timeoutSec "$OPENCLAW_ADAPTER_TIMEOUT_SEC" \
+    --argjson waitTimeoutMs "$OPENCLAW_ADAPTER_WAIT_TIMEOUT_MS" \
    '{
      requestType: "agent",
      agentName: $name,
@@ -364,7 +421,8 @@ create_and_approve_gateway_join() {
        disableDeviceAuth: true,
        sessionKeyStrategy: "fixed",
        sessionKey: "paperclip",
-        waitTimeoutMs: 120000,
+        timeoutSec: $timeoutSec,
+        waitTimeoutMs: $waitTimeoutMs,
        paperclipApiUrl: $paperclipApiUrl
      }
    }')"
@@ -404,10 +462,27 @@ persist_claimed_key_artifacts() {
  local workspace_dir="${OPENCLAW_CONFIG_DIR%/}/workspace"
  local skill_dir="${OPENCLAW_CONFIG_DIR%/}/skills/paperclip"
  local claimed_file="${workspace_dir}/paperclip-claimed-api-key.json"
+  local claimed_raw_file="${workspace_dir}/paperclip-claimed-api-key.raw.json"

  mkdir -p "$workspace_dir" "$skill_dir"
-  printf "%s\n" "$claim_json" > "$claimed_file"
-  chmod 600 "$claimed_file"
+  local token
+  token="$(jq -r '.token // .apiKey // empty' <<<"$claim_json")"
+  [[ -n "$token" ]] || fail "claim response missing token/apiKey"
+
+  printf "%s\n" "$claim_json" > "$claimed_raw_file"
+  chmod 600 "$claimed_raw_file"
+
+  jq -nc --arg token "$token" '{ token: $token, apiKey: $token }' > "$claimed_file"
+  # Keep this readable for OpenClaw runtime users across sandbox/container contexts.
+  chmod 644 "$claimed_file"
+
+  local container
+  container="$(detect_openclaw_container || true)"
+  if [[ -n "$container" ]]; then
+    docker exec "$container" sh -lc "mkdir -p /home/node/.openclaw/workspace" >/dev/null 2>&1 || true
+    docker cp "$claimed_file" "${container}:/home/node/.openclaw/workspace/paperclip-claimed-api-key.json" >/dev/null 2>&1 || true
+    docker exec "$container" sh -lc "chmod 644 /home/node/.openclaw/workspace/paperclip-claimed-api-key.json" >/dev/null 2>&1 || true
+  fi

  if [[ "$AUTO_INSTALL_SKILL" == "1" ]]; then
    api_request "GET" "/skills/paperclip"
@@ -420,7 +495,7 @@ persist_claimed_key_artifacts() {
    chmod 600 "${skill_dir}/SKILL.md"
  fi

-  log "wrote claimed key to ${claimed_file}"
+  log "wrote claimed key artifacts to ${claimed_file} and ${claimed_raw_file}"
 }

 inject_agent_api_key_payload_template() {
@@ -430,11 +505,17 @@ inject_agent_api_key_payload_template() {
  api_request "GET" "/agents/${AGENT_ID}"
  assert_status "200"

+  # Instruction text that the jq patch below stores as
+  # adapterConfig.payloadTemplate.message.
+  local base_message
+  base_message="Set PAPERCLIP_API_KEY=${AGENT_API_KEY} in your run context before running Paperclip heartbeat steps."
+  if [[ -n "$PAYLOAD_TEMPLATE_MESSAGE_APPEND" ]]; then
+    # "\n" inside double quotes stays a literal backslash + n in Bash, and jq --arg
+    # passes the string through verbatim. ANSI-C quoting ($'...') yields real
+    # newlines, so the appended text starts its own paragraph.
+    base_message+=$'\n\n'"${PAYLOAD_TEMPLATE_MESSAGE_APPEND}"
+  fi
+
  local patch_payload
-  patch_payload="$(jq -c --arg token "$AGENT_API_KEY" '
+  patch_payload="$(jq -c --arg message "$base_message" '
    {adapterConfig: ((.adapterConfig // {}) + {
      payloadTemplate: (((.adapterConfig // {}).payloadTemplate // {}) + {
-        message: ("Set PAPERCLIP_API_KEY=" + $token + " in your run context before running Paperclip heartbeat steps.")
+        message: $message
      })
    })}
  ' <<<"$RESPONSE_BODY")"
@@ -487,12 +568,18 @@ wait_for_run_terminal() {
  while true; do
    status="$(get_run_status "$run_id")"
    if [[ "$status" == "succeeded" || "$status" == "failed" || "$status" == "timed_out" || "$status" == "cancelled" ]]; then
+      if [[ "$status" != "succeeded" ]]; then
+        capture_run_diagnostics "$run_id" "run-nonsuccess"
+        capture_openclaw_container_logs
+      fi
      echo "$status"
      return 0
    fi

    now="$(date +%s)"
    if (( now - started >= timeout_sec )); then
+      capture_run_diagnostics "$run_id" "run-timeout"
+      capture_openclaw_container_logs
      echo "timeout"
      return 0
    fi
@@ -614,6 +701,14 @@ run_case_a() {
  marker_found="$(issue_comments_contain "$CASE_A_ISSUE_ID" "$marker")"
  log "case A issue_status=${issue_status} marker_found=${marker_found}"

+  if [[ "$issue_status" != "done" || "$marker_found" != "true" ]]; then
+    capture_issue_diagnostics "$CASE_A_ISSUE_ID" "case-a"
+    if [[ -n "$RUN_ID" ]]; then
+      capture_run_diagnostics "$RUN_ID" "case-a"
+    fi
+    capture_openclaw_container_logs
+  fi
+
  if [[ "$STRICT_CASES" == "1" ]]; then
    [[ "$run_status" == "succeeded" ]] || fail "case A run did not succeed"
    [[ "$issue_status" == "done" ]] || fail "case A issue did not reach done"
@@ -647,6 +742,14 @@ run_case_b() {
  marker_found="$(issue_comments_contain "$CASE_B_ISSUE_ID" "$marker")"
  log "case B issue_status=${issue_status} marker_found=${marker_found}"

+  if [[ "$issue_status" != "done" || "$marker_found" != "true" ]]; then
+    capture_issue_diagnostics "$CASE_B_ISSUE_ID" "case-b"
+    if [[ -n "$RUN_ID" ]]; then
+      capture_run_diagnostics "$RUN_ID" "case-b"
+    fi
+    capture_openclaw_container_logs
+  fi
+
  warn "case B requires manual UX confirmation in OpenClaw main webchat: message '${message_text}' appears in main chat"

  if [[ "$STRICT_CASES" == "1" ]]; then
@@ -689,6 +792,17 @@ run_case_c() {
  CASE_C_CREATED_ISSUE_ID="$created_issue"
  log "case C issue_status=${issue_status} marker_found=${marker_found} created_issue_id=${CASE_C_CREATED_ISSUE_ID:-none}"

+  if [[ "$issue_status" != "done" || "$marker_found" != "true" || -z "$CASE_C_CREATED_ISSUE_ID" ]]; then
+    capture_issue_diagnostics "$CASE_C_ISSUE_ID" "case-c"
+    if [[ -n "$CASE_C_CREATED_ISSUE_ID" ]]; then
+      capture_issue_diagnostics "$CASE_C_CREATED_ISSUE_ID" "case-c-created"
+    fi
+    if [[ -n "$RUN_ID" ]]; then
+      capture_run_diagnostics "$RUN_ID" "case-c"
+    fi
+    capture_openclaw_container_logs
+  fi
+
  if [[ "$STRICT_CASES" == "1" ]]; then
    [[ "$run_status" == "succeeded" ]] || fail "case C run did not succeed"
    [[ "$issue_status" == "done" ]] || fail "case C issue did not reach done"
@@ -699,6 +813,8 @@ run_case_c() {

 main() {
  log "starting OpenClaw gateway E2E smoke"
+  mkdir -p "$OPENCLAW_DIAG_DIR"
+  log "diagnostics dir: ${OPENCLAW_DIAG_DIR}"

  wait_http_ready "${PAPERCLIP_API_URL%/}/api/health" 15 || fail "Paperclip API health endpoint not reachable"
  api_request "GET" "/health"