From 4bd69610206bc0ef40b0d734df5a61d07242ca9e Mon Sep 17 00:00:00 2001
From: Dotta <bippadotta@protonmail.com>
Date: Sat, 7 Mar 2026 09:22:40 -0600
Subject: [PATCH] fix(openclaw-gateway): add diagnostics capture and two-lane
 validation to e2e

Capture run events, logs, issue state, and container logs on failures
or timeouts for debugging. Write compatibility JSON keys for claimed
API key. Add two-lane validation requirement to test plan.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .../doc/ONBOARDING_AND_TEST_PLAN.md           |   5 +
 scripts/smoke/openclaw-gateway-e2e.sh         | 128 +++++++++++++++++-
 2 files changed, 127 insertions(+), 6 deletions(-)

diff --git a/packages/adapters/openclaw-gateway/doc/ONBOARDING_AND_TEST_PLAN.md b/packages/adapters/openclaw-gateway/doc/ONBOARDING_AND_TEST_PLAN.md
index 965d8179..17057c89 100644
--- a/packages/adapters/openclaw-gateway/doc/ONBOARDING_AND_TEST_PLAN.md
+++ b/packages/adapters/openclaw-gateway/doc/ONBOARDING_AND_TEST_PLAN.md
@@ -21,6 +21,10 @@ These are mandatory for onboarding and smoke testing:
 - If a kick is needed, allow at most one follow-up message (for example: “how is it going?”).
 - Required OpenClaw configuration (transport enablement, auth loading, skill usage) must be embedded in prompt instructions, not manual hidden steps.
 
+3. **Two-lane validation is required**
+- Lane A (stock pass lane): unmodified/clean OpenClaw image and config flow. This lane is the release gate.
+- Lane B (instrumentation lane): temporary test instrumentation is allowed only to diagnose failures; it cannot be the final passing path.
+
 ## External Protocol Constraints
 OpenClaw docs to anchor behavior:
 - Webhook mode requires `hooks.enabled=true` and exposes `/hooks/wake` + `/hooks/agent`: https://docs.openclaw.ai/automation/webhook
@@ -233,6 +237,7 @@ POST /api/companies/$CLA_COMPANY_ID/invites
 3. Approve join request.
 4. Claim API key with `claimSecret`.
 5. Save claimed token to OpenClaw expected file path (`~/.openclaw/workspace/paperclip-claimed-api-key.json`) and ensure `PAPERCLIP_API_KEY` + `PAPERCLIP_API_URL` are available for OpenClaw skill execution context.
+  - Write the claimed token under both the `token` and `apiKey` JSON keys so the file can be read whichever of the two key names the runtime parser looks for.
 6. Ensure Paperclip skill is installed for OpenClaw runtime.
 7. Send one bootstrap prompt to OpenClaw containing all setup instructions needed for this run (auth file usage, heartbeat procedure, required tools). If needed, send one follow-up nudge only.
 
diff --git a/scripts/smoke/openclaw-gateway-e2e.sh b/scripts/smoke/openclaw-gateway-e2e.sh
index e20db35d..e45df9f9 100755
--- a/scripts/smoke/openclaw-gateway-e2e.sh
+++ b/scripts/smoke/openclaw-gateway-e2e.sh
@@ -50,6 +50,10 @@ CASE_TIMEOUT_SEC="${CASE_TIMEOUT_SEC:-420}"
 RUN_TIMEOUT_SEC="${RUN_TIMEOUT_SEC:-300}"
 STRICT_CASES="${STRICT_CASES:-1}"
 AUTO_INSTALL_SKILL="${AUTO_INSTALL_SKILL:-1}"
+OPENCLAW_DIAG_DIR="${OPENCLAW_DIAG_DIR:-/tmp/openclaw-gateway-e2e-diag-$(date +%Y%m%d-%H%M%S)}"  # directory the capture_* helpers write diagnostics into; default is a timestamped dir under /tmp
+OPENCLAW_ADAPTER_TIMEOUT_SEC="${OPENCLAW_ADAPTER_TIMEOUT_SEC:-120}"  # passed to jq with --argjson as timeoutSec in the join request, so it must be valid JSON (e.g. a bare number)
+OPENCLAW_ADAPTER_WAIT_TIMEOUT_MS="${OPENCLAW_ADAPTER_WAIT_TIMEOUT_MS:-120000}"  # passed to jq with --argjson as waitTimeoutMs; default matches the previously hard-coded 120000
+PAYLOAD_TEMPLATE_MESSAGE_APPEND="${PAYLOAD_TEMPLATE_MESSAGE_APPEND:-}"  # optional extra text appended to the payloadTemplate message; empty means nothing is appended
 
 AUTH_HEADERS=()
 if [[ -n "${PAPERCLIP_AUTH_HEADER:-}" ]]; then
@@ -109,6 +113,57 @@ api_request() {
   rm -f "$tmp"
 }
 
+# Best-effort capture of a heartbeat run's events and log into OPENCLAW_DIAG_DIR.
+# Args: $1 = run id (empty → no-op, returns 0); $2 = filename label (default "run").
+capture_run_diagnostics() {
+  local run_id="$1" label="${2:-run}"
+  if [[ -z "$run_id" ]]; then return 0; fi
+  mkdir -p "$OPENCLAW_DIAG_DIR"
+  local artifact_base="${OPENCLAW_DIAG_DIR}/${label}-${run_id}"
+  api_request "GET" "/heartbeat-runs/${run_id}/events?limit=1000"
+  if [[ "$RESPONSE_CODE" != "200" ]]; then
+    warn "could not fetch events for run ${run_id} (HTTP ${RESPONSE_CODE})"
+  else
+    printf "%s\n" "$RESPONSE_BODY" > "${artifact_base}-events.json"
+  fi
+
+  api_request "GET" "/heartbeat-runs/${run_id}/log?limitBytes=524288"
+  if [[ "$RESPONSE_CODE" != "200" ]]; then
+    warn "could not fetch log for run ${run_id} (HTTP ${RESPONSE_CODE})"
+  else
+    printf "%s\n" "$RESPONSE_BODY" > "${artifact_base}-log.json"
+    jq -r '.content // ""' <<<"$RESPONSE_BODY" > "${artifact_base}-log.txt" 2>/dev/null || true  # plain-text copy of .content; jq errors are ignored on purpose
+  fi
+}
+
+# Best-effort capture of an issue and its comments into OPENCLAW_DIAG_DIR.
+# Args: $1 = issue id (empty → no-op, returns 0); $2 = filename label (default "issue").
+capture_issue_diagnostics() {
+  local issue_id="$1" label="${2:-issue}" kind
+  [[ -n "$issue_id" ]] || return 0
+  mkdir -p "$OPENCLAW_DIAG_DIR"
+  for kind in "" "comments"; do  # "" = the issue itself, "comments" = its comment list
+    api_request "GET" "/issues/${issue_id}${kind:+/${kind}}"
+    if [[ "$RESPONSE_CODE" == "200" ]]; then
+      printf "%s\n" "$RESPONSE_BODY" > "${OPENCLAW_DIAG_DIR}/${label}-${issue_id}${kind:+-${kind}}.json"
+    else
+      # Warn as capture_run_diagnostics does, so a missing artifact is explained in the output.
+      warn "could not fetch issue ${issue_id}${kind:+ ${kind}} (HTTP ${RESPONSE_CODE})"
+    fi
+  done
+}
+
+capture_openclaw_container_logs() {  # Save the tail of the OpenClaw container's logs into OPENCLAW_DIAG_DIR.
+  mkdir -p "$OPENCLAW_DIAG_DIR"
+  local container
+  container="$(detect_openclaw_container || true)"  # "|| true" keeps a non-zero status from detect_openclaw_container from propagating
+  if [[ -z "$container" ]]; then
+    warn "could not detect OpenClaw container for diagnostics"
+    return 0  # a missing container is reported but not treated as a failure
+  fi
+  docker logs --tail=1200 "$container" > "${OPENCLAW_DIAG_DIR}/openclaw-container.log" 2>&1 || true  # last 1200 lines, stdout+stderr; fixed filename, so each call overwrites the previous capture
+}
+
 assert_status() {
   local expected="$1"
   if [[ "$RESPONSE_CODE" != "$expected" ]]; then
@@ -351,6 +406,8 @@ create_and_approve_gateway_join() {
     --arg url "$OPENCLAW_GATEWAY_URL" \
     --arg token "$gateway_token" \
     --arg paperclipApiUrl "$PAPERCLIP_API_URL_FOR_OPENCLAW" \
+    --argjson timeoutSec "$OPENCLAW_ADAPTER_TIMEOUT_SEC" \
+    --argjson waitTimeoutMs "$OPENCLAW_ADAPTER_WAIT_TIMEOUT_MS" \
     '{
       requestType: "agent",
       agentName: $name,
@@ -364,7 +421,8 @@ create_and_approve_gateway_join() {
         disableDeviceAuth: true,
         sessionKeyStrategy: "fixed",
         sessionKey: "paperclip",
-        waitTimeoutMs: 120000,
+        timeoutSec: $timeoutSec,
+        waitTimeoutMs: $waitTimeoutMs,
         paperclipApiUrl: $paperclipApiUrl
       }
     }')"
@@ -404,10 +462,27 @@ persist_claimed_key_artifacts() {
   local workspace_dir="${OPENCLAW_CONFIG_DIR%/}/workspace"
   local skill_dir="${OPENCLAW_CONFIG_DIR%/}/skills/paperclip"
   local claimed_file="${workspace_dir}/paperclip-claimed-api-key.json"
+  local claimed_raw_file="${workspace_dir}/paperclip-claimed-api-key.raw.json"
 
   mkdir -p "$workspace_dir" "$skill_dir"
-  printf "%s\n" "$claim_json" > "$claimed_file"
-  chmod 600 "$claimed_file"
+  local token
+  token="$(jq -r '.token // .apiKey // empty' <<<"$claim_json")"
+  [[ -n "$token" ]] || fail "claim response missing token/apiKey"
+
+  printf "%s\n" "$claim_json" > "$claimed_raw_file"
+  chmod 600 "$claimed_raw_file"
+
+  jq -nc --arg token "$token" '{ token: $token, apiKey: $token }' > "$claimed_file"
+  # Keep this readable for OpenClaw runtime users across sandbox/container contexts.
+  chmod 644 "$claimed_file"
+
+  local container
+  container="$(detect_openclaw_container || true)"
+  if [[ -n "$container" ]]; then
+    docker exec "$container" sh -lc "mkdir -p /home/node/.openclaw/workspace" >/dev/null 2>&1 || true
+    docker cp "$claimed_file" "${container}:/home/node/.openclaw/workspace/paperclip-claimed-api-key.json" >/dev/null 2>&1 || true
+    docker exec "$container" sh -lc "chmod 644 /home/node/.openclaw/workspace/paperclip-claimed-api-key.json" >/dev/null 2>&1 || true
+  fi
 
   if [[ "$AUTO_INSTALL_SKILL" == "1" ]]; then
     api_request "GET" "/skills/paperclip"
@@ -420,7 +495,7 @@ persist_claimed_key_artifacts() {
     chmod 600 "${skill_dir}/SKILL.md"
   fi
 
-  log "wrote claimed key to ${claimed_file}"
+  log "wrote claimed key artifacts to ${claimed_file} and ${claimed_raw_file}"
 }
 
 inject_agent_api_key_payload_template() {
@@ -430,11 +505,17 @@ inject_agent_api_key_payload_template() {
   api_request "GET" "/agents/${AGENT_ID}"
   assert_status "200"
 
+  # Join with real newlines via $'\n\n': "\n" inside double quotes is a literal backslash-n, which jq --arg would pass through verbatim.
+  local base_message="Set PAPERCLIP_API_KEY=${AGENT_API_KEY} in your run context before running Paperclip heartbeat steps."
+  if [[ -n "$PAYLOAD_TEMPLATE_MESSAGE_APPEND" ]]; then
+    base_message+=$'\n\n'"${PAYLOAD_TEMPLATE_MESSAGE_APPEND}"
+  fi
+
   local patch_payload
-  patch_payload="$(jq -c --arg token "$AGENT_API_KEY" '
+  patch_payload="$(jq -c --arg message "$base_message" '
     {adapterConfig: ((.adapterConfig // {}) + {
       payloadTemplate: (((.adapterConfig // {}).payloadTemplate // {}) + {
-        message: ("Set PAPERCLIP_API_KEY=" + $token + " in your run context before running Paperclip heartbeat steps.")
+        message: $message
       })
     })}
   ' <<<"$RESPONSE_BODY")"
@@ -487,12 +568,18 @@ wait_for_run_terminal() {
   while true; do
     status="$(get_run_status "$run_id")"
     if [[ "$status" == "succeeded" || "$status" == "failed" || "$status" == "timed_out" || "$status" == "cancelled" ]]; then
+      if [[ "$status" != "succeeded" ]]; then
+        capture_run_diagnostics "$run_id" "run-nonsuccess"
+        capture_openclaw_container_logs
+      fi
       echo "$status"
       return 0
     fi
 
     now="$(date +%s)"
     if (( now - started >= timeout_sec )); then
+      capture_run_diagnostics "$run_id" "run-timeout"
+      capture_openclaw_container_logs
       echo "timeout"
       return 0
     fi
@@ -614,6 +701,14 @@ run_case_a() {
   marker_found="$(issue_comments_contain "$CASE_A_ISSUE_ID" "$marker")"
   log "case A issue_status=${issue_status} marker_found=${marker_found}"
 
+  if [[ "$issue_status" != "done" || "$marker_found" != "true" ]]; then
+    capture_issue_diagnostics "$CASE_A_ISSUE_ID" "case-a"
+    if [[ -n "$RUN_ID" ]]; then
+      capture_run_diagnostics "$RUN_ID" "case-a"
+    fi
+    capture_openclaw_container_logs
+  fi
+
   if [[ "$STRICT_CASES" == "1" ]]; then
     [[ "$run_status" == "succeeded" ]] || fail "case A run did not succeed"
     [[ "$issue_status" == "done" ]] || fail "case A issue did not reach done"
@@ -647,6 +742,14 @@ run_case_b() {
   marker_found="$(issue_comments_contain "$CASE_B_ISSUE_ID" "$marker")"
   log "case B issue_status=${issue_status} marker_found=${marker_found}"
 
+  if [[ "$issue_status" != "done" || "$marker_found" != "true" ]]; then
+    capture_issue_diagnostics "$CASE_B_ISSUE_ID" "case-b"
+    if [[ -n "$RUN_ID" ]]; then
+      capture_run_diagnostics "$RUN_ID" "case-b"
+    fi
+    capture_openclaw_container_logs
+  fi
+
   warn "case B requires manual UX confirmation in OpenClaw main webchat: message '${message_text}' appears in main chat"
 
   if [[ "$STRICT_CASES" == "1" ]]; then
@@ -689,6 +792,17 @@ run_case_c() {
   CASE_C_CREATED_ISSUE_ID="$created_issue"
   log "case C issue_status=${issue_status} marker_found=${marker_found} created_issue_id=${CASE_C_CREATED_ISSUE_ID:-none}"
 
+  if [[ "$issue_status" != "done" || "$marker_found" != "true" || -z "$CASE_C_CREATED_ISSUE_ID" ]]; then
+    capture_issue_diagnostics "$CASE_C_ISSUE_ID" "case-c"
+    if [[ -n "$CASE_C_CREATED_ISSUE_ID" ]]; then
+      capture_issue_diagnostics "$CASE_C_CREATED_ISSUE_ID" "case-c-created"
+    fi
+    if [[ -n "$RUN_ID" ]]; then
+      capture_run_diagnostics "$RUN_ID" "case-c"
+    fi
+    capture_openclaw_container_logs
+  fi
+
   if [[ "$STRICT_CASES" == "1" ]]; then
     [[ "$run_status" == "succeeded" ]] || fail "case C run did not succeed"
     [[ "$issue_status" == "done" ]] || fail "case C issue did not reach done"
@@ -699,6 +813,8 @@ run_case_c() {
 
 main() {
   log "starting OpenClaw gateway E2E smoke"
+  mkdir -p "$OPENCLAW_DIAG_DIR"
+  log "diagnostics dir: ${OPENCLAW_DIAG_DIR}"
 
   wait_http_ready "${PAPERCLIP_API_URL%/}/api/health" 15 || fail "Paperclip API health endpoint not reachable"
   api_request "GET" "/health"