Skip to main content

Error Handling

When something goes wrong with an event, Relay sends an error message over WebSocket. Understanding error codes and appropriate retry strategies is essential for a robust integration.

Error Message Format

Error messages follow this structure:

{
"type": "error",
"event_id": "evt_k9p2m",
"agent_id": "athena",
"error": "Agent not connected",
"code": "AGENT_OFFLINE"
}
| Field | Type | Notes |
| --- | --- | --- |
| type | string | Always "error" |
| event_id | string | The event that failed (may be null for handshake errors) |
| agent_id | string | Which agent had the problem |
| error | string | Human-readable description |
| code | string | Error code for programmatic handling |

Error Codes and Handling

AGENT_OFFLINE

Meaning: The agent is not currently connected to Relay.

Status Code: 503 (Service Unavailable)

When to retry: Yes, with exponential backoff

Recommended action:

  • Retry after 1-2 seconds, then double the delay on each further attempt
  • Show user: "Athena is offline. Retrying..."
  • After 3-5 retries, suggest user try again later
async def send_with_retry(websocket, agent_id, thread_id, payload, max_retries=3):
    """Send an event, retrying with exponential backoff while the agent is offline.

    Args:
        websocket: Connection exposing awaitable ``send(str)`` and ``recv()``.
        agent_id: Target agent identifier placed in the event.
        thread_id: Thread identifier placed in the event.
        payload: JSON-serializable event payload.
        max_retries: Maximum number of send attempts. Values below 1 are
            treated as 1 so the event is always sent at least once.

    Returns:
        ``{"success": True, "event_id": ...}`` when the event is accepted;
        ``{"error": "Agent offline after retries", "code": "AGENT_OFFLINE"}``
        when every attempt reports ``AGENT_OFFLINE``; otherwise the decoded
        response message, unchanged, for the caller to handle.
    """
    # The event is identical on every attempt, so serialize it once.
    message = json.dumps({
        "type": "event",
        "agent_id": agent_id,
        "thread_id": thread_id,
        "payload": payload,
    })

    attempts = max(1, max_retries)
    for attempt in range(attempts):
        await websocket.send(message)
        data = json.loads(await websocket.recv())

        if data["type"] == "accepted":
            return {"success": True, "event_id": data["event_id"]}

        # .get(): an error message without a "code" field is handed back to
        # the caller instead of raising KeyError here.
        if data["type"] != "error" or data.get("code") != "AGENT_OFFLINE":
            return data  # Handle other errors

        if attempt < attempts - 1:
            delay = 2 ** (attempt + 1)  # 2s, 4s, 8s, ...
            print(f"Agent offline, retrying in {delay}s...")
            await asyncio.sleep(delay)

    return {"error": "Agent offline after retries", "code": "AGENT_OFFLINE"}

AGENT_NOT_ALLOWED

Meaning: Your app isn't allowlisted for this agent.

Status Code: 403 (Forbidden)

When to retry: No

Recommended action:

  • Do not retry
  • Log as a permission error
  • Notify admin or app developer
  • Show user a clear message if they tried to mention the agent
if error_code == "AGENT_NOT_ALLOWED":
    # Permission failure: report it and point the reader at the allowlist owner.
    print(f"Permission denied: app {app_id} cannot access {agent_id}")
    print("Contact your Relay admin to request allowlist access")
    # Do not retry

AGENT_TIMEOUT

Meaning: The agent took too long to respond (exceeded timeout window).

Status Code: 504 (Gateway Timeout)

When to retry: Maybe — the agent may have been only temporarily slow or overloaded

Recommended action:

  • Retry after 2-5 seconds
  • Limit retries to 1-2 attempts only
  • If agent continues timing out, escalate to infrastructure team
if error_code == "AGENT_TIMEOUT":
    # Retry once, after a fixed 3-second pause
    await asyncio.sleep(3)
    # NOTE(review): the outcome of this second send is not inspected here
    await send_event(websocket, agent_id, thread_id, payload)

PAYLOAD_TOO_LARGE

Meaning: Event payload exceeds 64KB limit.

Status Code: 413 (Payload Too Large)

When to retry: No (unless you reduce payload size)

Recommended action:

  • Do not retry with same payload
  • Reduce payload size (remove unnecessary fields, truncate text)
  • Log error with context for debugging
import json

def validate_payload(payload):
    """Reject payloads whose JSON encoding is larger than 64KB.

    The payload is serialized with ``json.dumps`` and measured as UTF-8 bytes.

    Returns:
        ``None`` when the payload fits within the limit.

    Raises:
        ValueError: If the encoded payload is larger than 64 * 1024 bytes.
    """
    limit = 64 * 1024
    encoded = json.dumps(payload).encode('utf-8')
    size_bytes = len(encoded)
    if size_bytes <= limit:
        return
    raise ValueError(
        f"Payload too large: {size_bytes} bytes exceeds 64KB limit. "
        "Remove non-essential fields or truncate large text."
    )

try:
validate_payload(my_payload)
await send_event(ws, agent_id, thread_id, my_payload)
except ValueError as e:
print(f"Payload validation failed: {e}")
# Trim payload and retry
trimmed_payload = trim_payload(my_payload)
await send_event(ws, agent_id, thread_id, trimmed_payload)

RATE_LIMITED

Meaning: Your app has exceeded its rate limit (events per minute).

Status Code: 429 (Too Many Requests)

When to retry: Yes, but respect rate limit window

Recommended action:

  • Respect the rate limit window
  • Queue events and send at a lower rate
  • Implement exponential backoff starting after 10+ seconds

Default rate limits: (see Rate Limits)

import asyncio
import json
import time

class RateLimitedClient:
    """Client-side throttle that spaces outgoing events at least
    ``60 / events_per_minute`` seconds apart.

    Events are placed on ``event_queue`` and drained by whichever caller
    holds the send lock, so concurrent ``send_event`` calls cannot send
    in the same interval.
    """

    def __init__(self, events_per_minute=60):
        """Configure the throttle.

        Args:
            events_per_minute: Maximum number of events to send per minute.

        Raises:
            ValueError: If ``events_per_minute`` is not positive.
        """
        if events_per_minute <= 0:
            raise ValueError("events_per_minute must be positive")
        self.events_per_minute = events_per_minute
        self.event_queue = asyncio.Queue()
        # Monotonic timestamp of the last send; -inf means "nothing sent yet",
        # so the first event never waits.
        self.last_sent = float("-inf")
        self.min_interval = 60 / events_per_minute
        self._send_lock = asyncio.Lock()

    async def send_event(self, websocket, agent_id, thread_id, payload):
        """Send event, respecting rate limits"""
        # The websocket is queued with the event so that an event drained by
        # another caller still goes out on the connection it was given.
        await self.event_queue.put((websocket, agent_id, thread_id, payload))

        # Only one coroutine drains at a time; without the lock, concurrent
        # callers read the same last_sent and send together.
        async with self._send_lock:
            while not self.event_queue.empty():
                target, queued_agent, queued_thread, queued_payload = (
                    self.event_queue.get_nowait()
                )

                # Wait if needed. monotonic() is immune to wall-clock jumps.
                wait = self.min_interval - (time.monotonic() - self.last_sent)
                if wait > 0:
                    await asyncio.sleep(wait)

                event = {
                    "type": "event",
                    "agent_id": queued_agent,
                    "thread_id": queued_thread,
                    "payload": queued_payload,
                }

                await target.send(json.dumps(event))
                self.last_sent = time.monotonic()

INVALID_EVENT

Meaning: Event is malformed (missing required fields, bad JSON, etc.).

Status Code: 400 (Bad Request)

When to retry: No

Recommended action:

  • Fix the event structure
  • Verify all required fields are present
  • Check JSON is valid
def validate_event(agent_id, thread_id, payload):
    """Check the basic shape of an event before it is sent.

    Args:
        agent_id: Must be a string with non-whitespace content.
        thread_id: Must be a string with non-whitespace content.
        payload: Must be a ``dict``.

    Returns:
        ``True`` when every check passes.

    Raises:
        ValueError: On the first failing check, in the order
            agent_id, thread_id, payload.
    """
    for field_name, value in (("agent_id", agent_id), ("thread_id", thread_id)):
        has_text = isinstance(value, str) and value.strip()
        if not has_text:
            raise ValueError(f"{field_name} must be non-empty string")

    if isinstance(payload, dict):
        return True
    raise ValueError("payload must be a dict")

try:
    # Check the event locally before attempting to send it
    validate_event(agent_id, thread_id, payload)
    await send_event(ws, agent_id, thread_id, payload)
except ValueError as e:
    # No retry here: the event itself has to be fixed first
    print(f"Invalid event: {e}")

RELAY_INTERNAL_ERROR

Meaning: An unexpected error occurred in Relay (bug or infrastructure issue).

Status Code: 500 (Internal Server Error)

When to retry: Yes, with exponential backoff

Recommended action:

  • Retry with exponential backoff
  • After 3-5 failed attempts, escalate to support
  • Include event_id in bug report
if error_code == "RELAY_INTERNAL_ERROR":
    print(f"Relay error for event {event_id}. Retrying...")
    # Exponential backoff: 1s, 2s, 4s, 8s, 16s
    # (the resend itself is not shown in this snippet)

Complete Error Handling Example

import asyncio
import json
import time

class RelayEventSender:
    """Send events over a WebSocket and classify the outcome as a result dict."""

    # Error codes worth retrying. RATE_LIMITED is included because it is
    # documented as retryable once the rate-limit window has passed.
    RETRYABLE_ERRORS = {
        "AGENT_OFFLINE",
        "AGENT_TIMEOUT",
        "RATE_LIMITED",
        "RELAY_INTERNAL_ERROR"
    }

    # Per-code floor (seconds) for the backoff delay; rate-limit retries
    # should start no sooner than 10 seconds.
    MIN_RETRY_DELAY = {"RATE_LIMITED": 10}

    async def send_with_error_handling(
        self,
        websocket,
        agent_id,
        thread_id,
        payload,
        max_retries=3,
        response_timeout=10
    ):
        """Send event with comprehensive error handling.

        Args:
            websocket: Connection exposing awaitable ``send(str)`` and ``recv()``.
            agent_id: Target agent identifier.
            thread_id: Thread identifier.
            payload: Event payload; must be a JSON-serializable dict.
            max_retries: Number of retries after the first attempt. Negative
                values are treated as 0, so the event is always sent once.
            response_timeout: Seconds to wait for each response.

        Returns:
            A dict whose ``"status"`` is one of:
            ``"accepted"`` (with ``event_id``);
            ``"error"`` (with ``code``, ``error``, ``retryable`` and, when the
            retry budget ran out, ``max_retries_exceeded``);
            ``"unexpected"`` (with ``message_type``);
            ``"timeout"`` (with ``message``);
            ``"exception"`` (with ``error``), which also covers validation
            failures.
        """
        # Validation does not depend on the attempt number, so run it once.
        try:
            self.validate_event(agent_id, thread_id, payload)
        except (TypeError, ValueError) as e:
            return {
                "status": "exception",
                "error": str(e)
            }

        # The event is identical on every attempt; serialize it once.
        message = json.dumps({
            "type": "event",
            "agent_id": agent_id,
            "thread_id": thread_id,
            "payload": payload
        })

        retries = max(0, max_retries)
        for attempt in range(retries + 1):
            can_retry = attempt < retries
            try:
                await websocket.send(message)

                # Wait for response (could be accepted or error)
                response = await asyncio.wait_for(
                    websocket.recv(),
                    timeout=response_timeout
                )
                data = json.loads(response)
                message_type = data["type"]

                if message_type == "accepted":
                    return {"status": "accepted", "event_id": data["event_id"]}

                if message_type != "error":
                    return {
                        "status": "unexpected",
                        "message_type": message_type
                    }

                # .get(): an error message missing a field still yields a
                # well-formed result instead of an opaque KeyError.
                error_code = data.get("code")
                result = {
                    "status": "error",
                    "code": error_code,
                    "error": data.get("error"),
                    "retryable": error_code in self.RETRYABLE_ERRORS
                }
                if not result["retryable"]:
                    return result
                if not can_retry:
                    result["max_retries_exceeded"] = True
                    return result

                # Exponential backoff (1s, 2s, 4s, ...), raised to the
                # per-code minimum where one is defined.
                delay = max(2 ** attempt, self.MIN_RETRY_DELAY.get(error_code, 0))
                print(f"Error {error_code}, retrying in {delay}s...")
                await asyncio.sleep(delay)

            except asyncio.TimeoutError:
                if can_retry:
                    await asyncio.sleep(2 ** attempt)
                    continue
                return {
                    "status": "timeout",
                    "message": "No response from Relay"
                }

            except Exception as e:
                # Boundary handler: report any other failure (connection
                # error, malformed response) to the caller as a result.
                return {
                    "status": "exception",
                    "error": str(e)
                }

    def validate_event(self, agent_id, thread_id, payload):
        """Validate event structure before sending.

        Raises:
            ValueError: If ``agent_id`` or ``thread_id`` is not a non-blank
                string, ``payload`` is not a dict, or the JSON-encoded
                payload is larger than 64 * 1024 bytes.
            TypeError: If ``payload`` cannot be serialized by ``json.dumps``.
        """
        if not isinstance(agent_id, str) or not agent_id.strip():
            raise ValueError("agent_id must be non-empty string")
        if not isinstance(thread_id, str) or not thread_id.strip():
            raise ValueError("thread_id must be non-empty string")
        if not isinstance(payload, dict):
            raise ValueError("payload must be dict")

        size_bytes = len(json.dumps(payload).encode('utf-8'))
        if size_bytes > 64 * 1024:
            raise ValueError(f"Payload too large: {size_bytes} bytes > 64KB")

# Usage
sender = RelayEventSender()
result = await sender.send_with_error_handling(
websocket,
"athena",
"task-123",
{"task_id": "task-123", "message": "@athena summarize"}
)

if result["status"] == "accepted":
print(f"Event accepted: {result['event_id']}")
elif result.get("retryable"):
print(f"Retryable error: {result['code']} - {result['error']}")
else:
print(f"Non-retryable error: {result['code']} - {result['error']}")

Best Practices

Do

  • Retry only retryable errors
  • Use exponential backoff
  • Validate payloads before sending
  • Log errors with event_id
  • Set reasonable timeouts (5-10s)

Don't

  • Retry permission errors
  • Retry invalid events
  • Use fixed delays
  • Ignore rate limits
  • Retry indefinitely