Error Handling
When something goes wrong with an event, Relay sends an error message over WebSocket. Understanding error codes and appropriate retry strategies is essential for a robust integration.
Error Message Format
Error messages follow this structure:
{
"type": "error",
"event_id": "evt_k9p2m",
"agent_id": "athena",
"error": "Agent not connected",
"code": "AGENT_OFFLINE"
}
| Field | Type | Notes |
|---|---|---|
| type | string | Always "error" |
| event_id | string | The event that failed (may be null for handshake errors) |
| agent_id | string | Which agent had the problem |
| error | string | Human-readable description |
| code | string | Error code for programmatic handling |
Error Codes and Handling
AGENT_OFFLINE
Meaning: The agent is not currently connected to Relay.
Status Code: 503 (Service Unavailable)
When to retry: Yes, with exponential backoff
Recommended action:
- Retry after 1-2 seconds
- Show user: "Athena is offline. Retrying..."
- After 3-5 retries, suggest user try again later
import asyncio
import json


async def send_with_retry(websocket, agent_id, thread_id, payload, max_retries=3):
    """Send one event, retrying with exponential backoff while the agent is offline.

    Args:
        websocket: Connection object exposing awaitable ``send(str)`` and ``recv()``.
        agent_id: Target agent, placed in the event's ``agent_id`` field.
        thread_id: Thread identifier, placed in the event's ``thread_id`` field.
        payload: JSON-serializable event payload.
        max_retries: Maximum number of send attempts. Values below 1 are
            treated as 1 so the event is always sent at least once.

    Returns:
        ``{"success": True, "event_id": ...}`` when Relay accepts the event;
        ``{"error": ..., "code": "AGENT_OFFLINE"}`` when every attempt found
        the agent offline; otherwise the decoded response, unchanged, so the
        caller can handle other error codes.
    """
    # The event is identical on every attempt, so serialize it once.
    message = json.dumps({
        "type": "event",
        "agent_id": agent_id,
        "thread_id": thread_id,
        "payload": payload,
    })
    attempts = max(max_retries, 1)

    for attempt in range(attempts):
        await websocket.send(message)
        data = json.loads(await websocket.recv())
        message_type = data.get("type")

        if message_type == "accepted":
            return {"success": True, "event_id": data["event_id"]}

        if message_type != "error" or data.get("code") != "AGENT_OFFLINE":
            return data  # Handle other errors

        # Agent offline: back off before the next attempt, if one remains.
        if attempt < attempts - 1:
            delay = 2 ** (attempt + 1)  # 2s, 4s, 8s, ...
            print(f"Agent offline, retrying in {delay}s...")
            await asyncio.sleep(delay)

    return {"error": "Agent offline after retries", "code": "AGENT_OFFLINE"}
AGENT_NOT_ALLOWED
Meaning: Your app isn't allowlisted for this agent.
Status Code: 403 (Forbidden)
When to retry: No
Recommended action:
- Do not retry
- Log as a permission error
- Notify admin or app developer
- Show user a clear message if they tried to mention the agent
# The app is not allowlisted for this agent: report it and stop instead of retrying.
# error_code, app_id and agent_id are expected to be set by the surrounding handler.
if error_code == "AGENT_NOT_ALLOWED":
    print(f"Permission denied: app {app_id} cannot access {agent_id}")
    print("Contact your Relay admin to request allowlist access")
    # Do not retry
AGENT_TIMEOUT
Meaning: The agent took too long to respond (exceeded timeout window).
Status Code: 504 (Gateway Timeout)
When to retry: Maybe — the agent may be only temporarily slow or overloaded
Recommended action:
- Retry after 2-5 seconds
- Limit retries to 1-2 attempts only
- If agent continues timing out, escalate to infrastructure team
# Single delayed retry after an agent timeout.
# NOTE(review): the outcome of the retried send is not inspected here; the
# caller presumably handles a second timeout elsewhere — confirm.
if error_code == "AGENT_TIMEOUT":
    # Retry once
    await asyncio.sleep(3)  # fixed 3-second pause before resending
    await send_event(websocket, agent_id, thread_id, payload)
PAYLOAD_TOO_LARGE
Meaning: Event payload exceeds 64KB limit.
Status Code: 413 (Payload Too Large)
When to retry: No (unless you reduce payload size)
Recommended action:
- Do not retry with same payload
- Reduce payload size (remove unnecessary fields, truncate text)
- Log error with context for debugging
import json
def validate_payload(payload, max_bytes=64 * 1024):
    """Check that *payload* fits within the size limit once serialized.

    The size is measured as the UTF-8 byte length of ``json.dumps(payload)``.

    Args:
        payload: JSON-serializable payload to check.
        max_bytes: Largest allowed serialized size in bytes. Defaults to the
            64KB limit.

    Raises:
        ValueError: If the serialized payload is larger than ``max_bytes``.
        TypeError: Propagated from ``json.dumps`` when the payload is not
            JSON-serializable.
    """
    size_bytes = len(json.dumps(payload).encode("utf-8"))
    if size_bytes > max_bytes:
        raise ValueError(
            f"Payload too large: {size_bytes} bytes exceeds "
            f"{max_bytes / 1024:g}KB limit. "
            f"Remove non-essential fields or truncate large text."
        )
try:
validate_payload(my_payload)
await send_event(ws, agent_id, thread_id, my_payload)
except ValueError as e:
print(f"Payload validation failed: {e}")
# Trim payload and retry
trimmed_payload = trim_payload(my_payload)
await send_event(ws, agent_id, thread_id, trimmed_payload)
RATE_LIMITED
Meaning: Your app has exceeded its rate limit (events per minute).
Status Code: 429 (Too Many Requests)
When to retry: Yes, but respect rate limit window
Recommended action:
- Respect the rate limit window
- Queue events and send at a lower rate
- Implement exponential backoff with a first delay of at least 10 seconds
Default rate limits: see the Rate Limits section.
import asyncio
import json
import time


class RateLimitedClient:
    """Send events over a WebSocket no faster than a configured rate.

    Events are queued and sent one at a time, with at least
    ``60 / events_per_minute`` seconds between consecutive sends.
    """

    def __init__(self, events_per_minute=60):
        """Create a client limited to *events_per_minute* sends per minute.

        Raises:
            ValueError: If ``events_per_minute`` is not positive.
        """
        if events_per_minute <= 0:
            raise ValueError("events_per_minute must be positive")
        self.events_per_minute = events_per_minute
        self.event_queue = asyncio.Queue()
        # Monotonic timestamp of the most recent send; None until the first send.
        self.last_sent = None
        self.min_interval = 60 / events_per_minute
        # Serializes senders so concurrent callers cannot both pass the
        # interval check against the same last_sent value.
        self._send_lock = asyncio.Lock()

    async def send_event(self, websocket, agent_id, thread_id, payload):
        """Send event, respecting rate limits.

        The event is queued, then the queue is drained in order. The call
        returns once the queue is empty, so it may also send events queued
        by other concurrent callers.
        """
        await self.event_queue.put((agent_id, thread_id, payload))

        # Process queue with rate limiting
        async with self._send_lock:
            while not self.event_queue.empty():
                agent_id, thread_id, payload = self.event_queue.get_nowait()

                # Wait if needed. A monotonic clock keeps the wait bounded by
                # min_interval even if the wall clock is adjusted.
                if self.last_sent is not None:
                    elapsed = time.monotonic() - self.last_sent
                    if elapsed < self.min_interval:
                        await asyncio.sleep(self.min_interval - elapsed)

                event = {
                    "type": "event",
                    "agent_id": agent_id,
                    "thread_id": thread_id,
                    "payload": payload,
                }
                await websocket.send(json.dumps(event))
                self.last_sent = time.monotonic()
INVALID_EVENT
Meaning: Event is malformed (missing required fields, bad JSON, etc.).
Status Code: 400 (Bad Request)
When to retry: No
Recommended action:
- Fix the event structure
- Verify all required fields are present
- Check JSON is valid
def validate_event(agent_id, thread_id, payload):
    """Check the three event fields before sending.

    Returns True when every field is acceptable. Raises ValueError naming
    the first field that is not: agent_id and thread_id must be strings
    with non-whitespace content, and payload must be a dict.
    """
    agent_ok = isinstance(agent_id, str) and bool(agent_id.strip())
    if not agent_ok:
        raise ValueError("agent_id must be non-empty string")

    thread_ok = isinstance(thread_id, str) and bool(thread_id.strip())
    if not thread_ok:
        raise ValueError("thread_id must be non-empty string")

    if isinstance(payload, dict):
        return True
    raise ValueError("payload must be a dict")
# Validate locally first so a malformed event is reported instead of sent.
# NOTE: a ValueError raised inside send_event would also be reported here
# as an invalid event, because both calls share the same try block.
try:
    validate_event(agent_id, thread_id, payload)
    await send_event(ws, agent_id, thread_id, payload)
except ValueError as e:
    print(f"Invalid event: {e}")
RELAY_INTERNAL_ERROR
Meaning: An unexpected error occurred in Relay (bug or infrastructure issue).
Status Code: 500 (Internal Server Error)
When to retry: Yes, with exponential backoff
Recommended action:
- Retry with exponential backoff
- After 3-5 failed attempts, escalate to support
- Include event_id in bug report
# Report an internal Relay error together with the affected event_id.
# NOTE: this snippet only prints the message; the retry loop is not shown here.
if error_code == "RELAY_INTERNAL_ERROR":
    print(f"Relay error for event {event_id}. Retrying...")
    # Exponential backoff: 1s, 2s, 4s, 8s, 16s
Complete Error Handling Example
import asyncio
import json
import time
class RelayEventSender:
    """Send events to Relay over a WebSocket with retry and error reporting.

    Every outcome is reported as a dict with a ``"status"`` key rather than
    by raising: ``"accepted"``, ``"error"``, ``"timeout"``, ``"unexpected"``
    or ``"exception"``.
    """

    # Error codes worth retrying with exponential backoff.
    RETRYABLE_ERRORS = {
        "AGENT_OFFLINE",
        "AGENT_TIMEOUT",
        "RATE_LIMITED",
        "RELAY_INTERNAL_ERROR",
    }

    # First backoff delay in seconds for codes that need a longer pause;
    # every other retryable code starts at DEFAULT_BASE_DELAY.
    BASE_DELAYS = {"RATE_LIMITED": 10}
    DEFAULT_BASE_DELAY = 1

    async def send_with_error_handling(
        self,
        websocket,
        agent_id,
        thread_id,
        payload,
        max_retries=3,
        response_timeout=10,
    ):
        """Send event with comprehensive error handling.

        Args:
            websocket: Connection exposing awaitable ``send(str)`` and ``recv()``.
            agent_id: Target agent identifier.
            thread_id: Thread identifier.
            payload: Event payload (a JSON-serializable dict).
            max_retries: Retries allowed after the first attempt; negative
                values are treated as 0.
            response_timeout: Seconds to wait for each response.

        Returns:
            A result dict, one of:
            ``{"status": "accepted", "event_id": ...}``;
            ``{"status": "error", "code", "error", "retryable"}``, plus
            ``"max_retries_exceeded": True`` when a retryable error outlived
            every retry;
            ``{"status": "timeout", "message": ...}``;
            ``{"status": "unexpected", "message_type": ...}``;
            ``{"status": "exception", "error": ...}`` for validation failures
            and any other exception.

        NOTE(review): assumes the next message received is the reply to the
        event just sent; responses are not matched by event_id — confirm that
        no other traffic shares this socket.
        """
        # Validate and serialize once: neither depends on the attempt number.
        try:
            self.validate_event(agent_id, thread_id, payload)
            message = json.dumps({
                "type": "event",
                "agent_id": agent_id,
                "thread_id": thread_id,
                "payload": payload,
            })
        except (TypeError, ValueError) as e:
            return {"status": "exception", "error": str(e)}

        max_retries = max(max_retries, 0)

        for attempt in range(max_retries + 1):
            try:
                # Send event
                await websocket.send(message)

                # Wait for response (could be accepted or error)
                response = await asyncio.wait_for(
                    websocket.recv(),
                    timeout=response_timeout,
                )
                data = json.loads(response)

                # Handle different response types
                if data["type"] == "accepted":
                    return {"status": "accepted", "event_id": data["event_id"]}

                if data["type"] != "error":
                    return {
                        "status": "unexpected",
                        "message_type": data["type"],
                    }

                error_code = data["code"]
                retryable = error_code in self.RETRYABLE_ERRORS

                if retryable and attempt < max_retries:
                    base_delay = self.BASE_DELAYS.get(
                        error_code, self.DEFAULT_BASE_DELAY
                    )
                    delay = base_delay * 2 ** attempt  # base, 2x, 4x, ...
                    print(f"Error {error_code}, retrying in {delay}s...")
                    await asyncio.sleep(delay)
                    continue

                # Either non-retryable (e.g. AGENT_NOT_ALLOWED) or out of retries.
                result = {
                    "status": "error",
                    "code": error_code,
                    "error": data["error"],
                    "retryable": retryable,
                }
                if retryable:
                    result["max_retries_exceeded"] = True
                return result

            except asyncio.TimeoutError:
                if attempt < max_retries:
                    await asyncio.sleep(2 ** attempt)
                    continue
                return {
                    "status": "timeout",
                    "message": "No response from Relay",
                }
            except Exception as e:
                # Deliberate catch-all: callers receive a result dict rather
                # than an exception from the transport or a malformed message.
                return {
                    "status": "exception",
                    "error": str(e),
                }

    def validate_event(self, agent_id, thread_id, payload):
        """Validate event structure before sending.

        Raises:
            ValueError: If ``agent_id`` or ``thread_id`` is not a non-empty
                string, ``payload`` is not a dict, or the JSON-serialized
                payload exceeds 64KB.
        """
        if not isinstance(agent_id, str) or not agent_id.strip():
            raise ValueError("agent_id must be non-empty string")
        if not isinstance(thread_id, str) or not thread_id.strip():
            raise ValueError("thread_id must be non-empty string")
        if not isinstance(payload, dict):
            raise ValueError("payload must be dict")
        size_bytes = len(json.dumps(payload).encode('utf-8'))
        if size_bytes > 64 * 1024:
            raise ValueError(f"Payload too large: {size_bytes} bytes > 64KB")
# Usage
sender = RelayEventSender()
result = await sender.send_with_error_handling(
websocket,
"athena",
"task-123",
{"task_id": "task-123", "message": "@athena summarize"}
)
if result["status"] == "accepted":
print(f"Event accepted: {result['event_id']}")
elif result.get("retryable"):
print(f"Retryable error: {result['code']} - {result['error']}")
else:
print(f"Non-retryable error: {result['code']} - {result['error']}")
Best Practices
Do
- Retry only retryable errors
- Use exponential backoff
- Validate payloads before sending
- Log errors with event_id
- Set reasonable timeouts (5-10s)
Don't
- Retry permission errors
- Retry invalid events
- Use fixed delays
- Ignore rate limits
- Retry indefinitely