11 KiB
Claude API — Python
Installation
pip install anthropic
Client Initialization
import anthropic
# Default (uses ANTHROPIC_API_KEY env var)
client = anthropic.Anthropic()
# Explicit API key
client = anthropic.Anthropic(api_key="your-api-key")
# Async client
async_client = anthropic.AsyncAnthropic()
Basic Message Request
response = client.messages.create(
model="claude-opus-4-6",
max_tokens=1024,
messages=[
{"role": "user", "content": "What is the capital of France?"}
]
)
print(response.content[0].text)
System Prompts
response = client.messages.create(
model="claude-opus-4-6",
max_tokens=1024,
system="You are a helpful coding assistant. Always provide examples in Python.",
messages=[{"role": "user", "content": "How do I read a JSON file?"}]
)
Vision (Images)
Base64
import base64
with open("image.png", "rb") as f:
image_data = base64.standard_b64encode(f.read()).decode("utf-8")
response = client.messages.create(
model="claude-opus-4-6",
max_tokens=1024,
messages=[{
"role": "user",
"content": [
{
"type": "image",
"source": {
"type": "base64",
"media_type": "image/png",
"data": image_data
}
},
{"type": "text", "text": "What's in this image?"}
]
}]
)
URL
response = client.messages.create(
model="claude-opus-4-6",
max_tokens=1024,
messages=[{
"role": "user",
"content": [
{
"type": "image",
"source": {
"type": "url",
"url": "https://example.com/image.png"
}
},
{"type": "text", "text": "Describe this image"}
]
}]
)
Prompt Caching
Cache large context to reduce costs (up to 90% savings).
Automatic Caching (Recommended)
Use top-level cache_control to automatically cache the last cacheable block in the request — no need to annotate individual content blocks:
response = client.messages.create(
model="claude-opus-4-6",
max_tokens=1024,
cache_control={"type": "ephemeral"}, # auto-caches the last cacheable block
system="You are an expert on this large document...",
messages=[{"role": "user", "content": "Summarize the key points"}]
)
Manual Cache Control
For fine-grained control, add cache_control to specific content blocks:
response = client.messages.create(
model="claude-opus-4-6",
max_tokens=1024,
system=[{
"type": "text",
"text": "You are an expert on this large document...",
"cache_control": {"type": "ephemeral"} # default TTL is 5 minutes
}],
messages=[{"role": "user", "content": "Summarize the key points"}]
)
# With explicit TTL (time-to-live)
response = client.messages.create(
model="claude-opus-4-6",
max_tokens=1024,
system=[{
"type": "text",
"text": "You are an expert on this large document...",
"cache_control": {"type": "ephemeral", "ttl": "1h"} # 1 hour TTL
}],
messages=[{"role": "user", "content": "Summarize the key points"}]
)
Extended Thinking
Opus 4.6 and Sonnet 4.6: Use adaptive thinking.
`budget_tokens` is deprecated on both Opus 4.6 and Sonnet 4.6. Older models: use `thinking={"type": "enabled", "budget_tokens": N}` (must be less than max_tokens, minimum 1024).
# Opus 4.6: adaptive thinking (recommended)
response = client.messages.create(
model="claude-opus-4-6",
max_tokens=16000,
thinking={"type": "adaptive"},
output_config={"effort": "high"}, # low | medium | high | max
messages=[{"role": "user", "content": "Solve this step by step..."}]
)
# Access thinking and response
for block in response.content:
if block.type == "thinking":
print(f"Thinking: {block.thinking}")
elif block.type == "text":
print(f"Response: {block.text}")
Error Handling
import anthropic
try:
response = client.messages.create(...)
except anthropic.BadRequestError as e:
print(f"Bad request: {e.message}")
except anthropic.AuthenticationError:
print("Invalid API key")
except anthropic.PermissionDeniedError:
print("API key lacks required permissions")
except anthropic.NotFoundError:
print("Invalid model or endpoint")
except anthropic.RateLimitError as e:
retry_after = int(e.response.headers.get("retry-after", "60"))
print(f"Rate limited. Retry after {retry_after}s.")
except anthropic.APIStatusError as e:
if e.status_code >= 500:
print(f"Server error ({e.status_code}). Retry later.")
else:
print(f"API error: {e.message}")
except anthropic.APIConnectionError:
print("Network error. Check internet connection.")
Multi-Turn Conversations
The API is stateless — send the full conversation history each time.
class ConversationManager:
    """Manage multi-turn conversations with the Claude API.

    The Messages API is stateless, so this class accumulates the full
    message history and resends it with every request.
    """

    def __init__(self, client: "anthropic.Anthropic", model: str, system: str = None):
        """Store the client, model name, optional system prompt, and an empty history."""
        self.client = client
        self.model = model
        self.system = system
        self.messages = []

    def send(self, user_message: str, **kwargs) -> str:
        """Send a message and return the assistant's text reply.

        Args:
            user_message: The user's next turn.
            **kwargs: Extra arguments forwarded to ``messages.create``
                (e.g. ``max_tokens``, ``temperature``).

        Returns:
            The text of the first content block in the response.
        """
        self.messages.append({"role": "user", "content": user_message})
        # pop (not get) so max_tokens isn't passed twice via **kwargs,
        # which would raise TypeError for any caller that supplies it.
        max_tokens = kwargs.pop("max_tokens", 1024)
        try:
            response = self.client.messages.create(
                model=self.model,
                max_tokens=max_tokens,
                system=self.system,
                messages=self.messages,
                **kwargs,
            )
        except Exception:
            # Roll back the user turn so a failed call doesn't corrupt
            # the alternating user/assistant history.
            self.messages.pop()
            raise
        assistant_message = response.content[0].text
        self.messages.append({"role": "assistant", "content": assistant_message})
        return assistant_message
# Usage
conversation = ConversationManager(
client=anthropic.Anthropic(),
model="claude-opus-4-6",
system="You are a helpful assistant."
)
response1 = conversation.send("My name is Alice.")
response2 = conversation.send("What's my name?") # Claude remembers "Alice"
Rules:
- Messages must alternate between `user` and `assistant`
- First message must be `user`
Compaction (long conversations)
Beta, Opus 4.6 only. When conversations approach the 200K context window, compaction automatically summarizes earlier context server-side. The API returns a
`compaction` block; you must pass it back on subsequent requests — append `response.content`, not just the text.
import anthropic
client = anthropic.Anthropic()
messages = []
def chat(user_message: str) -> str:
    """Send one user turn and return the text of the assistant's reply.

    Appends both the user turn and the assistant's *full* content list
    (including any compaction blocks) to the module-level history so
    server-side compaction state survives round trips.
    """
    messages.append({"role": "user", "content": user_message})
    request = {
        "betas": ["compact-2026-01-12"],
        "model": "claude-opus-4-6",
        "max_tokens": 4096,
        "messages": messages,
        "context_management": {
            "edits": [{"type": "compact_20260112"}]
        },
    }
    reply = client.beta.messages.create(**request)
    # Preserve compaction blocks: store the whole content list, not just text.
    messages.append({"role": "assistant", "content": reply.content})
    text_blocks = (part.text for part in reply.content if part.type == "text")
    return next(text_blocks)
# Compaction triggers automatically when context grows large
print(chat("Help me build a Python web scraper"))
print(chat("Add support for JavaScript-rendered pages"))
print(chat("Now add rate limiting and error handling"))
Stop Reasons
The stop_reason field in the response indicates why the model stopped generating:
| Value | Meaning |
|---|---|
end_turn |
Claude finished its response naturally |
max_tokens |
Hit the max_tokens limit — increase it or use streaming |
stop_sequence |
Hit a custom stop sequence |
tool_use |
Claude wants to call a tool — execute it and continue |
pause_turn |
Model paused and can be resumed (agentic flows) |
refusal |
Claude refused for safety reasons — output may not match your schema |
Cost Optimization Strategies
1. Use Prompt Caching for Repeated Context
# Automatic caching (simplest — caches the last cacheable block)
response = client.messages.create(
model="claude-opus-4-6",
max_tokens=1024,
cache_control={"type": "ephemeral"},
system=large_document_text, # e.g., 50KB of context
messages=[{"role": "user", "content": "Summarize the key points"}]
)
# First request: full cost
# Subsequent requests: ~90% cheaper for cached portion
2. Choose the Right Model
# Default to Opus for most tasks
response = client.messages.create(
model="claude-opus-4-6", # $5.00/$25.00 per 1M tokens
max_tokens=1024,
messages=[{"role": "user", "content": "Explain quantum computing"}]
)
# Use Sonnet for high-volume production workloads
standard_response = client.messages.create(
model="claude-sonnet-4-6", # $3.00/$15.00 per 1M tokens
max_tokens=1024,
messages=[{"role": "user", "content": "Summarize this document"}]
)
# Use Haiku only for simple, speed-critical tasks
simple_response = client.messages.create(
model="claude-haiku-4-5", # $1.00/$5.00 per 1M tokens
max_tokens=256,
messages=[{"role": "user", "content": "Classify this as positive or negative"}]
)
3. Use Token Counting Before Requests
count_response = client.messages.count_tokens(
model="claude-opus-4-6",
messages=messages,
system=system
)
estimated_input_cost = count_response.input_tokens * 0.000005 # $5/1M tokens
print(f"Estimated input cost: ${estimated_input_cost:.4f}")
Retry with Exponential Backoff
Note: The Anthropic SDK automatically retries rate limit (429) and server errors (5xx) with exponential backoff. You can configure this with
`max_retries` (default: 2). Only implement custom retry logic if you need behavior beyond what the SDK provides.
import time
import random
import anthropic
def call_with_retry(
    client: "anthropic.Anthropic",
    max_retries: int = 5,
    base_delay: float = 1.0,
    max_delay: float = 60.0,
    **kwargs
):
    """Call ``client.messages.create`` with exponential backoff retry.

    Retries rate-limit (429) and server (5xx) errors up to ``max_retries``
    attempts; any other client error is raised immediately.

    Args:
        client: Initialized Anthropic client.
        max_retries: Total number of attempts before giving up (must be >= 1).
        base_delay: Initial backoff delay in seconds.
        max_delay: Upper bound on any single backoff delay.
        **kwargs: Forwarded to ``client.messages.create`` (model, messages, ...).

    Returns:
        The API response from the first successful attempt.

    Raises:
        ValueError: If ``max_retries`` is less than 1.
        anthropic.RateLimitError | anthropic.APIStatusError: The last
            retryable error once all attempts are exhausted.
    """
    if max_retries < 1:
        # Without this guard, zero attempts would reach `raise None` below.
        raise ValueError("max_retries must be >= 1")
    last_exception = None
    for attempt in range(max_retries):
        try:
            return client.messages.create(**kwargs)
        except anthropic.RateLimitError as e:
            last_exception = e
        except anthropic.APIStatusError as e:
            if e.status_code >= 500:
                last_exception = e
            else:
                raise  # Client errors (4xx except 429) should not be retried
        if attempt == max_retries - 1:
            break  # No point sleeping after the final failed attempt.
        # Full-jitter exponential backoff, capped at max_delay.
        delay = min(base_delay * (2 ** attempt) + random.uniform(0, 1), max_delay)
        print(f"Retry {attempt + 1}/{max_retries} after {delay:.1f}s")
        time.sleep(delay)
    raise last_exception