Skip to content

Commit cd3dc7f

Browse files
authored
remove all images except the latest two in AnthropicCuaClient (#159)
* remove all images except the latest two in AnthropicCuaClient * format code * add changeset * lint
1 parent 8d55709 commit cd3dc7f

File tree

4 files changed

+111
-6
lines changed

4 files changed

+111
-6
lines changed
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
---
2+
"stagehand": patch
3+
---
4+
5+
Add support for Claude Sonnet 4 in the agent, and remove all images except the two most recent from the Anthropic CUA client's conversation history

stagehand/agent/agent.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,11 +19,13 @@
1919
"computer-use-preview": OpenAICUAClient,
2020
"claude-3-5-sonnet-latest": AnthropicCUAClient,
2121
"claude-3-7-sonnet-latest": AnthropicCUAClient,
22+
"claude-sonnet-4-20250514": AnthropicCUAClient,
2223
}
2324
MODEL_TO_PROVIDER_MAP: dict[str, AgentProvider] = {
2425
"computer-use-preview": AgentProvider.OPENAI,
2526
"claude-3-5-sonnet-20240620": AgentProvider.ANTHROPIC,
2627
"claude-3-7-sonnet-20250219": AgentProvider.ANTHROPIC,
28+
"claude-sonnet-4-20250514": AgentProvider.ANTHROPIC,
2729
# Add more mappings as needed
2830
}
2931

@@ -84,6 +86,7 @@ def _get_client(self) -> AgentClient:
8486
logger=self.logger,
8587
handler=self.cua_handler,
8688
viewport=self.viewport,
89+
experimental=self.stagehand.experimental,
8790
)
8891

8992
async def execute(

stagehand/agent/anthropic_cua.py

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
Point,
1919
)
2020
from .client import AgentClient
21+
from .image_compression_utils import compress_conversation_images
2122

2223
load_dotenv()
2324

@@ -51,9 +52,11 @@ def __init__(
5152
logger: Optional[Any] = None,
5253
handler: Optional[CUAHandler] = None,
5354
viewport: Optional[dict[str, int]] = None,
55+
experimental: bool = False,
5456
**kwargs,
5557
):
5658
super().__init__(model, instructions, config, logger, handler)
59+
self.experimental = experimental
5760
self.anthropic_sdk_client = Anthropic(
5861
api_key=config.options.get("apiKey") or os.getenv("ANTHROPIC_API_KEY")
5962
)
@@ -67,14 +70,14 @@ def __init__(
6770
if hasattr(self.config, "display_height") and self.config.display_height is not None: # type: ignore
6871
dimensions[1] = self.config.display_height # type: ignore
6972
computer_tool_type = (
70-
"computer_20250124"
71-
if model == "claude-3-7-sonnet-latest"
72-
else "computer_20241022"
73+
"computer_20241022"
74+
if model == "claude-3-5-sonnet-latest"
75+
else "computer_20250124"
7376
)
7477
self.beta_flag = (
75-
["computer-use-2025-01-24"]
76-
if model == "claude-3-7-sonnet-latest"
77-
else ["computer-use-2024-10-22"]
78+
["computer-use-2024-10-22"]
79+
if model == "claude-3-5-sonnet-latest"
80+
else ["computer-use-2025-01-24"]
7881
)
7982
self.tools = [
8083
{
@@ -162,6 +165,9 @@ async def run_task(
162165

163166
start_time = asyncio.get_event_loop().time()
164167
try:
168+
if self.experimental:
169+
compress_conversation_images(current_messages)
170+
165171
response = self.anthropic_sdk_client.beta.messages.create(
166172
model=self.model,
167173
max_tokens=self.max_tokens,
Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
from typing import Any
2+
3+
4+
def find_items_with_images(items: list[dict[str, Any]]) -> list[int]:
    """
    Finds all items in the conversation history that contain images

    An item is considered to contain an image when its ``content`` is a list
    holding at least one ``tool_result`` dict whose own ``content`` is a list
    with at least one dict of type ``image``.

    Args:
        items: Array of conversation items to check. Entries that are not
            dicts are skipped rather than raising.

    Returns:
        Array of indices (ascending) where images were found
    """

    def _is_image_tool_result(block: Any) -> bool:
        # Only dict-shaped tool results can carry a screenshot payload.
        if not isinstance(block, dict) or block.get("type") != "tool_result":
            return False
        nested = block.get("content")
        # A string payload (e.g. an already-compressed placeholder) has no image.
        return isinstance(nested, list) and any(
            isinstance(part, dict) and part.get("type") == "image" for part in nested
        )

    return [
        index
        for index, item in enumerate(items)
        # Guard the top-level item the same way the nested levels are guarded.
        if isinstance(item, dict)
        and isinstance(item.get("content"), list)
        and any(_is_image_tool_result(block) for block in item["content"])
    ]
37+
38+
39+
def compress_conversation_images(
    items: list[dict[str, Any]], keep_most_recent_count: int = 2
) -> dict[str, list[dict[str, Any]]]:
    """
    Compresses conversation history by removing images from older items
    while keeping the most recent images intact

    The items are modified in place: for every image-bearing item that is
    not among the ``keep_most_recent_count`` most recent ones, each
    ``tool_result`` entry that contains an image gets its ``content``
    replaced by the text placeholder ``"screenshot taken"``. All other
    entries of the item are left as they are.

    Args:
        items: Array of conversation items to process
        keep_most_recent_count: Number of most recent image-containing items to preserve (default: 2).
            A value of zero or less strips the images from every item.

    Returns:
        Dictionary with processed items (the same list object, under the key ``"items"``)
    """

    def _is_image_tool_result(block: Any) -> bool:
        # True for a tool_result dict whose list payload includes an image dict.
        if not isinstance(block, dict) or block.get("type") != "tool_result":
            return False
        nested = block.get("content")
        return isinstance(nested, list) and any(
            isinstance(part, dict) and part.get("type") == "image" for part in nested
        )

    # Walk newest-to-oldest so the items to preserve can be counted down in a
    # single pass, with no per-item membership/index lookups.
    remaining_to_keep = keep_most_recent_count
    for item in reversed(items):
        content = item.get("content") if isinstance(item, dict) else None
        if not isinstance(content, list):
            continue
        if not any(_is_image_tool_result(block) for block in content):
            continue

        if remaining_to_keep > 0:
            remaining_to_keep -= 1
            continue

        # Build a fresh list instead of editing the existing one, so any other
        # holder of the old content list is unaffected.
        item["content"] = [
            # Replace the content with a text placeholder
            {**block, "content": "screenshot taken"}
            if _is_image_tool_result(block)
            else block
            for block in content
        ]

    return {"items": items}

0 commit comments

Comments
 (0)