structured output - multi-modal input by pgrayy · Pull Request #405 · strands-agents/sdk-python · GitHub

structured output - multi-modal input #405


Merged · 2 commits · Jul 10, 2025
13 changes: 8 additions & 5 deletions src/strands/agent/agent.py
@@ -380,13 +380,13 @@ async def invoke_async(self, prompt: Union[str, list[ContentBlock]], **kwargs: A

return cast(AgentResult, event["result"])

def structured_output(self, output_model: Type[T], prompt: Optional[str] = None) -> T:
def structured_output(self, output_model: Type[T], prompt: Optional[Union[str, list[ContentBlock]]] = None) -> T:
"""This method allows you to get structured output from the agent.

If you pass in a prompt, it will be added to the conversation history and the agent will respond to it.
If you don't pass in a prompt, it will use only the conversation history to respond.

For smaller models, you may want to use the optional prompt string to add additional instructions to explicitly
For smaller models, you may want to use the optional prompt to add additional instructions to explicitly
instruct the model to output the structured data.

Args:
@@ -405,13 +405,15 @@ def execute() -> T:
future = executor.submit(execute)
return future.result()

async def structured_output_async(self, output_model: Type[T], prompt: Optional[str] = None) -> T:
async def structured_output_async(
self, output_model: Type[T], prompt: Optional[Union[str, list[ContentBlock]]] = None
) -> T:
"""This method allows you to get structured output from the agent.

If you pass in a prompt, it will be added to the conversation history and the agent will respond to it.
If you don't pass in a prompt, it will use only the conversation history to respond.

For smaller models, you may want to use the optional prompt string to add additional instructions to explicitly
For smaller models, you may want to use the optional prompt to add additional instructions to explicitly
instruct the model to output the structured data.

Args:
@@ -430,7 +432,8 @@ async def structured_output_async(self, output_model: Type[T], prompt: Optional[

# add the prompt as the last message
if prompt:
self._append_message({"role": "user", "content": [{"text": prompt}]})
content: list[ContentBlock] = [{"text": prompt}] if isinstance(prompt, str) else prompt
self._append_message({"role": "user", "content": content})

events = self.model.structured_output(output_model, self.messages)
async for event in events:
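With this change, Agent.structured_output and Agent.structured_output_async accept either a plain string or a list of ContentBlock dicts, so a prompt can mix text and image blocks. A minimal usage sketch, assuming the public strands API; PersonInfo and person.png are illustrative, not part of this PR:

# Hypothetical usage sketch: PersonInfo and person.png are placeholders;
# the ContentBlock shapes mirror the ones exercised in the tests below.
from pydantic import BaseModel

from strands import Agent


class PersonInfo(BaseModel):
    """Structured description of a person."""

    name: str
    age: int


agent = Agent()

with open("person.png", "rb") as fp:  # placeholder image file
    image_bytes = fp.read()

person = agent.structured_output(
    PersonInfo,
    [
        {"text": "Please describe the person in this image"},
        {"image": {"format": "png", "source": {"bytes": image_bytes}}},
    ],
)
print(person.name, person.age)

Internally the string case is normalized to [{"text": prompt}] before being appended to the conversation, so the model provider always receives a list of content blocks.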
22 changes: 22 additions & 0 deletions tests/strands/agent/test_agent.py
@@ -959,6 +959,28 @@ def test_agent_structured_output(agent, user, agenerator):
agent.model.structured_output.assert_called_once_with(type(user), [{"role": "user", "content": [{"text": prompt}]}])


def test_agent_structured_output_multi_modal_input(agent, user, agenerator):
agent.model.structured_output = unittest.mock.Mock(return_value=agenerator([{"output": user}]))

prompt = [
{"text": "Please describe the user in this image"},
{
"image": {
"format": "png",
"source": {
"bytes": b"\x89PNG\r\n\x1a\n",
},
}
},
]

tru_result = agent.structured_output(type(user), prompt)
exp_result = user
assert tru_result == exp_result

agent.model.structured_output.assert_called_once_with(type(user), [{"role": "user", "content": prompt}])


@pytest.mark.asyncio
async def test_agent_structured_output_in_async_context(agent, user, agenerator):
agent.model.structured_output = unittest.mock.Mock(return_value=agenerator([{"output": user}]))
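The async variant takes the same prompt shapes. A sketch under the same assumptions (PersonInfo and image_bytes as in the sketch above); the sync structured_output submits its work to an executor thread (the execute()/future.result() hunk above), which appears to be the path the in-async-context test exercises:

# Hypothetical async usage sketch; assumes PersonInfo and image_bytes
# from the previous sketch.
import asyncio


async def main() -> None:
    agent = Agent()
    person = await agent.structured_output_async(
        PersonInfo,
        [
            {"text": "Describe the person in this image"},
            {"image": {"format": "png", "source": {"bytes": image_bytes}}},
        ],
    )
    print(person.name, person.age)


asyncio.run(main())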
39 changes: 33 additions & 6 deletions tests_integ/models/test_model_anthropic.py
@@ -12,7 +12,7 @@
pytestmark = providers.anthropic.mark


@pytest.fixture(scope="module")
@pytest.fixture
def model():
return AnthropicModel(
client_args={
@@ -23,7 +23,7 @@ def model():
)


@pytest.fixture(scope="module")
@pytest.fixture
def tools():
@strands.tool
def tool_time() -> str:
@@ -36,17 +36,17 @@ def tool_weather() -> str:
return [tool_time, tool_weather]


@pytest.fixture(scope="module")
@pytest.fixture
def system_prompt():
return "You are an AI assistant."


@pytest.fixture(scope="module")
@pytest.fixture
def agent(model, tools, system_prompt):
return Agent(model=model, tools=tools, system_prompt=system_prompt)


@pytest.fixture(scope="module")
@pytest.fixture
def weather():
class Weather(BaseModel):
"""Extracts the time and weather from the user's message with the exact strings."""
@@ -57,6 +57,16 @@ class Weather(BaseModel):
return Weather(time="12:00", weather="sunny")


@pytest.fixture
def yellow_color():
class Color(BaseModel):
"""Describes a color."""

name: str

return Color(name="yellow")


def test_agent_invoke(agent):
result = agent("What is the time and weather in New York?")
text = result.message["content"][0]["text"].lower()
@@ -97,7 +107,7 @@ async def test_agent_structured_output_async(agent, weather):
assert tru_weather == exp_weather


def test_multi_modal_input(agent, yellow_img):
def test_invoke_multi_modal_input(agent, yellow_img):
content = [
{"text": "what is in this image"},
{
@@ -113,3 +123,20 @@ def test_multi_modal_input(agent, yellow_img):
text = result.message["content"][0]["text"].lower()

assert "yellow" in text


def test_structured_output_multi_modal_input(agent, yellow_img, yellow_color):
content = [
{"text": "Is this image red, blue, or yellow?"},
{
"image": {
"format": "png",
"source": {
"bytes": yellow_img,
},
},
},
]
tru_color = agent.structured_output(type(yellow_color), content)
exp_color = yellow_color
assert tru_color == exp_color
29 changes: 28 additions & 1 deletion tests_integ/models/test_model_bedrock.py
@@ -37,6 +37,16 @@ def non_streaming_agent(non_streaming_model, system_prompt):
return Agent(model=non_streaming_model, system_prompt=system_prompt, load_tools_from_directory=False)


@pytest.fixture
def yellow_color():
class Color(BaseModel):
"""Describes a color."""

name: str

return Color(name="yellow")


def test_streaming_agent(streaming_agent):
"""Test agent with streaming model."""
result = streaming_agent("Hello!")
@@ -153,7 +163,7 @@ class Weather(BaseModel):
assert result.weather == "sunny"


def test_multi_modal_input(streaming_agent, yellow_img):
def test_invoke_multi_modal_input(streaming_agent, yellow_img):
content = [
{"text": "what is in this image"},
{
@@ -169,3 +179,20 @@ def test_multi_modal_input(streaming_agent, yellow_img):
text = result.message["content"][0]["text"].lower()

assert "yellow" in text


def test_structured_output_multi_modal_input(streaming_agent, yellow_img, yellow_color):
content = [
{"text": "Is this image red, blue, or yellow?"},
{
"image": {
"format": "png",
"source": {
"bytes": yellow_img,
},
},
},
]
tru_color = streaming_agent.structured_output(type(yellow_color), content)
exp_color = yellow_color
assert tru_color == exp_color
31 changes: 29 additions & 2 deletions tests_integ/models/test_model_litellm.py
@@ -29,6 +29,16 @@ def agent(model, tools):
return Agent(model=model, tools=tools)


@pytest.fixture
def yellow_color():
class Color(BaseModel):
"""Describes a color."""

name: str

return Color(name="yellow")


def test_agent(agent):
result = agent("What is the time and weather in New York?")
text = result.message["content"][0]["text"].lower()
@@ -49,9 +59,9 @@ class Weather(BaseModel):
assert result.weather == "sunny"


def test_multi_modal_input(agent, yellow_img):
def test_invoke_multi_modal_input(agent, yellow_img):
content = [
{"text": "what is in this image"},
{"text": "Is this image red, blue, or yellow?"},
{
"image": {
"format": "png",
@@ -65,3 +75,20 @@ def test_multi_modal_input(agent, yellow_img):
text = result.message["content"][0]["text"].lower()

assert "yellow" in text


def test_structured_output_multi_modal_input(agent, yellow_img, yellow_color):
content = [
{"text": "what is in this image"},
{
"image": {
"format": "png",
"source": {
"bytes": yellow_img,
},
},
},
]
tru_color = agent.structured_output(type(yellow_color), content)
exp_color = yellow_color
assert tru_color == exp_color
8 changes: 4 additions & 4 deletions tests_integ/models/test_model_ollama.py
@@ -10,12 +10,12 @@
pytestmark = providers.ollama.mark


@pytest.fixture(scope="module")
@pytest.fixture
def model():
return OllamaModel(host="http://localhost:11434", model_id="llama3.3:70b")


@pytest.fixture(scope="module")
@pytest.fixture
def tools():
@strands.tool
def tool_time() -> str:
@@ -28,12 +28,12 @@ def tool_weather() -> str:
return [tool_time, tool_weather]


@pytest.fixture(scope="module")
@pytest.fixture
def agent(model, tools):
return Agent(model=model, tools=tools)


@pytest.fixture(scope="module")
@pytest.fixture
def weather():
class Weather(BaseModel):
"""Extracts the time and weather from the user's message with the exact strings."""
37 changes: 32 additions & 5 deletions tests_integ/models/test_model_openai.py
@@ -12,7 +12,7 @@
pytestmark = providers.openai.mark


@pytest.fixture(scope="module")
@pytest.fixture
def model():
return OpenAIModel(
model_id="gpt-4o",
@@ -22,7 +22,7 @@ def model():
)


@pytest.fixture(scope="module")
@pytest.fixture
def tools():
@strands.tool
def tool_time() -> str:
@@ -35,12 +35,12 @@ def tool_weather() -> str:
return [tool_time, tool_weather]


@pytest.fixture(scope="module")
@pytest.fixture
def agent(model, tools):
return Agent(model=model, tools=tools)


@pytest.fixture(scope="module")
@pytest.fixture
def weather():
class Weather(BaseModel):
"""Extracts the time and weather from the user's message with the exact strings."""
@@ -51,6 +51,16 @@ class Weather(BaseModel):
return Weather(time="12:00", weather="sunny")


@pytest.fixture
def yellow_color():
class Color(BaseModel):
"""Describes a color."""

name: str

return Color(name="yellow")


@pytest.fixture(scope="module")
def test_image_path(request):
return request.config.rootpath / "tests_integ" / "test_image.png"
@@ -96,7 +106,7 @@ async def test_agent_structured_output_async(agent, weather):
assert tru_weather == exp_weather


def test_multi_modal_input(agent, yellow_img):
def test_invoke_multi_modal_input(agent, yellow_img):
content = [
{"text": "what is in this image"},
{
@@ -114,6 +124,23 @@ def test_multi_modal_input(agent, yellow_img):
assert "yellow" in text


def test_structured_output_multi_modal_input(agent, yellow_img, yellow_color):
content = [
{"text": "Is this image red, blue, or yellow?"},
{
"image": {
"format": "png",
"source": {
"bytes": yellow_img,
},
},
},
]
tru_color = agent.structured_output(type(yellow_color), content)
exp_color = yellow_color
assert tru_color == exp_color


@pytest.mark.skip("https://github.com/strands-agents/sdk-python/issues/320")
def test_tool_returning_images(model, yellow_img):
@tool