8000 feat: add support for complex types in component processing by YassinNouh21 · Pull Request #9305 · deepset-ai/haystack · GitHub
[go: up one dir, main page]
More Web Proxy on the site http://driver.im/
Skip to content

feat: add support for complex types in component processing #9305

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
71 changes: 65 additions & 6 deletions haystack/tools/component_tool.py
Original file line number Diff line number Diff line change
Expand Up @@ -371,8 +371,8 @@ def _create_dataclass_schema(self, python_type: Any, description: str) -> Dict[s
cls = python_type if isinstance(python_type, type) else python_type.__class__
for field in fields(cls):
field_description = f"Field '{field.name}' of '{cls.__name__}'."
if isinstance(schema["properties"], dict):
schema["properties"][field.name] = self._create_property_schema(field.type, field_description)
field_schema = self._create_property_schema(field.type, field_description)
schema["properties"][field.name] = field_schema
return schema

@staticmethod
Expand All @@ -384,8 +384,43 @@ def _create_basic_type_schema(python_type: Any, description: str) -> Dict[str, A
:param description: The description of the type.
:returns: A dictionary representing the basic type schema.
"""
type_mapping = {str: "string", int: "integer", float: "number", bool: "boolean", dict: "object"}
return {"type": type_mapping.get(python_type, "string"), "description": description}
type_mapping = {str: "string", int: "integer", float: "number", bool: "boolean"}
schema = {"type": type_mapping.get(python_type, "string"), "description": description}
return schema

def _create_union_schema(self, types: tuple, description: str) -> Dict[str, Any]:
"""
Creates a schema for a Union type.

:param types: The types in the Union.
:param description: The description of the Union.
:returns: A dictionary representing the Union schema.
"""
schemas = []
for arg_type in types:
if arg_type is not type(None):
# Special case: dict or list of dicts
if arg_type is dict or (get_origin(arg_type) is dict):
arg_schema = {"type": "object", "additionalProperties": True}
elif get_origin(arg_type) is list:
item_type = get_args(arg_type)[0] if get_args(arg_type) else Any
if item_type is dict or (get_origin(item_type) is dict):
items_schema = {"type": "object", "additionalProperties": True}
else:
items_schema = self._create_property_schema(item_type, "")
items_schema.pop("description", None)
arg_schema = {"type": "array", "items": items_schema}
else:
arg_schema = self._create_property_schema(arg_type, "")
arg_schema.pop("description", None)
schemas.append(arg_schema)

if len(schemas) == 1:
schema = schemas[0]
schema["description"] = description
else:
schema = {"oneOf": schemas, "description": description}
return schema

def _create_property_schema(self, python_type: Any, description: str, default: Any = None) -> Dict[str, Any]:
"""
Expand All @@ -403,15 +438,39 @@ def _create_property_schema(self, python_type: Any, description: str, default: A
python_type = non_none_types[0] if non_none_types else str

origin = get_origin(python_type)
if origin is list:
schema = self._create_list_schema(get_args(python_type)[0] if get_args(python_type) else Any, description)
args = get_args(python_type)

# Handle Dict[str, Any] as a special case for meta fields
if origin is dict and args and args[0] is str and args[1] is Any:
if description and "meta" in description.lower():
schema = {"type": "string", "description": description}
else:
schema = {"type": "object", "description": description, "additionalProperties": True}
Comment on lines +444 to +448
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Out of curiosity, why do we need this special case?

Copy link
Contributor Author
@YassinNouh21 YassinNouh21 Apr 25, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good question! The reason we single out pure Dict[str, Any] parameters whose description mentions “meta” is that metadata blobs by definition have an arbitrary, unknown shape, and we don’t want to bake an open-ended object schema (with unpredictable keys) into our function spec.

  • If I treated metadata as a normal object with additionalProperties: true, under tools_strict I would have to enumerate every possible metadata key (which we can’t know ahead of time), and LLMs often struggle to fill such free-form nested schemas.
  • All other dicts (that aren’t metadata) still fall back to an object schema with additionalProperties: true.

# Handle other dict types
elif python_type is dict or (origin is dict):
schema = {"type": "object", "description": description, "additionalProperties": True}
# Handle list
elif origin is list:
item_type = args[0] if args else Any
# Special case: list of dicts
if item_type is dict or (get_origin(item_type) is dict):
items_schema = {"type": "object", "additionalProperties": True}
else:
items_schema = self._create_property_schema(item_type, "")
items_schema.pop("description", None)
schema = {"type": "array", "description": description, "items": items_schema}
# Handle dataclass
elif is_dataclass(python_type):
schema = self._create_dataclass_schema(python_type, description)
# Handle Pydantic v2 models (unsupported)
elif hasattr(python_type, "model_validate"):
raise SchemaGenerationError(
f"Pydantic models (e.g. {python_type.__name__}) are not supported as input types for "
f"component's run method."
)
# Handle Union (including Optional)
elif origin is Union:
schema = self._create_union_schema(args, description)
else:
schema = self._create_basic_type_schema(python_type, description)

Expand Down
55 changes: 54 additions & 1 deletion test/tools/test_component_tool.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import os
from copy import deepcopy
from dataclasses import dataclass
from typing import Dict, List
from typing import Dict, List, Union, Optional, Any

import pytest

Expand Down Expand Up @@ -122,6 +122,26 @@ def run(self, documents: List[Document], top_k: int = 5) -> Dict[str, str]:
return {"concatenated": "\n".join(doc.content for doc in documents[:top_k])}


@component
class ComplexTypeProcessor:
    """A component that processes complex types."""

    @component.output_types(result=str)
    def run(
        self, meta: Union[Dict[str, Any], List[Dict[str, Any]]] = None, extraction_kwargs: Optional[Dict[str, Any]] = None
    ) -> Dict[str, str]:
        """
        Processes complex types like dictionaries and unions.

        :param meta: Optional metadata to attach, can be a dictionary or list of dictionaries
        :param extraction_kwargs: Optional dictionary containing keyword arguments to customize the extraction process
        :return: A dictionary with the result
        """
        # NOTE(review): the docstring wording is left untouched on purpose; it appears to
        # feed the generated tool parameter descriptions, and the accompanying tool test
        # asserts that the `meta` description starts with "Optional metadata" - confirm.

        # Falsy inputs (None, empty dict, empty list) are reported with a placeholder.
        if meta:
            meta_str = str(meta)
        else:
            meta_str = "No metadata"

        if extraction_kwargs:
            kwargs_str = str(extraction_kwargs)
        else:
            kwargs_str = "No kwargs"

        return {"result": f"Meta: {meta_str}, Kwargs: {kwargs_str}"}


def output_handler(old, new):
"""
Output handler to test serialization.
Expand Down Expand Up @@ -335,6 +355,39 @@ def foo(self, text: str):
with pytest.raises(ValueError):
ComponentTool(component=not_a_component, name="invalid_tool", description="This should fail")

def test_from_component_with_complex_types(self):
component = ComplexTypeProcessor()

tool = ComponentTool(component=component)

# Check the parameter schema
assert "meta" in tool.parameters["properties"]
assert "extraction_kwargs" in tool.parameters["properties"]

# Meta should be oneOf with both object and array options
meta_schema = tool.parameters["properties"]["meta"]
assert meta_schema["description"].startswith("Optional metadata")
assert "oneOf" in meta_schema

# extraction_kwargs should be an object
kwargs_schema = tool.parameters["properties"]["extraction_kwargs"]
assert kwargs_schema["type"] == "object"
assert "additionalProperties" in kwargs_schema
assert kwargs_schema["additionalProperties"] is True
Comment on lines +363 to +376
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I appreciate the additional comments and individual asserts, but could we update this to a full dict comparison? So:

assert tool.parameters["properties"] == {...}

I think that would make it easier to judge at a glance.

Copy link
Contributor Author
@YassinNouh21 YassinNouh21 Apr 24, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it is better for readability. But no problem, I will do it


# Test tool invocation with dict
result = tool.invoke(meta={"source": "web"}, extraction_kwargs={"timeframe": "last month"})
assert isinstance(result, dict)
assert "result" in result
assert "Meta: {'source': 'web'}" in result["result"]
assert "Kwargs: {'timeframe': 'last month'}" in result["result"]

# Test tool invocation with list of dicts
result = tool.invoke(meta=[{"id": 1}, {"id": 2}])
assert isinstance(result, dict)
assert "result" in result
assert "Meta: [{'id': 1}, {'id': 2}]" in result["result"]


## Integration tests
class TestToolComponentInPipelineWithOpenAI:
Expand Down
0