feat: tool calling spatial reasoning tasks extension by jmatejcz · Pull Request #637 · RobotecAI/rai · GitHub
[go: up one dir, main page]
More Web Proxy on the site http://driver.im/
Skip to content

feat: tool calling spatial reasoning tasks extension #637

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 3 commits into
base: development
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10000
Original file line number Diff line number Diff line change
Expand Up @@ -32,66 +32,6 @@
)

IMG_PATH = "src/rai_bench/rai_bench/tool_calling_agent/predefined/images/"
true_response_inputs: List[BoolImageTaskInput] = [
BoolImageTaskInput(
question="Is the door on the left from the desk?",
images_paths=[IMG_PATH + "image_1.jpg"],
),
BoolImageTaskInput(
question="Is the light on in the room?",
images_paths=[IMG_PATH + "image_2.jpg"],
),
BoolImageTaskInput(
question="Do you see the plant?",
images_paths=[IMG_PATH + "image_2.jpg"],
),
BoolImageTaskInput(
question="Are there any pictures on the wall?",
images_paths=[IMG_PATH + "image_3.jpg"],
),
BoolImageTaskInput(
question="Are there 3 pictures on the wall?",
images_paths=[IMG_PATH + "image_4.jpg"],
),
BoolImageTaskInput(
question="Is there a plant behind the rack?",
images_paths=[IMG_PATH + "image_5.jpg"],
),
BoolImageTaskInput(
question="Is there a pillow on the armchain?",
images_paths=[IMG_PATH + "image_7.jpg"],
),
]
false_response_inputs: List[BoolImageTaskInput] = [
BoolImageTaskInput(
question="Is the door open?",
images_paths=[IMG_PATH + "image_1.jpg"],
),
BoolImageTaskInput(
question="Is someone in the room?",
images_paths=[IMG_PATH + "image_1.jpg"],
),
BoolImageTaskInput(
question="Do you see the plant?",
images_paths=[IMG_PATH + "image_3.jpg"],
),
BoolImageTaskInput(
question="Are there 4 pictures on the wall?",
images_paths=[IMG_PATH + "image_4.jpg"],
),
BoolImageTaskInput(
question="Is there a rack on the left from the sofa?",
images_paths=[IMG_PATH + "image_4.jpg"],
),
BoolImageTaskInput(
question="Is there a plant on the right from the window?",
images_paths=[IMG_PATH + "image_6.jpg"],
),
BoolImageTaskInput(
question="Is there a red pillow on the armchair?",
images_paths=[IMG_PATH + "image_7.jpg"],
),
]
########## SUBTASKS #################################################################
return_true_subtask = CheckArgsToolCallSubTask(
expected_tool_name="return_bool_response", expected_args={"response": True}
Expand Down Expand Up @@ -127,8 +67,8 @@ def get_spatial_tasks(
easy_true_inputs = [
# Single object presence/detection
BoolImageTaskInput(
question="Is the light on in the room?",
images_paths=[IMG_PATH + "image_2.jpg"],
question="Is the chair in the room?",
images_paths=[IMG_PATH + "image_1.jpg"],
),
BoolImageTaskInput(
question="Do you see the plant?", images_paths=[IMG_PATH + "image_2.jpg"]
Expand All @@ -138,8 +78,8 @@ def get_spatial_tasks(
images_paths=[IMG_PATH + "image_3.jpg"],
),
BoolImageTaskInput(
question="Is there a pillow on the armchain?",
images_paths=[IMG_PATH + "image_7.jpg"],
question="is there a TV in the room?",
images_paths=[IMG_PATH + "image_4.jpg"],
),
]

Expand All @@ -149,6 +89,18 @@ def get_spatial_tasks(
question="Are there 3 pictures on the wall?",
images_paths=[IMG_PATH + "image_4.jpg"],
),
BoolImageTaskInput(
question="Is the light on in the room?",
images_paths=[IMG_PATH + "image_2.jpg"],
),
BoolImageTaskInput(
question="Is the chair blue?",
images_paths=[IMG_PATH + "image_3.jpg"],
),
BoolImageTaskInput(
question="Is there something to sit on?",
images_paths=[IMG_PATH + "image_7.jpg"],
),
]

hard_true_inputs = [
Expand All @@ -161,6 +113,14 @@ def get_spatial_tasks(
question="Is there a plant behind the rack?",
images_paths=[IMG_PATH + "image_5.jpg"],
),
BoolImageTaskInput(
question="Is there a rug under the bed?",
images_paths=[IMG_PATH + "image_2.jpg"],
),
BoolImageTaskInput(
question="Is there a pillow on the armchain?",
images_paths=[IMG_PATH + "image_7.jpg"],
),
]

easy_false_inputs = [
Expand All @@ -175,6 +135,14 @@ def get_spatial_tasks(
question="Is there a red pillow on the armchair?",
images_paths=[IMG_PATH + "image_7.jpg"],
),
BoolImageTaskInput(
question="Is there a red desk with chair in the room?",
images_paths=[IMG_PATH + "image_5.jpg"],
),
BoolImageTaskInput(
question="Do you see the bed?",
images_paths=[IMG_PATH + "image_6.jpg"],
),
]

medium_false_inputs = [
Expand All @@ -186,6 +154,14 @@ def get_spatial_tasks(
question="Are there 4 pictures on the wall?",
images_paths=[IMG_PATH + "image_4.jpg"],
),
BoolImageTaskInput(
question="Is the TV switched on?",
images_paths=[IMG_PATH + "image_6.jpg"],
),
BoolImageTaskInput(
question="Is the window opened?",
images_paths=[IMG_PATH + "image_6.jpg"],
),
]

hard_false_inputs = [
Expand All @@ -198,6 +174,10 @@ def get_spatial_tasks(
question="Is there a plant on the right from the window?",
images_paths=[IMG_PATH + "image_6.jpg"],
),
BoolImageTaskInput(
question="Is the chair next to a bed?",
images_paths=[IMG_PATH + "image_1.jpg"],
),
]

for extra_calls in extra_tool_calls:
Expand Down
14 changes: 7 additions & 7 deletions src/rai_bench/rai_bench/tool_calling_agent/tasks/spatial.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,24 +25,24 @@

loggers_type = logging.Logger

SPATIAL_REASONING_SYSTEM_PROMPT_0_SHOT = """You are a helpful and knowledgeable AI assistant that specializes in interpreting and analyzing visual content. Your task is to answer questions based on the images provided to you. Please response with the use of the provided tools."""
SPATIAL_REASONING_SYSTEM_PROMPT_0_SHOT = """You are a helpful and knowledgeable AI assistant that specializes
in interpreting and analyzing visual content. Your task is to answer questions based
on the images provided to you. Please response with the use of the provided tools."""
# NOTE (jmatejcz): In this case we are using only one tool, so there is no difference between 2 and 5 shot,
# so I made 1 example in the '2 shot' prompt and 2 examples in the '5 shot' prompt.

SPATIAL_REASONING_SYSTEM_PROMPT_2_SHOT = (
SPATIAL_REASONING_SYSTEM_PROMPT_0_SHOT
+ """

Example of tool calls:
- return_bool_response, args: {'response': True}
- return_bool_response, args: {'response': False}"""
- return_bool_response, args: {'response': True}"""
)

# NOTE (jmatejcz): In this case we are using only one tool, so there is no difference between 2 and 5 shot.
SPATIAL_REASONING_SYSTEM_PROMPT_5_SHOT = (
SPATIAL_REASONING_SYSTEM_PROMPT_2_SHOT
+ """
- return_bool_response, args: {'response': True} # When object is clearly visible
- return_bool_response, args: {'response': False} # When object is not present
- return_bool_response, args: {'response': True} # When spatial relationship is correct"""
- return_bool_response, args: {'response': False}"""
)


Expand Down
0