From a21f101eea00fa267906437dc3f301abb8843f11 Mon Sep 17 00:00:00 2001 From: jmatejcz Date: Fri, 20 Jun 2025 11:58:46 +0200 Subject: [PATCH 1/3] feat: defined more spatial tasks --- .../predefined/spatial_reasoning_tasks.py | 44 ++++++++++++++++++- 1 file changed, 42 insertions(+), 2 deletions(-) diff --git a/src/rai_bench/rai_bench/tool_calling_agent/predefined/spatial_reasoning_tasks.py b/src/rai_bench/rai_bench/tool_calling_agent/predefined/spatial_reasoning_tasks.py index f67d19792..ef9259d38 100644 --- a/src/rai_bench/rai_bench/tool_calling_agent/predefined/spatial_reasoning_tasks.py +++ b/src/rai_bench/rai_bench/tool_calling_agent/predefined/spatial_reasoning_tasks.py @@ -127,8 +127,8 @@ def get_spatial_tasks( easy_true_inputs = [ # Single object presence/detection BoolImageTaskInput( - question="Is the light on in the room?", - images_paths=[IMG_PATH + "image_2.jpg"], + question="Is the chair in the room?", + images_paths=[IMG_PATH + "image_1.jpg"], ), BoolImageTaskInput( question="Do you see the plant?", images_paths=[IMG_PATH + "image_2.jpg"] @@ -137,6 +137,10 @@ def get_spatial_tasks( question="Are there any pictures on the wall?", images_paths=[IMG_PATH + "image_3.jpg"], ), + BoolImageTaskInput( + question="is there a TV in the room?", + images_paths=[IMG_PATH + "image_4.jpg"], + ), BoolImageTaskInput( question="Is there a pillow on the armchain?", images_paths=[IMG_PATH + "image_7.jpg"], @@ -149,6 +153,18 @@ def get_spatial_tasks( question="Are there 3 pictures on the wall?", images_paths=[IMG_PATH + "image_4.jpg"], ), + BoolImageTaskInput( + question="Is the light on in the room?", + images_paths=[IMG_PATH + "image_2.jpg"], + ), + BoolImageTaskInput( + question="Is the chair blue?", + images_paths=[IMG_PATH + "image_3.jpg"], + ), + BoolImageTaskInput( + question="Is there something to sit on?", + images_paths=[IMG_PATH + "image_7.jpg"], + ), ] hard_true_inputs = [ @@ -161,6 +177,10 @@ def get_spatial_tasks( question="Is there a plant behind the rack?", 
images_paths=[IMG_PATH + "image_5.jpg"], ), + BoolImageTaskInput( + question="Is there a rug under the bed?", + images_paths=[IMG_PATH + "image_2.jpg"], + ), ] easy_false_inputs = [ @@ -175,6 +195,14 @@ def get_spatial_tasks( question="Is there a red pillow on the armchair?", images_paths=[IMG_PATH + "image_7.jpg"], ), + BoolImageTaskInput( + question="Is there a red desk with chair in the room?", + images_paths=[IMG_PATH + "image_5.jpg"], + ), + BoolImageTaskInput( + question="Do you see the bed?", + images_paths=[IMG_PATH + "image_6.jpg"], + ), ] medium_false_inputs = [ @@ -186,6 +214,14 @@ def get_spatial_tasks( question="Are there 4 pictures on the wall?", images_paths=[IMG_PATH + "image_4.jpg"], ), + BoolImageTaskInput( + question="Is the TV switched on?", + images_paths=[IMG_PATH + "image_6.jpg"], + ), + BoolImageTaskInput( + question="Is the window opened?", + images_paths=[IMG_PATH + "image_6.jpg"], + ), ] hard_false_inputs = [ @@ -198,6 +234,10 @@ def get_spatial_tasks( question="Is there a plant on the right from the window?", images_paths=[IMG_PATH + "image_6.jpg"], ), + BoolImageTaskInput( + question="Is the chair next to a bed?", + images_paths=[IMG_PATH + "image_1.jpg"], + ), ] for extra_calls in extra_tool_calls: From 4a03c3059d2a966172376928b5dbc84b3d5b1e2a Mon Sep 17 00:00:00 2001 From: jmatejcz Date: Fri, 20 Jun 2025 11:58:56 +0200 Subject: [PATCH 2/3] refactor: adjusted system prompt examples --- .../rai_bench/tool_calling_agent/tasks/spatial.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/rai_bench/rai_bench/tool_calling_agent/tasks/spatial.py b/src/rai_bench/rai_bench/tool_calling_agent/tasks/spatial.py index 94e81742e..2f9b58e0d 100644 --- a/src/rai_bench/rai_bench/tool_calling_agent/tasks/spatial.py +++ b/src/rai_bench/rai_bench/tool_calling_agent/tasks/spatial.py @@ -25,24 +25,24 @@ loggers_type = logging.Logger -SPATIAL_REASONING_SYSTEM_PROMPT_0_SHOT = """You are a helpful and knowledgeable AI 
assistant that specializes in interpreting and analyzing visual content. Your task is to answer questions based on the images provided to you. Please response with the use of the provided tools.""" +SPATIAL_REASONING_SYSTEM_PROMPT_0_SHOT = """You are a helpful and knowledgeable AI assistant that specializes +in interpreting and analyzing visual content. Your task is to answer questions based +on the images provided to you. Please response with the use of the provided tools.""" +# NOTE (jmatejcz) In this case we are using only one tool so there is no difference between 2 and 5 shot +# so I made 1 example in '2 shot' and 2 examples in '5 shot' prompt SPATIAL_REASONING_SYSTEM_PROMPT_2_SHOT = ( SPATIAL_REASONING_SYSTEM_PROMPT_0_SHOT + """ Example of tool calls: -- return_bool_response, args: {'response': True} -- return_bool_response, args: {'response': False}""" +- return_bool_response, args: {'response': True}""" ) -# NOTE (jmatejcz) In this case we are using only one tool so there is no difference bettween 2 and 5 shot SPATIAL_REASONING_SYSTEM_PROMPT_5_SHOT = ( SPATIAL_REASONING_SYSTEM_PROMPT_2_SHOT + """ -- return_bool_response, args: {'response': True} # When object is clearly visible -- return_bool_response, args: {'response': False} # When object is not present -- return_bool_response, args: {'response': True} # When spatial relationship is correct""" +- return_bool_response, args: {'response': False}""" ) From c65d4706b30fa0acc922c7beefe09f10096216ab Mon Sep 17 00:00:00 2001 From: jmatejcz Date: Fri, 20 Jun 2025 13:17:07 +0200 Subject: [PATCH 3/3] refactor: removed redundant code --- .../predefined/spatial_reasoning_tasks.py | 68 ++----------------- 1 file changed, 4 insertions(+), 64 deletions(-) diff --git a/src/rai_bench/rai_bench/tool_calling_agent/predefined/spatial_reasoning_tasks.py b/src/rai_bench/rai_bench/tool_calling_agent/predefined/spatial_reasoning_tasks.py index ef9259d38..d420f4fa3 100644 --- 
a/src/rai_bench/rai_bench/tool_calling_agent/predefined/spatial_reasoning_tasks.py +++ b/src/rai_bench/rai_bench/tool_calling_agent/predefined/spatial_reasoning_tasks.py @@ -32,66 +32,6 @@ ) IMG_PATH = "src/rai_bench/rai_bench/tool_calling_agent/predefined/images/" -true_response_inputs: List[BoolImageTaskInput] = [ - BoolImageTaskInput( - question="Is the door on the left from the desk?", - images_paths=[IMG_PATH + "image_1.jpg"], - ), - BoolImageTaskInput( - question="Is the light on in the room?", - images_paths=[IMG_PATH + "image_2.jpg"], - ), - BoolImageTaskInput( - question="Do you see the plant?", - images_paths=[IMG_PATH + "image_2.jpg"], - ), - BoolImageTaskInput( - question="Are there any pictures on the wall?", - images_paths=[IMG_PATH + "image_3.jpg"], - ), - BoolImageTaskInput( - question="Are there 3 pictures on the wall?", - images_paths=[IMG_PATH + "image_4.jpg"], - ), - BoolImageTaskInput( - question="Is there a plant behind the rack?", - images_paths=[IMG_PATH + "image_5.jpg"], - ), - BoolImageTaskInput( - question="Is there a pillow on the armchain?", - images_paths=[IMG_PATH + "image_7.jpg"], - ), -] -false_response_inputs: List[BoolImageTaskInput] = [ - BoolImageTaskInput( - question="Is the door open?", - images_paths=[IMG_PATH + "image_1.jpg"], - ), - BoolImageTaskInput( - question="Is someone in the room?", - images_paths=[IMG_PATH + "image_1.jpg"], - ), - BoolImageTaskInput( - question="Do you see the plant?", - images_paths=[IMG_PATH + "image_3.jpg"], - ), - BoolImageTaskInput( - question="Are there 4 pictures on the wall?", - images_paths=[IMG_PATH + "image_4.jpg"], - ), - BoolImageTaskInput( - question="Is there a rack on the left from the sofa?", - images_paths=[IMG_PATH + "image_4.jpg"], - ), - BoolImageTaskInput( - question="Is there a plant on the right from the window?", - images_paths=[IMG_PATH + "image_6.jpg"], - ), - BoolImageTaskInput( - question="Is there a red pillow on the armchair?", - images_paths=[IMG_PATH + 
"image_7.jpg"], - ), -] ########## SUBTASKS ################################################################# return_true_subtask = CheckArgsToolCallSubTask( expected_tool_name="return_bool_response", expected_args={"response": True} @@ -141,10 +81,6 @@ def get_spatial_tasks( question="is there a TV in the room?", images_paths=[IMG_PATH + "image_4.jpg"], ), - BoolImageTaskInput( - question="Is there a pillow on the armchain?", - images_paths=[IMG_PATH + "image_7.jpg"], - ), ] medium_true_inputs = [ @@ -181,6 +117,10 @@ def get_spatial_tasks( question="Is there a rug under the bed?", images_paths=[IMG_PATH + "image_2.jpg"], ), + BoolImageTaskInput( + question="Is there a pillow on the armchain?", + images_paths=[IMG_PATH + "image_7.jpg"], + ), ] easy_false_inputs = [