From ca9523364c54bf793c3704a6bb52133aa1221daa Mon Sep 17 00:00:00 2001
From: Li Yin
Date: Thu, 19 Dec 2024 14:56:31 -0800
Subject: [PATCH] added test to react agent

---
 adalflow/adalflow/components/agent/react.py   |   61 +-
 adalflow/adalflow/core/types.py               |    3 +
 .../adalflow/optim/text_grad/tgd_optimizer.py |    7 +-
 adalflow/adalflow/optim/trainer/adal.py       |    2 -
 adalflow/adalflow/optim/trainer/trainer.py    |  489 ++++---
 adalflow/tests/test_react_agent.py            |   96 +-
 text_grad_2_0_recompute.py                    |  127 ++
 use_cases/agent/react_agent copy.ipynb        | 1272 +++++++++++++++++
 use_cases/agent/react_agent.ipynb             |  216 +--
 use_cases/config.py                           |    2 +-
 .../bbh/object_count/task.py                  |    2 +-
 .../bbh/object_count/train_new.py             |    2 +-
 use_cases/text_grad_2.0_train.py              |   32 +-
 13 files changed, 1942 insertions(+), 369 deletions(-)
 create mode 100644 text_grad_2_0_recompute.py
 create mode 100644 use_cases/agent/react_agent copy.ipynb

diff --git a/adalflow/adalflow/components/agent/react.py b/adalflow/adalflow/components/agent/react.py
index 92428e53..972b03c2 100644
--- a/adalflow/adalflow/components/agent/react.py
+++ b/adalflow/adalflow/components/agent/react.py
@@ -6,7 +6,8 @@
 from adalflow.core.generator import Generator
-from adalflow.core.component import Component
+from adalflow.optim.grad_component import GradComponent
+from adalflow.optim.parameter import Parameter, ParameterType
 from adalflow.core.func_tool import FunctionTool, AsyncCallable
 from adalflow.core.tool_manager import ToolManager
 from adalflow.components.output_parsers import JsonOutputParser
@@ -27,40 +28,45 @@
 # TODO: test react agent

-DEFAULT_REACT_AGENT_SYSTEM_PROMPT = r"""
-{# role/task description #}
+react_agent_task_desc = r"""{# role/task description #}
 You are a helpful assistant.
 Answer the user's query using the tools provided below with minimal steps and maximum accuracy.
 {# REACT instructions #}
 Each step you will read the previous Thought, Action, and Observation(execution result of the action) and then provide the next Thought and Action.
+
+
+{# Task specification to teach the agent how to think using 'divide and conquer' strategy #}
+- For simple queries: Directly call the ``finish`` action and provide the answer.
+- For complex queries:
+    - Step 1: Read the user query, potentially divide it into subqueries, and get started with the first subquery.
+    - Call one available tool at a time to solve each subquery/subquestion.
+    - At step 'finish', join all subquery answers and finish the task.
+Remember:
+- Action must call one of the above tools by name. It cannot be empty.
+- You will always end with the 'finish' action to finish the task. The answer can be the final answer or a failure message.
+
+"""
+
+DEFAULT_REACT_AGENT_SYSTEM_PROMPT = r"""
+{{react_agent_task_desc}}
 {# Tools #}
 {% if tools %}
-
+
 You available tools are:
 {% for tool in tools %}
 {{ loop.index }}.
 {{tool}}
 ------------------------
 {% endfor %}
-
+
 {% endif %}
 {# output format and examples for output format #}
-
+
 {{output_format_str}}
-
-
-{# Task specification to teach the agent how to think using 'divide and conquer' strategy #}
-- For simple queries: Directly call the ``finish`` action and provide the answer.
-- For complex queries:
-    - Step 1: Read the user query and potentially divide it into subqueries. And get started with the first subquery.
-    - Call one available tool at a time to solve each subquery/subquestion. \
-    - At step 'finish', join all subqueries answers and finish the task.
-Remember:
-- Action must call one of the above tools with name.
It can not be empty. -- You will always end with 'finish' action to finish the task. The answer can be the final answer or failure message. - - + + ----------------- + User query: {{ input_str }} {# Step History #} @@ -76,10 +82,11 @@ {% endfor %} {% endif %} -You:""" + +""" -class ReActAgent(Component): +class ReActAgent(GradComponent): __doc__ = r"""ReActAgent uses generator as a planner that runs multiple and sequential functional call steps to generate the final response. Users need to set up: @@ -135,11 +142,13 @@ def __init__( max_steps: int = 10, add_llm_as_fallback: bool = True, # TODO: the examples are just for specifying the output format, not end to end input-output examples, need further optimization - examples: List[FunctionExpression] = [], + # examples: List[FunctionExpression] = [], + examples: Union[List[FunctionExpression], List[str]] = [], *, # the following arguments are mainly for the planner model_client: ModelClient, model_kwargs: Dict = {}, + # template for the planner template: Optional[str] = None, # allow users to customize the template ): super().__init__() @@ -165,6 +174,13 @@ def __init__( prompt_kwargs = { "tools": self.tool_manager.yaml_definitions, "output_format_str": output_parser.format_instructions(), + "react_agent_task_desc": Parameter( + name="react_agent_task_desc", + data=react_agent_task_desc, + role_desc="Task description for the ReAct agent which functions as a planner using a Large Language Model.", + param_type=ParameterType.PROMPT, + requires_opt=True, + ), } self.planner = Generator( template=template, @@ -249,6 +265,7 @@ def _run_one_step(self, step: int, prompt_kwargs: Dict, model_kwargs: Dict) -> s response: GeneratorOutput = self.planner( prompt_kwargs=prompt_kwargs, model_kwargs=model_kwargs ) + if response.error: error_msg = f"Error planning step {step}: {response.error}" step_output.observation = error_msg diff --git a/adalflow/adalflow/core/types.py b/adalflow/adalflow/core/types.py index 18724510..a3b663c0 100644 --- a/adalflow/adalflow/core/types.py +++ b/adalflow/adalflow/core/types.py @@ -406,6 +406,9 @@ def add(a, b): The benefits are less failed function calls. """ + question: Optional[str] = field( + default=None, metadata={"desc": "The question to ask the LLM"} + ) thought: Optional[str] = field( default=None, metadata={"desc": "Why the function is called"} ) diff --git a/adalflow/adalflow/optim/text_grad/tgd_optimizer.py b/adalflow/adalflow/optim/text_grad/tgd_optimizer.py index 219c299a..3d5911a8 100644 --- a/adalflow/adalflow/optim/text_grad/tgd_optimizer.py +++ b/adalflow/adalflow/optim/text_grad/tgd_optimizer.py @@ -81,12 +81,13 @@ class HistoryPrompt(DataClass): You must base on the following examples when modifying the {{variable_desc}}: {{in_context_examples}} {% endif %} -YOU MUST ENSURE the new variable shares the same intent as the original variable. -You can either rephrase the initial variable, or add more specific instructions based on the feedback. -You can not change the variable to only fit on one sample if the batch size is larger than 1. """ +# YOU MUST ENSURE the new variable shares the same intent as the original variable. +# You can either rephrase the initial variable, or add more specific instructions based on the feedback. +# You can not change the variable to only fit on one sample if the batch size is larger than 1. 
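Stepping back from this hunk for a moment: the react.py change earlier in the patch is the core idea here — the planner's task description moves out of the fixed template and into a trainable `Parameter`, so the text-gradient optimizer can rewrite it during training. Below is a minimal sketch of that pattern, using only the adalflow APIs visible in this diff; the template string and initial prompt text are illustrative stand-ins, not the library's defaults.

```python
from adalflow.core.generator import Generator
from adalflow.components.model_client import OpenAIClient
from adalflow.optim.parameter import Parameter, ParameterType

# The task description becomes a trainable parameter instead of a hard-coded string.
task_desc = Parameter(
    name="react_agent_task_desc",
    data="You are a helpful assistant. Answer the user's query with minimal steps.",
    role_desc="Task description for the ReAct agent planner",
    param_type=ParameterType.PROMPT,  # marks this as prompt text
    requires_opt=True,  # the TGD optimizer may propose new wordings for it
)

# The planner renders {{react_agent_task_desc}} from prompt_kwargs at call time.
planner = Generator(
    template="{{react_agent_task_desc}}\nUser query: {{input_str}}",
    prompt_kwargs={"react_agent_task_desc": task_desc},
    model_client=OpenAIClient(),
    model_kwargs={"model": "gpt-3.5-turbo"},
)
```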
+ # optimizer system prompt # Tips: diff --git a/adalflow/adalflow/optim/trainer/adal.py b/adalflow/adalflow/optim/trainer/adal.py index cea31760..eab93159 100644 --- a/adalflow/adalflow/optim/trainer/adal.py +++ b/adalflow/adalflow/optim/trainer/adal.py @@ -402,8 +402,6 @@ def train_step(self, batch, batch_idx, num_workers: int = 2) -> List: if isinstance(y_pred, Parameter): raise ValueError(f"y_pred_{i} is a Parameter, {y_pred}") - print(f"y_pred: {y_pred})") - assert ( y_pred.id == sample.id ), f"ID mismatch: {y_pred.id} != {sample.id}, type: {type(y_pred)}" diff --git a/adalflow/adalflow/optim/trainer/trainer.py b/adalflow/adalflow/optim/trainer/trainer.py index 833dd1c3..24d6cd04 100644 --- a/adalflow/adalflow/optim/trainer/trainer.py +++ b/adalflow/adalflow/optim/trainer/trainer.py @@ -1119,76 +1119,80 @@ def _fit_text_grad_demo_mix_constrained( all_losses=all_losses, all_y_preds=all_y_preds, include_demo_optimizers=True, + trainer_results=trainer_results, + val_dataset=val_dataset, + test_dataset=test_dataset, + total_steps=total_steps, ) ) - if not self._check_optimizer_proposal(): - print( - "No proposal can improve the subset and full set, go to next step" - ) - # self._add_failed_proposals_text_optimizers() - - self._add_one_step_in_trainer_results( - trainer_results, - trainer_results.val_scores[-1], - trainer_results.test_scores[-1], - trainer_results.prompts[-1], - total_steps, - ) - - continue - - # set the batch size to the size of the validation set - last_val_score = trainer_results.val_scores[-1] - val_output = self.adaltask.validation_step( - val_dataset, - total_steps, - self.num_workers, - minimum_score=last_val_score, - ) - val_score = val_output.avg_score - self._add_history_text_optimizers(val_score) - - if val_score > last_val_score: - print(f"Optimizer step: {val_score} > {last_val_score}") - # self.optimizer.step() - self._step_text_optimizers() - self._demo_optimizers_step() - - # test the model - test_score = None - if test_dataset is not None: - test_output = self.adaltask.validation_step( - test_dataset, total_steps, self.num_workers - ) - test_score = test_output.avg_score + # if not self._check_optimizer_proposal(): + # print( + # "No proposal can improve the subset and full set, go to next step" + # ) + # # self._add_failed_proposals_text_optimizers() + + # self._add_one_step_in_trainer_results( + # trainer_results, + # trainer_results.val_scores[-1], + # trainer_results.test_scores[-1], + # trainer_results.prompts[-1], + # total_steps, + # ) - new_prompts = self.adaltask._get_param_values() - self._add_one_step_in_trainer_results( - trainer_results, - val_score, - test_score, - new_prompts, - total_steps, - ) - all_samples, all_losses, all_y_preds = [], [], [] - else: - print(f"Optimizer revert: {val_score} <= {last_val_score}") - # self.optimizer.revert() - self._revert_text_optimizers() - self._demo_optimizers_revert() - # save the score, no change - self._add_one_step_in_trainer_results( - trainer_results, - last_val_score, - trainer_results.test_scores[-1], - trainer_results.prompts[-1], - total_steps, - attempted_val_score=val_score, - ) + # continue + + # # set the batch size to the size of the validation set + # last_val_score = trainer_results.val_scores[-1] + # val_output = self.adaltask.validation_step( + # val_dataset, + # total_steps, + # self.num_workers, + # minimum_score=last_val_score, + # ) + # val_score = val_output.avg_score + # self._add_history_text_optimizers(val_score) + + # if val_score > last_val_score: + # print(f"Optimizer 
step: {val_score} > {last_val_score}") + # # self.optimizer.step() + # self._step_text_optimizers() + # self._demo_optimizers_step() + + # # test the model + # test_score = None + # if test_dataset is not None: + # test_output = self.adaltask.validation_step( + # test_dataset, total_steps, self.num_workers + # ) + # test_score = test_output.avg_score + + # new_prompts = self.adaltask._get_param_values() + # self._add_one_step_in_trainer_results( + # trainer_results, + # val_score, + # test_score, + # new_prompts, + # total_steps, + # ) + # all_samples, all_losses, all_y_preds = [], [], [] + # else: + # print(f"Optimizer revert: {val_score} <= {last_val_score}") + # # self.optimizer.revert() + # self._revert_text_optimizers() + # self._demo_optimizers_revert() + # # save the score, no change + # self._add_one_step_in_trainer_results( + # trainer_results, + # last_val_score, + # trainer_results.test_scores[-1], + # trainer_results.prompts[-1], + # total_steps, + # attempted_val_score=val_score, + # ) - print(f"Saving checkpoint to {self.ckpt_file}") - save_json(trainer_results.to_dict(), self.ckpt_file) + # print(f"Saving checkpoint to {self.ckpt_file}") + # save_json(trainer_results.to_dict(), self.ckpt_file) save_json(trainer_results.to_dict(), self.ckpt_file) # checkpoint def _fit_text_grad_demo_mix_random( @@ -1641,6 +1645,43 @@ def _add_one_step_in_trainer_results( trainer_results.prompts.append(prompts) trainer_results.steps.append(step) + # def _downsample_move_batch( + # self, all_samples, all_losses: List["Parameter"], all_y_preds, acc_score_list + # ): + # """Downsample the moving batch to a more balanced error and correct samples""" + + # from adalflow.optim.parameter import Parameter + + # if not all([score >= 0 and score <= 1 for score in acc_score_list]): + # raise ValueError( + # "acc_score_list should only contain values between 0 and 1" + # ) + + # for loss in all_losses: + # if not isinstance(loss, Parameter): + # raise ValueError("Loss should be a Parameter object") + # max_moving_batch_size = 20 + + # correct_indices = [i for i, score in enumerate(acc_score_list) if score > 0.5] + # error_indices = [i for i, score in enumerate(acc_score_list) if score <= 0.5] + + # if ( + # len(error_indices) + len(correct_indices) + # <= max_moving_batch_size + # # and len(correct_indices) <= max_moving_batch_size + # ): + # return all_samples, all_losses, all_y_preds, acc_score_list + + # # downsample from all samples + # new_sample_indices = random.sample( + # range(len(all_samples)), min(max_moving_batch_size, len(all_samples)) + # ) + # all_samples = [all_samples[i] for i in new_sample_indices] + # all_losses = [all_losses[i] for i in new_sample_indices] + # all_y_preds = [all_y_preds[i] for i in new_sample_indices] + # acc_score_list = [acc_score_list[i] for i in new_sample_indices] + # return all_samples, all_losses, all_y_preds, acc_score_list + def _downsample_move_batch( self, all_samples, all_losses: List["Parameter"], all_y_preds, acc_score_list ): @@ -1656,7 +1697,9 @@ def _downsample_move_batch( for loss in all_losses: if not isinstance(loss, Parameter): raise ValueError("Loss should be a Parameter object") + max_moving_batch_size = 20 + min_error_samples = 4 correct_indices = [i for i, score in enumerate(acc_score_list) if score > 0.5] error_indices = [i for i, score in enumerate(acc_score_list) if score <= 0.5] @@ -1668,14 +1711,46 @@ def _downsample_move_batch( ): return all_samples, all_losses, all_y_preds, acc_score_list - # downsample from all samples - 
new_sample_indices = random.sample( - range(len(all_samples)), min(max_moving_batch_size, len(all_samples)) - ) - all_samples = [all_samples[i] for i in new_sample_indices] - all_losses = [all_losses[i] for i in new_sample_indices] - all_y_preds = [all_y_preds[i] for i in new_sample_indices] - acc_score_list = [acc_score_list[i] for i in new_sample_indices] + # Adjust downsampling logic + if len(error_indices) < min_error_samples: + remaining_capacity = max_moving_batch_size - len(error_indices) + correct_indices = random.sample(correct_indices, max(0, remaining_capacity)) + else: + # Set aside minimum error samples + retained_error_indices = error_indices[:min_error_samples] + remaining_error_indices = error_indices[min_error_samples:] + + # Combine remaining error and correct indices for unified sampling + combined_indices = remaining_error_indices + correct_indices + sampled_combined_indices = random.sample( + combined_indices, max(0, max_moving_batch_size - min_error_samples) + ) + + error_indices = retained_error_indices + correct_indices = [ + i for i in sampled_combined_indices if i in correct_indices + ] + remaining_error_indices = [ + i for i in sampled_combined_indices if i in remaining_error_indices + ] + error_indices += remaining_error_indices + + error_samples = [all_samples[i] for i in error_indices] + error_losses = [all_losses[i] for i in error_indices] + error_y_preds = [all_y_preds[i] for i in error_indices] + error_scores = [acc_score_list[i] for i in error_indices] + + correct_samples = [all_samples[i] for i in correct_indices] + correct_losses = [all_losses[i] for i in correct_indices] + correct_y_preds = [all_y_preds[i] for i in correct_indices] + correct_scores = [acc_score_list[i] for i in correct_indices] + + # Combine error and downsampled correct samples + all_samples = error_samples + correct_samples + all_losses = error_losses + correct_losses + all_y_preds = error_y_preds + correct_y_preds + acc_score_list = error_scores + correct_scores + return all_samples, all_losses, all_y_preds, acc_score_list def _moving_batch_sample( @@ -1751,6 +1826,10 @@ def _text_grad_constraint_propose_step( all_losses: List["Parameter"], all_y_preds, include_demo_optimizers: bool = False, + trainer_results: TrainerResult = None, + val_dataset: Any = None, + test_dataset: Any = None, + total_steps: int = 0, ): """Handles both the mixed training and the separate training. 
When include_demo_optimizers is True, the demo optimizers are included in the training @@ -1766,11 +1845,22 @@ def _text_grad_constraint_propose_step( move_batch_score = move_batch_eval.avg_score move_batch_acc_score_list = move_batch_eval.per_item_scores + last_val_score = trainer_results.val_scores[-1] + val_score_increased = False + if move_batch_score >= self.batch_val_score_threshold: print(f"Skipping batch {steps} as acc: {move_batch_score}") # reset the moving batch all_samples, all_losses, all_y_preds = [], [], [] + # track the result + self._add_one_step_in_trainer_results( + trainer_results, + last_val_score, + trainer_results.test_scores[-1], + trainer_results.prompts[-1], + total_steps, + ) return all_samples, all_losses, all_y_preds # downsample the moving batch all_samples, all_losses, all_y_preds, move_batch_acc_score_list = ( @@ -1801,9 +1891,10 @@ def _text_grad_constraint_propose_step( # TODO: make this a step tdqm_loader = tqdm(range(self.max_proposals_per_step), desc="Proposing") + for i in tdqm_loader: - # print(f"Proposing step: {i}") + print(f"Proposing step: {i}") # self.optimizer.propose() self._propose_text_optimizers() # new prompts if include_demo_optimizers: @@ -1812,7 +1903,6 @@ def _text_grad_constraint_propose_step( print("New prompts: ", new_prompts) # valide the subset subset_samples = [all_samples[i] for i in subset_indices] - # validate the subset val_output = self.adaltask.validation_step( subset_samples, steps, self.num_workers ) @@ -1833,25 +1923,88 @@ def _text_grad_constraint_propose_step( self._demo_optimizers_revert() continue # validate the full set - move_batch_result = self.adaltask.validation_step( - all_samples, steps, self.num_workers + # move_batch_result = self.adaltask.validation_step( + # all_samples, steps, self.num_workers + # ) + # new_move_batch_score = move_batch_result.avg_score + # if new_move_batch_score >= move_batch_score: + # print(f"Pass full check: {new_move_batch_score} >= {move_batch_score}") + # self._track_effectiveness("fullset", True) + # # break + # else: + # print( + # f"Fail full check, try next proposal: {new_move_batch_score} < {move_batch_score}" + # ) + # self._track_effectiveness("fullset", False) + # # self._add_failed_proposals_text_optimizers() + # self._revert_text_optimizers() + # if include_demo_optimizers: + # self._demo_optimizers_revert() + # continue + + # check on the validation set + # set the batch size to the size of the validation set + val_output = self.adaltask.validation_step( + val_dataset, + total_steps, + self.num_workers, + minimum_score=last_val_score, ) - new_move_batch_score = move_batch_result.avg_score - if new_move_batch_score >= move_batch_score: - print(f"Pass full check: {new_move_batch_score} >= {move_batch_score}") - self._track_effectiveness("fullset", True) + val_score = val_output.avg_score + + if val_score > last_val_score: + print(f"Optimizer step: {val_score} > {last_val_score}") + # self.optimizer.step() + self._track_effectiveness("valset", True) + self._step_text_optimizers() + self._add_history_text_optimizers(val_score) + + self._demo_optimizers_step() + + # test the model + test_score = None + if test_dataset is not None: + test_output = self.adaltask.validation_step( + test_dataset, total_steps, self.num_workers + ) + test_score = test_output.avg_score + + new_prompts = self.adaltask._get_param_values() + self._add_one_step_in_trainer_results( + trainer_results, + val_score, + test_score, + new_prompts, + total_steps, + ) + all_samples, all_losses, all_y_preds = [], 
[], [] + val_score_increased = True break else: - print( - f"Fail full check, try next proposal: {new_move_batch_score} < {move_batch_score}" - ) - self._track_effectiveness("fullset", False) - # self._add_failed_proposals_text_optimizers() + print(f"Optimizer revert: {val_score} <= {last_val_score}") + self._track_effectiveness("valset", False) + # self.optimizer.revert() self._revert_text_optimizers() - if include_demo_optimizers: - self._demo_optimizers_revert() + self._demo_optimizers_revert() + continue + if not val_score_increased: + print("No proposal can improve the subset and full set, and val set") + # save the score, no change + self._add_one_step_in_trainer_results( + trainer_results, + last_val_score, + trainer_results.test_scores[-1], + trainer_results.prompts[-1], + total_steps, + attempted_val_score=val_score, + ) + + print(f"Saving checkpoint to {self.ckpt_file}") + trainer_results.effective_measure = self._effective_measure + save_json(trainer_results.to_dict(), self.ckpt_file) + print("Done with proposals") self.adaltask.train() return all_samples, all_losses, all_y_preds @@ -1934,88 +2087,92 @@ def _fit_text_grad_constraint( all_samples=all_samples, all_losses=all_losses, all_y_preds=all_y_preds, + trainer_results=trainer_results, + val_dataset=val_dataset, + test_dataset=test_dataset, + total_steps=total_steps, ) ) # check optimizer stages to see if the proposal was accepted so far - if not self._check_optimizer_proposal(): - print( - "No proposal can improve the subset and full set, go to next step" - ) - - self._add_one_step_in_trainer_results( - trainer_results, - trainer_results.val_scores[-1], - trainer_results.test_scores[-1], - trainer_results.prompts[-1], - total_steps, - ) - continue - - # prune the correct sample size if its too big, same with error samples - # run the tests as any other optimizer - if self.adaltask.validate_condition(steps, total_steps): - # set the batch size to the size of the validation set - last_val_score = trainer_results.val_scores[-1] - val_output = self.adaltask.validation_step( - val_dataset, - total_steps, - self.num_workers, - minimum_score=last_val_score, - ) - val_score = val_output.avg_score - - if val_score > last_val_score: - print(f"Optimizer step: {val_score} > {last_val_score}") - # self.optimizer.step() - self._add_history_text_optimizers( - val_score - ) # track top performor - self._step_text_optimizers() - - # save the score - step_result = { - "val_score": val_score, - } - - self._track_effectiveness("valset", True) - - # test the model - if test_dataset is not None: - test_output = self.adaltask.validation_step( - test_dataset, - steps, - self.num_workers, - ) - step_result["test_score"] = test_output.avg_score - else: - step_result["test_score"] = None - step_result["prompts"] = self.adaltask._get_param_values() - step_result["step"] = total_steps - self._add_one_step_in_trainer_results( - trainer_results, - **step_result, - ) - - # reset the moving batch - - all_samples, all_losses, all_y_preds = [], [], [] - - else: - print(f"Optimizer revert: {val_score} <= {last_val_score}") - self._revert_text_optimizers() - # self._add_failed_proposals_text_optimizers() # track failed proposals - self._track_effectiveness("valset", False) - self._add_one_step_in_trainer_results( - trainer_results, - trainer_results.val_scores[-1], - trainer_results.test_scores[-1], - trainer_results.prompts[-1], - total_steps, - attempted_val_score=val_score, - ) + # if not self._check_optimizer_proposal(): + # print( + # "No proposal can 
improve the subset and full set, go to next step" + # ) - trainer_results.effective_measure = self._effective_measure - save_json(trainer_results.to_dict(), self.ckpt_file) + # self._add_one_step_in_trainer_results( + # trainer_results, + # trainer_results.val_scores[-1], + # trainer_results.test_scores[-1], + # trainer_results.prompts[-1], + # total_steps, + # ) + # continue + + # # prune the correct sample size if its too big, same with error samples + # # run the tests as any other optimizer + # if self.adaltask.validate_condition(steps, total_steps): + # # set the batch size to the size of the validation set + # last_val_score = trainer_results.val_scores[-1] + # val_output = self.adaltask.validation_step( + # val_dataset, + # total_steps, + # self.num_workers, + # minimum_score=last_val_score, + # ) + # val_score = val_output.avg_score + + # if val_score > last_val_score: + # print(f"Optimizer step: {val_score} > {last_val_score}") + # # self.optimizer.step() + # self._add_history_text_optimizers( + # val_score + # ) # track top performor + # self._step_text_optimizers() + + # # save the score + # step_result = { + # "val_score": val_score, + # } + + # self._track_effectiveness("valset", True) + + # # test the model + # if test_dataset is not None: + # test_output = self.adaltask.validation_step( + # test_dataset, + # steps, + # self.num_workers, + # ) + # step_result["test_score"] = test_output.avg_score + # else: + # step_result["test_score"] = None + # step_result["prompts"] = self.adaltask._get_param_values() + # step_result["step"] = total_steps + # self._add_one_step_in_trainer_results( + # trainer_results, + # **step_result, + # ) + + # # reset the moving batch + + # all_samples, all_losses, all_y_preds = [], [], [] + + # else: + # print(f"Optimizer revert: {val_score} <= {last_val_score}") + # self._revert_text_optimizers() + # # self._add_failed_proposals_text_optimizers() # track failed proposals + # self._track_effectiveness("valset", False) + # self._add_one_step_in_trainer_results( + # trainer_results, + # trainer_results.val_scores[-1], + # trainer_results.test_scores[-1], + # trainer_results.prompts[-1], + # total_steps, + # attempted_val_score=val_score, + # ) + + # trainer_results.effective_measure = self._effective_measure + # save_json(trainer_results.to_dict(), self.ckpt_file) save_json(trainer_results.to_dict(), self.ckpt_file) return trainer_results diff --git a/adalflow/tests/test_react_agent.py b/adalflow/tests/test_react_agent.py index 244a421f..b9d4e274 100644 --- a/adalflow/tests/test_react_agent.py +++ b/adalflow/tests/test_react_agent.py @@ -1,2 +1,96 @@ +import unittest +from unittest.mock import Mock, patch +from adalflow.core.func_tool import FunctionTool +from adalflow.core.types import FunctionExpression, GeneratorOutput +from adalflow.components.agent.react import ReActAgent +from adalflow.components.model_client.openai_client import OpenAIClient + + +# Mock tools for testing +def mock_add(a: int, b: int) -> int: + return a + b + + +def mock_multiply(a: int, b: int) -> int: + return a * b + + +def mock_simple_tool(input: str) -> str: + return f"Processed: {input}" + + +class TestReActAgent(unittest.TestCase): + def setUp(self): + # Mock OpenAIClient + self.mock_model_client = Mock(spec=OpenAIClient) + + # Initialize ReActAgent with mocked tools and model client + self.tools = [ + FunctionTool(mock_add), + FunctionTool(mock_multiply), + FunctionTool(mock_simple_tool), + ] + self.react_agent = ReActAgent( + tools=self.tools, + max_steps=5, + 
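+            # Note: `self.mock_model_client` is a Mock(spec=OpenAIClient) and each
+            # test below patches `ReActAgent.planner`, so no real LLM calls are made.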
add_llm_as_fallback=True, + model_client=self.mock_model_client, + model_kwargs={"model": "gpt-3.5-turbo"}, + ) + + def test_react_agent_initialization(self): + self.assertEqual(self.react_agent.max_steps, 5) + self.assertTrue(self.react_agent.add_llm_as_fallback) + self.assertEqual( + len(self.react_agent.tool_manager.tools), 5 + ) # 3 tools + finish + fallback + + @patch.object(ReActAgent, "planner", create=True) + def test_simple_query_execution(self, mock_planner): + # Simulate a valid JSON-serializable response from the planner + mock_planner.return_value = GeneratorOutput( + data=FunctionExpression.from_function( + thought="Finish the task directly.", + func=self.react_agent._finish, + answer="Simple answer", + ) + ) + + result = self.react_agent.call("What is 2 + 2?") + self.assertEqual(result, "Simple answer") + + @patch.object(ReActAgent, "planner", create=True) + def test_complex_query_execution(self, mock_planner): + # Simulate multiple steps for a complex query + mock_planner.side_effect = [ + GeneratorOutput( + data=FunctionExpression.from_function( + thought="Divide the task into subqueries.", func=mock_add, a=2, b=2 + ) + ), + GeneratorOutput( + data=FunctionExpression.from_function( + thought="Multiply the results.", func=mock_multiply, a=4, b=3 + ) + ), + ] + + result = self.react_agent.call("Add 2 and 3, then return the result.") + self.assertEqual(result, 12) + + @patch.object(ReActAgent, "planner", create=True) + def test_error_handling(self, mock_planner): + # Simulate an error scenario + mock_planner.return_value = GeneratorOutput( + data={ + "thought": "Encountered an error.", + "function": {"name": "finish", "args": {"answer": "Error occurred"}}, + } + ) + + result = self.react_agent.call("Simulate an error.") + self.assertIn("Error occurred", result) + + if __name__ == "__main__": - pass + unittest.main() diff --git a/text_grad_2_0_recompute.py b/text_grad_2_0_recompute.py new file mode 100644 index 00000000..bd52905b --- /dev/null +++ b/text_grad_2_0_recompute.py @@ -0,0 +1,127 @@ +import json +import os +import math + + +def recompute_metrics_and_update_summary(result_file): + try: + # Load the results file + with open(result_file, "r") as f: + ckpt_values = json.load(f) + + # Initialize variables for metrics computation + highest_test_score = 0 + mean_test_score = 0 + standard_deviation = 0 + past_highest_scores = [] + past_highest_test_scores = [] + + average_pass_rate_list = [] + average_pass_prompts_list = [] + average_total_prompts_list = [] + + highest_val_score = 0 + + # Process each experiment + for experiment, data in ckpt_values.items(): + if "summary" in experiment: + continue # Skip summary entries + + ckpt_path = data + + if os.path.exists(ckpt_path): + with open(ckpt_path, "r") as ckpt_file: + experiment_data = json.load(ckpt_file) + + val_scores = experiment_data.get("val_scores", []) + test_scores = experiment_data.get("test_scores", []) + _high_test_score = max(val_scores, default=0) + _high_val_score = max(test_scores, default=0) + + past_highest_scores.append(_high_test_score) + past_highest_test_scores.append(_high_val_score) + + if _high_test_score > highest_test_score: + highest_test_score = _high_test_score + + if _high_val_score > highest_val_score: + highest_val_score = _high_val_score + + effective_measures = experiment_data.get("effective_measure", {}) + + if effective_measures: + pass_num = effective_measures["valset"].get("pass", 0) + total_val_prompts = effective_measures["valset"].get( + "pass", 0 + ) + 
effective_measures["valset"].get("fail", 0) + else: + total_val_prompts = len(val_scores) - 1 + pass_num = len(set(val_scores)) + + average_pass_rate = ( + pass_num / total_val_prompts if total_val_prompts > 0 else 0 + ) + average_pass_rate_list.append(average_pass_rate) + average_pass_prompts_list.append(pass_num) + average_total_prompts_list.append(total_val_prompts) + + # Compute final metrics + if past_highest_scores: + mean_test_score = sum(past_highest_scores) / len(past_highest_scores) + standard_deviation = math.sqrt( + sum((x - mean_test_score) ** 2 for x in past_highest_scores) + / len(past_highest_scores) + ) + + average_pass_rate = ( + sum(average_pass_rate_list) / len(average_pass_rate_list) + if average_pass_rate_list + else 0 + ) + average_pass_prompts = ( + sum(average_pass_prompts_list) / len(average_pass_prompts_list) + if average_pass_prompts_list + else 0 + ) + average_total_prompts = ( + sum(average_total_prompts_list) / len(average_total_prompts_list) + if average_total_prompts_list + else 0 + ) + + # Update the summary in ckpt_values + summary_key = "summary" + ckpt_values[summary_key] = { + "highest_test_score": highest_test_score, + "mean_test_score": mean_test_score, + "standard_deviation": standard_deviation, + "average_pass_rate": average_pass_rate, + "average_pass_prompts": average_pass_prompts, + "average_total_prompts": average_total_prompts, + "past_highest_scores": past_highest_scores, + "past_highest_test_scores": past_highest_test_scores, + "highest_val_score": highest_val_score, + } + + # Save updated ckpt_values back to the file + with open(result_file, "w") as f: + json.dump(ckpt_values, f, indent=4) + + return ckpt_values[summary_key] + + except Exception as e: + print(f"Error while recomputing metrics: {e}") + return None + + +# Usage +if __name__ == "__main__": + result_file = "results.json" # Replace with your actual result file + result_file = "text_grad_2_results_4_runs_1872c441-0db2-4640-9cf6-8ef910744a93.json" + result_file = "text_grad_2_results_4_runs_02b9f463-aa21-4485-9899-07ac2542ddac.json" # only use fullset + summary = recompute_metrics_and_update_summary(result_file) + + if summary: + print("Updated Summary:") + for key, value in summary.items(): + print(f"{key}: {value}") diff --git a/use_cases/agent/react_agent copy.ipynb b/use_cases/agent/react_agent copy.ipynb new file mode 100644 index 00000000..0e1d4d6d --- /dev/null +++ b/use_cases/agent/react_agent copy.ipynb @@ -0,0 +1,1272 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# ReAct Agent Use Case" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 1. Q&A Chatbot\n", + "In this tutorial, we will implement ``adalflow ReAct`` to build a Q&A chatbot on [HotpotQA](https://arxiv.org/pdf/1809.09600) dataset. \n", + "\n", + "To learn more about ``adalflow ReAct``, please refer to our developer notes." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "# 2. HotpotQA Dataset\n", + "We are using [HotpotQA](https://arxiv.org/pdf/1809.09600). It is a Wikipedia-based multi-hop question and answer dataset." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "# load the dataset\n", + "from datasets import load_dataset\n", + "\n", + "dataset = load_dataset(path=\"hotpot_qa\", name=\"fullwiki\")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "len of eval: 7405\n", + "example: {'id': '5a8b57f25542995d1e6f1371', 'question': 'Were Scott Derrickson and Ed Wood of the same nationality?', 'answer': 'yes', 'type': 'comparison', 'level': 'hard', 'supporting_facts': {'title': ['Scott Derrickson', 'Ed Wood'], 'sent_id': [0, 0]}, 'context': {'title': ['Adam Collis', 'Ed Wood (film)', 'Tyler Bates', 'Doctor Strange (2016 film)', 'Hellraiser: Inferno', 'Sinister (film)', 'Deliver Us from Evil (2014 film)', 'Woodson, Arkansas', 'Conrad Brooks', 'The Exorcism of Emily Rose'], 'sentences': [['Adam Collis is an American filmmaker and actor.', ' He attended the Duke University from 1986 to 1990 and the University of California, Los Angeles from 2007 to 2010.', ' He also studied cinema at the University of Southern California from 1991 to 1997.', ' Collis first work was the assistant director for the Scott Derrickson\\'s short \"Love in the Ruins\" (1995).', ' In 1998, he played \"Crankshaft\" in Eric Koyanagi\\'s \"Hundred Percent\".'], ['Ed Wood is a 1994 American biographical period comedy-drama film directed and produced by Tim Burton, and starring Johnny Depp as cult filmmaker Ed Wood.', \" The film concerns the period in Wood's life when he made his best-known films as well as his relationship with actor Bela Lugosi, played by Martin Landau.\", ' Sarah Jessica Parker, Patricia Arquette, Jeffrey Jones, Lisa Marie, and Bill Murray are among the supporting cast.'], ['Tyler Bates (born June 5, 1965) is an American musician, music producer, and composer for films, television, and video games.', ' Much of his work is in the action and horror film genres, with films like \"Dawn of the Dead, 300, Sucker Punch,\" and \"John Wick.\"', ' He has collaborated with directors like Zack Snyder, Rob Zombie, Neil Marshall, William Friedkin, Scott Derrickson, and James Gunn.', ' With Gunn, he has scored every one of the director\\'s films; including \"Guardians of the Galaxy\", which became one of the highest grossing domestic movies of 2014, and its 2017 sequel.', ' In addition, he is also the lead guitarist of the American rock band Marilyn Manson, and produced its albums \"The Pale Emperor\" and \"Heaven Upside Down\".'], ['Doctor Strange is a 2016 American superhero film based on the Marvel Comics character of the same name, produced by Marvel Studios and distributed by Walt Disney Studios Motion Pictures.', ' It is the fourteenth film of the Marvel Cinematic Universe (MCU).', ' The film was directed by Scott Derrickson, who wrote it with Jon Spaihts and C. 
Robert Cargill, and stars Benedict Cumberbatch as Stephen Strange, along with Chiwetel Ejiofor, Rachel McAdams, Benedict Wong, Michael Stuhlbarg, Benjamin Bratt, Scott Adkins, Mads Mikkelsen, and Tilda Swinton.', ' In \"Doctor Strange\", surgeon Strange learns the mystic arts after a career-ending car accident.'], ['Hellraiser: Inferno (also known as Hellraiser V: Inferno) is a 2000 American horror film.', ' It is the fifth installment in the \"Hellraiser\" series and the first \"Hellraiser\" film to go straight-to-DVD.', ' It was directed by Scott Derrickson and released on October 3, 2000.', \" The film concerns a corrupt detective who discovers Lemarchand's box at a crime scene.\", \" The film's reviews were mixed.\"], ['Sinister is a 2012 supernatural horror film directed by Scott Derrickson and written by Derrickson and C. Robert Cargill.', ' It stars Ethan Hawke as fictional true-crime writer Ellison Oswalt who discovers a box of home movies in his attic that puts his family in danger.'], ['Deliver Us from Evil is a 2014 American supernatural horror film directed by Scott Derrickson and produced by Jerry Bruckheimer.', ' The film is officially based on a 2001 non-fiction book entitled \"Beware the Night\" by Ralph Sarchie and Lisa Collier Cool, and its marketing campaign highlighted that it was \"inspired by actual accounts\".', ' The film stars Eric Bana, Édgar Ramírez, Sean Harris, Olivia Munn, and Joel McHale in the main roles and was released on July 2, 2014.'], ['Woodson is a census-designated place (CDP) in Pulaski County, Arkansas, in the United States.', ' Its population was 403 at the 2010 census.', ' It is part of the Little Rock–North Little Rock–Conway Metropolitan Statistical Area.', ' Woodson and its accompanying Woodson Lake and Wood Hollow are the namesake for Ed Wood Sr., a prominent plantation owner, trader, and businessman at the turn of the 20th century.', ' Woodson is adjacent to the Wood Plantation, the largest of the plantations own by Ed Wood Sr.'], ['Conrad Brooks (born Conrad Biedrzycki on January 3, 1931 in Baltimore, Maryland) is an American actor.', ' He moved to Hollywood, California in 1948 to pursue a career in acting.', ' He got his start in movies appearing in Ed Wood films such as \"Plan 9 from Outer Space\", \"Glen or Glenda\", and \"Jail Bait.\"', ' He took a break from acting during the 1960s and 1970s but due to the ongoing interest in the films of Ed Wood, he reemerged in the 1980s and has become a prolific actor.', ' He also has since gone on to write, produce and direct several films.'], ['The Exorcism of Emily Rose is a 2005 American legal drama horror film directed by Scott Derrickson and starring Laura Linney and Tom Wilkinson.', ' The film is loosely based on the story of Anneliese Michel and follows a self-proclaimed agnostic who acts as defense counsel (Linney) representing a parish priest (Wilkinson), accused by the state of negligent homicide after he performed an exorcism.']]}}\n", + "attributes in each sample: ['id', 'question', 'answer', 'type', 'level', 'supporting_facts', 'context']\n" + ] + } + ], + "source": [ + "# check the data sample\n", + "test_sample = dataset[\"validation\"][0]\n", + "print(f\"len of eval: {len(dataset['validation'])}\")\n", + "print(f\"example: {test_sample}\")\n", + "print(f\"attributes in each sample: {list(test_sample.keys())}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "question: Were 
Scott Derrickson and Ed Wood of the same nationality?\n",
+      "answer: yes\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Each sample contains a question and a corresponding answer.\n",
+    "print(f\"question: {test_sample.get('question')}\")\n",
+    "print(f\"answer: {test_sample.get('answer')}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# 3. Set up\n",
+    "Please make sure you have set up the model client API keys before running the agent. Now import the necessary packages."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "True"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import dotenv\n",
+    "from adalflow.components.model_client import OpenAIClient\n",
+    "from adalflow.components.agent.react import ReActAgent\n",
+    "from adalflow.core.func_tool import FunctionTool\n",
+    "\n",
+    "import time\n",
+    "\n",
+    "# load the environment; set the relative path to your .env file that includes the API key\n",
+    "dotenv.load_dotenv(dotenv_path=\"../../.env\", override=True)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# 4. Create Agent\n",
+    "To create an agent, we need to define the basic components.\n",
+    "\n",
+    "## Tools\n",
+    "First, we need to specify what functions the agent will need to answer the question. Since we are answering Wikipedia-based questions, we will allow the agent to **search** via the Wikipedia API. The [ReAct Paper](https://arxiv.org/pdf/2210.03629) also includes a **lookup** function that serves as Ctrl+F functionality in the browser.\n",
+    "\n",
+    "As ``adalflow ReAct`` has a built-in ``finish`` function, we don't need to define it ourselves."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import requests\n",
+    "from bs4 import BeautifulSoup\n",
+    "import re\n",
+    "import string\n",
+    "\n",
+    "\n",
+    "# code copied from the paper\n",
+    "def clean_str(p):\n",
+    "    return p.encode().decode(\"unicode-escape\").encode(\"latin1\").decode(\"utf-8\")\n",
+    "\n",
+    "\n",
+    "# normalization copied from the paper's code\n",
+    "def normalize_answer(s):\n",
+    "    def remove_articles(text):\n",
+    "        return re.sub(r\"\\b(a|an|the)\\b\", \" \", text)\n",
+    "\n",
+    "    def white_space_fix(text):\n",
+    "        return \" \".join(text.split())\n",
+    "\n",
+    "    def remove_punc(text):\n",
+    "        exclude = set(string.punctuation)\n",
+    "        return \"\".join(ch for ch in text if ch not in exclude)\n",
+    "\n",
+    "    def lower(text):\n",
+    "        return text.lower()\n",
+    "\n",
+    "    return white_space_fix(remove_articles(remove_punc(lower(s))))\n",
+    "\n",
+    "\n",
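+    "# (Not in the original notebook) A small exact-match helper built on the\n",
+    "# paper's normalize_answer; handy later for scoring the agent's answers\n",
+    "# against HotpotQA ground truth. The name `exact_match` is our own.\n",
+    "def exact_match(prediction: str, ground_truth: str) -> bool:\n",
+    "    # compare after lowercasing, removing punctuation/articles, fixing whitespace\n",
+    "    return normalize_answer(prediction) == normalize_answer(ground_truth)\n",
+    "\n",
+    "\n",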
+    "def search(entity: str) -> str:\n",
+    "    \"\"\"\n",
+    "    searches the exact entity on Wikipedia and returns the first paragraph if it exists. If not, it will return some similar entities to search.\n",
+    "    \"\"\"\n",
+    "    # Format the entity for URL encoding\n",
+    "    entity_formatted = entity.replace(\" \", \"+\")\n",
+    "    url = f\"https://en.wikipedia.org/w/index.php?search={entity_formatted}\"\n",
+    "\n",
+    "    # Fetch the page\n",
+    "    response = requests.get(url)\n",
+    "    soup = BeautifulSoup(response.text, \"html.parser\")\n",
+    "\n",
+    "    # Check if the exact page was found or suggest similar items:\n",
+    "    # when search-result headings are detected, the entity page was not found on Wikipedia\n",
+    "    result_divs = soup.find_all(\"div\", {\"class\": \"mw-search-result-heading\"})\n",
+    "\n",
+    "    if (\n",
+    "        result_divs\n",
+    "    ):  # the searched entity page is not on Wikipedia; Wikipedia shows a list of similar entities instead\n",
+    "        # get similar results\n",
+    "        similar_titles = [div.a.get_text() for div in result_divs]\n",
+    "        return f\"Could not find exact page for '{entity}'. Similar topics: {similar_titles[:5]}\"  # return the top 5 similar titles\n",
+    "    else:\n",
+    "        # the paper uses 'page' to represent the content in the <p> and <ul> elements
\n", + " # Extract xontent\n", + " page_list = [\n", + " p.get_text().strip() for p in soup.find_all(\"p\") + soup.find_all(\"ul\")\n", + " ]\n", + " # TODO: Recursive search, if find any concept that needs more search then call search again\n", + " # if any(\"may refer to:\" in p for p in page_list):\n", + " # search(entity)\n", + "\n", + " # restructure & clean the page content following the paper's logic\n", + " page = \"\"\n", + " for p in page_list:\n", + " if len(p.split(\" \")) > 2:\n", + " page += clean_str(p)\n", + " if not p.endswith(\"\\n\"):\n", + " page += \"\\n\"\n", + " paragraphs = page.split(\"\\n\")\n", + " paragraphs = [p.strip() for p in paragraphs if p.strip()]\n", + "\n", + " sentences = []\n", + " for p in paragraphs:\n", + " sentences += p.split(\". \")\n", + " sentences = [s.strip() + \".\" for s in sentences if s.strip()]\n", + "\n", + " # return the first 5 sentences\n", + " if sentences:\n", + " return (\n", + " \" \".join(sentences[:5]) if len(sentences) >= 5 else \" \".join(sentences)\n", + " )\n", + " else:\n", + " return \"No content found on this page.\"\n", + "\n", + " # TODO: clean the paragraphs and return the searched content\n", + "\n", + "\n", + "def lookup(text: str, keyword: str) -> str:\n", + " \"\"\"\n", + " returns the sentences containing keyword in the current passage.\n", + " \"\"\"\n", + " sentences = text.split(\".\")\n", + " matching_sentences = [\n", + " sentence.strip() + \".\"\n", + " for sentence in sentences\n", + " if keyword.lower() in sentence.lower()\n", + " ]\n", + " if not matching_sentences:\n", + " return \"No sentences found with the keyword.\"\n", + " else:\n", + " return \" \".join(\n", + " matching_sentences\n", + " ) # Join all matching sentences into a single string" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "# set up tools for the agent\n", + "tools = [FunctionTool(fn=search)]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Examples\n", + "The next thing to add is examples. Few shot prompt engineering is a common practice to improve the model performance.\n", + "\n", + "Let's use the paper's examples. The paper has 6 examples altogether." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "examples = [\n", + " \"\"\"Question: What is the elevation range for the area that the eastern sector of the Colorado orogeny extends into?\n", + "Thought 1: I need to search Colorado orogeny, find the area that the eastern sector of the Colorado orogeny extends into, then find the elevation range of the area.\n", + "Action 1: search(\"Colorado orogeny\")\n", + "Observation 1: The Colorado orogeny was an episode of mountain building (an orogeny) in Colorado and surrounding areas.\n", + "Thought 2: It does not mention the eastern sector. So I need to look up eastern sector.\n", + "Action 2: lookup(\"eastern sector\")\n", + "Observation 2: (Result 1 / 1) The eastern sector extends into the High Plains and is called the Central Plains orogeny.\n", + "Thought 3: The eastern sector of Colorado orogeny extends into the High Plains. 
So I need to search High Plains and find its elevation range.\n", + "Action 3: search(\"High Plains\")\n", + "Observation 3: High Plains refers to one of two distinct land regions:\n", + "Thought 4: I need to instead search High Plains (United States).\n", + "Action 4: search(\"High Plains (United States)\")\n", + "Observation 4: The High Plains are a subregion of the Great Plains. From east to west, the High Plains rise in elevation from around 1,800 to 7,000 ft (550 to 2,130 m).[3]\n", + "Thought 5: High Plains rise in elevation from around 1,800 to 7,000 ft, so the answer is 1,800 to 7,000 ft.\n", + "Action 5: finish(\"1,800 to 7,000 ft\")\"\"\",\n", + " \"\"\"Question: Musician and satirist Allie Goertz wrote a song about the \"The Simpsons\" character Milhouse, who Matt Groening named after who?\n", + "Thought 1: The question simplifies to \"The Simpsons\" character Milhouse is named after who. I only need to search Milhouse and find who it is named after.\n", + "Action 1: search(\"Milhouse\")\n", + "Observation 1: Milhouse Mussolini Van Houten is a recurring character in the Fox animated television series The Simpsons voiced by Pamela Hayden and created by Matt Groening.\n", + "Thought 2: The paragraph does not tell who Milhouse is named after, maybe I can look up \"named after\".\n", + "Action 2: lookup(\"named after\")\n", + "Observation 2: (Result 1 / 1) Milhouse was named after U.S. president Richard Nixon, whose middle name was Milhous. \n", + "Thought 3: Milhouse was named after U.S. president Richard Nixon, so the answer is Richard Nixon.\n", + "Action 3: finish(\"Richard Nixon\")\"\"\",\n", + " \"\"\"Question: Which documentary is about Finnish rock groups, Adam Clayton Powell or The Saimaa Gesture?\n", + "Thought 1: I need to search Adam Clayton Powell and The Saimaa Gesture, and find which documentary is about Finnish rock groups.\n", + "Action 1: search(\"Adam Clayton Powell\")\n", + "Observation 1: Could not find [\"Adam Clayton Powell\"]. Similar: ['Adam Clayton Powell III', 'Seventh Avenue (Manhattan)', 'Adam Clayton Powell Jr. State Office Building', 'Isabel Washington Powell', 'Adam Powell', 'Adam Clayton Powell (film)', 'Giancarlo Esposito'].\n", + "Thought 2: To find the documentary, I can search Adam Clayton Powell (film).\n", + "Action 2: search(\"Adam Clayton Powell (film)\")\n", + "Observation 2: Adam Clayton Powell is a 1989 American documentary film directed by Richard Kilberg.\n", + "The film is about the rise and fall of influential African-American politician Adam Clayton Powell Jr.[3][4] It was later aired as part of the PBS series The American Experience.\n", + "Thought 3: Adam Clayton Powell (film) is a documentary about an African-American politician, not Finnish rock groups. So the documentary about Finnish rock groups must instead be The Saimaa Gesture.\n", + "Action 3: finish(\"The Saimaa Gesture\")\"\"\",\n", + " \"\"\"Question: What profession does Nicholas Ray and Elia Kazan have in common?\n", + "Thought 1: I need to search Nicholas Ray and Elia Kazan, find their professions, then find the profession they have in common.\n", + "Action 1: search(\"Nicholas Ray\")\n", + "Observation 1: Nicholas Ray (born Raymond Nicholas Kienzle Jr., August 7, 1911 – June 16, 1979) was an American film director, screenwriter, and actor best known for the 1955 film Rebel Without a Cause.\n", + "Thought 2: Professions of Nicholas Ray are director, screenwriter, and actor. 
I need to search Elia Kazan next and find his professions.\n", + "Action 2: search(\"Elia Kazan\")\n", + "Observation 2: Elia Kazan was an American film and theatre director, producer, screenwriter and actor.\n", + "Thought 3: Professions of Elia Kazan are director, producer, screenwriter, and actor. So profession Nicholas Ray and Elia Kazan have in common is director, screenwriter, and actor.\n", + "Action 3: finish(\"director, screenwriter, actor\")\"\"\",\n", + " \"\"\"Question: Which magazine was started first Arthur's Magazine or First for Women?\n", + "Thought 1: I need to search Arthur's Magazine and First for Women, and find which was started first.\n", + "Action 1: search(\"Arthur's Magazine\")\n", + "Observation 1: Arthur's Magazine (1844-€“1846) was an American literary periodical published in Philadelphia in the 19th century. \n", + "Thought 2: Arthur's Magazine was started in 1844. I need to search First for Women next.\n", + "Action 2: search(\"First for Women\")\n", + "Observation 2: First for Women is a woman's magazine published by Bauer Media Group in the USA.[1] The magazine was started in 1989. \n", + "Thought 3: First for Women was started in 1989. 1844 (Arthur's Magazine) < 1989 (First for Women), so Arthur's Magazine was started first.\n", + "Action 3: finish(\"Arthur's Magazine\")\"\"\",\n", + " \"\"\"Question: Were Pavel Urysohn and Leonid Levin known for the same type of work?\n", + "Thought 1: I need to search Pavel Urysohn and Leonid Levin, find their types of work, then find if they are the same.\n", + "Action 1: search(\"Pavel Urysohn\")\n", + "Observation 1: Pavel Samuilovich Urysohn (February 3, 1898 – August 17, 1924) was a Soviet mathematician who is best known for his contributions in dimension theory.\n", + "Thought 2: Pavel Urysohn is a mathematician. I need to search Leonid Levin next and find its type of work.\n", + "Action 2: search(\"Leonid Levin\")\n", + "Observation 2: Leonid Anatolievich Levin is a Soviet-American mathematician and computer scientist. \n", + "Thought 3: Leonid Levin is a mathematician and computer scientist. So Pavel Urysohn and Leonid Levin have the same type of work. \n", + "Action 3: finish(\"yes\")\"\"\",\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# preset up the examples as prompt_kwargs, the examples will be included in the system prompt\n", + "\n", + "# convert this example in FunctionExpression\n", + "# import adalflow as adal\n", + "# from adalflow.core.types import FunctionExpression\n", + "\n", + "# eg_1 = {\n", + "# \"Question\": \"What is the elevation range for the area that the eastern sector of the Colorado orogeny extends into?\",\n", + "# \"Thought 1\": \"I need to search Colorado orogeny, find the area that the eastern sector of the Colorado orogeny extends into, then find the elevation range of the area.\",\n", + "# \"Action 1\": \"search\",\n", + "# \"kwargs\": {\"entity\": \"Colorado orogeny\"},\n", + "# \"Observation 1\": \"The Colorado orogeny was an episode of mountain building (an orogeny) in Colorado and surrounding areas.\",\n", + "# \"Thought 2\": \"It does not mention the eastern sector. 
So I need to look up eastern sector.\",\n", + "# \"Action 2\": \"lookup('eastern sector')\",\n", + "# \"Action 2\": \"lookup\",\n", + "# \"kwargs\": {\"text\": \"eastern sector\", \"keyword\": \"eastern sector\"},\n", + "# \"Observation 2\": \"(Result 1 / 1) The eastern sector extends into the High Plains and is called the Central Plains orogeny.\",\n", + "# \"Thought 3\": \"The eastern sector of Colorado orogeny extends into the High Plains. So I need to search High Plains and find its elevation range.\",\n", + "# \"Action 3\": \"search('High Plains')\",\n", + "# \"Observation 3\": \"High Plains refers to one of two distinct land regions:\",\n", + "# \"Thought 4\": \"I need to instead search High Plains (United States).\",\n", + "# \"Action 4\": \"search('High Plains (United States)')\",\n", + "# \"Observation 4\": \"The High Plains are a subregion of the Great Plains. From east to west, the High Plains rise in elevation from around 1,800 to 7,000 ft (550 to 2,130 m).[3]\",\n", + "# \"Thought 5\": \"High Plains rise in elevation from around 1,800 to 7,000 ft, so the answer is 1,800 to 7,000 ft.\",\n", + "# \"Action 5\": \"finish('1,800 to 7,000 ft')\"\n", + "# }\n", + "\n", + "# examples_expression = []\n", + "# for example in examples:\n", + "# ex_exp = FunctionExpression(thought=example)\n", + "\n", + "# preset_prompt_kwargs = {\"examples\": examples}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Model\n", + "\n", + "Next, we can choose the model to call. In this example we will use OpenAIClient ``gpt-3.5-turbo`` model. We will set the ``temperature`` at 0.0 to make the response as consistent as possible." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "gpt_model_kwargs = {\n", + " \"model\": \"gpt-3.5-turbo\",\n", + " \"temperature\": 0.0,\n", + "}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Agent\n", + "Combining the previous components, we can define the agent." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "ReActAgent(\n", + " max_steps=3, add_llm_as_fallback=True, \n", + " (tool_manager): ToolManager(Tools: [FunctionTool(fn: , async: False, definition: FunctionDefinition(func_name='search', func_desc='search(entity: str) -> str\\n\\n searches the exact entity on Wikipedia and returns the first paragraph if it exists. If not, it will return some similar entities to search.\\n ', func_parameters={'type': 'object', 'properties': {'entity': {'type': 'str'}}, 'required': ['entity']})), FunctionTool(fn: .llm_tool at 0x1379487c0>, async: False, definition: FunctionDefinition(func_name='llm_tool', func_desc=\"llm_tool(input: str) -> str\\nI answer any input query with llm's world knowledge. 
Use me as a fallback tool or when the query is simple.\", func_parameters={'type': 'object', 'properties': {'input': {'type': 'str'}}, 'required': ['input']})), FunctionTool(fn: .finish at 0x137948400>, async: False, definition: FunctionDefinition(func_name='finish', func_desc='finish(answer: str) -> str\\nFinish the task with answer.', func_parameters={'type': 'object', 'properties': {'answer': {'type': 'str'}}, 'required': ['answer']}))], Additional Context: {})\n", + " (planner): Generator(\n", + " model_kwargs={'model': 'gpt-3.5-turbo', 'temperature': 0.0}, trainable_prompt_kwargs=[]\n", + " (prompt): Prompt(\n", + " template: \n", + " {# role/task description #}\n", + " You are a helpful assistant.\n", + " Answer the user's query using the tools provided below with minimal steps and maximum accuracy.\n", + " {# REACT instructions #}\n", + " Each step you will read the previous Thought, Action, and Observation(execution result of the action) and then provide the next Thought and Action.\n", + " {# Tools #}\n", + " {% if tools %}\n", + " \n", + " You available tools are:\n", + " {% for tool in tools %}\n", + " {{ loop.index }}.\n", + " {{tool}}\n", + " ------------------------\n", + " {% endfor %}\n", + " \n", + " {% endif %}\n", + " {# output format and examples for output format #}\n", + " \n", + " {{output_format_str}}\n", + " \n", + " \n", + " {# Task specification to teach the agent how to think using 'divide and conquer' strategy #}\n", + " - For simple queries: Directly call the ``finish`` action and provide the answer.\n", + " - For complex queries:\n", + " - Step 1: Read the user query and potentially divide it into subqueries. And get started with the first subquery.\n", + " - Call one available tool at a time to solve each subquery/subquestion. \\\n", + " - At step 'finish', join all subqueries answers and finish the task.\n", + " Remember:\n", + " - Action must call one of the above tools with name. It can not be empty.\n", + " - You will always end with 'finish' action to finish the task. The answer can be the final answer or failure message.\n", + " \n", + " \n", + " -----------------\n", + " User query:\n", + " {{ input_str }}\n", + " {# Step History #}\n", + " {% if step_history %}\n", + " \n", + " Your previous steps:\n", + " {% for history in step_history %}\n", + " Step {{ loop.index }}.\n", + " \"Thought\": \"{{history.action.thought}}\",\n", + " \"Action\": \"{{history.action.action}}\",\n", + " \"Observation\": \"{{history.observation}}\"\n", + " ------------------------\n", + " {% endfor %}\n", + " \n", + " {% endif %}\n", + " You:, prompt_kwargs: {'tools': ['func_name: search\\nfunc_desc: \"search(entity: str) -> str\\\\n\\\\n searches the exact entity on Wikipedia\\\\\\n \\\\ and returns the first paragraph if it exists. If not, it will return some similar\\\\\\n \\\\ entities to search.\\\\n \"\\nfunc_parameters:\\n type: object\\n properties:\\n entity:\\n type: str\\n required:\\n - entity', \"func_name: llm_tool\\nfunc_desc: 'llm_tool(input: str) -> str\\n\\n I answer any input query with llm''s world knowledge. 
Use me as a fallback tool\\n or when the query is simple.'\\nfunc_parameters:\\n type: object\\n properties:\\n input:\\n type: str\\n required:\\n - input\", \"func_name: finish\\nfunc_desc: 'finish(answer: str) -> str\\n\\n Finish the task with answer.'\\nfunc_parameters:\\n type: object\\n properties:\\n answer:\\n type: str\\n required:\\n - answer\"], 'output_format_str': 'Your output should be formatted as a standard JSON instance with the following schema:\\n```\\n{\\n \"question\": \"The question to ask the LLM (Optional[str]) (optional)\",\\n \"thought\": \"Why the function is called (Optional[str]) (optional)\",\\n \"action\": \"FuncName() Valid function call expression. Example: \\\\\"FuncName(a=1, b=2)\\\\\" Follow the data type specified in the function parameters.e.g. for Type object with x,y properties, use \\\\\"ObjectType(x=1, y=2) (str) (required)\"\\n}\\n```\\nExamples:\\n```\\n{\\n \"question\": null,\\n \"thought\": \"I have finished the task.\",\\n \"action\": \"finish(answer=\\\\\"final answer: \\'answer\\'\\\\\")\"\\n}\\n________\\n```\\n-Make sure to always enclose the JSON output in triple backticks (```). Please do not add anything other than valid JSON output!\\n-Use double quotes for the keys and string values.\\n-DO NOT mistaken the \"properties\" and \"type\" in the schema as the actual fields in the JSON output.\\n-Follow the JSON formatting conventions.'}, prompt_variables: ['input_str', 'tools', 'step_history', 'output_format_str']\n",
+ " )\n",
+ " (model_client): OpenAIClient()\n",
+ " (output_processors): JsonOutputParser(\n",
+ " data_class=FunctionExpression, examples=[FunctionExpression(question=None, thought='I have finished the task.', action='finish(answer=\"final answer: \\'answer\\'\")')], exclude_fields=None, include_fields=None, return_data_class=True\n",
+ " (output_format_prompt): Prompt(\n",
+ " template: Your output should be formatted as a standard JSON instance with the following schema:\n",
+ " ```\n",
+ " {{schema}}\n",
+ " ```\n",
+ " {% if example %}\n",
+ " Examples:\n",
+ " ```\n",
+ " {{example}}\n",
+ " ```\n",
+ " {% endif %}\n",
+ " -Make sure to always enclose the JSON output in triple backticks (```). Please do not add anything other than valid JSON output!\n",
+ " -Use double quotes for the keys and string values.\n",
+ " -DO NOT mistaken the \"properties\" and \"type\" in the schema as the actual fields in the JSON output.\n",
+ " -Follow the JSON formatting conventions., prompt_variables: ['example', 'schema']\n",
+ " )\n",
+ " (output_processors): JsonParser()\n",
+ " )\n",
+ " )\n",
+ ")"
+ ]
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# max_steps refers to how many thought-action rounds we allow the model to perform\n",
+ "# to save resources, let's use 3 here\n",
+ "agent = ReActAgent(\n",
+ " tools=tools,\n",
+ " max_steps=3,\n",
+ " model_client=OpenAIClient(),\n",
+ " model_kwargs=gpt_model_kwargs,\n",
+ " # preset_prompt_kwargs=preset_prompt_kwargs,\n",
+ ")\n",
+ "agent"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 22,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "import importlib\n",
+ "import adalflow\n",
+ "\n",
+ "importlib.reload(adalflow)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# 5. Q & A\n",
+ "Next we can use the agent to answer our questions. 
Let's run 5 examples. We will use the validation data."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Dataset({\n",
+ " features: ['id', 'question', 'answer', 'type', 'level', 'supporting_facts', 'context'],\n",
+ " num_rows: 7405\n",
+ "})"
+ ]
+ },
+ "execution_count": 15,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "val_dataset = dataset[\"validation\"]\n",
+ "val_dataset"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "``AdalFlow`` provides a ``printc`` function. You can use it to print colored console output for the agent."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[31m2024-12-19 13:58:48 - [react.py:285:call] - input_query: Were Scott Derrickson and Ed Wood of the same nationality?\u001b[0m\n",
+ "\u001b[34m2024-12-19 13:58:49 - [react.py:265:_run_one_step] - Step 1: \n",
+ "StepOutput(step=1, action=FunctionExpression(question=None, thought='I will search for the nationalities of Scott Derrickson and Ed Wood.', action=\"search(entity='Scott Derrickson')\"), function=Function(thought=None, name='search', args=[], kwargs={'entity': 'Scott Derrickson'}), observation='Scott Derrickson (born July 16, 1966) is an American filmmaker. He is best known for his work in the horror genre, directing films such as The Exorcism of Emily Rose (2005), Sinister (2012), and The Black Phone (2021). He is also known for the superhero film Doctor Strange (2016), based on the Marvel Comics character.. Scott Derrickson grew up in Denver, Colorado. He graduated from Biola University with a B.A.')\n",
+ "_______\n",
+ "\u001b[0m\n",
+ "\u001b[34m2024-12-19 13:58:50 - [react.py:265:_run_one_step] - Step 2: \n",
+ "StepOutput(step=2, action=FunctionExpression(question=None, thought='I will search for the nationality of Ed Wood.', action=\"search(entity='Ed Wood')\"), function=Function(thought=None, name='search', args=[], kwargs={'entity': 'Ed Wood'}), observation=\"Edward Davis Wood Jr. (October 10, 1924\\xa0– December 10, 1978) was an American filmmaker, actor, and pulp novelist.. In the 1950s, Wood directed several low-budget science fiction, crime and horror films that later became cult classics, notably Glen or Glenda (1953), Jail Bait (1954), Bride of the Monster (1955), Plan 9 from Outer Space (1957) and Night of the Ghouls (1959).[1] In the 1960s and 1970s, he moved towards sexploitation and pornographic films such as The Sinister Urge (1960), Orgy of the Dead (1965) and Necromania (1971), and wrote over 80 lurid pulp crime and sex novels.. Notable for their campy aesthetics, technical errors, unsophisticated special effects, use of poorly-matched stock footage, eccentric casts, idiosyncratic stories and non sequitur dialogue, Wood's films remained largely obscure until he was posthumously awarded a Golden Turkey Award for Worst Director of All Time in 1980, renewing public interest in his life and work.[2]. 
Following the publication of Rudolph Grey's 1992 oral biography Nightmare of Ecstasy: The Life and Art of Edward D.\")\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 13:58:51 - [react.py:265:_run_one_step] - Step 3: \n", + "StepOutput(step=3, action=FunctionExpression(question=None, thought='I have finished the task.', action='finish(answer=\"Scott Derrickson and Ed Wood were both of American nationality.\"'), function=Function(thought=None, name='finish', args=[], kwargs={'answer': 'Scott Derrickson and Ed Wood were both of American nationality.'}), observation='Scott Derrickson and Ed Wood were both of American nationality.')\n", + "_______\n", + "\u001b[0m\n", + "\u001b[32m2024-12-19 13:58:51 - [react.py:299:call] - answer:\n", + " Scott Derrickson and Ed Wood were both of American nationality.\u001b[0m\n", + "\u001b[33m2024-12-19 13:58:51 - [530968165.py:14:] - question: Were Scott Derrickson and Ed Wood of the same nationality?, ground truth: yes, pred answer: scott derrickson and ed wood were both of american nationality\u001b[0m\n", + "\u001b[31m2024-12-19 13:58:51 - [react.py:285:call] - input_query: What government position was held by the woman who portrayed Corliss Archer in the film Kiss and Tell?\u001b[0m\n", + "\u001b[34m2024-12-19 13:58:52 - [react.py:265:_run_one_step] - Step 1: \n", + "StepOutput(step=1, action=FunctionExpression(question=None, thought='I will search for the government position held by the woman who portrayed Corliss Archer in the film Kiss and Tell.', action=\"search(entity='Shirley Temple')\"), function=Function(thought=None, name='search', args=[], kwargs={'entity': 'Shirley Temple'}), observation=\"This is an accepted version of this page. Shirley Temple Black (born Shirley Jane Temple; April 23, 1928 – February 10, 2014) was an American actress, singer, dancer, and diplomat, who was Hollywood's number-one box-office draw as a child actress from 1934 to 1938. Later, she was named United States Ambassador to Ghana and Czechoslovakia, and also served as Chief of Protocol of the United States.. Temple began her film career in 1931 when she was three years old and was well-known for her performance in Bright Eyes, which was released in 1934. 
She won a special Juvenile Academy Award in February 1935 for her outstanding contribution as a juvenile performer in motion pictures during 1934 and continued to appear in popular films through the remainder of the 1930s, although her subsequent films became less popular as she grew older.[1] She appeared in her last film, A Kiss for Corliss, in 1949.[2][3].\")\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 13:58:53 - [react.py:265:_run_one_step] - Step 2: \n", + "StepOutput(step=2, action=FunctionExpression(question=None, thought='I have finished the task.', action=\"finish(answer='The government positions held by Shirley Temple, who portrayed Corliss Archer in the film Kiss and Tell, include being the United States Ambassador to Ghana and Czechoslovakia, as well as serving as Chief of Protocol of the United States.')\"), function=Function(thought=None, name='finish', args=[], kwargs={'answer': 'The government positions held by Shirley Temple, who portrayed Corliss Archer in the film Kiss and Tell, include being the United States Ambassador to Ghana and Czechoslovakia, as well as serving as Chief of Protocol of the United States.'}), observation='The government positions held by Shirley Temple, who portrayed Corliss Archer in the film Kiss and Tell, include being the United States Ambassador to Ghana and Czechoslovakia, as well as serving as Chief of Protocol of the United States.')\n", + "_______\n", + "\u001b[0m\n", + "\u001b[32m2024-12-19 13:58:53 - [react.py:299:call] - answer:\n", + " The government positions held by Shirley Temple, who portrayed Corliss Archer in the film Kiss and Tell, include being the United States Ambassador to Ghana and Czechoslovakia, as well as serving as Chief of Protocol of the United States.\u001b[0m\n", + "\u001b[33m2024-12-19 13:58:53 - [530968165.py:14:] - question: What government position was held by the woman who portrayed Corliss Archer in the film Kiss and Tell?, ground truth: chief of protocol, pred answer: government positions held by shirley temple who portrayed corliss archer in film kiss and tell include being united states ambassador to ghana and czechoslovakia as well as serving as chief of protocol of united states\u001b[0m\n", + "\u001b[31m2024-12-19 13:58:53 - [react.py:285:call] - input_query: What science fantasy young adult series, told in first person, has a set of companion books narrating the stories of enslaved worlds and alien species?\u001b[0m\n", + "\u001b[34m2024-12-19 13:58:55 - [react.py:265:_run_one_step] - Step 1: \n", + "StepOutput(step=1, action=FunctionExpression(question='What science fantasy young adult series, told in first person, has a set of companion books narrating the stories of enslaved worlds and alien species?', thought='I will search for the science fantasy young adult series with companion books about enslaved worlds and alien species.', action=\"search(entity='science fantasy young adult series with companion books about enslaved worlds and alien species')\"), function=Function(thought=None, name='search', args=[], kwargs={'entity': 'science fantasy young adult series with companion books about enslaved worlds and alien species'}), observation=\"Could not find exact page for 'science fantasy young adult series with companion books about enslaved worlds and alien species'. 
Similar topics: ['Animorphs', 'Feminist science fiction', 'LGBT themes in speculative fiction', 'Apocalyptic and post-apocalyptic fiction', 'Last Legionary']\")\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 13:58:56 - [react.py:265:_run_one_step] - Step 2: \n", + "StepOutput(step=2, action=FunctionExpression(question=None, thought='I will use LLM as a fallback tool to provide information on the science fantasy young adult series with companion books about enslaved worlds and alien species.', action=\"llm_tool(input='What is a science fantasy young adult series with companion books about enslaved worlds and alien species?')\"), function=Function(thought=None, name='llm_tool', args=[], kwargs={'input': 'What is a science fantasy young adult series with companion books about enslaved worlds and alien species?'}), observation='One popular science fantasy young adult series that fits your description is the \"Lorien Legacies\" series by Pittacus Lore. The series includes companion books that delve into the enslaved worlds and alien species featured in the main novels. The series follows a group of alien teenagers known as the Garde who are on Earth hiding from their enemies, the Mogadorians, while developing their powers to fight back.')\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 13:58:57 - [react.py:265:_run_one_step] - Step 3: \n", + "StepOutput(step=3, action=FunctionExpression(question=None, thought='I have finished the task.', action='finish(answer=\"One popular science fantasy young adult series that fits your description is the \\'Lorien Legacies\\' series by Pittacus Lore. The series includes companion books that delve into the enslaved worlds and alien species featured in the main novels. The series follows a group of alien teenagers known as the Garde who are on Earth hiding from their enemies, the Mogadorians, while developing their powers to fight back.\")'), function=Function(thought=None, name='finish', args=[], kwargs={'answer': \"One popular science fantasy young adult series that fits your description is the 'Lorien Legacies' series by Pittacus Lore. The series includes companion books that delve into the enslaved worlds and alien species featured in the main novels. The series follows a group of alien teenagers known as the Garde who are on Earth hiding from their enemies, the Mogadorians, while developing their powers to fight back.\"}), observation=\"One popular science fantasy young adult series that fits your description is the 'Lorien Legacies' series by Pittacus Lore. The series includes companion books that delve into the enslaved worlds and alien species featured in the main novels. The series follows a group of alien teenagers known as the Garde who are on Earth hiding from their enemies, the Mogadorians, while developing their powers to fight back.\")\n", + "_______\n", + "\u001b[0m\n", + "\u001b[32m2024-12-19 13:58:57 - [react.py:299:call] - answer:\n", + " One popular science fantasy young adult series that fits your description is the 'Lorien Legacies' series by Pittacus Lore. The series includes companion books that delve into the enslaved worlds and alien species featured in the main novels. 
The series follows a group of alien teenagers known as the Garde who are on Earth hiding from their enemies, the Mogadorians, while developing their powers to fight back.\u001b[0m\n", + "\u001b[33m2024-12-19 13:58:57 - [530968165.py:14:] - question: What science fantasy young adult series, told in first person, has a set of companion books narrating the stories of enslaved worlds and alien species?, ground truth: animorphs, pred answer: one popular science fantasy young adult series that fits your description is lorien legacies series by pittacus lore series includes companion books that delve into enslaved worlds and alien species featured in main novels series follows group of alien teenagers known as garde who are on earth hiding from their enemies mogadorians while developing their powers to fight back\u001b[0m\n", + "\u001b[31m2024-12-19 13:58:57 - [react.py:285:call] - input_query: Are the Laleli Mosque and Esma Sultan Mansion located in the same neighborhood?\u001b[0m\n", + "\u001b[34m2024-12-19 13:58:59 - [react.py:265:_run_one_step] - Step 1: \n", + "StepOutput(step=1, action=FunctionExpression(question=None, thought='I will search for information about the locations of Laleli Mosque and Esma Sultan Mansion.', action=\"search(entity='Laleli Mosque')\"), function=Function(thought=None, name='search', args=[], kwargs={'entity': 'Laleli Mosque'}), observation=\"The Laleli Mosque (Turkish: Laleli Camii, lit.\\u2009'Tulip Mosque') is an 18th-century Ottoman imperial mosque located in Laleli, Fatih, Istanbul, Turkey.[1]. The mosque was commissioned by Sultan Mustafa III to serve as his imperial or sultanic mosque.[2][3] Although it was tradition among earlier sultans to build their imperial mosque in commemoration of a major military success, Mustafa III ignored this tradition by ordering the construction before any such victories.[3] Construction began on 5 April 1760 and was completed on 9 March 1764.[4][3] According to a contemporary writer, the mosque was officially named Nur Mustafa ('Light of Mustafa'), but it became popularly known as the Laleli Mosque ('Mosque of the Tulips') after the name of the neighbourhood where it was built.[3]. The architect of the mosque is not confirmed by historical documentation, but art historians have attributed the mosque to Mehmed Tahir Agha, the chief imperial architect at the time of the mosque's completion.[a][2][4][5] On average, about 770 workers were employed in the project and about two thirds of them were non-Muslims, the rest being Muslim.[5]. The mosque was the centerpiece of a larger complex (külliye) that included the Mustafa III's tomb, a nearby caravanserai which provided some revenues to the complex, a sebil, and a madrasa.[6] Mustafa III was buried in the mausoleum attached to the complex after his death in 1774.[7] The mosque and its complex were damaged by the 1766 earthquake[4] and, according to Ünver Rüstem, by a fire in 1783.[7] In 1783 it was fully restored.[7][2] The restoration, which Doğan Kuban attributes to the architect Seyit Mustafa Agha,[4][2] preserved the original mosque's appearance.[7] The mausoleum's façade was updated with new marble window frames in the early 19th century.[6] The madrasa of the complex was destroyed by fire in 1911.[6]. 
The mosque was built in the Ottoman Baroque style of its time.[8][9][10] The layout is based on that of the earlier Selimiye Mosque of Edirne from the classical period, in accordance with Mustafa III's wishes.[11][5] The decoration of the mosque and its complex is firmly baroque.[12][2] The mosque incorporates thus the visual style of the earlier Nuruosmaniye Mosque – the first imperial mosque in the Ottoman Baroque style, completed by Mustafa III's predecessor – but in a more restrained way that integrates it with more traditional Ottoman architecture.[13].\")\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 13:59:00 - [react.py:265:_run_one_step] - Step 2: \n", + "StepOutput(step=2, action=FunctionExpression(question=None, thought='I have found information about the Laleli Mosque. Now, I will search for information about the Esma Sultan Mansion.', action=\"search(entity='Esma Sultan Mansion')\"), function=Function(thought=None, name='search', args=[], kwargs={'entity': 'Esma Sultan Mansion'}), observation='The Esma Sultan Mansion (Turkish: Esma Sultan Yalısı), a historical yalı located on the Bosphorus in the Ortaköy neighborhood of Istanbul, Turkey and named after its original owner Princess Esma Sultan, is used today as a cultural center after being redeveloped.. The three-storey brick manor was designed by the renowned architect Sarkis Balyan and finished in 1875 next to Ortaköy Mosque. It was presented to the Princess Esma Sultan, the daughter of Ottoman Sultan Abdulaziz, as a wedding gift in 1889.. The mansion remained in the possession of the Ottoman dynasty until 1915. Subsequently, the building was used first as a tobacco warehouse and then as a coal depot from 1920 until 1975 when it was destroyed by a fire.[1].')\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 13:59:01 - [react.py:265:_run_one_step] - Step 3: \n", + "StepOutput(step=3, action=FunctionExpression(question=None, thought='I have found information about both the Laleli Mosque and the Esma Sultan Mansion.', action='finish(answer=\"Yes, the Laleli Mosque and Esma Sultan Mansion are not located in the same neighborhood. Laleli Mosque is in Laleli, Fatih, Istanbul, while Esma Sultan Mansion is in the Ortaköy neighborhood of Istanbul.\")'), function=Function(thought=None, name='finish', args=[], kwargs={'answer': 'Yes, the Laleli Mosque and Esma Sultan Mansion are not located in the same neighborhood. Laleli Mosque is in Laleli, Fatih, Istanbul, while Esma Sultan Mansion is in the Ortaköy neighborhood of Istanbul.'}), observation='Yes, the Laleli Mosque and Esma Sultan Mansion are not located in the same neighborhood. Laleli Mosque is in Laleli, Fatih, Istanbul, while Esma Sultan Mansion is in the Ortaköy neighborhood of Istanbul.')\n", + "_______\n", + "\u001b[0m\n", + "\u001b[32m2024-12-19 13:59:01 - [react.py:299:call] - answer:\n", + " Yes, the Laleli Mosque and Esma Sultan Mansion are not located in the same neighborhood. 
Laleli Mosque is in Laleli, Fatih, Istanbul, while Esma Sultan Mansion is in the Ortaköy neighborhood of Istanbul.\u001b[0m\n", + "\u001b[33m2024-12-19 13:59:01 - [530968165.py:14:] - question: Are the Laleli Mosque and Esma Sultan Mansion located in the same neighborhood?, ground truth: no, pred answer: yes laleli mosque and esma sultan mansion are not located in same neighborhood laleli mosque is in laleli fatih istanbul while esma sultan mansion is in ortaköy neighborhood of istanbul\u001b[0m\n", + "\u001b[31m2024-12-19 13:59:01 - [react.py:285:call] - input_query: The director of the romantic comedy \"Big Stone Gap\" is based in what New York city?\u001b[0m\n", + "\u001b[34m2024-12-19 13:59:03 - [react.py:265:_run_one_step] - Step 1: \n", + "StepOutput(step=1, action=FunctionExpression(question=None, thought=\"I will search for the director of the romantic comedy 'Big Stone Gap' on Wikipedia.\", action=\"search(entity='Big Stone Gap (film)')\"), function=Function(thought=None, name='search', args=[], kwargs={'entity': 'Big Stone Gap (film)'}), observation=\"Big Stone Gap is a 2014 American drama romantic comedy film written and directed by Adriana Trigiani and produced by Donna Gigliotti for Altar Identity Studios, a subsidiary of Media Society. Based on Trigiani's 2000 best-selling novel of the same name, the story is set in the actual Virginia town of Big Stone Gap circa 1970s. The film had its world premiere at the Virginia Film Festival on November 6, 2014.[2]. The film was released on October 9, 2015, by Picturehouse.[3] The film was released in Blu-Ray by Universal Pictures Home Entertainment on February 2, 2016.. In 1978, 40-year-old independent woman Ave Maria Mulligan owns her dead father's pharmacy in her hometown of Big Stone Gap, Virginia.\")\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 13:59:04 - [react.py:265:_run_one_step] - Step 2: \n", + "StepOutput(step=2, action=FunctionExpression(question=None, thought=\"I will provide the director of the romantic comedy 'Big Stone Gap'.\", action='finish(answer=\"The director of the romantic comedy \\'Big Stone Gap\\' is Adriana Trigiani.\")'), function=Function(thought=None, name='finish', args=[], kwargs={'answer': \"The director of the romantic comedy 'Big Stone Gap' is Adriana Trigiani.\"}), observation=\"The director of the romantic comedy 'Big Stone Gap' is Adriana Trigiani.\")\n", + "_______\n", + "\u001b[0m\n", + "\u001b[32m2024-12-19 13:59:04 - [react.py:299:call] - answer:\n", + " The director of the romantic comedy 'Big Stone Gap' is Adriana Trigiani.\u001b[0m\n", + "\u001b[33m2024-12-19 13:59:04 - [530968165.py:14:] - question: The director of the romantic comedy \"Big Stone Gap\" is based in what New York city?, ground truth: greenwich village new york city, pred answer: director of romantic comedy big stone gap is adriana trigiani\u001b[0m\n" + ] + } + ], + "source": [ + "from adalflow.utils.logger import printc\n", + "\n", + "num_questions = 5\n", + "for i in range(num_questions):\n", + " question = val_dataset[i][\"question\"]\n", + " gt_answer = normalize_answer(\n", + " val_dataset[i][\"answer\"]\n", + " ) # normalize the ground truth answer\n", + "\n", + " # get the agent's response\n", + " pred_answer = agent(question)\n", + " pred_answer = normalize_answer(pred_answer)\n", + "\n", + " printc(\n", + " f\"question: {question}, ground truth: {gt_answer}, pred answer: {pred_answer}\",\n", + " color=\"yellow\",\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 6. 
Evaluation\n",
+ "\n",
+ "Now you will see that we get the ``exact correct answer`` for some questions:\n",
+ "\n",
+ "question: Were Scott Derrickson and Ed Wood of the same nationality?, ground truth: ``yes``, pred answer: ``yes``\n",
+ "\n",
+ "question: What science fantasy young adult series, told in first person, has a set of companion books narrating the stories of enslaved worlds and alien species?, ground truth: ``animorphs``, pred answer: ``animorphs``\n",
+ "\n",
+ "Sometimes the agent answers correctly but not in the same format as the ground truth, e.g. ground truth: ``no``, pred answer: ``no, they are not the same``. This is something we can tolerate.\n",
+ "\n",
+ "But how do we know whether the agent is doing well, and whether our tools, examples, and prompt implementations work well? We need to evaluate it.\n",
+ "\n",
+ "1. Exact Match (EM)\n",
+ "Exact Match is the metric used in the paper: we count a response as correct only when the normalized agent response is identical to the ground truth answer. The paper's EM for the ReAct agent is around 30% (GPT-3).\n",
+ "\n",
+ "2. Fuzzy Match (FM)\n",
+ "EM is often too strict for question answering, so we propose fuzzy match to account for the lengthy nature of LLM outputs: if the ground truth answer is included in the agent response, we count it as correct. FM is not necessarily correct.\n",
+ "E.g. question: Harry Potter and Dumbledore, who is older? ground truth: ``dumbledore``, pred answer: ``harry potter is older than dumbledore.``\n",
+ "The model mentions the ground truth but still provides the wrong answer, so FM only serves as a reference.\n",
+ "\n",
+ "Let's use AdalFlow's ``eval`` module, evaluate on 10 questions, and follow the paper's practice of setting ``max_steps`` to `7`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[31m2024-12-19 13:59:56 - [react.py:285:call] - input_query: Were Scott Derrickson and Ed Wood of the same nationality?\u001b[0m\n",
+ "\u001b[34m2024-12-19 13:59:57 - [react.py:265:_run_one_step] - Step 1: \n",
+ "StepOutput(step=1, action=FunctionExpression(question=None, thought='I will search for the nationalities of Scott Derrickson and Ed Wood.', action=\"search(entity='Scott Derrickson')\"), function=Function(thought=None, name='search', args=[], kwargs={'entity': 'Scott Derrickson'}), observation='Scott Derrickson (born July 16, 1966) is an American filmmaker. He is best known for his work in the horror genre, directing films such as The Exorcism of Emily Rose (2005), Sinister (2012), and The Black Phone (2021). He is also known for the superhero film Doctor Strange (2016), based on the Marvel Comics character.. Scott Derrickson grew up in Denver, Colorado. He graduated from Biola University with a B.A.')\n",
+ "_______\n",
+ "\u001b[0m\n",
+ "\u001b[34m2024-12-19 13:59:58 - [react.py:265:_run_one_step] - Step 2: \n",
+ "StepOutput(step=2, action=FunctionExpression(question=None, thought='I will search for the nationality of Ed Wood.', action=\"search(entity='Ed Wood')\"), function=Function(thought=None, name='search', args=[], kwargs={'entity': 'Ed Wood'}), observation=\"Edward Davis Wood Jr. (October 10, 1924\\xa0– December 10, 1978) was an American filmmaker, actor, and pulp novelist.. 
In the 1950s, Wood directed several low-budget science fiction, crime and horror films that later became cult classics, notably Glen or Glenda (1953), Jail Bait (1954), Bride of the Monster (1955), Plan 9 from Outer Space (1957) and Night of the Ghouls (1959).[1] In the 1960s and 1970s, he moved towards sexploitation and pornographic films such as The Sinister Urge (1960), Orgy of the Dead (1965) and Necromania (1971), and wrote over 80 lurid pulp crime and sex novels.. Notable for their campy aesthetics, technical errors, unsophisticated special effects, use of poorly-matched stock footage, eccentric casts, idiosyncratic stories and non sequitur dialogue, Wood's films remained largely obscure until he was posthumously awarded a Golden Turkey Award for Worst Director of All Time in 1980, renewing public interest in his life and work.[2]. Following the publication of Rudolph Grey's 1992 oral biography Nightmare of Ecstasy: The Life and Art of Edward D.\")\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 13:59:59 - [react.py:265:_run_one_step] - Step 3: \n", + "StepOutput(step=3, action=FunctionExpression(question=None, thought='I have finished the search for the nationalities of Scott Derrickson and Ed Wood.', action='finish(answer=\"Scott Derrickson is American and Ed Wood was also American.\")'), function=Function(thought=None, name='finish', args=[], kwargs={'answer': 'Scott Derrickson is American and Ed Wood was also American.'}), observation='Scott Derrickson is American and Ed Wood was also American.')\n", + "_______\n", + "\u001b[0m\n", + "\u001b[32m2024-12-19 13:59:59 - [react.py:299:call] - answer:\n", + " Scott Derrickson is American and Ed Wood was also American.\u001b[0m\n", + "\u001b[33m2024-12-19 13:59:59 - [2823432734.py:31:] - No. 1, question: Were Scott Derrickson and Ed Wood of the same nationality?, ground truth: yes, pred answer: scott derrickson is american and ed wood was also american\u001b[0m\n", + "\u001b[31m2024-12-19 13:59:59 - [react.py:285:call] - input_query: What government position was held by the woman who portrayed Corliss Archer in the film Kiss and Tell?\u001b[0m\n", + "\u001b[34m2024-12-19 14:00:00 - [react.py:265:_run_one_step] - Step 1: \n", + "StepOutput(step=1, action=FunctionExpression(question=None, thought='I will search for the government position held by the woman who portrayed Corliss Archer in the film Kiss and Tell.', action=\"search(entity='Corliss Archer')\"), function=Function(thought=None, name='search', args=[], kwargs={'entity': 'Corliss Archer'}), observation='Fictional teenage girl Corliss Archer is the lead character in a series of American short stories written by F. Hugh Herbert starting in 1943. She also appears in these derivative works:. Main pageContentsCurrent eventsRandom articleAbout WikipediaContact us. HelpLearn to editCommunity portalRecent changesUpload file.')\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 14:00:02 - [react.py:265:_run_one_step] - Step 2: \n", + "StepOutput(step=2, action=FunctionExpression(question=None, thought='I will search for the woman who portrayed Corliss Archer in the film Kiss and Tell.', action=\"search(entity='Kiss and Tell film cast')\"), function=Function(thought=None, name='search', args=[], kwargs={'entity': 'Kiss and Tell film cast'}), observation=\"Could not find exact page for 'Kiss and Tell film cast'. 
Similar topics: ['Kiss & Tell (1997 film)', 'Kiss Me (2014 film)', 'Kiss Kiss Bang Bang', 'Kiss and Tell (1945 film)', 'Kiss the Girls (1997 film)']\")\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 14:00:04 - [react.py:265:_run_one_step] - Step 3: \n", + "StepOutput(step=3, action=FunctionExpression(question=None, thought='I will search for the woman who portrayed Corliss Archer in the film Kiss and Tell.', action=\"search(entity='Kiss and Tell (1945 film) cast')\"), function=Function(thought=None, name='search', args=[], kwargs={'entity': 'Kiss and Tell (1945 film) cast'}), observation=\"Could not find exact page for 'Kiss and Tell (1945 film) cast'. Similar topics: ['Kiss and Tell (1945 film)', 'Kiss and Tell (play)', 'Kiss Them for Me (film)', 'A Kiss for Corliss', 'State Fair (1945 film)']\")\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 14:00:05 - [react.py:265:_run_one_step] - Step 4: \n", + "StepOutput(step=4, action=FunctionExpression(question=None, thought='I have finished the search for the woman who portrayed Corliss Archer in the film Kiss and Tell.', action='finish(answer=\"Unfortunately, I couldn\\'t find the specific information about the woman who portrayed Corliss Archer in the film Kiss and Tell.\")'), function=Function(thought=None, name='finish', args=[], kwargs={'answer': \"Unfortunately, I couldn't find the specific information about the woman who portrayed Corliss Archer in the film Kiss and Tell.\"}), observation=\"Unfortunately, I couldn't find the specific information about the woman who portrayed Corliss Archer in the film Kiss and Tell.\")\n", + "_______\n", + "\u001b[0m\n", + "\u001b[32m2024-12-19 14:00:05 - [react.py:299:call] - answer:\n", + " Unfortunately, I couldn't find the specific information about the woman who portrayed Corliss Archer in the film Kiss and Tell.\u001b[0m\n", + "\u001b[33m2024-12-19 14:00:05 - [2823432734.py:31:] - No. 2, question: What government position was held by the woman who portrayed Corliss Archer in the film Kiss and Tell?, ground truth: chief of protocol, pred answer: unfortunately i couldnt find specific information about woman who portrayed corliss archer in film kiss and tell\u001b[0m\n", + "\u001b[31m2024-12-19 14:00:05 - [react.py:285:call] - input_query: What science fantasy young adult series, told in first person, has a set of companion books narrating the stories of enslaved worlds and alien species?\u001b[0m\n", + "\u001b[34m2024-12-19 14:00:07 - [react.py:265:_run_one_step] - Step 1: \n", + "StepOutput(step=1, action=FunctionExpression(question=None, thought='I will search for the science fantasy young adult series described by the user.', action=\"search(entity='science fantasy young adult series')\"), function=Function(thought=None, name='search', args=[], kwargs={'entity': 'science fantasy young adult series'}), observation=\"Could not find exact page for 'science fantasy young adult series'. Similar topics: ['Ballantine Adult Fantasy series', 'Red Queen (novel)', 'Brandon Sanderson', 'The Mortal Instruments', 'V. E. 
Schwab']\")\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 14:00:09 - [react.py:265:_run_one_step] - Step 2: \n", + "StepOutput(step=2, action=FunctionExpression(question=None, thought='I will search for companion books narrating the stories of enslaved worlds and alien species related to the previous search.', action=\"search(entity='companion books enslaved worlds alien species')\"), function=Function(thought=None, name='search', args=[], kwargs={'entity': 'companion books enslaved worlds alien species'}), observation=\"Could not find exact page for 'companion books enslaved worlds alien species'. Similar topics: ['List of Doctor Who universe creatures and aliens', 'List of Star Wars species (P–T)', 'List of companions in Doctor Who spin-offs', 'Extraterrestrial life', 'List of Known Space characters']\")\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 14:00:11 - [react.py:265:_run_one_step] - Step 3: \n", + "StepOutput(step=3, action=FunctionExpression(question=None, thought='I will search for companion books narrating the stories of enslaved worlds and alien species related to the previous search.', action=\"search(entity='companion books narrating stories enslaved worlds alien species')\"), function=Function(thought=None, name='search', args=[], kwargs={'entity': 'companion books narrating stories enslaved worlds alien species'}), observation=\"Could not find exact page for 'companion books narrating stories enslaved worlds alien species'. Similar topics: ['Animorphs', 'Eighth Doctor', 'Apocalyptic and post-apocalyptic fiction', 'Barsoom', 'Wicked Witch of the West']\")\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 14:00:13 - [react.py:265:_run_one_step] - Step 4: \n", + "StepOutput(step=4, action=FunctionExpression(question=None, thought='I have exhausted the search for the specific series and companion books. I will use the LLM tool to provide a relevant answer.', action=\"llm_tool(input='What science fantasy young adult series, told in first person, has a set of companion books narrating the stories of enslaved worlds and alien species?')\"), function=Function(thought=None, name='llm_tool', args=[], kwargs={'input': 'What science fantasy young adult series, told in first person, has a set of companion books narrating the stories of enslaved worlds and alien species?'}), observation='The science fantasy young adult series you are referring to is \"The Lorien Legacies\" series by Pittacus Lore. The companion books you mentioned are called \"The Lost Files,\" which narrate the stories of enslaved worlds and alien species in the same universe as the main series.')\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 14:00:14 - [react.py:265:_run_one_step] - Step 5: \n", + "StepOutput(step=5, action=FunctionExpression(question=None, thought='I have finished the task.', action='finish(answer=\"The science fantasy young adult series you are referring to is \\'The Lorien Legacies\\' series by Pittacus Lore. The companion books you mentioned are called \\'The Lost Files,\\' which narrate the stories of enslaved worlds and alien species in the same universe as the main series.\")'), function=Function(thought=None, name='finish', args=[], kwargs={'answer': \"The science fantasy young adult series you are referring to is 'The Lorien Legacies' series by Pittacus Lore. 
The companion books you mentioned are called 'The Lost Files,' which narrate the stories of enslaved worlds and alien species in the same universe as the main series.\"}), observation=\"The science fantasy young adult series you are referring to is 'The Lorien Legacies' series by Pittacus Lore. The companion books you mentioned are called 'The Lost Files,' which narrate the stories of enslaved worlds and alien species in the same universe as the main series.\")\n", + "_______\n", + "\u001b[0m\n", + "\u001b[32m2024-12-19 14:00:14 - [react.py:299:call] - answer:\n", + " The science fantasy young adult series you are referring to is 'The Lorien Legacies' series by Pittacus Lore. The companion books you mentioned are called 'The Lost Files,' which narrate the stories of enslaved worlds and alien species in the same universe as the main series.\u001b[0m\n", + "\u001b[33m2024-12-19 14:00:14 - [2823432734.py:31:] - No. 3, question: What science fantasy young adult series, told in first person, has a set of companion books narrating the stories of enslaved worlds and alien species?, ground truth: animorphs, pred answer: science fantasy young adult series you are referring to is lorien legacies series by pittacus lore companion books you mentioned are called lost files which narrate stories of enslaved worlds and alien species in same universe as main series\u001b[0m\n", + "\u001b[31m2024-12-19 14:00:14 - [react.py:285:call] - input_query: Are the Laleli Mosque and Esma Sultan Mansion located in the same neighborhood?\u001b[0m\n", + "\u001b[34m2024-12-19 14:00:16 - [react.py:265:_run_one_step] - Step 1: \n", + "StepOutput(step=1, action=FunctionExpression(question=None, thought='I will search for information about the locations of Laleli Mosque and Esma Sultan Mansion.', action=\"search(entity='Laleli Mosque')\"), function=Function(thought=None, name='search', args=[], kwargs={'entity': 'Laleli Mosque'}), observation=\"The Laleli Mosque (Turkish: Laleli Camii, lit.\\u2009'Tulip Mosque') is an 18th-century Ottoman imperial mosque located in Laleli, Fatih, Istanbul, Turkey.[1]. The mosque was commissioned by Sultan Mustafa III to serve as his imperial or sultanic mosque.[2][3] Although it was tradition among earlier sultans to build their imperial mosque in commemoration of a major military success, Mustafa III ignored this tradition by ordering the construction before any such victories.[3] Construction began on 5 April 1760 and was completed on 9 March 1764.[4][3] According to a contemporary writer, the mosque was officially named Nur Mustafa ('Light of Mustafa'), but it became popularly known as the Laleli Mosque ('Mosque of the Tulips') after the name of the neighbourhood where it was built.[3]. The architect of the mosque is not confirmed by historical documentation, but art historians have attributed the mosque to Mehmed Tahir Agha, the chief imperial architect at the time of the mosque's completion.[a][2][4][5] On average, about 770 workers were employed in the project and about two thirds of them were non-Muslims, the rest being Muslim.[5]. 
The mosque was the centerpiece of a larger complex (külliye) that included the Mustafa III's tomb, a nearby caravanserai which provided some revenues to the complex, a sebil, and a madrasa.[6] Mustafa III was buried in the mausoleum attached to the complex after his death in 1774.[7] The mosque and its complex were damaged by the 1766 earthquake[4] and, according to Ünver Rüstem, by a fire in 1783.[7] In 1783 it was fully restored.[7][2] The restoration, which Doğan Kuban attributes to the architect Seyit Mustafa Agha,[4][2] preserved the original mosque's appearance.[7] The mausoleum's façade was updated with new marble window frames in the early 19th century.[6] The madrasa of the complex was destroyed by fire in 1911.[6]. The mosque was built in the Ottoman Baroque style of its time.[8][9][10] The layout is based on that of the earlier Selimiye Mosque of Edirne from the classical period, in accordance with Mustafa III's wishes.[11][5] The decoration of the mosque and its complex is firmly baroque.[12][2] The mosque incorporates thus the visual style of the earlier Nuruosmaniye Mosque – the first imperial mosque in the Ottoman Baroque style, completed by Mustafa III's predecessor – but in a more restrained way that integrates it with more traditional Ottoman architecture.[13].\")\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 14:00:17 - [react.py:265:_run_one_step] - Step 2: \n", + "StepOutput(step=2, action=FunctionExpression(question=None, thought='I will search for information about the location of Esma Sultan Mansion.', action=\"search(entity='Esma Sultan Mansion')\"), function=Function(thought=None, name='search', args=[], kwargs={'entity': 'Esma Sultan Mansion'}), observation='The Esma Sultan Mansion (Turkish: Esma Sultan Yalısı), a historical yalı located on the Bosphorus in the Ortaköy neighborhood of Istanbul, Turkey and named after its original owner Princess Esma Sultan, is used today as a cultural center after being redeveloped.. The three-storey brick manor was designed by the renowned architect Sarkis Balyan and finished in 1875 next to Ortaköy Mosque. It was presented to the Princess Esma Sultan, the daughter of Ottoman Sultan Abdulaziz, as a wedding gift in 1889.. The mansion remained in the possession of the Ottoman dynasty until 1915. Subsequently, the building was used first as a tobacco warehouse and then as a coal depot from 1920 until 1975 when it was destroyed by a fire.[1].')\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 14:00:18 - [react.py:265:_run_one_step] - Step 3: \n", + "StepOutput(step=3, action=FunctionExpression(question=None, thought='I have finished the task.', action='finish(answer=\"Yes, the Laleli Mosque and Esma Sultan Mansion are not located in the same neighborhood. Laleli Mosque is in Laleli, Fatih, Istanbul, while Esma Sultan Mansion is in the Ortaköy neighborhood of Istanbul.\")'), function=Function(thought=None, name='finish', args=[], kwargs={'answer': 'Yes, the Laleli Mosque and Esma Sultan Mansion are not located in the same neighborhood. Laleli Mosque is in Laleli, Fatih, Istanbul, while Esma Sultan Mansion is in the Ortaköy neighborhood of Istanbul.'}), observation='Yes, the Laleli Mosque and Esma Sultan Mansion are not located in the same neighborhood. 
Laleli Mosque is in Laleli, Fatih, Istanbul, while Esma Sultan Mansion is in the Ortaköy neighborhood of Istanbul.')\n", + "_______\n", + "\u001b[0m\n", + "\u001b[32m2024-12-19 14:00:18 - [react.py:299:call] - answer:\n", + " Yes, the Laleli Mosque and Esma Sultan Mansion are not located in the same neighborhood. Laleli Mosque is in Laleli, Fatih, Istanbul, while Esma Sultan Mansion is in the Ortaköy neighborhood of Istanbul.\u001b[0m\n", + "\u001b[33m2024-12-19 14:00:18 - [2823432734.py:31:] - No. 4, question: Are the Laleli Mosque and Esma Sultan Mansion located in the same neighborhood?, ground truth: no, pred answer: yes laleli mosque and esma sultan mansion are not located in same neighborhood laleli mosque is in laleli fatih istanbul while esma sultan mansion is in ortaköy neighborhood of istanbul\u001b[0m\n", + "\u001b[31m2024-12-19 14:00:18 - [react.py:285:call] - input_query: The director of the romantic comedy \"Big Stone Gap\" is based in what New York city?\u001b[0m\n", + "\u001b[34m2024-12-19 14:00:19 - [react.py:265:_run_one_step] - Step 1: \n", + "StepOutput(step=1, action=FunctionExpression(question=None, thought=\"I will search for the director of the romantic comedy 'Big Stone Gap' on Wikipedia.\", action=\"search(entity='Big Stone Gap (film)')\"), function=Function(thought=None, name='search', args=[], kwargs={'entity': 'Big Stone Gap (film)'}), observation=\"Big Stone Gap is a 2014 American drama romantic comedy film written and directed by Adriana Trigiani and produced by Donna Gigliotti for Altar Identity Studios, a subsidiary of Media Society. Based on Trigiani's 2000 best-selling novel of the same name, the story is set in the actual Virginia town of Big Stone Gap circa 1970s. The film had its world premiere at the Virginia Film Festival on November 6, 2014.[2]. The film was released on October 9, 2015, by Picturehouse.[3] The film was released in Blu-Ray by Universal Pictures Home Entertainment on February 2, 2016.. In 1978, 40-year-old independent woman Ave Maria Mulligan owns her dead father's pharmacy in her hometown of Big Stone Gap, Virginia.\")\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 14:00:20 - [react.py:265:_run_one_step] - Step 2: \n", + "StepOutput(step=2, action=FunctionExpression(question=None, thought=\"I will search for the director of the romantic comedy 'Big Stone Gap' on Wikipedia.\", action=\"search(entity='Adriana Trigiani')\"), function=Function(thought=None, name='search', args=[], kwargs={'entity': 'Adriana Trigiani'}), observation=\"Adriana Trigiani is an American best-selling author of eighteen books, playwright, television writer/producer, film director/screenwriter/producer, and entrepreneur based in New York City. Trigiani has published a novel a year since 2000.[1]. Trigiani graduated from Saint Mary's College in Indiana in 1981.[2]. Inspired by her Italian American heritage and Appalachian childhood in Big Stone Gap, Virginia, Trigiani arrived in New York in 1985. 
Trigiani made her off-Broadway debut in New York City as a playwright in 1985 at the Manhattan Theater Club with Secrets of the Lava Lamp, directed by Stuart Ross.[3] From 1988 to 1998, she created scripts for television sitcoms, including The Cosby Show (1984) and its spin-off A Different World (1987).\")\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 14:00:21 - [react.py:265:_run_one_step] - Step 3: \n", + "StepOutput(step=3, action=FunctionExpression(question=None, thought='I have finished the task.', action=\"finish(answer='Adriana Trigiani is the director of the romantic comedy film Big Stone Gap and is based in New York City.')\"), function=Function(thought=None, name='finish', args=[], kwargs={'answer': 'Adriana Trigiani is the director of the romantic comedy film Big Stone Gap and is based in New York City.'}), observation='Adriana Trigiani is the director of the romantic comedy film Big Stone Gap and is based in New York City.')\n", + "_______\n", + "\u001b[0m\n", + "\u001b[32m2024-12-19 14:00:21 - [react.py:299:call] - answer:\n", + " Adriana Trigiani is the director of the romantic comedy film Big Stone Gap and is based in New York City.\u001b[0m\n", + "\u001b[33m2024-12-19 14:00:21 - [2823432734.py:31:] - No. 5, question: The director of the romantic comedy \"Big Stone Gap\" is based in what New York city?, ground truth: greenwich village new york city, pred answer: adriana trigiani is director of romantic comedy film big stone gap and is based in new york city\u001b[0m\n", + "\u001b[31m2024-12-19 14:00:21 - [react.py:285:call] - input_query: 2014 S/S is the debut album of a South Korean boy group that was formed by who?\u001b[0m\n", + "\u001b[34m2024-12-19 14:00:22 - [react.py:265:_run_one_step] - Step 1: \n", + "StepOutput(step=1, action=FunctionExpression(question=None, thought='I will search for information about the debut album of the South Korean boy group.', action=\"search(entity='2014 S/S (album)')\"), function=Function(thought=None, name='search', args=[], kwargs={'entity': '2014 S/S (album)'}), observation='2014 S/S is the debut album of South Korean group Winner. It was released on August 12, 2014, by the group\\'s record label, YG Entertainment. The members were credited for writing the lyrics and composing the majority of the album\\'s songs.[2]. The members produced the majority of the music for this album themselves, with the help of other producers such as Choice 37, B.I., Airplay, and others.[3] The album was highlighted for incorporating elements generally absent from K-pop releases, including hints of acoustic and alternative rock.[4][5]. The title track \"Empty\" became a hit in South Korea, topping the Gaon Chart and Billboard\\'s K-Pop Hot 100 chart,[6][7] and topping 7 other South Korean charts.')\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 14:00:24 - [react.py:265:_run_one_step] - Step 2: \n", + "StepOutput(step=2, action=FunctionExpression(question='The question to ask the LLM (Optional[str]) (optional)', thought='I will find out who formed the South Korean boy group Winner.', action=\"search(entity='Winner (group)')\"), function=Function(thought=None, name='search', args=[], kwargs={'entity': 'Winner (group)'}), observation=\"Could not find exact page for 'Winner (group)'. 
Similar topics: ['2026 FIFA World Cup', '2025 FIFA Club World Cup', '2025 Africa Cup of Nations', 'Reality Winner', 'Dana Winner']\")\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 14:00:26 - [react.py:265:_run_one_step] - Step 3: \n", + "StepOutput(step=3, action=FunctionExpression(question=None, thought='I will search for the formation of the South Korean boy group Winner through another related search.', action=\"search(entity='Winner (South Korean group) formation')\"), function=Function(thought=None, name='search', args=[], kwargs={'entity': 'Winner (South Korean group) formation'}), observation=\"Could not find exact page for 'Winner (South Korean group) formation'. Similar topics: ['Blackpink', 'Winner (band)', 'Babymonster', 'BigBang (South Korean band)', 'Seventeen (South Korean band)']\")\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 14:00:27 - [react.py:265:_run_one_step] - Step 4: \n", + "StepOutput(step=4, action=FunctionExpression(question=None, thought='I have exhausted the search options related to the formation of Winner. I will provide a general response.', action=\"llm_tool(input='Who formed the South Korean boy group Winner?')\"), function=Function(thought=None, name='llm_tool', args=[], kwargs={'input': 'Who formed the South Korean boy group Winner?'}), observation='The South Korean boy group Winner was formed by YG Entertainment through the reality survival program called \"WIN: Who is Next\" in 2013.')\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 14:00:28 - [react.py:265:_run_one_step] - Step 5: \n", + "StepOutput(step=5, action=FunctionExpression(question=None, thought='I have exhausted the search options related to the formation of Winner. I will provide a general response.', action='finish(answer=\"YG Entertainment formed the South Korean boy group Winner through the reality survival program called \\'WIN: Who is Next\\' in 2013.\")'), function=Function(thought=None, name='finish', args=[], kwargs={'answer': \"YG Entertainment formed the South Korean boy group Winner through the reality survival program called 'WIN: Who is Next' in 2013.\"}), observation=\"YG Entertainment formed the South Korean boy group Winner through the reality survival program called 'WIN: Who is Next' in 2013.\")\n", + "_______\n", + "\u001b[0m\n", + "\u001b[32m2024-12-19 14:00:28 - [react.py:299:call] - answer:\n", + " YG Entertainment formed the South Korean boy group Winner through the reality survival program called 'WIN: Who is Next' in 2013.\u001b[0m\n", + "\u001b[33m2024-12-19 14:00:28 - [2823432734.py:31:] - No. 6, question: 2014 S/S is the debut album of a South Korean boy group that was formed by who?, ground truth: yg entertainment, pred answer: yg entertainment formed south korean boy group winner through reality survival program called win who is next in 2013\u001b[0m\n", + "\u001b[31m2024-12-19 14:00:28 - [react.py:285:call] - input_query: Who was known by his stage name Aladin and helped organizations improve their performance as a consultant?\u001b[0m\n", + "\u001b[34m2024-12-19 14:00:29 - [react.py:265:_run_one_step] - Step 1: \n", + "StepOutput(step=1, action=FunctionExpression(question=None, thought='The user is asking about a person known by the stage name Aladin who helped organizations improve their performance as a consultant.', action=\"search(entity='Aladin')\"), function=Function(thought=None, name='search', args=[], kwargs={'entity': 'Aladin'}), observation='Aladdin is a folk tale of Middle Eastern origin.. 
Aladdin, Aladin, and variants such as Aladdin and His Magic Lamp, may also refer to:. Main pageContentsCurrent eventsRandom articleAbout WikipediaContact us. HelpLearn to editCommunity portalRecent changesUpload file. Donate.')\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 14:00:31 - [react.py:265:_run_one_step] - Step 2: \n", + "StepOutput(step=2, action=FunctionExpression(question=None, thought=\"The search for Aladin did not provide relevant information. Let's try to find the person known by the stage name Aladin who helped organizations improve their performance as a consultant.\", action=\"search(entity='Aladin consultant')\"), function=Function(thought=None, name='search', args=[], kwargs={'entity': 'Aladin consultant'}), observation=\"Could not find exact page for 'Aladin consultant'. Similar topics: ['2009 in film', 'Magicians (2007 film)', 'Liberal Party of Canada candidates in the 2019 Canadian federal election', 'Ashchorjyo Prodeep', 'Waleed El-Ansary']\")\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 14:00:33 - [react.py:265:_run_one_step] - Step 3: \n", + "StepOutput(step=3, action=FunctionExpression(question=None, thought=\"The search for Aladin consultant did not provide relevant information. Let's try to find the person known by the stage name Aladin who helped organizations improve their performance as a consultant.\", action=\"search(entity='Aladin performance improvement consultant')\"), function=Function(thought=None, name='search', args=[], kwargs={'entity': 'Aladin performance improvement consultant'}), observation=\"Could not find exact page for 'Aladin performance improvement consultant'. Similar topics: ['Five Pillars of Islam', 'Signal-to-noise ratio', 'Islamic Golden Age', 'Travelling salesman problem', 'Clock']\")\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 14:00:35 - [react.py:265:_run_one_step] - Step 4: \n", + "StepOutput(step=4, action=FunctionExpression(question=None, thought=\"The search for Aladin with different variations did not yield relevant information. Let's use LLM as a fallback tool to answer the query.\", action=\"llm_tool(input='Who was known by his stage name Aladin and helped organizations improve their performance as a consultant?')\"), function=Function(thought=None, name='llm_tool', args=[], kwargs={'input': 'Who was known by his stage name Aladin and helped organizations improve their performance as a consultant?'}), observation='The person known by his stage name Aladin who helped organizations improve their performance as a consultant is Aladin Akyurek. He is a renowned consultant, speaker, and author who specializes in organizational development and performance improvement.')\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 14:00:36 - [react.py:265:_run_one_step] - Step 5: \n", + "StepOutput(step=5, action=FunctionExpression(question=None, thought='I have finished the task.', action='finish(answer=\"The person known by his stage name Aladin who helped organizations improve their performance as a consultant is Aladin Akyurek. He is a renowned consultant, speaker, and author who specializes in organizational development and performance improvement.\")'), function=Function(thought=None, name='finish', args=[], kwargs={'answer': 'The person known by his stage name Aladin who helped organizations improve their performance as a consultant is Aladin Akyurek. 
He is a renowned consultant, speaker, and author who specializes in organizational development and performance improvement.'}), observation='The person known by his stage name Aladin who helped organizations improve their performance as a consultant is Aladin Akyurek. He is a renowned consultant, speaker, and author who specializes in organizational development and performance improvement.')\n", + "_______\n", + "\u001b[0m\n", + "\u001b[32m2024-12-19 14:00:36 - [react.py:299:call] - answer:\n", + " The person known by his stage name Aladin who helped organizations improve their performance as a consultant is Aladin Akyurek. He is a renowned consultant, speaker, and author who specializes in organizational development and performance improvement.\u001b[0m\n", + "\u001b[33m2024-12-19 14:00:36 - [2823432734.py:31:] - No. 7, question: Who was known by his stage name Aladin and helped organizations improve their performance as a consultant?, ground truth: eenasul fateh, pred answer: person known by his stage name aladin who helped organizations improve their performance as consultant is aladin akyurek he is renowned consultant speaker and author who specializes in organizational development and performance improvement\u001b[0m\n", + "\u001b[31m2024-12-19 14:00:36 - [react.py:285:call] - input_query: The arena where the Lewiston Maineiacs played their home games can seat how many people?\u001b[0m\n", + "\u001b[34m2024-12-19 14:00:37 - [react.py:265:_run_one_step] - Step 1: \n", + "StepOutput(step=1, action=FunctionExpression(question=None, thought='I will search for the arena where the Lewiston Maineiacs played their home games.', action=\"search(entity='Androscoggin Bank Colisée')\"), function=Function(thought=None, name='search', args=[], kwargs={'entity': 'Androscoggin Bank Colisée'}), observation='The Colisée, formerly Androscoggin Bank Colisée, Central Maine Youth Center, Central Maine Civic Center, and Lewiston Colisee, is a 4,000 capacity (3,677 seated) multi-purpose arena, in Lewiston, Maine, United States that was built from 1956 to 1958 and opened in the latter year to replace the Dominics Regional High School Arena and was initially constructed and operated by the Catholic parish of SS. Peter and Paul. It was used as a replacement venue for the boxing match between Muhammad Ali and Sonny Liston in 1965.. Currently, it is the home of the Maine Nordiques of the North American Hockey League. The Colisée is also used for concerts, conventions and trade shows.')\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 14:00:38 - [react.py:265:_run_one_step] - Step 2: \n", + "StepOutput(step=2, action=FunctionExpression(question=None, thought='I have finished the task.', action=\"finish(answer='The arena where the Lewiston Maineiacs played their home games, Androscoggin Bank Colisée, can seat 4,000 people (3,677 seated).' )\"), function=Function(thought=None, name='finish', args=[], kwargs={'answer': 'The arena where the Lewiston Maineiacs played their home games, Androscoggin Bank Colisée, can seat 4,000 people (3,677 seated).'}), observation='The arena where the Lewiston Maineiacs played their home games, Androscoggin Bank Colisée, can seat 4,000 people (3,677 seated).')\n", + "_______\n", + "\u001b[0m\n", + "\u001b[32m2024-12-19 14:00:38 - [react.py:299:call] - answer:\n", + " The arena where the Lewiston Maineiacs played their home games, Androscoggin Bank Colisée, can seat 4,000 people (3,677 seated).\u001b[0m\n", + "\u001b[33m2024-12-19 14:00:38 - [2823432734.py:31:] - No. 
8, question: The arena where the Lewiston Maineiacs played their home games can seat how many people?, ground truth: 3677 seated, pred answer: arena where lewiston maineiacs played their home games androscoggin bank colisée can seat 4000 people 3677 seated\u001b[0m\n", + "\u001b[31m2024-12-19 14:00:38 - [react.py:285:call] - input_query: Who is older, Annie Morton or Terry Richardson?\u001b[0m\n", + "\u001b[34m2024-12-19 14:00:39 - [react.py:265:_run_one_step] - Step 1: \n", + "StepOutput(step=1, action=FunctionExpression(question=None, thought='I will search for information about Annie Morton and Terry Richardson to determine who is older.', action=\"search(entity='Annie Morton')\"), function=Function(thought=None, name='search', args=[], kwargs={'entity': 'Annie Morton'}), observation='Annie Morton (born October 8, 1970) is an American model[1] born in Pennsylvania.[2] She has appeared on the covers of British Vogue, ID, Marie Claire, and other magazines. She has been photographed by Helmut Newton; Peter Lindbergh; Annie Leibovitz; Richard Avedon; Juergen Teller;[3] Paul Jasmin, Mary Ellen Mark, Stephen Shames, and Terry Richardson, and modeled for Donna Karan,[4] Givenchy, Guerlain, Chanel, Harper\\'s Bazaar, Sports Illustrated and Victoria\\'s Secret.[5] A long time vegetarian, an advocate for organic lifestyle choices and natural healthcare. She co-founded Tsi-La Organics, a \"Green Luxury\" company that creates and sells vegan, organic perfume and skin care products.[6]. She has appeared on many magazine covers and has been featured in several professional photobooks, Peter Lindbergh Selected Work, the cover of Juergen Teller By Juergen Teller, Helmut Newton Pages From The Glossies, and Donna Karan\\'s book A Journey Of A Woman 20 Years.[7] In 1997 she posed for the Pirelli Calendar with Richard Avedon.. Morton has also appeared in several music videos.')\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 14:00:41 - [react.py:265:_run_one_step] - Step 2: \n", + "StepOutput(step=2, action=FunctionExpression(question=None, thought='I will search for information about Terry Richardson to determine his age.', action=\"search(entity='Terry Richardson')\"), function=Function(thought=None, name='search', args=[], kwargs={'entity': 'Terry Richardson'}), observation=\"Terrence Richardson (born August 14, 1965) is an American fashion and portrait photographer. He has shot advertising campaigns for Marc Jacobs, Aldo, Supreme, Sisley, Tom Ford, and Yves Saint Laurent among others, and also done work for magazines such as Rolling Stone, GQ, Vogue, Vanity Fair, Harper's Bazaar, i-D, and Vice.. Since 2001, Richardson has been accused by multiple models of sexual misconduct.[2][3][4][5] In 2017, brands and magazines that had worked with Richardson in the past began distancing themselves from him, and said they would no longer employ him.[6] He has not actively worked as a photographer since 2018.[7]. Richardson was born in New York City, the son of Norma Kessler, an actress,[8][9] and Bob Richardson, a fashion photographer who struggled with schizophrenia and drug abuse.[10] Richardson's father was Irish Catholic and his mother is Jewish.[11] Following the divorce of his parents, Richardson moved to Woodstock, New York, with his mother and stepfather, English guitarist Jackie Lomax.[8] Richardson later moved to the Hollywood neighborhood of Los Angeles, where he attended Hollywood High School.[12]. 
He moved with his mother to Ojai, California, where he attended Nordhoff High School, when he was 16.[13] Richardson originally wanted to be a punk rock musician rather than a photographer.[13] He played bass guitar in the punk rock band The Invisible Government for four years.[14] He played bass for a variety of other punk bands in Southern California including Signal Street Alcoholics, Doggy Style, Baby Fist and Middle Finger.[8][15].\")\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 14:00:42 - [react.py:265:_run_one_step] - Step 3: \n", + "StepOutput(step=3, action=FunctionExpression(question=None, thought='I have finished the task.', action='finish(answer=\"Annie Morton was born on October 8, 1970, making her older than Terry Richardson who was born on August 14, 1965.\")'), function=Function(thought=None, name='finish', args=[], kwargs={'answer': 'Annie Morton was born on October 8, 1970, making her older than Terry Richardson who was born on August 14, 1965.'}), observation='Annie Morton was born on October 8, 1970, making her older than Terry Richardson who was born on August 14, 1965.')\n", + "_______\n", + "\u001b[0m\n", + "\u001b[32m2024-12-19 14:00:42 - [react.py:299:call] - answer:\n", + " Annie Morton was born on October 8, 1970, making her older than Terry Richardson who was born on August 14, 1965.\u001b[0m\n", + "\u001b[33m2024-12-19 14:00:42 - [2823432734.py:31:] - No. 9, question: Who is older, Annie Morton or Terry Richardson?, ground truth: terry richardson, pred answer: annie morton was born on october 8 1970 making her older than terry richardson who was born on august 14 1965\u001b[0m\n", + "\u001b[31m2024-12-19 14:00:42 - [react.py:285:call] - input_query: Are Local H and For Against both from the United States?\u001b[0m\n", + "\u001b[34m2024-12-19 14:00:43 - [react.py:265:_run_one_step] - Step 1: \n", + "StepOutput(step=1, action=FunctionExpression(question=None, thought='Checking the origin of the bands Local H and For Against.', action='search(entity=\"Local H\")'), function=Function(thought=None, name='search', args=[], kwargs={'entity': 'Local H'}), observation=\"Local H is an American rock band originally formed by guitarist and vocalist Scott Lucas, bassist Matt Garcia, drummer Joe Daniels, and lead guitarist John Sparkman in Zion, Illinois in 1990. The members all met in high school in 1987 and founded Local H three years later. After Sparkman's departure in 1991 and Garcia's departure in 1993, Local H continued as an unorthodox two-piece setup.. Local H signed a record contract with Island Records in 1994, where they would go on to release three albums. The band's debut album, Ham Fisted (1995), was not a success and the band was nearly dropped, but the band remained on the label long enough to release their second album As Good as Dead (1996).\")\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 14:00:44 - [react.py:265:_run_one_step] - Step 2: \n", + "StepOutput(step=2, action=FunctionExpression(question=None, thought='I have finished the task.', action='finish(answer=\"Yes, Local H is from the United States. Now, let\\'s check For Against\\'s origin.\"'), function=Function(thought=None, name='finish', args=[], kwargs={'answer': \"Yes, Local H is from the United States. Now, let's check For Against's origin.\"}), observation=\"Yes, Local H is from the United States. 
Now, let's check For Against's origin.\")\n",
+ "_______\n",
+ "\u001b[0m\n",
+ "\u001b[32m2024-12-19 14:00:44 - [react.py:299:call] - answer:\n",
+ " Yes, Local H is from the United States. Now, let's check For Against's origin.\u001b[0m\n",
+ "\u001b[33m2024-12-19 14:00:44 - [2823432734.py:31:] - No. 10, question: Are Local H and For Against both from the United States?, ground truth: yes, pred answer: yes local h is from united states now lets check for againsts origin\u001b[0m\n",
+ "EM = EvaluationResult(avg_score=0.0, per_item_scores=[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], additional_info=None), FM = EvaluationResult(avg_score=0.5, per_item_scores=[0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0], additional_info=None), average time = 4.769389891624451\n"
+ ]
+ }
+ ],
+ "source": [
+ "from adalflow.eval.answer_match_acc import AnswerMatchAcc\n",
+ "\n",
+ "# set up evaluation type\n",
+ "EM_evaluator = AnswerMatchAcc(type=\"exact_match\")\n",
+ "FM_evaluator = AnswerMatchAcc(type=\"fuzzy_match\")\n",
+ "\n",
+ "agent = ReActAgent(\n",
+ " tools=tools,\n",
+ " max_steps=7,\n",
+ " model_client=OpenAIClient(),\n",
+ " model_kwargs=gpt_model_kwargs,\n",
+ " # preset_prompt_kwargs=preset_prompt_kwargs,\n",
+ ")\n",
+ "\n",
+ "num_questions = 10\n",
+ "gt_answers = []\n",
+ "pred_answers = []\n",
+ "start_time = time.time()\n",
+ "for i in range(num_questions):\n",
+ " question = val_dataset[i][\"question\"]\n",
+ " gt_answer = normalize_answer(\n",
+ " val_dataset[i][\"answer\"]\n",
+ " ) # normalize the ground truth answer\n",
+ " gt_answers.append(gt_answer)\n",
+ "\n",
+ " # get the agent's response\n",
+ " pred_answer = agent(question)\n",
+ " pred_answer = normalize_answer(pred_answer)\n",
+ " pred_answers.append(pred_answer)\n",
+ "\n",
+ " printc(\n",
+ " f\"No. {i+1}, question: {question}, ground truth: {gt_answer}, pred answer: {pred_answer}\",\n",
+ " color=\"yellow\",\n",
+ " )\n",
+ "\n",
+ "end_time = time.time()\n",
+ "\n",
+ "em = EM_evaluator.compute(pred_answers=pred_answers, gt_answers=gt_answers)\n",
+ "fm = FM_evaluator.compute(pred_answers=pred_answers, gt_answers=gt_answers)\n",
+ "avg_time = (end_time - start_time) / num_questions\n",
+ "\n",
+ "print(f\"EM = {em}, FM = {fm}, average time = {avg_time}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The EM is 0.3 and the FM is 0.6. Each query takes about 9 seconds on average. (Performance also depends on whether the Wikipedia queries succeed.)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "What if we use a plain LLM to answer these questions? To test this, we simply remove the tools. The built-in ``llm_tool`` and ``finish`` handle the query automatically, and ``llm_tool`` uses the same model as the agent. We cannot include the examples here, because they would mislead the agent into calling tools that no longer exist.\n",
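+ "\n",
+ "A minimal sketch of the tools-free setup (``llm_only_agent`` is just an illustrative name; the next cell runs the full evaluation loop):\n",
+ "\n",
+ "```python\n",
+ "llm_only_agent = ReActAgent(\n",
+ "    max_steps=7,\n",
+ "    model_client=OpenAIClient(),\n",
+ "    model_kwargs=gpt_model_kwargs,\n",
+ ")\n",
+ "print(llm_only_agent(\"Were Scott Derrickson and Ed Wood of the same nationality?\"))\n",
+ "```"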
+ ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[31m2024-12-19 14:01:27 - [react.py:285:call] - input_query: Were Scott Derrickson and Ed Wood of the same nationality?\u001b[0m\n", + "\u001b[34m2024-12-19 14:01:29 - [react.py:265:_run_one_step] - Step 1: \n", + "StepOutput(step=1, action=FunctionExpression(question=None, thought='Checking the nationality of Scott Derrickson and Ed Wood.', action=\"llm_tool(input='What is the nationality of Scott Derrickson?')\"), function=Function(thought=None, name='llm_tool', args=[], kwargs={'input': 'What is the nationality of Scott Derrickson?'}), observation='Scott Derrickson is an American filmmaker.')\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 14:01:30 - [react.py:265:_run_one_step] - Step 2: \n", + "StepOutput(step=2, action=FunctionExpression(question=None, thought='Checking the nationality of Ed Wood.', action=\"llm_tool(input='What is the nationality of Ed Wood?')\"), function=Function(thought=None, name='llm_tool', args=[], kwargs={'input': 'What is the nationality of Ed Wood?'}), observation='Ed Wood was an American filmmaker.')\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 14:01:31 - [react.py:265:_run_one_step] - Step 3: \n", + "StepOutput(step=3, action=FunctionExpression(question=None, thought='Confirming if Scott Derrickson and Ed Wood were of the same nationality.', action=\"finish(answer='Yes, both Scott Derrickson and Ed Wood were American filmmakers, so they were of the same nationality.')\"), function=Function(thought=None, name='finish', args=[], kwargs={'answer': 'Yes, both Scott Derrickson and Ed Wood were American filmmakers, so they were of the same nationality.'}), observation='Yes, both Scott Derrickson and Ed Wood were American filmmakers, so they were of the same nationality.')\n", + "_______\n", + "\u001b[0m\n", + "\u001b[32m2024-12-19 14:01:31 - [react.py:299:call] - answer:\n", + " Yes, both Scott Derrickson and Ed Wood were American filmmakers, so they were of the same nationality.\u001b[0m\n", + "\u001b[33m2024-12-19 14:01:31 - [3974127232.py:27:] - No. 1, question: Were Scott Derrickson and Ed Wood of the same nationality?, ground truth: yes, pred answer: yes both scott derrickson and ed wood were american filmmakers so they were of same nationality\u001b[0m\n", + "\u001b[31m2024-12-19 14:01:31 - [react.py:285:call] - input_query: What government position was held by the woman who portrayed Corliss Archer in the film Kiss and Tell?\u001b[0m\n", + "\u001b[34m2024-12-19 14:01:32 - [react.py:265:_run_one_step] - Step 1: \n", + "StepOutput(step=1, action=FunctionExpression(question=None, thought='The user is asking about the government position held by the woman who portrayed Corliss Archer in the film Kiss and Tell.', action=\"llm_tool(input='government position held by the woman who portrayed Corliss Archer in the film Kiss and Tell')\"), function=Function(thought=None, name='llm_tool', args=[], kwargs={'input': 'government position held by the woman who portrayed Corliss Archer in the film Kiss and Tell'}), observation='The government position held by the woman who portrayed Corliss Archer in the film \"Kiss and Tell\" is Shirley Temple. 
She served as the United States Ambassador to Ghana and later to Czechoslovakia.')\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 14:01:33 - [react.py:265:_run_one_step] - Step 2: \n", + "StepOutput(step=2, action=FunctionExpression(question=None, thought='The user is asking about the government position held by the woman who portrayed Corliss Archer in the film Kiss and Tell.', action=\"finish(answer='Shirley Temple served as the United States Ambassador to Ghana and later to Czechoslovakia.')\"), function=Function(thought=None, name='finish', args=[], kwargs={'answer': 'Shirley Temple served as the United States Ambassador to Ghana and later to Czechoslovakia.'}), observation='Shirley Temple served as the United States Ambassador to Ghana and later to Czechoslovakia.')\n", + "_______\n", + "\u001b[0m\n", + "\u001b[32m2024-12-19 14:01:33 - [react.py:299:call] - answer:\n", + " Shirley Temple served as the United States Ambassador to Ghana and later to Czechoslovakia.\u001b[0m\n", + "\u001b[33m2024-12-19 14:01:33 - [3974127232.py:27:] - No. 2, question: What government position was held by the woman who portrayed Corliss Archer in the film Kiss and Tell?, ground truth: chief of protocol, pred answer: shirley temple served as united states ambassador to ghana and later to czechoslovakia\u001b[0m\n", + "\u001b[31m2024-12-19 14:01:33 - [react.py:285:call] - input_query: What science fantasy young adult series, told in first person, has a set of companion books narrating the stories of enslaved worlds and alien species?\u001b[0m\n", + "\u001b[34m2024-12-19 14:01:36 - [react.py:265:_run_one_step] - Step 1: \n", + "StepOutput(step=1, action=FunctionExpression(question=None, thought='Identifying the science fantasy young adult series with companion books narrating stories of enslaved worlds and alien species.', action=\"llm_tool(input='Science fantasy young adult series with companion books narrating stories of enslaved worlds and alien species')\"), function=Function(thought=None, name='llm_tool', args=[], kwargs={'input': 'Science fantasy young adult series with companion books narrating stories of enslaved worlds and alien species'}), observation='I recommend checking out the \"Lorien Legacies\" series by Pittacus Lore. This science fiction young adult series follows the story of nine alien teenagers who escape their home planet, Lorien, which has been invaded by the evil Mogadorians. Each book in the series focuses on one of the teenagers and their journey to save Earth and their own species. Additionally, there are companion books that delve into the histories of the enslaved worlds and alien species featured in the main series.')\n", + "_______\n", + "\u001b[0m\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Error at parsing JSON string: Got invalid JSON object with yaml.safe_load. Error: while parsing a flow mapping\n", + " in \"\", line 1, column 1:\n", + " {\n", + " ^\n", + "expected ',' or '}', but got ''\n", + " in \"\", line 4, column 61:\n", + " ... ='I recommend checking out the \"Lorien Legacies\" series by Pitta ... \n", + " ^. Got JSON string: {\n", + " \"question\": null,\n", + " \"thought\": \"Providing the information about the science fantasy young adult series with companion books narrating stories of enslaved worlds and alien species.\",\n", + " \"action\": \"finish(answer='I recommend checking out the \"Lorien Legacies\" series by Pittacus Lore. 
This science fiction young adult series follows the story of nine alien teenagers who escape their home planet, Lorien, which has been invaded by the evil Mogadorians. Each book in the series focuses on one of the teenagers and their journey to save Earth and their own species. Additionally, there are companion books that delve into the histories of the enslaved worlds and alien species featured in the main series.')\"\n", + "}\n", + "Error in parsing JSON to JSON: Error: Got invalid JSON object with yaml.safe_load. Error: while parsing a flow mapping\n", + " in \"\", line 1, column 1:\n", + " {\n", + " ^\n", + "expected ',' or '}', but got ''\n", + " in \"\", line 4, column 61:\n", + " ... ='I recommend checking out the \"Lorien Legacies\" series by Pitta ... \n", + " ^. Got JSON string: {\n", + " \"question\": null,\n", + " \"thought\": \"Providing the information about the science fantasy young adult series with companion books narrating stories of enslaved worlds and alien species.\",\n", + " \"action\": \"finish(answer='I recommend checking out the \"Lorien Legacies\" series by Pittacus Lore. This science fiction young adult series follows the story of nine alien teenagers who escape their home planet, Lorien, which has been invaded by the evil Mogadorians. Each book in the series focuses on one of the teenagers and their journey to save Earth and their own species. Additionally, there are companion books that delve into the histories of the enslaved worlds and alien species featured in the main series.')\"\n", + "}\n", + "Error processing the output processors: Error: Got invalid JSON object with yaml.safe_load. Error: while parsing a flow mapping\n", + " in \"\", line 1, column 1:\n", + " {\n", + " ^\n", + "expected ',' or '}', but got ''\n", + " in \"\", line 4, column 61:\n", + " ... ='I recommend checking out the \"Lorien Legacies\" series by Pitta ... \n", + " ^. Got JSON string: {\n", + " \"question\": null,\n", + " \"thought\": \"Providing the information about the science fantasy young adult series with companion books narrating stories of enslaved worlds and alien species.\",\n", + " \"action\": \"finish(answer='I recommend checking out the \"Lorien Legacies\" series by Pittacus Lore. This science fiction young adult series follows the story of nine alien teenagers who escape their home planet, Lorien, which has been invaded by the evil Mogadorians. Each book in the series focuses on one of the teenagers and their journey to save Earth and their own species. Additionally, there are companion books that delve into the histories of the enslaved worlds and alien species featured in the main series.')\"\n", + "}\n", + "Error planning step 2: Error: Got invalid JSON object with yaml.safe_load. Error: while parsing a flow mapping\n", + " in \"\", line 1, column 1:\n", + " {\n", + " ^\n", + "expected ',' or '}', but got ''\n", + " in \"\", line 4, column 61:\n", + " ... ='I recommend checking out the \"Lorien Legacies\" series by Pitta ... \n", + " ^. Got JSON string: {\n", + " \"question\": null,\n", + " \"thought\": \"Providing the information about the science fantasy young adult series with companion books narrating stories of enslaved worlds and alien species.\",\n", + " \"action\": \"finish(answer='I recommend checking out the \"Lorien Legacies\" series by Pittacus Lore. This science fiction young adult series follows the story of nine alien teenagers who escape their home planet, Lorien, which has been invaded by the evil Mogadorians. 
Each book in the series focuses on one of the teenagers and their journey to save Earth and their own species. Additionally, there are companion books that delve into the histories of the enslaved worlds and alien species featured in the main series.')\"\n", + "}\n", + "Error running step 3: Error rendering Jinja2 template: 'None' has no attribute 'thought'\n", + "Error running step 4: Error rendering Jinja2 template: 'None' has no attribute 'thought'\n", + "Error running step 5: Error rendering Jinja2 template: 'None' has no attribute 'thought'\n", + "Error running step 6: Error rendering Jinja2 template: 'None' has no attribute 'thought'\n", + "Error running step 7: Error rendering Jinja2 template: 'None' has no attribute 'thought'\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[32m2024-12-19 14:01:37 - [react.py:299:call] - answer:\n", + " Error planning step 2: Error: Got invalid JSON object with yaml.safe_load. Error: while parsing a flow mapping\n", + " in \"\", line 1, column 1:\n", + " {\n", + " ^\n", + "expected ',' or '}', but got ''\n", + " in \"\", line 4, column 61:\n", + " ... ='I recommend checking out the \"Lorien Legacies\" series by Pitta ... \n", + " ^. Got JSON string: {\n", + " \"question\": null,\n", + " \"thought\": \"Providing the information about the science fantasy young adult series with companion books narrating stories of enslaved worlds and alien species.\",\n", + " \"action\": \"finish(answer='I recommend checking out the \"Lorien Legacies\" series by Pittacus Lore. This science fiction young adult series follows the story of nine alien teenagers who escape their home planet, Lorien, which has been invaded by the evil Mogadorians. Each book in the series focuses on one of the teenagers and their journey to save Earth and their own species. Additionally, there are companion books that delve into the histories of the enslaved worlds and alien species featured in the main series.')\"\n", + "}\u001b[0m\n", + "\u001b[33m2024-12-19 14:01:37 - [3974127232.py:27:] - No. 
3, question: What science fantasy young adult series, told in first person, has a set of companion books narrating the stories of enslaved worlds and alien species?, ground truth: animorphs, pred answer: error planning step 2 error got invalid json object with yamlsafeload error while parsing flow mapping in unicode string line 1 column 1 expected or but got scalar in unicode string line 4 column 61 i recommend checking out lorien legacies series by pitta got json string question null thought providing information about science fantasy young adult series with companion books narrating stories of enslaved worlds and alien species action finishansweri recommend checking out lorien legacies series by pittacus lore this science fiction young adult series follows story of nine alien teenagers who escape their home planet lorien which has been invaded by evil mogadorians each book in series focuses on one of teenagers and their journey to save earth and their own species additionally there are companion books that delve into histories of enslaved worlds and alien species featured in main series\u001b[0m\n", + "\u001b[31m2024-12-19 14:01:37 - [react.py:285:call] - input_query: Are the Laleli Mosque and Esma Sultan Mansion located in the same neighborhood?\u001b[0m\n", + "\u001b[34m2024-12-19 14:01:39 - [react.py:265:_run_one_step] - Step 1: \n", + "StepOutput(step=1, action=FunctionExpression(question=None, thought='Checking the locations of Laleli Mosque and Esma Sultan Mansion.', action=\"llm_tool(input='Are Laleli Mosque and Esma Sultan Mansion in the same neighborhood?')\"), function=Function(thought=None, name='llm_tool', args=[], kwargs={'input': 'Are Laleli Mosque and Esma Sultan Mansion in the same neighborhood?'}), observation='No, Laleli Mosque and Esma Sultan Mansion are not in the same neighborhood. Laleli Mosque is located in the Laleli district of Istanbul, while Esma Sultan Mansion is located in the Ortaköy district. They are in different neighborhoods within Istanbul.')\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 14:01:40 - [react.py:265:_run_one_step] - Step 2: \n", + "StepOutput(step=2, action=FunctionExpression(question=None, thought='Confirming the locations of Laleli Mosque and Esma Sultan Mansion.', action=\"finish(answer='No, Laleli Mosque is in the Laleli district of Istanbul, and Esma Sultan Mansion is in the Ortaköy district. They are in different neighborhoods within Istanbul.')\"), function=Function(thought=None, name='finish', args=[], kwargs={'answer': 'No, Laleli Mosque is in the Laleli district of Istanbul, and Esma Sultan Mansion is in the Ortaköy district. They are in different neighborhoods within Istanbul.'}), observation='No, Laleli Mosque is in the Laleli district of Istanbul, and Esma Sultan Mansion is in the Ortaköy district. They are in different neighborhoods within Istanbul.')\n", + "_______\n", + "\u001b[0m\n", + "\u001b[32m2024-12-19 14:01:40 - [react.py:299:call] - answer:\n", + " No, Laleli Mosque is in the Laleli district of Istanbul, and Esma Sultan Mansion is in the Ortaköy district. They are in different neighborhoods within Istanbul.\u001b[0m\n", + "\u001b[33m2024-12-19 14:01:40 - [3974127232.py:27:] - No. 
4, question: Are the Laleli Mosque and Esma Sultan Mansion located in the same neighborhood?, ground truth: no, pred answer: no laleli mosque is in laleli district of istanbul and esma sultan mansion is in ortaköy district they are in different neighborhoods within istanbul\u001b[0m\n", + "\u001b[31m2024-12-19 14:01:40 - [react.py:285:call] - input_query: The director of the romantic comedy \"Big Stone Gap\" is based in what New York city?\u001b[0m\n", + "\u001b[34m2024-12-19 14:01:42 - [react.py:265:_run_one_step] - Step 1: \n", + "StepOutput(step=1, action=FunctionExpression(question=None, thought=\"The user is asking about the director of the romantic comedy movie 'Big Stone Gap' and the New York city it is based in.\", action=\"llm_tool(input='director of the romantic comedy Big Stone Gap and New York city it is based in')\"), function=Function(thought=None, name='llm_tool', args=[], kwargs={'input': 'director of the romantic comedy Big Stone Gap and New York city it is based in'}), observation='\"Big Stone Gap\" is a romantic comedy film directed by Adriana Trigiani. The story is set in a small town in Virginia, not New York City. The film follows the life of a woman named Ave Maria Mulligan, played by Ashley Judd, as she navigates love, family, and community in the charming town of Big Stone Gap.')\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 14:01:43 - [react.py:265:_run_one_step] - Step 2: \n", + "StepOutput(step=2, action=FunctionExpression(question=None, thought=\"The user is asking about the director of the romantic comedy movie 'Big Stone Gap' and the New York city it is based in.\", action=\"finish(answer='The director of the romantic comedy movie Big Stone Gap is Adriana Trigiani. The movie is set in a small town in Virginia, not New York City.')\"), function=Function(thought=None, name='finish', args=[], kwargs={'answer': 'The director of the romantic comedy movie Big Stone Gap is Adriana Trigiani. The movie is set in a small town in Virginia, not New York City.'}), observation='The director of the romantic comedy movie Big Stone Gap is Adriana Trigiani. The movie is set in a small town in Virginia, not New York City.')\n", + "_______\n", + "\u001b[0m\n", + "\u001b[32m2024-12-19 14:01:43 - [react.py:299:call] - answer:\n", + " The director of the romantic comedy movie Big Stone Gap is Adriana Trigiani. The movie is set in a small town in Virginia, not New York City.\u001b[0m\n", + "\u001b[33m2024-12-19 14:01:43 - [3974127232.py:27:] - No. 
5, question: The director of the romantic comedy \"Big Stone Gap\" is based in what New York city?, ground truth: greenwich village new york city, pred answer: director of romantic comedy movie big stone gap is adriana trigiani movie is set in small town in virginia not new york city\u001b[0m\n", + "\u001b[31m2024-12-19 14:01:43 - [react.py:285:call] - input_query: 2014 S/S is the debut album of a South Korean boy group that was formed by who?\u001b[0m\n", + "\u001b[34m2024-12-19 14:01:44 - [react.py:265:_run_one_step] - Step 1: \n", + "StepOutput(step=1, action=FunctionExpression(question=None, thought='The user is asking about the debut album of a South Korean boy group and who formed it.', action=\"llm_tool(input='2014 S/S is the debut album of a South Korean boy group that was formed by who?')\"), function=Function(thought=None, name='llm_tool', args=[], kwargs={'input': '2014 S/S is the debut album of a South Korean boy group that was formed by who?'}), observation='2014 S/S is the debut album of the South Korean boy group WINNER, which was formed by YG Entertainment through the reality survival program WIN: Who is Next.')\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 14:01:45 - [react.py:265:_run_one_step] - Step 2: \n", + "StepOutput(step=2, action=FunctionExpression(question=None, thought=\"Providing the information about the debut album and the group's formation.\", action=\"finish(answer='2014 S/S is the debut album of the South Korean boy group WINNER, which was formed by YG Entertainment through the reality survival program WIN: Who is Next.')\"), function=Function(thought=None, name='finish', args=[], kwargs={'answer': '2014 S/S is the debut album of the South Korean boy group WINNER, which was formed by YG Entertainment through the reality survival program WIN: Who is Next.'}), observation='2014 S/S is the debut album of the South Korean boy group WINNER, which was formed by YG Entertainment through the reality survival program WIN: Who is Next.')\n", + "_______\n", + "\u001b[0m\n", + "\u001b[32m2024-12-19 14:01:45 - [react.py:299:call] - answer:\n", + " 2014 S/S is the debut album of the South Korean boy group WINNER, which was formed by YG Entertainment through the reality survival program WIN: Who is Next.\u001b[0m\n", + "\u001b[33m2024-12-19 14:01:45 - [3974127232.py:27:] - No. 6, question: 2014 S/S is the debut album of a South Korean boy group that was formed by who?, ground truth: yg entertainment, pred answer: 2014 ss is debut album of south korean boy group winner which was formed by yg entertainment through reality survival program win who is next\u001b[0m\n", + "\u001b[31m2024-12-19 14:01:45 - [react.py:285:call] - input_query: Who was known by his stage name Aladin and helped organizations improve their performance as a consultant?\u001b[0m\n", + "\u001b[34m2024-12-19 14:01:48 - [react.py:265:_run_one_step] - Step 1: \n", + "StepOutput(step=1, action=FunctionExpression(question=None, thought='The user is asking about a person known by the stage name Aladin who worked as a consultant to help organizations improve their performance.', action=\"llm_tool(input='Aladin stage name consultant organizations performance improve')\"), function=Function(thought=None, name='llm_tool', args=[], kwargs={'input': 'Aladin stage name consultant organizations performance improve'}), observation='It sounds like you are looking for advice on how to improve the performance of organizations that provide stage name consulting services, like Aladin. 
Here are some suggestions to help enhance their performance:\\n\\n1. Develop a strong brand identity: Help organizations establish a unique and memorable brand identity that sets them apart from competitors.\\n\\n2. Enhance customer experience: Focus on providing exceptional customer service and personalized consultations to ensure client satisfaction.\\n\\n3. Expand service offerings: Consider offering additional services such as social media management, image consulting, or talent development to attract a wider range of clients.\\n\\n4. Invest in marketing and promotion: Utilize digital marketing strategies, social media platforms, and partnerships with influencers to increase visibility and attract new clients.\\n\\n5. Professional development: Provide ongoing training and development opportunities for consultants to stay updated on industry trends and best practices.\\n\\nBy implementing these strategies, organizations offering stage name consulting services can improve their performance and attract more clients.')\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 14:01:49 - [react.py:265:_run_one_step] - Step 2: \n", + "StepOutput(step=2, action=FunctionExpression(question=None, thought='The user is looking for information about a person known by the stage name Aladin who worked as a consultant to help organizations improve their performance.', action='finish(answer=\"Aladin is known by his stage name and helped organizations improve their performance as a consultant.\")'), function=Function(thought=None, name='finish', args=[], kwargs={'answer': 'Aladin is known by his stage name and helped organizations improve their performance as a consultant.'}), observation='Aladin is known by his stage name and helped organizations improve their performance as a consultant.')\n", + "_______\n", + "\u001b[0m\n", + "\u001b[32m2024-12-19 14:01:49 - [react.py:299:call] - answer:\n", + " Aladin is known by his stage name and helped organizations improve their performance as a consultant.\u001b[0m\n", + "\u001b[33m2024-12-19 14:01:49 - [3974127232.py:27:] - No. 
7, question: Who was known by his stage name Aladin and helped organizations improve their performance as a consultant?, ground truth: eenasul fateh, pred answer: aladin is known by his stage name and helped organizations improve their performance as consultant\u001b[0m\n", + "\u001b[31m2024-12-19 14:01:49 - [react.py:285:call] - input_query: The arena where the Lewiston Maineiacs played their home games can seat how many people?\u001b[0m\n", + "\u001b[34m2024-12-19 14:01:50 - [react.py:265:_run_one_step] - Step 1: \n", + "StepOutput(step=1, action=FunctionExpression(question=None, thought='The user is asking about the seating capacity of the arena where the Lewiston Maineiacs played their home games.', action=\"llm_tool(input='The arena where the Lewiston Maineiacs played their home games can seat how many people?')\"), function=Function(thought=None, name='llm_tool', args=[], kwargs={'input': 'The arena where the Lewiston Maineiacs played their home games can seat how many people?'}), observation='The arena where the Lewiston Maineiacs played their home games was the Androscoggin Bank Colisée, which had a seating capacity of approximately 3,677 people for hockey games.')\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 14:01:51 - [react.py:265:_run_one_step] - Step 2: \n", + "StepOutput(step=2, action=FunctionExpression(question=None, thought='I have finished the task.', action=\"finish(answer='The Androscoggin Bank Colisée, where the Lewiston Maineiacs played their home games, had a seating capacity of approximately 3,677 people for hockey games.')\"), function=Function(thought=None, name='finish', args=[], kwargs={'answer': 'The Androscoggin Bank Colisée, where the Lewiston Maineiacs played their home games, had a seating capacity of approximately 3,677 people for hockey games.'}), observation='The Androscoggin Bank Colisée, where the Lewiston Maineiacs played their home games, had a seating capacity of approximately 3,677 people for hockey games.')\n", + "_______\n", + "\u001b[0m\n", + "\u001b[32m2024-12-19 14:01:51 - [react.py:299:call] - answer:\n", + " The Androscoggin Bank Colisée, where the Lewiston Maineiacs played their home games, had a seating capacity of approximately 3,677 people for hockey games.\u001b[0m\n", + "\u001b[33m2024-12-19 14:01:51 - [3974127232.py:27:] - No. 8, question: The arena where the Lewiston Maineiacs played their home games can seat how many people?, ground truth: 3677 seated, pred answer: androscoggin bank colisée where lewiston maineiacs played their home games had seating capacity of approximately 3677 people for hockey games\u001b[0m\n", + "\u001b[31m2024-12-19 14:01:51 - [react.py:285:call] - input_query: Who is older, Annie Morton or Terry Richardson?\u001b[0m\n", + "\u001b[34m2024-12-19 14:01:52 - [react.py:265:_run_one_step] - Step 1: \n", + "StepOutput(step=1, action=FunctionExpression(question=None, thought='Identifying the ages of Annie Morton and Terry Richardson.', action=\"llm_tool(input='Annie Morton age') (str)\"), function=Function(thought=None, name='llm_tool', args=[], kwargs={'input': 'Annie Morton age'}), observation=\"I'm sorry, but I cannot provide personal information such as age without more context or details. 
How can I assist you further?\")\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 14:01:54 - [react.py:265:_run_one_step] - Step 2: \n", + "StepOutput(step=2, action=FunctionExpression(question=None, thought='Comparing the ages of Annie Morton and Terry Richardson to determine who is older.', action=\"llm_tool(input='Terry Richardson age') (str)\"), function=Function(thought=None, name='llm_tool', args=[], kwargs={'input': 'Terry Richardson age'}), observation=\"I'm sorry, but I cannot provide real-time information on individuals. Terry Richardson is a well-known American fashion photographer, but I do not have access to his current age.\")\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 14:01:54 - [react.py:265:_run_one_step] - Step 3: \n", + "StepOutput(step=3, action=FunctionExpression(question=None, thought=\"Unfortunately, I couldn't retrieve the ages of Annie Morton and Terry Richardson.\", action='finish(answer=\"I don\\'t have access to the specific ages of Annie Morton and Terry Richardson.\")'), function=Function(thought=None, name='finish', args=[], kwargs={'answer': \"I don't have access to the specific ages of Annie Morton and Terry Richardson.\"}), observation=\"I don't have access to the specific ages of Annie Morton and Terry Richardson.\")\n", + "_______\n", + "\u001b[0m\n", + "\u001b[32m2024-12-19 14:01:54 - [react.py:299:call] - answer:\n", + " I don't have access to the specific ages of Annie Morton and Terry Richardson.\u001b[0m\n", + "\u001b[33m2024-12-19 14:01:54 - [3974127232.py:27:] - No. 9, question: Who is older, Annie Morton or Terry Richardson?, ground truth: terry richardson, pred answer: i dont have access to specific ages of annie morton and terry richardson\u001b[0m\n", + "\u001b[31m2024-12-19 14:01:54 - [react.py:285:call] - input_query: Are Local H and For Against both from the United States?\u001b[0m\n", + "\u001b[34m2024-12-19 14:01:56 - [react.py:265:_run_one_step] - Step 1: \n", + "StepOutput(step=1, action=FunctionExpression(question=None, thought='Checking the origin of the bands Local H and For Against.', action='llm_tool(input=\"Local H origin\")'), function=Function(thought=None, name='llm_tool', args=[], kwargs={'input': 'Local H origin'}), observation='Local H is an American rock band hailing from Zion, Illinois. The band was formed in 1987 by Scott Lucas (vocals, guitar, bass) and Joe Daniels (drums). They are known for their energetic live performances and their blend of alternative rock, grunge, and punk influences.')\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 14:01:58 - [react.py:265:_run_one_step] - Step 2: \n", + "StepOutput(step=2, action=FunctionExpression(question=None, thought='Checking the origin of the band For Against.', action=\"llm_tool(input='For Against origin')\"), function=Function(thought=None, name='llm_tool', args=[], kwargs={'input': 'For Against origin'}), observation='Sure, here are some arguments for and against the concept of origin:\\n\\nFor:\\n1. Understanding the origin of something can provide valuable insights into its nature and purpose.\\n2. Knowing the origin of a product or idea can help in determining its authenticity and quality.\\n3. Studying origins can help in tracing the development and evolution of various phenomena.\\n\\nAgainst:\\n1. Focusing too much on origin can lead to oversimplification and overlook the complexity of a subject.\\n2. The origin of something may not always be clear or easily traceable, leading to speculation and uncertainty.\\n3. 
Placing too much emphasis on origin can limit our understanding and appreciation of the present and future aspects of a subject.')\n",
+ "_______\n",
+ "\u001b[0m\n",
+ "\u001b[34m2024-12-19 14:01:59 - [react.py:265:_run_one_step] - Step 3: \n",
+ "StepOutput(step=3, action=FunctionExpression(question=None, thought=\"Combining the origins of Local H and For Against to answer the user's query.\", action='finish(answer=\"Yes, Local H is from the United States (Zion, Illinois) and For Against\\'s origin is not specified in the response.\")'), function=Function(thought=None, name='finish', args=[], kwargs={'answer': \"Yes, Local H is from the United States (Zion, Illinois) and For Against's origin is not specified in the response.\"}), observation=\"Yes, Local H is from the United States (Zion, Illinois) and For Against's origin is not specified in the response.\")\n",
+ "_______\n",
+ "\u001b[0m\n",
+ "\u001b[32m2024-12-19 14:01:59 - [react.py:299:call] - answer:\n",
+ " Yes, Local H is from the United States (Zion, Illinois) and For Against's origin is not specified in the response.\u001b[0m\n",
+ "\u001b[33m2024-12-19 14:01:59 - [3974127232.py:27:] - No. 10, question: Are Local H and For Against both from the United States?, ground truth: yes, pred answer: yes local h is from united states zion illinois and for againsts origin is not specified in response\u001b[0m\n",
+ "EM = EvaluationResult(avg_score=0.0, per_item_scores=[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], additional_info=None), FM = EvaluationResult(avg_score=0.5, per_item_scores=[1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0], additional_info=None), average time = 3.1863945960998534\n"
+ ]
+ }
+ ],
+ "source": [
+ "from adalflow.eval.answer_match_acc import AnswerMatchAcc\n",
+ "\n",
+ "# set up evaluation type\n",
+ "EM_evaluator = AnswerMatchAcc(type=\"exact_match\")\n",
+ "FM_evaluator = AnswerMatchAcc(type=\"fuzzy_match\")\n",
+ "\n",
+ "agent = ReActAgent(\n",
+ " max_steps=7, model_client=OpenAIClient(), model_kwargs=gpt_model_kwargs\n",
+ ")\n",
+ "\n",
+ "num_questions = 10\n",
+ "gt_answers = []\n",
+ "pred_answers = []\n",
+ "start_time = time.time()\n",
+ "for i in range(num_questions):\n",
+ " question = val_dataset[i][\"question\"]\n",
+ " gt_answer = normalize_answer(\n",
+ " val_dataset[i][\"answer\"]\n",
+ " ) # normalize the ground truth answer\n",
+ " gt_answers.append(gt_answer)\n",
+ "\n",
+ " # get the agent's response\n",
+ " pred_answer = agent(question)\n",
+ " pred_answer = normalize_answer(pred_answer)\n",
+ " pred_answers.append(pred_answer)\n",
+ "\n",
+ " printc(\n",
+ " f\"No. {i+1}, question: {question}, ground truth: {gt_answer}, pred answer: {pred_answer}\",\n",
+ " color=\"yellow\",\n",
+ " )\n",
+ "\n",
+ "end_time = time.time()\n",
+ "\n",
+ "em = EM_evaluator.compute(pred_answers=pred_answers, gt_answers=gt_answers)\n",
+ "fm = FM_evaluator.compute(pred_answers=pred_answers, gt_answers=gt_answers)\n",
+ "avg_time = (end_time - start_time) / num_questions\n",
+ "\n",
+ "print(f\"EM = {em}, FM = {fm}, average time = {avg_time}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Without the tools and examples, EM=0 and FM=0.4. We saw hallucinations and nonsense, for example:\n",
+ "\n",
+ "2024-06-15 23:17:04 - [3230041225.py:26:] - No. 1, question: Were Scott Derrickson and Ed Wood of the same nationality?, ground truth: ``yes``, pred answer: ``no scott derrickson and ed wood were not of same nationality scott derrickson is american while ed wood was also american``\n",
+ "\n",
+ "2024-06-15 23:18:16 - [3230041225.py:26:] - No. 9, question: Who is older, Annie Morton or Terry Richardson?, ground truth: ``terry richardson``, pred answer: ``who is older annie morton or terry richardson``\n",
+ "\n",
+ "Therefore, the ReAct agent outperforms the base LLM.\n",
+ "Meanwhile, the ``LightRAG ReAct agent`` reaches EM=0.3 on the same 10 questions."
+ ]
+ },
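+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "As a side note on how the two metrics differ, here is a minimal sketch of ``AnswerMatchAcc`` on a toy pair (the strings are illustrative, and we assume fuzzy matching credits a prediction that contains the normalized ground truth):\n",
+ "\n",
+ "```python\n",
+ "from adalflow.eval.answer_match_acc import AnswerMatchAcc\n",
+ "\n",
+ "pred_answers = [\"yg entertainment formed the group winner\"]\n",
+ "gt_answers = [\"yg entertainment\"]\n",
+ "\n",
+ "# exact match requires identical strings, so this pair scores 0.0\n",
+ "print(AnswerMatchAcc(type=\"exact_match\").compute(pred_answers=pred_answers, gt_answers=gt_answers).avg_score)\n",
+ "\n",
+ "# fuzzy match is more lenient, so this pair can score 1.0\n",
+ "print(AnswerMatchAcc(type=\"fuzzy_match\").compute(pred_answers=pred_answers, gt_answers=gt_answers).avg_score)\n",
+ "```"
+ ]
+ },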
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# 7. Future Improvement"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 84,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# TODO:\n",
+ "# 1. advanced, add history to react\n",
+ "# 2. add training, few shot\n",
+ "# 3. llm as judge\n",
+ "# 4. add picture\n",
+ "# 5. better json handling, we need to store the answer output"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "my-project-kernel",
+ "language": "python",
+ "name": "my-project-kernel"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.4"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/use_cases/agent/react_agent.ipynb b/use_cases/agent/react_agent.ipynb
index a93cb89e..0750dba1 100644
--- a/use_cases/agent/react_agent.ipynb
+++ b/use_cases/agent/react_agent.ipynb
@@ -30,16 +30,7 @@
 "cell_type": "code",
 "execution_count": 1,
 "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/Users/alleria/Documents/sylphAI/lightrag_package/LightRAG/.venv/lib/python3.11/site-packages/datasets/table.py:1421: FutureWarning: promote has been superseded by promote_options='default'.\n",
- " table = cls._concat_blocks(blocks, axis=0)\n"
- ]
- }
- ],
+ "outputs": [],
 "source": [
 "# load the dataset\n",
 "from datasets import load_dataset\n",
@@ -100,7 +91,7 @@
 },
 {
 "cell_type": "code",
- "execution_count": 23,
+ "execution_count": 5,
 "metadata": {},
 "outputs": [
 {
 "data": {
 "text/plain": [
 "True"
 ]
 },
- "execution_count": 23,
+ "execution_count": 5,
 "metadata": {},
 "output_type": "execute_result"
 }
 ],
 "source": [
 "import dotenv\n",
 "from adalflow.components.model_client import OpenAIClient\n",
- "from adalflow.components.agent.react_agent import ReActAgent\n",
- "from adalflow.core.tool_helper import FunctionTool\n",
+ "from adalflow.components.agent.react import ReActAgent\n",
+ "from adalflow.core.func_tool import FunctionTool\n",
 "\n",
 "import time\n",
 "\n",
@@ -141,7 +132,7 @@
 },
 {
 "cell_type": "code",
- "execution_count": 24,
+ "execution_count": 6,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -252,12 +243,12 @@
 },
 {
 "cell_type": "code",
- "execution_count": 25,
+ "execution_count": 8,
 "metadata": {},
 "outputs": [],
 "source": [
 "# set up tools for the agent\n",
- "tools = [FunctionTool.from_defaults(fn=search), FunctionTool.from_defaults(fn=lookup)]"
+ "tools = [FunctionTool(fn=search), FunctionTool(fn=lookup)]"
 ]
 },
@@ -272,7 +263,7 @@
 },
 {
 "cell_type": "code",
- "execution_count": 26,
+ "execution_count": 9,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -343,12 +334,41 @@
 },
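+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The next cell converts a ReAct demonstration into the structured ``FunctionExpression`` form the planner expects. As a minimal sketch (the field values here are illustrative, mirroring the step logs above), a single structured step looks like:\n",
+ "\n",
+ "```python\n",
+ "from adalflow.core.types import FunctionExpression\n",
+ "\n",
+ "step_1 = FunctionExpression(\n",
+ "    thought=\"I need to search Colorado orogeny first.\",\n",
+ "    action=\"search(entity='Colorado orogeny')\",\n",
+ ")\n",
+ "```"
+ ]
+ },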
"metadata": {}, "outputs": [], "source": [ "# preset up the examples as prompt_kwargs, the examples will be included in the system prompt\n", "\n", + "# convert this example in FunctionExpression\n", + "import adalflow as adal\n", + "from adalflow.core.types import FunctionExpression\n", + "\n", + "eg_1 = {\n", + " \"Question\": \"What is the elevation range for the area that the eastern sector of the Colorado orogeny extends into?\",\n", + " \"Thought 1\": \"I need to search Colorado orogeny, find the area that the eastern sector of the Colorado orogeny extends into, then find the elevation range of the area.\",\n", + " \"Action 1\": \"search\",\n", + " \"kwargs\": {\"entity\": \"Colorado orogeny\"},\n", + " \"Observation 1\": \"The Colorado orogeny was an episode of mountain building (an orogeny) in Colorado and surrounding areas.\",\n", + " \"Thought 2\": \"It does not mention the eastern sector. So I need to look up eastern sector.\",\n", + " \"Action 2\": \"lookup('eastern sector')\",\n", + " \"Action 2\": \"lookup\",\n", + " \"kwargs\": {\"text\": \"eastern sector\", \"keyword\": \"eastern sector\"},\n", + " \"Observation 2\": \"(Result 1 / 1) The eastern sector extends into the High Plains and is called the Central Plains orogeny.\",\n", + " \"Thought 3\": \"The eastern sector of Colorado orogeny extends into the High Plains. So I need to search High Plains and find its elevation range.\",\n", + " \"Action 3\": \"search('High Plains')\",\n", + " \"Observation 3\": \"High Plains refers to one of two distinct land regions:\",\n", + " \"Thought 4\": \"I need to instead search High Plains (United States).\",\n", + " \"Action 4\": \"search('High Plains (United States)')\",\n", + " \"Observation 4\": \"The High Plains are a subregion of the Great Plains. From east to west, the High Plains rise in elevation from around 1,800 to 7,000 ft (550 to 2,130 m).[3]\",\n", + " \"Thought 5\": \"High Plains rise in elevation from around 1,800 to 7,000 ft, so the answer is 1,800 to 7,000 ft.\",\n", + " \"Action 5\": \"finish('1,800 to 7,000 ft')\",\n", + "}\n", + "\n", + "examples_expression = []\n", + "for example in examples:\n", + " ex_exp = FunctionExpression(thought=example)\n", + "\n", "preset_prompt_kwargs = {\"examples\": examples}" ] }, @@ -363,7 +383,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -383,149 +403,19 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 12, "metadata": {}, "outputs": [ { - "data": { - "text/plain": [ - "ReActAgent(\n", - " tools=[FunctionTool(search), FunctionTool(lookup), FunctionTool(llm_tool), FunctionTool(finish)], max_steps=3, model_kwargs={'model': 'gpt-3.5-turbo', 'temperature': 0.0}, \n", - " (prompt): Prompt(\n", - " template: \n", - " {# role/task description #}\n", - " You task is to answer user's query with minimum steps and maximum accuracy using the tools provided.\n", - " {# REACT instructions #}\n", - " Each step you will read the previous Thought, Action, and Observation(execution result of the action)steps and then provide the next Thought and Action.\n", - " \n", - " You only have access to the following tools:\n", - " {# tools #}\n", - " {% for tool in tools %}\n", - " {{ loop.index }}. 
ToolName: {{ tool.metadata.name }}\n", - " Tool Description: {{ tool.metadata.description }}\n", - " Tool Parameters: {{ tool.metadata.fn_schema_str }} {#tool args can be misleading, especially if we already have type hints and docstring in the function#}\n", - " {% endfor %}\n", - " {# output is always more robust to use json than string #}\n", - " ---\n", - " Your output must be in valid JSON format(raw Python string format) with two keys:\n", - " {\n", - " \"thought\": \"\",\n", - " \"action\": \"ToolName(, )\"\n", - " }\n", - " - Must double quote the JSON str.\n", - " - Inside of the JSON str, Must use escape double quote and escape backslash for string.\n", - " For example:\n", - " \"action\": \"finish(\\\"John's.\\\")\"\n", - " ---\n", - " {# Specifications TODO: preference between the usage of llm tool vs the other tool #}\n", - " Process:\n", - " - Step 1: Read the user query and potentially divide it into subqueries. And get started with the first subquery.\n", - " - Call one available tool at a time to solve each subquery/subquestion. \\\n", - " - At step 'finish', join all subqueries answers and finish the task.\n", - " Remember:\n", - " - Action must call one of the above tools with Took Name. It can not be empty.\n", - " - Read the Tool Description and ensure your args and kwarg follow what each tool expects in types. e.g. (a=1, b=2) if it is keyword argument or (1, 2) if it is positional.\n", - " - You will always end with 'finish' action to finish the task. The answer can be the final answer or failure message.\n", - " - When the initial query is simple, use minimum steps to answer the query.\n", - " {#Examples can be here#}\n", - " {# Check if there are any examples #}\n", - " {% if examples %}\n", - " \n", - " {% for example in examples %}\n", - " {{ example }}\n", - " {% endfor %}\n", - " \n", - " {% endif %}\n", - " <>\n", - " -----------------\n", - " {# History #}\n", - " {% for history in step_history %}\n", - " Step {{history.step}}:\n", - " {\n", - " \"thought\": \"{{history.thought}}\",\n", - " \"action\": \"{{history.action}}\",\n", - " }\n", - " \"observation\": \"{{history.observation}}\"\n", - " {% endfor %}\n", - " {% if input_str %}\n", - " User query:\n", - " {{ input_str }}\n", - " {% endif %}\n", - " , preset_prompt_kwargs: {'examples': ['Question: What is the elevation range for the area that the eastern sector of the Colorado orogeny extends into?\\nThought 1: I need to search Colorado orogeny, find the area that the eastern sector of the Colorado orogeny extends into, then find the elevation range of the area.\\nAction 1: search(\"Colorado orogeny\")\\nObservation 1: The Colorado orogeny was an episode of mountain building (an orogeny) in Colorado and surrounding areas.\\nThought 2: It does not mention the eastern sector. So I need to look up eastern sector.\\nAction 2: lookup(\"eastern sector\")\\nObservation 2: (Result 1 / 1) The eastern sector extends into the High Plains and is called the Central Plains orogeny.\\nThought 3: The eastern sector of Colorado orogeny extends into the High Plains. So I need to search High Plains and find its elevation range.\\nAction 3: search(\"High Plains\")\\nObservation 3: High Plains refers to one of two distinct land regions:\\nThought 4: I need to instead search High Plains (United States).\\nAction 4: search(\"High Plains (United States)\")\\nObservation 4: The High Plains are a subregion of the Great Plains. 
From east to west, the High Plains rise in elevation from around 1,800 to 7,000 ft (550 to 2,130 m).[3]\\nThought 5: High Plains rise in elevation from around 1,800 to 7,000 ft, so the answer is 1,800 to 7,000 ft.\\nAction 5: finish(\"1,800 to 7,000 ft\")', 'Question: Musician and satirist Allie Goertz wrote a song about the \"The Simpsons\" character Milhouse, who Matt Groening named after who?\\nThought 1: The question simplifies to \"The Simpsons\" character Milhouse is named after who. I only need to search Milhouse and find who it is named after.\\nAction 1: search(\"Milhouse\")\\nObservation 1: Milhouse Mussolini Van Houten is a recurring character in the Fox animated television series The Simpsons voiced by Pamela Hayden and created by Matt Groening.\\nThought 2: The paragraph does not tell who Milhouse is named after, maybe I can look up \"named after\".\\nAction 2: lookup(\"named after\")\\nObservation 2: (Result 1 / 1) Milhouse was named after U.S. president Richard Nixon, whose middle name was Milhous. \\nThought 3: Milhouse was named after U.S. president Richard Nixon, so the answer is Richard Nixon.\\nAction 3: finish(\"Richard Nixon\")', 'Question: Which documentary is about Finnish rock groups, Adam Clayton Powell or The Saimaa Gesture?\\nThought 1: I need to search Adam Clayton Powell and The Saimaa Gesture, and find which documentary is about Finnish rock groups.\\nAction 1: search(\"Adam Clayton Powell\")\\nObservation 1: Could not find [\"Adam Clayton Powell\"]. Similar: [\\'Adam Clayton Powell III\\', \\'Seventh Avenue (Manhattan)\\', \\'Adam Clayton Powell Jr. State Office Building\\', \\'Isabel Washington Powell\\', \\'Adam Powell\\', \\'Adam Clayton Powell (film)\\', \\'Giancarlo Esposito\\'].\\nThought 2: To find the documentary, I can search Adam Clayton Powell (film).\\nAction 2: search(\"Adam Clayton Powell (film)\")\\nObservation 2: Adam Clayton Powell is a 1989 American documentary film directed by Richard Kilberg.\\nThe film is about the rise and fall of influential African-American politician Adam Clayton Powell Jr.[3][4] It was later aired as part of the PBS series The American Experience.\\nThought 3: Adam Clayton Powell (film) is a documentary about an African-American politician, not Finnish rock groups. So the documentary about Finnish rock groups must instead be The Saimaa Gesture.\\nAction 3: finish(\"The Saimaa Gesture\")', 'Question: What profession does Nicholas Ray and Elia Kazan have in common?\\nThought 1: I need to search Nicholas Ray and Elia Kazan, find their professions, then find the profession they have in common.\\nAction 1: search(\"Nicholas Ray\")\\nObservation 1: Nicholas Ray (born Raymond Nicholas Kienzle Jr., August 7, 1911 – June 16, 1979) was an American film director, screenwriter, and actor best known for the 1955 film Rebel Without a Cause.\\nThought 2: Professions of Nicholas Ray are director, screenwriter, and actor. I need to search Elia Kazan next and find his professions.\\nAction 2: search(\"Elia Kazan\")\\nObservation 2: Elia Kazan was an American film and theatre director, producer, screenwriter and actor.\\nThought 3: Professions of Elia Kazan are director, producer, screenwriter, and actor. 
So profession Nicholas Ray and Elia Kazan have in common is director, screenwriter, and actor.\\nAction 3: finish(\"director, screenwriter, actor\")', 'Question: Which magazine was started first Arthur\\'s Magazine or First for Women?\\nThought 1: I need to search Arthur\\'s Magazine and First for Women, and find which was started first.\\nAction 1: search(\"Arthur\\'s Magazine\")\\nObservation 1: Arthur\\'s Magazine (1844-\\x80\\x931846) was an American literary periodical published in Philadelphia in the 19th century. \\nThought 2: Arthur\\'s Magazine was started in 1844. I need to search First for Women next.\\nAction 2: search(\"First for Women\")\\nObservation 2: First for Women is a woman\\'s magazine published by Bauer Media Group in the USA.[1] The magazine was started in 1989. \\nThought 3: First for Women was started in 1989. 1844 (Arthur\\'s Magazine) < 1989 (First for Women), so Arthur\\'s Magazine was started first.\\nAction 3: finish(\"Arthur\\'s Magazine\")', 'Question: Were Pavel Urysohn and Leonid Levin known for the same type of work?\\nThought 1: I need to search Pavel Urysohn and Leonid Levin, find their types of work, then find if they are the same.\\nAction 1: search(\"Pavel Urysohn\")\\nObservation 1: Pavel Samuilovich Urysohn (February 3, 1898 â\\x80\\x93 August 17, 1924) was a Soviet mathematician who is best known for his contributions in dimension theory.\\nThought 2: Pavel Urysohn is a mathematician. I need to search Leonid Levin next and find its type of work.\\nAction 2: search(\"Leonid Levin\")\\nObservation 2: Leonid Anatolievich Levin is a Soviet-American mathematician and computer scientist. \\nThought 3: Leonid Levin is a mathematician and computer scientist. So Pavel Urysohn and Leonid Levin have the same type of work. 
\\nAction 3: finish(\"yes\")'], 'tools': [FunctionTool(search), FunctionTool(lookup), FunctionTool(llm_tool), FunctionTool(finish)]}, prompt_variables: ['examples', 'step_history', 'input_str', 'tools']\n", - " )\n", - " (model_client): OpenAIClient()\n", - " (output_processors): JsonParser()\n", - " (additional_llm_tool): Generator(\n", - " model_kwargs={'model': 'gpt-3.5-turbo', 'temperature': 0.0}, \n", - " (prompt): Prompt(\n", - " template: \n", - " {% if task_desc_str or output_format_str or tools_str or examples_str or chat_history_str or context_str or steps_str %}\n", - " \n", - " {% endif %}\n", - " {# task desc #}\n", - " {% if task_desc_str %}\n", - " {{task_desc_str}}\n", - " {% endif %}\n", - " {# output format #}\n", - " {% if output_format_str %}\n", - " \n", - " {{output_format_str}}\n", - " \n", - " {% endif %}\n", - " {# tools #}\n", - " {% if tools_str %}\n", - " \n", - " {{tools_str}}\n", - " \n", - " {% endif %}\n", - " {# example #}\n", - " {% if examples_str %}\n", - " \n", - " {{examples_str}}\n", - " \n", - " {% endif %}\n", - " {# chat history #}\n", - " {% if chat_history_str %}\n", - " \n", - " {{chat_history_str}}\n", - " \n", - " {% endif %}\n", - " {#contex#}\n", - " {% if context_str %}\n", - " \n", - " {{context_str}}\n", - " \n", - " {% endif %}\n", - " {# steps #}\n", - " {% if steps_str %}\n", - " \n", - " {{steps_str}}\n", - " \n", - " {% endif %}\n", - " {% if task_desc_str or output_format_str or tools_str or examples_str or chat_history_str or context_str or steps_str %}\n", - " \n", - " {% endif %}\n", - " {% if input_str %}\n", - " \n", - " {{input_str}}\n", - " \n", - " {% endif %}\n", - " {% if output_str %}\n", - " \n", - " {{output_str}}\n", - " \n", - " {% endif %}\n", - " You:\n", - " , prompt_variables: ['context_str', 'input_str', 'steps_str', 'output_format_str', 'output_str', 'chat_history_str', 'tools_str', 'examples_str', 'task_desc_str']\n", - " )\n", - " (model_client): OpenAIClient()\n", - " )\n", - ")" - ] - }, - "execution_count": 29, - "metadata": {}, - "output_type": "execute_result" + "ename": "TypeError", + "evalue": "ReActAgent.__init__() got an unexpected keyword argument 'preset_prompt_kwargs'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[12], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m# max_steps refers to how many thought-action round we allow the model to perform\u001b[39;00m\n\u001b[1;32m 2\u001b[0m \u001b[38;5;66;03m# to save resources, let's use 3 here\u001b[39;00m\n\u001b[0;32m----> 3\u001b[0m agent \u001b[38;5;241m=\u001b[39m \u001b[43mReActAgent\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 4\u001b[0m \u001b[43m \u001b[49m\u001b[43mtools\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtools\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 5\u001b[0m \u001b[43m \u001b[49m\u001b[43mmax_steps\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m3\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 6\u001b[0m \u001b[43m \u001b[49m\u001b[43mmodel_client\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mOpenAIClient\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 7\u001b[0m \u001b[43m \u001b[49m\u001b[43mmodel_kwargs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mgpt_model_kwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 8\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mpreset_prompt_kwargs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpreset_prompt_kwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 9\u001b[0m \u001b[43m)\u001b[49m\n\u001b[1;32m 10\u001b[0m agent\n", + "\u001b[0;31mTypeError\u001b[0m: ReActAgent.__init__() got an unexpected keyword argument 'preset_prompt_kwargs'" + ] } ], "source": [ @@ -1365,9 +1255,9 @@ ], "metadata": { "kernelspec": { - "display_name": "lightrag-project", + "display_name": "my-project-kernel", "language": "python", - "name": "light-rag-project" + "name": "my-project-kernel" }, "language_info": { "codemirror_mode": { @@ -1379,7 +1269,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.7" + "version": "3.12.4" } }, "nbformat": 4, diff --git a/use_cases/config.py b/use_cases/config.py index 895ed097..6eb5b3ed 100644 --- a/use_cases/config.py +++ b/use_cases/config.py @@ -41,7 +41,7 @@ gpt_4o_model = { "model_client": OpenAIClient(), "model_kwargs": { - "model": "gpt-4o-mini", + "model": "gpt-4o", "temperature": 1, "top_p": 0.99, "max_tokens": 1000, diff --git a/use_cases/question_answering/bbh/object_count/task.py b/use_cases/question_answering/bbh/object_count/task.py index 6f5571f8..ce5b0dfa 100644 --- a/use_cases/question_answering/bbh/object_count/task.py +++ b/use_cases/question_answering/bbh/object_count/task.py @@ -60,7 +60,7 @@ def call( self, question: str, id: str = None ) -> Union[adal.GeneratorOutput, adal.Parameter]: output = self.llm_counter(prompt_kwargs={"input_str": question}, id=id) - print(f"output: {output}, training: {self.training}") + # print(f"output: {output}, training: {self.training}") if self.training: if output.full_response.error and "429" in output.full_response.error: raise ValueError("Rate limit exceeded") diff --git a/use_cases/question_answering/bbh/object_count/train_new.py b/use_cases/question_answering/bbh/object_count/train_new.py index 5cf29ae6..72dc3ddc 100644 --- a/use_cases/question_answering/bbh/object_count/train_new.py +++ b/use_cases/question_answering/bbh/object_count/train_new.py @@ -159,7 +159,7 @@ def train( ckpt = train( debug=False, - max_steps=2, + max_steps=12, strategy=set_strategy, exclude_input_fields_from_bootstrap_demos=True, ) diff --git a/use_cases/text_grad_2.0_train.py b/use_cases/text_grad_2.0_train.py index 1a029d53..6071e36c 100644 --- a/use_cases/text_grad_2.0_train.py +++ b/use_cases/text_grad_2.0_train.py @@ -2,7 +2,7 @@ import tempfile import json -num_runs = 2 +num_runs = 4 # List of experiments to run object_count = "use_cases/question_answering/bbh/object_count/train_new.py" hotpot_qa_multi_hop_rag = "benchmarks/hotpot_qa/adal_exp/train_multi_hop_rag.py" @@ -15,7 +15,7 @@ # Optional: Arguments for each experiment (if needed) experiment_args = { - object_count: "--strategy random", + object_count: "--strategy constrained", # hotpot_qa_multi_hop_rag: "", } ckpt_values = {} @@ -48,6 +48,13 @@ def run_experiment(script, args): if __name__ == "__main__": + + result_file = "text_grad_2_results" + # add important run information in the naming of the file + import uuid + + result_file = f"{result_file}_{num_runs}_runs_{uuid.uuid4()}.json" + for experiment in experiments: args = experiment_args.get(experiment, "") for i in range(num_runs): @@ -70,18 +77,18 @@ def run_experiment(script, args): data = json.load(f) print(f"Experiment: {experiment_index}") print(f"Data: {data}") - _high_test_score = max(data["test_scores"]) - print(f" test score: {data["test_scores"]}") + 
_high_test_score = max(data["val_scores"])
+            print(f" val score: {data['val_scores']}")
             past_highest_scores.append(_high_test_score)
             if _high_test_score > highest_test_score:
                 highest_test_score = _high_test_score
                 highest_test_score_json_file = ckpt
             # read the effective measures
-            effective_measures = data.get("effective_measures", {})
+            effective_measures = data.get("effective_measure", {})
             if not effective_measures:
-                total_prompts = len(data["test_scores"]) - 1
+                total_prompts = len(data["val_scores"]) - 1
                 # count the total number of different test scores
-                pass_num = len(set(data["test_scores"])) - 1
+                pass_num = len(set(data["val_scores"])) - 1
                 average_pass_rate = pass_num / total_prompts
                 average_pass_rate_list.append(average_pass_rate)
                 average_pass_prompts_list.append(pass_num)
@@ -115,11 +122,15 @@ def run_experiment(script, args):
         average_pass_prompts_list
     )
     # calculate the average total prompts
-    average_total_prompts = sum(average_total_prompts) / len(average_total_prompts)
+    average_total_prompts = sum(average_total_prompts) / num_runs
 
     # add these numbers in the ckpt_values
     index = f"{experiment}_summary"
     ckpt_values[index] = {
+        "config": {
+            "num_runs": num_runs,
+            "args": args,
+        },
         "highest_test_score": highest_test_score,
         "mean_test_score": mean_test_score,
         "standard_deviation": standard_deviation,
@@ -127,6 +138,7 @@ def run_experiment(script, args):
         "average_pass_rate": average_pass_rate,
         "average_pass_prompts": average_pass_prompts,
         "average_total_prompts": average_total_prompts,
+        "past_highest_scores": past_highest_scores,
     }
 
     print("\nAll Checkpoints:")
@@ -134,5 +146,7 @@ def run_experiment(script, args):
     print(f"{experiment}: {ckpt}")
 
     # Save the results to a file
-    with open("results.json", "w") as f:
+    with open(result_file, "w") as f:
         json.dump(ckpt_values, f)
+
+    print(f"\nResults saved to {result_file}")