add scripts to summarize multiple runs

SylphAI-Inc · Dec 18, 2024 · 66390d6 · 66390d6
1 parent 2aac4ff
commit 66390d6
Show file tree

Hide file tree

Showing 5 changed files with 148 additions and 73 deletions.
diff --git a/adalflow/adalflow/optim/parameter.py b/adalflow/adalflow/optim/parameter.py
@@ -52,6 +52,9 @@ class ComponentNode:
 
     id: str = field(metadata={"desc": "The unique id of the component"})
     name: str = field(metadata={"desc": "The name of the component"})
+    type: Literal["INPUT", "COMPONENT"] = field(
+        metadata={"desc": "The type of the node"}, default="COMPONENT"
+    )
 
 
 @dataclass
@@ -1052,8 +1055,10 @@ def draw_component_subgraph(
         for node in component_nodes:
             node_label = f"""
             <table border="0" cellborder="1" cellspacing="0">
-                <tr><td><b>ID:</b></td><td>{node.id}</td></tr>
-                <tr><td><b>Name:</b></td><td>{node.name}</td></tr>"""
+                <tr><td><b>Name:</b></td><td>{node.name}</td></tr>
+                <tr><td><b>TYPE:</b></td><td>{node.type}</td></tr>
+                """
+            #                 <tr><td><b>ID:</b></td><td>{node.id}</td></tr>
 
             # add the list of orders
             if node.id in component_nodes_orders:
@@ -1068,7 +1073,7 @@ def draw_component_subgraph(
 
         # Add edges with order labels
         for source_id, target_id, edge_order in edges:
-            dot.edge(source_id, target_id, label=str(edge_order), color="black")
+            dot.edge(source_id, target_id)  # , label=str(edge_order), color="black")
 
         # Step 3: Save and render
         dot.render(filepath, cleanup=True)
@@ -1113,58 +1118,6 @@ def traverse(node: "Parameter"):
         traverse(self)
         return output_nodes, edges
 
-    # def _collect_output_subgraph(
-    #     self,
-    # ) -> Tuple[Set[Tuple[str, str]], List[Tuple[str, str]]]:
-    #     """
-    #     Collect OUTPUT nodes and their relationships using component_trace information.
-
-    #     Returns:
-    #         nodes (Set[Tuple[str, str]]): Set of component nodes (component_id, label).
-    #         edges (List[Tuple[str, str]]): Edges between component IDs.
-    #     """
-    #     component_nodes = set()  # To store component nodes as (component_id, label)
-    #     edges = []  # To store edges between components
-
-    #     visited = set()  # Track visited parameters to avoid cycles
-
-    #     def traverse(node: "Parameter"):
-    #         if node in visited:
-    #             return
-    #         visited.add(node)
-
-    #         # Only consider OUTPUT-type parameters
-    #         if (
-    #             node.param_type == ParameterType.OUTPUT
-    #             or "OUTPUT" in node.param_type.name
-    #         ):
-    #             component_id = node.component_trace.id
-    #             component_name = node.component_trace.name or "Unknown Component"
-    #             label = f"{component_name}\nID: {component_id}"
-
-    #             # Add the component as a node
-    #             component_nodes.add((component_id, label))
-
-    #             # Traverse predecessors and add edges
-    #             for pred in node.predecessors:
-    #                 if pred.param_type == ParameterType.OUTPUT:
-    #                     pred_id = pred.component_trace.id
-    #                     pred_name = pred.component_trace.name or "Unknown Component"
-
-    #                     # Add predecessor as a node
-    #                     pred_label = f"{pred_name}\nID: {pred_id}"
-    #                     component_nodes.add((pred_id, pred_label))
-
-    #                     # Add edge between components
-    #                     edges.append((pred_id, component_id))
-
-    #                 # Recursive traversal
-    #                 traverse(pred)
-
-    #     # Start traversal from the current parameter
-    #     traverse(self)
-    #     return component_nodes, edges
-
     def _collect_component_subgraph(
         self,
     ) -> Tuple[Set[ComponentNode], List[Tuple[str, str]]]:
@@ -1184,7 +1137,7 @@ def _collect_component_subgraph(
         visited = set()  # Track visited parameters to avoid cycles
         edge_counter = [0]  # Mutable counter for edge order tracking
 
-        def traverse(node: "Parameter", depth: int):
+        def traverse(node: "Parameter"):
             if node in visited:
                 return
             visited.add(node)
@@ -1211,14 +1164,25 @@ def traverse(node: "Parameter", depth: int):
                         pred.param_type == ParameterType.OUTPUT
                         or "OUTPUT" in pred.param_type.name
                     ):
-                        edges.append((pred_id, component_id, depth))
+                        edges.append((pred_id, component_id, edge_counter[0]))
                         component_nodes.add(ComponentNode(id=pred_id, name=pred_name))
                         edge_counter[0] += 1
 
-                    traverse(pred, depth + 1)
+                    if pred.param_type == ParameterType.INPUT:
+                        pred_id = pred.id
+                        pred_name = pred.name
+                        pred_node = ComponentNode(
+                            id=pred_id, name=pred_name, type="INPUT"
+                        )
+                        component_nodes.add(pred_node)
+                        # add an edge from input to the first output
+                        edges.append((pred_id, component_id, edge_counter[0]))
+                        edge_counter[0] += 1
+
+                    traverse(pred)
 
         # Start traversal from the current parameter
-        traverse(self, depth=0)
+        traverse(self)
         # Reverse the edge order
         # total_edges = len(edges)
         # edges = [

diff --git a/adalflow/adalflow/optim/trainer/trainer.py b/adalflow/adalflow/optim/trainer/trainer.py
@@ -91,6 +91,9 @@ class Trainer(Component):
     batch_val_score_threshold: Optional[float] = (
         1.0  # when acc_score >= this threshold, skip this batch
     )
+    correct_val_score_threshold: Optional[float] = (
+        0.5  # acc_score > this threshold: correct sample; otherwise error sample
+    )
     max_error_samples: Optional[int] = 2
     max_correct_samples: Optional[int] = 2
     debug: bool = False
@@ -106,6 +109,7 @@ def __init__(
         num_workers: int = 4,
         ckpt_path: str = None,
         batch_val_score_threshold: Optional[float] = 1.0,
+        correct_val_score_threshold: Optional[float] = 0.5,
         max_error_samples: Optional[int] = 2,
         max_correct_samples: Optional[int] = 2,
         max_proposals_per_step: int = 5,
@@ -140,6 +144,7 @@ def __init__(
         self.val_dataset = val_dataset
         self.test_dataset = test_dataset
         self.batch_val_score_threshold = batch_val_score_threshold
+        self.correct_val_score_threshold = correct_val_score_threshold
         self.max_error_samples = max_error_samples
         self.max_correct_samples = max_correct_samples
         self.max_proposals_per_step = max_proposals_per_step
@@ -1680,10 +1685,18 @@ def _moving_batch_sample(
         # ensure only 0 and 1 in the acc_score_list
         import numpy as np
 
-        if not all([score in [0, 1] for score in acc_score_list]):
+        if not all(0 <= score <= 1 for score in acc_score_list):
             raise ValueError("acc_score_list should only contain 0 and 1")
-        correct_indices = [i for i, score in enumerate(acc_score_list) if score == 1]
-        error_indices = [i for i, score in enumerate(acc_score_list) if score == 0]
+        correct_indices = [
+            i
+            for i, score in enumerate(acc_score_list)
+            if score > self.correct_val_score_threshold
+        ]
+        error_indices = [
+            i
+            for i, score in enumerate(acc_score_list)
+            if score <= self.correct_val_score_threshold
+        ]
         print(f"Moving batch correct size: {len(correct_indices)}")
         print(f"Moving batch error size: {len(error_indices)}")
         if len(error_indices) == 0:
@@ -1984,6 +1997,8 @@ def _fit_text_grad_constraint(
                             **step_result,
                         )
 
+                        # reset the moving batch
+
                         all_samples, all_losses, all_y_preds = [], [], []
 
                     else:

diff --git a/use_cases/classification/train_string_output.py b/use_cases/classification/train_string_output.py
@@ -7,7 +7,7 @@
 from use_cases.classification.data import load_datasets, TRECExtendedData
 
 from adalflow.eval.answer_match_acc import AnswerMatchAcc
-from LightRAG.use_cases.config import (
+from use_cases.config import (
     gpt_3_model,
     gpt_4o_model,
 )

diff --git a/use_cases/question_answering/bbh/object_count/train_new.py b/use_cases/question_answering/bbh/object_count/train_new.py
@@ -140,20 +140,36 @@ def train(
 
 
 if __name__ == "__main__":
-    import sys
     import json
 
+    # make the strategy configurable in the script
+    import argparse
+
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument("--strategy", type=str, default="random")
+    parser.add_argument(
+        "output_path", nargs="?", help="File path to save the checkpoint"
+    )
+
+    args = parser.parse_args()
+
+    set_strategy = args.strategy
+    set_output_path = args.output_path
+
     ckpt = train(
         debug=False,
-        max_steps=12,
-        strategy="constrained",
+        max_steps=2,
+        strategy=set_strategy,
         exclude_input_fields_from_bootstrap_demos=True,
     )
     print(f"ckpt: {ckpt}")
-    # Save ckpt to a file passed as an argument
-    if len(sys.argv) > 1:  # Check if a file path is provided
-        with open(sys.argv[1], "w") as f:
+    if set_output_path:
+        with open(set_output_path, "w") as f:
             json.dump({"ckpt": ckpt}, f)
+        print(f"Checkpoint saved to {set_output_path}")
+    else:
+        print("No file path provided for saving the checkpoint.")
 
     # train_diagnose(**gpt_3_model)
     # train_diagnose_teacher(**gpt_4o_model) # 4omini works well as an optimizer too

diff --git a/use_cases/text_grad_2.0_train.py b/use_cases/text_grad_2.0_train.py
@@ -2,6 +2,7 @@
 import tempfile
 import json
 
+num_runs = 2
 # List of experiments to run
 object_count = "use_cases/question_answering/bbh/object_count/train_new.py"
 hotpot_qa_multi_hop_rag = "benchmarks/hotpot_qa/adal_exp/train_multi_hop_rag.py"
@@ -14,7 +15,7 @@
 
 # Optional: Arguments for each experiment (if needed)
 experiment_args = {
-    object_count: "",
+    object_count: "--strategy random",
     # hotpot_qa_multi_hop_rag: "",
 }
 ckpt_values = {}
@@ -49,10 +50,89 @@ def run_experiment(script, args):
 if __name__ == "__main__":
     for experiment in experiments:
         args = experiment_args.get(experiment, "")
-        ckpt = run_experiment(experiment, args)
-        if ckpt:
-            ckpt_values[experiment] = ckpt
+        for i in range(num_runs):
+            print(f"\nRun {i + 1}/{num_runs}")
+            ckpt = run_experiment(experiment, args)
+            ckpt_index = f"{experiment}_{i + 1}"
+            if ckpt:
+                ckpt_values[ckpt_index] = ckpt
+        # load all json files using the ckpt paths
+        highest_test_score, mean_test_score, standard_deviation = 0, 0, 0
+        past_highest_scores = []
+        # average pass rate, average pass prompts
+        average_pass_rate_list = []
+        average_pass_prompts_list = []
+        average_total_prompts = []
+        total_prompts = 0
+        highest_test_score_json_file = None
+        for experiment_index, ckpt in ckpt_values.items():
+            with open(ckpt, "r") as f:
+                data = json.load(f)
+                print(f"Experiment: {experiment_index}")
+                print(f"Data: {data}")
+                _high_test_score = max(data["test_scores"])
+                print(f" test score: {data["test_scores"]}")
+                past_highest_scores.append(_high_test_score)
+                if _high_test_score > highest_test_score:
+                    highest_test_score = _high_test_score
+                    highest_test_score_json_file = ckpt
+                # read the effective measures
+                effective_measures = data.get("effective_measures", {})
+                if not effective_measures:
+                    total_prompts = len(data["test_scores"]) - 1
+                    # count the total number of different test scores
+                    pass_num = len(set(data["test_scores"])) - 1
+                    average_pass_rate = pass_num / total_prompts
+                    average_pass_rate_list.append(average_pass_rate)
+                    average_pass_prompts_list.append(pass_num)
+                    average_total_prompts.append(total_prompts)
+                else:
+                    total_prompts = (
+                        effective_measures["subset"]["pass"]
+                        + effective_measures["subset"]["fail"]
+                    )
+
+                    pass_num = effective_measures["valset"]["pass"]
+                    total_val_prompts = (
+                        effective_measures["valset"]["pass"]
+                        + effective_measures["valset"]["fail"]
+                    )
+                    average_pass_rate = pass_num / total_val_prompts
+                    average_pass_rate_list.append(average_pass_rate)
+                    average_pass_prompts_list.append(pass_num)
+                    average_total_prompts.append(total_prompts)
+        # calculate the mean test score
+        mean_test_score = sum(past_highest_scores) / len(past_highest_scores)
+        # calculate the standard deviation
+        standard_deviation = sum(
+            [(x - mean_test_score) ** 2 for x in past_highest_scores]
+        ) / len(past_highest_scores)
+        standard_deviation = standard_deviation**0.5
+        # calculate the average pass rate
+        average_pass_rate = sum(average_pass_rate_list) / len(average_pass_rate_list)
+        # calculate the average pass prompts
+        average_pass_prompts = sum(average_pass_prompts_list) / len(
+            average_pass_prompts_list
+        )
+        # calculate the average total prompts
+        average_total_prompts = sum(average_total_prompts) / len(average_total_prompts)
+
+        # add these numbers in the ckpt_values
+        index = f"{experiment}_summary"
+        ckpt_values[index] = {
+            "highest_test_score": highest_test_score,
+            "mean_test_score": mean_test_score,
+            "standard_deviation": standard_deviation,
+            "highest_test_score_json_file": highest_test_score_json_file,
+            "average_pass_rate": average_pass_rate,
+            "average_pass_prompts": average_pass_prompts,
+            "average_total_prompts": average_total_prompts,
+        }
 
     print("\nAll Checkpoints:")
     for experiment, ckpt in ckpt_values.items():
         print(f"{experiment}: {ckpt}")
+
+    # Save the results to a file
+    with open("results.json", "w") as f:
+        json.dump(ckpt_values, f)