fixed some issues with evolve instruct

camel-ai · Jul 17, 2024 · 5b0306d · 5b0306d
1 parent 245377f
commit 5b0306d
Show file tree

Hide file tree

Showing 6 changed files with 184 additions and 60 deletions.
diff --git a/implementations/example_evolve.py b/implementations/example_evolve.py
@@ -26,7 +26,11 @@
 
 def main():
     parser = argparse.ArgumentParser(description='Options')
-    parser.add_argument("--seed_file", type=str, default="")
+    parser.add_argument(
+        "--seed_file",
+        type=str,
+        default="implementations/data/seed_files/alpaca_data.json",
+    )
     parser.add_argument("--column_names", nargs='+', default="instruction")
     parser.add_argument("--num_rows", type=int, default=5)
     parser.add_argument("--min_len_chars", type=int, default=32)
@@ -40,53 +44,35 @@ def main():
         SeedInstruction(
             id="seed_task_0",
             short_name="breakfast_suggestion",
-            instruction="Is there anything I can eat for a breakfast that "
-            "doesn't include eggs, yet includes protein, and has"
-            " roughly 700-1000 calories?",
+            instruction="Give three tips for staying healthy.",
             instances=[
                 Instance(
                     "",
-                    "Yes, you can have 1 oatmeal banana protein shake and"
-                    " 4 strips of bacon. The oatmeal banana protein shake may"
-                    " contain 1/2 cup oatmeal, 60 grams whey protein powder,"
-                    " 1/2 medium banana, 1tbsp flaxseed oil and 1/2 cup"
-                    "water, totalling about 550 calories. "
-                    "The 4 strips of bacon "
-                    "contains about 200 calories.",
+                    "" "",
                 )
             ],
             is_classification=False,
         ),
         SeedInstruction(
             id="seed_task_1",
             short_name="antonym_relation",
-            instruction="What is the relation between the given pairs?",
+            instruction="Describe the structure of an atom.",
             instances=[
                 Instance(
-                    "Night : Day :: Right : Left",
-                    "The relation between the given pairs is "
-                    "that they are opposites.",
+                    "",
+                    "" "",
                 )
             ],
             is_classification=False,
         ),
         SeedInstruction(
             id="seed_task_2",
             short_name="one_sentence_description",
-            instruction="Generate a one-sentence description for each"
-            " of the following people.",
+            instruction="Identify the odd one out.",
             instances=[
                 Instance(
-                    "- Brack Obama\n- Elon Musk\n- Taylor Swift",
-                    "- Barack Hussein Obama II is an American politician"
-                    " who served as the 44th president of the United States"
-                    " from 2009 to 2017.\n- Elon Musk is the founder, CEO,"
-                    " and chief engineer of SpaceX; angel investor, CEO and"
-                    " product architect of Tesla, Inc.; founder of The Boring"
-                    " Company; co-founder of Neuralink and OpenAI; president"
-                    " of the Musk Foundation; and owner and CEO of Twitter,"
-                    " Inc.\n- Taylor Alison Swift is an American"
-                    " singer-songwriter.",
+                    "Twitter, Instagram, Telegram",
+                    "Telegram",
                 )
             ],
             is_classification=False,
@@ -96,15 +82,15 @@ def main():
     spec.agent_system = SingleAgent()
     llm_pipeline = ChatGPTPipeline(args.openai_model)
 
-    generator = EvolveInstructGenerator(
-        llm_pipeline=llm_pipeline,
-        seed_data=args.seed_file,
-        column_names=args.column_names,
-        num_rows=args.num_rows,
-        min_len_chars=args.min_len_chars,
-        max_len_chars=args.max_len_chars,
-        verbose=True,
-    )
+    spec.llm_pipeline = llm_pipeline
+    spec.seed_data = args.seed_file
+    spec.column_names = args.column_names
+    spec.num_rows = args.num_rows
+    spec.min_len_chars = args.min_len_chars
+    spec.max_len_chars = args.max_len_chars
+    spec.verbose = True
+
+    generator = EvolveInstructGenerator(spec)
     generator.run()
 
 

diff --git a/implementations/synthetic_datagen/base_generator.py b/implementations/synthetic_datagen/base_generator.py
@@ -23,10 +23,10 @@ class BaseDataGenerator(ABC):
     def generate(self):
         pass
 
-    @abstractmethod
-    def curate(self):
-        pass
+    # @abstractmethod
+    # def curate(self):
+    #     pass
 
-    @abstractmethod
-    def evaluate(self):
-        pass
+    # @abstractmethod
+    # def evaluate(self):
+    #     pass
diff --git a/implementations/synthetic_datagen/evolve_instruct/evolve_instruct_generator.py b/implementations/synthetic_datagen/evolve_instruct/evolve_instruct_generator.py
@@ -11,16 +11,16 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # =========== Copyright 2023 @ CAMEL-AI.org. All Rights Reserved. ===========
+import json
 import logging
+import os
 import time
+import uuid
 from enum import Enum
 from typing import Optional
-import json
-import uuid
-import os
+
 import numpy as np
 import pandas as pd
-
 from datasets import Dataset, DatasetDict, load_dataset
 from synthetic_datagen.base_generator import BaseDataGenerator
 from synthetic_datagen.evolve_instruct.evolve_instruct_spec import (
@@ -75,9 +75,14 @@ def __init__(self, spec: Optional[EvolveInstructSpec] = None):
             for the generator. If not provided,
             a default EvolveInstructSpec is used.
         """
+
         self.spec = spec or EvolveInstructSpec()
         self.instruction_generator = InstructionGenerator(self.spec)
         self.instance_generator = InstanceGenerator(self.spec)
+        self.seed_text_list = []
+        self.prompts = []
+        self.final_prompts = []
+        self.final_answers = []
         self.prompt_templates = dict()
         self.prompt_templates['base'] = ""
         write_in_korean = "Write in Korean."
@@ -172,10 +177,11 @@ def generate(self):
         followed by
         the generation of synthetic instances based on those instructions.
         """
+        self.create_seed_prompts()
         logging.info("Generating synthetic instructions...")
-        self.instruction_generator.generate()
+        self.create_prompts()
         logging.info("Generating synthetic instances...")
-        self.instance_generator.generate()
+        self.create_answers()
 
     def evaluate(self):
         raise RuntimeError(
@@ -197,12 +203,10 @@ def run(self):
                         'output': self.final_answers[i],
                     }
                 )
-
 
         with open(
-            f"{
-                self.seed_data.replace('.jsonl', '').replace('json', '')
-                }.%s.json"
+            f"{self.seed_data.replace('.jsonl', '')\
+               .replace('json', '')}.%s.json"
             % str(uuid.uuid4())[:4],
             "wt",
         ) as f:
@@ -218,27 +222,29 @@ def create_seed_prompts(self):
 
         :return: None
         """
-
 
-        if isinstance(self.seed_data, str) and os.path.exists(self.seed_data):
-            data = load_dataset("json", data_files=self.seed_data)
+        if isinstance(self.spec.seed_data, str) and os.path.exists(
+            self.spec.seed_data
+        ):
+            data = load_dataset("json", data_files=self.spec.seed_data)
             self.seed_text_list = []
             for d in data['train']:
                 s = ""
-                if isinstance(self.column_names, str):
-                    s = d[self.column_names]
+                if isinstance(self.spec.column_names, str):
+                    s = d[self.spec.column_names]
                 else:
-                    for col in self.column_names:
+                    for col in self.spec.column_names:
                         s += d[col] + "\n"
                 self.seed_text_list.append(s.strip())
             assert self.seed_text_list, "data import failed, got empty list"
 
     def create_prompts(self):
-        print("Creating %d prompts." % self.num_rows)
+        print("Creating %d prompts." % self.spec.num_rows)
         assert self.seed_text_list, "must have seed text list"
         t0 = time.time()
         self.prompts.clear()
-        for _ in range(self.num_rows):
+
+        for _ in range(self.spec.num_rows):
             new_prompt = np.random.choice(self.seed_text_list)
             self.prompts.append(new_prompt)
         i = 0
@@ -268,3 +274,111 @@ def convert_list_to_dataset(self, text_list):
         ds = DatasetDict()
         ds['train'] = Dataset.from_pandas(df)
         return ds
+
+    def mutate(self, iteration):  # Run one mutation round over self.prompts; returns True while more prompts are still needed. NOTE(review): `iteration` is not used in this body.
+        assert len(self.prompts) == self.spec.num_rows
+        list_prompts = []
+        mutations = []
+        for i in range(self.spec.num_rows):
+            mutation = np.random.choice(Mutation)  # pick a random mutation type for this prompt
+            mutations.append(mutation)  # remembered so the verbose log can name the mutation used
+            # if mutation == Mutation.FRESH_START:
+            #     mutation = Mutation.COMPLICATE
+            before = self.prompts[i]
+            prompt = self.prompt_templates[mutation].replace(  # fill the chosen template's <PROMPT> placeholder
+                "<PROMPT>", before
+            )
+            list_prompts.append(prompt)
+
+        ds = self.convert_list_to_dataset(list_prompts)
+        assert (  # sanity check: one dataset row per prompt slot
+            ds['train'].num_rows
+            == len(list_prompts)
+            == self.spec.num_rows
+            == len(self.prompts)
+        )
+        t0 = time.time()
+        after = self.spec.llm_pipeline(ds['train'])  # expected to return one output per input row (asserted next)
+        assert len(after) == self.spec.num_rows
+        t1 = time.time()
+        print("HFPipeline took %.4f seconds" % (t1 - t0))  # label is fixed text; the timed call is spec.llm_pipeline
+
+        for i in range(len(after)):
+            after[i] = after[i].split("Prompt#:")[-1].strip()  # keep only the text after the last "Prompt#:" marker
+            for pp in ['New Prompt:\n', 'New Prompt: ']:  # strip a leading "New Prompt:" label if present
+                if after[i][: len(pp)] == pp:
+                    after[i] = after[i][len(pp) :]
+            after[i] = after[i].strip()
+            use_new_prompt, why = self.change_approved(  # heuristic accept/reject plus a reason string
+                self.prompts[i], after[i]
+            )
+            if self.spec.verbose:
+                print("===========================")
+                print("Old Prompt: %s" % self.prompts[i])
+                print("Mutation: %s" % mutations[i].name)
+                print("New Prompt: %s" % after[i])
+                print("===========================")
+
+            if use_new_prompt:
+                if (  # approved AND length within [min_len_chars, max_len_chars]: finalize it
+                    self.spec.max_len_chars
+                    >= len(after[i])
+                    >= self.spec.min_len_chars
+                ):
+                    self.final_prompts.append(after[i])
+                    print(
+                        "Prompt was accepted, now have %d good prompts."
+                        % len(self.final_prompts)
+                    )
+                    self.prompts[i] = np.random.choice(self.seed_text_list)  # refill this slot with a fresh random seed
+                    print("Creating new prompt.")
+                else:  # approved but outside the length window: keep the mutated text in the slot for a later round
+                    self.prompts[i] = after[i]
+                    print("Prompt was successfully modified.")
+            else:  # rejected: self.prompts[i] is left unchanged
+                print("Mutation rejected, will try again. Reason: %s" % why)
+            print("", flush=True)
+        return len(self.final_prompts) < self.spec.num_rows  # True means fewer than num_rows prompts accepted so far
+
+    def change_approved(self, before, after):  # Heuristic filter for a mutated prompt; returns (approved: bool, reason: str).
+        if before == after:  # the mutation changed nothing
+            return False, "same"
+        if after.count('\n') > after.count(" ") * 2:  # more newlines than twice the number of spaces
+            return False, "too many lines"
+        if after.count('\n') == after.count("- ") > 10:  # chained comparison: the two counts are equal AND exceed 10
+            return False, "too many items"
+        if (  # a non-empty base template appears verbatim in the output
+            self.prompt_templates['base']
+            and self.prompt_templates['base'] in after
+        ):
+            return False, "prompt leaked 1"
+        if "#New Prompt#" in after:  # case-sensitive marker check
+            return False, "prompt leaked 2"
+        if "new prompt" in after.lower():  # case-insensitive check
+            return False, "prompt leaked 3"
+        if "openai" in after.lower():
+            return False, "AI"
+        if "gpt" in after.lower() and "gpt" not in before.lower():  # "gpt" was introduced by the mutation
+            return False, "AI"
+        if (  # Korean apology phrase appears in the output, "죄송" is absent from the input, and the output is shorter
+            "죄송하지만" in after.lower()
+            and "죄송" not in before.lower()
+            and len(after) < len(before)
+        ):
+            return False, "sorry"
+        if False:  # disabled branch: this LLM-based equality check never executes
+            # too slow in general, not needed
+            prompt = """Are the two following prompts equal to each other?
+To be equal, they must meet two requirements:
+1. Both prompts have the same constraints and requirements.
+2. Both prompts have the same depth and breath of the inquiry.
+First prompt: %s
+Second prompt: %s
+Answer with 'Equal' or 'Not Equal'. No need to explain the reason.""" % (
+                before,
+                after,
+            )
+            answer = self.llm_pipeline(prompt)  # NOTE(review): elsewhere in this class the pipeline is read as self.spec.llm_pipeline; confirm before enabling
+            if 'not equal' not in answer.lower():
+                return False, "equal"
+        return True, "ok"  # no rejection rule matched
diff --git a/implementations/synthetic_datagen/evolve_instruct/evolve_instruct_spec.py b/implementations/synthetic_datagen/evolve_instruct/evolve_instruct_spec.py
@@ -18,6 +18,7 @@
 from rouge_score import rouge_scorer
 from synthetic_datagen.agent_systems.base_agent_system import BaseAgentSystem
 from synthetic_datagen.agent_systems.single_agent import SingleAgent
+from transformers import pipeline
 
 from implementations.synthetic_datagen.utils.seed_instruction import (
     SeedInstruction,
@@ -72,7 +73,14 @@ class EvolveInstructSpec:
         )
     )
     agent_system: BaseAgentSystem = field(default_factory=SingleAgent)
+    llm_pipeline: pipeline = None
+    seed_data: List[str] = None
     seed_instructions: List[SeedInstruction] = field(default=list)
+    column_names: List[str] = field(default_factory=list)  # ["instruction"]
+    num_rows: int = 10
+    min_len_chars: int = 512
+    max_len_chars: int = 1024
+    verbose: bool = False
     include_seed_tasks: bool = False
     synthetic_data_dir: str = Path("data/gpt4_generations/")
     num_prompt_instructions: int = 3

diff --git a/implementations/synthetic_datagen/evolve_instruct/prompts.py b/implementations/synthetic_datagen/evolve_instruct/prompts.py
@@ -0,0 +1,13 @@
+# =========== Copyright 2023 @ CAMEL-AI.org. All Rights Reserved. ===========
+# Licensed under the Apache License, Version 2.0 (the “License”);
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an “AS IS” BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =========== Copyright 2023 @ CAMEL-AI.org. All Rights Reserved. ===========
diff --git a/implementations/synthetic_datagen/pipeline.py b/implementations/synthetic_datagen/pipeline.py
@@ -75,6 +75,9 @@ def __call__(self, dataset):
             response = None
             count = 0
             while not response and count < 3:
+                import pdb
+
+                pdb.set_trace()
                 try:
                     response = openai.ChatCompletion.create(
                         # model="gpt-3.5-turbo-0613",