Skip to content

Commit

Permalink
fixed some issues with evolve instruct
Browse files Browse the repository at this point in the history
  • Loading branch information
Su committed Jul 17, 2024
1 parent 245377f commit 5b0306d
Show file tree
Hide file tree
Showing 6 changed files with 184 additions and 60 deletions.
58 changes: 22 additions & 36 deletions implementations/example_evolve.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,11 @@

def main():
parser = argparse.ArgumentParser(description='Options')
parser.add_argument("--seed_file", type=str, default="")
parser.add_argument(
"--seed_file",
type=str,
default="implementations/data/seed_files/alpaca_data.json",
)
parser.add_argument("--column_names", nargs='+', default="instruction")
parser.add_argument("--num_rows", type=int, default=5)
parser.add_argument("--min_len_chars", type=int, default=32)
Expand All @@ -40,53 +44,35 @@ def main():
SeedInstruction(
id="seed_task_0",
short_name="breakfast_suggestion",
instruction="Is there anything I can eat for a breakfast that "
"doesn't include eggs, yet includes protein, and has"
" roughly 700-1000 calories?",
instruction="Give three tips for staying healthy.",
instances=[
Instance(
"",
"Yes, you can have 1 oatmeal banana protein shake and"
" 4 strips of bacon. The oatmeal banana protein shake may"
" contain 1/2 cup oatmeal, 60 grams whey protein powder,"
" 1/2 medium banana, 1tbsp flaxseed oil and 1/2 cup"
"water, totalling about 550 calories. "
"The 4 strips of bacon "
"contains about 200 calories.",
"" "",
)
],
is_classification=False,
),
SeedInstruction(
id="seed_task_1",
short_name="antonym_relation",
instruction="What is the relation between the given pairs?",
instruction="Describe the structure of an atom.",
instances=[
Instance(
"Night : Day :: Right : Left",
"The relation between the given pairs is "
"that they are opposites.",
"",
"" "",
)
],
is_classification=False,
),
SeedInstruction(
id="seed_task_2",
short_name="one_sentence_description",
instruction="Generate a one-sentence description for each"
" of the following people.",
instruction="Identify the odd one out.",
instances=[
Instance(
"- Brack Obama\n- Elon Musk\n- Taylor Swift",
"- Barack Hussein Obama II is an American politician"
" who served as the 44th president of the United States"
" from 2009 to 2017.\n- Elon Musk is the founder, CEO,"
" and chief engineer of SpaceX; angel investor, CEO and"
" product architect of Tesla, Inc.; founder of The Boring"
" Company; co-founder of Neuralink and OpenAI; president"
" of the Musk Foundation; and owner and CEO of Twitter,"
" Inc.\n- Taylor Alison Swift is an American"
" singer-songwriter.",
"Twitter, Instagram, Telegram",
"Telegram",
)
],
is_classification=False,
Expand All @@ -96,15 +82,15 @@ def main():
spec.agent_system = SingleAgent()
llm_pipeline = ChatGPTPipeline(args.openai_model)

generator = EvolveInstructGenerator(
llm_pipeline=llm_pipeline,
seed_data=args.seed_file,
column_names=args.column_names,
num_rows=args.num_rows,
min_len_chars=args.min_len_chars,
max_len_chars=args.max_len_chars,
verbose=True,
)
spec.llm_pipeline = llm_pipeline
spec.seed_data = args.seed_file
spec.column_names = args.column_names
spec.num_rows = args.num_rows
spec.min_len_chars = args.min_len_chars
spec.max_len_chars = args.max_len_chars
spec.verbose = True

generator = EvolveInstructGenerator(spec)
generator.run()


Expand Down
12 changes: 6 additions & 6 deletions implementations/synthetic_datagen/base_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,10 @@ class BaseDataGenerator(ABC):
def generate(self):
pass

@abstractmethod
def curate(self):
pass
# @abstractmethod
# def curate(self):
# pass

@abstractmethod
def evaluate(self):
pass
# @abstractmethod
# def evaluate(self):
# pass
Original file line number Diff line number Diff line change
Expand Up @@ -11,16 +11,16 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# =========== Copyright 2023 @ CAMEL-AI.org. All Rights Reserved. ===========
import json
import logging
import os
import time
import uuid
from enum import Enum
from typing import Optional
import json
import uuid
import os

import numpy as np
import pandas as pd

from datasets import Dataset, DatasetDict, load_dataset
from synthetic_datagen.base_generator import BaseDataGenerator
from synthetic_datagen.evolve_instruct.evolve_instruct_spec import (
Expand Down Expand Up @@ -75,9 +75,14 @@ def __init__(self, spec: Optional[EvolveInstructSpec] = None):
for the generator. If not provided,
a default SelfInstructSpec is used.
"""

self.spec = spec or EvolveInstructSpec()
self.instruction_generator = InstructionGenerator(self.spec)
self.instance_generator = InstanceGenerator(self.spec)
self.seed_text_list = []
self.prompts = []
self.final_prompts = []
self.final_answers = []
self.prompt_templates = dict()
self.prompt_templates['base'] = ""
write_in_korean = "Write in Korean."
Expand Down Expand Up @@ -172,10 +177,11 @@ def generate(self):
followed by
the generation of synthetic instances based on those instructions.
"""
self.create_seed_prompts()
logging.info("Generating synthetic instructions...")
self.instruction_generator.generate()
self.create_prompts()
logging.info("Generating synthetic instances...")
self.instance_generator.generate()
self.create_answers()

def evaluate(self):
raise RuntimeError(
Expand All @@ -197,12 +203,10 @@ def run(self):
'output': self.final_answers[i],
}
)


with open(
f"{
self.seed_data.replace('.jsonl', '').replace('json', '')
}.%s.json"
f"{self.seed_data.replace('.jsonl', '')\
.replace('json', '')}.%s.json"
% str(uuid.uuid4())[:4],
"wt",
) as f:
Expand All @@ -218,27 +222,29 @@ def create_seed_prompts(self):
:return: None
"""


if isinstance(self.seed_data, str) and os.path.exists(self.seed_data):
data = load_dataset("json", data_files=self.seed_data)
if isinstance(self.spec.seed_data, str) and os.path.exists(
self.spec.seed_data
):
data = load_dataset("json", data_files=self.spec.seed_data)
self.seed_text_list = []
for d in data['train']:
s = ""
if isinstance(self.column_names, str):
s = d[self.column_names]
if isinstance(self.spec.column_names, str):
s = d[self.spec.column_names]
else:
for col in self.column_names:
for col in self.spec.column_names:
s += d[col] + "\n"
self.seed_text_list.append(s.strip())
assert self.seed_text_list, "data import failed, got empty list"

def create_prompts(self):
print("Creating %d prompts." % self.num_rows)
print("Creating %d prompts." % self.spec.num_rows)
assert self.seed_text_list, "must have seed text list"
t0 = time.time()
self.prompts.clear()
for _ in range(self.num_rows):

for _ in range(self.spec.num_rows):
new_prompt = np.random.choice(self.seed_text_list)
self.prompts.append(new_prompt)
i = 0
Expand Down Expand Up @@ -268,3 +274,111 @@ def convert_list_to_dataset(self, text_list):
ds = DatasetDict()
ds['train'] = Dataset.from_pandas(df)
return ds

# Run one mutation round over the working prompts: wrap each prompt in a
# randomly chosen mutation template, send the batch through
# self.spec.llm_pipeline, clean up the responses, and keep the rewrites that
# pass self.change_approved().
# `iteration` is not read anywhere in this body.
# Returns True while fewer than self.spec.num_rows prompts have been
# collected in self.final_prompts (i.e. another round is still needed).
# NOTE(review): leading indentation is not visible in this view, so the
# nesting implied by the comments below is presumed -- confirm against the
# real file before relying on it.
def mutate(self, iteration):
assert len(self.prompts) == self.spec.num_rows
list_prompts = []
mutations = []
for i in range(self.spec.num_rows):
# Mutation is defined outside this view; presumably an Enum, since
# `.name` is read from the chosen value further down -- TODO confirm.
mutation = np.random.choice(Mutation)
mutations.append(mutation)
# if mutation == Mutation.FRESH_START:
# mutation = Mutation.COMPLICATE
before = self.prompts[i]
# Substitute the current prompt for the "<PROMPT>" placeholder of the
# template registered for the chosen mutation.
prompt = self.prompt_templates[mutation].replace(
"<PROMPT>", before
)
list_prompts.append(prompt)

# The pipeline is fed ds['train']; the assert checks that no row was lost
# or added while converting the list.
ds = self.convert_list_to_dataset(list_prompts)
assert (
ds['train'].num_rows
== len(list_prompts)
== self.spec.num_rows
== len(self.prompts)
)
t0 = time.time()
after = self.spec.llm_pipeline(ds['train'])
assert len(after) == self.spec.num_rows
t1 = time.time()
# The label says "HFPipeline" whatever pipeline object is configured.
print("HFPipeline took %.4f seconds" % (t1 - t0))

for i in range(len(after)):
# Keep only the text after the last "Prompt#:" marker in the response.
after[i] = after[i].split("Prompt#:")[-1].strip()
# Drop a leading "New Prompt:" label if the response starts with one.
for pp in ['New Prompt:\n', 'New Prompt: ']:
if after[i][: len(pp)] == pp:
after[i] = after[i][len(pp) :]
after[i] = after[i].strip()
use_new_prompt, why = self.change_approved(
self.prompts[i], after[i]
)
if self.spec.verbose:
print("===========================")
print("Old Prompt: %s" % self.prompts[i])
print("Mutation: %s" % mutations[i].name)
print("New Prompt: %s" % after[i])
print("===========================")

if use_new_prompt:
# Approved and within [min_len_chars, max_len_chars]: store it as a
# final prompt and restart this slot from a random seed text.
if (
self.spec.max_len_chars
>= len(after[i])
>= self.spec.min_len_chars
):
self.final_prompts.append(after[i])
print(
"Prompt was accepted, now have %d good prompts."
% len(self.final_prompts)
)
self.prompts[i] = np.random.choice(self.seed_text_list)
print("Creating new prompt.")
else:
# Approved but outside the length bounds: keep the rewrite as the
# working prompt for this slot.
self.prompts[i] = after[i]
print("Prompt was successfully modified.")
else:
# Rejected: self.prompts[i] is left untouched.
print("Mutation rejected, will try again. Reason: %s" % why)
print("", flush=True)
return len(self.final_prompts) < self.spec.num_rows

def change_approved(self, before, after):
    """Decide whether a mutated prompt may replace the prompt it came from.

    The checks run in order and the first one that fails determines the
    reason string that is returned.

    Args:
        before: The prompt text that was sent for mutation.
        after: The cleaned-up text returned for that prompt.

    Returns:
        A ``(approved, reason)`` tuple. ``approved`` is ``True`` only when
        every check passes, in which case ``reason`` is ``"ok"``. Otherwise
        ``reason`` is one of ``"same"``, ``"too many lines"``,
        ``"too many items"``, ``"prompt leaked 1"``, ``"prompt leaked 2"``,
        ``"prompt leaked 3"``, ``"AI"`` or ``"sorry"``.
    """
    if before == after:
        return False, "same"

    # Shape checks: reject text that is mostly line breaks, or a list of
    # more than ten "- " items with exactly one line break per item.
    newline_count = after.count('\n')
    if newline_count > after.count(" ") * 2:
        return False, "too many lines"
    if newline_count == after.count("- ") > 10:
        return False, "too many items"

    # Leak checks: the result must not contain the base template (skipped
    # when that template is empty) or the "New Prompt" scaffolding.
    base_template = self.prompt_templates['base']
    if base_template and base_template in after:
        return False, "prompt leaked 1"
    if "#New Prompt#" in after:
        return False, "prompt leaked 2"

    after_lower = after.lower()
    if "new prompt" in after_lower:
        return False, "prompt leaked 3"

    # Reject results that mention OpenAI, or that mention GPT when the
    # source prompt did not.
    if "openai" in after_lower:
        return False, "AI"
    if "gpt" in after_lower and "gpt" not in before.lower():
        return False, "AI"

    # Reject a shorter result that contains the Korean phrase below when
    # the source prompt did not already contain its stem.
    if (
        "죄송하지만" in after_lower
        and "죄송" not in before.lower()
        and len(after) < len(before)
    ):
        return False, "sorry"

    # The former `if False:` branch (an LLM-based "are these prompts equal"
    # check) was unreachable and referenced a stale attribute, so it has
    # been removed.
    return True, "ok"
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from rouge_score import rouge_scorer
from synthetic_datagen.agent_systems.base_agent_system import BaseAgentSystem
from synthetic_datagen.agent_systems.single_agent import SingleAgent
from transformers import pipeline

from implementations.synthetic_datagen.utils.seed_instruction import (
SeedInstruction,
Expand Down Expand Up @@ -72,7 +73,14 @@ class EvolveInstructSpec:
)
)
agent_system: BaseAgentSystem = field(default_factory=SingleAgent)
llm_pipeline: pipeline = None
seed_data: List[str] = None
seed_instructions: List[SeedInstruction] = field(default=list)
column_names: List[str] = field(default_factory=list) # ["instruction"]
num_rows: int = 10
min_len_chars: int = 512
max_len_chars: int = 1024
verbose: bool = False
include_seed_tasks: bool = False
synthetic_data_dir: str = Path("data/gpt4_generations/")
num_prompt_instructions: int = 3
Expand Down
13 changes: 13 additions & 0 deletions implementations/synthetic_datagen/evolve_instruct/prompts.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# =========== Copyright 2023 @ CAMEL-AI.org. All Rights Reserved. ===========
# Licensed under the Apache License, Version 2.0 (the “License”);
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an “AS IS” BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =========== Copyright 2023 @ CAMEL-AI.org. All Rights Reserved. ===========
3 changes: 3 additions & 0 deletions implementations/synthetic_datagen/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,9 @@ def __call__(self, dataset):
response = None
count = 0
while not response and count < 3:
import pdb

pdb.set_trace()
try:
response = openai.ChatCompletion.create(
# model="gpt-3.5-turbo-0613",
Expand Down

0 comments on commit 5b0306d

Please sign in to comment.