-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain_gen_questions.py
70 lines (57 loc) · 3.1 KB
/
main_gen_questions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
import os
import json
import argparse
from src.eval_generator.dataset_loader import DatasetLoader
from src.eval_generator.notebook_processor import NotebookProcessor
from src.eval_generator.client import GPT4oClient
from data.tasks_and_skills import SKILLS, TASKS
from src.eval_generator.utils import check_task_and_skill
from src.eval_generator.rag_answer_validator import RAGAnswerValidator
def main(
    csvs_directory: str,
    jsons_directory: str,
    notebooks_directory: str,
    api_key: str,
) -> None:
    """Generate and save a question set for every dataset subfolder.

    Every immediate subdirectory of ``csvs_directory`` is handled as one
    dataset; the same subdirectory name is joined onto ``jsons_directory``
    and ``notebooks_directory``. For each dataset the notebook is turned
    into cell batches, questions are generated per batch, filtered, reduced
    by the client's selection step, and written to
    ``<jsons_directory>/<subdir>/questions.json``.

    Args:
        csvs_directory: Directory whose subfolders are iterated. Entries
            that are not directories are skipped.
        jsons_directory: Directory holding the matching subfolders; each one
            is passed to ``DatasetLoader`` and receives ``questions.json``.
        notebooks_directory: Directory holding the matching subfolders
            passed to ``NotebookProcessor``.
        api_key: Key handed to ``GPT4oClient``.
    """
    gpt4o_client = GPT4oClient(api_key)
    rag_answer_validator = RAGAnswerValidator(gpt4o_client)

    # os.listdir() returns entries in arbitrary order; sorting makes the
    # processing (and log) order reproducible between runs.
    for subdir in sorted(os.listdir(csvs_directory)):
        csv_subdir_path = os.path.join(csvs_directory, subdir)
        if not os.path.isdir(csv_subdir_path):
            continue

        json_subdir_path = os.path.join(jsons_directory, subdir)
        notebook_subdir_path = os.path.join(notebooks_directory, subdir)
        print(f"Processing directory: {csv_subdir_path}")

        # Load the dataset and generate summary
        dataset_loader = DatasetLoader(csv_subdir_path, json_subdir_path)
        dataset_summary = dataset_loader.get_summary()

        # Process the notebook and create cell batches
        notebook_processor = NotebookProcessor(notebook_subdir_path)
        cell_batches = notebook_processor.process_notebook()

        questions = []
        for batch in cell_batches:
            batch_questions = gpt4o_client.generate_questions(
                batch, dataset_summary, SKILLS, TASKS
            )
            for batch_question in batch_questions:
                # The task/skill check runs first, so a question that fails
                # it never reaches the answer validator.
                if not check_task_and_skill(
                    batch_question["task"], batch_question["skill"]
                ):
                    continue
                if not rag_answer_validator.check_answer(
                    batch_question["question"], batch_question["answer"], batch
                ):
                    continue
                questions.append(batch_question)

        # Select the best questions. NOTE(review): the original comment said
        # "10 best"; the count is decided inside select_best_questions and
        # is not visible here - confirm.
        best_questions = gpt4o_client.select_best_questions(questions)

        # Save the best questions to a JSON file. The encoding is explicit
        # so the output does not depend on the platform default.
        output_file = os.path.join(json_subdir_path, "questions.json")
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(best_questions, f, indent=4)
        print(f"Saved evaluation set to: {output_file}")
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Generate evaluation sets for data analytics agents."
)
parser.add_argument('csvs_directory', type=str, help='The directory containing subfolders data csvs.')
parser.add_argument('jsons_directory', type=str, help='The directory containing subfolders meta information.')
parser.add_argument('notebooks_directory', type=str, help='The directory containing subfolders notebooks.')
parser.add_argument("api_key", type=str, help="The API key for OpenAI.")
args = parser.parse_args()
main(args.csvs_directory, args.jsons_directory, args.notebooks_directory, args.api_key)