Skip to content

Commit f7f07f5

Browse files
committed
Keep changes in evals directory only
1 parent 2420c62 commit f7f07f5

12 files changed

+7568
-0
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
generated
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,279 @@
1+
import json
2+
import os
3+
import random
4+
import re
5+
6+
import pandas as pd
7+
8+
# Single seed handed to both `random.seed` and the DataFrame sampler in this
# script, so that repeated runs draw the same values.
SEED = 42
9+
10+
11+
def load_quran_data(file_path):
    """Read the pipe-delimited, header-less verse file into a DataFrame.

    Args:
        file_path: Path of the ``|``-separated file; each line holds three fields.

    Returns:
        A DataFrame whose columns are named ``chapter``, ``verse`` and ``text``.
    """
    column_names = ["chapter", "verse", "text"]
    return pd.read_csv(file_path, sep="|", header=None, names=column_names)
13+
14+
15+
def load_chapter_names(file_path):
    """Load the chapter metadata JSON file as a DataFrame.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The DataFrame produced by ``pandas.read_json`` for that file.
    """
    chapters = pd.read_json(file_path)
    return chapters
17+
18+
19+
def extract_random_ayas(df, number_of_ayas):
    """Draw a reproducible random sample of rows from ``df``.

    Reseeds the global ``random`` generator with ``SEED`` as a side effect; the
    sample itself is controlled by passing ``SEED`` as ``random_state``.

    Args:
        df: DataFrame to sample from.
        number_of_ayas: Number of rows to draw.

    Returns:
        A DataFrame holding the sampled rows.
    """
    random.seed(SEED)
    sampled = df.sample(n=number_of_ayas, random_state=SEED)
    return sampled
22+
23+
24+
def load_distractors(file_path):
    """Parse the UTF-8 JSON file at ``file_path`` and return its content.

    Args:
        file_path: Path of the JSON file holding the distractor texts.

    Returns:
        Whatever object the JSON document decodes to.
    """
    with open(file_path, encoding="utf-8") as handle:
        return json.load(handle)
28+
29+
30+
def generate_mcq_questions(aya, distractors_list, n=3, seed=None):
    """Build one "which of these texts is Quranic?" multiple-choice question.

    The verse text is mixed with ``n`` distractors, the options are labelled
    ``A``, ``B``, ... and the same option list is embedded in an English and an
    Arabic prompt.

    Randomness comes from a private generator keyed on ``seed`` and the verse
    text. Reseeding the global generator with the same constant on every call
    would select the same distractors and the same option order for every
    question, so the correct answer would always carry the same letter.

    Args:
        aya: Mapping-like object (dict or DataFrame row) with a ``"text"`` entry.
        distractors_list: Candidate wrong answers; entries equal to the verse
            text are ignored.
        n: Number of distractors to include.
        seed: Base seed for the option draw. Defaults to the module-level
            ``SEED`` when omitted.

    Returns:
        A ``(question_en, question_ar, correct_label)`` tuple.

    Raises:
        ValueError: If fewer than ``n`` usable distractors are available.
    """
    if seed is None:
        seed = SEED

    correct_answer = aya["text"]
    valid_distractors = [d for d in distractors_list if d != correct_answer]
    if len(valid_distractors) < n:
        raise ValueError(
            f"Need {n} distractors but only {len(valid_distractors)} are available"
        )

    # Deterministic per verse, yet different from one verse to the next.
    rng = random.Random(f"{seed}:{correct_answer}")
    options = rng.sample(valid_distractors, n)
    correct_position = rng.randrange(len(options) + 1)
    options.insert(correct_position, correct_answer)

    # One label per option, so no option is ever dropped when n != 3.
    option_labels = [chr(ord("A") + offset) for offset in range(len(options))]
    correct_label = option_labels[correct_position]

    options_text = "; ".join(
        f"{label}. {option}" for label, option in zip(option_labels, options)
    )
    labels_text = ", ".join(option_labels)
    question_content_en = f"Which of the following is a text from the Quran? {options_text}, please answer with the letter of the correct option ({labels_text}) only"
    question_content_ar = f"أي من التالي هو نص من القرآن؟ {options_text}, يرجى الإجابة بحرف الخيار الصحيح ({labels_text}) فقط"

    return question_content_en, question_content_ar, correct_label
51+
52+
53+
def redact_aya(text, all_texts):
    """Blank out a random run of words such that the remainder stays unambiguous.

    Up to 100 candidate spans are tried. A candidate is accepted when exactly
    one entry of ``all_texts`` matches the regex "escaped prefix, anything,
    escaped suffix" (checked with ``re.match``, i.e. anchored at the start only).

    NOTE: the global ``random`` generator is reseeded with ``SEED`` on every
    call, so the sequence of candidate spans depends only on the word count.

    Args:
        text: The verse to redact.
        all_texts: Texts against which the redacted form is checked.

    Returns:
        ``(redacted_aya, first_section, missing_section, third_section)`` for
        the first accepted candidate, or ``None`` when ``text`` has at most one
        word or no candidate is accepted.
    """
    random.seed(SEED)
    tokens = text.split()
    word_count = len(tokens)
    if word_count <= 1:
        return None

    attempts_left = 100
    while attempts_left > 0:
        attempts_left -= 1

        start = random.randint(0, word_count - 1)
        end = random.randint(start + 1, word_count)

        first_section = " ".join(tokens[:start])
        missing_section = " ".join(tokens[start:end])
        third_section = " ".join(tokens[end:])
        redacted_aya = f"{first_section} ________ {third_section}".strip()

        matcher = re.compile(re.escape(first_section) + ".*" + re.escape(third_section))
        match_count = sum(1 for candidate in all_texts if matcher.match(candidate))
        if match_count == 1:
            return redacted_aya, first_section, missing_section, third_section

    return None
72+
73+
74+
def generate_bilingual_questions(ayas_df, question_type, distractors=None):
    """Turn a table of ayas into chat-style eval samples, half English, half Arabic.

    The first ``len(ayas_df) // 2`` rows, counted by position rather than by
    index label, receive the English prompts; the remaining rows receive the
    Arabic prompts.

    Args:
        ayas_df: DataFrame with one aya per row. Columns read per question type:
            ``missing_text`` -> ``redacted``, ``missing_section``, ``name``,
            ``chapter``, ``verse``; ``surah_name`` -> ``text``, ``name``,
            ``transliteration``, ``translation``; ``surah_type`` -> ``text``,
            ``type``; ``mcq`` -> ``text``.
        question_type: One of ``"missing_text"``, ``"surah_name"``,
            ``"surah_type"`` or ``"mcq"``.
        distractors: Wrong-answer pool for ``"mcq"`` questions. When omitted,
            the module-level ``distractors_list`` is used.

    Returns:
        A list of ``{"input": [system_message, user_message], "ideal": [...]}``
        dicts, one per row and in row order.

    Raises:
        ValueError: If ``question_type`` is not one of the supported values.
    """
    supported_types = ("missing_text", "surah_name", "surah_type", "mcq")
    if question_type not in supported_types:
        raise ValueError(
            f"Unsupported question_type {question_type!r}; expected one of {supported_types}"
        )

    # The global RNG is not reseeded here: this function draws no random
    # numbers itself, and MCQ generation takes care of its own seeding.

    if question_type == "mcq":
        user_prompt_en = "Please provide the answer by selecting the correct letter (A, B, C, or D) without any extra commentary"
        user_prompt_ar = "يرجى تقديم الإجابة عن طريق تحديد الحرف الصحيح (A, B, C, أو D) دون أي تعليق إضافي"
    else:
        user_prompt_en = "Please provide the answer, and ONLY the answer without any extra commentary"
        user_prompt_ar = "يرجى تقديم الإجابة. وفقط الإجابة دون أي تعليق إضافي"

    bilingual_questions = []
    half_length = len(ayas_df) // 2
    include_extra_info = True

    # Enumerate positions explicitly: index labels of a filtered DataFrame are
    # not contiguous, so comparing them with half_length skews the split.
    for position, (_, row) in enumerate(ayas_df.iterrows()):
        if question_type == "missing_text":
            if include_extra_info:
                extra_info_en = f" This text is from Surah {row['name']} (Chapter {row['chapter']}, Verse {row['verse']})."
                extra_info_ar = f" هذا النص القرآني من سورة {row['name']} (السورة {row['chapter']}، الآية {row['verse']})."
            else:
                extra_info_en = ""
                extra_info_ar = ""
            question_content_en = f"Fill in the blank of the following Quranic text: (({row['redacted']})) to complete the full verse.{extra_info_en}. The answer may be one or more words."
            question_content_ar = f"املأ الفراغ في النص القرآني التالي: (({row['redacted']})) لإتمام الآية كاملة.{extra_info_ar}. قد تكون الإجابة عبارة عن كلمة واحدة أو أكثر."
            ideal_answer = [row["missing_section"]]

        elif question_type == "surah_name":
            question_content_en = f"Identify the Surah (in Arabic) of the following Quranic text: {row['text']} (Please provide the answer without diacritics but keep hamza and madda)."
            question_content_ar = f"حدد اسم السورة للنص القرآني التالي: {row['text']} (يرجى تقديم الإجابة بدون تشكيل ولكن احتفظ بالهمزة والمد)."
            ideal_answer = [row["name"], row["transliteration"], row["translation"]]

        elif question_type == "surah_type":
            question_content_en = f"Determine if the Surah of the following Quranic aya text is meccan or medinan: {row['text']} answer only with either 'meccan' or 'medinan' (exactly in small case)."
            question_content_ar = f"حدد إذا كانت السورة للنص القرآني التالي مكية أو مدنية: {row['text']} أجب فقط بـ 'مكية' أو 'مدنية' (بدون تشكيل)."
            if row["type"] == "meccan":
                answer_arabic_translations = ["مكية", "مكي", "مكة"]
                answer_english_variants = ["meccan", "mecca", "maccan"]
            else:
                answer_arabic_translations = ["مدنية", "مدني", "المدينة"]
                answer_english_variants = ["madinan", "medinan", "madina"]
            # Accept the canonical type, its English spelling variants and the
            # Arabic forms; dict.fromkeys removes repeats while keeping order.
            ideal_answer = list(
                dict.fromkeys(
                    [row["type"], *answer_english_variants, *answer_arabic_translations]
                )
            )

        else:  # "mcq"
            distractor_pool = distractors if distractors is not None else distractors_list
            question_content_en, question_content_ar, correct_label = generate_mcq_questions(
                row, distractor_pool
            )
            ideal_answer = [correct_label]

        if position < half_length:  # English questions
            system_prompt, user_prompt = question_content_en, user_prompt_en
        else:  # Arabic questions
            system_prompt, user_prompt = question_content_ar, user_prompt_ar

        bilingual_questions.append(
            {
                "input": [
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt},
                ],
                "ideal": ideal_answer,
            }
        )

        # Toggle extra info for next question
        include_extra_info = not include_extra_info

    return bilingual_questions
162+
163+
164+
if __name__ == "__main__":
    # Script entry point: load the resources, sample ayas, build the four
    # question sets and write each one both as indented JSON and as JSON Lines.
    quran_file_path = "evals/registry/data/quran_eval/gen_script/resources/Arabic-Original.csv"
    chapters_file_path = "evals/registry/data/quran_eval/gen_script/resources/chapters-en.json"
    distractors_file_path = (
        "evals/registry/data/quran_eval/gen_script/resources/distractors_not_quranic.json"
    )

    random.seed(SEED)

    # Load and prepare data
    quran_df = load_quran_data(quran_file_path)
    chapters_df = load_chapter_names(chapters_file_path)
    random_ayas_df = extract_random_ayas(quran_df, 350)
    # Kept under this exact module-level name: generate_bilingual_questions
    # reads `distractors_list` when it builds MCQ questions.
    distractors_list = load_distractors(distractors_file_path)

    random_ayas_df = random_ayas_df.merge(chapters_df, left_on="chapter", right_on="id")
    random_ayas_df.drop(columns=["id", "total_verses"], inplace=True)

    # Apply the redaction process and validation
    all_texts = quran_df["text"].tolist()
    validated_ayas = []

    for _, row in random_ayas_df.iterrows():
        redaction = redact_aya(row["text"], all_texts)
        if not redaction:
            continue

        # Keep the aya only if no other verse begins with the same text. A
        # literal prefix test is used so that characters in the verse are never
        # interpreted as regular-expression syntax.
        aya_text = row["text"]
        if sum(1 for candidate in all_texts if candidate.startswith(aya_text)) != 1:
            continue

        row = row.copy()
        (
            row["redacted"],
            row["first_section"],
            row["missing_section"],
            row["third_section"],
        ) = redaction
        validated_ayas.append(row)

    validated_ayas_df = pd.DataFrame(validated_ayas)

    # Generate the question sets, keyed by output file stem. The MCQ set uses
    # every sampled aya; the other sets use only the validated ones.
    question_sets = {
        "masked_quranic_text": generate_bilingual_questions(validated_ayas_df, "missing_text"),
        "guess_quran_surah_name": generate_bilingual_questions(validated_ayas_df, "surah_name"),
        "guess_quran_surah_type": generate_bilingual_questions(validated_ayas_df, "surah_type"),
        "guess_which_text_is_from_quran": generate_bilingual_questions(random_ayas_df, "mcq"),
    }

    # Human-readable copies: one indented JSON document per question set.
    generated_folder = "evals/registry/data/quran_eval/gen_script/generated"
    os.makedirs(generated_folder, exist_ok=True)
    for file_stem, questions in question_sets.items():
        with open(f"{generated_folder}/{file_stem}.json", "w", encoding="utf-8") as file:
            json.dump(questions, file, ensure_ascii=False, indent=4)

    # Final outputs: JSON Lines, one question per line.
    output_folder = "evals/registry/data/quran_eval"
    os.makedirs(output_folder, exist_ok=True)
    for file_stem, questions in question_sets.items():
        with open(f"{output_folder}/{file_stem}.jsonl", "w", encoding="utf-8") as file:
            file.writelines(
                json.dumps(question, ensure_ascii=False) + "\n" for question in questions
            )

0 commit comments

Comments
 (0)