
Commit

Merge commit '08d4151cea53725b3e016bf546b58bece8d51c38'
pufanyi committed Mar 29, 2024
1 parent a4d86da commit 70636cf
Showing 9 changed files with 570 additions and 4 deletions.
4 changes: 2 additions & 2 deletions README.md
@@ -9,9 +9,9 @@
🏠 [Homepage](https://lmms-lab.github.io/) | 🎉 [Blog](https://lmms-lab.github.io/lmms-eval-blog/lmms-eval-0.1/) | 📚 [Documentation](docs/README.md) | 🤗 [Huggingface Datasets](https://huggingface.co/lmms-lab) | <a href="https://emoji.gg/emoji/1684-discord-thread"><img src="https://cdn3.emoji.gg/emojis/1684-discord-thread.png" width="14px" height="14px" alt="Discord_Thread"></a> [discord/lmms-eval](https://discord.gg/ebAMGSsS)


In today's world, we're on a thrilling quest for Artificial General Intelligence (AGI), driven by a passion that reminds us of the excitement surrounding the 1960s moon landing. At the heart of this adventure are the incredible large language models (LLMs) and large multimodal models (LMMs). These models are like brilliant minds that can understand, learn, and interact with a vast array of human tasks, marking a significant leap toward our goal.
In today's world, we're on an exciting journey toward creating Artificial General Intelligence (AGI), much like the enthusiasm of the 1960s moon landing. This journey is powered by advanced large language models (LLMs) and large multimodal models (LMMs), which are complex systems capable of understanding, learning, and performing a wide variety of human tasks. These advancements bring us closer to achieving AGI.

To truly understand how capable these models are, we've started to create and use a wide variety of evaluation benchmarks. These benchmarks help us map out a detailed chart of abilities, showing us how close we are to achieving true AGI. However, this journey is not without its challenges. The sheer number of benchmarks and datasets we need to look at is overwhelming. They're all over the place - tucked away in someone's Google Drive, scattered across Dropbox, and hidden in the corners of various school and research lab websites. It's like embarking on a treasure hunt where the maps are spread far and wide.
To gauge how advanced these models are, we use a variety of evaluation benchmarks. These benchmarks are tools that help us understand the capabilities of these models, showing us how close we are to achieving AGI. However, finding and using these benchmarks is a big challenge. The necessary benchmarks and datasets are spread out and hidden in various places like Google Drive, Dropbox, and different school and research lab websites. It feels like we're on a treasure hunt, but the maps are scattered everywhere.

In the field of language models, a valuable precedent has been set by [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness). It offers integrated data and model interfaces that enable rapid evaluation of language models, serves as the backend framework for the [open-llm-leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard), and has gradually become the underlying ecosystem of the foundation-model era.
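For context, here is a minimal sketch (not part of this commit) of the kind of rapid, programmatic evaluation lm-evaluation-harness enables. It assumes a v0.4+ release that exposes lm_eval.simple_evaluate; the model and task names below are placeholders.

import lm_eval

# Evaluate a small Hugging Face model on one benchmark task in a few lines.
# "hf" selects the Hugging Face backend; model_args and tasks are illustrative.
results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=EleutherAI/pythia-160m",
    tasks=["hellaswag"],
    num_fewshot=0,
    batch_size=8,
)
print(results["results"])  # per-task metric dictionary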

19 changes: 18 additions & 1 deletion llava_repr_requirements.txt
@@ -27,6 +27,23 @@ shortuuid==1.0.12
sqlitedict==2.1.0
tenacity==8.2.3
torch==2.0.1
openai>=1.0.0
pycocoevalcap
tokenizers==0.15.2
tqdm==4.66.2
transformers==4.37.2
tqdm-multiprocess
zstandard
pillow
pyyaml
sympy
mpmath
Jinja2
openpyxl
Levenshtein
hf_transfer
tenacity
wandb>=0.16.0
transformers-stream-generator
tiktoken
pre-commit
69 changes: 69 additions & 0 deletions lmms_eval/tasks/olympiadbench/cn_utils.py
@@ -0,0 +1,69 @@
import os
import json
import datetime
from lmms_eval.tasks.olympiadbench.olympiadbench_evals import OlympiadBenchEvaluator
from lmms_eval.tasks._task_utils.file_utils import generate_submission_file

import logging
eval_logger = logging.getLogger("lmms-eval")
dir_name = os.path.dirname(os.path.abspath(__file__))

olympiadbench_evaluator = OlympiadBenchEvaluator()


def olympiadbench_doc_to_visual(doc):
    # Convert every problem image to RGB before handing it to the model.
    return [image.convert("RGB") for image in doc["images"]]


def olympiadbench_doc_to_text(doc):
    question = doc["question"]
    subject = doc["subfield"]
    mul_ans = doc["is_multiple_answer"]
    if mul_ans is None:
        mul_ans = False
    ans_type = doc["answer_type"]
    if ans_type == "Need_human_evaluate":
        ans_type = "proof based"

    # "The following is an open-ended problem from a Chinese {subject} competition."
    pre_prompt = f"以下是中国{subject}竞赛中的解答题。\n"

    post_prompt = ""
    if not mul_ans:
        # "The answer type is {ans_type}."
        post_prompt += f"答案类型为{ans_type}\n"
    else:
        # "The problem has multiple answers, all of type {ans_type}."
        post_prompt += f"题目有多个答案,答案类型均为{ans_type}\n"
    # "Compute the answer from the given requirements and information; use LaTeX for variables and
    #  formulas in the solution and result, and end with ..."
    post_prompt += "请根据题目的要求和所提供的信息计算得出答案。解答过程和结果中使用的变量和公式请使用LaTeX格式表示。请在最后以"
    if not mul_ans:
        # '"So the final answer is \boxed{answer}."'
        post_prompt += '"所以最终答案是\\boxed{答案}。"\n'
    else:
        # '"So the final answer is \boxed{multiple answers joined by English commas}."'
        post_prompt += '"所以最终答案是\\boxed{用英文逗号连接的多个答案}。"\n'

    final_question = pre_prompt + question + "\n" + post_prompt
    return final_question


def olympiadbench_process_results(doc, results):
    precision = doc["error"]
    is_proving = "TP" in doc["source"]
    if precision is None:
        precision = 0
    prediction = results[0].strip()

    if is_proving:
        # Theorem-proving problems cannot be auto-graded; keep the raw model output for offline submission.
        return {"submission": prediction}
    else:
        # Keep only the text after the final-answer marker ("所以最终答案是"), then normalize
        # quotes, whitespace, and trailing periods before judging.
        prediction = prediction.split("所以最终答案是")[-1]
        prediction = prediction.replace('"', "").replace("\n", "").replace(" ", "").strip(".").strip("。")
        accuracy = olympiadbench_evaluator.judge(prediction, doc["final_answer"][0], precision)
        accuracy = int(accuracy)
        return {"exact_match": accuracy}


def olympiadbench_aggregate_results(results, args):
    now_date_time = datetime.datetime.now().strftime("%Y-%m%d-%H%M-%S")
    submission_file_name = f"olympiadbench-test-cn-submission-{now_date_time}.json"
    path = generate_submission_file(submission_file_name, args)
    with open(path, "w") as f:
        json.dump(results, f, ensure_ascii=False)
    print(f"Submission file saved to {path}")

69 changes: 69 additions & 0 deletions lmms_eval/tasks/olympiadbench/en_utils.py
@@ -0,0 +1,69 @@
import os
import json
import datetime
from lmms_eval.tasks.olympiadbench.olympiadbench_evals import OlympiadBenchEvaluator
from lmms_eval.tasks._task_utils.file_utils import generate_submission_file

import logging
eval_logger = logging.getLogger("lmms-eval")
dir_name = os.path.dirname(os.path.abspath(__file__))

olympiadbench_evaluator = OlympiadBenchEvaluator()


def olympiadbench_doc_to_visual(doc):
    # Convert every problem image to RGB before handing it to the model.
    return [image.convert("RGB") for image in doc["images"]]


def olympiadbench_doc_to_text(doc):
    question = doc["question"]
    subject = doc["subfield"]
    mul_ans = doc["is_multiple_answer"]
    if mul_ans is None:
        mul_ans = False
    ans_type = doc["answer_type"]
    if ans_type == "Need_human_evaluate":
        ans_type = "proof based"

    pre_prompt = f"The following is a question from an International {subject} competition.\n"

    post_prompt = ""
    if not mul_ans:
        post_prompt += f"The answer of the question should be {ans_type}.\n"
    else:
        post_prompt += f"The question has multiple answers, each of them should be {ans_type}.\n"
    post_prompt += "Please calculate the answer according to the given requirements and the information provided. Please use LaTeX format to represent the variables and formulas used in the solution process and results. Please end your solution with "
    if not mul_ans:
        post_prompt += '"So the final answer is \\boxed{answer}."\n'
    else:
        post_prompt += 'So the final answer is \\boxed{multiple answers connected with commas}.\n'

    final_question = pre_prompt + question + "\n" + post_prompt
    return final_question


def olympiadbench_process_results(doc, results):
    precision = doc["error"]
    is_proving = "TP" in doc["source"]
    if precision is None:
        precision = 0
    prediction = results[0].strip()

    if is_proving:
        # Theorem-proving problems cannot be auto-graded; keep the raw model output for offline submission.
        return {"submission": prediction}
    else:
        # Keep only the text after the final-answer marker, then normalize quotes, whitespace,
        # and trailing periods before judging.
        prediction = prediction.split("final answer is")[-1]
        prediction = prediction.replace('"', "").replace("\n", "").replace(" ", "").strip(".").strip("。")
        accuracy = olympiadbench_evaluator.judge(prediction, doc["final_answer"][0], precision)
        accuracy = int(accuracy)
        return {"exact_match": accuracy}


def olympiadbench_aggregate_results(results, args):
    now_date_time = datetime.datetime.now().strftime("%Y-%m%d-%H%M-%S")
    submission_file_name = f"olympiadbench-test-en-submission-{now_date_time}.json"
    path = generate_submission_file(submission_file_name, args)
    with open(path, "w") as f:
        json.dump(results, f, ensure_ascii=False)
    print(f"Submission file saved to {path}")

6 changes: 6 additions & 0 deletions lmms_eval/tasks/olympiadbench/olympiadbench.yaml
@@ -0,0 +1,6 @@
group: olympiadbench
task:
- olympiadbench_test_en
- olympiadbench_test_cn
metadata:
- version: 0.0
