add videofinal utils functions

EvolvingLMMs-Lab · Dec 14, 2024 · a5bcb73 · a5bcb73
1 parent 1345d43
commit a5bcb73
Show file tree

Hide file tree

Showing 18 changed files with 946 additions and 0 deletions.
diff --git a/lmms_eval/tasks/videofinal/__init__.py b/lmms_eval/tasks/videofinal/__init__.py
diff --git a/lmms_eval/tasks/videofinal/_default_template_yaml b/lmms_eval/tasks/videofinal/_default_template_yaml
@@ -0,0 +1,14 @@
+dataset_path: lmms-lab/videofinal
+dataset_kwargs:
+  token: True
+  video: True
+  # force_download: True
+  # force_unzip: True
+  cache_dir: videommmu_1213
+lmms_eval_specific_kwargs:
+  default:
+    pre_prompt: "You should watch and learn the video content. Then apply what you learned to "
+    mcq_prompt: "answer the following multi-choice question. The image for this question is at the end of the video.\n"
+    open_ended_prompt: "answer the following open-ended question. The image for this question is at the end of the video.\n"
+generation_kwargs:
+  max_new_tokens: 1024
diff --git a/lmms_eval/tasks/videofinal/adaptation.yaml b/lmms_eval/tasks/videofinal/adaptation.yaml
@@ -0,0 +1,16 @@
+dataset_name: "Adaptation"
+task: "video_mmmu_adaptation"
+test_split: test
+output_type: generate_until
+doc_to_visual: !function utils.videoperception_doc_to_visual
+doc_to_text: !function utils.videoperception_doc_to_text_adaptation
+doc_to_target: !function utils.videoperception_doc_to_answer
+process_results: !function utils.videoperception_process_results
+metric_list:
+  - metric: submission
+    aggregation: !function utils.videoperception_aggregate_results_for_submission
+    higher_is_better: true
+  - metric: mmmu_acc
+    aggregation: !function utils.videoperception_aggregate_results
+    higher_is_better: true
+include: _default_template_yaml
diff --git a/lmms_eval/tasks/videofinal/adaptation_question_only.yaml b/lmms_eval/tasks/videofinal/adaptation_question_only.yaml
@@ -0,0 +1,16 @@
+dataset_name: "Adaptation"
+task: "video_mmmu_adaptation_no_video"
+test_split: test
+output_type: generate_until
+doc_to_visual: !function utils.videoperception_doc_to_visual_no_video
+doc_to_text: !function utils.videoperception_doc_to_text_no_preprompt
+doc_to_target: !function utils.videoperception_doc_to_answer
+process_results: !function utils.videoperception_process_results
+metric_list:
+  - metric: submission
+    aggregation: !function utils.videoperception_aggregate_results_for_submission
+    higher_is_better: true
+  - metric: mmmu_acc
+    aggregation: !function utils.videoperception_aggregate_results
+    higher_is_better: true
+include: _default_template_yaml
diff --git a/lmms_eval/tasks/videofinal/audio_gemini.yaml b/lmms_eval/tasks/videofinal/audio_gemini.yaml
@@ -0,0 +1,5 @@
+group: video_mmmu_audio_gemini
+task:
+- videommmu_audio_gemini_application
+- videommmu_audio_gemini_comprehension
+- videommmu_audio_gemini_videoperception
diff --git a/lmms_eval/tasks/videofinal/audio_gemini_application.yaml b/lmms_eval/tasks/videofinal/audio_gemini_application.yaml
@@ -0,0 +1,13 @@
+dataset_name: "application_augmented"
+task: "videommmu_audio_gemini_application"
+test_split: test
+output_type: generate_until
+doc_to_visual: !function utils.videoperception_doc_to_visual_perception
+doc_to_text: !function audio_gemini_utils.doc_to_text
+doc_to_target: !function utils.videoperception_doc_to_answer
+process_results: !function audio_gemini_utils.process_results
+metric_list:
+  - metric: audio
+    aggregation: !function audio_gemini_utils.aggregate_results_application
+    higher_is_better: true
+include: _default_template_yaml
diff --git a/lmms_eval/tasks/videofinal/audio_gemini_comprehension.yaml b/lmms_eval/tasks/videofinal/audio_gemini_comprehension.yaml
@@ -0,0 +1,13 @@
+# NOTE(review): dataset_name is "application_augmented" although this task is the comprehension variant; this looks like a copy-paste from audio_gemini_application.yaml. Confirm the intended dataset config.
+dataset_name: "application_augmented"
+task: "videommmu_audio_gemini_comprehension"
+test_split: test
+output_type: generate_until
+doc_to_visual: !function utils.videoperception_doc_to_visual_perception
+doc_to_text: !function audio_gemini_utils.doc_to_text
+doc_to_target: !function utils.videoperception_doc_to_answer
+process_results: !function audio_gemini_utils.process_results
+metric_list:
+  - metric: audio
+    aggregation: !function audio_gemini_utils.aggregate_results_comprehension
+    higher_is_better: true
+include: _default_template_yaml
diff --git a/lmms_eval/tasks/videofinal/audio_gemini_utils.py b/lmms_eval/tasks/videofinal/audio_gemini_utils.py
@@ -0,0 +1,88 @@
+import json
+
+import pandas as pd
+from loguru import logger
+
+from lmms_eval.tasks._task_utils.file_utils import generate_submission_file
+from lmms_eval.tasks.videoperception.utils import (
+    videoperception_doc_to_answer as original_doc_to_answer,
+)
+from lmms_eval.tasks.videoperception.utils import (
+    videoperception_doc_to_text as original_doc_to_text,
+)
+
+# Prompt asking the judging model whether a question can only be answered with
+# the video's audio track. `{question}` and `{answer}` are filled in through
+# `str.format` (see `doc_to_text`), which is why the literal braces of the JSON
+# example are doubled.
+template = """\
+[System]
+
+You are an assistant that helps with question evaluation. I will provide you with a video, along with a pair of questions and answers. Your task is to assess whether the question requires audio information from the video to be answered, or if it can be answered purely through visual information. You need to provide a complete and detailed reason explaining why the question does or does not require audio from the video.
+
+[Question]
+
+{question}
+
+[Answer]
+
+{answer}
+
+[Standard]
+
+The standard for determining whether audio is necessary is: if a question does not require audio, then I should be able to turn off the video's sound and still be able to infer the correct answer entirely from the visual information.
+
+[Output Format]
+
+Your answer must strictly follow the JSON format below:
+
+{{
+    "reason": "This question requires audio information from the video to be answered because...",
+    "use_audio": true
+}}
+
+"use_audio" should be set to "true" if the question requires audio information from the video to be answered, and "false" otherwise.
+
+Please note that you should output only the JSON code, with no additional information.\
+"""
+
+
+def doc_to_text(doc, lmms_eval_specific_kwargs=None):
+    """Build the audio-necessity judging prompt for a single document.
+
+    The question text comes from ``original_doc_to_text`` and the reference
+    answer from ``original_doc_to_answer``; both are substituted into the
+    module-level ``template``.
+    """
+    return template.format(
+        question=original_doc_to_text(doc, lmms_eval_specific_kwargs),
+        answer=original_doc_to_answer(doc),
+    )
+
+
+def process_results(doc, results: list):
+    """Parse the judging model's JSON verdict for one document.
+
+    Args:
+        doc: Dataset record; only ``doc["id"]`` is read.
+        results: Model outputs for this document; the first element is the
+            raw response text.
+
+    Returns:
+        ``{"audio": verdict}`` where ``verdict`` is the parsed JSON object
+        with ``"use_audio"`` normalised to ``1`` (needs audio) or ``0`` (does
+        not), plus the document ``"id"``. When the response cannot be parsed,
+        ``"use_audio"`` is ``-1`` and ``"reason"`` carries the error and the
+        raw response.
+    """
+    raw = results[0].strip()
+    try:
+        payload = raw
+        # Models often wrap JSON in a Markdown fence, with or without the
+        # "json" language tag; strip either form before parsing.
+        if payload.startswith("```") and payload.endswith("```"):
+            payload = payload[3:-3]
+            if payload.startswith("json"):
+                payload = payload[4:]
+        verdict = json.loads(payload)
+        flag = verdict["use_audio"]
+        if isinstance(flag, str):
+            # The prompt quotes "true"/"false", so the model may answer with
+            # a JSON string; a plain truthiness test would score "false" as 1.
+            lowered = flag.strip().lower()
+            if lowered not in ("true", "false"):
+                raise ValueError(f"Unrecognised use_audio value: {flag!r}")
+            flag = lowered == "true"
+        verdict["use_audio"] = 1 if flag else 0
+    except Exception as e:
+        # Best-effort by design: a malformed response is recorded as -1
+        # instead of aborting the whole evaluation run.
+        verdict = {
+            "reason": f"Failed to parse the results. Error: {e}\nResults: {raw}",
+            "use_audio": -1,
+        }
+    verdict["id"] = doc["id"]
+    return {"audio": verdict}
+
+
+def aggregate_results_subtask(results, subtask, args):
+    """Write the per-document verdicts to a spreadsheet and count audio hits.
+
+    The verdicts are saved to ``video_mmmu_audio_{subtask}.xlsx`` at the path
+    returned by ``generate_submission_file``. The return value is the number
+    of rows whose ``use_audio`` equals 1, or 0 when there is none.
+    """
+    out_path = generate_submission_file(f"video_mmmu_audio_{subtask}.xlsx", args)
+    logger.info(f"Saving results to {out_path}")
+    frame = pd.DataFrame(results)
+    frame.to_excel(out_path, index=False)
+    counts = frame["use_audio"].value_counts()
+    logger.info(f"Use audio counts: {counts.to_dict()}")
+    if 1 in counts:
+        return counts[1]
+    return 0
+
+
+def aggregate_results_application(results, args):
+    """Aggregate audio verdicts under the "application" subtask label."""
+    return aggregate_results_subtask(results, subtask="application", args=args)
+
+
+def aggregate_results_comprehension(results, args):
+    """Aggregate audio verdicts under the "comprehension" subtask label."""
+    return aggregate_results_subtask(results, subtask="comprehension", args=args)
+
+
+def aggregate_results_videoperception(results, args):
+    """Aggregate audio verdicts under the "videoperception" subtask label."""
+    return aggregate_results_subtask(results, subtask="videoperception", args=args)
diff --git a/lmms_eval/tasks/videofinal/audio_gemini_videoperception.yaml b/lmms_eval/tasks/videofinal/audio_gemini_videoperception.yaml
@@ -0,0 +1,13 @@
+# NOTE(review): dataset_name is "application_augmented" although this task is the videoperception variant; this looks like a copy-paste from audio_gemini_application.yaml. Confirm the intended dataset config.
+dataset_name: "application_augmented"
+task: "videommmu_audio_gemini_videoperception"
+test_split: test
+output_type: generate_until
+doc_to_visual: !function utils.videoperception_doc_to_visual_perception
+doc_to_text: !function audio_gemini_utils.doc_to_text
+doc_to_target: !function utils.videoperception_doc_to_answer
+process_results: !function audio_gemini_utils.process_results
+metric_list:
+  - metric: audio
+    aggregation: !function audio_gemini_utils.aggregate_results_videoperception
+    higher_is_better: true
+include: _default_template_yaml
diff --git a/lmms_eval/tasks/videofinal/audio_group.yaml b/lmms_eval/tasks/videofinal/audio_group.yaml
@@ -0,0 +1,5 @@
+group: videommmu_audio
+task:
+- videommmu_application_audio
+- videommmu_comprehension_audio
+- videommmu_perception_audio
diff --git a/lmms_eval/tasks/videofinal/comprehension.yaml b/lmms_eval/tasks/videofinal/comprehension.yaml
@@ -0,0 +1,16 @@
+dataset_name: "comprehension"
+task: "videommmu_comprehension"
+test_split: test
+output_type: generate_until
+doc_to_visual: !function utils.videoperception_doc_to_visual_perception
+doc_to_text: !function utils.videoperception_doc_to_text_perception
+doc_to_target: !function utils.videoperception_doc_to_answer
+process_results: !function utils.videoperception_process_results
+metric_list:
+  - metric: submission
+    aggregation: !function utils.videoperception_aggregate_results_for_submission
+    higher_is_better: true
+  - metric: mmmu_acc
+    aggregation: !function utils.videoperception_aggregate_results
+    higher_is_better: true
+include: _default_template_yaml