enable transcripts evaluation all three tracks

EvolvingLMMs-Lab · Dec 17, 2024 · c148532 · c148532
1 parent a5bcb73
commit c148532
Show file tree

Hide file tree

Showing 18 changed files with 64 additions and 204 deletions.
diff --git a/lmms_eval/tasks/videofinal/_default_template_yaml b/lmms_eval/tasks/videofinal/_default_template_yaml
@@ -4,10 +4,11 @@ dataset_kwargs:
   video: True
   # force_download: True
   # force_unzip: True
-  cache_dir: videommmu_1213
+  cache_dir: video_mmmu
 lmms_eval_specific_kwargs:
   default:
     pre_prompt: "You should watch and learn the video content. Then apply what you learned to "
+    perception_and_comprehension_prompt: "\nPlease ignore the Quiz question in last frame of the video."
     mcq_prompt: "answer the following multi-choice question. The image for this question is at the end of the video.\n"
     open_ended_prompt: "answer the following open-ended question. The image for this question is at the end of the video.\n"
 generation_kwargs:

diff --git a/lmms_eval/tasks/videofinal/adaptation_question_only.yaml b/lmms_eval/tasks/videofinal/adaptation_question_only.yaml
@@ -2,7 +2,7 @@ dataset_name: "Adaptation"
 task: "video_mmmu_adaptation_no_video"
 test_split: test
 output_type: generate_until
-doc_to_visual: !function utils.videoperception_doc_to_visual_no_video
+doc_to_visual: !function utils.videoperception_doc_to_visual_question_only
 doc_to_text: !function utils.videoperception_doc_to_text_no_preprompt
 doc_to_target: !function utils.videoperception_doc_to_answer
 process_results: !function utils.videoperception_process_results

diff --git a/...sks/videofinal/videoperception_audio.yaml → ...deofinal/adaptation_with_transcripts.yaml b/...sks/videofinal/videoperception_audio.yaml → ...deofinal/adaptation_with_transcripts.yaml
@@ -1,9 +1,9 @@
-dataset_name: "perception"
-task: "videommmu_perception_audio"
+dataset_name: "Adaptation"
+task: "video_mmmu_adaptation_with_transcripts"
 test_split: test
 output_type: generate_until
-doc_to_visual: !function utils.videoperception_doc_to_visual_perception
-doc_to_text: !function utils.videoperception_doc_to_text_with_transcript
+doc_to_visual: !function utils.videoperception_doc_to_visual
+doc_to_text: !function utils.videoperception_doc_to_text_with_transcript_adaptation
 doc_to_target: !function utils.videoperception_doc_to_answer
 process_results: !function utils.videoperception_process_results
 metric_list:

diff --git a/lmms_eval/tasks/videofinal/audio_gemini.yaml b/lmms_eval/tasks/videofinal/audio_gemini.yaml
diff --git a/lmms_eval/tasks/videofinal/audio_gemini_application.yaml b/lmms_eval/tasks/videofinal/audio_gemini_application.yaml
diff --git a/lmms_eval/tasks/videofinal/audio_gemini_comprehension.yaml b/lmms_eval/tasks/videofinal/audio_gemini_comprehension.yaml
diff --git a/lmms_eval/tasks/videofinal/audio_gemini_utils.py b/lmms_eval/tasks/videofinal/audio_gemini_utils.py
diff --git a/lmms_eval/tasks/videofinal/audio_gemini_videoperception.yaml b/lmms_eval/tasks/videofinal/audio_gemini_videoperception.yaml
diff --git a/lmms_eval/tasks/videofinal/audio_group.yaml b/lmms_eval/tasks/videofinal/audio_group.yaml
diff --git a/lmms_eval/tasks/videofinal/comprehension.yaml b/lmms_eval/tasks/videofinal/comprehension.yaml
@@ -1,9 +1,9 @@
-dataset_name: "comprehension"
-task: "videommmu_comprehension"
+dataset_name: "Comprehension"
+task: "video_mmmu_comprehension"
 test_split: test
 output_type: generate_until
-doc_to_visual: !function utils.videoperception_doc_to_visual_perception
-doc_to_text: !function utils.videoperception_doc_to_text_perception
+doc_to_visual: !function utils.videoperception_doc_to_visual
+doc_to_text: !function utils.videoperception_doc_to_text_perception_comprehension
 doc_to_target: !function utils.videoperception_doc_to_answer
 process_results: !function utils.videoperception_process_results
 metric_list:

diff --git a/...ks/videofinal/videoapplication_audio.yaml → ...final/comprehension_with_transcripts.yaml b/...ks/videofinal/videoapplication_audio.yaml → ...final/comprehension_with_transcripts.yaml
@@ -1,9 +1,9 @@
-dataset_name: "application_augmented"
-task: "videommmu_application_audio"
+dataset_name: "Comprehension"
+task: "video_mmmu_comprehension_with_transcripts"
 test_split: test
 output_type: generate_until
-doc_to_visual: !function utils.videoperception_doc_to_visual_perception
-doc_to_text: !function utils.videoperception_doc_to_text_with_transcript_application
+doc_to_visual: !function utils.videoperception_doc_to_visual
+doc_to_text: !function utils.videoperception_doc_to_text_with_transcript_perception_comprehension
 doc_to_target: !function utils.videoperception_doc_to_answer
 process_results: !function utils.videoperception_process_results
 metric_list:

diff --git a/...val/tasks/videofinal/videoperception.yaml → lmms_eval/tasks/videofinal/perception.yaml b/...val/tasks/videofinal/videoperception.yaml → lmms_eval/tasks/videofinal/perception.yaml
@@ -1,9 +1,9 @@
-dataset_name: "perception"
-task: "videommmu_perception"
+dataset_name: "Perception"
+task: "video_mmmu_perception"
 test_split: test
 output_type: generate_until
-doc_to_visual: !function utils.videoperception_doc_to_visual_perception
-doc_to_text: !function utils.videoperception_doc_to_text_perception
+doc_to_visual: !function utils.videoperception_doc_to_visual
+doc_to_text: !function utils.videoperception_doc_to_text_perception_comprehension
 doc_to_target: !function utils.videoperception_doc_to_answer
 process_results: !function utils.videoperception_process_results
 metric_list:

diff --git a/.../videofinal/videocomprehension_audio.yaml → ...deofinal/perception_with_transcripts.yaml b/.../videofinal/videocomprehension_audio.yaml → ...deofinal/perception_with_transcripts.yaml
@@ -1,9 +1,9 @@
-dataset_name: "comprehension"
-task: "videommmu_comprehension_audio"
+dataset_name: "Perception"
+task: "video_mmmu_perception_with_transcripts"
 test_split: test
 output_type: generate_until
-doc_to_visual: !function utils.videoperception_doc_to_visual_perception
-doc_to_text: !function utils.videoperception_doc_to_text_with_transcript
+doc_to_visual: !function utils.videoperception_doc_to_visual
+doc_to_text: !function utils.videoperception_doc_to_text_with_transcript_perception_comprehension
 doc_to_target: !function utils.videoperception_doc_to_answer
 process_results: !function utils.videoperception_process_results
 metric_list:

diff --git a/lmms_eval/tasks/videofinal/utils.py b/lmms_eval/tasks/videofinal/utils.py
@@ -32,7 +32,6 @@
 # And load it here
 HF_HOME = os.environ["HF_HOME"]
 cache_dir = config["dataset_kwargs"]["cache_dir"]
-cache_dir = os.path.join(HF_HOME, cache_dir)
 
 
 from loguru import logger as eval_logger
@@ -61,10 +60,10 @@ def videoperception_doc_to_visual(doc):
     subject = "_".join(doc["id"].split("_")[1:-1])
 
     # Get the appropriate cache directory based on the subject
-    perception_cache_dir = os.path.join(HF_HOME, cache_dir, get_cache_dir(subject))
+    videommmu_cache_dir = os.path.join(HF_HOME, cache_dir, get_cache_dir(subject))
 
     video_path = doc["id"] + ".mp4"
-    video_path = os.path.join(perception_cache_dir, video_path)
+    video_path = os.path.join(videommmu_cache_dir, video_path)
 
     if os.path.exists(video_path):
         video_path = video_path
@@ -73,7 +72,7 @@ def videoperception_doc_to_visual(doc):
 
     return [video_path]
 
-def videoperception_doc_to_visual_no_video(doc):
+def videoperception_doc_to_visual_question_only(doc):
     video_path = doc["id"] + "_image" + ".mp4"
     question_only_cache_dir =  os.path.join(cache_dir, "question_only") 
     video_path = os.path.join(question_only_cache_dir, video_path)
@@ -117,7 +116,19 @@ def videoperception_doc_to_text_no_preprompt(doc, lmms_eval_specific_kwargs=None
     return f"{question}"
 
 
-def videoperception_doc_to_text_with_transcript(doc, lmms_eval_specific_kwargs=None, transcripts_dir="aud"):
+def videoperception_doc_to_text_perception_comprehension(doc, lmms_eval_specific_kwargs=None):
+    """Return the question, its parsed options, then the perception/comprehension post-prompt (if configured)."""
+    if lmms_eval_specific_kwargs is None:
+        lmms_eval_specific_kwargs = {}
+    # .get() instead of indexing: the empty-dict fallback above would otherwise raise KeyError here.
+    post_prompt = lmms_eval_specific_kwargs.get("perception_and_comprehension_prompt", "")
+    question = doc["question"]
+    question += "\n" + parse_options(doc["options"])
+
+    return f"{question}{post_prompt}"
+
+
+def videoperception_doc_to_text_with_transcript_perception_comprehension(doc, lmms_eval_specific_kwargs=None, transcripts_dir="transcripts"):
     if lmms_eval_specific_kwargs is None:
         lmms_eval_specific_kwargs = {}
 
@@ -126,9 +137,7 @@ def videoperception_doc_to_text_with_transcript(doc, lmms_eval_specific_kwargs=N
     question += "\n" + parsed_options
 
     # Get the transcript from the corresponding file using the doc_id
-    cache_dir = config["dataset_kwargs"]["cache_dir"]
-    parent_cache_dir = os.path.join(HF_HOME, cache_dir)
-    transcripts_dir = os.path.join(parent_cache_dir, "aud")
+    transcripts_dir = os.path.join(HF_HOME, cache_dir, "transcripts")
     file_name = doc["id"]
     transcript_file = os.path.join(transcripts_dir, f"{file_name}.txt")
     transcript = ""
@@ -138,13 +147,16 @@ def videoperception_doc_to_text_with_transcript(doc, lmms_eval_specific_kwargs=N
             transcript = f.read().strip()
     else:
         transcript = "[Transcript not available]"
-
-    # Combine the pre_prompt, transcript, and question
-    formatted_output = f"\nTranscript for the Video:\n{transcript}\n\nQuestion for the video:\n{question}"
+
+    post_prompt = ""
+    post_prompt += lmms_eval_specific_kwargs["perception_and_comprehension_prompt"]
+
+    formatted_output = f"\nTranscript for the Video:\n{transcript}\n\nQuestion for the video:\n{question}{post_prompt}"
+
     return formatted_output
 
 
-def videoperception_doc_to_text_with_transcript_application(doc, lmms_eval_specific_kwargs=None, transcripts_dir="aud"):
+def videoperception_doc_to_text_with_transcript_adaptation(doc, lmms_eval_specific_kwargs=None, transcripts_dir="transcripts"):
     if lmms_eval_specific_kwargs is None:
         lmms_eval_specific_kwargs = {}
 
@@ -153,9 +165,7 @@ def videoperception_doc_to_text_with_transcript_application(doc, lmms_eval_speci
     question += "\n" + parsed_options
 
     # Get the transcript from the corresponding file using the doc_id
-    cache_dir = config["dataset_kwargs"]["cache_dir"]
-    parent_cache_dir = os.path.join(HF_HOME, cache_dir)
-    transcripts_dir = os.path.join(parent_cache_dir, "aud")
+    transcripts_dir = os.path.join(HF_HOME, cache_dir, "transcripts")
     file_name = doc["id"]
     transcript_file = os.path.join(transcripts_dir, f"{file_name}.txt")
     transcript = ""
@@ -173,7 +183,6 @@ def videoperception_doc_to_text_with_transcript_application(doc, lmms_eval_speci
     else:
         pre_prompt += lmms_eval_specific_kwargs["open_ended_prompt"]
 
-    # Combine the pre_prompt, transcript, and question
     formatted_output = f"{pre_prompt}\nTranscript for the Video:\n{transcript}\n\nQuestion for the video:\n{question}"
     return formatted_output
 
@@ -585,16 +594,14 @@ def get_key_subresponses(response):
         response = response.strip().strip(".").lower()
         sub_responses = re.split(r"\.\s(?=[A-Z])|\n", response)
         indicators_of_keys = [
-            "could be ",
-            "so ",
-            "is ",
-            "thus ",
-            "therefore ",
-            "final ",
-            "answer ",
-            "result ",
-            "are",
+            # Common explanation or conclusion phrases
+            "could be ", "so ", "is ", "thus ", "therefore ", "final ", "answer ",
+            "result ", "are ", "in total ", "total ", "identify ", "recognize ", 
+            "calculated as ", "counted as ", "measured as ", "observed as ", 
+            "concluded as ", "found to be ", "equals ", "determined to be ",
+            "number of ", "value is ", "adds up to ", "have ", "has "
         ]
+
         key_responses = []
         for index, resp in enumerate(sub_responses):
             # if last one, accept it's an equation (the entire response can be just one sentence with equation)

diff --git a/lmms_eval/tasks/videofinal/video_mmmu.yaml b/lmms_eval/tasks/videofinal/video_mmmu.yaml
@@ -1,5 +1,5 @@
 group: video_mmmu
 task:
-- videommmu_application
-- videommmu_comprehension
-- videommmu_perception
+- video_mmmu_application
+- video_mmmu_comprehension
+- video_mmmu_perception
diff --git a/lmms_eval/tasks/videofinal/video_mmmu_with_transcripts.yaml b/lmms_eval/tasks/videofinal/video_mmmu_with_transcripts.yaml
@@ -0,0 +1,5 @@
+group: video_mmmu_with_transcripts
+task:
+- video_mmmu_adaptation_with_transcripts
+- video_mmmu_comprehension_with_transcripts
+- video_mmmu_perception_with_transcripts
diff --git a/lmms_eval/tasks/videofinal/videoperception_image.yaml b/lmms_eval/tasks/videofinal/videoperception_image.yaml