
Commit

Merge pull request #128 from EvolvingLMMs-Lab/videomme
Update videomme task [w/, w/o subtitle] and modify the prompt for ablations
choiszt authored Jun 16, 2024
2 parents f4eeaa9 + c58271a commit 791e087
Showing 2 changed files with 72 additions and 2 deletions.
31 changes: 29 additions & 2 deletions lmms_eval/tasks/videomme/utils.py
@@ -106,10 +106,37 @@ def videomme_doc_to_visual(doc):


def videomme_doc_to_text(doc, model_specific_prompt_kwargs=None):
    option_prompt = "Select the best answer to the following multiple-choice question based on the video and the subtitles. Respond with only the letter (A, B, C, or D) of the correct option."
    question = doc["question"]
    option = str(doc["options"])
    question = question + "\n" + option
    full_prompt = option_prompt + "\n" + question + "\n" + "The best answer is:"
    return full_prompt
# Frames + Subs
# This video's subtitles are listed below:
# 【subtitles】

# Select the best answer to the following multiple-choice question based on the video and the subtitles. Respond with only the letter (A, B, C, or D) of the correct option.
# 【question】
# The best answer is:
# Frames / Frames + Audio
# Select the best answer to the following multiple-choice question based on the video. Respond with only the letter (A, B, C, or D) of the correct option.
# 【question】
# The best answer is:

def videomme_doc_to_text_subtitle(doc, model_specific_prompt_kwargs=None):
    subtitles_prompt = "This video's subtitles are listed below: \n"
    if doc["Subtitle"] == "":
        subtitle = "No subtitles available"
    else:
        subtitle = doc["Subtitle"]
    option_prompt = "Select the best answer to the following multiple-choice question based on the video and the subtitles. Respond with only the letter (A, B, C, or D) of the correct option."
    question = doc["question"]
    option = str(doc["options"])
    question = question + "\n" + option + model_specific_prompt_kwargs["post_prompt"]
    return question
    # Unreachable alternative kept below: the full frames + subtitles prompt layout.
    question = question + "\n" + option
    full_prompt = subtitles_prompt + subtitle + "\n" + option_prompt + "\n" + question + "\n" + "The best answer is:"
    return full_prompt


def extract_characters_regex(s):
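
For reference, a minimal standalone sketch (illustrative only, not part of the diff) of the two prompt layouts documented in the comments above, assembled for one hypothetical document. The doc keys mirror those used in utils.py; the sample values are invented.

# Illustrative only: hypothetical doc with the fields used by utils.py above.
doc = {
    "question": "What color is the car shown at the start of the video?",
    "options": ["A. Red", "B. Blue", "C. Green", "D. Black"],
    "Subtitle": "Narrator: The red car pulls out of the driveway.",
}
question_block = doc["question"] + "\n" + str(doc["options"])

# Frames / Frames + Audio layout (no subtitles in the context).
frames_prompt = (
    "Select the best answer to the following multiple-choice question based on the video. "
    "Respond with only the letter (A, B, C, or D) of the correct option.\n"
    + question_block + "\n" + "The best answer is:"
)

# Frames + Subs layout (subtitles prepended before the question).
subtitle = doc["Subtitle"] if doc["Subtitle"] else "No subtitles available"
subs_prompt = (
    "This video's subtitles are listed below: \n" + subtitle + "\n"
    + "Select the best answer to the following multiple-choice question based on the video and the subtitles. "
    "Respond with only the letter (A, B, C, or D) of the correct option.\n"
    + question_block + "\n" + "The best answer is:"
)

print(frames_prompt)
print(subs_prompt)
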
43 changes: 43 additions & 0 deletions lmms_eval/tasks/videomme/videomme_subtitle.yaml
@@ -0,0 +1,43 @@
dataset_path: lmms-lab/Video-MME
dataset_kwargs:
  token: True
  cache_dir: videomme
  video: True
  # From_YouTube: True
task: videomme_subtitle
test_split: test
output_type: generate_until
doc_to_visual: !function utils.videomme_doc_to_visual
doc_to_text: !function utils.videomme_doc_to_text_subtitle
doc_to_target: "answer"
generation_kwargs:
  max_new_tokens: 16
  temperature: 0
  top_p: 1.0
  num_beams: 1
  do_sample: false
# The return value of process_results will be used by metrics
process_results: !function utils.videomme_process_results
# Note that the metric name can be either a registered metric function (such as the case for GQA) or a key name returned by process_results
metric_list:
  - metric: videomme_percetion_score
    aggregation: !function utils.videomme_aggregate_results
    higher_is_better: true
model_specific_prompt_kwargs:
  default:
    pre_prompt: ""
    post_prompt: ""
  # gpt4v:
  #   pre_prompt: ""
  #   post_prompt:
  # # qwen_vl:
  # #   pre_prompt: ""
  # #   post_prompt: " Answer:"
  # # otterhd:
  # #   pre_prompt: ""
  # #   post_prompt: " Answer:"
  # xcomposer2_4khd:
  #   pre_prompt: "[UNUSED_TOKEN_146]user\n"
  #   post_prompt: " Answer this question with A, B, C, or D.[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n"
metadata:
  - version: 0.0
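
For context, a small sketch (illustrative, not part of the diff) of how the model_specific_prompt_kwargs block above reaches the text side: the per-model dict (falling back to default) is passed to videomme_doc_to_text_subtitle, whose post_prompt is appended after the options. The override values are copied from the commented xcomposer2_4khd example; the doc and helper below are invented for the example.

# Illustrative only: how a per-model post_prompt from this YAML would change
# the string built by videomme_doc_to_text_subtitle in utils.py.
prompt_kwargs = {
    "default": {"pre_prompt": "", "post_prompt": ""},
    # Mirrors the commented xcomposer2_4khd block above (only applied if enabled).
    "xcomposer2_4khd": {
        "pre_prompt": "[UNUSED_TOKEN_146]user\n",
        "post_prompt": " Answer this question with A, B, C, or D.[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n",
    },
}

def build_question(doc, kwargs):
    # Mirrors the value returned by videomme_doc_to_text_subtitle above.
    return doc["question"] + "\n" + str(doc["options"]) + kwargs["post_prompt"]

doc = {"question": "Which city is shown?", "options": ["A. Paris", "B. Rome", "C. Tokyo", "D. Cairo"]}
print(build_question(doc, prompt_kwargs["default"]))
print(build_question(doc, prompt_kwargs["xcomposer2_4khd"]))
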
