Skip to content

Commit

Permalink
Enable transcript evaluation for all three tracks
Browse files Browse the repository at this point in the history
  • Loading branch information
KairuiHu committed Dec 17, 2024
1 parent a5bcb73 commit c148532
Show file tree
Hide file tree
Showing 18 changed files with 64 additions and 204 deletions.
3 changes: 2 additions & 1 deletion lmms_eval/tasks/videofinal/_default_template_yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,11 @@ dataset_kwargs:
video: True
# force_download: True
# force_unzip: True
cache_dir: videommmu_1213
cache_dir: video_mmmu
lmms_eval_specific_kwargs:
default:
pre_prompt: "You should watch and learn the video content. Then apply what you learned to "
perception_and_comprehension_prompt: "\nPlease ignore the Quiz question in last frame of the video."
mcq_prompt: "answer the following multi-choice question. The image for this question is at the end of the video.\n"
open_ended_prompt: "answer the following open-ended question. The image for this question is at the end of the video.\n"
generation_kwargs:
Expand Down
2 changes: 1 addition & 1 deletion lmms_eval/tasks/videofinal/adaptation_question_only.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ dataset_name: "Adaptation"
task: "video_mmmu_adaptation_no_video"
test_split: test
output_type: generate_until
doc_to_visual: !function utils.videoperception_doc_to_visual_no_video
doc_to_visual: !function utils.videoperception_doc_to_visual_question_only
doc_to_text: !function utils.videoperception_doc_to_text_no_preprompt
doc_to_target: !function utils.videoperception_doc_to_answer
process_results: !function utils.videoperception_process_results
Expand Down
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
dataset_name: "perception"
task: "videommmu_perception_audio"
dataset_name: "Adaptation"
task: "video_mmmu_adaptation_with_transcripts"
test_split: test
output_type: generate_until
doc_to_visual: !function utils.videoperception_doc_to_visual_perception
doc_to_text: !function utils.videoperception_doc_to_text_with_transcript
doc_to_visual: !function utils.videoperception_doc_to_visual
doc_to_text: !function utils.videoperception_doc_to_text_with_transcript_adaptation
doc_to_target: !function utils.videoperception_doc_to_answer
process_results: !function utils.videoperception_process_results
metric_list:
Expand Down
5 changes: 0 additions & 5 deletions lmms_eval/tasks/videofinal/audio_gemini.yaml

This file was deleted.

13 changes: 0 additions & 13 deletions lmms_eval/tasks/videofinal/audio_gemini_application.yaml

This file was deleted.

13 changes: 0 additions & 13 deletions lmms_eval/tasks/videofinal/audio_gemini_comprehension.yaml

This file was deleted.

88 changes: 0 additions & 88 deletions lmms_eval/tasks/videofinal/audio_gemini_utils.py

This file was deleted.

13 changes: 0 additions & 13 deletions lmms_eval/tasks/videofinal/audio_gemini_videoperception.yaml

This file was deleted.

5 changes: 0 additions & 5 deletions lmms_eval/tasks/videofinal/audio_group.yaml

This file was deleted.

8 changes: 4 additions & 4 deletions lmms_eval/tasks/videofinal/comprehension.yaml
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
dataset_name: "comprehension"
task: "videommmu_comprehension"
dataset_name: "Comprehension"
task: "video_mmmu_comprehension"
test_split: test
output_type: generate_until
doc_to_visual: !function utils.videoperception_doc_to_visual_perception
doc_to_text: !function utils.videoperception_doc_to_text_perception
doc_to_visual: !function utils.videoperception_doc_to_visual
doc_to_text: !function utils.videoperception_doc_to_text_perception_comprehension
doc_to_target: !function utils.videoperception_doc_to_answer
process_results: !function utils.videoperception_process_results
metric_list:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
dataset_name: "application_augmented"
task: "videommmu_application_audio"
dataset_name: "Comprehension"
task: "video_mmmu_comprehension_with_transcripts"
test_split: test
output_type: generate_until
doc_to_visual: !function utils.videoperception_doc_to_visual_perception
doc_to_text: !function utils.videoperception_doc_to_text_with_transcript_application
doc_to_visual: !function utils.videoperception_doc_to_visual
doc_to_text: !function utils.videoperception_doc_to_text_with_transcript_perception_comprehension
doc_to_target: !function utils.videoperception_doc_to_answer
process_results: !function utils.videoperception_process_results
metric_list:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
dataset_name: "perception"
task: "videommmu_perception"
dataset_name: "Perception"
task: "video_mmmu_perception"
test_split: test
output_type: generate_until
doc_to_visual: !function utils.videoperception_doc_to_visual_perception
doc_to_text: !function utils.videoperception_doc_to_text_perception
doc_to_visual: !function utils.videoperception_doc_to_visual
doc_to_text: !function utils.videoperception_doc_to_text_perception_comprehension
doc_to_target: !function utils.videoperception_doc_to_answer
process_results: !function utils.videoperception_process_results
metric_list:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
dataset_name: "comprehension"
task: "videommmu_comprehension_audio"
dataset_name: "Perception"
task: "video_mmmu_perception_with_transcripts"
test_split: test
output_type: generate_until
doc_to_visual: !function utils.videoperception_doc_to_visual_perception
doc_to_text: !function utils.videoperception_doc_to_text_with_transcript
doc_to_visual: !function utils.videoperception_doc_to_visual
doc_to_text: !function utils.videoperception_doc_to_text_with_transcript_perception_comprehension
doc_to_target: !function utils.videoperception_doc_to_answer
process_results: !function utils.videoperception_process_results
metric_list:
Expand Down
57 changes: 32 additions & 25 deletions lmms_eval/tasks/videofinal/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,6 @@
# And load it here
HF_HOME = os.environ["HF_HOME"]
cache_dir = config["dataset_kwargs"]["cache_dir"]
cache_dir = os.path.join(HF_HOME, cache_dir)


from loguru import logger as eval_logger
Expand Down Expand Up @@ -61,10 +60,10 @@ def videoperception_doc_to_visual(doc):
subject = "_".join(doc["id"].split("_")[1:-1])

# Get the appropriate cache directory based on the subject
perception_cache_dir = os.path.join(HF_HOME, cache_dir, get_cache_dir(subject))
videommmu_cache_dir = os.path.join(HF_HOME, cache_dir, get_cache_dir(subject))

video_path = doc["id"] + ".mp4"
video_path = os.path.join(perception_cache_dir, video_path)
video_path = os.path.join(videommmu_cache_dir, video_path)

if os.path.exists(video_path):
video_path = video_path
Expand All @@ -73,7 +72,7 @@ def videoperception_doc_to_visual(doc):

return [video_path]

def videoperception_doc_to_visual_no_video(doc):
def videoperception_doc_to_visual_question_only(doc):
video_path = doc["id"] + "_image" + ".mp4"
question_only_cache_dir = os.path.join(cache_dir, "question_only")
video_path = os.path.join(question_only_cache_dir, video_path)
Expand Down Expand Up @@ -117,7 +116,19 @@ def videoperception_doc_to_text_no_preprompt(doc, lmms_eval_specific_kwargs=None
return f"{question}"


def videoperception_doc_to_text_with_transcript(doc, lmms_eval_specific_kwargs=None, transcripts_dir="aud"):
def videoperception_doc_to_text_perception_comprehension(doc, lmms_eval_specific_kwargs=None):
    """Build the text prompt for the Perception and Comprehension tracks.

    The prompt is the question, a newline, the options as rendered by
    ``parse_options``, followed by the track-specific post prompt.

    Args:
        doc: Dataset record; ``doc["question"]`` and ``doc["options"]`` are read.
        lmms_eval_specific_kwargs: Optional prompt settings. When it contains
            ``"perception_and_comprehension_prompt"``, that text is appended
            after the options; otherwise nothing is appended.

    Returns:
        The assembled prompt string.
    """
    if lmms_eval_specific_kwargs is None:
        lmms_eval_specific_kwargs = {}

    # Use .get() so the documented ``None`` default (or a config without this
    # key) yields an empty post prompt instead of raising KeyError.
    post_prompt = lmms_eval_specific_kwargs.get("perception_and_comprehension_prompt", "")

    question = doc["question"] + "\n" + parse_options(doc["options"])
    return f"{question}{post_prompt}"


def videoperception_doc_to_text_with_transcript_perception_comprehension(doc, lmms_eval_specific_kwargs=None, transcripts_dir="transcripts"):
if lmms_eval_specific_kwargs is None:
lmms_eval_specific_kwargs = {}

Expand All @@ -126,9 +137,7 @@ def videoperception_doc_to_text_with_transcript(doc, lmms_eval_specific_kwargs=N
question += "\n" + parsed_options

# Get the transcript from the corresponding file using the doc_id
cache_dir = config["dataset_kwargs"]["cache_dir"]
parent_cache_dir = os.path.join(HF_HOME, cache_dir)
transcripts_dir = os.path.join(parent_cache_dir, "aud")
transcripts_dir = os.path.join(cache_dir, "transcripts")
file_name = doc["id"]
transcript_file = os.path.join(transcripts_dir, f"{file_name}.txt")
transcript = ""
Expand All @@ -138,13 +147,16 @@ def videoperception_doc_to_text_with_transcript(doc, lmms_eval_specific_kwargs=N
transcript = f.read().strip()
else:
transcript = "[Transcript not available]"

# Combine the pre_prompt, transcript, and question
formatted_output = f"\nTranscript for the Video:\n{transcript}\n\nQuestion for the video:\n{question}"

post_prompt = ""
post_prompt += lmms_eval_specific_kwargs["perception_and_comprehension_prompt"]

formatted_output = f"\nTranscript for the Video:\n{transcript}\n\nQuestion for the video:\n{question}{post_prompt}"

return formatted_output


def videoperception_doc_to_text_with_transcript_application(doc, lmms_eval_specific_kwargs=None, transcripts_dir="aud"):
def videoperception_doc_to_text_with_transcript_adaptation(doc, lmms_eval_specific_kwargs=None, transcripts_dir="transcripts"):
if lmms_eval_specific_kwargs is None:
lmms_eval_specific_kwargs = {}

Expand All @@ -153,9 +165,7 @@ def videoperception_doc_to_text_with_transcript_application(doc, lmms_eval_speci
question += "\n" + parsed_options

# Get the transcript from the corresponding file using the doc_id
cache_dir = config["dataset_kwargs"]["cache_dir"]
parent_cache_dir = os.path.join(HF_HOME, cache_dir)
transcripts_dir = os.path.join(parent_cache_dir, "aud")
transcripts_dir = os.path.join(cache_dir, "transcripts")
file_name = doc["id"]
transcript_file = os.path.join(transcripts_dir, f"{file_name}.txt")
transcript = ""
Expand All @@ -173,7 +183,6 @@ def videoperception_doc_to_text_with_transcript_application(doc, lmms_eval_speci
else:
pre_prompt += lmms_eval_specific_kwargs["open_ended_prompt"]

# Combine the pre_prompt, transcript, and question
formatted_output = f"{pre_prompt}\nTranscript for the Video:\n{transcript}\n\nQuestion for the video:\n{question}"
return formatted_output

Expand Down Expand Up @@ -585,16 +594,14 @@ def get_key_subresponses(response):
response = response.strip().strip(".").lower()
sub_responses = re.split(r"\.\s(?=[A-Z])|\n", response)
indicators_of_keys = [
"could be ",
"so ",
"is ",
"thus ",
"therefore ",
"final ",
"answer ",
"result ",
"are",
# Common explanation or conclusion phrases
"could be ", "so ", "is ", "thus ", "therefore ", "final ", "answer ",
"result ", "are ", "in total ", "total ", "identify ", "recognize ",
"calculated as ", "counted as ", "measured as ", "observed as ",
"concluded as ", "found to be ", "equals ", "determined to be ",
"number of ", "value is ", "adds up to ", "have ", "has "
]

key_responses = []
for index, resp in enumerate(sub_responses):
# if last one, accept it's an equation (the entire response can be just one sentence with equation)
Expand Down
6 changes: 3 additions & 3 deletions lmms_eval/tasks/videofinal/video_mmmu.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
group: video_mmmu
task:
- videommmu_application
- videommmu_comprehension
- videommmu_perception
- video_mmmu_adaptation
- video_mmmu_comprehension
- video_mmmu_perception
5 changes: 5 additions & 0 deletions lmms_eval/tasks/videofinal/video_mmmu_with_transcripts.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
group: video_mmmu_with_transcripts
task:
- video_mmmu_adaptation_with_transcripts
- video_mmmu_comprehension_with_transcripts
- video_mmmu_perception_with_transcripts
16 changes: 0 additions & 16 deletions lmms_eval/tasks/videofinal/videoperception_image.yaml

This file was deleted.

Loading

0 comments on commit c148532

Please sign in to comment.