[task] add mix_evals for video evaluation
Luodian committed Sep 22, 2024
1 parent 920c211 commit 7452238
Showing 6 changed files with 382 additions and 0 deletions.
16 changes: 16 additions & 0 deletions lmms_eval/tasks/mix_evals/_default_template_yaml
@@ -0,0 +1,16 @@
dataset_path: lmms-lab/MixEvals_Video2Text
dataset_kwargs:
  token: True
  video: True
  cache_dir: mix_evals_video2text
model_specific_prompt_kwargs:
  default:
    pre_prompt: ""
    post_prompt: ""
  gpt4v:
    pre_prompt: "These are frames from a video. Please answer the following questions about the video."
    post_prompt: ""
metadata:
  modality: video
  version: 0.0
  gpt_eval_model_name: "gpt-4o-mini"
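Each task file below pulls these defaults in via `include: _default_template_yaml`. A minimal sketch of the merge semantics (an assumption modeled on lm-eval-harness-style configs: a shallow merge in which task-level keys override template keys; the real loader also resolves the `!function` tags, which plain `yaml.safe_load` cannot):

import yaml

TEMPLATE = """metadata:
  modality: video
"""

TASK = """task: mix_evals_video2text_freeform
test_split: test
"""

def merge_task_config(template: str, task: str) -> dict:
    # Shallow merge: keys defined in the task yaml win over the template.
    config = yaml.safe_load(template) or {}
    config.update(yaml.safe_load(task) or {})
    return config

print(merge_task_config(TEMPLATE, TASK))
# -> {'metadata': {'modality': 'video'}, 'task': 'mix_evals_video2text_freeform', 'test_split': 'test'}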
5 changes: 5 additions & 0 deletions lmms_eval/tasks/mix_evals/mix_evals_video2text.yaml
@@ -0,0 +1,5 @@
group: mix_evals_video2text
task:
  - mix_evals_video2text_openconv
  - mix_evals_video2text_mc
  - mix_evals_video2text_freeform
22 changes: 22 additions & 0 deletions lmms_eval/tasks/mix_evals/mix_evals_video2text_freeform.yaml
@@ -0,0 +1,22 @@
dataset_name: "video2text_closeended_free-form"
task: "mix_evals_video2text_freeform"
test_split: test
output_type: generate_until
doc_to_visual: !function utils.mix_evals_video2text_doc_to_visual
doc_to_text: !function utils.mix_evals_video2text_doc_to_text
doc_to_target: "{{target}}"
process_results: !function utils.mix_evals_video2text_process_results_freeform
metric_list:
  - metric: gpt_eval
    aggregation: !function utils.mix_evals_video2text_gpt_eval
    higher_is_better: true

include: _default_template_yaml

model_specific_prompt_kwargs:
  default:
    pre_prompt: "These are frames from a video. Please answer the following questions about the video."
    post_prompt: "Answer the question using a single word or phrase."
  gpt4v:
    pre_prompt: "These are frames from a video. Please answer the following questions about the video with a short phrase."
    post_prompt: ""
31 changes: 31 additions & 0 deletions lmms_eval/tasks/mix_evals/mix_evals_video2text_mc.yaml
@@ -0,0 +1,31 @@
include: _default_template_yaml
dataset_name: "video2text_closeended_multiple-choice"
task: "mix_evals_video2text_mc"
test_split: test
output_type: generate_until
doc_to_visual: !function utils.mix_evals_video2text_doc_to_visual
doc_to_text: !function utils.mix_evals_video2text_doc_to_text
doc_to_target: "{{target}}"

metric_list:
  - metric: exact_match
    aggregation: mean
    higher_is_better: true
    ignore_case: true
    ignore_punctuation: true

filter_list:
  - name: "flexible-extract"
    filter:
      - function: !function utils.MultiChoiceRegexFilter
        group_select: 0
        ignore_case: true
        ignore_punctuation: true

model_specific_prompt_kwargs:
  default:
    pre_prompt: "These are frames from a video. Please answer the following questions about the video."
    post_prompt: "Answer with the option's letter from the given choices directly."
  gpt4v:
    pre_prompt: "These are frames from a video. Please answer the following questions about the video."
    post_prompt: "Answer with the option's letter from the given choices directly."
22 changes: 22 additions & 0 deletions lmms_eval/tasks/mix_evals/mix_evals_video2text_openended.yaml
@@ -0,0 +1,22 @@
include: _default_template_yaml
dataset_name: "video2text_openended"
task: "mix_evals_video2text_openconv"
test_split: test
output_type: generate_until
doc_to_visual: !function utils.mix_evals_video2text_doc_to_visual
doc_to_text: !function utils.mix_evals_video2text_doc_to_text_open_convs
doc_to_target: ""
process_results: !function utils.mix_evals_video2text_process_results_open_convs

metric_list:
  - metric: submission
    aggregation: !function utils.mix_evals_video2text_aggregate_gen
    higher_is_better: true

model_specific_prompt_kwargs:
  default:
    pre_prompt: "These are frames from a video. Please answer the following questions about the video."
    post_prompt: ""
  gpt4v:
    pre_prompt: "These are frames from a video. Please answer the following questions about the video."
    post_prompt: ""
286 changes: 286 additions & 0 deletions lmms_eval/tasks/mix_evals/utils.py
@@ -0,0 +1,286 @@
import datetime
import json
import os
import re
import sys
import time
from pathlib import Path

import requests
import yaml
from loguru import logger as eval_logger

import lmms_eval.tasks._task_utils.file_utils as file_utils
from lmms_eval.filters.extraction import ExtendedRegexFilter

with open(Path(__file__).parent / "_default_template_yaml", "r") as f:
    raw_data = f.readlines()
    safe_data = []
    for i, line in enumerate(raw_data):
        # remove function definition since yaml load cannot handle it
        if "!function" not in line:
            safe_data.append(line)

    config = yaml.safe_load("".join(safe_data))
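# config now holds the template defaults, e.g. config["metadata"]["gpt_eval_model_name"]
# resolves to "gpt-4o-mini" (the !function lines were stripped above).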

NUM_SECONDS_TO_SLEEP = 5
GPT_EVAL_MODEL_NAME = config["metadata"]["gpt_eval_model_name"]
API_TYPE = os.getenv("API_TYPE", "openai")

if API_TYPE == "openai":
    API_URL = os.getenv("OPENAI_API_URL", "https://api.openai.com/v1/chat/completions")
    API_KEY = os.getenv("OPENAI_API_KEY", "YOUR_API_KEY")
    headers = {
        "Authorization": f"Bearer {API_KEY}",
        "Content-Type": "application/json",
    }
elif API_TYPE == "azure":
    API_URL = os.getenv("AZURE_ENDPOINT", "https://api.cognitive.microsoft.com/sts/v1.0/issueToken")
    API_KEY = os.getenv("AZURE_API_KEY", "YOUR_API_KEY")
    headers = {
        "api-key": API_KEY,
        "Content-Type": "application/json",
    }

eval_prompt = """You are an AI assistant who will help me evaluate the quality of a model response against a few candidate ground-truth answers.

Scoring criteria:
- A response that perfectly reflects the meaning of the ground truth: 1 point
- A response that reflects none of the key points in the ground truth: 0 points
- Some parts of the response are correct, but some parts of the ground truth are not mentioned in the response: 0.5 points
- Some parts of the response are correct, but other parts of the response are not mentioned in the ground truth: 0.5 points

Here are some examples of the scoring criteria and format:
model response: Steam Cleaning Services
ground truth: ["steam clean", "steam clean", "cleaning", "car", "steam clean"]
Point: 1

model response: A cowboy action shooter.
ground truth: ["man"]
Point: 1

model response: I'm sorry, but I can't assist with that request.
ground truth: ["quality"]
Point: 0

Let's begin this task:
model response: {model_response}
ground truth: {ground_truth}
Point:"""


def get_eval(model_response: str, ground_truth: str, max_tokens: int, retries: int = 5):
    global headers
    content = eval_prompt.format(model_response=model_response, ground_truth=ground_truth)

    messages = [
        {"role": "user", "content": content},
    ]

    payload = {
        "model": GPT_EVAL_MODEL_NAME,
        "messages": messages,
        "temperature": 0.2,
        "max_tokens": max_tokens,
    }

    for attempt in range(retries):
        try:
            response = requests.post(API_URL, headers=headers, json=payload, timeout=60)
            response.raise_for_status()
            response_data = response.json()

            content = response_data["choices"][0]["message"]["content"].strip()
            if content != "":
                return content, response_data["model"]

        except Exception as e:
            eval_logger.info(f"Attempt {attempt + 1} failed with error: {e}")
            if attempt < retries - 1:  # If we have retries left, sleep and then continue to the next attempt
                time.sleep(NUM_SECONDS_TO_SLEEP)
            else:  # If this was the last attempt, log and return empty
                eval_logger.error(f"All {retries} attempts failed. Last error message: {e}")
                return "", ""
    return "", ""


# A bit ugly, but the idea is that the dataset's zip files are unpacked into
# the HF_HOME cache dir, and the videos are loaded from there.
HF_HOME = os.environ["HF_HOME"]
cache_dir = config["dataset_kwargs"]["cache_dir"]
cache_dir = os.path.join(HF_HOME, cache_dir)


# Resolves and returns the video path for a doc.
# Only works correctly with video LLMs, since it returns a path rather than frames.
def mix_evals_video2text_doc_to_visual(doc):
    video_path = os.path.join(cache_dir, doc["video_path"])
    if not os.path.exists(video_path):
        # some videos ship with an upper-case extension
        if os.path.exists(video_path.replace("mp4", "MP4")):
            video_path = video_path.replace("mp4", "MP4")
        else:
            sys.exit(f"video path: {video_path} does not exist, please check")
    return [video_path]


# This is the place where you format your question
def mix_evals_video2text_doc_to_text(doc, model_specific_prompt_kwargs=None):
    if model_specific_prompt_kwargs is None:
        model_specific_prompt_kwargs = {}
    pre_prompt = ""
    post_prompt = ""
    if "pre_prompt" in model_specific_prompt_kwargs:
        pre_prompt = model_specific_prompt_kwargs["pre_prompt"]
    if "post_prompt" in model_specific_prompt_kwargs:
        post_prompt = model_specific_prompt_kwargs["post_prompt"]

    user_prompt = doc["prompt"]

    if "options" in doc:
        option_prompt = "Here are the options:\n"
        for idx, option in enumerate(doc["options"]):
            char_idx = chr(ord("A") + idx)
            option = option.strip()
            option_prompt += f"{char_idx}. {option}\n"

        option_prompt = option_prompt.rstrip("\n")
        user_prompt = f"{user_prompt}\n{option_prompt}"

    if pre_prompt:
        user_prompt = f"{pre_prompt}\n{user_prompt}"

    if post_prompt:
        user_prompt = f"{user_prompt}\n{post_prompt}"
    return user_prompt
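# Worked example with a hypothetical doc (the field names follow the code above):
#   doc = {"prompt": "What is the man doing?", "options": ["Running", "Cooking"]}
#   mix_evals_video2text_doc_to_text(doc, {"post_prompt": "Answer with the option's letter from the given choices directly."})
# returns:
#   What is the man doing?
#   Here are the options:
#   A. Running
#   B. Cooking
#   Answer with the option's letter from the given choices directly.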


OPEN_CONVS_PROMPT = """{PRE}
{FIRST}
{POST}
"""


def mix_evals_video2text_doc_to_text_open_convs(doc, model_specific_prompt_kwargs=None):
    if model_specific_prompt_kwargs is None:
        model_specific_prompt_kwargs = {}
    pre_prompt = ""
    post_prompt = ""
    if "pre_prompt" in model_specific_prompt_kwargs:
        pre_prompt = model_specific_prompt_kwargs["pre_prompt"]
    if "post_prompt" in model_specific_prompt_kwargs:
        post_prompt = model_specific_prompt_kwargs["post_prompt"]

    # strip video placeholder tokens such as <video_0> from the first user turn
    filtered_first_turn = re.sub(r"<video_[0-9]+>", "", doc["first_turn_user_prompt"])
    return OPEN_CONVS_PROMPT.format(
        PRE=pre_prompt,
        POST=post_prompt,
        FIRST=filtered_first_turn,
    )


MODEL_CONVS_PROMPT = """{FIRST}
{MODEL_RESPONSE}
{PRE}
{SECOND}
{POST}
"""


def mix_evals_video2text_doc_to_text_open_2nd_convs(doc, model_specific_prompt_kwargs=None):
    if model_specific_prompt_kwargs is None:
        model_specific_prompt_kwargs = {}
    pre_prompt = ""
    post_prompt = ""
    if "pre_prompt" in model_specific_prompt_kwargs:
        pre_prompt = model_specific_prompt_kwargs["pre_prompt"]
    if "post_prompt" in model_specific_prompt_kwargs:
        post_prompt = model_specific_prompt_kwargs["post_prompt"]

    return MODEL_CONVS_PROMPT.format(
        PRE=pre_prompt,
        POST=post_prompt,
        FIRST=doc["first_turn_user_prompt"],
        SECOND=doc["second_turn_user_prompt"],
        MODEL_RESPONSE=doc["model_response"],
    )


def mix_evals_video2text_process_results_open_convs(doc, result):
    pred = result[0]
    return {"submission": {"pred": pred, "question_idx": doc["question_index"], "first_turn_video_caption": doc["first_turn_video_caption"], "target": ""}}


def mix_evals_video2text_process_results_freeform(doc, result):
    pred = result[0]
    ground_truth_str = ", ".join([f'"{gt}"' for gt in doc["target"]])
    ground_truth_str = f"[{ground_truth_str}]"
    # the formatted judge prompt is stored alongside the answer for inspection
    content = eval_prompt.format(model_response=pred, ground_truth=ground_truth_str)
    eval_answer, model_name = get_eval(model_response=pred, ground_truth=ground_truth_str, max_tokens=1024)
    return {
        "submission": {"pred": pred, "question_idx": doc["question_index"], "target": doc["target"], "eval_answer": eval_answer, "gpt_prompt": content},
        "gpt_eval": {"pred": pred, "question_idx": doc["question_index"], "target": doc["target"], "eval_answer": eval_answer, "gpt_prompt": content},
    }


def mix_evals_video2text_aggregate_submissions(results, args, task):
    now_date_time = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
    submission_file_name = f"mix_evals_video2text_{task}-{now_date_time}.json"
    path = file_utils.generate_submission_file(submission_file_name, args)
    with open(path, "w") as f:
        json.dump(results, f)
    eval_logger.info(f"Submission file saved to {path}")


def mix_evals_video2text_gpt_eval(results, args):
    score = 0
    for result in results:
        eval_answer = result["eval_answer"]
        try:
            # the search is inside the try so a judge answer without a number
            # falls back to 0.0 instead of raising on .group(1)
            eval_score = re.search(r"([0-9.]+)", eval_answer).group(1)
            eval_score = float(eval_score)
        except Exception as e:
            eval_logger.error(f"Error parsing eval_score: {e}")
            eval_score = 0.0
        score += eval_score

    return score / len(results)
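# e.g. judge answers ["1", "0.5", "Point: 1"] parse to [1.0, 0.5, 1.0] and
# aggregate to 2.5 / 3 ≈ 0.83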


# Dispatches to the submission aggregator for the open-ended conversation task
def mix_evals_video2text_aggregate_gen(results, args):
    mix_evals_video2text_aggregate_submissions(results, args, "OpenConvs")


class MultiChoiceRegexFilter(ExtendedRegexFilter):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def apply(self, resps, docs):
        filtered_resps = []

        # Regex to directly extract the option letter from the model response
        option_letter_regex = re.compile(r"\b([A-Z])\.\s+([^\n]*)")

        for r, doc in zip(resps, docs):
            # Process each response
            filtered = []
            for resp in r:
                # Try to match the option letter at the start of the response
                match = option_letter_regex.match(resp)
                if match:
                    # If a match is found, keep the matched letter
                    filtered.append(match.group(1))
                else:
                    # If no match, keep the original response
                    filtered.append(resp)

            # Take the first matched letter, or the original response if none matched
            filtered_resps.append(filtered[0])

        return filtered_resps
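# Minimal sanity check of the option-letter regex used by MultiChoiceRegexFilter
# (hypothetical responses; note that running this file directly also requires
# HF_HOME and the template yaml, which are read at import time).
if __name__ == "__main__":
    demo_regex = re.compile(r"\b([A-Z])\.\s+([^\n]*)")
    assert demo_regex.match("B. a cowboy").group(1) == "B"
    assert demo_regex.match("it is a cowboy") is None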
