[task] add mix_evals for video evaluation
Showing 6 changed files with 382 additions and 0 deletions.
lmms_eval/tasks/mix_evals/_default_template_yaml (16 additions, 0 deletions)
@@ -0,0 +1,16 @@
dataset_path: lmms-lab/MixEvals_Video2Text
dataset_kwargs:
  token: True
  video: True
  cache_dir: mix_evals_video2text
model_specific_prompt_kwargs:
  default:
    pre_prompt: ""
    post_prompt: ""
  gpt4v:
    pre_prompt: "These are frames from a video. Please answer the following questions about the video."
    post_prompt: ""
metadata:
  modality: video
  version: 0.0
  gpt_eval_model_name: "gpt-4o-mini"
@@ -0,0 +1,5 @@
group: mix_evals_video2text
task:
  - mix_evals_video2text_openconv
  - mix_evals_video2text_mc
  - mix_evals_video2text_freeform
lmms_eval/tasks/mix_evals/mix_evals_video2text_freeform.yaml (22 additions, 0 deletions)
@@ -0,0 +1,22 @@
dataset_name: "video2text_closeended_free-form"
task: "mix_evals_video2text_freeform"
test_split: test
output_type: generate_until
doc_to_visual: !function utils.mix_evals_video2text_doc_to_visual
doc_to_text: !function utils.mix_evals_video2text_doc_to_text
doc_to_target: "{{target}}"
process_results: !function utils.mix_evals_video2text_process_results_freeform
metric_list:
  - metric: gpt_eval
    aggregation: !function utils.mix_evals_video2text_gpt_eval
    higher_is_better: true

include: _default_template_yaml

model_specific_prompt_kwargs:
  default:
    pre_prompt: "These are frames from a video. Please answer the following questions about the video."
    post_prompt: "Answer the question using a single word or phrase."
  gpt4v:
    pre_prompt: "These are frames from a video. Please answer the following questions about the video with a short phrase."
    post_prompt: ""
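The gpt_eval metric above is scored by a GPT judge and aggregated with a custom function from utils.py. As a rough illustration of what that aggregation amounts to (the judge replies here are made up; see mix_evals_video2text_gpt_eval below for the real implementation):

import re

# Each judge reply contributes the first number it contains; scores are averaged.
example_results = [{"eval_answer": "Point: 1"}, {"eval_answer": "0.5"}]
scores = [float(re.search(r"([0-9.]+)", r["eval_answer"]).group(1)) for r in example_results]
print(sum(scores) / len(scores))  # 0.75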
@@ -0,0 +1,31 @@
include: _default_template_yaml
dataset_name: "video2text_closeended_multiple-choice"
task: "mix_evals_video2text_mc"
test_split: test
output_type: generate_until
doc_to_visual: !function utils.mix_evals_video2text_doc_to_visual
doc_to_text: !function utils.mix_evals_video2text_doc_to_text
doc_to_target: "{{target}}"

metric_list:
  - metric: exact_match
    aggregation: mean
    higher_is_better: true
    ignore_case: true
    ignore_punctuation: true

filter_list:
  - name: "flexible-extract"
    filter:
      - function: !function utils.MultiChoiceRegexFilter
        group_select: 0
        ignore_case: true
        ignore_punctuation: true

model_specific_prompt_kwargs:
  default:
    pre_prompt: "These are frames from a video. Please answer the following questions about the video."
    post_prompt: "Answer with the option's letter from the given choices directly."
  gpt4v:
    pre_prompt: "These are frames from a video. Please answer the following questions about the video."
    post_prompt: "Answer with the option's letter from the given choices directly."
lmms_eval/tasks/mix_evals/mix_evals_video2text_openended.yaml (22 additions, 0 deletions)
@@ -0,0 +1,22 @@
include: _default_template_yaml
dataset_name: "video2text_openended"
task: "mix_evals_video2text_openconv"
test_split: test
output_type: generate_until
doc_to_visual: !function utils.mix_evals_video2text_doc_to_visual
doc_to_text: !function utils.mix_evals_video2text_doc_to_text_open_convs
doc_to_target: ""
process_results: !function utils.mix_evals_video2text_process_results_open_convs

metric_list:
  - metric: submission
    aggregation: !function utils.mix_evals_video2text_aggregate_gen
    higher_is_better: true

model_specific_prompt_kwargs:
  default:
    pre_prompt: "These are frames from a video. Please answer the following questions about the video."
    post_prompt: ""
  gpt4v:
    pre_prompt: "These are frames from a video. Please answer the following questions about the video."
    post_prompt: ""
lmms_eval/tasks/mix_evals/utils.py (286 additions, 0 deletions)
@@ -0,0 +1,286 @@
import datetime
import json
import os
import re
import sys
import time
from pathlib import Path

import requests
import yaml
from loguru import logger as eval_logger

import lmms_eval.tasks._task_utils.file_utils as file_utils
from lmms_eval.filters.extraction import ExtendedRegexFilter

with open(Path(__file__).parent / "_default_template_yaml", "r") as f:
    raw_data = f.readlines()
    safe_data = []
    for i, line in enumerate(raw_data):
        # remove function definition since yaml load cannot handle it
        if "!function" not in line:
            safe_data.append(line)

    config = yaml.safe_load("".join(safe_data))

NUM_SECONDS_TO_SLEEP = 5
GPT_EVAL_MODEL_NAME = config["metadata"]["gpt_eval_model_name"]
API_TYPE = os.getenv("API_TYPE", "openai")

if API_TYPE == "openai":
    API_URL = os.getenv("OPENAI_API_URL", "https://api.openai.com/v1/chat/completions")
    API_KEY = os.getenv("OPENAI_API_KEY", "YOUR_API_KEY")
    headers = {
        "Authorization": f"Bearer {API_KEY}",
        "Content-Type": "application/json",
    }
elif API_TYPE == "azure":
    API_URL = os.getenv("AZURE_ENDPOINT", "https://api.cognitive.microsoft.com/sts/v1.0/issueToken")
    API_KEY = os.getenv("AZURE_API_KEY", "YOUR_API_KEY")
    headers = {
        "api-key": API_KEY,
        "Content-Type": "application/json",
    }
eval_prompt = """You are an AI assistant who will help me to evaluate the quality of a model response to a few candidate ground truth answers. | ||
Some criterion | ||
- Response that perfectly reflect the meaning of the ground truth: 1 point | ||
- Response that reflect none of the key points in the ground truth: 0 point | ||
- Some part in the response are correct but some parts in the ground truth are not mentioned in the response: 0.5 point | ||
- Some part in the response are correct but other parts in the response are not mentioned in the ground truth: 0.5 point | ||
Here're some examples about the scoring criterion and format: | ||
model response: Steam Cleaning Services | ||
ground truth: ["steam clean", "steam clean", "cleaning", "car", "steam clean"], | ||
Point: 1 | ||
model response: A cowboy action shooter. | ||
ground truth: ["man"] | ||
Point: 1 | ||
model response: I'm sorry, but I can't assist with that request. | ||
ground truth: ["quality"] | ||
Point: 0 | ||
Let's begin this task: | ||
model response: {model_response} | ||
ground truth: {ground_truth} | ||
Point:""" | ||
|
||
|
||
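For context, a minimal sketch of how this prompt is used downstream (the model response and ground-truth list are invented for illustration):

# Illustration only: hypothetical model output and ground-truth answers.
example_grader_prompt = eval_prompt.format(
    model_response="A man grilling burgers in a backyard.",
    ground_truth='["grilling", "man cooking outdoors"]',
)
# The judge model is expected to reply with a bare number such as "0.5";
# mix_evals_video2text_gpt_eval below extracts it with re.search(r"([0-9.]+)", ...)
# and averages the scores over all documents.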
def get_eval(model_response: str, ground_truth: str, max_tokens: int, retries: int = 5):
    global headers
    content = eval_prompt.format(model_response=model_response, ground_truth=ground_truth)

    messages = [
        {"role": "user", "content": content},
    ]

    payload = {
        "model": GPT_EVAL_MODEL_NAME,
        "messages": messages,
        "temperature": 0.2,
        "max_tokens": max_tokens,
    }

    for attempt in range(retries):
        try:
            response = requests.post(API_URL, headers=headers, json=payload, timeout=60)
            response.raise_for_status()
            response_data = response.json()

            content = response_data["choices"][0]["message"]["content"].strip()
            if content != "":
                return content, response_data["model"]
            break  # Empty reply: stop retrying and fall through to the empty return

        except Exception as e:
            eval_logger.info(f"Attempt {attempt + 1} failed with error: {e}")
            if attempt < retries - 1:  # Retries left: sleep and continue to the next attempt
                time.sleep(NUM_SECONDS_TO_SLEEP)
            else:  # Last attempt failed: log and return empty
                eval_logger.error(f"All {retries} attempts failed. Last error message: {e}")
                return "", ""
    return "", ""

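A hedged usage sketch of get_eval; the arguments are placeholders, and a valid OPENAI_API_KEY (or the Azure equivalent) must be set for the request to succeed:

# Hypothetical call: returns the judge's reply text and the model name reported by the API.
reply, judge_model = get_eval(
    model_response="Steam Cleaning Services",
    ground_truth='["steam clean", "cleaning"]',
    max_tokens=1024,
)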
# A bit ugly here, but the idea is that the downloaded zip files are unzipped
# into the HF_HOME cache dir and the videos are loaded from there.
HF_HOME = os.environ["HF_HOME"]
cache_dir = config["dataset_kwargs"]["cache_dir"]
cache_dir = os.path.join(HF_HOME, cache_dir)


# Returns the video path; this only works correctly with video LLMs.
def mix_evals_video2text_doc_to_visual(doc):
    video_path = doc["video_path"]
    video_path = os.path.join(cache_dir, video_path)
    if os.path.exists(video_path):
        video_path = video_path
    elif os.path.exists(video_path.replace("mp4", "MP4")):
        video_path = video_path.replace("mp4", "MP4")
    else:
        sys.exit(f"video path:{video_path} does not exist, please check")
    return [video_path]

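For orientation, the path resolution this implies, assuming a hypothetical HF_HOME and video file name:

# With HF_HOME=/home/user/.cache/huggingface and cache_dir "mix_evals_video2text"
# from the default template, a doc with video_path "videos/0001.mp4" resolves to
#   /home/user/.cache/huggingface/mix_evals_video2text/videos/0001.mp4
# falling back to the ".MP4" spelling if the lowercase extension is not found.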
# This is the place where you format your question
def mix_evals_video2text_doc_to_text(doc, model_specific_prompt_kwargs=None):
    if model_specific_prompt_kwargs is None:
        model_specific_prompt_kwargs = {}
    pre_prompt = ""
    post_prompt = ""
    if "pre_prompt" in model_specific_prompt_kwargs:
        pre_prompt = model_specific_prompt_kwargs["pre_prompt"]
    if "post_prompt" in model_specific_prompt_kwargs:
        post_prompt = model_specific_prompt_kwargs["post_prompt"]

    user_prompt = doc["prompt"]

    if "options" in doc:
        option_prompt = "Here are the options:\n"
        for idx, option in enumerate(doc["options"]):
            char_idx = chr(ord("A") + idx)
            option = option.strip()
            option_prompt += f"{char_idx}. {option}\n"

        option_prompt = option_prompt.rstrip("\n")
        user_prompt = f"{user_prompt}\n{option_prompt}"

    if pre_prompt:
        user_prompt = f"{pre_prompt}\n{user_prompt}"

    if post_prompt:
        user_prompt = f"{user_prompt}\n{post_prompt}"
    return user_prompt

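As a sanity check, a hedged example of the prompt this produces for a hypothetical multiple-choice doc (the field values are invented for illustration):

example_doc = {
    "prompt": "What is the person in the video doing?",
    "options": ["Cooking", "Dancing", "Swimming"],
}
example_kwargs = {
    "pre_prompt": "These are frames from a video. Please answer the following questions about the video.",
    "post_prompt": "Answer with the option's letter from the given choices directly.",
}
print(mix_evals_video2text_doc_to_text(example_doc, example_kwargs))
# These are frames from a video. Please answer the following questions about the video.
# What is the person in the video doing?
# Here are the options:
# A. Cooking
# B. Dancing
# C. Swimming
# Answer with the option's letter from the given choices directly.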
OPEN_CONVS_PROMPT = """{PRE}
{FIRST}
{POST}
"""


def mix_evals_video2text_doc_to_text_open_convs(doc, model_specific_prompt_kwargs=None):
    if model_specific_prompt_kwargs is None:
        model_specific_prompt_kwargs = {}
    pre_prompt = ""
    post_prompt = ""
    if "pre_prompt" in model_specific_prompt_kwargs:
        pre_prompt = model_specific_prompt_kwargs["pre_prompt"]
    if "post_prompt" in model_specific_prompt_kwargs:
        post_prompt = model_specific_prompt_kwargs["post_prompt"]

    filtered_first_turn = re.sub(r"<video_[0-9]+>", "", doc["first_turn_user_prompt"])
    return OPEN_CONVS_PROMPT.format(
        PRE=pre_prompt,
        POST=post_prompt,
        FIRST=filtered_first_turn,
    )

MODEL_CONVS_PROMPT = """{FIRST}
{MODEL_RESPONSE}
{PRE}
{SECOND}
{POST}
"""


def mix_evals_video2text_doc_to_text_open_2nd_convs(doc, model_specific_prompt_kwargs=None):
    if model_specific_prompt_kwargs is None:
        model_specific_prompt_kwargs = {}
    pre_prompt = ""
    post_prompt = ""
    if "pre_prompt" in model_specific_prompt_kwargs:
        pre_prompt = model_specific_prompt_kwargs["pre_prompt"]
    if "post_prompt" in model_specific_prompt_kwargs:
        post_prompt = model_specific_prompt_kwargs["post_prompt"]

    return MODEL_CONVS_PROMPT.format(
        PRE=pre_prompt,
        POST=post_prompt,
        FIRST=doc["first_turn_user_prompt"],
        SECOND=doc["second_turn_user_prompt"],
        MODEL_RESPONSE=doc["model_response"],
    )

def mix_evals_video2text_process_results_open_convs(doc, result):
    pred = result[0]
    return {"submission": {"pred": pred, "question_idx": doc["question_index"], "first_turn_video_caption": doc["first_turn_video_caption"], "target": ""}}


def mix_evals_video2text_process_results_freeform(doc, result):
    pred = result[0]
    ground_truth_str = ", ".join([f'"{gt}"' for gt in doc["target"]])
    ground_truth_str = f"[{ground_truth_str}]"
    content = eval_prompt.format(model_response=pred, ground_truth=ground_truth_str)
    eval_answer, model_name = get_eval(model_response=pred, ground_truth=ground_truth_str, max_tokens=1024)
    return {
        "submission": {"pred": pred, "question_idx": doc["question_index"], "target": doc["target"], "eval_answer": eval_answer, "gpt_prompt": content},
        "gpt_eval": {"pred": pred, "question_idx": doc["question_index"], "target": doc["target"], "eval_answer": eval_answer, "gpt_prompt": content},
    }

def mix_evals_video2text_aggregate_submissions(results, args, task):
    now_date_time = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
    submission_file_name = f"mix_evals_video2text_{task}-{now_date_time}.json"
    path = file_utils.generate_submission_file(submission_file_name, args)
    with open(path, "w") as f:
        json.dump(results, f)
    eval_logger.info(f"Submission file saved to {path}")

def mix_evals_video2text_gpt_eval(results, args):
    score = 0
    for result in results:
        eval_answer = result["eval_answer"]
        try:
            # The regex search is inside the try block so a reply without any
            # number falls back to 0.0 instead of raising an AttributeError.
            eval_score = re.search(r"([0-9.]+)", eval_answer).group(1)
            eval_score = float(eval_score)
        except Exception as e:
            eval_logger.error(f"Error parsing eval_score: {e}")
            eval_score = 0.0
        score += eval_score

    return score / len(results)

# Factory into different aggregate
def mix_evals_video2text_aggregate_gen(results, args):
    mix_evals_video2text_aggregate_submissions(results, args, "OpenConvs")

class MultiChoiceRegexFilter(ExtendedRegexFilter):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def apply(self, resps, docs):
        filtered_resps = []

        for r, doc in zip(resps, docs):
            # Regex to directly extract the option letter from the model response
            option_letter_regex = re.compile(r"\b([A-Z])\.\s+([^\n]*)")

            # Process each response
            filtered = []
            for resp in r:
                # Try to match the option letter at the start of the response
                match = option_letter_regex.match(resp)
                if match:
                    # If a match is found, append the matched letter
                    filtered.append(match.group(1))
                else:
                    # If no match, return the original response
                    filtered.append(resp)

            # Assuming we need the first response that matches or the original response
            filtered_resps.append(filtered[0])

        return filtered_resps
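To make the flexible-extract behaviour concrete, a standalone sketch of the same regex logic on made-up responses (illustration only, not part of the commit):

# Grab the leading option letter when the response looks like "B. <option text>",
# otherwise keep the raw response, mirroring apply() above.
example_responses = ["B. Dancing", "The answer is unclear"]
pattern = re.compile(r"\b([A-Z])\.\s+([^\n]*)")
extracted = [m.group(1) if (m := pattern.match(resp)) else resp for resp in example_responses]
print(extracted)  # ['B', 'The answer is unclear']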