MixEval-X Image / Video #434

Merged (10 commits) on Dec 3, 2024
3 changes: 3 additions & 0 deletions lmms_eval/filters/extraction.py
@@ -1,7 +1,10 @@
+import os
 import re
+import sys
+import unicodedata
 
 import openai
 
 from lmms_eval.api.filter import Filter
4 changes: 2 additions & 2 deletions lmms_eval/models/internvl2.py
@@ -139,8 +139,8 @@ def __init__(
         super().__init__()
 
         self.path = pretrained
-        self._model = AutoModel.from_pretrained(self.path, torch_dtype=torch.bfloat16, low_cpu_mem_usage=True, trust_remote_code=True).eval().cuda()
-        self._tokenizer = AutoTokenizer.from_pretrained(self.path, trust_remote_code=True)
+        self._model = AutoModel.from_pretrained(self.path, torch_dtype=torch.bfloat16, low_cpu_mem_usage=True, trust_remote_code=True, device_map=device_map).eval()
+        self._tokenizer = AutoTokenizer.from_pretrained(self.path, trust_remote_code=True, device_map=device_map)
 
         batch_size = int(batch_size)
         assert batch_size == 1, f"Batch size should be 1 for InternVL2, but got {batch_size}."
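The internvl2.py change above swaps the hard-coded .eval().cuda() placement for Hugging Face's device_map argument, so weight placement is delegated to accelerate. A minimal sketch of what that enables (the checkpoint name is an illustrative assumption, not taken from this PR):

import torch
from transformers import AutoModel

# device_map="auto" lets accelerate place or shard layers across all visible
# GPUs instead of forcing everything onto cuda:0 with .cuda().
# "OpenGVLab/InternVL2-8B" is an assumed example checkpoint.
model = AutoModel.from_pretrained(
    "OpenGVLab/InternVL2-8B",
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    trust_remote_code=True,
    device_map="auto",
).eval()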
31 changes: 11 additions & 20 deletions lmms_eval/models/llama_vision.py
@@ -15,6 +15,7 @@
 from lmms_eval.api.instance import Instance
 from lmms_eval.api.model import lmms
 from lmms_eval.api.registry import register_model
+from lmms_eval.models.model_utils.load_video import read_video_pyav_pil
vancoykendall commented (Dec 7, 2024):
    This currently breaks as read_video_pyav_pil doesn't exist @pufanyi

Collaborator (author) replied:
    Hello, sorry about this. It seems that this change was accidentally brought over from another branch. Thank you for pointing it out! I've reverted it in #447.


 warnings.filterwarnings("ignore")
 
@@ -25,33 +26,19 @@
 
 @register_model("llama_vision")
 class LlamaVision(lmms):
"""
Llava Model for Hugging Face Transformers: https://huggingface.co/docs/transformers/v4.39.3/en/model_doc/llava

Adapted from the InstructBLIP model in lmms_eval/models/instructblip.py

Example usage:

accelerate launch --num_processes=8 --main_process_port 12345 -m lmms_eval \
--model llava_hf \
--model_args pretrained=llava-hf/llava-1.5-7b-hf \
--tasks seedbench \
--batch_size 1 \
--output_path ./logs/ \
--log_samples
"""

     def __init__(
         self,
         pretrained: str = "meta-llama/Llama-3.2-11B-Vision",
         revision: str = "main",
         device: str = "cuda",
         dtype: Optional[Union[str, torch.dtype]] = "auto",
         batch_size: int = 1,
-        trust_remote_code: Optional[bool] = False,
+        trust_remote_code: Optional[bool] = True,
         attn_implementation: Optional[str] = None,
         device_map: str = "",
         max_frames_num: Optional[int] = 32,
+        fps: Optional[int] = None,
+        max_image_size: Optional[int] = None,
         **kwargs,
     ) -> None:
         super().__init__()
Expand All @@ -68,7 +55,9 @@ def __init__(
         if isinstance(dtype, str) and dtype != "auto":
             dtype = getattr(torch, dtype)
 
+        self.fps = fps
         self.max_frames_num = max_frames_num
+        self.max_image_size = max_image_size
         self._model = MllamaForConditionalGeneration.from_pretrained(pretrained, revision=revision, torch_dtype=dtype, device_map=self.device_map, trust_remote_code=trust_remote_code, attn_implementation=attn_implementation)
         self.model.eval()
         self.processor = AutoProcessor.from_pretrained(pretrained)
@@ -193,9 +182,11 @@ def generate_until(self, requests: List[Instance]) -> List[str]:

         for visual in visuals:
             if isinstance(visual, str):
-                frames = self.load_video(visual, self.max_frames_num)
-                frames = torch.from_numpy(frames).permute(0, 3, 1, 2)
-                images.extend([to_pil_image(frame) for frame in frames])
+                frames = read_video_pyav_pil(visual, num_frm=self.max_frames_num, fps=self.fps, max_image_size=self.max_image_size)
+                images.extend(frames)
+                # frames = self.load_video(visual, self.max_frames_num)
+                # frames = torch.from_numpy(frames).permute(0, 3, 1, 2)
+                # images.extend([to_pil_image(frame) for frame in frames])
             elif isinstance(visual, PIL.Image.Image):
                 images.append(visual)
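As the thread above notes, read_video_pyav_pil did not exist when this was merged. Based purely on the call site (num_frm, fps, max_image_size, returning PIL images), a helper like it might look like the sketch below. Every detail here is an inference from the call site, not the actual lmms_eval implementation:

import av
import numpy as np


def read_video_pyav_pil(video_path, num_frm=32, fps=None, max_image_size=None):
    """Decode a video with PyAV and return sampled frames as PIL images."""
    container = av.open(video_path)
    stream = container.streams.video[0]
    total_frames = stream.frames  # assumes the container reports a frame count

    if fps is not None and stream.duration is not None:
        # Sample at a fixed rate instead of uniformly, still capped at num_frm.
        duration_sec = float(stream.duration * stream.time_base)
        num_frm = min(num_frm, max(1, int(duration_sec * fps)))

    keep = set(np.linspace(0, total_frames - 1, num_frm).astype(int).tolist())
    frames = []
    for i, frame in enumerate(container.decode(video=0)):
        if i in keep:
            img = frame.to_image()  # av.VideoFrame -> PIL.Image
            if max_image_size is not None and max(img.size) > max_image_size:
                # Downscale so the longest side is at most max_image_size.
                scale = max_image_size / max(img.size)
                img = img.resize((max(1, int(img.width * scale)), max(1, int(img.height * scale))))
            frames.append(img)
    container.close()
    return frames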
2 changes: 1 addition & 1 deletion lmms_eval/models/llava_vid.py
@@ -90,7 +90,7 @@ def __init__(
         conv_template="vicuna_v1",
         use_cache=True,
         truncate_context=False,  # whether to truncate the context in generation, set it False for LLaVA-1.6
-        max_frames_num: int = 3,
+        max_frames_num: int = 20,
         video_fps: int = 1,
         mm_resampler_type: str = "spatial_pool",
         mm_spatial_pool_stride: int = 2,
3 changes: 3 additions & 0 deletions lmms_eval/tasks/__init__.py
@@ -417,6 +417,8 @@ def _get_task_and_group(self, task_dir: str):
                     "yaml_path": yaml_path,
                 }
             elif self._config_is_group(config):
+                if f.endswith("mix_evals_image2text.yaml"):
+                    print(config)
                 # This is a group config
                 tasks_and_groups[config["group"]] = {
                     "type": "group",
@@ -477,6 +479,7 @@ def _get_task_and_group(self, task_dir: str):
             else:
                 self.logger.debug(f"File {f} in {root} could not be loaded as a task or group")
 
+        print(tasks_and_groups["mix_evals_image2text"])
         return tasks_and_groups


@@ -0,0 +1,3 @@
group: mix_evals_audio2text
task:
  - mix_evals_audio2text_freeform
13 changes: 13 additions & 0 deletions lmms_eval/tasks/mix_evals/image2text/_default_template_yaml
@@ -0,0 +1,13 @@
dataset_path: MixEval/MixEval-X
dataset_kwargs:
  video: true # a bit confusing, but the official release stores image data as file paths, so we load it through the video-dataset pipeline
  cache_dir: mix_evals_image2text
lmms_eval_specific_kwargs:
  default:
    post_prompt: ""
    pre_prompt: ""
  gpt4v:
    post_prompt: ""
    pre_prompt: ""
metadata:
  version: 0
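Given that comment — image data stored as file paths, fetched through the video cache — utils.mix_evals_image2text_doc_to_visual presumably resolves each stored path against cache_dir and loads it as an image. A hedged sketch of that idea; the field name and cache location are assumptions, not the actual utils code:

import os

from PIL import Image

# Assumed cache root; in practice lmms_eval resolves cache_dir against its
# Hugging Face cache, which may differ from this path.
CACHE_DIR = os.path.join(os.path.expanduser("~/.cache/huggingface"), "mix_evals_image2text")


def mix_evals_image2text_doc_to_visual(doc):
    # "image_path" is an assumed field name for the path stored in the dataset row.
    image_path = os.path.join(CACHE_DIR, doc["image_path"])
    return [Image.open(image_path).convert("RGB")]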
@@ -0,0 +1,4 @@
group: mix_evals_image2text
task:
- mix_evals_image2text_mc
- mix_evals_image2text_freeform
@@ -0,0 +1,17 @@
task: "mix_evals_image2text_freeform"
dataset_name: "image2text"
test_split: free_form
output_type: generate_until
doc_to_visual: !function utils.mix_evals_image2text_doc_to_visual
doc_to_text: !function utils.mix_evals_image2text_doc_to_text
doc_to_target: "{{reference_answer}}"
process_results: !function utils.mix_evals_image2text_process_results_freeform
metric_list:
  - metric: gpt_eval
    aggregation: !function utils.mix_evals_image2text_gpt_eval
    higher_is_better: true

generation_kwargs:
  max_new_tokens: 1024

include: _default_template_yaml
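The gpt_eval metric above is wired through two !function hooks: process_results scores each sample and the aggregation function reduces the collected scores. A rough sketch of that contract, with the judge call stubbed out; the prompt field, judge behavior, and score scale are assumptions, not the real utils implementation:

def judge_with_gpt(question, reference, response):
    # Hypothetical stub: a real judge would call the OpenAI API with the
    # question, reference answer, and model response, then parse a score.
    raise NotImplementedError


def mix_evals_image2text_process_results_freeform(doc, results):
    # results holds the model's generations for this doc; freeform tasks use one.
    # "reference_answer" comes from doc_to_target above; "prompt" is assumed.
    response = results[0]
    score = judge_with_gpt(doc["prompt"], doc["reference_answer"], response)
    return {"gpt_eval": {"score": score}}


def mix_evals_image2text_gpt_eval(items):
    # Receives the per-sample dicts returned above; reduce to a mean score.
    scores = [item["score"] for item in items]
    return sum(scores) / len(scores) if scores else 0.0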
@@ -0,0 +1,25 @@
task: "mix_evals_image2text_freeform_hard"
dataset_name: "image2text"
test_split: free_form_hard
output_type: generate_until
doc_to_visual: !function utils.mix_evals_image2text_doc_to_visual
doc_to_text: !function utils.mix_evals_image2text_doc_to_text
doc_to_target: "{{reference_answer}}"
process_results: !function utils.mix_evals_image2text_process_results_freeform
metric_list:
  - metric: gpt_eval
    aggregation: !function utils.mix_evals_image2text_gpt_eval
    higher_is_better: true

generation_kwargs:
  max_new_tokens: 1024

include: _default_template_yaml

lmms_eval_specific_kwargs:
  default:
    pre_prompt: "Please answer the following questions about the image."
    post_prompt: ""
  gpt4v:
    pre_prompt: "Please answer the following questions about the image."
    post_prompt: ""
@@ -0,0 +1,5 @@
group: mix_evals_image2text_hard
task:
- mix_evals_image2text_mc_hard
- mix_evals_image2text_freeform_hard
# - mix_evals_image2text_openended
23 changes: 23 additions & 0 deletions lmms_eval/tasks/mix_evals/image2text/mix_evals_image2text_mc.yaml
@@ -0,0 +1,23 @@
include: _default_template_yaml
task: "mix_evals_image2text_mc"
dataset_name: "image2text"
test_split: multiple_choice
output_type: generate_until
doc_to_visual: !function utils.mix_evals_image2text_doc_to_visual
doc_to_text: !function utils.mix_evals_image2text_doc_to_text
doc_to_target: "{{reference_answer}}"

generation_kwargs:
  max_new_tokens: 1024

metric_list:
  - metric: exact_match
    aggregation: mean
    higher_is_better: true
    ignore_case: true
    ignore_punctuation: true

filter_list:
  - name: "flexible-extract"
    filter:
      - function: !function utils.GPTMultiChoiceFilter
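GPTMultiChoiceFilter, referenced via !function above (and related to the openai import this PR touches in lmms_eval/filters/extraction.py), plugs into the harness's filter pipeline to map free-form responses onto option letters before exact_match scoring. A sketch of what such a filter could look like — the Filter base class and apply signature follow the lmms_eval API, but the regex, model choice, and prompt are assumptions:

import re

import openai

from lmms_eval.api.filter import Filter


class GPTMultiChoiceFilter(Filter):
    """Map a free-form model response onto a multiple-choice letter."""

    def apply(self, resps, docs):
        filtered = []
        for resp, doc in zip(resps, docs):
            answer = resp[0]
            # Cheap path: the response already contains a bare option letter.
            match = re.search(r"\b([A-D])\b", answer)
            if match:
                filtered.append([match.group(1)])
                continue
            # Fallback: ask GPT to extract the chosen letter (model choice and
            # the "prompt" field name are assumptions).
            completion = openai.chat.completions.create(
                model="gpt-4o-mini",
                messages=[{"role": "user", "content": f"Question:\n{doc['prompt']}\n\nResponse:\n{answer}\n\nReply with only the letter of the option the response chose."}],
            )
            filtered.append([completion.choices[0].message.content.strip()])
        return filtered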
@@ -0,0 +1,23 @@
include: _default_template_yaml
task: "mix_evals_image2text_mc_hard"
dataset_name: "image2text"
test_split: multiple_choice_hard
output_type: generate_until
doc_to_visual: !function utils.mix_evals_image2text_doc_to_visual
doc_to_text: !function utils.mix_evals_image2text_doc_to_text
doc_to_target: "{{reference_answer}}"

generation_kwargs:
  max_new_tokens: 1024

metric_list:
  - metric: exact_match
    aggregation: mean
    higher_is_better: true
    ignore_case: true
    ignore_punctuation: true

filter_list:
  - name: "flexible-extract"
    filter:
      - function: !function utils.GPTMultiChoiceFilter