Commit: add videosearch and mmmu_for_testing tasks
Showing 15 changed files with 1,458 additions and 8 deletions.
@@ -0,0 +1,20 @@
from datasets import load_dataset, Dataset

# Load the deduplicated VideoSearch dataset
videosearch_dataset = load_dataset('lmms-lab/VideoSearch', 'deduplicated_combined_milestone', split='test')

# ID of the row to be removed
id_to_remove = 'validation_Biology_18'

# Filter out the row with that ID
filtered_rows = [row for row in videosearch_dataset if row['id'] != id_to_remove]

# Create a new dataset from the filtered rows
filtered_dataset = Dataset.from_list(filtered_rows)

# Push the filtered dataset to the Hugging Face Hub as a new config
filtered_dataset.push_to_hub("lmms-lab/VideoSearch", "final_combined_milestone", split="test")

# Print the number of rows before and after filtering
print(f"Original dataset size: {len(videosearch_dataset)}")
print(f"Filtered dataset size: {len(filtered_dataset)}")
@@ -0,0 +1,33 @@
import os

from datasets import load_dataset

# Load the VideoSearch dataset
videosearch_dataset = load_dataset('lmms-lab/VideoSearch', 'final_combined_milestone', split='test')

# Path to the videos directory (replace with your actual path)
videos_directory = '/mnt/sfs-common/krhu/.cache/huggingface/Combined_milestone/videos/'

# Collect all IDs from the dataset
videosearch_ids = set(videosearch_dataset['id'])

# IDs of files that are not in the dataset
extra_files = []

# Loop over all .mp4 files in the videos directory
for file in os.listdir(videos_directory):
    if file.endswith('.mp4'):
        # Extract the ID from the file name (strip the .mp4 extension)
        file_id = file.replace('.mp4', '')

        # Record the file if its ID is absent from the VideoSearch dataset
        if file_id not in videosearch_ids:
            extra_files.append(file_id)

# Report the .mp4 files that are not in the dataset
if extra_files:
    print(f"MP4 files not included in the VideoSearch dataset: {extra_files}")
else:
    print("All MP4 files have corresponding entries in the VideoSearch dataset.")

# Optionally, print the total number of extra files
print(f"Total extra MP4 files: {len(extra_files)}")
@@ -0,0 +1,238 @@
import base64
import json
import os
import time
from copy import deepcopy
from io import BytesIO
from typing import List, Tuple

import numpy as np
import requests as url_requests
from accelerate import Accelerator, DistributedType
from loguru import logger as eval_logger
from PIL import Image
from tqdm import tqdm

from lmms_eval.api.instance import Instance
from lmms_eval.api.model import lmms
from lmms_eval.api.registry import register_model

try:
    from decord import VideoReader, cpu
except ImportError:
    pass

API_TYPE = os.getenv("API_TYPE", "azure")
NUM_SECONDS_TO_SLEEP = 30

if API_TYPE == "openai":
    API_URL = os.getenv("OPENAI_API_URL", "https://api.openai.com/v1/chat/completions")
    API_KEY = os.getenv("OPENAI_API_KEY", "YOUR_API_KEY")
    headers = {
        "Authorization": f"Bearer {API_KEY}",
        "Content-Type": "application/json",
    }
elif API_TYPE == "azure":
    API_URL = os.getenv("AZURE_ENDPOINT", "https://egoasr.openai.azure.com/openai/deployments/egodataset/chat/completions?api-version=2024-02-15-preview")
    # Read the key from the environment rather than hardcoding a secret in the source
    API_KEY = os.getenv("AZURE_API_KEY", "YOUR_API_KEY")
    headers = {
        "api-key": API_KEY,
        "Content-Type": "application/json",
    }
@register_model("gpt4o")
class GPT4O(lmms):
    def __init__(
        self,
        model_version: str = "gpt-4-vision-preview",
        modality: str = "video",
        max_frames_num: int = 10,
        timeout: int = 120,
        continual_mode: bool = False,
        response_persistent_folder: str = None,
        **kwargs,
    ) -> None:
        super().__init__()
        # Manually set an image token for GPT-4V so that we can search for it
        # and split the text from the images. We reuse llava's token for convenience.
        self.model_version = model_version
        self.modality = modality
        self.max_frames_num = max_frames_num
        self.image_token = "<image>"
        self.timeout = timeout
        self.continual_mode = continual_mode
        if self.continual_mode:
            if response_persistent_folder is None:
                raise ValueError("Continual mode requires a persistent path for the response. Please provide a valid path.")

            os.makedirs(response_persistent_folder, exist_ok=True)
            self.response_persistent_folder = response_persistent_folder
            self.response_persistent_file = os.path.join(self.response_persistent_folder, f"{self.model_version}_response.json")

            if os.path.exists(self.response_persistent_file):
                with open(self.response_persistent_file, "r") as f:
                    self.response_cache = json.load(f)
                self.cache_mode = "resume"
            else:
                self.response_cache = {}
                self.cache_mode = "start"

        accelerator = Accelerator()
        if accelerator.num_processes > 1:
            assert accelerator.distributed_type in [DistributedType.FSDP, DistributedType.MULTI_GPU, DistributedType.DEEPSPEED], "Unsupported distributed type provided. Only DDP and FSDP are supported."
            self.accelerator = accelerator
            if self.accelerator.is_local_main_process:
                eval_logger.info(f"Using {accelerator.num_processes} devices with data parallelism")
            self._rank = self.accelerator.local_process_index
            self._world_size = self.accelerator.num_processes
        else:
            self.accelerator = accelerator
            self._rank = self.accelerator.local_process_index
            self._world_size = self.accelerator.num_processes

        self.device = self.accelerator.device
    # Encode a PIL image as a base64 PNG string
    def encode_image(self, image: Image.Image):
        output_buffer = BytesIO()
        image.save(output_buffer, format="PNG")
        byte_data = output_buffer.getvalue()
        base64_str = base64.b64encode(byte_data).decode("utf-8")
        return base64_str

    # Uniformly sample frames from a video and encode them as base64 PNG strings
    def encode_video(self, video_path, for_get_frames_num):
        vr = VideoReader(video_path, ctx=cpu(0))
        total_frame_num = len(vr)
        uniform_sampled_frames = np.linspace(0, total_frame_num - 1, for_get_frames_num, dtype=int)

        # Ensure the last frame is included
        if total_frame_num - 1 not in uniform_sampled_frames:
            uniform_sampled_frames = np.append(uniform_sampled_frames, total_frame_num - 1)

        frame_idx = uniform_sampled_frames.tolist()
        frames = vr.get_batch(frame_idx).asnumpy()

        base64_frames = []
        for frame in frames:
            img = Image.fromarray(frame)
            output_buffer = BytesIO()
            img.save(output_buffer, format="PNG")
            byte_data = output_buffer.getvalue()
            base64_str = base64.b64encode(byte_data).decode("utf-8")
            base64_frames.append(base64_str)

        return base64_frames

    # Flatten a list of lists into a single list
    def flatten(self, input):
        new_list = []
        for i in input:
            for j in i:
                new_list.append(j)
        return new_list
    def generate_until(self, requests) -> List[str]:
        res = []
        pbar = tqdm(total=len(requests), disable=(self.rank != 0), desc="Model Responding")

        for contexts, gen_kwargs, doc_to_visual, doc_id, task, split in [reg.args for reg in requests]:
            if self.continual_mode is True and self.cache_mode == "resume":
                doc_uuid = f"{task}___{split}___{doc_id}"
                if doc_uuid in self.response_cache:
                    response_text = self.response_cache[doc_uuid]
                    if response_text:
                        res.append(response_text)
                        pbar.update(1)
                        continue

            visuals = [doc_to_visual(self.task_dict[task][split][doc_id])]
            visuals = self.flatten(visuals)
            imgs = []  # multiple images, or frames for a video
            for visual in visuals:
                if self.modality == "image":
                    img = self.encode_image(visual)
                    imgs.append(img)
                elif self.modality == "video":
                    frames = self.encode_video(visual, self.max_frames_num)
                    imgs.extend(frames)

            payload = {"messages": []}
            if API_TYPE == "openai":
                payload["model"] = self.model_version

            response_json = {"role": "user", "content": []}
            # When there is no image token in the context, append the images after the text
            if self.image_token not in contexts:
                payload["messages"].append(deepcopy(response_json))
                payload["messages"][0]["content"].append({"type": "text", "text": contexts})
                for img in imgs:
                    payload["messages"][0]["content"].append({"type": "image_url", "image_url": {"url": f"data:image/png;base64,{img}"}})
            else:
                contexts = contexts.split(self.image_token)
                for idx, img in enumerate(imgs):
                    payload["messages"].append(deepcopy(response_json))
                    payload["messages"][idx]["content"].append({"type": "text", "text": contexts[idx]})
                    payload["messages"][idx]["content"].append({"type": "image_url", "image_url": {"url": f"data:image/png;base64,{img}"}})

                # With n image tokens in the context, contexts is split into n+1 chunks;
                # manually append the trailing chunk to the payload
                payload["messages"].append(deepcopy(response_json))
                payload["messages"][-1]["content"].append({"type": "text", "text": contexts[-1]})

            if "max_new_tokens" not in gen_kwargs:
                gen_kwargs["max_new_tokens"] = 1024
            if gen_kwargs["max_new_tokens"] > 4096:
                gen_kwargs["max_new_tokens"] = 4096
            if "temperature" not in gen_kwargs:
                gen_kwargs["temperature"] = 0
            if "top_p" not in gen_kwargs:
                gen_kwargs["top_p"] = None
            if "num_beams" not in gen_kwargs:
                gen_kwargs["num_beams"] = 1

            payload["max_tokens"] = gen_kwargs["max_new_tokens"]
            payload["temperature"] = gen_kwargs["temperature"]

            for attempt in range(5):
                try:
                    response = url_requests.post(API_URL, headers=headers, json=payload, timeout=self.timeout)
                    response_data = response.json()

                    response_text = response_data["choices"][0]["message"]["content"].strip()
                    break  # If successful, break out of the loop

                except Exception as e:
                    try:
                        error_msg = response.json()
                    except Exception:
                        error_msg = ""

                    eval_logger.info(f"Attempt {attempt + 1} failed with error: {str(e)}.\nResponse: {error_msg}")
                    if attempt < 4:
                        time.sleep(NUM_SECONDS_TO_SLEEP)
                    else:  # If this was the last attempt, log and return an empty string
                        eval_logger.error(f"All 5 attempts failed. Last error message: {str(e)}.\nResponse: {error_msg}")
                        response_text = ""
            res.append(response_text)
            pbar.update(1)

            if self.continual_mode is True:  # Cache the response
                doc_uuid = f"{task}___{split}___{doc_id}"
                self.response_cache[doc_uuid] = response_text
                with open(self.response_persistent_file, "w") as f:
                    json.dump(self.response_cache, f)

        pbar.close()
        return res
    def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]:
        # TODO
        assert False, "GPT4O does not support loglikelihood evaluation"
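The uniform frame sampling inside encode_video is easiest to see in isolation; a minimal sketch that mirrors the linspace logic above (sample_frame_indices is an illustrative name, not part of the commit):

import numpy as np

# Minimal sketch (not part of the commit): the frame-index selection used by encode_video
def sample_frame_indices(total_frame_num: int, for_get_frames_num: int) -> list:
    idx = np.linspace(0, total_frame_num - 1, for_get_frames_num, dtype=int)
    # Ensure the last frame is included
    if total_frame_num - 1 not in idx:
        idx = np.append(idx, total_frame_num - 1)
    return idx.tolist()

print(sample_frame_indices(300, 10))  # [0, 33, 66, 99, 132, 166, 199, 232, 265, 299]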
@@ -0,0 +1,6 @@
generation_kwargs:
  max_new_tokens: 16

metadata:
  version: 0.0
  interleaved_format: false
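These generation_kwargs are merged with the model-side defaults in GPT4O.generate_until above; a minimal sketch of that normalization (the dict literal stands in for the parsed YAML and is illustrative):

# Minimal sketch (not part of the commit): how generate_until normalizes gen_kwargs
gen_kwargs = {"max_new_tokens": 16}  # parsed from the YAML above

gen_kwargs.setdefault("max_new_tokens", 1024)
gen_kwargs["max_new_tokens"] = min(gen_kwargs["max_new_tokens"], 4096)
gen_kwargs.setdefault("temperature", 0)
gen_kwargs.setdefault("top_p", None)
gen_kwargs.setdefault("num_beams", 1)

payload = {"max_tokens": gen_kwargs["max_new_tokens"], "temperature": gen_kwargs["temperature"]}
print(payload)  # {'max_tokens': 16, 'temperature': 0}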
@@ -0,0 +1,3 @@
group: mmmu
task:
  - mmmu_testing_val