Skip to content

Commit

Permalink
fix EvolvingLMMs-Lab#117, allow auto download with tar format videos
Browse files Browse the repository at this point in the history
  • Loading branch information
teowu committed Jun 16, 2024
1 parent 62ea8ce commit a056f11
Show file tree
Hide file tree
Showing 2 changed files with 51 additions and 2 deletions.
49 changes: 49 additions & 0 deletions lmms_eval/api/task.py
Original file line number Diff line number Diff line change
Expand Up @@ -776,8 +776,10 @@ def _download_from_youtube(path):
if accelerator.is_main_process:
force_download = dataset_kwargs.get("force_download", False)
force_unzip = dataset_kwargs.get("force_unzip", False)
print(force_download)
cache_path = snapshot_download(repo_id=self.DATASET_PATH, repo_type="dataset", force_download=force_download, etag_timeout=60)
zip_files = glob(os.path.join(cache_path, "**/*.zip"), recursive=True)
tar_files = glob(os.path.join(cache_path, "**/*.tar*"), recursive=True)

def unzip_video_data(zip_file):
import zipfile
Expand All @@ -786,10 +788,57 @@ def unzip_video_data(zip_file):
zip_ref.extractall(cache_dir)
eval_logger.info(f"Extracted all files from {zip_file} to {cache_dir}")

def untar_video_data(tar_file):
    """Extract every member of ``tar_file`` into ``cache_dir``.

    ``cache_dir`` and ``eval_logger`` are not defined here; they are taken
    from the enclosing scope.

    Args:
        tar_file: Path of the tar archive to extract. It is opened with
            mode ``"r"``, which lets ``tarfile`` detect compression itself.

    Raises:
        tarfile.TarError: If the archive cannot be read, or (when the
            ``"data"`` filter is in effect) if a member is rejected.
        OSError: If the archive cannot be opened or a member cannot be
            written.
    """
    import tarfile

    with tarfile.open(tar_file, "r") as tar_ref:
        # The archive was downloaded, so do not let its members choose
        # arbitrary destinations: the "data" filter refuses absolute paths,
        # ".." components and links that point outside the target directory.
        # Interpreters without extraction filters keep the old behaviour.
        if hasattr(tarfile, "data_filter"):
            tar_ref.extractall(cache_dir, filter="data")
        else:
            tar_ref.extractall(cache_dir)
        eval_logger.info(f"Extracted all files from {tar_file} to {cache_dir}")



def concat_tar_parts(tar_parts, output_tar):
    """Join split tar parts into a single archive at ``output_tar``.

    The parts are concatenated byte-for-byte in ``sorted()`` order of their
    paths. The data is streamed, so a part is never held in memory as a
    whole. The result only appears at ``output_tar`` once every part has
    been copied; on failure nothing is left at that path.

    ``eval_logger`` is taken from the enclosing scope.

    Args:
        tar_parts: Paths of the part files to join.
        output_tar: Path of the combined archive to create.

    Raises:
        Exception: Any error raised while reading a part or writing the
            output is printed and then re-raised (typically ``OSError``).
    """
    import os
    import shutil

    from tqdm import tqdm

    print("This is the output file:", output_tar, "from:", tar_parts)

    # Build the archive under a scratch name and rename it when complete, so
    # an interrupted run never leaves a truncated file at output_tar.
    # The leading dot keeps the scratch file from matching a "*.tar*" glob,
    # which does not match dot-files by default.
    out_dir, out_name = os.path.split(output_tar)
    partial_path = os.path.join(out_dir, f".{out_name}.partial")

    completed = False
    try:
        with open(partial_path, "wb") as out_tar:
            for part in tqdm(sorted(tar_parts)):
                with open(part, "rb") as part_file:
                    # Chunked copy instead of part_file.read(), which would
                    # load the whole part into memory.
                    shutil.copyfileobj(part_file, out_tar)
        os.replace(partial_path, output_tar)
        completed = True
    except Exception as ex:
        print("Error!!!", ex)
        # Re-raise: continuing would report success for an archive that
        # was never produced.
        raise
    finally:
        # Also runs on KeyboardInterrupt, so Ctrl-C leaves no scratch file.
        if not completed and os.path.exists(partial_path):
            os.remove(partial_path)

    eval_logger.info(f"Concatenated parts {tar_parts} into {output_tar}")

# Unzip zip files if needed
if force_unzip or (not os.path.exists(cache_dir) and len(zip_files) > 0):
for zip_file in zip_files:
unzip_video_data(zip_file)

# Concatenate and extract tar files if needed
if force_unzip or (not os.path.exists(cache_dir) and len(tar_files) > 0):
tar_parts_dict = {}

# Group tar parts together
for tar_file in tar_files:
base_name = tar_file.split('.tar')[0]
if base_name not in tar_parts_dict:
tar_parts_dict[base_name] = []
tar_parts_dict[base_name].append(tar_file)

print(tar_parts_dict)

# Concatenate and untar split parts
for base_name, parts in tar_parts_dict.items():
eval_logger.info(f"Extracting following tar files: {parts}")
output_tar = base_name + ".tar"
if not os.path.exists(output_tar):
eval_logger.info(f"Start concatenating tar files")

concat_tar_parts(parts, output_tar)
eval_logger.info(f"Finish concatenating tar files")

if not os.path.exists(os.path.join(cache_dir, os.path.basename(base_name))):
untar_video_data(output_tar)

accelerator.wait_for_everyone()
dataset_kwargs.pop("cache_dir")
dataset_kwargs.pop("video")
Expand Down
4 changes: 2 additions & 2 deletions lmms_eval/models/llava_vid.py
Original file line number Diff line number Diff line change
Expand Up @@ -405,7 +405,7 @@ def generate_until(self, requests) -> List[str]:
attention_mask=attention_masks,
modalities="video",
use_cache=self.use_cache,
#stopping_criteria=[stopping_criteria],
stopping_criteria=[stopping_criteria],
do_sample=True if gen_kwargs["temperature"] > 0 else False,
temperature=gen_kwargs["temperature"],
top_p=gen_kwargs["top_p"],
Expand All @@ -417,4 +417,4 @@ def generate_until(self, requests) -> List[str]:
outputs = self.tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
res.append(outputs)
pbar.update(1)
return res
return res

0 comments on commit a056f11

Please sign in to comment.