diff --git a/docs/task_guide.md b/docs/task_guide.md
index 31fb443d..1376bc22 100755
--- a/docs/task_guide.md
+++ b/docs/task_guide.md
@@ -27,7 +27,7 @@ doc_to_target: "answer"
 generation_kwargs:
   max_new_tokens: 16
   temperature: 0
-  top_p: 0
+  top_p: 1.0
   num_beams: 1
   do_sample: false
 # The return value of process_results will be used by metrics
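Every task config touched below makes the same change: `top_p` goes from `0` to `1.0` while `do_sample: false` and `temperature: 0` stay put. With sampling disabled the decoder is greedy and `top_p` is never consulted, but `0` is still a degenerate nucleus size that some inference backends reject or special-case, whereas `1.0` is the universally safe "no truncation" value. A minimal sketch of nucleus (top-p) filtering, illustrative only and not code from this repo:

```python
# Illustrative sketch of nucleus (top-p) filtering; not code from this patch.
# top_p=1.0 keeps the whole distribution, so it is the natural "off" value;
# top_p=0 is degenerate (an empty nucleus, or at best the argmax, depending
# on how an implementation rounds), which is why backends may reject it.
import numpy as np

def top_p_filter(probs: np.ndarray, top_p: float) -> np.ndarray:
    """Keep the smallest prefix of tokens (by probability) with cumulative mass >= top_p."""
    order = np.argsort(probs)[::-1]            # token ids, most probable first
    cum = np.cumsum(probs[order])
    nucleus_size = np.searchsorted(cum, top_p) + 1
    keep = order[:nucleus_size]
    out = np.zeros_like(probs)
    out[keep] = probs[keep]
    return out / out.sum()                     # renormalize over the nucleus

probs = np.array([0.5, 0.3, 0.15, 0.05])
print(top_p_filter(probs, 1.0))  # unchanged: [0.5 0.3 0.15 0.05]
print(top_p_filter(probs, 0.6))  # only the two most probable tokens survive
```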
diff --git a/lmms_eval/models/gemini_api.py b/lmms_eval/models/gemini_api.py
index abbc4dd3..455db085 100644
--- a/lmms_eval/models/gemini_api.py
+++ b/lmms_eval/models/gemini_api.py
@@ -2,6 +2,7 @@
 import os
 import time
 import logging
+import json
 
 from PIL import Image
 from typing import List, Tuple
@@ -11,13 +12,12 @@
 from lmms_eval.api.instance import Instance
 from accelerate import Accelerator, DistributedType
 
-
 eval_logger = logging.getLogger("lmms-eval")
 
 try:
     import google.generativeai as genai
 
-    NUM_SECONDS_TO_SLEEP = 5
+    NUM_SECONDS_TO_SLEEP = 30
     GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
     genai.configure(api_key=GOOGLE_API_KEY)
 
@@ -33,15 +33,31 @@ def __init__(
         model_version: str = "gemini-1.5-flash-latest",
         modality: str = "image",
         timeout: int = 120,
+        continual_mode: bool = False,
+        response_persistent_folder: str = None,  # We will cache the Gemini API response in this path and use it for future requests
         **kwargs,
     ) -> None:
         super().__init__()
         self.model_version = model_version
         self.timeout = timeout
         self.model = genai.GenerativeModel(model_version)
+        self.continual_mode = continual_mode
+        if self.continual_mode and response_persistent_folder is None:
+            raise ValueError("Continual mode requires a persistent path for the response. We will cache the Gemini API response in this path and use it for future requests. Please provide a valid path.")
+        self.response_persistent_folder = response_persistent_folder
+        self.response_persistent_file = os.path.join(self.response_persistent_folder, f"{self.model_version}_response.json")
+
+        if os.path.exists(self.response_persistent_file):
+            with open(self.response_persistent_file, "r") as f:
+                self.response_cache = json.load(f)
+            self.cache_mode = "resume"
+        else:
+            self.response_cache = {}
+            self.cache_mode = "start"
 
         accelerator = Accelerator()
         if accelerator.num_processes > 1:
+            assert self.continual_mode is False, "Continual mode is not supported with distributed inference."
             assert accelerator.distributed_type in [DistributedType.FSDP, DistributedType.MULTI_GPU, DistributedType.DEEPSPEED], "Unsupported distributed type provided. Only DDP and FSDP are supported."
             self.accelerator = accelerator
             if self.accelerator.is_local_main_process:
@@ -77,7 +93,9 @@ def get_image_size(self, image):
         return img_size
 
     def encode_video(self, video_path):
-        return genai.upload_file(path=video_path)
+        uploaded_obj = genai.upload_file(path=video_path)
+        time.sleep(5)
+        return uploaded_obj
 
     def convert_video(self, images):
         for idx, img in enumerate(images):
@@ -109,6 +127,14 @@ def generate_until(self, requests) -> List[str]:
             message = [contexts] + visuals
 
+            if self.continual_mode is True and self.cache_mode == "resume":
+                doc_uuid = str(doc_id)
+                if doc_uuid in self.response_cache:
+                    content = self.response_cache[doc_uuid]
+                    res.append(content)
+                    pbar.update(1)
+                    continue
+
             for attempt in range(5):
                 try:
                     content = self.model.generate_content(message, generation_config=config)
@@ -123,6 +149,13 @@
                     content = ""
             res.append(content)
             pbar.update(1)
+
+            if self.continual_mode is True:  # Cache the response
+                doc_uuid = str(doc_id)
+                self.response_cache[doc_uuid] = content
+                with open(self.response_persistent_file, "w") as f:
+                    json.dump(self.response_cache, f)
+
         pbar.close()
         return res
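The `gemini_api.py` changes bundle three robustness fixes: the retry sleep grows from 5 s to 30 s, `encode_video` pauses 5 s after `genai.upload_file` so the uploaded file has time to become usable server-side, and a new `continual_mode` persists every response to a JSON file so an interrupted run can resume without re-querying the API. Note that JSON object keys are always strings, which is why both the lookup and the write must go through `str(doc_id)`. A standalone sketch of the cache pattern (the class is hypothetical; the patch inlines the same logic in the model wrapper):

```python
# Standalone sketch of the resume cache added in this patch; the class is
# hypothetical (the patch inlines the equivalent logic in the Gemini wrapper).
import json
import os
from typing import Optional

class ResponseCache:
    """One JSON file keyed by str(doc_id), rewritten after every response."""

    def __init__(self, persistent_file: str):
        self.persistent_file = persistent_file
        if os.path.exists(persistent_file):
            with open(persistent_file, "r") as f:
                self.cache = json.load(f)  # "resume" mode
        else:
            self.cache = {}                # "start" mode

    def get(self, doc_id) -> Optional[str]:
        # JSON round-trips keys as strings, so the lookup must stringify too.
        return self.cache.get(str(doc_id))

    def put(self, doc_id, content: str) -> None:
        self.cache[str(doc_id)] = content
        with open(self.persistent_file, "w") as f:
            json.dump(self.cache, f)       # eager flush: crash-safe but O(n) per write
```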
diff --git a/lmms_eval/models/llava_sglang.py b/lmms_eval/models/llava_sglang.py
index 10d5c98c..af79d155 100644
--- a/lmms_eval/models/llava_sglang.py
+++ b/lmms_eval/models/llava_sglang.py
@@ -108,9 +108,6 @@ def _collate(x):
             gen_kwargs["top_p"] = 1.0
         if "num_beams" not in gen_kwargs:
             gen_kwargs["num_beams"] = 1
-        if gen_kwargs["top_p"] == 0.0:
-            gen_kwargs["top_p"] = 1.0
-            gen_kwargs["temperature"] = 0.0
         assert gen_kwargs["num_beams"] == 1
 
     def save_image_to_temp_file(image):
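With every config now shipping a valid `top_p: 1.0`, the sglang wrapper no longer needs to rewrite the old `top_p: 0` sentinel (a rewrite that also clobbered `temperature`); only plain defaulting remains. A hedged sketch of the surviving normalization, with illustrative default values and the surrounding sglang plumbing omitted:

```python
# Hedged sketch of the gen_kwargs normalization left after this patch; the
# defaults are illustrative and the surrounding sglang code is omitted.
def normalize_gen_kwargs(gen_kwargs: dict) -> dict:
    gen_kwargs.setdefault("top_p", 1.0)    # no nucleus truncation by default
    gen_kwargs.setdefault("num_beams", 1)  # this code path is sampling-only
    # The old special case is gone: configs no longer send the 0 sentinel,
    # so nothing gets rewritten (and temperature is never clobbered).
    assert gen_kwargs["num_beams"] == 1
    return gen_kwargs
```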
diff --git a/lmms_eval/tasks/activitynetqa/activitynetqa_generation.yaml b/lmms_eval/tasks/activitynetqa/activitynetqa_generation.yaml
index a9bb69d2..e6c167e5 100755
--- a/lmms_eval/tasks/activitynetqa/activitynetqa_generation.yaml
+++ b/lmms_eval/tasks/activitynetqa/activitynetqa_generation.yaml
@@ -20,6 +20,6 @@ generation_kwargs:
   image_aspect_ratio: original
   max_new_tokens: 64
   temperature: 0
-  top_p: 0
+  top_p: 1.0
   num_beams: 1
   do_sample: false
diff --git a/lmms_eval/tasks/coco_cap/coco2014_cap_test.yaml b/lmms_eval/tasks/coco_cap/coco2014_cap_test.yaml
index e080450f..01fc93c0 100755
--- a/lmms_eval/tasks/coco_cap/coco2014_cap_test.yaml
+++ b/lmms_eval/tasks/coco_cap/coco2014_cap_test.yaml
@@ -11,7 +11,7 @@ doc_to_target: "answer"
 generation_kwargs:
   max_new_tokens: 128
   temperature: 0
-  top_p: 0
+  top_p: 1.0
   num_beams: 1
   do_sample: false
 process_results: !function utils.coco_test_process_result
diff --git a/lmms_eval/tasks/coco_cap/coco2014_cap_val.yaml b/lmms_eval/tasks/coco_cap/coco2014_cap_val.yaml
index 57b01a7d..6c14f5f9 100755
--- a/lmms_eval/tasks/coco_cap/coco2014_cap_val.yaml
+++ b/lmms_eval/tasks/coco_cap/coco2014_cap_val.yaml
@@ -11,7 +11,7 @@ doc_to_target: "answer"
 generation_kwargs:
   max_new_tokens: 64
   temperature: 0
-  top_p: 0
+  top_p: 1.0
   num_beams: 1
   do_sample: false
 process_results: !function utils.coco_process_result
diff --git a/lmms_eval/tasks/coco_cap/coco2017_cap_test.yaml b/lmms_eval/tasks/coco_cap/coco2017_cap_test.yaml
index ecd73759..39124a6a 100755
--- a/lmms_eval/tasks/coco_cap/coco2017_cap_test.yaml
+++ b/lmms_eval/tasks/coco_cap/coco2017_cap_test.yaml
@@ -11,7 +11,7 @@ doc_to_target: "answer"
 generation_kwargs:
   max_new_tokens: 128
   temperature: 0
-  top_p: 0
+  top_p: 1.0
   num_beams: 1
   do_sample: false
 process_results: !function utils.coco_test_process_result
diff --git a/lmms_eval/tasks/coco_cap/coco2017_cap_val.yaml b/lmms_eval/tasks/coco_cap/coco2017_cap_val.yaml
index b0d9d4a4..4ef084b7 100755
--- a/lmms_eval/tasks/coco_cap/coco2017_cap_val.yaml
+++ b/lmms_eval/tasks/coco_cap/coco2017_cap_val.yaml
@@ -11,7 +11,7 @@ doc_to_target: "answer"
 generation_kwargs:
   max_new_tokens: 64
   temperature: 0
-  top_p: 0
+  top_p: 1.0
   num_beams: 1
   do_sample: false
 process_results: !function utils.coco_process_result
diff --git a/lmms_eval/tasks/ferret/ferret.yaml b/lmms_eval/tasks/ferret/ferret.yaml
index 517649e7..249b711b 100755
--- a/lmms_eval/tasks/ferret/ferret.yaml
+++ b/lmms_eval/tasks/ferret/ferret.yaml
@@ -13,7 +13,7 @@ generation_kwargs:
   image_aspect_ratio: original
   max_new_tokens: 1024
   temperature: 0
-  top_p: 0
+  top_p: 1.0
   num_beams: 1
   do_sample: false
 process_results: !function utils.ferret_process_results
diff --git a/lmms_eval/tasks/flickr30k/flickr30k_test.yaml b/lmms_eval/tasks/flickr30k/flickr30k_test.yaml
index 737d9ff4..e38ae9e7 100755
--- a/lmms_eval/tasks/flickr30k/flickr30k_test.yaml
+++ b/lmms_eval/tasks/flickr30k/flickr30k_test.yaml
@@ -10,7 +10,7 @@ doc_to_target: "answer"
 generation_kwargs:
   max_new_tokens: 64
   temperature: 0
-  top_p: 0
+  top_p: 1.0
   num_beams: 1
   do_sample: false
 process_results: !function utils.flickr_process_result
diff --git a/lmms_eval/tasks/gqa/gqa.yaml b/lmms_eval/tasks/gqa/gqa.yaml
index 404e5a05..5d06c37b 100755
--- a/lmms_eval/tasks/gqa/gqa.yaml
+++ b/lmms_eval/tasks/gqa/gqa.yaml
@@ -11,7 +11,7 @@ doc_to_target: "answer"
 generation_kwargs:
   max_new_tokens: 16
   temperature: 0
-  top_p: 0
+  top_p: 1.0
   num_beams: 1
   do_sample: false
 metric_list:
diff --git a/lmms_eval/tasks/hallusion_bench/hallusion_bench_image.yaml b/lmms_eval/tasks/hallusion_bench/hallusion_bench_image.yaml
index 39a5be4c..d5a485ba 100755
--- a/lmms_eval/tasks/hallusion_bench/hallusion_bench_image.yaml
+++ b/lmms_eval/tasks/hallusion_bench/hallusion_bench_image.yaml
@@ -15,7 +15,7 @@ model_specific_prompt_kwargs:
 generation_kwargs:
   max_new_tokens: 128
   temperature: 0
-  top_p: 0
+  top_p: 1.0
   num_beams: 1
   do_sample: false
 metric_list:
diff --git a/lmms_eval/tasks/internal_eval/d170_cn.yaml b/lmms_eval/tasks/internal_eval/d170_cn.yaml
index ebb956e5..dc6e25b0 100755
--- a/lmms_eval/tasks/internal_eval/d170_cn.yaml
+++ b/lmms_eval/tasks/internal_eval/d170_cn.yaml
@@ -12,7 +12,7 @@ generation_kwargs:
     - "ASSISTANT:"
   max_new_tokens: 1024
   temperature: 0
-  top_p: 0
+  top_p: 1.0
   num_beams: 1
   do_sample: false
 process_results: !function d170_cn_utils.process_results # apply gpt eval here
diff --git a/lmms_eval/tasks/internal_eval/d170_en.yaml b/lmms_eval/tasks/internal_eval/d170_en.yaml
index ffae9a6c..7a4bada6 100755
--- a/lmms_eval/tasks/internal_eval/d170_en.yaml
+++ b/lmms_eval/tasks/internal_eval/d170_en.yaml
@@ -12,7 +12,7 @@ generation_kwargs:
     - "ASSISTANT:"
   max_new_tokens: 1024
   temperature: 0
-  top_p: 0
+  top_p: 1.0
   num_beams: 1
   do_sample: false
 process_results: !function d170_en_utils.process_results # apply gpt eval here
diff --git a/lmms_eval/tasks/internal_eval/dc100_en.yaml b/lmms_eval/tasks/internal_eval/dc100_en.yaml
index 729f323d..bf0a4bc9 100755
--- a/lmms_eval/tasks/internal_eval/dc100_en.yaml
+++ b/lmms_eval/tasks/internal_eval/dc100_en.yaml
@@ -12,7 +12,7 @@ generation_kwargs:
     - "ASSISTANT:"
   max_new_tokens: 1024
   temperature: 0
-  top_p: 0
+  top_p: 1.0
   num_beams: 1
   do_sample: false
 process_results: !function dc100_en_utils.process_results # apply gpt eval here
diff --git a/lmms_eval/tasks/internal_eval/dc200_cn.yaml b/lmms_eval/tasks/internal_eval/dc200_cn.yaml
index 37a888c8..28439e27 100755
--- a/lmms_eval/tasks/internal_eval/dc200_cn.yaml
+++ b/lmms_eval/tasks/internal_eval/dc200_cn.yaml
@@ -12,7 +12,7 @@ generation_kwargs:
     - "ASSISTANT:"
   max_new_tokens: 1024
   temperature: 0
-  top_p: 0
+  top_p: 1.0
   num_beams: 1
   do_sample: false
 process_results: !function dc200_cn_utils.process_results # apply gpt eval here
diff --git a/lmms_eval/tasks/livebench/livebench.yaml b/lmms_eval/tasks/livebench/livebench.yaml
index 620c3863..74b317bb 100644
--- a/lmms_eval/tasks/livebench/livebench.yaml
+++ b/lmms_eval/tasks/livebench/livebench.yaml
@@ -11,7 +11,7 @@ doc_to_target: "answer"
 generation_kwargs:
   max_new_tokens: 1024
   temperature: 0
-  top_p: 0
+  top_p: 1.0
   num_beams: 1
   do_sample: false
 process_results: !function utils.livebench_process_results
diff --git a/lmms_eval/tasks/llava-bench-coco/llava-bench-coco.yaml b/lmms_eval/tasks/llava-bench-coco/llava-bench-coco.yaml
index 0f7d3352..b104bcb7 100755
--- a/lmms_eval/tasks/llava-bench-coco/llava-bench-coco.yaml
+++ b/lmms_eval/tasks/llava-bench-coco/llava-bench-coco.yaml
@@ -13,7 +13,7 @@ generation_kwargs:
   image_aspect_ratio: original
   max_new_tokens: 1024
   temperature: 0
-  top_p: 0
+  top_p: 1.0
   num_beams: 1
 process_results: !function utils.llava_process_results
 metric_list:
diff --git a/lmms_eval/tasks/llava-in-the-wild/llava-in-the-wild.yaml b/lmms_eval/tasks/llava-in-the-wild/llava-in-the-wild.yaml
index 5cad8d9d..02e846c3 100755
--- a/lmms_eval/tasks/llava-in-the-wild/llava-in-the-wild.yaml
+++ b/lmms_eval/tasks/llava-in-the-wild/llava-in-the-wild.yaml
@@ -13,7 +13,7 @@ generation_kwargs:
   image_aspect_ratio: original
   max_new_tokens: 32768
   temperature: 0
-  top_p: 0
+  top_p: 1.0
   num_beams: 1
   do_sample: false
 process_results: !function utils.llava_process_results
diff --git a/lmms_eval/tasks/llava_wilder/_default_template_wilder_yaml b/lmms_eval/tasks/llava_wilder/_default_template_wilder_yaml
index 00409be0..37b744f5 100644
--- a/lmms_eval/tasks/llava_wilder/_default_template_wilder_yaml
+++ b/lmms_eval/tasks/llava_wilder/_default_template_wilder_yaml
@@ -8,7 +8,7 @@ generation_kwargs:
   image_aspect_ratio: original
   max_new_tokens: 4096
   temperature: 0
-  top_p: 0
+  top_p: 1.0
   num_beams: 1
   do_sample: false
 process_results: !function utils.llava_process_results
diff --git a/lmms_eval/tasks/mathvista/mathvista_test.yaml b/lmms_eval/tasks/mathvista/mathvista_test.yaml
index 171e06d2..31fc2a45 100755
--- a/lmms_eval/tasks/mathvista/mathvista_test.yaml
+++ b/lmms_eval/tasks/mathvista/mathvista_test.yaml
@@ -12,7 +12,7 @@ generation_kwargs:
     - "ASSISTANT:"
   max_new_tokens: 1024
   temperature: 0
-  top_p: 0
+  top_p: 1.0
   num_beams: 1
   do_sample: false
 process_results: !function utils.mathvista_process_results
diff --git a/lmms_eval/tasks/mathvista/mathvista_testmini.yaml b/lmms_eval/tasks/mathvista/mathvista_testmini.yaml
index 5efd86ce..90fd568a 100755
--- a/lmms_eval/tasks/mathvista/mathvista_testmini.yaml
+++ b/lmms_eval/tasks/mathvista/mathvista_testmini.yaml
@@ -12,7 +12,7 @@ generation_kwargs:
     - "ASSISTANT:"
   max_new_tokens: 1024
   temperature: 0
-  top_p: 0
+  top_p: 1.0
   num_beams: 1
   do_sample: false
 process_results: !function utils.mathvista_process_results
diff --git a/lmms_eval/tasks/mmbench/_default_template_mmbench_cn_yaml b/lmms_eval/tasks/mmbench/_default_template_mmbench_cn_yaml
index 81094620..6a699725 100755
--- a/lmms_eval/tasks/mmbench/_default_template_mmbench_cn_yaml
+++ b/lmms_eval/tasks/mmbench/_default_template_mmbench_cn_yaml
@@ -9,7 +9,7 @@ doc_to_text: !function cn_utils.mmbench_doc_to_text
 generation_kwargs:
   max_new_tokens: 256
   temperature: 0
-  top_p: 0
+  top_p: 1.0
   num_beams: 1
   do_sample: false
 process_results: !function cn_utils.mmbench_process_results
diff --git a/lmms_eval/tasks/mmbench/_default_template_mmbench_en_yaml b/lmms_eval/tasks/mmbench/_default_template_mmbench_en_yaml
index ab2b882c..060bd5d1 100755
--- a/lmms_eval/tasks/mmbench/_default_template_mmbench_en_yaml
+++ b/lmms_eval/tasks/mmbench/_default_template_mmbench_en_yaml
@@ -20,6 +20,6 @@ generation_kwargs:
     - "ASSISTANT:"
   max_new_tokens: 1024
   temperature: 0
-  top_p: 0
+  top_p: 1.0
   num_beams: 1
   do_sample: false
diff --git a/lmms_eval/tasks/mmbench/mmbench_cc.yaml b/lmms_eval/tasks/mmbench/mmbench_cc.yaml
index 4a0d5895..9b7b3cb1 100755
--- a/lmms_eval/tasks/mmbench/mmbench_cc.yaml
+++ b/lmms_eval/tasks/mmbench/mmbench_cc.yaml
@@ -11,7 +11,7 @@ doc_to_target: "answer"
 generation_kwargs:
   max_new_tokens: 256
   temperature: 0
-  top_p: 0
+  top_p: 1.0
   num_beams: 1
   do_sample: false
 process_results: !function cc_utils.mmbench_cn_cc_process_results
diff --git a/lmms_eval/tasks/mme/mme.yaml b/lmms_eval/tasks/mme/mme.yaml
index 3d665314..489745aa 100755
--- a/lmms_eval/tasks/mme/mme.yaml
+++ b/lmms_eval/tasks/mme/mme.yaml
@@ -10,7 +10,7 @@ doc_to_target: "answer"
 generation_kwargs:
   max_new_tokens: 16
   temperature: 0
-  top_p: 0
+  top_p: 1.0
   num_beams: 1
   do_sample: false
 # The return value of process_results will be used by metrics
diff --git a/lmms_eval/tasks/mmvet/mmvet.yaml b/lmms_eval/tasks/mmvet/mmvet.yaml
index f5b4962c..30c1907a 100755
--- a/lmms_eval/tasks/mmvet/mmvet.yaml
+++ b/lmms_eval/tasks/mmvet/mmvet.yaml
@@ -12,7 +12,7 @@ generation_kwargs:
     - "ASSISTANT:"
   max_new_tokens: 32768
   temperature: 0
-  top_p: 0
+  top_p: 1.0
   num_beams: 1
   do_sample: false
 process_results: !function utils.mmvet_process_results # apply gpt eval here
diff --git a/lmms_eval/tasks/nocaps/nocaps_test.yaml b/lmms_eval/tasks/nocaps/nocaps_test.yaml
index 9f21ce2b..d92ec327 100755
--- a/lmms_eval/tasks/nocaps/nocaps_test.yaml
+++ b/lmms_eval/tasks/nocaps/nocaps_test.yaml
@@ -11,7 +11,7 @@ doc_to_target: "annotations_captions"
 generation_kwargs:
   max_new_tokens: 64
   temperature: 0
-  top_p: 0
+  top_p: 1.0
   num_beams: 1
   do_sample: false
 process_results: !function utils.nocaps_test_process_result
diff --git a/lmms_eval/tasks/nocaps/nocaps_val.yaml b/lmms_eval/tasks/nocaps/nocaps_val.yaml
index 048066a6..125c6d77 100755
--- a/lmms_eval/tasks/nocaps/nocaps_val.yaml
+++ b/lmms_eval/tasks/nocaps/nocaps_val.yaml
@@ -11,7 +11,7 @@ doc_to_target: "annotations_captions"
 generation_kwargs:
   max_new_tokens: 64
   temperature: 0
-  top_p: 0
+  top_p: 1.0
   num_beams: 1
   do_sample: false
 process_results: !function utils.nocaps_process_result
diff --git a/lmms_eval/tasks/ocrbench/ocrbench.yaml b/lmms_eval/tasks/ocrbench/ocrbench.yaml
index 7957e7bf..243c98d3 100644
--- a/lmms_eval/tasks/ocrbench/ocrbench.yaml
+++ b/lmms_eval/tasks/ocrbench/ocrbench.yaml
@@ -10,7 +10,7 @@ doc_to_target: "answer"
 generation_kwargs:
   max_new_tokens: 128
   temperature: 0
-  top_p: 0
+  top_p: 1.0
   num_beams: 1
   do_sample: false
 process_results: !function utils.ocrbench_process_results
diff --git a/lmms_eval/tasks/olympiadbench/olympiadbench_test_cn.yaml b/lmms_eval/tasks/olympiadbench/olympiadbench_test_cn.yaml
index 574d0c19..95a13584 100644
--- a/lmms_eval/tasks/olympiadbench/olympiadbench_test_cn.yaml
+++ b/lmms_eval/tasks/olympiadbench/olympiadbench_test_cn.yaml
@@ -12,7 +12,7 @@ generation_kwargs:
     - "ASSISTANT:"
   max_new_tokens: 1024
   temperature: 0
-  top_p: 0
+  top_p: 1.0
   num_beams: 1
   do_sample: false
 process_results: !function cn_utils.olympiadbench_process_results
diff --git a/lmms_eval/tasks/olympiadbench/olympiadbench_test_en.yaml b/lmms_eval/tasks/olympiadbench/olympiadbench_test_en.yaml
index 6d293fb7..c87f7ab4 100644
--- a/lmms_eval/tasks/olympiadbench/olympiadbench_test_en.yaml
+++ b/lmms_eval/tasks/olympiadbench/olympiadbench_test_en.yaml
@@ -12,7 +12,7 @@ generation_kwargs:
     - "ASSISTANT:"
   max_new_tokens: 1024
   temperature: 0
-  top_p: 0
+  top_p: 1.0
   num_beams: 1
   do_sample: false
 process_results: !function en_utils.olympiadbench_process_results
diff --git a/lmms_eval/tasks/pope/pope.yaml b/lmms_eval/tasks/pope/pope.yaml
index 703fe3d8..715cf7d1 100755
--- a/lmms_eval/tasks/pope/pope.yaml
+++ b/lmms_eval/tasks/pope/pope.yaml
@@ -10,7 +10,7 @@ doc_to_target: "answer"
 generation_kwargs:
   max_new_tokens: 128
   temperature: 0
-  top_p: 0
+  top_p: 1.0
   num_beams: 1
   do_sample: false
 process_results: !function utils.pope_process_results
diff --git a/lmms_eval/tasks/realworldqa/realworldqa.yaml b/lmms_eval/tasks/realworldqa/realworldqa.yaml
index adb89be0..4b414967 100644
--- a/lmms_eval/tasks/realworldqa/realworldqa.yaml
+++ b/lmms_eval/tasks/realworldqa/realworldqa.yaml
@@ -11,7 +11,7 @@ doc_to_target: "answer"
 generation_kwargs:
   max_new_tokens: 16
   temperature: 0
-  top_p: 0
+  top_p: 1.0
   num_beams: 1
   do_sample: false
diff --git a/lmms_eval/tasks/synthdog/synthdog_en.yaml b/lmms_eval/tasks/synthdog/synthdog_en.yaml
index b08b9cb4..d64cad78 100644
--- a/lmms_eval/tasks/synthdog/synthdog_en.yaml
+++ b/lmms_eval/tasks/synthdog/synthdog_en.yaml
@@ -10,7 +10,7 @@ doc_to_target: !function utils.synthdog_doc_to_target
 generation_kwargs:
   max_new_tokens: 1024
   temperature: 0
-  top_p: 0
+  top_p: 1.0
   num_beams: 1
   do_sample: false
 process_results: !function utils.synthdog_process_results
diff --git a/lmms_eval/tasks/synthdog/synthdog_zh.yaml b/lmms_eval/tasks/synthdog/synthdog_zh.yaml
index 3ffbb3b0..4a895ac2 100644
--- a/lmms_eval/tasks/synthdog/synthdog_zh.yaml
+++ b/lmms_eval/tasks/synthdog/synthdog_zh.yaml
@@ -10,7 +10,7 @@ doc_to_target: !function utils.synthdog_doc_to_target
 generation_kwargs:
   max_new_tokens: 1024
   temperature: 0
-  top_p: 0
+  top_p: 1.0
   num_beams: 1
   do_sample: false
 process_results: !function utils.synthdog_process_results
diff --git a/lmms_eval/tasks/textcaps/textcaps_test.yaml b/lmms_eval/tasks/textcaps/textcaps_test.yaml
index 26369220..f5c2a915 100755
--- a/lmms_eval/tasks/textcaps/textcaps_test.yaml
+++ b/lmms_eval/tasks/textcaps/textcaps_test.yaml
@@ -11,7 +11,7 @@ doc_to_target: "answer"
 generation_kwargs:
   max_new_tokens: 64
   temperature: 0
-  top_p: 0
+  top_p: 1.0
   num_beams: 1
   do_sample: false
 process_results: !function utils.textcaps_test_process_result
diff --git a/lmms_eval/tasks/textcaps/textcaps_train.yaml b/lmms_eval/tasks/textcaps/textcaps_train.yaml
index 931aeb17..9422bb4b 100755
--- a/lmms_eval/tasks/textcaps/textcaps_train.yaml
+++ b/lmms_eval/tasks/textcaps/textcaps_train.yaml
@@ -13,7 +13,7 @@ generation_kwargs:
     - "ASSISTANT:"
   max_new_tokens: 1024
   temperature: 0
-  top_p: 0
+  top_p: 1.0
   num_beams: 1
   do_sample: false
 process_results: !function utils.textcaps_process_result
diff --git a/lmms_eval/tasks/textcaps/textcaps_val.yaml b/lmms_eval/tasks/textcaps/textcaps_val.yaml
index 41baec8e..426b1705 100755
--- a/lmms_eval/tasks/textcaps/textcaps_val.yaml
+++ b/lmms_eval/tasks/textcaps/textcaps_val.yaml
@@ -11,7 +11,7 @@ doc_to_target: "answer"
 generation_kwargs:
   max_new_tokens: 64
   temperature: 0
-  top_p: 0
+  top_p: 1.0
   num_beams: 1
   do_sample: false
 process_results: !function utils.textcaps_process_result
diff --git a/lmms_eval/tasks/video_detail_description/utils.py b/lmms_eval/tasks/video_detail_description/utils.py
index 9fb8c4b2..f2a6878c 100755
--- a/lmms_eval/tasks/video_detail_description/utils.py
+++ b/lmms_eval/tasks/video_detail_description/utils.py
@@ -215,10 +215,11 @@ def gpt_eval(results):
         # Update the dictionary with the new entries
         updated_dict = {
             "video_name": data_dict["video_name"],
+            "review": review,
             "score": score,
-            "Q": question,
-            "A": answer,
-            "pred": pred,
+            # "Q": question,
+            # "A": answer,
+            # "pred": pred,
         }
         evaluated_results.append(updated_dict)
diff --git a/lmms_eval/tasks/videochatgpt/videochatgpt_consistency.yaml b/lmms_eval/tasks/videochatgpt/videochatgpt_consistency.yaml
index 84e6a8c0..85056183 100755
--- a/lmms_eval/tasks/videochatgpt/videochatgpt_consistency.yaml
+++ b/lmms_eval/tasks/videochatgpt/videochatgpt_consistency.yaml
@@ -18,7 +18,7 @@ generation_kwargs:
   image_aspect_ratio: original
   max_new_tokens: 1024
   temperature: 0
-  top_p: 0
+  top_p: 1.0
   num_beams: 1
   do_sample: false
\ No newline at end of file
diff --git a/lmms_eval/tasks/videochatgpt/videochatgpt_temporal.yaml b/lmms_eval/tasks/videochatgpt/videochatgpt_temporal.yaml
index 1bf336ce..18754afd 100755
--- a/lmms_eval/tasks/videochatgpt/videochatgpt_temporal.yaml
+++ b/lmms_eval/tasks/videochatgpt/videochatgpt_temporal.yaml
@@ -18,6 +18,6 @@ generation_kwargs:
   image_aspect_ratio: original
   max_new_tokens: 1024
   temperature: 0
-  top_p: 0
+  top_p: 1.0
   num_beams: 1
   do_sample: false
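Finally, the video-detail-description judge output is reshaped: each evaluated entry now records the judge's free-text `review` beside the numeric `score`, while `Q`, `A`, and `pred` are commented out since they can be re-joined from the source annotations via `video_name`. A sketch of the resulting schema (this helper is hypothetical; the patch builds the dict inline inside `gpt_eval`):

```python
# Hedged sketch of the reshaped result entry; this helper is hypothetical,
# the patch builds the same dict inline inside gpt_eval().
def build_result_entry(data_dict: dict, review: str, score: float) -> dict:
    return {
        "video_name": data_dict["video_name"],
        "review": review,  # keep the judge's rationale so scores can be audited
        "score": score,
        # "Q", "A", "pred" omitted: recoverable from the dataset by video_name
    }
```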