From ec44ab15d9e3623818203b75cca6c14fea32395b Mon Sep 17 00:00:00 2001 From: Li Bo Date: Fri, 24 May 2024 17:02:52 +0800 Subject: [PATCH] [WIP] adding video datasets (#93) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Add COCO, RefCOCO, RefCOCO+, RefCOCOg (#5) * Update author name and email in pyproject.toml * add mmvet and try to modify llava arch * Add coco, refcoco support * Fix doc_to_visual error * Fix segmentation mask error * Add refcoco+, refcocog * Remove debug code * black lint * Remove unused code and scripts * Fix group stderr N/A error between str and int * Fix letter case issue * Update lmms_eval tasks and utils * Fix coco test_split name * Add llava-bench-in-the-wild support * Black codestyle, lint * Add COCO evaluation metric * Add refcoco, refcocog, refcoco+ evaluation kit * Add llava bench coco support --------- Co-authored-by: Bo Li * VQAv2 eval (#4) * vqav2 * Add vqav2_process_results function and update vqav2_doc_to_text function * Implement vqav2_process_results function to return exact match score * Refactor fewshot_docs() to use config.fewshot_config * Refactor Task class to handle fewshot_docs when training and validation docs are not available * Add answer processing logic in vqav2_process_results function * Refactor vqav2_process_results function and add submission aggregation * Add vqav2_aggreate_submissions function to utils.py * textvqa * Refactor answer processing in textvqa_process_results() function * textvqa eval * Update dataset path and modify textvqa_doc_to_text function * Capitalize the question in textvqa_doc_to_text function * Update textvqa.yaml and utils.py * Fix formatting issues in lmms_eval/api/task.py, lmms_eval/tasks/gqa/utils.py, lmms_eval/tasks/textvqa/utils.py, and lmms_eval/tasks/vqav2/utils.py --------- Co-authored-by: Li Bo * [Big Changes] add LLaVA-1.6, MMVet, LLaVA-W, POPE, and many other changes on logs, model args. 
(#7) * Update author name and email in pyproject.toml * add mmvet and try to modify llava arch * black lint * Remove unused code and scripts * Update lmms_eval tasks and utils * Update LMMS-Eval dependencies and configurations * Squashed commit of the following: commit 209f3904f33210bec0b4b146e96fcbd67a4e1541 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Wed Jan 17 20:27:13 2024 +0800 Add COCO, RefCOCO, RefCOCO+, RefCOCOg (#5) * Update author name and email in pyproject.toml * add mmvet and try to modify llava arch * Add coco, refcoco support * Fix doc_to_visual error * Fix segmentation mask error * Add refcoco+, refcocog * Remove debug code * black lint * Remove unused code and scripts * Fix group stderr N/A error between str and int * Fix letter case issue * Update lmms_eval tasks and utils * Fix coco test_split name * Add llava-bench-in-the-wild support * Black codestyle, lint * Add COCO evaluation metric * Add refcoco, refcocog, refcoco+ evaluation kit * Add llava bench coco support --------- Co-authored-by: Bo Li commit f102f038a161fe667628accd2d9daa33e70fe74f Author: Zhang Peiyuan Date: Wed Jan 17 20:26:58 2024 +0800 Update utils.py (#6) * Fix logging issue and remove unnecessary whitespace * Add openai and pycocoevalcap dependencies * Fix device mapping issue in Llava constructor * Add support for truncating context in generation * Update Llava model and evaluation configuration * Update YAML configuration files * Update YAML configuration files * add otterhd and gemini models * Add support for custom image aspect ratio in Llava model * Add dataset_kwargs and max_gen_toks to YAML files * Fix log_samples suffix typo and use hash for output name * Refactor LMMS evaluation code and update LLAVA model properties * matched response for mistral-llava * Refactor logging in llava_aggregation function * Print evaluation statistics instead of logging them * Fix logging information in llava_aggregation function * Add new models and dataset_kwargs for 
COCO tasks * Update truncate_context parameter in Llava class constructor * Update dataset_kwargs in YAML files * Remove issue type tags from issue and pull request templates * add mmvet and try to modify llava arch * black lint * Update lmms_eval tasks and utils * Update LMMS-Eval dependencies and configurations * Squashed commit of the following: commit 209f3904f33210bec0b4b146e96fcbd67a4e1541 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Wed Jan 17 20:27:13 2024 +0800 Add COCO, RefCOCO, RefCOCO+, RefCOCOg (#5) * Update author name and email in pyproject.toml * add mmvet and try to modify llava arch * Add coco, refcoco support * Fix doc_to_visual error * Fix segmentation mask error * Add refcoco+, refcocog * Remove debug code * black lint * Remove unused code and scripts * Fix group stderr N/A error between str and int * Fix letter case issue * Update lmms_eval tasks and utils * Fix coco test_split name * Add llava-bench-in-the-wild support * Black codestyle, lint * Add COCO evaluation metric * Add refcoco, refcocog, refcoco+ evaluation kit * Add llava bench coco support --------- Co-authored-by: Bo Li commit f102f038a161fe667628accd2d9daa33e70fe74f Author: Zhang Peiyuan Date: Wed Jan 17 20:26:58 2024 +0800 Update utils.py (#6) * Fix logging issue and remove unnecessary whitespace * Add openai and pycocoevalcap dependencies * Fix device mapping issue in Llava constructor * Add support for truncating context in generation * Update Llava model and evaluation configuration * Update YAML configuration files * Update YAML configuration files * add otterhd and gemini models * Add support for custom image aspect ratio in Llava model * Add dataset_kwargs and max_gen_toks to YAML files * Fix log_samples suffix typo and use hash for output name * Refactor LMMS evaluation code and update LLAVA model properties * matched response for mistral-llava * Refactor logging in llava_aggregation function * Print evaluation statistics instead of logging them * Fix 
logging information in llava_aggregation function * Add new models and dataset_kwargs for COCO tasks * Update truncate_context parameter in Llava class constructor * Update dataset_kwargs in YAML files * Remove issue type tags from issue and pull request templates * Refactor pope utils functions * Update transformers dependency to version 4.36.2 * Revise llava-in-the-wild prompt for align * Add default values for gen_kwargs in Llava class * Fix formatting issues and import pdb for debugging * Remove pdb.set_trace() and update default value for max_new_tokens * Add llava loglikelihood * Fix formatting and indentation issues in lmms_eval/api/metrics.py and lmms_eval/models/llava.py * Update function to handle edge cases This commit updates the function to handle edge cases, improving the overall reliability and robustness of the code. * Update black version in pre-commit config * Remove duplicate lines in gqa * Another way to solve memory issue * Handle exception in model generation * Refactor pope_aggregate_results to use "score" key instead of "pope_accuracy" * Update pope metrics aggregation functions * Add model_to_prompt in pope.yaml * Update pope.yaml configuration * Refactor code to simplify construct_requests call --------- Co-authored-by: kcz358 <92624596+kcz358@users.noreply.github.com> * Add datetime to output name in cli_evaluate function Add get_datetime_str function to utils.py * Refactor pope_aggregate_f1_score function * Fix datetime format in get_datetime_str function * Update JSON dump indentation in cli_evaluate function * Add datetime to output name in cli_evaluate function (#10) * Revert "Add datetime to output name in cli_evaluate function" This reverts commit ef26f78c46b50d8769a4fb6990b909162c2881c3. * Add datetime to output name in cli_evaluate function * [Datasets] Added POPE and Aligned. 
(#11) * Update generation_kwargs in pope.yaml * Update pope_doc_to_text function * [Dataset] Add SEED-Bench, TextCaps, NoCaps (#12) * Change coco from print to logger * Add llava loglikelihood * Add Nocaps support * Fix pass through function * Add textcaps support * Fix textcaps eval image_id * Add seedbench support * Add seedbench ppl evaluation * black lint * [Datasets] Add four internal evaluation datasets (#13) * Update generation_kwargs in pope.yaml * Update pope_doc_to_text function * Remove unused variable in mmvet_process_results function * Remove unused imports in utils.py * Refactor get_chat_response function to include retries for API requests * Update gpt_eval_model_name in lmms_eval/tasks/dc100_en.yaml and add retry logic in get_chat_response function * Update prompt variable in lmms_eval tasks * Refactor output_name variable in cli_evaluate function * Fix logging message in mmvet_process_results function * Update sleep time in get_chat_response function * Merge commit 'fec494dbe5971e8fa5a886b191a4781be3ce7a6f' * Refactor get_eval function to include retries * Add token parameter to load_dataset function in gqa_doc_to_visual * Refactor llava_process_results and llava_aggregation functions * [Datasets] Add four internal evaluation datasets (#13) * Update generation_kwargs in pope.yaml * Update pope_doc_to_text function * Remove unused variable in mmvet_process_results function * Remove unused imports in utils.py * Refactor get_chat_response function to include retries for API requests * Update gpt_eval_model_name in lmms_eval/tasks/dc100_en.yaml and add retry logic in get_chat_response function * Update prompt variable in lmms_eval tasks * Refactor output_name variable in cli_evaluate function * Fix logging message in mmvet_process_results function * Update sleep time in get_chat_response function * Merge commit 'fec494dbe5971e8fa5a886b191a4781be3ce7a6f' * Refactor get_eval function to include retries * Add token parameter to load_dataset function in 
gqa_doc_to_visual * Refactor llava_process_results and llava_aggregation functions * add mmmu (#15) * add mmme * black * add mmmu (#15) * add mmme * black * [Memory issue] Solve memory issue for building context (#14) * Update generation_kwargs in pope.yaml * Update pope_doc_to_text function * Remove unused variable in mmvet_process_results function * Remove unused imports in utils.py * Refactor get_chat_response function to include retries for API requests * Update gpt_eval_model_name in lmms_eval/tasks/dc100_en.yaml and add retry logic in get_chat_response function * Update prompt variable in lmms_eval tasks * Refactor output_name variable in cli_evaluate function * Fix logging message in mmvet_process_results function * Update sleep time in get_chat_response function * Merge commit 'fec494dbe5971e8fa5a886b191a4781be3ce7a6f' * Refactor get_eval function to include retries * Add token parameter to load_dataset function in gqa_doc_to_visual * Refactor llava_process_results and llava_aggregation functions * Remove unused function llava_aggregation * Refractor llava-bench aggregation code * Add logs and scripts to .gitignore, and set image_aspect_ratio to original in scienceqa.yaml * Update generation parameters in scienceqa.yaml * Solve memory issue for building context * Solved gather result error * Update lmms_eval scienceqa_img config * Fixed nocaps store results * Revise seedbench prompt * Squashed commit of the following: commit 290126e6a269db4cca9b3544bd017d6c17012793 Author: Zhang Peiyuan Date: Wed Jan 24 14:07:36 2024 +0800 add mmmu (#15) * add mmme * black commit 8b0227cd7b2602d096d773a01b2199d1f4110f22 Author: Li Bo Date: Wed Jan 24 10:00:33 2024 +0800 [Datasets] Add four internal evaluation datasets (#13) * Update generation_kwargs in pope.yaml * Update pope_doc_to_text function * Remove unused variable in mmvet_process_results function * Remove unused imports in utils.py * Refactor get_chat_response function to include retries for API requests * Update 
gpt_eval_model_name in lmms_eval/tasks/dc100_en.yaml and add retry logic in get_chat_response function * Update prompt variable in lmms_eval tasks * Refactor output_name variable in cli_evaluate function * Fix logging message in mmvet_process_results function * Update sleep time in get_chat_response function * Merge commit 'fec494dbe5971e8fa5a886b191a4781be3ce7a6f' * Refactor get_eval function to include retries * Add token parameter to load_dataset function in gqa_doc_to_visual * Refactor llava_process_results and llava_aggregation functions commit fec494dbe5971e8fa5a886b191a4781be3ce7a6f Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Tue Jan 23 19:17:40 2024 +0800 [Dataset] Add SEED-Bench, TextCaps, NoCaps (#12) * Change coco from print to logger * Add llava loglikelihood * Add Nocaps support * Fix pass through function * Add textcaps support * Fix textcaps eval image_id * Add seedbench support * Add seedbench ppl evaluation * black lint commit 4c3c2c63a681f29c537c2467957de1a90568748d Author: Li Bo Date: Tue Jan 23 19:17:12 2024 +0800 [Datasets] Added POPE and Aligned. 
(#11) * Update generation_kwargs in pope.yaml * Update pope_doc_to_text function --------- Co-authored-by: Bo Li * [Memory issue] Solve memory issue for building context (#14) * Update generation_kwargs in pope.yaml * Update pope_doc_to_text function * Remove unused variable in mmvet_process_results function * Remove unused imports in utils.py * Refactor get_chat_response function to include retries for API requests * Update gpt_eval_model_name in lmms_eval/tasks/dc100_en.yaml and add retry logic in get_chat_response function * Update prompt variable in lmms_eval tasks * Refactor output_name variable in cli_evaluate function * Fix logging message in mmvet_process_results function * Update sleep time in get_chat_response function * Merge commit 'fec494dbe5971e8fa5a886b191a4781be3ce7a6f' * Refactor get_eval function to include retries * Add token parameter to load_dataset function in gqa_doc_to_visual * Refactor llava_process_results and llava_aggregation functions * Remove unused function llava_aggregation * Refractor llava-bench aggregation code * Add logs and scripts to .gitignore, and set image_aspect_ratio to original in scienceqa.yaml * Update generation parameters in scienceqa.yaml * Solve memory issue for building context * Solved gather result error * Update lmms_eval scienceqa_img config * Fixed nocaps store results * Revise seedbench prompt * Squashed commit of the following: commit c3cc24a89415aeccad31ccbb10642af677cd6fe5 Author: Zhang Peiyuan Date: Wed Jan 24 14:07:36 2024 +0800 add mmmu (#15) * add mmme * black commit 0dbc5d16c4f45ebea8def5f0bc1a36fcd93f9a05 Author: Li Bo Date: Wed Jan 24 10:00:33 2024 +0800 [Datasets] Add four internal evaluation datasets (#13) * Update generation_kwargs in pope.yaml * Update pope_doc_to_text function * Remove unused variable in mmvet_process_results function * Remove unused imports in utils.py * Refactor get_chat_response function to include retries for API requests * Update gpt_eval_model_name in 
lmms_eval/tasks/dc100_en.yaml and add retry logic in get_chat_response function * Update prompt variable in lmms_eval tasks * Refactor output_name variable in cli_evaluate function * Fix logging message in mmvet_process_results function * Update sleep time in get_chat_response function * Merge commit 'fec494dbe5971e8fa5a886b191a4781be3ce7a6f' * Refactor get_eval function to include retries * Add token parameter to load_dataset function in gqa_doc_to_visual * Refactor llava_process_results and llava_aggregation functions commit fec494dbe5971e8fa5a886b191a4781be3ce7a6f Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Tue Jan 23 19:17:40 2024 +0800 [Dataset] Add SEED-Bench, TextCaps, NoCaps (#12) * Change coco from print to logger * Add llava loglikelihood * Add Nocaps support * Fix pass through function * Add textcaps support * Fix textcaps eval image_id * Add seedbench support * Add seedbench ppl evaluation * black lint commit 4c3c2c63a681f29c537c2467957de1a90568748d Author: Li Bo Date: Tue Jan 23 19:17:12 2024 +0800 [Datasets] Added POPE and Aligned. 
(#11) * Update generation_kwargs in pope.yaml * Update pope_doc_to_text function --------- Co-authored-by: Bo Li * Add output path file naming convention (#16) Update datetime format in get_datetime_str() function * Add output path file naming convention (#16) Update datetime format in get_datetime_str() function * [Datasets] modify NoCaps data path and prompts (#17) * Merge commit '767f7e2cae60cf67ec5878234d84321395a3ed15' * Update dataset paths and improve user prompts * [Datasets] modify NoCaps data path and prompts (#17) * Merge commit '95ef3ea519cbd772924f9a6afa5394979eb00432' * Update dataset paths and improve user prompts * [Dataset] Add flickr30k (#18) * Add flickr30k support * Black lint * Align prompt with NoCaps * [Dataset] Add flickr30k (#18) * Add flickr30k support * Black lint * Align prompt with NoCaps * add model specific prompt and gen kwargs in sqa (#19) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add model specific prompt and gen kwargs in sqa (#19) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * Dev/add chartqa and ai2d (#23) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * Add 'submissions/' directory to .gitignore * Add Python setup and Black version installation workflow Refactor ContextSampler class in samplers.py Remove unnecessary line in DecontaminationFilter class Update dependencies in pyproject.toml * Refactor code in ContextSampler class --------- Co-authored-by: Bo Li * Dev/add chartqa and ai2d (#23) * add mmme * black * add model specific prompt and gen kwargs * black * add 
yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * Add 'submissions/' directory to .gitignore * Add Python setup and Black version installation workflow Refactor ContextSampler class in samplers.py Remove unnecessary line in DecontaminationFilter class Update dependencies in pyproject.toml * Refactor code in ContextSampler class --------- Co-authored-by: Bo Li * [Datasets] Changes for Flickr30K and NoCaps, also merged Peiyuan's Model Specific Prompt. (#20) * Merge commit '767f7e2cae60cf67ec5878234d84321395a3ed15' * Update dataset paths and improve user prompts * Add submission folder and update file paths for storing prediction results * Merge commit '842fbc6f2da7d9a118adf9ec27c3d8542d74168e' * Update dataset_path in flickr30k.yaml * Add coco_val and coco_test tasks to coco.yaml * Squashed commit of the following: commit 542a34dc5721ecdff6c5c68b0568692ad3a17149 Author: jzhang38 Date: Thu Jan 25 11:59:12 2024 +0800 refactor multi model code commit 3c397b8af85192b1821b3b6a0d8b8df746b5347c Author: jzhang38 Date: Thu Jan 25 11:51:16 2024 +0800 print table at the end commit e7b8a2d1f1e7337f02298efafd2ebf81543f4f85 Author: jzhang38 Date: Thu Jan 25 11:20:59 2024 +0800 add yaml config to supprot multi-model eval commit 2626383d99b5eac59d531ca0f293df960570c524 Author: jzhang38 Date: Thu Jan 25 10:39:42 2024 +0800 black commit 8349935fe145e33af0007ad4fb0d71fd925be7a0 Merge: 7e8b57d 4d11dce Author: jzhang38 Date: Thu Jan 25 10:37:57 2024 +0800 resolve conflicts in sqa commit d4e8e2552d40752bfdc5bbf4cd962c1798096258 Author: jzhang38 Date: Thu Jan 25 10:36:46 2024 +0800 add model specific prompt and gen kwargs commit 842fbc6f2da7d9a118adf9ec27c3d8542d74168e Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Thu Jan 25 09:47:31 2024 +0800 [Dataset] Add flickr30k (#18) * Add flickr30k support * Black lint * Align prompt with 
NoCaps commit 4bf0504fabc3b62f356c467b2fd1119083d27313 Author: Li Bo Date: Wed Jan 24 22:10:14 2024 +0800 [Datasets] modify NoCaps data path and prompts (#17) * Merge commit '767f7e2cae60cf67ec5878234d84321395a3ed15' * Update dataset paths and improve user prompts commit 520c7a2cafe60810aca79df814ce6829d4576032 Author: jzhang38 Date: Wed Jan 24 13:56:51 2024 +0800 black commit 3a633240327c078fa4f5a75dbd38ad5bc0d468dd Author: jzhang38 Date: Wed Jan 24 13:55:43 2024 +0800 add mmme * Squashed commit of the following: commit 542a34dc5721ecdff6c5c68b0568692ad3a17149 Author: jzhang38 Date: Thu Jan 25 11:59:12 2024 +0800 refactor multi model code commit 3c397b8af85192b1821b3b6a0d8b8df746b5347c Author: jzhang38 Date: Thu Jan 25 11:51:16 2024 +0800 print table at the end commit e7b8a2d1f1e7337f02298efafd2ebf81543f4f85 Author: jzhang38 Date: Thu Jan 25 11:20:59 2024 +0800 add yaml config to supprot multi-model eval commit 2626383d99b5eac59d531ca0f293df960570c524 Author: jzhang38 Date: Thu Jan 25 10:39:42 2024 +0800 black commit 8349935fe145e33af0007ad4fb0d71fd925be7a0 Merge: 7e8b57d 4d11dce Author: jzhang38 Date: Thu Jan 25 10:37:57 2024 +0800 resolve conflicts in sqa commit d4e8e2552d40752bfdc5bbf4cd962c1798096258 Author: jzhang38 Date: Thu Jan 25 10:36:46 2024 +0800 add model specific prompt and gen kwargs commit 520c7a2cafe60810aca79df814ce6829d4576032 Author: jzhang38 Date: Wed Jan 24 13:56:51 2024 +0800 black commit 3a633240327c078fa4f5a75dbd38ad5bc0d468dd Author: jzhang38 Date: Wed Jan 24 13:55:43 2024 +0800 add mmme * Squashed commit of the following: commit b13a805623dfd9d826ddd440e1b5ecde773fbb12 Author: Zhang Peiyuan Date: Thu Jan 25 17:08:25 2024 +0800 add model specific prompt and gen kwargs in sqa (#19) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * Fix cli itself can not run with config file * Fix bug in login functionality Refactor code 
for better performance Add new feature for user authentication Update UI layout for improved user experience Fix typo in variable name Optimize database queries for faster response time Add error handling for edge cases Update dependencies to latest versions Remove unused code Improve code readability and maintainability * Refactor get_task_dict function to handle nested groups * Add submission file for coco, flickr30k, nocaps, and textcaps tasks * Remove unused files and update task configuration * Fix tasks issue for nocaps, refcoco/+/g * Fix file path and raise error if config file does not exist * Exclude train in refcoco/+/g config * Solve doc_iterator_for_counting crashing issue * Black lint * Refactor code to improve performance and readability * Squashed commit of the following: commit a2cc9303dc72e4d53983bb56e54a32e977c3e270 Author: JvThunder Date: Fri Jan 26 01:03:57 2024 +0800 change okvqa yaml commit 35e87e7c7a480d005abf607c2527a35457d92311 Author: JvThunder Date: Fri Jan 26 00:55:40 2024 +0800 change yaml commit 89755323596b85208ed33aa88c296604a39af6eb Author: JvThunder Date: Fri Jan 26 00:42:43 2024 +0800 add okvqa task commit b13a805623dfd9d826ddd440e1b5ecde773fbb12 Author: Zhang Peiyuan Date: Thu Jan 25 17:08:25 2024 +0800 add model specific prompt and gen kwargs in sqa (#19) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * Squashed commit of the following: commit 0b0d30dfb247c5f0b7b68398b9e9fcde74cf7fa2 Author: JvThunder Date: Fri Jan 26 01:06:02 2024 +0800 change ocr reference commit e273f9cbd91540df86bdbc652bff88a847bd0d2d Author: JvThunder Date: Fri Jan 26 01:05:46 2024 +0800 revert example_eval commit e84126aaaf8a07bd371a0571a914ccbcd3697f20 Author: JvThunder Date: Fri Jan 26 00:17:28 2024 +0800 edit vizwiz utils commit 110deab53dc1a2fd349b1872cd261b69074c5fa8 Author: JvThunder Date: Thu Jan 25 23:49:47 2024 +0800 reorganize 
__init__ commit 0fa3e0c40075997ea80ed976bdee9615f17d3ece Author: JvThunder Date: Thu Jan 25 23:46:20 2024 +0800 minor fixes commit 2aaca579120def99860f90054233f3358950fa66 Author: JvThunder Date: Thu Jan 25 17:41:03 2024 +0800 add vizwizvqa eval rask commit b13a805623dfd9d826ddd440e1b5ecde773fbb12 Author: Zhang Peiyuan Date: Thu Jan 25 17:08:25 2024 +0800 add model specific prompt and gen kwargs in sqa (#19) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * Refactor mathvista.yaml and utils.py * Add gpt_eval_score to mathvista_process_results * Refactor mathvista_aggregate_results to return average accuracy score * Fix refcoco evaluation error * Fix evaluation problem for refcoco+/g * Refactor mathvista.yaml and mathvista_evals.py * Add dependencies and update YAML files * Refactor mmbench_en/utils.py to save test results to separate Excel file * Fix caption task prompt * Add group field to mmbench_en_test and mmbench_en_val yaml files * Delete mmbench_en_val.yaml file * Update mmbench_cn.yaml and mmbench_cn_test.yaml * Update mmbench_cn_val.yaml and utils.py * Remove unused fields in mmbench_cn_cc_process_results function * Update aggregation function for mmbench_en_dev.yaml * Fix capitalization of L2-category key in utils.py * Fix variable name in mmbench_process_results function * Delete mmbench_cn_val.yaml file * Update mathvista_test.yaml and mathvista_testmini.yaml * Fix warnings and update mathvista.yaml * Remove system message from MathVistaEvaluator * Update GPT model version in MathVistaEvaluator constructor * Update GQA_RAW_IMAGE_DATASET path in utils.py * change vizwiz to test set * Add split flag to mathvista_aggregate_results function * Add higher_is_better: false to gpt_eval_info metric in d170_cn, d170_en, dc100_en, and dc200_cn yaml files * Add download configuration for dataset * Update GQA_RAW_IMAGE_DATASET path in utils.py * 
add datasets * Update gpt_eval_model_name in mathvista.yaml * Merge commit '817eb057bcb61226b33d3ac3c8def01c36c90f96' * Update pyproject.toml with dependencies and URLs * Squashed commit of the following: commit f253968ad703f682a29317bdd51ec6c1fd7c5465 Author: Zhang Peiyuan Date: Sat Jan 27 13:56:37 2024 +0800 Dev/add chartqa and ai2d (#23) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * Add 'submissions/' directory to .gitignore * Add Python setup and Black version installation workflow Refactor ContextSampler class in samplers.py Remove unnecessary line in DecontaminationFilter class Update dependencies in pyproject.toml * Refactor code in ContextSampler class --------- Co-authored-by: Bo Li * Refactor image processing and submission file path * Refactor directory creation logic in cli_evaluate_single function * Update dataset path and test split in vqav2.yaml * Remove "total" column from cap_details_columns DataFrame * Add retry logic for dataset download * Add 'tenacity' to dependencies in pyproject.toml * Refactor code in ContextSampler class * Update Black version and configuration, and improve code readability in ContextSampler * Update Black version and line length --------- Co-authored-by: kcz358 <92624596+kcz358@users.noreply.github.com> Co-authored-by: Fanyi Pu * [Datasets] Changes for Flickr30K and NoCaps, also merged Peiyuan's Model Specific Prompt. 
(#20) * Merge commit '95ef3ea519cbd772924f9a6afa5394979eb00432' * Update dataset paths and improve user prompts * Add submission folder and update file paths for storing prediction results * Merge commit 'ecb47d73d6e000b472be6c5c0cdc9413c7734384' * Update dataset_path in flickr30k.yaml * Add coco_val and coco_test tasks to coco.yaml * Squashed commit of the following: commit 4d48d0c9b88e62dfebe05ec909b7f1851e9cd75d Author: jzhang38 Date: Thu Jan 25 11:59:12 2024 +0800 refactor multi model code commit 4a4b7bec200c72332b61a0c277cd8f8a34e4f721 Author: jzhang38 Date: Thu Jan 25 11:51:16 2024 +0800 print table at the end commit 63739fc6fa0a462d807ae81de0db0173102de584 Author: jzhang38 Date: Thu Jan 25 11:20:59 2024 +0800 add yaml config to supprot multi-model eval commit edcc752f97ea3845cefad56624e5d2855066f680 Author: jzhang38 Date: Thu Jan 25 10:39:42 2024 +0800 black commit 41f4b63d3a6e83babe92bac32a7432a8ef740bb5 Merge: 7e8b57d 4d11dce Author: jzhang38 Date: Thu Jan 25 10:37:57 2024 +0800 resolve conflicts in sqa commit 7e8b57d3bcc21d2a049d3abbc8a8201631641db4 Author: jzhang38 Date: Thu Jan 25 10:36:46 2024 +0800 add model specific prompt and gen kwargs commit ecb47d73d6e000b472be6c5c0cdc9413c7734384 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Thu Jan 25 09:47:31 2024 +0800 [Dataset] Add flickr30k (#18) * Add flickr30k support * Black lint * Align prompt with NoCaps commit dc23f4b42b1dd60b41904d7ddbee1412d6851077 Author: Li Bo Date: Wed Jan 24 22:10:14 2024 +0800 [Datasets] modify NoCaps data path and prompts (#17) * Merge commit '95ef3ea519cbd772924f9a6afa5394979eb00432' * Update dataset paths and improve user prompts commit 5f55126484a7c9325db586d26cf2052538222804 Author: jzhang38 Date: Wed Jan 24 13:56:51 2024 +0800 black commit aa6f8853cf82384fb3b15306fec4769212fbc5ab Author: jzhang38 Date: Wed Jan 24 13:55:43 2024 +0800 add mmme * Squashed commit of the following: commit 4d48d0c9b88e62dfebe05ec909b7f1851e9cd75d Author: jzhang38 Date: Thu Jan 
25 11:59:12 2024 +0800 refactor multi model code commit 4a4b7bec200c72332b61a0c277cd8f8a34e4f721 Author: jzhang38 Date: Thu Jan 25 11:51:16 2024 +0800 print table at the end commit 63739fc6fa0a462d807ae81de0db0173102de584 Author: jzhang38 Date: Thu Jan 25 11:20:59 2024 +0800 add yaml config to supprot multi-model eval commit edcc752f97ea3845cefad56624e5d2855066f680 Author: jzhang38 Date: Thu Jan 25 10:39:42 2024 +0800 black commit 41f4b63d3a6e83babe92bac32a7432a8ef740bb5 Merge: 7e8b57d 4d11dce Author: jzhang38 Date: Thu Jan 25 10:37:57 2024 +0800 resolve conflicts in sqa commit 7e8b57d3bcc21d2a049d3abbc8a8201631641db4 Author: jzhang38 Date: Thu Jan 25 10:36:46 2024 +0800 add model specific prompt and gen kwargs commit 5f55126484a7c9325db586d26cf2052538222804 Author: jzhang38 Date: Wed Jan 24 13:56:51 2024 +0800 black commit aa6f8853cf82384fb3b15306fec4769212fbc5ab Author: jzhang38 Date: Wed Jan 24 13:55:43 2024 +0800 add mmme * Squashed commit of the following: commit 18e984cfe173390843c73048a931baa17800f918 Author: Zhang Peiyuan Date: Thu Jan 25 17:08:25 2024 +0800 add model specific prompt and gen kwargs in sqa (#19) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * Fix cli itself can not run with config file * Fix bug in login functionality Refactor code for better performance Add new feature for user authentication Update UI layout for improved user experience Fix typo in variable name Optimize database queries for faster response time Add error handling for edge cases Update dependencies to latest versions Remove unused code Improve code readability and maintainability * Refactor get_task_dict function to handle nested groups * Add submission file for coco, flickr30k, nocaps, and textcaps tasks * Remove unused files and update task configuration * Fix tasks issue for nocaps, refcoco/+/g * Fix file path and raise error if config file does not 
exist * Exclude train in refcoco/+/g config * Solve doc_iterator_for_counting crashing issue * Black lint * Refactor code to improve performance and readability * Squashed commit of the following: commit 0df825c9e72a06e6acb4c0bd43c2083ffe8b74c0 Author: JvThunder Date: Fri Jan 26 01:03:57 2024 +0800 change okvqa yaml commit b9d9f9896993033b92346e9f47420c55b866c715 Author: JvThunder Date: Fri Jan 26 00:55:40 2024 +0800 change yaml commit 4256bef410e4c8d8761e0cd0d79ac5e57b97651b Author: JvThunder Date: Fri Jan 26 00:42:43 2024 +0800 add okvqa task commit 18e984cfe173390843c73048a931baa17800f918 Author: Zhang Peiyuan Date: Thu Jan 25 17:08:25 2024 +0800 add model specific prompt and gen kwargs in sqa (#19) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * Squashed commit of the following: commit 0c8a3919885b8fe2880bb2892f7a619d060012d1 Author: JvThunder Date: Fri Jan 26 01:06:02 2024 +0800 change ocr reference commit d2bc7c92ac61179b8c4031e11bc31970355252f6 Author: JvThunder Date: Fri Jan 26 01:05:46 2024 +0800 revert example_eval commit c78fa29cd0d161641ee05db57bd39314b998c8c7 Author: JvThunder Date: Fri Jan 26 00:17:28 2024 +0800 edit vizwiz utils commit 397f0906968fd8ba04b883469b96217737c43e09 Author: JvThunder Date: Thu Jan 25 23:49:47 2024 +0800 reorganize __init__ commit 52a7ea6c7599adeec2ac2787f500e215ce47cf79 Author: JvThunder Date: Thu Jan 25 23:46:20 2024 +0800 minor fixes commit f706b2aaf9b288c582611191a1841b58feaeb741 Author: JvThunder Date: Thu Jan 25 17:41:03 2024 +0800 add vizwizvqa eval rask commit 18e984cfe173390843c73048a931baa17800f918 Author: Zhang Peiyuan Date: Thu Jan 25 17:08:25 2024 +0800 add model specific prompt and gen kwargs in sqa (#19) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * Refactor 
mathvista.yaml and utils.py * Add gpt_eval_score to mathvista_process_results * Refactor mathvista_aggregate_results to return average accuracy score * Fix refcoco evaluation error * Fix evaluation problem for refcoco+/g * Refactor mathvista.yaml and mathvista_evals.py * Add dependencies and update YAML files * Refactor mmbench_en/utils.py to save test results to separate Excel file * Fix caption task prompt * Add group field to mmbench_en_test and mmbench_en_val yaml files * Delete mmbench_en_val.yaml file * Update mmbench_cn.yaml and mmbench_cn_test.yaml * Update mmbench_cn_val.yaml and utils.py * Remove unused fields in mmbench_cn_cc_process_results function * Update aggregation function for mmbench_en_dev.yaml * Fix capitalization of L2-category key in utils.py * Fix variable name in mmbench_process_results function * Delete mmbench_cn_val.yaml file * Update mathvista_test.yaml and mathvista_testmini.yaml * Fix warnings and update mathvista.yaml * Remove system message from MathVistaEvaluator * Update GPT model version in MathVistaEvaluator constructor * Update GQA_RAW_IMAGE_DATASET path in utils.py * change vizwiz to test set * Add split flag to mathvista_aggregate_results function * Add higher_is_better: false to gpt_eval_info metric in d170_cn, d170_en, dc100_en, and dc200_cn yaml files * Add download configuration for dataset * Update GQA_RAW_IMAGE_DATASET path in utils.py * add datasets * Update gpt_eval_model_name in mathvista.yaml * Merge commit '0d620f98b49f8204d02633f209eedd5d8b7a1f7c' * Update pyproject.toml with dependencies and URLs * Squashed commit of the following: commit 8b600f55b6cf5627504c407871539db59f6085a3 Author: Zhang Peiyuan Date: Sat Jan 27 13:56:37 2024 +0800 Dev/add chartqa and ai2d (#23) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d 
dataset * black * Add 'submissions/' directory to .gitignore * Add Python setup and Black version installation workflow Refactor ContextSampler class in samplers.py Remove unnecessary line in DecontaminationFilter class Update dependencies in pyproject.toml * Refactor code in ContextSampler class --------- Co-authored-by: Bo Li * Refactor image processing and submission file path * Refactor directory creation logic in cli_evaluate_single function * Update dataset path and test split in vqav2.yaml * Remove "total" column from cap_details_columns DataFrame * Add retry logic for dataset download * Add 'tenacity' to dependencies in pyproject.toml * Refactor code in ContextSampler class * Update Black version and configuration, and improve code readability in ContextSampler * Update Black version and line length --------- Co-authored-by: kcz358 <92624596+kcz358@users.noreply.github.com> Co-authored-by: Fanyi Pu * vqav2 (#25) * Update tqdm progress bar position * Merge commit '842fbc6f2da7d9a118adf9ec27c3d8542d74168e' * Squashed commit of the following: commit b13a805623dfd9d826ddd440e1b5ecde773fbb12 Author: Zhang Peiyuan Date: Thu Jan 25 17:08:25 2024 +0800 add model specific prompt and gen kwargs in sqa (#19) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code commit 842fbc6f2da7d9a118adf9ec27c3d8542d74168e Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Thu Jan 25 09:47:31 2024 +0800 [Dataset] Add flickr30k (#18) * Add flickr30k support * Black lint * Align prompt with NoCaps commit 4bf0504fabc3b62f356c467b2fd1119083d27313 Author: Li Bo Date: Wed Jan 24 22:10:14 2024 +0800 [Datasets] modify NoCaps data path and prompts (#17) * Merge commit '767f7e2cae60cf67ec5878234d84321395a3ed15' * Update dataset paths and improve user prompts commit 767f7e2cae60cf67ec5878234d84321395a3ed15 Author: Li Bo Date: Wed Jan 24 19:51:34 2024 +0800 Add output 
path file naming convention (#16) Update datetime format in get_datetime_str() function * remove useless output file * Update dataset path in vqav2.yaml * Squashed commit of the following: commit eeb2b9827502f044ef67d8440f53124baf219ba3 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Fri Jan 26 09:56:45 2024 +0800 Black lint commit 1ce9f0b37e4bc5e6ff5fbfcd23fd339eb14974ae Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Fri Jan 26 09:55:47 2024 +0800 Solve doc_iterator_for_counting crashing issue commit e12b3bb41ed4f51540cfac84e5e96d15777540c4 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Fri Jan 26 09:55:13 2024 +0800 Exclude train in refcoco/+/g config commit 42c56f82bc4ccae12e19e76d09d7e525ca9ef2f4 Merge: 6a1ae69 697a438 Author: Bo Li Date: Thu Jan 25 17:17:13 2024 +0000 Merge branch 'dev/bli_add_datasets' of https://github.com/EvolvingLMMs-Lab/lmms-eval into dev/bli_add_datasets commit aed08303fe87808986d206540a0c0ee6d8764988 Author: Bo Li Date: Thu Jan 25 17:17:06 2024 +0000 Fix file path and raise error if config file does not exist commit a105386613c443d9e740c89725cbd1281bbdfef6 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Fri Jan 26 00:47:24 2024 +0800 Fix tasks issue for nocaps, refcoco/+/g commit 21c8119e377760f44c769bed2528d863a8f4333b Author: Bo Li Date: Thu Jan 25 10:09:43 2024 +0000 Remove unused files and update task configuration commit 0ccb2629c2aacdb297b7cf0c9c2bcfa386bb7582 Author: Bo Li Date: Thu Jan 25 09:43:56 2024 +0000 Add submission file for coco, flickr30k, nocaps, and textcaps tasks commit 5365e13e93c702a1e0e259ee6a08d6a427d72470 Author: Bo Li Date: Thu Jan 25 09:32:54 2024 +0000 Refactor get_task_dict function to handle nested groups commit 6773348c807bcfa1b09ceffc90c75e15cad908f7 Author: Bo Li Date: Thu Jan 25 09:13:46 2024 +0000 Fix bug in login functionality Refactor code for better performance Add new feature for user authentication Update UI layout for improved user 
experience Fix typo in variable name Optimize database queries for faster response time Add error handling for edge cases Update dependencies to latest versions Remove unused code Improve code readability and maintainability commit 31140f9c87dea89ca94c94bc850e3a8d43e5f8b4 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Thu Jan 25 17:07:20 2024 +0800 Fix cli itself can not run with config file commit df1bad47f6ed13f94848d2bee29b28e00c2384b2 Author: Bo Li Date: Thu Jan 25 09:09:04 2024 +0000 Squashed commit of the following: commit b13a805623dfd9d826ddd440e1b5ecde773fbb12 Author: Zhang Peiyuan Date: Thu Jan 25 17:08:25 2024 +0800 add model specific prompt and gen kwargs in sqa (#19) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code commit 06383aa4a5ff59db52fc8d584f3086efd88b7e74 Author: Bo Li Date: Thu Jan 25 09:02:57 2024 +0000 Squashed commit of the following: commit 542a34dc5721ecdff6c5c68b0568692ad3a17149 Author: jzhang38 Date: Thu Jan 25 11:59:12 2024 +0800 refactor multi model code commit 3c397b8af85192b1821b3b6a0d8b8df746b5347c Author: jzhang38 Date: Thu Jan 25 11:51:16 2024 +0800 print table at the end commit e7b8a2d1f1e7337f02298efafd2ebf81543f4f85 Author: jzhang38 Date: Thu Jan 25 11:20:59 2024 +0800 add yaml config to supprot multi-model eval commit 2626383d99b5eac59d531ca0f293df960570c524 Author: jzhang38 Date: Thu Jan 25 10:39:42 2024 +0800 black commit 8349935fe145e33af0007ad4fb0d71fd925be7a0 Merge: 7e8b57d 4d11dce Author: jzhang38 Date: Thu Jan 25 10:37:57 2024 +0800 resolve conflicts in sqa commit d4e8e2552d40752bfdc5bbf4cd962c1798096258 Author: jzhang38 Date: Thu Jan 25 10:36:46 2024 +0800 add model specific prompt and gen kwargs commit 520c7a2cafe60810aca79df814ce6829d4576032 Author: jzhang38 Date: Wed Jan 24 13:56:51 2024 +0800 black commit 3a633240327c078fa4f5a75dbd38ad5bc0d468dd Author: jzhang38 Date: Wed Jan 24 
13:55:43 2024 +0800 add mmme commit 7a71fd6022ee5985100dda38b94956595cec77a5 Merge: 22c3adf 4d11dce Author: Bo Li Date: Thu Jan 25 08:43:15 2024 +0000 Merge commit '842fbc6f2da7d9a118adf9ec27c3d8542d74168e' into dev/bli_add_datasets commit 6870cba13cb54976480c1d5e8d97602c246f881b Author: Bo Li Date: Thu Jan 25 08:38:52 2024 +0000 Squashed commit of the following: commit 542a34dc5721ecdff6c5c68b0568692ad3a17149 Author: jzhang38 Date: Thu Jan 25 11:59:12 2024 +0800 refactor multi model code commit 3c397b8af85192b1821b3b6a0d8b8df746b5347c Author: jzhang38 Date: Thu Jan 25 11:51:16 2024 +0800 print table at the end commit e7b8a2d1f1e7337f02298efafd2ebf81543f4f85 Author: jzhang38 Date: Thu Jan 25 11:20:59 2024 +0800 add yaml config to supprot multi-model eval commit 2626383d99b5eac59d531ca0f293df960570c524 Author: jzhang38 Date: Thu Jan 25 10:39:42 2024 +0800 black commit 8349935fe145e33af0007ad4fb0d71fd925be7a0 Merge: 7e8b57d 4d11dce Author: jzhang38 Date: Thu Jan 25 10:37:57 2024 +0800 resolve conflicts in sqa commit d4e8e2552d40752bfdc5bbf4cd962c1798096258 Author: jzhang38 Date: Thu Jan 25 10:36:46 2024 +0800 add model specific prompt and gen kwargs commit 842fbc6f2da7d9a118adf9ec27c3d8542d74168e Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Thu Jan 25 09:47:31 2024 +0800 [Dataset] Add flickr30k (#18) * Add flickr30k support * Black lint * Align prompt with NoCaps commit 4bf0504fabc3b62f356c467b2fd1119083d27313 Author: Li Bo Date: Wed Jan 24 22:10:14 2024 +0800 [Datasets] modify NoCaps data path and prompts (#17) * Merge commit '767f7e2cae60cf67ec5878234d84321395a3ed15' * Update dataset paths and improve user prompts commit 520c7a2cafe60810aca79df814ce6829d4576032 Author: jzhang38 Date: Wed Jan 24 13:56:51 2024 +0800 black commit 3a633240327c078fa4f5a75dbd38ad5bc0d468dd Author: jzhang38 Date: Wed Jan 24 13:55:43 2024 +0800 add mmme commit b40d522b6bf483ebdfbf5facd4573de0cf8a93f6 Author: Bo Li Date: Thu Jan 25 08:38:11 2024 +0000 Add coco_val and 
coco_test tasks to coco.yaml commit 5bf643f73d06f1e540897b753450352bb92fd9ec Author: Bo Li Date: Thu Jan 25 04:58:28 2024 +0000 Update dataset_path in flickr30k.yaml commit 95f110f0eef5196205bc501367e3642c57cc7a17 Author: Bo Li Date: Thu Jan 25 02:12:25 2024 +0000 Merge commit '842fbc6f2da7d9a118adf9ec27c3d8542d74168e' commit c844ae49b18c1334711832208b0359c9439fe1c0 Author: Bo Li Date: Thu Jan 25 02:10:18 2024 +0000 Add submission folder and update file paths for storing prediction results commit 842fbc6f2da7d9a118adf9ec27c3d8542d74168e Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Thu Jan 25 09:47:31 2024 +0800 [Dataset] Add flickr30k (#18) * Add flickr30k support * Black lint * Align prompt with NoCaps commit 4bf0504fabc3b62f356c467b2fd1119083d27313 Author: Li Bo Date: Wed Jan 24 22:10:14 2024 +0800 [Datasets] modify NoCaps data path and prompts (#17) * Merge commit '767f7e2cae60cf67ec5878234d84321395a3ed15' * Update dataset paths and improve user prompts commit f0446227f0dd93651e9d6c06254bbf5212ede2dd Merge: c6370bf a0b87f5 Author: Li Bo Date: Wed Jan 24 22:10:07 2024 +0800 Merge branch 'main' into dev/bli_add_datasets commit 1e1f6cfccba758dc606fa4217102518fab73c936 Author: Bo Li Date: Wed Jan 24 14:08:06 2024 +0000 Update dataset paths and improve user prompts commit 966933754b9e5179995b3ab41d746603e13e75c6 Author: Bo Li Date: Wed Jan 24 11:52:33 2024 +0000 Merge commit '767f7e2cae60cf67ec5878234d84321395a3ed15' commit 767f7e2cae60cf67ec5878234d84321395a3ed15 Author: Li Bo Date: Wed Jan 24 19:51:34 2024 +0800 Add output path file naming convention (#16) Update datetime format in get_datetime_str() function * Fix bug in login functionality * create vqav2_val * Update vqav2_test.yaml * Update vqav2_test.yaml * Update vqav2_val.yaml --------- Co-authored-by: Li Bo * vqav2 (#25) * Update tqdm progress bar position * Merge commit 'ecb47d73d6e000b472be6c5c0cdc9413c7734384' * Squashed commit of the following: commit 
18e984cfe173390843c73048a931baa17800f918 Author: Zhang Peiyuan Date: Thu Jan 25 17:08:25 2024 +0800 add model specific prompt and gen kwargs in sqa (#19) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code commit ecb47d73d6e000b472be6c5c0cdc9413c7734384 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Thu Jan 25 09:47:31 2024 +0800 [Dataset] Add flickr30k (#18) * Add flickr30k support * Black lint * Align prompt with NoCaps commit dc23f4b42b1dd60b41904d7ddbee1412d6851077 Author: Li Bo Date: Wed Jan 24 22:10:14 2024 +0800 [Datasets] modify NoCaps data path and prompts (#17) * Merge commit '95ef3ea519cbd772924f9a6afa5394979eb00432' * Update dataset paths and improve user prompts commit 95ef3ea519cbd772924f9a6afa5394979eb00432 Author: Li Bo Date: Wed Jan 24 19:51:34 2024 +0800 Add output path file naming convention (#16) Update datetime format in get_datetime_str() function * remove useless output file * Update dataset path in vqav2.yaml * Squashed commit of the following: commit 75bb7043ea5a533ab6351fc0f5ab055e86106423 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Fri Jan 26 09:56:45 2024 +0800 Black lint commit 6635a8aa34cfbd3c7a4afb6fcd214a7283ce01cb Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Fri Jan 26 09:55:47 2024 +0800 Solve doc_iterator_for_counting crashing issue commit 080f42b88ea8acacd527b8d67b84ba1d7d135b03 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Fri Jan 26 09:55:13 2024 +0800 Exclude train in refcoco/+/g config commit 4da84069c08c95e49e8ab0e64a1e103ff7ac8730 Merge: 6a1ae69 697a438 Author: Bo Li Date: Thu Jan 25 17:17:13 2024 +0000 Merge branch 'dev/bli_add_datasets' of https://github.com/EvolvingLMMs-Lab/lmms-eval into dev/bli_add_datasets commit 6a1ae69923d79ae32a001edac38206b605274ec3 Author: Bo Li Date: Thu Jan 25 17:17:06 2024 +0000 Fix file path and raise 
error if config file does not exist commit 697a4387827ceeec3e393237dd1baa217c714c88 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Fri Jan 26 00:47:24 2024 +0800 Fix tasks issue for nocaps, refcoco/+/g commit 47e40437126d39a5f062c9a33b4de426c1a29804 Author: Bo Li Date: Thu Jan 25 10:09:43 2024 +0000 Remove unused files and update task configuration commit 9976eb8e9ed03c8613725fdbd822ef5d8cf70e47 Author: Bo Li Date: Thu Jan 25 09:43:56 2024 +0000 Add submission file for coco, flickr30k, nocaps, and textcaps tasks commit 95f97a69faa6129676e89eee14960fcfe2076b7c Author: Bo Li Date: Thu Jan 25 09:32:54 2024 +0000 Refactor get_task_dict function to handle nested groups commit 3b79ee842b2488714baf92ab34528ef77989d392 Author: Bo Li Date: Thu Jan 25 09:13:46 2024 +0000 Fix bug in login functionality Refactor code for better performance Add new feature for user authentication Update UI layout for improved user experience Fix typo in variable name Optimize database queries for faster response time Add error handling for edge cases Update dependencies to latest versions Remove unused code Improve code readability and maintainability commit f5c353f2ce93a2d96add4312b695b57432f68cbb Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Thu Jan 25 17:07:20 2024 +0800 Fix cli itself can not run with config file commit 9a68fec37be74cfe8d4a73390bc83edee147ae24 Author: Bo Li Date: Thu Jan 25 09:09:04 2024 +0000 Squashed commit of the following: commit 18e984cfe173390843c73048a931baa17800f918 Author: Zhang Peiyuan Date: Thu Jan 25 17:08:25 2024 +0800 add model specific prompt and gen kwargs in sqa (#19) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code commit 93f847c5851fd246716367935d6b807b17d53949 Author: Bo Li Date: Thu Jan 25 09:02:57 2024 +0000 Squashed commit of the following: commit 4d48d0c9b88e62dfebe05ec909b7f1851e9cd75d Author: 
jzhang38 Date: Thu Jan 25 11:59:12 2024 +0800 refactor multi model code commit 4a4b7bec200c72332b61a0c277cd8f8a34e4f721 Author: jzhang38 Date: Thu Jan 25 11:51:16 2024 +0800 print table at the end commit 63739fc6fa0a462d807ae81de0db0173102de584 Author: jzhang38 Date: Thu Jan 25 11:20:59 2024 +0800 add yaml config to supprot multi-model eval commit edcc752f97ea3845cefad56624e5d2855066f680 Author: jzhang38 Date: Thu Jan 25 10:39:42 2024 +0800 black commit 41f4b63d3a6e83babe92bac32a7432a8ef740bb5 Merge: 7e8b57d 4d11dce Author: jzhang38 Date: Thu Jan 25 10:37:57 2024 +0800 resolve conflicts in sqa commit 7e8b57d3bcc21d2a049d3abbc8a8201631641db4 Author: jzhang38 Date: Thu Jan 25 10:36:46 2024 +0800 add model specific prompt and gen kwargs commit 5f55126484a7c9325db586d26cf2052538222804 Author: jzhang38 Date: Wed Jan 24 13:56:51 2024 +0800 black commit aa6f8853cf82384fb3b15306fec4769212fbc5ab Author: jzhang38 Date: Wed Jan 24 13:55:43 2024 +0800 add mmme commit fa4ad4404e26d8924f55208746dbb9143b464011 Merge: 22c3adf 4d11dce Author: Bo Li Date: Thu Jan 25 08:43:15 2024 +0000 Merge commit 'ecb47d73d6e000b472be6c5c0cdc9413c7734384' into dev/bli_add_datasets commit 22c3adfd0645acc23b6d7c06b487f4ffd47666c4 Author: Bo Li Date: Thu Jan 25 08:38:52 2024 +0000 Squashed commit of the following: commit 4d48d0c9b88e62dfebe05ec909b7f1851e9cd75d Author: jzhang38 Date: Thu Jan 25 11:59:12 2024 +0800 refactor multi model code commit 4a4b7bec200c72332b61a0c277cd8f8a34e4f721 Author: jzhang38 Date: Thu Jan 25 11:51:16 2024 +0800 print table at the end commit 63739fc6fa0a462d807ae81de0db0173102de584 Author: jzhang38 Date: Thu Jan 25 11:20:59 2024 +0800 add yaml config to supprot multi-model eval commit edcc752f97ea3845cefad56624e5d2855066f680 Author: jzhang38 Date: Thu Jan 25 10:39:42 2024 +0800 black commit 41f4b63d3a6e83babe92bac32a7432a8ef740bb5 Merge: 7e8b57d 4d11dce Author: jzhang38 Date: Thu Jan 25 10:37:57 2024 +0800 resolve conflicts in sqa commit 
7e8b57d3bcc21d2a049d3abbc8a8201631641db4 Author: jzhang38 Date: Thu Jan 25 10:36:46 2024 +0800 add model specific prompt and gen kwargs commit ecb47d73d6e000b472be6c5c0cdc9413c7734384 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Thu Jan 25 09:47:31 2024 +0800 [Dataset] Add flickr30k (#18) * Add flickr30k support * Black lint * Align prompt with NoCaps commit dc23f4b42b1dd60b41904d7ddbee1412d6851077 Author: Li Bo Date: Wed Jan 24 22:10:14 2024 +0800 [Datasets] modify NoCaps data path and prompts (#17) * Merge commit '95ef3ea519cbd772924f9a6afa5394979eb00432' * Update dataset paths and improve user prompts commit 5f55126484a7c9325db586d26cf2052538222804 Author: jzhang38 Date: Wed Jan 24 13:56:51 2024 +0800 black commit aa6f8853cf82384fb3b15306fec4769212fbc5ab Author: jzhang38 Date: Wed Jan 24 13:55:43 2024 +0800 add mmme commit 4c712336b6f7438e717a865910bb241e413a4688 Author: Bo Li Date: Thu Jan 25 08:38:11 2024 +0000 Add coco_val and coco_test tasks to coco.yaml commit b5547126c855927fd4dc8384211e4aceee40870f Author: Bo Li Date: Thu Jan 25 04:58:28 2024 +0000 Update dataset_path in flickr30k.yaml commit f786f61e2559f082072f21aa9030e2080ddaf809 Author: Bo Li Date: Thu Jan 25 02:12:25 2024 +0000 Merge commit 'ecb47d73d6e000b472be6c5c0cdc9413c7734384' commit 796a011000e0df90f66f8e80cb34dc2318ae9ac8 Author: Bo Li Date: Thu Jan 25 02:10:18 2024 +0000 Add submission folder and update file paths for storing prediction results commit ecb47d73d6e000b472be6c5c0cdc9413c7734384 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Thu Jan 25 09:47:31 2024 +0800 [Dataset] Add flickr30k (#18) * Add flickr30k support * Black lint * Align prompt with NoCaps commit dc23f4b42b1dd60b41904d7ddbee1412d6851077 Author: Li Bo Date: Wed Jan 24 22:10:14 2024 +0800 [Datasets] modify NoCaps data path and prompts (#17) * Merge commit '95ef3ea519cbd772924f9a6afa5394979eb00432' * Update dataset paths and improve user prompts commit 
118744c63eb2d9724571d85fbbd85fcc9ad05b59 Merge: c6370bf a0b87f5 Author: Li Bo Date: Wed Jan 24 22:10:07 2024 +0800 Merge branch 'main' into dev/bli_add_datasets commit c6370bff65903681f00cf3d07111d8e15a57b619 Author: Bo Li Date: Wed Jan 24 14:08:06 2024 +0000 Update dataset paths and improve user prompts commit 810daf458fa94cb3ec2b4a6cc5ecb1e656a24002 Author: Bo Li Date: Wed Jan 24 11:52:33 2024 +0000 Merge commit '95ef3ea519cbd772924f9a6afa5394979eb00432' commit 95ef3ea519cbd772924f9a6afa5394979eb00432 Author: Li Bo Date: Wed Jan 24 19:51:34 2024 +0800 Add output path file naming convention (#16) Update datetime format in get_datetime_str() function * Fix bug in login functionality * create vqav2_val * Update vqav2_test.yaml * Update vqav2_test.yaml * Update vqav2_val.yaml --------- Co-authored-by: Li Bo * vizwiz dataset (#24) * Merge commit '767f7e2cae60cf67ec5878234d84321395a3ed15' * Update dataset paths and improve user prompts * Add submission folder and update file paths for storing prediction results * Merge commit '842fbc6f2da7d9a118adf9ec27c3d8542d74168e' * Update dataset_path in flickr30k.yaml * Add coco_val and coco_test tasks to coco.yaml * Squashed commit of the following: commit 542a34dc5721ecdff6c5c68b0568692ad3a17149 Author: jzhang38 Date: Thu Jan 25 11:59:12 2024 +0800 refactor multi model code commit 3c397b8af85192b1821b3b6a0d8b8df746b5347c Author: jzhang38 Date: Thu Jan 25 11:51:16 2024 +0800 print table at the end commit e7b8a2d1f1e7337f02298efafd2ebf81543f4f85 Author: jzhang38 Date: Thu Jan 25 11:20:59 2024 +0800 add yaml config to supprot multi-model eval commit 2626383d99b5eac59d531ca0f293df960570c524 Author: jzhang38 Date: Thu Jan 25 10:39:42 2024 +0800 black commit 8349935fe145e33af0007ad4fb0d71fd925be7a0 Merge: 7e8b57d 4d11dce Author: jzhang38 Date: Thu Jan 25 10:37:57 2024 +0800 resolve conflicts in sqa commit d4e8e2552d40752bfdc5bbf4cd962c1798096258 Author: jzhang38 Date: Thu Jan 25 10:36:46 2024 +0800 add model specific prompt and gen 
kwargs commit 842fbc6f2da7d9a118adf9ec27c3d8542d74168e Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Thu Jan 25 09:47:31 2024 +0800 [Dataset] Add flickr30k (#18) * Add flickr30k support * Black lint * Align prompt with NoCaps commit 4bf0504fabc3b62f356c467b2fd1119083d27313 Author: Li Bo Date: Wed Jan 24 22:10:14 2024 +0800 [Datasets] modify NoCaps data path and prompts (#17) * Merge commit '767f7e2cae60cf67ec5878234d84321395a3ed15' * Update dataset paths and improve user prompts commit 520c7a2cafe60810aca79df814ce6829d4576032 Author: jzhang38 Date: Wed Jan 24 13:56:51 2024 +0800 black commit 3a633240327c078fa4f5a75dbd38ad5bc0d468dd Author: jzhang38 Date: Wed Jan 24 13:55:43 2024 +0800 add mmme * Squashed commit of the following: commit 542a34dc5721ecdff6c5c68b0568692ad3a17149 Author: jzhang38 Date: Thu Jan 25 11:59:12 2024 +0800 refactor multi model code commit 3c397b8af85192b1821b3b6a0d8b8df746b5347c Author: jzhang38 Date: Thu Jan 25 11:51:16 2024 +0800 print table at the end commit e7b8a2d1f1e7337f02298efafd2ebf81543f4f85 Author: jzhang38 Date: Thu Jan 25 11:20:59 2024 +0800 add yaml config to supprot multi-model eval commit 2626383d99b5eac59d531ca0f293df960570c524 Author: jzhang38 Date: Thu Jan 25 10:39:42 2024 +0800 black commit 8349935fe145e33af0007ad4fb0d71fd925be7a0 Merge: 7e8b57d 4d11dce Author: jzhang38 Date: Thu Jan 25 10:37:57 2024 +0800 resolve conflicts in sqa commit d4e8e2552d40752bfdc5bbf4cd962c1798096258 Author: jzhang38 Date: Thu Jan 25 10:36:46 2024 +0800 add model specific prompt and gen kwargs commit 520c7a2cafe60810aca79df814ce6829d4576032 Author: jzhang38 Date: Wed Jan 24 13:56:51 2024 +0800 black commit 3a633240327c078fa4f5a75dbd38ad5bc0d468dd Author: jzhang38 Date: Wed Jan 24 13:55:43 2024 +0800 add mmme * Squashed commit of the following: commit b13a805623dfd9d826ddd440e1b5ecde773fbb12 Author: Zhang Peiyuan Date: Thu Jan 25 17:08:25 2024 +0800 add model specific prompt and gen kwargs in sqa (#19) * add mmme * black * add 
model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * Fix cli itself can not run with config file * Fix bug in login functionality Refactor code for better performance Add new feature for user authentication Update UI layout for improved user experience Fix typo in variable name Optimize database queries for faster response time Add error handling for edge cases Update dependencies to latest versions Remove unused code Improve code readability and maintainability * Refactor get_task_dict function to handle nested groups * Add submission file for coco, flickr30k, nocaps, and textcaps tasks * Remove unused files and update task configuration * Fix tasks issue for nocaps, refcoco/+/g * Fix file path and raise error if config file does not exist * Exclude train in refcoco/+/g config * Solve doc_iterator_for_counting crashing issue * Black lint * Refactor code to improve performance and readability * Squashed commit of the following: commit a2cc9303dc72e4d53983bb56e54a32e977c3e270 Author: JvThunder Date: Fri Jan 26 01:03:57 2024 +0800 change okvqa yaml commit 35e87e7c7a480d005abf607c2527a35457d92311 Author: JvThunder Date: Fri Jan 26 00:55:40 2024 +0800 change yaml commit 89755323596b85208ed33aa88c296604a39af6eb Author: JvThunder Date: Fri Jan 26 00:42:43 2024 +0800 add okvqa task commit b13a805623dfd9d826ddd440e1b5ecde773fbb12 Author: Zhang Peiyuan Date: Thu Jan 25 17:08:25 2024 +0800 add model specific prompt and gen kwargs in sqa (#19) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * Squashed commit of the following: commit 0b0d30dfb247c5f0b7b68398b9e9fcde74cf7fa2 Author: JvThunder Date: Fri Jan 26 01:06:02 2024 +0800 change ocr reference commit e273f9cbd91540df86bdbc652bff88a847bd0d2d Author: JvThunder Date: Fri Jan 26 01:05:46 2024 +0800 revert 
example_eval commit e84126aaaf8a07bd371a0571a914ccbcd3697f20 Author: JvThunder Date: Fri Jan 26 00:17:28 2024 +0800 edit vizwiz utils commit 110deab53dc1a2fd349b1872cd261b69074c5fa8 Author: JvThunder Date: Thu Jan 25 23:49:47 2024 +0800 reorganize __init__ commit 0fa3e0c40075997ea80ed976bdee9615f17d3ece Author: JvThunder Date: Thu Jan 25 23:46:20 2024 +0800 minor fixes commit 2aaca579120def99860f90054233f3358950fa66 Author: JvThunder Date: Thu Jan 25 17:41:03 2024 +0800 add vizwizvqa eval rask commit b13a805623dfd9d826ddd440e1b5ecde773fbb12 Author: Zhang Peiyuan Date: Thu Jan 25 17:08:25 2024 +0800 add model specific prompt and gen kwargs in sqa (#19) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * Refactor mathvista.yaml and utils.py * Add gpt_eval_score to mathvista_process_results * Refactor mathvista_aggregate_results to return average accuracy score * Fix refcoco evaluation error * Fix evaluation problem for refcoco+/g * Refactor mathvista.yaml and mathvista_evals.py * Add dependencies and update YAML files * Refactor mmbench_en/utils.py to save test results to separate Excel file * Fix caption task prompt * Add group field to mmbench_en_test and mmbench_en_val yaml files * Delete mmbench_en_val.yaml file * Update mmbench_cn.yaml and mmbench_cn_test.yaml * Update mmbench_cn_val.yaml and utils.py * Remove unused fields in mmbench_cn_cc_process_results function * Update aggregation function for mmbench_en_dev.yaml * Fix capitalization of L2-category key in utils.py * Fix variable name in mmbench_process_results function * Delete mmbench_cn_val.yaml file * Update mathvista_test.yaml and mathvista_testmini.yaml * Fix warnings and update mathvista.yaml * Remove system message from MathVistaEvaluator * Update GPT model version in MathVistaEvaluator constructor * Update GQA_RAW_IMAGE_DATASET path in utils.py * change vizwiz to test set * Add 
split flag to mathvista_aggregate_results function * Add higher_is_better: false to gpt_eval_info metric in d170_cn, d170_en, dc100_en, and dc200_cn yaml files * Update lmms_eval/evaluator.py and lmms_eval/tasks/vizwizvqa/utils.py * vizwiz-val * Update utils.py * Update vizwizvqa.yaml --------- Co-authored-by: Bo Li Co-authored-by: kcz358 <92624596+kcz358@users.noreply.github.com> * vizwiz dataset (#24) * Merge commit '95ef3ea519cbd772924f9a6afa5394979eb00432' * Update dataset paths and improve user prompts * Add submission folder and update file paths for storing prediction results * Merge commit 'ecb47d73d6e000b472be6c5c0cdc9413c7734384' * Update dataset_path in flickr30k.yaml * Add coco_val and coco_test tasks to coco.yaml * Squashed commit of the following: commit 4d48d0c9b88e62dfebe05ec909b7f1851e9cd75d Author: jzhang38 Date: Thu Jan 25 11:59:12 2024 +0800 refactor multi model code commit 4a4b7bec200c72332b61a0c277cd8f8a34e4f721 Author: jzhang38 Date: Thu Jan 25 11:51:16 2024 +0800 print table at the end commit 63739fc6fa0a462d807ae81de0db0173102de584 Author: jzhang38 Date: Thu Jan 25 11:20:59 2024 +0800 add yaml config to supprot multi-model eval commit edcc752f97ea3845cefad56624e5d2855066f680 Author: jzhang38 Date: Thu Jan 25 10:39:42 2024 +0800 black commit 41f4b63d3a6e83babe92bac32a7432a8ef740bb5 Merge: 7e8b57d 4d11dce Author: jzhang38 Date: Thu Jan 25 10:37:57 2024 +0800 resolve conflicts in sqa commit 7e8b57d3bcc21d2a049d3abbc8a8201631641db4 Author: jzhang38 Date: Thu Jan 25 10:36:46 2024 +0800 add model specific prompt and gen kwargs commit ecb47d73d6e000b472be6c5c0cdc9413c7734384 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Thu Jan 25 09:47:31 2024 +0800 [Dataset] Add flickr30k (#18) * Add flickr30k support * Black lint * Align prompt with NoCaps commit dc23f4b42b1dd60b41904d7ddbee1412d6851077 Author: Li Bo Date: Wed Jan 24 22:10:14 2024 +0800 [Datasets] modify NoCaps data path and prompts (#17) * Merge commit 
'95ef3ea519cbd772924f9a6afa5394979eb00432' * Update dataset paths and improve user prompts commit 5f55126484a7c9325db586d26cf2052538222804 Author: jzhang38 Date: Wed Jan 24 13:56:51 2024 +0800 black commit aa6f8853cf82384fb3b15306fec4769212fbc5ab Author: jzhang38 Date: Wed Jan 24 13:55:43 2024 +0800 add mmme * Squashed commit of the following: commit 4d48d0c9b88e62dfebe05ec909b7f1851e9cd75d Author: jzhang38 Date: Thu Jan 25 11:59:12 2024 +0800 refactor multi model code commit 4a4b7bec200c72332b61a0c277cd8f8a34e4f721 Author: jzhang38 Date: Thu Jan 25 11:51:16 2024 +0800 print table at the end commit 63739fc6fa0a462d807ae81de0db0173102de584 Author: jzhang38 Date: Thu Jan 25 11:20:59 2024 +0800 add yaml config to supprot multi-model eval commit edcc752f97ea3845cefad56624e5d2855066f680 Author: jzhang38 Date: Thu Jan 25 10:39:42 2024 +0800 black commit 41f4b63d3a6e83babe92bac32a7432a8ef740bb5 Merge: 7e8b57d 4d11dce Author: jzhang38 Date: Thu Jan 25 10:37:57 2024 +0800 resolve conflicts in sqa commit 7e8b57d3bcc21d2a049d3abbc8a8201631641db4 Author: jzhang38 Date: Thu Jan 25 10:36:46 2024 +0800 add model specific prompt and gen kwargs commit 5f55126484a7c9325db586d26cf2052538222804 Author: jzhang38 Date: Wed Jan 24 13:56:51 2024 +0800 black commit aa6f8853cf82384fb3b15306fec4769212fbc5ab Author: jzhang38 Date: Wed Jan 24 13:55:43 2024 +0800 add mmme * Squashed commit of the following: commit 18e984cfe173390843c73048a931baa17800f918 Author: Zhang Peiyuan Date: Thu Jan 25 17:08:25 2024 +0800 add model specific prompt and gen kwargs in sqa (#19) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * Fix cli itself can not run with config file * Fix bug in login functionality Refactor code for better performance Add new feature for user authentication Update UI layout for improved user experience Fix typo in variable name Optimize database queries for faster 
response time Add error handling for edge cases Update dependencies to latest versions Remove unused code Improve code readability and maintainability * Refactor get_task_dict function to handle nested groups * Add submission file for coco, flickr30k, nocaps, and textcaps tasks * Remove unused files and update task configuration * Fix tasks issue for nocaps, refcoco/+/g * Fix file path and raise error if config file does not exist * Exclude train in refcoco/+/g config * Solve doc_iterator_for_counting crashing issue * Black lint * Refactor code to improve performance and readability * Squashed commit of the following: commit 0df825c9e72a06e6acb4c0bd43c2083ffe8b74c0 Author: JvThunder Date: Fri Jan 26 01:03:57 2024 +0800 change okvqa yaml commit b9d9f9896993033b92346e9f47420c55b866c715 Author: JvThunder Date: Fri Jan 26 00:55:40 2024 +0800 change yaml commit 4256bef410e4c8d8761e0cd0d79ac5e57b97651b Author: JvThunder Date: Fri Jan 26 00:42:43 2024 +0800 add okvqa task commit 18e984cfe173390843c73048a931baa17800f918 Author: Zhang Peiyuan Date: Thu Jan 25 17:08:25 2024 +0800 add model specific prompt and gen kwargs in sqa (#19) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * Squashed commit of the following: commit 0c8a3919885b8fe2880bb2892f7a619d060012d1 Author: JvThunder Date: Fri Jan 26 01:06:02 2024 +0800 change ocr reference commit d2bc7c92ac61179b8c4031e11bc31970355252f6 Author: JvThunder Date: Fri Jan 26 01:05:46 2024 +0800 revert example_eval commit c78fa29cd0d161641ee05db57bd39314b998c8c7 Author: JvThunder Date: Fri Jan 26 00:17:28 2024 +0800 edit vizwiz utils commit 397f0906968fd8ba04b883469b96217737c43e09 Author: JvThunder Date: Thu Jan 25 23:49:47 2024 +0800 reorganize __init__ commit 52a7ea6c7599adeec2ac2787f500e215ce47cf79 Author: JvThunder Date: Thu Jan 25 23:46:20 2024 +0800 minor fixes commit f706b2aaf9b288c582611191a1841b58feaeb741 
Author: JvThunder Date: Thu Jan 25 17:41:03 2024 +0800 add vizwizvqa eval task commit 18e984cfe173390843c73048a931baa17800f918 Author: Zhang Peiyuan Date: Thu Jan 25 17:08:25 2024 +0800 add model specific prompt and gen kwargs in sqa (#19) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to support multi-model eval * print table at the end * refactor multi model code * Refactor mathvista.yaml and utils.py * Add gpt_eval_score to mathvista_process_results * Refactor mathvista_aggregate_results to return average accuracy score * Fix refcoco evaluation error * Fix evaluation problem for refcoco+/g * Refactor mathvista.yaml and mathvista_evals.py * Add dependencies and update YAML files * Refactor mmbench_en/utils.py to save test results to separate Excel file * Fix caption task prompt * Add group field to mmbench_en_test and mmbench_en_val yaml files * Delete mmbench_en_val.yaml file * Update mmbench_cn.yaml and mmbench_cn_test.yaml * Update mmbench_cn_val.yaml and utils.py * Remove unused fields in mmbench_cn_cc_process_results function * Update aggregation function for mmbench_en_dev.yaml * Fix capitalization of L2-category key in utils.py * Fix variable name in mmbench_process_results function * Delete mmbench_cn_val.yaml file * Update mathvista_test.yaml and mathvista_testmini.yaml * Fix warnings and update mathvista.yaml * Remove system message from MathVistaEvaluator * Update GPT model version in MathVistaEvaluator constructor * Update GQA_RAW_IMAGE_DATASET path in utils.py * change vizwiz to test set * Add split flag to mathvista_aggregate_results function * Add higher_is_better: false to gpt_eval_info metric in d170_cn, d170_en, dc100_en, and dc200_cn yaml files * Update lmms_eval/evaluator.py and lmms_eval/tasks/vizwizvqa/utils.py * vizwiz-val * Update utils.py * Update vizwizvqa.yaml --------- Co-authored-by: Bo Li Co-authored-by: kcz358 <92624596+kcz358@users.noreply.github.com> * [Datasets] check vizwiz, vqav2
validation set and refcoco's naming conventions. (#26) * Update submission file naming and directory structure * Update task names in refcoco+ and refcocog * Merge commit '0265096d5dfda7ece8f1b95f4e4632417cfe775e' * [Datasets] check vizwiz, vqav2 validation set and refcoco's naming conventions. (#26) * Update submission file naming and directory structure * Update task names in refcoco+ and refcocog * Merge commit 'eac8adc657e2925cdca383e9c8e098e62ced4093' * Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * [Dataset Check] dataset check and add wandb logging (#29) * Remove unused code and configuration file * Remove docvqa.yaml and update vizwizvqa.yaml * lint * Add dataset_kwargs to vizwizvqa.yaml * Add dataset_kwargs to vizwizvqa.yaml * textvqa (#27) * Update textvqa.yaml and utils.py * Fix YAML formatting in textvqa.yaml and remove unused files * remove useless matric * add textvqa val & test * Update progress bar description in evaluator.py * Update submission file names in VizWizVQA tasks * Update output path to include log samples suffix * Update submission file paths in OKVQA and VizWizVQA tasks * Refactor llava-in-the-wild.yaml and utils.py * Update metric for llava evaluation * Refactor logging message in Task class * Merge commit '0a403e6f5e17c70a50983c83a132edf0fdcd98de' * Fix formatting issues and add progress bar closing 
statements * Update task from "infovqa_val" to "infovqa_test" in infovqa_test.yaml * Update tqdm progress bar in OtterHD model * Squashed commit of the following: commit eae210c3700a59b7d5cc9de46fcb855f443096aa Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 18e4a19e82357352ab25df77b5ae4f1b011d61ae Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit e899be48f55f95172fdf96bd2a98d3b91ff2aaed Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit a999fc6889c6986c28ec5d95460a4ab5233e5d4f Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit 0a403e6f5e17c70a50983c83a132edf0fdcd98de Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * Fix error handling in loading YAML config files * Squashed commit of the following: commit fdb0c6785b0c5d6979d10e7ddf75ce9055038db8 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 12:41:40 2024 +0800 Fix key bugs commit eae210c3700a59b7d5cc9de46fcb855f443096aa Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 18e4a19e82357352ab25df77b5ae4f1b011d61ae Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit e899be48f55f95172fdf96bd2a98d3b91ff2aaed Author: kcz358 
<92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit a999fc6889c6986c28ec5d95460a4ab5233e5d4f Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit 0a403e6f5e17c70a50983c83a132edf0fdcd98de Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * List task #num sorted * Update prompt messages for image-related tasks * Delete unused task configuration files * Remove coco_train.yaml configuration file * Update task name in mmmu.yaml * Fix error message for missing tasks * Add wandb import and integration --------- Co-authored-by: Fanyi Pu Co-authored-by: kcz358 <92624596+kcz358@users.noreply.github.com> * [Dataset Check] dataset check and add wandb logging (#29) * Remove unused code and configuration file * Remove docvqa.yaml and update vizwizvqa.yaml * lint * Add dataset_kwargs to vizwizvqa.yaml * Add dataset_kwargs to vizwizvqa.yaml * textvqa (#27) * Update textvqa.yaml and utils.py * Fix YAML formatting in textvqa.yaml and remove unused files * remove useless matric * add textvqa val & test * Update progress bar description in evaluator.py * Update submission file names in VizWizVQA tasks * Update output path to include log samples suffix * Update submission file paths in OKVQA and VizWizVQA tasks * Refactor llava-in-the-wild.yaml and utils.py * Update metric for llava evaluation * Refactor logging message in Task class * Merge commit 'f80465fd0f30781c8c36b46c1d6d7bba751f9e33' * Fix formatting issues and add progress bar closing statements * Update task from "infovqa_val" to 
"infovqa_test" in infovqa_test.yaml * Update tqdm progress bar in OtterHD model * Squashed commit of the following: commit af6c7a2b8c2959495dc351e6f6eb2a442efe4e94 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 26da729c40008f72ce3f10c932874f120f290e26 Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit acbb1a1997c5159709e3b81c3f0292b2f9def109 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit b33ac32f0ff28777204eaaf27a963200024081df Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit f80465fd0f30781c8c36b46c1d6d7bba751f9e33 Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * Fix error handling in loading YAML config files * Squashed commit of the following: commit 2df0ce76ef836be1cb8ffbf3c854fe05563647b0 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 12:41:40 2024 +0800 Fix key bugs commit af6c7a2b8c2959495dc351e6f6eb2a442efe4e94 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 26da729c40008f72ce3f10c932874f120f290e26 Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit acbb1a1997c5159709e3b81c3f0292b2f9def109 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: 
Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit b33ac32f0ff28777204eaaf27a963200024081df Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit f80465fd0f30781c8c36b46c1d6d7bba751f9e33 Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * List task #num sorted * Update prompt messages for image-related tasks * Delete unused task configuration files * Remove coco_train.yaml configuration file * Update task name in mmmu.yaml * Fix error message for missing tasks * Add wandb import and integration --------- Co-authored-by: Fanyi Pu Co-authored-by: kcz358 <92624596+kcz358@users.noreply.github.com> * Add/mmmu test (#30) * mmmu_test * black * Add/mmmu test (#30) * mmmu_test * black * [Fix] InfoVQA, WandB logging, CLI problems. 
(#31) * Remove unused code and configuration file * Remove docvqa.yaml and update vizwizvqa.yaml * lint * Add dataset_kwargs to vizwizvqa.yaml * Add dataset_kwargs to vizwizvqa.yaml * textvqa (#27) * Update textvqa.yaml and utils.py * Fix YAML formatting in textvqa.yaml and remove unused files * remove useless matric * add textvqa val & test * Update progress bar description in evaluator.py * Update submission file names in VizWizVQA tasks * Update output path to include log samples suffix * Update submission file paths in OKVQA and VizWizVQA tasks * Refactor llava-in-the-wild.yaml and utils.py * Update metric for llava evaluation * Refactor logging message in Task class * Merge commit '0a403e6f5e17c70a50983c83a132edf0fdcd98de' * Fix formatting issues and add progress bar closing statements * Update task from "infovqa_val" to "infovqa_test" in infovqa_test.yaml * Update tqdm progress bar in OtterHD model * Squashed commit of the following: commit eae210c3700a59b7d5cc9de46fcb855f443096aa Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 18e4a19e82357352ab25df77b5ae4f1b011d61ae Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit e899be48f55f95172fdf96bd2a98d3b91ff2aaed Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit a999fc6889c6986c28ec5d95460a4ab5233e5d4f Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit 0a403e6f5e17c70a50983c83a132edf0fdcd98de Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * 
black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * Fix error handling in loading YAML config files * Squashed commit of the following: commit fdb0c6785b0c5d6979d10e7ddf75ce9055038db8 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 12:41:40 2024 +0800 Fix key bugs commit eae210c3700a59b7d5cc9de46fcb855f443096aa Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 18e4a19e82357352ab25df77b5ae4f1b011d61ae Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit e899be48f55f95172fdf96bd2a98d3b91ff2aaed Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit a999fc6889c6986c28ec5d95460a4ab5233e5d4f Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit 0a403e6f5e17c70a50983c83a132edf0fdcd98de Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * List task #num sorted * Update prompt messages for image-related tasks * Delete unused task configuration files * Remove coco_train.yaml configuration file * Update task name in mmmu.yaml * Fix error message for missing tasks * Add wandb import and integration * Update generation kwargs for LMMS tasks * Update lmms_eval MME task configuration and utils * Update generation_kwargs in lmms_eval tasks * Update doc_to_text function in coco and okvqa tasks * Add COCO 2017 
version * Update task name in coco_test2017.yaml * Squashed commit of the following: commit 1e2ae936c90a15d684926e43a38aac86935f38c5 Author: Zhang Peiyuan Date: Mon Jan 29 22:41:33 2024 +0800 Add/mmmu test (#30) * mmmu_test * black commit 10bbaf01c0a4164b6f1d2628367befccf8f39c24 Author: Li Bo Date: Sun Jan 28 22:19:13 2024 +0800 [Dataset Check] dataset check and add wandb logging (#29) * Remove unused code and configuration file * Remove docvqa.yaml and update vizwizvqa.yaml * lint * Add dataset_kwargs to vizwizvqa.yaml * Add dataset_kwargs to vizwizvqa.yaml * textvqa (#27) * Update textvqa.yaml and utils.py * Fix YAML formatting in textvqa.yaml and remove unused files * remove useless matric * add textvqa val & test * Update progress bar description in evaluator.py * Update submission file names in VizWizVQA tasks * Update output path to include log samples suffix * Update submission file paths in OKVQA and VizWizVQA tasks * Refactor llava-in-the-wild.yaml and utils.py * Update metric for llava evaluation * Refactor logging message in Task class * Merge commit '0a403e6f5e17c70a50983c83a132edf0fdcd98de' * Fix formatting issues and add progress bar closing statements * Update task from "infovqa_val" to "infovqa_test" in infovqa_test.yaml * Update tqdm progress bar in OtterHD model * Squashed commit of the following: commit eae210c3700a59b7d5cc9de46fcb855f443096aa Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 18e4a19e82357352ab25df77b5ae4f1b011d61ae Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit e899be48f55f95172fdf96bd2a98d3b91ff2aaed Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit a999fc6889c6986c28ec5d95460a4ab5233e5d4f Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 
09:41:32 2024 +0800 Exclude train yaml file in the task list commit 0a403e6f5e17c70a50983c83a132edf0fdcd98de Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * Fix error handling in loading YAML config files * Squashed commit of the following: commit fdb0c6785b0c5d6979d10e7ddf75ce9055038db8 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 12:41:40 2024 +0800 Fix key bugs commit eae210c3700a59b7d5cc9de46fcb855f443096aa Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 18e4a19e82357352ab25df77b5ae4f1b011d61ae Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit e899be48f55f95172fdf96bd2a98d3b91ff2aaed Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit a999fc6889c6986c28ec5d95460a4ab5233e5d4f Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit 0a403e6f5e17c70a50983c83a132edf0fdcd98de Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * List task #num sorted * Update prompt messages for image-related tasks * Delete 
unused task configuration files * Remove coco_train.yaml configuration file * Update task name in mmmu.yaml * Fix error message for missing tasks * Add wandb import and integration --------- Co-authored-by: Fanyi Pu Co-authored-by: kcz358 <92624596+kcz358@users.noreply.github.com> * Refactor CLI evaluate function and improve error logging --------- Co-authored-by: Fanyi Pu Co-authored-by: kcz358 <92624596+kcz358@users.noreply.github.com> * [Fix] InfoVQA, WandB logging, CLI problems. (#31) * Remove unused code and configuration file * Remove docvqa.yaml and update vizwizvqa.yaml * lint * Add dataset_kwargs to vizwizvqa.yaml * Add dataset_kwargs to vizwizvqa.yaml * textvqa (#27) * Update textvqa.yaml and utils.py * Fix YAML formatting in textvqa.yaml and remove unused files * remove useless matric * add textvqa val & test * Update progress bar description in evaluator.py * Update submission file names in VizWizVQA tasks * Update output path to include log samples suffix * Update submission file paths in OKVQA and VizWizVQA tasks * Refactor llava-in-the-wild.yaml and utils.py * Update metric for llava evaluation * Refactor logging message in Task class * Merge commit 'f80465fd0f30781c8c36b46c1d6d7bba751f9e33' * Fix formatting issues and add progress bar closing statements * Update task from "infovqa_val" to "infovqa_test" in infovqa_test.yaml * Update tqdm progress bar in OtterHD model * Squashed commit of the following: commit af6c7a2b8c2959495dc351e6f6eb2a442efe4e94 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 26da729c40008f72ce3f10c932874f120f290e26 Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit acbb1a1997c5159709e3b81c3f0292b2f9def109 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit 
b33ac32f0ff28777204eaaf27a963200024081df Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit f80465fd0f30781c8c36b46c1d6d7bba751f9e33 Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * Fix error handling in loading YAML config files * Squashed commit of the following: commit 2df0ce76ef836be1cb8ffbf3c854fe05563647b0 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 12:41:40 2024 +0800 Fix key bugs commit af6c7a2b8c2959495dc351e6f6eb2a442efe4e94 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 26da729c40008f72ce3f10c932874f120f290e26 Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit acbb1a1997c5159709e3b81c3f0292b2f9def109 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit b33ac32f0ff28777204eaaf27a963200024081df Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit f80465fd0f30781c8c36b46c1d6d7bba751f9e33 Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add 
qwenvl * add infovqa and docvqa * List task #num sorted * Update prompt messages for image-related tasks * Delete unused task configuration files * Remove coco_train.yaml configuration file * Update task name in mmmu.yaml * Fix error message for missing tasks * Add wandb import and integration * Update generation kwargs for LMMS tasks * Update lmms_eval MME task configuration and utils * Update generation_kwargs in lmms_eval tasks * Update doc_to_text function in coco and okvqa tasks * Add COCO 2017 version * Update task name in coco_test2017.yaml * Squashed commit of the following: commit 0390783595c41232352599ab78fbe5949615e982 Author: Zhang Peiyuan Date: Mon Jan 29 22:41:33 2024 +0800 Add/mmmu test (#30) * mmmu_test * black commit 407bc2500c162d8949fbaae3d11d522afd2c9f28 Author: Li Bo Date: Sun Jan 28 22:19:13 2024 +0800 [Dataset Check] dataset check and add wandb logging (#29) * Remove unused code and configuration file * Remove docvqa.yaml and update vizwizvqa.yaml * lint * Add dataset_kwargs to vizwizvqa.yaml * Add dataset_kwargs to vizwizvqa.yaml * textvqa (#27) * Update textvqa.yaml and utils.py * Fix YAML formatting in textvqa.yaml and remove unused files * remove useless matric * add textvqa val & test * Update progress bar description in evaluator.py * Update submission file names in VizWizVQA tasks * Update output path to include log samples suffix * Update submission file paths in OKVQA and VizWizVQA tasks * Refactor llava-in-the-wild.yaml and utils.py * Update metric for llava evaluation * Refactor logging message in Task class * Merge commit 'f80465fd0f30781c8c36b46c1d6d7bba751f9e33' * Fix formatting issues and add progress bar closing statements * Update task from "infovqa_val" to "infovqa_test" in infovqa_test.yaml * Update tqdm progress bar in OtterHD model * Squashed commit of the following: commit af6c7a2b8c2959495dc351e6f6eb2a442efe4e94 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint 
commit 26da729c40008f72ce3f10c932874f120f290e26 Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit acbb1a1997c5159709e3b81c3f0292b2f9def109 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit b33ac32f0ff28777204eaaf27a963200024081df Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit f80465fd0f30781c8c36b46c1d6d7bba751f9e33 Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * Fix error handling in loading YAML config files * Squashed commit of the following: commit 2df0ce76ef836be1cb8ffbf3c854fe05563647b0 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 12:41:40 2024 +0800 Fix key bugs commit af6c7a2b8c2959495dc351e6f6eb2a442efe4e94 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 26da729c40008f72ce3f10c932874f120f290e26 Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit acbb1a1997c5159709e3b81c3f0292b2f9def109 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit b33ac32f0ff28777204eaaf27a963200024081df Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit 
f80465fd0f30781c8c36b46c1d6d7bba751f9e33 Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * List task #num sorted * Update prompt messages for image-related tasks * Delete unused task configuration files * Remove coco_train.yaml configuration file * Update task name in mmmu.yaml * Fix error message for missing tasks * Add wandb import and integration --------- Co-authored-by: Fanyi Pu Co-authored-by: kcz358 <92624596+kcz358@users.noreply.github.com> * Refactor CLI evaluate function and improve error logging --------- Co-authored-by: Fanyi Pu Co-authored-by: kcz358 <92624596+kcz358@users.noreply.github.com> * scienceqa for full set (#32) * Remove unused code and configuration file * Remove docvqa.yaml and update vizwizvqa.yaml * lint * Add dataset_kwargs to vizwizvqa.yaml * Add dataset_kwargs to vizwizvqa.yaml * textvqa (#27) * Update textvqa.yaml and utils.py * Fix YAML formatting in textvqa.yaml and remove unused files * remove useless matric * add textvqa val & test * Update progress bar description in evaluator.py * Update submission file names in VizWizVQA tasks * Update output path to include log samples suffix * Update submission file paths in OKVQA and VizWizVQA tasks * Refactor llava-in-the-wild.yaml and utils.py * Update metric for llava evaluation * Refactor logging message in Task class * Merge commit '0a403e6f5e17c70a50983c83a132edf0fdcd98de' * Fix formatting issues and add progress bar closing statements * Update task from "infovqa_val" to "infovqa_test" in infovqa_test.yaml * Update tqdm progress bar in OtterHD model * Squashed commit of the following: commit eae210c3700a59b7d5cc9de46fcb855f443096aa Author: 
kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 18e4a19e82357352ab25df77b5ae4f1b011d61ae Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit e899be48f55f95172fdf96bd2a98d3b91ff2aaed Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit a999fc6889c6986c28ec5d95460a4ab5233e5d4f Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit 0a403e6f5e17c70a50983c83a132edf0fdcd98de Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * Fix error handling in loading YAML config files * Squashed commit of the following: commit fdb0c6785b0c5d6979d10e7ddf75ce9055038db8 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 12:41:40 2024 +0800 Fix key bugs commit eae210c3700a59b7d5cc9de46fcb855f443096aa Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 18e4a19e82357352ab25df77b5ae4f1b011d61ae Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit e899be48f55f95172fdf96bd2a98d3b91ff2aaed Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit a999fc6889c6986c28ec5d95460a4ab5233e5d4f Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun 
Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit 0a403e6f5e17c70a50983c83a132edf0fdcd98de Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * List task #num sorted * Update prompt messages for image-related tasks * Delete unused task configuration files * Remove coco_train.yaml configuration file * Update task name in mmmu.yaml * Fix error message for missing tasks * Add wandb import and integration * Update generation kwargs for LMMS tasks * Update lmms_eval MME task configuration and utils * Update generation_kwargs in lmms_eval tasks * Update doc_to_text function in coco and okvqa tasks * Add COCO 2017 version * Update task name in coco_test2017.yaml * Squashed commit of the following: commit 1e2ae936c90a15d684926e43a38aac86935f38c5 Author: Zhang Peiyuan Date: Mon Jan 29 22:41:33 2024 +0800 Add/mmmu test (#30) * mmmu_test * black commit 10bbaf01c0a4164b6f1d2628367befccf8f39c24 Author: Li Bo Date: Sun Jan 28 22:19:13 2024 +0800 [Dataset Check] dataset check and add wandb logging (#29) * Remove unused code and configuration file * Remove docvqa.yaml and update vizwizvqa.yaml * lint * Add dataset_kwargs to vizwizvqa.yaml * Add dataset_kwargs to vizwizvqa.yaml * textvqa (#27) * Update textvqa.yaml and utils.py * Fix YAML formatting in textvqa.yaml and remove unused files * remove useless matric * add textvqa val & test * Update progress bar description in evaluator.py * Update submission file names in VizWizVQA tasks * Update output path to include log samples suffix * Update submission file paths in OKVQA and VizWizVQA tasks * Refactor llava-in-the-wild.yaml and utils.py * Update metric for llava 
evaluation * Refactor logging message in Task class * Merge commit '0a403e6f5e17c70a50983c83a132edf0fdcd98de' * Fix formatting issues and add progress bar closing statements * Update task from "infovqa_val" to "infovqa_test" in infovqa_test.yaml * Update tqdm progress bar in OtterHD model * Squashed commit of the following: commit eae210c3700a59b7d5cc9de46fcb855f443096aa Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 18e4a19e82357352ab25df77b5ae4f1b011d61ae Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit e899be48f55f95172fdf96bd2a98d3b91ff2aaed Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit a999fc6889c6986c28ec5d95460a4ab5233e5d4f Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit 0a403e6f5e17c70a50983c83a132edf0fdcd98de Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * Fix error handling in loading YAML config files * Squashed commit of the following: commit fdb0c6785b0c5d6979d10e7ddf75ce9055038db8 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 12:41:40 2024 +0800 Fix key bugs commit eae210c3700a59b7d5cc9de46fcb855f443096aa Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 18e4a19e82357352ab25df77b5ae4f1b011d61ae Merge: ab898e4 ad8d9da Author: kcz358 
<92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit e899be48f55f95172fdf96bd2a98d3b91ff2aaed Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit a999fc6889c6986c28ec5d95460a4ab5233e5d4f Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit 0a403e6f5e17c70a50983c83a132edf0fdcd98de Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * List task #num sorted * Update prompt messages for image-related tasks * Delete unused task configuration files * Remove coco_train.yaml configuration file * Update task name in mmmu.yaml * Fix error message for missing tasks * Add wandb import and integration --------- Co-authored-by: Fanyi Pu Co-authored-by: kcz358 <92624596+kcz358@users.noreply.github.com> * Remove scienceqa_img task configuration * eval scienceqa with no images --------- Co-authored-by: Bo Li Co-authored-by: kcz358 <92624596+kcz358@users.noreply.github.com> * scienceqa for full set (#32) * Remove unused code and configuration file * Remove docvqa.yaml and update vizwizvqa.yaml * lint * Add dataset_kwargs to vizwizvqa.yaml * Add dataset_kwargs to vizwizvqa.yaml * textvqa (#27) * Update textvqa.yaml and utils.py * Fix YAML formatting in textvqa.yaml and remove unused files * remove useless matric * add textvqa val & test * Update progress bar description in evaluator.py * Update submission file names in VizWizVQA tasks * Update output path to include log samples suffix * Update submission 
file paths in OKVQA and VizWizVQA tasks * Refactor llava-in-the-wild.yaml and utils.py * Update metric for llava evaluation * Refactor logging message in Task class * Merge commit 'f80465fd0f30781c8c36b46c1d6d7bba751f9e33' * Fix formatting issues and add progress bar closing statements * Update task from "infovqa_val" to "infovqa_test" in infovqa_test.yaml * Update tqdm progress bar in OtterHD model * Squashed commit of the following: commit af6c7a2b8c2959495dc351e6f6eb2a442efe4e94 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 26da729c40008f72ce3f10c932874f120f290e26 Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit acbb1a1997c5159709e3b81c3f0292b2f9def109 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit b33ac32f0ff28777204eaaf27a963200024081df Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit f80465fd0f30781c8c36b46c1d6d7bba751f9e33 Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * Fix error handling in loading YAML config files * Squashed commit of the following: commit 2df0ce76ef836be1cb8ffbf3c854fe05563647b0 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 12:41:40 2024 +0800 Fix key bugs commit af6c7a2b8c2959495dc351e6f6eb2a442efe4e94 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint 
commit 26da729c40008f72ce3f10c932874f120f290e26 Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit acbb1a1997c5159709e3b81c3f0292b2f9def109 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit b33ac32f0ff28777204eaaf27a963200024081df Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit f80465fd0f30781c8c36b46c1d6d7bba751f9e33 Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * List task #num sorted * Update prompt messages for image-related tasks * Delete unused task configuration files * Remove coco_train.yaml configuration file * Update task name in mmmu.yaml * Fix error message for missing tasks * Add wandb import and integration * Update generation kwargs for LMMS tasks * Update lmms_eval MME task configuration and utils * Update generation_kwargs in lmms_eval tasks * Update doc_to_text function in coco and okvqa tasks * Add COCO 2017 version * Update task name in coco_test2017.yaml * Squashed commit of the following: commit 0390783595c41232352599ab78fbe5949615e982 Author: Zhang Peiyuan Date: Mon Jan 29 22:41:33 2024 +0800 Add/mmmu test (#30) * mmmu_test * black commit 407bc2500c162d8949fbaae3d11d522afd2c9f28 Author: Li Bo Date: Sun Jan 28 22:19:13 2024 +0800 [Dataset Check] dataset check and add wandb logging (#29) * Remove unused code and configuration file * Remove docvqa.yaml and update vizwizvqa.yaml * lint * Add dataset_kwargs to 
vizwizvqa.yaml * Add dataset_kwargs to vizwizvqa.yaml * textvqa (#27) * Update textvqa.yaml and utils.py * Fix YAML formatting in textvqa.yaml and remove unused files * remove useless matric * add textvqa val & test * Update progress bar description in evaluator.py * Update submission file names in VizWizVQA tasks * Update output path to include log samples suffix * Update submission file paths in OKVQA and VizWizVQA tasks * Refactor llava-in-the-wild.yaml and utils.py * Update metric for llava evaluation * Refactor logging message in Task class * Merge commit 'f80465fd0f30781c8c36b46c1d6d7bba751f9e33' * Fix formatting issues and add progress bar closing statements * Update task from "infovqa_val" to "infovqa_test" in infovqa_test.yaml * Update tqdm progress bar in OtterHD model * Squashed commit of the following: commit af6c7a2b8c2959495dc351e6f6eb2a442efe4e94 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 26da729c40008f72ce3f10c932874f120f290e26 Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit acbb1a1997c5159709e3b81c3f0292b2f9def109 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit b33ac32f0ff28777204eaaf27a963200024081df Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit f80465fd0f30781c8c36b46c1d6d7bba751f9e33 Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * Fix 
error handling in loading YAML config files * Squashed commit of the following: commit 2df0ce76ef836be1cb8ffbf3c854fe05563647b0 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 12:41:40 2024 +0800 Fix key bugs commit af6c7a2b8c2959495dc351e6f6eb2a442efe4e94 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 26da729c40008f72ce3f10c932874f120f290e26 Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit acbb1a1997c5159709e3b81c3f0292b2f9def109 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit b33ac32f0ff28777204eaaf27a963200024081df Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit f80465fd0f30781c8c36b46c1d6d7bba751f9e33 Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * List task #num sorted * Update prompt messages for image-related tasks * Delete unused task configuration files * Remove coco_train.yaml configuration file * Update task name in mmmu.yaml * Fix error message for missing tasks * Add wandb import and integration --------- Co-authored-by: Fanyi Pu Co-authored-by: kcz358 <92624596+kcz358@users.noreply.github.com> * Remove scienceqa_img task configuration * eval scienceqa with no images --------- Co-authored-by: Bo Li Co-authored-by: kcz358 <92624596+kcz358@users.noreply.github.com> * [Datasets & Models] Fuyu, HalluBench (w/Kaichen, commit 
96d95b3) (#33) * add fuyu * Merge commit '0f183a394426d3bf88884b4e2258ab53406bc705' * Squashed commit of the following: commit b81ed2ce4d0e226df7a41bddd82fe1f9d46a27fc Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Tue Jan 30 19:39:57 2024 +0800 Add hallu bench commit 0f183a394426d3bf88884b4e2258ab53406bc705 Author: Pu Fanyi Date: Tue Jan 30 14:52:51 2024 +0800 scienceqa for full set (#32) * Remove unused code and configuration file * Remove docvqa.yaml and update vizwizvqa.yaml * lint * Add dataset_kwargs to vizwizvqa.yaml * Add dataset_kwargs to vizwizvqa.yaml * textvqa (#27) * Update textvqa.yaml and utils.py * Fix YAML formatting in textvqa.yaml and remove unused files * remove useless matric * add textvqa val & test * Update progress bar description in evaluator.py * Update submission file names in VizWizVQA tasks * Update output path to include log samples suffix * Update submission file paths in OKVQA and VizWizVQA tasks * Refactor llava-in-the-wild.yaml and utils.py * Update metric for llava evaluation * Refactor logging message in Task class * Merge commit '0a403e6f5e17c70a50983c83a132edf0fdcd98de' * Fix formatting issues and add progress bar closing statements * Update task from "infovqa_val" to "infovqa_test" in infovqa_test.yaml * Update tqdm progress bar in OtterHD model * Squashed commit of the following: commit eae210c3700a59b7d5cc9de46fcb855f443096aa Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 18e4a19e82357352ab25df77b5ae4f1b011d61ae Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit e899be48f55f95172fdf96bd2a98d3b91ff2aaed Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit a999fc6889c6986c28ec5d95460a4ab5233e5d4f Author: kcz358 <92624596+kcz358@users.noreply.github.com> 
Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit 0a403e6f5e17c70a50983c83a132edf0fdcd98de Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * Fix error handling in loading YAML config files * Squashed commit of the following: commit fdb0c6785b0c5d6979d10e7ddf75ce9055038db8 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 12:41:40 2024 +0800 Fix key bugs commit eae210c3700a59b7d5cc9de46fcb855f443096aa Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 18e4a19e82357352ab25df77b5ae4f1b011d61ae Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit e899be48f55f95172fdf96bd2a98d3b91ff2aaed Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit a999fc6889c6986c28ec5d95460a4ab5233e5d4f Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit 0a403e6f5e17c70a50983c83a132edf0fdcd98de Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * List task #num sorted * Update prompt messages for image-related 
tasks * Delete unused task configuration files * Remove coco_train.yaml configuration file * Update task name in mmmu.yaml * Fix error message for missing tasks * Add wandb import and integration * Update generation kwargs for LMMS tasks * Update lmms_eval MME task configuration and utils * Update generation_kwargs in lmms_eval tasks * Update doc_to_text function in coco and okvqa tasks * Add COCO 2017 version * Update task name in coco_test2017.yaml * Squashed commit of the following: commit 1e2ae936c90a15d684926e43a38aac86935f38c5 Author: Zhang Peiyuan Date: Mon Jan 29 22:41:33 2024 +0800 Add/mmmu test (#30) * mmmu_test * black commit 10bbaf01c0a4164b6f1d2628367befccf8f39c24 Author: Li Bo Date: Sun Jan 28 22:19:13 2024 +0800 [Dataset Check] dataset check and add wandb logging (#29) * Remove unused code and configuration file * Remove docvqa.yaml and update vizwizvqa.yaml * lint * Add dataset_kwargs to vizwizvqa.yaml * Add dataset_kwargs to vizwizvqa.yaml * textvqa (#27) * Update textvqa.yaml and utils.py * Fix YAML formatting in textvqa.yaml and remove unused files * remove useless matric * add textvqa val & test * Update progress bar description in evaluator.py * Update submission file names in VizWizVQA tasks * Update output path to include log samples suffix * Update submission file paths in OKVQA and VizWizVQA tasks * Refactor llava-in-the-wild.yaml and utils.py * Update metric for llava evaluation * Refactor logging message in Task class * Merge commit '0a403e6f5e17c70a50983c83a132edf0fdcd98de' * Fix formatting issues and add progress bar closing statements * Update task from "infovqa_val" to "infovqa_test" in infovqa_test.yaml * Update tqdm progress bar in OtterHD model * Squashed commit of the following: commit eae210c3700a59b7d5cc9de46fcb855f443096aa Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 18e4a19e82357352ab25df77b5ae4f1b011d61ae Merge: ab898e4 ad8d9da Author: kcz358 
<92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit e899be48f55f95172fdf96bd2a98d3b91ff2aaed Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit a999fc6889c6986c28ec5d95460a4ab5233e5d4f Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit 0a403e6f5e17c70a50983c83a132edf0fdcd98de Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * Fix error handling in loading YAML config files * Squashed commit of the following: commit fdb0c6785b0c5d6979d10e7ddf75ce9055038db8 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 12:41:40 2024 +0800 Fix key bugs commit eae210c3700a59b7d5cc9de46fcb855f443096aa Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 18e4a19e82357352ab25df77b5ae4f1b011d61ae Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit e899be48f55f95172fdf96bd2a98d3b91ff2aaed Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit a999fc6889c6986c28ec5d95460a4ab5233e5d4f Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit 0a403e6f5e17c70a50983c83a132edf0fdcd98de Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add 
InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * List task #num sorted * Update prompt messages for image-related tasks * Delete unused task configuration files * Remove coco_train.yaml configuration file * Update task name in mmmu.yaml * Fix error message for missing tasks * Add wandb import and integration --------- Co-authored-by: Fanyi Pu Co-authored-by: kcz358 <92624596+kcz358@users.noreply.github.com> * Remove scienceqa_img task configuration * eval scienceqa with no images --------- Co-authored-by: Bo Li Co-authored-by: kcz358 <92624596+kcz358@users.noreply.github.com> * Update hb_doc_to_text function to remove unnecessary line break * Add Fuyu model and update OtterHD model * Refactor model response handling and fix image processing bug * Refactor flatten method to support only getting the first element * Add support for specifying timezone in datetime string Update flatten method in OtterHD class Update get_datetime_str function in utils.py * Fix condition for checking wandb_args_dict in __main__.py * Commented out assertions for batch size in Fuyu model * Add warning message for existing output file * [Datasets & Models] Fuyu, HalluBench (w/Kaichen, commit 96d95b3) (#33) * add fuyu * Merge commit 'c7ffa8dee96e228c6519154d5a00742b35caa3f2' * Squashed commit of the following: commit 96d95b3cb3540cd17bcab31f1a85ad0d04a12f1e Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Tue Jan 30 19:39:57 2024 +0800 Add hallu bench commit c7ffa8dee96e228c6519154d5a00742b35caa3f2 Author: Pu Fanyi Date: Tue Jan 30 14:52:51 2024 +0800 scienceqa for full set (#32) * Remove unused code and configuration file * Remove docvqa.yaml and update vizwizvqa.yaml * lint * Add 
dataset_kwargs to vizwizvqa.yaml * Add dataset_kwargs to vizwizvqa.yaml * textvqa (#27) * Update textvqa.yaml and utils.py * Fix YAML formatting in textvqa.yaml and remove unused files * remove useless matric * add textvqa val & test * Update progress bar description in evaluator.py * Update submission file names in VizWizVQA tasks * Update output path to include log samples suffix * Update submission file paths in OKVQA and VizWizVQA tasks * Refactor llava-in-the-wild.yaml and utils.py * Update metric for llava evaluation * Refactor logging message in Task class * Merge commit 'f80465fd0f30781c8c36b46c1d6d7bba751f9e33' * Fix formatting issues and add progress bar closing statements * Update task from "infovqa_val" to "infovqa_test" in infovqa_test.yaml * Update tqdm progress bar in OtterHD model * Squashed commit of the following: commit af6c7a2b8c2959495dc351e6f6eb2a442efe4e94 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 26da729c40008f72ce3f10c932874f120f290e26 Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit acbb1a1997c5159709e3b81c3f0292b2f9def109 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit b33ac32f0ff28777204eaaf27a963200024081df Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit f80465fd0f30781c8c36b46c1d6d7bba751f9e33 Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa 
and docvqa * Fix error handling in loading YAML config files * Squashed commit of the following: commit 2df0ce76ef836be1cb8ffbf3c854fe05563647b0 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 12:41:40 2024 +0800 Fix key bugs commit af6c7a2b8c2959495dc351e6f6eb2a442efe4e94 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 26da729c40008f72ce3f10c932874f120f290e26 Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit acbb1a1997c5159709e3b81c3f0292b2f9def109 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit b33ac32f0ff28777204eaaf27a963200024081df Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit f80465fd0f30781c8c36b46c1d6d7bba751f9e33 Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * List task #num sorted * Update prompt messages for image-related tasks * Delete unused task configuration files * Remove coco_train.yaml configuration file * Update task name in mmmu.yaml * Fix error message for missing tasks * Add wandb import and integration * Update generation kwargs for LMMS tasks * Update lmms_eval MME task configuration and utils * Update generation_kwargs in lmms_eval tasks * Update doc_to_text function in coco and okvqa tasks * Add COCO 2017 version * Update task name in coco_test2017.yaml * Squashed commit of the following: commit 
0390783595c41232352599ab78fbe5949615e982 Author: Zhang Peiyuan Date: Mon Jan 29 22:41:33 2024 +0800 Add/mmmu test (#30) * mmmu_test * black commit 407bc2500c162d8949fbaae3d11d522afd2c9f28 Author: Li Bo Date: Sun Jan 28 22:19:13 2024 +0800 [Dataset Check] dataset check and add wandb logging (#29) * Remove unused code and configuration file * Remove docvqa.yaml and update vizwizvqa.yaml * lint * Add dataset_kwargs to vizwizvqa.yaml * Add dataset_kwargs to vizwizvqa.yaml * textvqa (#27) * Update textvqa.yaml and utils.py * Fix YAML formatting in textvqa.yaml and remove unused files * remove useless matric * add textvqa val & test * Update progress bar description in evaluator.py * Update submission file names in VizWizVQA tasks * Update output path to include log samples suffix * Update submission file paths in OKVQA and VizWizVQA tasks * Refactor llava-in-the-wild.yaml and utils.py * Update metric for llava evaluation * Refactor logging message in Task class * Merge commit 'f80465fd0f30781c8c36b46c1d6d7bba751f9e33' * Fix formatting issues and add progress bar closing statements * Update task from "infovqa_val" to "infovqa_test" in infovqa_test.yaml * Update tqdm progress bar in OtterHD model * Squashed commit of the following: commit af6c7a2b8c2959495dc351e6f6eb2a442efe4e94 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 26da729c40008f72ce3f10c932874f120f290e26 Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit acbb1a1997c5159709e3b81c3f0292b2f9def109 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit b33ac32f0ff28777204eaaf27a963200024081df Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit 
f80465fd0f30781c8c36b46c1d6d7bba751f9e33 Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * Fix error handling in loading YAML config files * Squashed commit of the following: commit 2df0ce76ef836be1cb8ffbf3c854fe05563647b0 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 12:41:40 2024 +0800 Fix key bugs commit af6c7a2b8c2959495dc351e6f6eb2a442efe4e94 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 26da729c40008f72ce3f10c932874f120f290e26 Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit acbb1a1997c5159709e3b81c3f0292b2f9def109 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit b33ac32f0ff28777204eaaf27a963200024081df Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit f80465fd0f30781c8c36b46c1d6d7bba751f9e33 Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * List task #num sorted * Update prompt messages for image-related tasks * Delete unused task configuration files * Remove coco_train.yaml configuration 
file * Update task name in mmmu.yaml * Fix error message for missing tasks * Add wandb import and integration --------- Co-authored-by: Fanyi Pu Co-authored-by: kcz358 <92624596+kcz358@users.noreply.github.com> * Remove scienceqa_img task configuration * eval scienceqa with no images --------- Co-authored-by: Bo Li Co-authored-by: kcz358 <92624596+kcz358@users.noreply.github.com> * Update hb_doc_to_text function to remove unnecessary line break * Add Fuyu model and update OtterHD model * Refactor model response handling and fix image processing bug * Refactor flatten method to support only getting the first element * Add support for specifying timezone in datetime string Update flatten method in OtterHD class Update get_datetime_str function in utils.py * Fix condition for checking wandb_args_dict in __main__.py * Commented out assertions for batch size in Fuyu model * Add warning message for existing output file * [Datasets] add hallubench (#34) * Add hallu bench * Fix hall_b gpt eval bugs --------- Co-authored-by: kcz358 <92624596+kcz358@users.noreply.github.com> * [Datasets] add hallubench (#34) * Add hallu bench * Fix hall_b gpt eval bugs --------- Co-authored-by: kcz358 <92624596+kcz358@users.noreply.github.com> * [Dataset] fix hallusion benchmark, add saving logic inside aggregate function (#35) * add fuyu * Merge commit '0f183a394426d3bf88884b4e2258ab53406bc705' * Squashed commit of the following: commit b81ed2ce4d0e226df7a41bddd82fe1f9d46a27fc Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Tue Jan 30 19:39:57 2024 +0800 Add hallu bench commit 0f183a394426d3bf88884b4e2258ab53406bc705 Author: Pu Fanyi Date: Tue Jan 30 14:52:51 2024 +0800 scienceqa for full set (#32) * Remove unused code and configuration file * Remove docvqa.yaml and update vizwizvqa.yaml * lint * Add dataset_kwargs to vizwizvqa.yaml * Add dataset_kwargs to vizwizvqa.yaml * textvqa (#27) * Update textvqa.yaml and utils.py * Fix YAML formatting in textvqa.yaml and remove 
unused files * remove useless matric * add textvqa val & test * Update progress bar description in evaluator.py * Update submission file names in VizWizVQA tasks * Update output path to include log samples suffix * Update submission file paths in OKVQA and VizWizVQA tasks * Refactor llava-in-the-wild.yaml and utils.py * Update metric for llava evaluation * Refactor logging message in Task class * Merge commit '0a403e6f5e17c70a50983c83a132edf0fdcd98de' * Fix formatting issues and add progress bar closing statements * Update task from "infovqa_val" to "infovqa_test" in infovqa_test.yaml * Update tqdm progress bar in OtterHD model * Squashed commit of the following: commit eae210c3700a59b7d5cc9de46fcb855f443096aa Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 18e4a19e82357352ab25df77b5ae4f1b011d61ae Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit e899be48f55f95172fdf96bd2a98d3b91ff2aaed Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit a999fc6889c6986c28ec5d95460a4ab5233e5d4f Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit 0a403e6f5e17c70a50983c83a132edf0fdcd98de Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * Fix error handling in loading YAML config files * Squashed commit of the following: commit fdb0c6785b0c5d6979d10e7ddf75ce9055038db8 Author: kcz358 
<92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 12:41:40 2024 +0800 Fix key bugs commit eae210c3700a59b7d5cc9de46fcb855f443096aa Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 18e4a19e82357352ab25df77b5ae4f1b011d61ae Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit e899be48f55f95172fdf96bd2a98d3b91ff2aaed Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit a999fc6889c6986c28ec5d95460a4ab5233e5d4f Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit 0a403e6f5e17c70a50983c83a132edf0fdcd98de Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * List task #num sorted * Update prompt messages for image-related tasks * Delete unused task configuration files * Remove coco_train.yaml configuration file * Update task name in mmmu.yaml * Fix error message for missing tasks * Add wandb import and integration * Update generation kwargs for LMMS tasks * Update lmms_eval MME task configuration and utils * Update generation_kwargs in lmms_eval tasks * Update doc_to_text function in coco and okvqa tasks * Add COCO 2017 version * Update task name in coco_test2017.yaml * Squashed commit of the following: commit 1e2ae936c90a15d684926e43a38aac86935f38c5 Author: Zhang Peiyuan Date: Mon Jan 29 22:41:33 2024 +0800 Add/mmmu test (#30) * mmmu_test * black commit 
10bbaf01c0a4164b6f1d2628367befccf8f39c24 Author: Li Bo Date: Sun Jan 28 22:19:13 2024 +0800 [Dataset Check] dataset check and add wandb logging (#29) * Remove unused code and configuration file * Remove docvqa.yaml and update vizwizvqa.yaml * lint * Add dataset_kwargs to vizwizvqa.yaml * Add dataset_kwargs to vizwizvqa.yaml * textvqa (#27) * Update textvqa.yaml and utils.py * Fix YAML formatting in textvqa.yaml and remove unused files * remove useless matric * add textvqa val & test * Update progress bar description in evaluator.py * Update submission file names in VizWizVQA tasks * Update output path to include log samples suffix * Update submission file paths in OKVQA and VizWizVQA tasks * Refactor llava-in-the-wild.yaml and utils.py * Update metric for llava evaluation * Refactor logging message in Task class * Merge commit '0a403e6f5e17c70a50983c83a132edf0fdcd98de' * Fix formatting issues and add progress bar closing statements * Update task from "infovqa_val" to "infovqa_test" in infovqa_test.yaml * Update tqdm progress bar in OtterHD model * Squashed commit of the following: commit eae210c3700a59b7d5cc9de46fcb855f443096aa Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 18e4a19e82357352ab25df77b5ae4f1b011d61ae Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit e899be48f55f95172fdf96bd2a98d3b91ff2aaed Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit a999fc6889c6986c28ec5d95460a4ab5233e5d4f Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit 0a403e6f5e17c70a50983c83a132edf0fdcd98de Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific 
prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * Fix error handling in loading YAML config files * Squashed commit of the following: commit fdb0c6785b0c5d6979d10e7ddf75ce9055038db8 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 12:41:40 2024 +0800 Fix key bugs commit eae210c3700a59b7d5cc9de46fcb855f443096aa Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 18e4a19e82357352ab25df77b5ae4f1b011d61ae Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit e899be48f55f95172fdf96bd2a98d3b91ff2aaed Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit a999fc6889c6986c28ec5d95460a4ab5233e5d4f Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit 0a403e6f5e17c70a50983c83a132edf0fdcd98de Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * List task #num sorted * Update prompt messages for image-related tasks * Delete unused task configuration files * Remove coco_train.yaml configuration file * Update task name in mmmu.yaml * Fix error message for missing tasks * Add wandb import and integration --------- Co-authored-by: Fanyi Pu Co-authored-by: kcz358 
<92624596+kcz358@users.noreply.github.com> * Remove scienceqa_img task configuration * eval scienceqa with no images --------- Co-authored-by: Bo Li Co-authored-by: kcz358 <92624596+kcz358@users.noreply.github.com> * Update hb_doc_to_text function to remove unnecessary line break * Add Fuyu model and update OtterHD model * Refactor model response handling and fix image processing bug * Refactor flatten method to support only getting the first element * Add support for specifying timezone in datetime string Update flatten method in OtterHD class Update get_datetime_str function in utils.py * Fix condition for checking wandb_args_dict in __main__.py * Commented out assertions for batch size in Fuyu model * Add warning message for existing output file * Fix batch size issue in OtterHD model * Squashed commit of the following: commit 4b604e75cfde49df52e4abd90be4876ed9a1b08f Author: Li Bo Date: Wed Jan 31 16:00:22 2024 +0800 [Datasets] add hallubench (#34) * Add hallu bench * Fix hall_b gpt eval bugs --------- Co-authored-by: kcz358 <92624596+kcz358@users.noreply.github.com> commit 799a6bcb9033656115755c5169f8c342eb927d54 Author: Li Bo Date: Wed Jan 31 14:23:15 2024 +0800 [Datasets & Models] Fuyu, HalluBench (w/Kaichen, commit 96d95b3) (#33) * add fuyu * Merge commit '0f183a394426d3bf88884b4e2258ab53406bc705' * Squashed commit of the following: commit b81ed2ce4d0e226df7a41bddd82fe1f9d46a27fc Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Tue Jan 30 19:39:57 2024 +0800 Add hallu bench commit 0f183a394426d3bf88884b4e2258ab53406bc705 Author: Pu Fanyi Date: Tue Jan 30 14:52:51 2024 +0800 scienceqa for full set (#32) * Remove unused code and configuration file * Remove docvqa.yaml and update vizwizvqa.yaml * lint * Add dataset_kwargs to vizwizvqa.yaml * Add dataset_kwargs to vizwizvqa.yaml * textvqa (#27) * Update textvqa.yaml and utils.py * Fix YAML formatting in textvqa.yaml and remove unused files * remove useless matric * add textvqa val & test * Update 
progress bar description in evaluator.py * Update submission file names in VizWizVQA tasks * Update output path to include log samples suffix * Update submission file paths in OKVQA and VizWizVQA tasks * Refactor llava-in-the-wild.yaml and utils.py * Update metric for llava evaluation * Refactor logging message in Task class * Merge commit '0a403e6f5e17c70a50983c83a132edf0fdcd98de' * Fix formatting issues and add progress bar closing statements * Update task from "infovqa_val" to "infovqa_test" in infovqa_test.yaml * Update tqdm progress bar in OtterHD model * Squashed commit of the following: commit eae210c3700a59b7d5cc9de46fcb855f443096aa Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 18e4a19e82357352ab25df77b5ae4f1b011d61ae Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit e899be48f55f95172fdf96bd2a98d3b91ff2aaed Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit a999fc6889c6986c28ec5d95460a4ab5233e5d4f Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit 0a403e6f5e17c70a50983c83a132edf0fdcd98de Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * Fix error handling in loading YAML config files * Squashed commit of the following: commit fdb0c6785b0c5d6979d10e7ddf75ce9055038db8 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 12:41:40 2024 +0800 Fix 
key bugs commit eae210c3700a59b7d5cc9de46fcb855f443096aa Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 18e4a19e82357352ab25df77b5ae4f1b011d61ae Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit e899be48f55f95172fdf96bd2a98d3b91ff2aaed Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit a999fc6889c6986c28ec5d95460a4ab5233e5d4f Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit 0a403e6f5e17c70a50983c83a132edf0fdcd98de Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * List task #num sorted * Update prompt messages for image-related tasks * Delete unused task configuration files * Remove coco_train.yaml configuration file * Update task name in mmmu.yaml * Fix error message for missing tasks * Add wandb import and integration * Update generation kwargs for LMMS tasks * Update lmms_eval MME task configuration and utils * Update generation_kwargs in lmms_eval tasks * Update doc_to_text function in coco and okvqa tasks * Add COCO 2017 version * Update task name in coco_test2017.yaml * Squashed commit of the following: commit 1e2ae936c90a15d684926e43a38aac86935f38c5 Author: Zhang Peiyuan Date: Mon Jan 29 22:41:33 2024 +0800 Add/mmmu test (#30) * mmmu_test * black commit 10bbaf01c0a4164b6f1d2628367befccf8f39c24 Author: Li Bo Date: Sun Jan 28 22:19:13 2024 +0800 [Dataset Check] 
dataset check and add wandb logging (#29) * Remove unused code and configuration file * Remove docvqa.yaml and update vizwizvqa.yaml * lint * Add dataset_kwargs to vizwizvqa.yaml * Add dataset_kwargs to vizwizvqa.yaml * textvqa (#27) * Update textvqa.yaml and utils.py * Fix YAML formatting in textvqa.yaml and remove unused files * remove useless matric * add textvqa val & test * Update progress bar description in evaluator.py * Update submission file names in VizWizVQA tasks * Update output path to include log samples suffix * Update submission file paths in OKVQA and VizWizVQA tasks * Refactor llava-in-the-wild.yaml and utils.py * Update metric for llava evaluation * Refactor logging message in Task class * Merge commit '0a403e6f5e17c70a50983c83a132edf0fdcd98de' * Fix formatting issues and add progress bar closing statements * Update task from "infovqa_val" to "infovqa_test" in infovqa_test.yaml * Update tqdm progress bar in OtterHD model * Squashed commit of the following: commit eae210c3700a59b7d5cc9de46fcb855f443096aa Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 18e4a19e82357352ab25df77b5ae4f1b011d61ae Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit e899be48f55f95172fdf96bd2a98d3b91ff2aaed Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit a999fc6889c6986c28ec5d95460a4ab5233e5d4f Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit 0a403e6f5e17c70a50983c83a132edf0fdcd98de Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * 
refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * Fix error handling in loading YAML config files * Squashed commit of the following: commit fdb0c6785b0c5d6979d10e7ddf75ce9055038db8 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 12:41:40 2024 +0800 Fix key bugs commit eae210c3700a59b7d5cc9de46fcb855f443096aa Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 18e4a19e82357352ab25df77b5ae4f1b011d61ae Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit e899be48f55f95172fdf96bd2a98d3b91ff2aaed Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit a999fc6889c6986c28ec5d95460a4ab5233e5d4f Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit 0a403e6f5e17c70a50983c83a132edf0fdcd98de Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * List task #num sorted * Update prompt messages for image-related tasks * Delete unused task configuration files * Remove coco_train.yaml configuration file * Update task name in mmmu.yaml * Fix error message for missing tasks * Add wandb import and integration --------- Co-authored-by: Fanyi Pu Co-authored-by: kcz358 <92624596+kcz358@users.noreply.github.com> * Remove scienceqa_img task configuration * eval scienceqa with no 
images --------- Co-authored-by: Bo Li Co-authored-by: kcz358 <92624596+kcz358@users.noreply.github.com> * Update hb_doc_to_text function to remove unnecessary line break * Add Fuyu model and update OtterHD model * Refactor model response handling and fix image processing bug * Refactor flatten method to support only getting the first element * Add support for specifying timezone in datetime string Update flatten method in OtterHD class Update get_datetime_str function in utils.py * Fix condition for checking wandb_args_dict in __main__.py * Commented out assertions for batch size in Fuyu model * Add warning message for existing output file commit 0f183a394426d3bf88884b4e2258ab53406bc705 Author: Pu Fanyi Date: Tue Jan 30 14:52:51 2024 +0800 scienceqa for full set (#32) * Remove unused code and configuration file * Remove docvqa.yaml and update vizwizvqa.yaml * lint * Add dataset_kwargs to vizwizvqa.yaml * Add dataset_kwargs to vizwizvqa.yaml * textvqa (#27) * Update textvqa.yaml and utils.py * Fix YAML formatting in textvqa.yaml and remove unused files * remove useless matric * add textvqa val & test * Update progress bar description in evaluator.py * Update submission file names in VizWizVQA tasks * Update output path to include log samples suffix * Update submission file paths in OKVQA and VizWizVQA tasks * Refactor llava-in-the-wild.yaml and utils.py * Update metric for llava evaluation * Refactor logging message in Task class * Merge commit '0a403e6f5e17c70a50983c83a132edf0fdcd98de' * Fix formatting issues and add progress bar closing statements * Update task from "infovqa_val" to "infovqa_test" in infovqa_test.yaml * Update tqdm progress bar in OtterHD model * Squashed commit of the following: commit eae210c3700a59b7d5cc9de46fcb855f443096aa Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 18e4a19e82357352ab25df77b5ae4f1b011d61ae Merge: ab898e4 ad8d9da Author: kcz358 
<92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit e899be48f55f95172fdf96bd2a98d3b91ff2aaed Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit a999fc6889c6986c28ec5d95460a4ab5233e5d4f Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit 0a403e6f5e17c70a50983c83a132edf0fdcd98de Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * Fix error handling in loading YAML config files * Squashed commit of the following: commit fdb0c6785b0c5d6979d10e7ddf75ce9055038db8 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 12:41:40 2024 +0800 Fix key bugs commit eae210c3700a59b7d5cc9de46fcb855f443096aa Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 18e4a19e82357352ab25df77b5ae4f1b011d61ae Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit e899be48f55f95172fdf96bd2a98d3b91ff2aaed Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit a999fc6889c6986c28ec5d95460a4ab5233e5d4f Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit 0a403e6f5e17c70a50983c83a132edf0fdcd98de Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add 
InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * List task #num sorted * Update prompt messages for image-related tasks * Delete unused task configuration files * Remove coco_train.yaml configuration file * Update task name in mmmu.yaml * Fix error message for missing tasks * Add wandb import and integration * Update generation kwargs for LMMS tasks * Update lmms_eval MME task configuration and utils * Update generation_kwargs in lmms_eval tasks * Update doc_to_text function in coco and okvqa tasks * Add COCO 2017 version * Update task name in coco_test2017.yaml * Squashed commit of the following: commit 1e2ae936c90a15d684926e43a38aac86935f38c5 Author: Zhang Peiyuan Date: Mon Jan 29 22:41:33 2024 +0800 Add/mmmu test (#30) * mmmu_test * black commit 10bbaf01c0a4164b6f1d2628367befccf8f39c24 Author: Li Bo Date: Sun Jan 28 22:19:13 2024 +0800 [Dataset Check] dataset check and add wandb logging (#29) * Remove unused code and configuration file * Remove docvqa.yaml and update vizwizvqa.yaml * lint * Add dataset_kwargs to vizwizvqa.yaml * Add dataset_kwargs to vizwizvqa.yaml * textvqa (#27) * Update textvqa.yaml and utils.py * Fix YAML formatting in textvqa.yaml and remove unused files * remove useless matric * add textvqa val & test * Update progress bar description in evaluator.py * Update submission file names in VizWizVQA tasks * Update output path to include log samples suffix * Update submission file paths in OKVQA and VizWizVQA tasks * Refactor llava-in-the-wild.yaml and utils.py * Update metric for llava evaluation * Refactor logging message in Task class * Merge commit '0a403e6f5e17c70a50983c83a132edf0fdcd98de' * Fix formatting issues and add progress bar closing statements * 
Update task from "infovqa_val" to "infovqa_test" in infovqa_test.yaml * Update tqdm progress bar in OtterHD model * Squashed commit of the following: commit eae210c3700a59b7d5cc9de46fcb855f443096aa Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 18e4a19e82357352ab25df77b5ae4f1b011d61ae Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit e899be48f55f95172fdf96bd2a98d3b91ff2aaed Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit a999fc6889c6986c28ec5d95460a4ab5233e5d4f Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit 0a403e6f5e17c70a50983c83a132edf0fdcd98de Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * Fix error handling in loading YAML config files * Squashed commit of the following: commit fdb0c6785b0c5d6979d10e7ddf75ce9055038db8 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 12:41:40 2024 +0800 Fix key bugs commit eae210c3700a59b7d5cc9de46fcb855f443096aa Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 18e4a19e82357352ab25df77b5ae4f1b011d61ae Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit e899be48f55f95172fdf96bd2a98d3b91ff2aaed Author: kcz358 
<92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit a999fc6889c6986c28ec5d95460a4ab5233e5d4f Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit 0a403e6f5e17c70a50983c83a132edf0fdcd98de Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * List task #num sorted * Update prompt messages for image-related tasks * Delete unused task configuration files * Remove coco_train.yaml configuration file * Update task name in mmmu.yaml * Fix error message for missing tasks * Add wandb import and integration --------- Co-authored-by: Fanyi Pu Co-authored-by: kcz358 <92624596+kcz358@users.noreply.github.com> * Remove scienceqa_img task configuration * eval scienceqa with no images --------- Co-authored-by: Bo Li Co-authored-by: kcz358 <92624596+kcz358@users.noreply.github.com> * Update API configuration and file paths * Refactor evaluate_by_chatgpt function in utils.py * Add hallusion_output_vd_model.json to .gitignore * Add timeout to API request * Refactor file path generation and remove unnecessary suffix in log samples output names * Refactor code and add output path handling * Update lmms-eval API and add new models and datasets * [Dataset] fix hallusion benchmark, add saving logic inside aggregate function (#35) * add fuyu * Merge commit 'c7ffa8dee96e228c6519154d5a00742b35caa3f2' * Squashed commit of the following: commit 96d95b3cb3540cd17bcab31f1a85ad0d04a12f1e Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Tue Jan 30 19:39:57 2024 +0800 Add hallu bench commit 
c7ffa8dee96e228c6519154d5a00742b35caa3f2 Author: Pu Fanyi Date: Tue Jan 30 14:52:51 2024 +0800 scienceqa for full set (#32) * Remove unused code and configuration file * Remove docvqa.yaml and update vizwizvqa.yaml * lint * Add dataset_kwargs to vizwizvqa.yaml * Add dataset_kwargs to vizwizvqa.yaml * textvqa (#27) * Update textvqa.yaml and utils.py * Fix YAML formatting in textvqa.yaml and remove unused files * remove useless matric * add textvqa val & test * Update progress bar description in evaluator.py * Update submission file names in VizWizVQA tasks * Update output path to include log samples suffix * Update submission file paths in OKVQA and VizWizVQA tasks * Refactor llava-in-the-wild.yaml and utils.py * Update metric for llava evaluation * Refactor logging message in Task class * Merge commit 'f80465fd0f30781c8c36b46c1d6d7bba751f9e33' * Fix formatting issues and add progress bar closing statements * Update task from "infovqa_val" to "infovqa_test" in infovqa_test.yaml * Update tqdm progress bar in OtterHD model * Squashed commit of the following: commit af6c7a2b8c2959495dc351e6f6eb2a442efe4e94 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 26da729c40008f72ce3f10c932874f120f290e26 Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit acbb1a1997c5159709e3b81c3f0292b2f9def109 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit b33ac32f0ff28777204eaaf27a963200024081df Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit f80465fd0f30781c8c36b46c1d6d7bba751f9e33 Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * 
black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * Fix error handling in loading YAML config files * Squashed commit of the following: commit 2df0ce76ef836be1cb8ffbf3c854fe05563647b0 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 12:41:40 2024 +0800 Fix key bugs commit af6c7a2b8c2959495dc351e6f6eb2a442efe4e94 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 26da729c40008f72ce3f10c932874f120f290e26 Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit acbb1a1997c5159709e3b81c3f0292b2f9def109 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit b33ac32f0ff28777204eaaf27a963200024081df Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit f80465fd0f30781c8c36b46c1d6d7bba751f9e33 Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * List task #num sorted * Update prompt messages for image-related tasks * Delete unused task configuration files * Remove coco_train.yaml configuration file * Update task name in mmmu.yaml * Fix error message for missing tasks * Add wandb import and integration * Update generation kwargs for LMMS tasks * Update lmms_eval MME task configuration and 
utils * Update generation_kwargs in lmms_eval tasks * Update doc_to_text function in coco and okvqa tasks * Add COCO 2017 version * Update task name in coco_test2017.yaml * Squashed commit of the following: commit 0390783595c41232352599ab78fbe5949615e982 Author: Zhang Peiyuan Date: Mon Jan 29 22:41:33 2024 +0800 Add/mmmu test (#30) * mmmu_test * black commit 407bc2500c162d8949fbaae3d11d522afd2c9f28 Author: Li Bo Date: Sun Jan 28 22:19:13 2024 +0800 [Dataset Check] dataset check and add wandb logging (#29) * Remove unused code and configuration file * Remove docvqa.yaml and update vizwizvqa.yaml * lint * Add dataset_kwargs to vizwizvqa.yaml * Add dataset_kwargs to vizwizvqa.yaml * textvqa (#27) * Update textvqa.yaml and utils.py * Fix YAML formatting in textvqa.yaml and remove unused files * remove useless matric * add textvqa val & test * Update progress bar description in evaluator.py * Update submission file names in VizWizVQA tasks * Update output path to include log samples suffix * Update submission file paths in OKVQA and VizWizVQA tasks * Refactor llava-in-the-wild.yaml and utils.py * Update metric for llava evaluation * Refactor logging message in Task class * Merge commit 'f80465fd0f30781c8c36b46c1d6d7bba751f9e33' * Fix formatting issues and add progress bar closing statements * Update task from "infovqa_val" to "infovqa_test" in infovqa_test.yaml * Update tqdm progress bar in OtterHD model * Squashed commit of the following: commit af6c7a2b8c2959495dc351e6f6eb2a442efe4e94 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 26da729c40008f72ce3f10c932874f120f290e26 Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit acbb1a1997c5159709e3b81c3f0292b2f9def109 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit 
b33ac32f0ff28777204eaaf27a963200024081df Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit f80465fd0f30781c8c36b46c1d6d7bba751f9e33 Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * Fix error handling in loading YAML config files * Squashed commit of the following: commit 2df0ce76ef836be1cb8ffbf3c854fe05563647b0 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 12:41:40 2024 +0800 Fix key bugs commit af6c7a2b8c2959495dc351e6f6eb2a442efe4e94 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 26da729c40008f72ce3f10c932874f120f290e26 Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit acbb1a1997c5159709e3b81c3f0292b2f9def109 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit b33ac32f0ff28777204eaaf27a963200024081df Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit f80465fd0f30781c8c36b46c1d6d7bba751f9e33 Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add 
qwenvl * add infovqa and docvqa * List task #num sorted * Update prompt messages for image-related tasks * Delete unused task configuration files * Remove coco_train.yaml configuration file * Update task name in mmmu.yaml * Fix error message for missing tasks * Add wandb import and integration --------- Co-authored-by: Fanyi Pu Co-authored-by: kcz358 <92624596+kcz358@users.noreply.github.com> * Remove scienceqa_img task configuration * eval scienceqa with no images --------- Co-authored-by: Bo Li Co-authored-by: kcz358 <92624596+kcz358@users.noreply.github.com> * Update hb_doc_to_text function to remove unnecessary line break * Add Fuyu model and update OtterHD model * Refactor model response handling and fix image processing bug * Refactor flatten method to support only getting the first element * Add support for specifying timezone in datetime string Update flatten method in OtterHD class Update get_datetime_str function in utils.py * Fix condition for checking wandb_args_dict in __main__.py * Commented out assertions for batch size in Fuyu model * Add warning message for existing output file * Fix batch size issue in OtterHD model * Squashed commit of the following: commit d0c8c61d9a23686d31c7e014f0c15d802e04ee61 Author: Li Bo Date: Wed Jan 31 16:00:22 2024 +0800 [Datasets] add hallubench (#34) * Add hallu bench * Fix hall_b gpt eval bugs --------- Co-authored-by: kcz358 <92624596+kcz358@users.noreply.github.com> commit f4fd4fd29b45436a96fe65395f0922612f598052 Author: Li Bo Date: Wed Jan 31 14:23:15 2024 +0800 [Datasets & Models] Fuyu, HalluBench (w/Kaichen, commit 96d95b3) (#33) * add fuyu * Merge commit 'c7ffa8dee96e228c6519154d5a00742b35caa3f2' * Squashed commit of the following: commit 96d95b3cb3540cd17bcab31f1a85ad0d04a12f1e Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Tue Jan 30 19:39:57 2024 +0800 Add hallu bench commit c7ffa8dee96e228c6519154d5a00742b35caa3f2 Author: Pu Fanyi Date: Tue Jan 30 14:52:51 2024 +0800 scienceqa for full set 
(#32) * Remove unused code and configuration file * Remove docvqa.yaml and update vizwizvqa.yaml * lint * Add dataset_kwargs to vizwizvqa.yaml * Add dataset_kwargs to vizwizvqa.yaml * textvqa (#27) * Update textvqa.yaml and utils.py * Fix YAML formatting in textvqa.yaml and remove unused files * remove useless matric * add textvqa val & test * Update progress bar description in evaluator.py * Update submission file names in VizWizVQA tasks * Update output path to include log samples suffix * Update submission file paths in OKVQA and VizWizVQA tasks * Refactor llava-in-the-wild.yaml and utils.py * Update metric for llava evaluation * Refactor logging message in Task class * Merge commit 'f80465fd0f30781c8c36b46c1d6d7bba751f9e33' * Fix formatting issues and add progress bar closing statements * Update task from "infovqa_val" to "infovqa_test" in infovqa_test.yaml * Update tqdm progress bar in OtterHD model * Squashed commit of the following: commit af6c7a2b8c2959495dc351e6f6eb2a442efe4e94 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 26da729c40008f72ce3f10c932874f120f290e26 Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit acbb1a1997c5159709e3b81c3f0292b2f9def109 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit b33ac32f0ff28777204eaaf27a963200024081df Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit f80465fd0f30781c8c36b46c1d6d7bba751f9e33 Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * 
black * add ai2d * black * update chartqa * black * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * Fix error handling in loading YAML config files * Squashed commit of the following: commit 2df0ce76ef836be1cb8ffbf3c854fe05563647b0 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 12:41:40 2024 +0800 Fix key bugs commit af6c7a2b8c2959495dc351e6f6eb2a442efe4e94 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 26da729c40008f72ce3f10c932874f120f290e26 Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit acbb1a1997c5159709e3b81c3f0292b2f9def109 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit b33ac32f0ff28777204eaaf27a963200024081df Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit f80465fd0f30781c8c36b46c1d6d7bba751f9e33 Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to support multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * black * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * List task #num sorted * Update prompt messages for image-related tasks * Delete unused task configuration files * Remove coco_train.yaml configuration file * Update task name in mmmu.yaml * Fix error message for missing tasks * Add wandb import and integration * Update generation kwargs for LMMS tasks * Update lmms_eval MME task configuration and utils * Update generation_kwargs in lmms_eval tasks * Update doc_to_text function in coco and okvqa tasks * Add COCO 2017 
version * Update task name in coco_test2017.yaml * Squashed commit of the following: commit 0390783595c41232352599ab78fbe5949615e982 Author: Zhang Peiyuan Date: Mon Jan 29 22:41:33 2024 +0800 Add/mmmu test (#30) * mmmu_test * black commit 407bc2500c162d8949fbaae3d11d522afd2c9f28 Author: Li Bo Date: Sun Jan 28 22:19:13 2024 +0800 [Dataset Check] dataset check and add wandb logging (#29) * Remove unused code and configuration file * Remove docvqa.yaml and update vizwizvqa.yaml * lint * Add dataset_kwargs to vizwizvqa.yaml * Add dataset_kwargs to vizwizvqa.yaml * textvqa (#27) * Update textvqa.yaml and utils.py * Fix YAML formatting in textvqa.yaml and remove unused files * remove useless matric * add textvqa val & test * Update progress bar description in evaluator.py * Update submission file names in VizWizVQA tasks * Update output path to include log samples suffix * Update submission file paths in OKVQA and VizWizVQA tasks * Refactor llava-in-the-wild.yaml and utils.py * Update metric for llava evaluation * Refactor logging message in Task class * Merge commit 'f80465fd0f30781c8c36b46c1d6d7bba751f9e33' * Fix formatting issues and add progress bar closing statements * Update task from "infovqa_val" to "infovqa_test" in infovqa_test.yaml * Update tqdm progress bar in OtterHD model * Squashed commit of the following: commit af6c7a2b8c2959495dc351e6f6eb2a442efe4e94 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 26da729c40008f72ce3f10c932874f120f290e26 Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit acbb1a1997c5159709e3b81c3f0292b2f9def109 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit b33ac32f0ff28777204eaaf27a963200024081df Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 
09:41:32 2024 +0800 Exclude train yaml file in the task list commit f80465fd0f30781c8c36b46c1d6d7bba751f9e33 Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * Fix error handling in loading YAML config files * Squashed commit of the following: commit 2df0ce76ef836be1cb8ffbf3c854fe05563647b0 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 12:41:40 2024 +0800 Fix key bugs commit af6c7a2b8c2959495dc351e6f6eb2a442efe4e94 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 26da729c40008f72ce3f10c932874f120f290e26 Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit acbb1a1997c5159709e3b81c3f0292b2f9def109 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit b33ac32f0ff28777204eaaf27a963200024081df Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit f80465fd0f30781c8c36b46c1d6d7bba751f9e33 Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * List task #num sorted * Update prompt messages for image-related tasks * Delete 
unused task configuration files * Remove coco_train.yaml configuration file * Update task name in mmmu.yaml * Fix error message for missing tasks * Add wandb import and integration --------- Co-authored-by: Fanyi Pu Co-authored-by: kcz358 <92624596+kcz358@users.noreply.github.com> * Remove scienceqa_img task configuration * eval scienceqa with no images --------- Co-authored-by: Bo Li Co-authored-by: kcz358 <92624596+kcz358@users.noreply.github.com> * Update hb_doc_to_text function to remove unnecessary line break * Add Fuyu model and update OtterHD model * Refactor model response handling and fix image processing bug * Refactor flatten method to support only getting the first element * Add support for specifying timezone in datetime string Update flatten method in OtterHD class Update get_datetime_str function in utils.py * Fix condition for checking wandb_args_dict in __main__.py * Commented out assertions for batch size in Fuyu model * Add warning message for existing output file commit c7ffa8dee96e228c6519154d5a00742b35caa3f2 Author: Pu Fanyi Date: Tue Jan 30 14:52:51 2024 +0800 scienceqa for full set (#32) * Remove unused code and configuration file * Remove docvqa.yaml and update vizwizvqa.yaml * lint * Add dataset_kwargs to vizwizvqa.yaml * Add dataset_kwargs to vizwizvqa.yaml * textvqa (#27) * Update textvqa.yaml and utils.py * Fix YAML formatting in textvqa.yaml and remove unused files * remove useless metric * add textvqa val & test * Update progress bar description in evaluator.py * Update submission file names in VizWizVQA tasks * Update output path to include log samples suffix * Update submission file paths in OKVQA and VizWizVQA tasks * Refactor llava-in-the-wild.yaml and utils.py * Update metric for llava evaluation * Refactor logging message in Task class * Merge commit 'f80465fd0f30781c8c36b46c1d6d7bba751f9e33' * Fix formatting issues and add progress bar closing statements * Update task from "infovqa_val" to "infovqa_test" in infovqa_test.yaml * 
Update tqdm progress bar in OtterHD model * Squashed commit of the following: commit af6c7a2b8c2959495dc351e6f6eb2a442efe4e94 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 26da729c40008f72ce3f10c932874f120f290e26 Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit acbb1a1997c5159709e3b81c3f0292b2f9def109 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit b33ac32f0ff28777204eaaf27a963200024081df Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit f80465fd0f30781c8c36b46c1d6d7bba751f9e33 Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * Fix error handling in loading YAML config files * Squashed commit of the following: commit 2df0ce76ef836be1cb8ffbf3c854fe05563647b0 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 12:41:40 2024 +0800 Fix key bugs commit af6c7a2b8c2959495dc351e6f6eb2a442efe4e94 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 26da729c40008f72ce3f10c932874f120f290e26 Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit acbb1a1997c5159709e3b81c3f0292b2f9def109 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable 
list all tasks num commit b33ac32f0ff28777204eaaf27a963200024081df Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit f80465fd0f30781c8c36b46c1d6d7bba751f9e33 Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * List task #num sorted * Update prompt messages for image-related tasks * Delete unused task configuration files * Remove coco_train.yaml configuration file * Update task name in mmmu.yaml * Fix error message for missing tasks * Add wandb import and integration * Update generation kwargs for LMMS tasks * Update lmms_eval MME task configuration and utils * Update generation_kwargs in lmms_eval tasks * Update doc_to_text function in coco and okvqa tasks * Add COCO 2017 version * Update task name in coco_test2017.yaml * Squashed commit of the following: commit 0390783595c41232352599ab78fbe5949615e982 Author: Zhang Peiyuan Date: Mon Jan 29 22:41:33 2024 +0800 Add/mmmu test (#30) * mmmu_test * black commit 407bc2500c162d8949fbaae3d11d522afd2c9f28 Author: Li Bo Date: Sun Jan 28 22:19:13 2024 +0800 [Dataset Check] dataset check and add wandb logging (#29) * Remove unused code and configuration file * Remove docvqa.yaml and update vizwizvqa.yaml * lint * Add dataset_kwargs to vizwizvqa.yaml * Add dataset_kwargs to vizwizvqa.yaml * textvqa (#27) * Update textvqa.yaml and utils.py * Fix YAML formatting in textvqa.yaml and remove unused files * remove useless matric * add textvqa val & test * Update progress bar description in evaluator.py * Update submission file names in VizWizVQA tasks * Update output path to include log samples 
suffix * Update submission file paths in OKVQA and VizWizVQA tasks * Refactor llava-in-the-wild.yaml and utils.py * Update metric for llava evaluation * Refactor logging message in Task class * Merge commit 'f80465fd0f30781c8c36b46c1d6d7bba751f9e33' * Fix formatting issues and add progress bar closing statements * Update task from "infovqa_val" to "infovqa_test" in infovqa_test.yaml * Update tqdm progress bar in OtterHD model * Squashed commit of the following: commit af6c7a2b8c2959495dc351e6f6eb2a442efe4e94 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 26da729c40008f72ce3f10c932874f120f290e26 Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit acbb1a1997c5159709e3b81c3f0292b2f9def109 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit b33ac32f0ff28777204eaaf27a963200024081df Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit f80465fd0f30781c8c36b46c1d6d7bba751f9e33 Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * Fix error handling in loading YAML config files * Squashed commit of the following: commit 2df0ce76ef836be1cb8ffbf3c854fe05563647b0 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 12:41:40 2024 +0800 Fix key bugs commit af6c7a2b8c2959495dc351e6f6eb2a442efe4e94 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 
09:46:19 2024 +0800 Black lint commit 26da729c40008f72ce3f10c932874f120f290e26 Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit acbb1a1997c5159709e3b81c3f0292b2f9def109 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit b33ac32f0ff28777204eaaf27a963200024081df Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit f80465fd0f30781c8c36b46c1d6d7bba751f9e33 Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * List task #num sorted * Update prompt messages for image-related tasks * Delete unused task configuration files * Remove coco_train.yaml configuration file * Update task name in mmmu.yaml * Fix error message for missing tasks * Add wandb import and integration --------- Co-authored-by: Fanyi Pu Co-authored-by: kcz358 <92624596+kcz358@users.noreply.github.com> * Remove scienceqa_img task configuration * eval scienceqa with no images --------- Co-authored-by: Bo Li Co-authored-by: kcz358 <92624596+kcz358@users.noreply.github.com> * Update API configuration and file paths * Refactor evaluate_by_chatgpt function in utils.py * Add hallusion_output_vd_model.json to .gitignore * Add timeout to API request * Refactor file path generation and remove unnecessary suffix in log samples output names * Refactor code and add output path handling * Update lmms-eval API and add new models and datasets * adapt qwen to sqa, gqa, ai2d, docvqa (#36) * adapt qwen 
to sqa, gqa, ai2d, docvqa * black * adapt qwen to sqa, gqa, ai2d, docvqa (#36) * adapt qwen to sqa, gqa, ai2d, docvqa * black * [Fix, Feat] Solve llava wild issue when task num can't divide, add fuyu ppl (#37) * Add fuyu ppl, fix llava-bench gather issue * Add gpt4V * Black lint * [Fix, Feat] Solve llava wild issue when task num can't divide, add fuyu ppl (#37) * Add fuyu ppl, fix llava-bench gather issue * Add gpt4V * Black lint * [Logging] Wandb Logging Support (#38) * add fuyu * Merge commit '0f183a394426d3bf88884b4e2258ab53406bc705' * Squashed commit of the following: commit b81ed2ce4d0e226df7a41bddd82fe1f9d46a27fc Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Tue Jan 30 19:39:57 2024 +0800 Add hallu bench commit 0f183a394426d3bf88884b4e2258ab53406bc705 Author: Pu Fanyi Date: Tue Jan 30 14:52:51 2024 +0800 scienceqa for full set (#32) * Remove unused code and configuration file * Remove docvqa.yaml and update vizwizvqa.yaml * lint * Add dataset_kwargs to vizwizvqa.yaml * Add dataset_kwargs to vizwizvqa.yaml * textvqa (#27) * Update textvqa.yaml and utils.py * Fix YAML formatting in textvqa.yaml and remove unused files * remove useless matric * add textvqa val & test * Update progress bar description in evaluator.py * Update submission file names in VizWizVQA tasks * Update output path to include log samples suffix * Update submission file paths in OKVQA and VizWizVQA tasks * Refactor llava-in-the-wild.yaml and utils.py * Update metric for llava evaluation * Refactor logging message in Task class * Merge commit '0a403e6f5e17c70a50983c83a132edf0fdcd98de' * Fix formatting issues and add progress bar closing statements * Update task from "infovqa_val" to "infovqa_test" in infovqa_test.yaml * Update tqdm progress bar in OtterHD model * Squashed commit of the following: commit eae210c3700a59b7d5cc9de46fcb855f443096aa Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 
18e4a19e82357352ab25df77b5ae4f1b011d61ae Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit e899be48f55f95172fdf96bd2a98d3b91ff2aaed Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit a999fc6889c6986c28ec5d95460a4ab5233e5d4f Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit 0a403e6f5e17c70a50983c83a132edf0fdcd98de Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * Fix error handling in loading YAML config files * Squashed commit of the following: commit fdb0c6785b0c5d6979d10e7ddf75ce9055038db8 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 12:41:40 2024 +0800 Fix key bugs commit eae210c3700a59b7d5cc9de46fcb855f443096aa Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 18e4a19e82357352ab25df77b5ae4f1b011d61ae Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit e899be48f55f95172fdf96bd2a98d3b91ff2aaed Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit a999fc6889c6986c28ec5d95460a4ab5233e5d4f Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit 
0a403e6f5e17c70a50983c83a132edf0fdcd98de Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to support multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * black * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * List task #num sorted * Update prompt messages for image-related tasks * Delete unused task configuration files * Remove coco_train.yaml configuration file * Update task name in mmmu.yaml * Fix error message for missing tasks * Add wandb import and integration * Update generation kwargs for LMMS tasks * Update lmms_eval MME task configuration and utils * Update generation_kwargs in lmms_eval tasks * Update doc_to_text function in coco and okvqa tasks * Add COCO 2017 version * Update task name in coco_test2017.yaml * Squashed commit of the following: commit 1e2ae936c90a15d684926e43a38aac86935f38c5 Author: Zhang Peiyuan Date: Mon Jan 29 22:41:33 2024 +0800 Add/mmmu test (#30) * mmmu_test * black commit 10bbaf01c0a4164b6f1d2628367befccf8f39c24 Author: Li Bo Date: Sun Jan 28 22:19:13 2024 +0800 [Dataset Check] dataset check and add wandb logging (#29) * Remove unused code and configuration file * Remove docvqa.yaml and update vizwizvqa.yaml * lint * Add dataset_kwargs to vizwizvqa.yaml * Add dataset_kwargs to vizwizvqa.yaml * textvqa (#27) * Update textvqa.yaml and utils.py * Fix YAML formatting in textvqa.yaml and remove unused files * remove useless metric * add textvqa val & test * Update progress bar description in evaluator.py * Update submission file names in VizWizVQA tasks * Update output path to include log samples suffix * Update submission file paths in OKVQA and VizWizVQA tasks * Refactor llava-in-the-wild.yaml and utils.py * Update metric for llava evaluation * Refactor logging message in Task class * Merge commit 
'0a403e6f5e17c70a50983c83a132edf0fdcd98de' * Fix formatting issues and add progress bar closing statements * Update task from "infovqa_val" to "infovqa_test" in infovqa_test.yaml * Update tqdm progress bar in OtterHD model * Squashed commit of the following: commit eae210c3700a59b7d5cc9de46fcb855f443096aa Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 18e4a19e82357352ab25df77b5ae4f1b011d61ae Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit e899be48f55f95172fdf96bd2a98d3b91ff2aaed Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit a999fc6889c6986c28ec5d95460a4ab5233e5d4f Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit 0a403e6f5e17c70a50983c83a132edf0fdcd98de Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * Fix error handling in loading YAML config files * Squashed commit of the following: commit fdb0c6785b0c5d6979d10e7ddf75ce9055038db8 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 12:41:40 2024 +0800 Fix key bugs commit eae210c3700a59b7d5cc9de46fcb855f443096aa Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 18e4a19e82357352ab25df77b5ae4f1b011d61ae Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 
'main' into kc/list_tasks_num commit e899be48f55f95172fdf96bd2a98d3b91ff2aaed Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit a999fc6889c6986c28ec5d95460a4ab5233e5d4f Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit 0a403e6f5e17c70a50983c83a132edf0fdcd98de Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * List task #num sorted * Update prompt messages for image-related tasks * Delete unused task configuration files * Remove coco_train.yaml configuration file * Update task name in mmmu.yaml * Fix error message for missing tasks * Add wandb import and integration --------- Co-authored-by: Fanyi Pu Co-authored-by: kcz358 <92624596+kcz358@users.noreply.github.com> * Remove scienceqa_img task configuration * eval scienceqa with no images --------- Co-authored-by: Bo Li Co-authored-by: kcz358 <92624596+kcz358@users.noreply.github.com> * Update hb_doc_to_text function to remove unnecessary line break * Add Fuyu model and update OtterHD model * Refactor model response handling and fix image processing bug * Refactor flatten method to support only getting the first element * Add support for specifying timezone in datetime string Update flatten method in OtterHD class Update get_datetime_str function in utils.py * Fix condition for checking wandb_args_dict in __main__.py * Commented out assertions for batch size in Fuyu model * Add warning message for existing output file * Fix batch size issue in OtterHD model * Squashed commit of the following: commit 
4b604e75cfde49df52e4abd90be4876ed9a1b08f Author: Li Bo Date: Wed Jan 31 16:00:22 2024 +0800 [Datasets] add hallubench (#34) * Add hallu bench * Fix hall_b gpt eval bugs --------- Co-authored-by: kcz358 <92624596+kcz358@users.noreply.github.com> commit 799a6bcb9033656115755c5169f8c342eb927d54 Author: Li Bo Date: Wed Jan 31 14:23:15 2024 +0800 [Datasets & Models] Fuyu, HalluBench (w/Kaichen, commit 96d95b3) (#33) * add fuyu * Merge commit '0f183a394426d3bf88884b4e2258ab53406bc705' * Squashed commit of the following: commit b81ed2ce4d0e226df7a41bddd82fe1f9d46a27fc Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Tue Jan 30 19:39:57 2024 +0800 Add hallu bench commit 0f183a394426d3bf88884b4e2258ab53406bc705 Author: Pu Fanyi Date: Tue Jan 30 14:52:51 2024 +0800 scienceqa for full set (#32) * Remove unused code and configuration file * Remove docvqa.yaml and update vizwizvqa.yaml * lint * Add dataset_kwargs to vizwizvqa.yaml * Add dataset_kwargs to vizwizvqa.yaml * textvqa (#27) * Update textvqa.yaml and utils.py * Fix YAML formatting in textvqa.yaml and remove unused files * remove useless matric * add textvqa val & test * Update progress bar description in evaluator.py * Update submission file names in VizWizVQA tasks * Update output path to include log samples suffix * Update submission file paths in OKVQA and VizWizVQA tasks * Refactor llava-in-the-wild.yaml and utils.py * Update metric for llava evaluation * Refactor logging message in Task class * Merge commit '0a403e6f5e17c70a50983c83a132edf0fdcd98de' * Fix formatting issues and add progress bar closing statements * Update task from "infovqa_val" to "infovqa_test" in infovqa_test.yaml * Update tqdm progress bar in OtterHD model * Squashed commit of the following: commit eae210c3700a59b7d5cc9de46fcb855f443096aa Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 18e4a19e82357352ab25df77b5ae4f1b011d61ae Merge: ab898e4 ad8d9da Author: kcz358 
<92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit e899be48f55f95172fdf96bd2a98d3b91ff2aaed Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit a999fc6889c6986c28ec5d95460a4ab5233e5d4f Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit 0a403e6f5e17c70a50983c83a132edf0fdcd98de Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * Fix error handling in loading YAML config files * Squashed commit of the following: commit fdb0c6785b0c5d6979d10e7ddf75ce9055038db8 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 12:41:40 2024 +0800 Fix key bugs commit eae210c3700a59b7d5cc9de46fcb855f443096aa Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 18e4a19e82357352ab25df77b5ae4f1b011d61ae Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit e899be48f55f95172fdf96bd2a98d3b91ff2aaed Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit a999fc6889c6986c28ec5d95460a4ab5233e5d4f Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit 0a403e6f5e17c70a50983c83a132edf0fdcd98de Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add 
InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * List task #num sorted * Update prompt messages for image-related tasks * Delete unused task configuration files * Remove coco_train.yaml configuration file * Update task name in mmmu.yaml * Fix error message for missing tasks * Add wandb import and integration * Update generation kwargs for LMMS tasks * Update lmms_eval MME task configuration and utils * Update generation_kwargs in lmms_eval tasks * Update doc_to_text function in coco and okvqa tasks * Add COCO 2017 version * Update task name in coco_test2017.yaml * Squashed commit of the following: commit 1e2ae936c90a15d684926e43a38aac86935f38c5 Author: Zhang Peiyuan Date: Mon Jan 29 22:41:33 2024 +0800 Add/mmmu test (#30) * mmmu_test * black commit 10bbaf01c0a4164b6f1d2628367befccf8f39c24 Author: Li Bo Date: Sun Jan 28 22:19:13 2024 +0800 [Dataset Check] dataset check and add wandb logging (#29) * Remove unused code and configuration file * Remove docvqa.yaml and update vizwizvqa.yaml * lint * Add dataset_kwargs to vizwizvqa.yaml * Add dataset_kwargs to vizwizvqa.yaml * textvqa (#27) * Update textvqa.yaml and utils.py * Fix YAML formatting in textvqa.yaml and remove unused files * remove useless matric * add textvqa val & test * Update progress bar description in evaluator.py * Update submission file names in VizWizVQA tasks * Update output path to include log samples suffix * Update submission file paths in OKVQA and VizWizVQA tasks * Refactor llava-in-the-wild.yaml and utils.py * Update metric for llava evaluation * Refactor logging message in Task class * Merge commit '0a403e6f5e17c70a50983c83a132edf0fdcd98de' * Fix formatting issues and add progress bar closing statements * 
Update task from "infovqa_val" to "infovqa_test" in infovqa_test.yaml * Update tqdm progress bar in OtterHD model * Squashed commit of the following: commit eae210c3700a59b7d5cc9de46fcb855f443096aa Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 18e4a19e82357352ab25df77b5ae4f1b011d61ae Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit e899be48f55f95172fdf96bd2a98d3b91ff2aaed Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit a999fc6889c6986c28ec5d95460a4ab5233e5d4f Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit 0a403e6f5e17c70a50983c83a132edf0fdcd98de Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * Fix error handling in loading YAML config files * Squashed commit of the following: commit fdb0c6785b0c5d6979d10e7ddf75ce9055038db8 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 12:41:40 2024 +0800 Fix key bugs commit eae210c3700a59b7d5cc9de46fcb855f443096aa Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 18e4a19e82357352ab25df77b5ae4f1b011d61ae Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit e899be48f55f95172fdf96bd2a98d3b91ff2aaed Author: kcz358 
<92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit a999fc6889c6986c28ec5d95460a4ab5233e5d4f Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit 0a403e6f5e17c70a50983c83a132edf0fdcd98de Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * List task #num sorted * Update prompt messages for image-related tasks * Delete unused task configuration files * Remove coco_train.yaml configuration file * Update task name in mmmu.yaml * Fix error message for missing tasks * Add wandb import and integration --------- Co-authored-by: Fanyi Pu Co-authored-by: kcz358 <92624596+kcz358@users.noreply.github.com> * Remove scienceqa_img task configuration * eval scienceqa with no images --------- Co-authored-by: Bo Li Co-authored-by: kcz358 <92624596+kcz358@users.noreply.github.com> * Update hb_doc_to_text function to remove unnecessary line break * Add Fuyu model and update OtterHD model * Refactor model response handling and fix image processing bug * Refactor flatten method to support only getting the first element * Add support for specifying timezone in datetime string Update flatten method in OtterHD class Update get_datetime_str function in utils.py * Fix condition for checking wandb_args_dict in __main__.py * Commented out assertions for batch size in Fuyu model * Add warning message for existing output file commit 0f183a394426d3bf88884b4e2258ab53406bc705 Author: Pu Fanyi Date: Tue Jan 30 14:52:51 2024 +0800 scienceqa for full set (#32) * Remove unused code and configuration file * 
Remove docvqa.yaml and update vizwizvqa.yaml * lint * Add dataset_kwargs to vizwizvqa.yaml * Add dataset_kwargs to vizwizvqa.yaml * textvqa (#27) * Update textvqa.yaml and utils.py * Fix YAML formatting in textvqa.yaml and remove unused files * remove useless matric * add textvqa val & test * Update progress bar description in evaluator.py * Update submission file names in VizWizVQA tasks * Update output path to include log samples suffix * Update submission file paths in OKVQA and VizWizVQA tasks * Refactor llava-in-the-wild.yaml and utils.py * Update metric for llava evaluation * Refactor logging message in Task class * Merge commit '0a403e6f5e17c70a50983c83a132edf0fdcd98de' * Fix formatting issues and add progress bar closing statements * Update task from "infovqa_val" to "infovqa_test" in infovqa_test.yaml * Update tqdm progress bar in OtterHD model * Squashed commit of the following: commit eae210c3700a59b7d5cc9de46fcb855f443096aa Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 18e4a19e82357352ab25df77b5ae4f1b011d61ae Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit e899be48f55f95172fdf96bd2a98d3b91ff2aaed Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit a999fc6889c6986c28ec5d95460a4ab5233e5d4f Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit 0a403e6f5e17c70a50983c83a132edf0fdcd98de Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * 
update ai2d dataset * black * add qwenvl * add infovqa and docvqa * Fix error handling in loading YAML config files * Squashed commit of the following: commit fdb0c6785b0c5d6979d10e7ddf75ce9055038db8 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 12:41:40 2024 +0800 Fix key bugs commit eae210c3700a59b7d5cc9de46fcb855f443096aa Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 18e4a19e82357352ab25df77b5ae4f1b011d61ae Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit e899be48f55f95172fdf96bd2a98d3b91ff2aaed Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit a999fc6889c6986c28ec5d95460a4ab5233e5d4f Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit 0a403e6f5e17c70a50983c83a132edf0fdcd98de Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * List task #num sorted * Update prompt messages for image-related tasks * Delete unused task configuration files * Remove coco_train.yaml configuration file * Update task name in mmmu.yaml * Fix error message for missing tasks * Add wandb import and integration * Update generation kwargs for LMMS tasks * Update lmms_eval MME task configuration and utils * Update generation_kwargs in lmms_eval tasks * Update doc_to_text function in coco and okvqa tasks * Add COCO 2017 version * Update task name in coco_test2017.yaml * 
Squashed commit of the following: commit 1e2ae936c90a15d684926e43a38aac86935f38c5 Author: Zhang Peiyuan Date: Mon Jan 29 22:41:33 2024 +0800 Add/mmmu test (#30) * mmmu_test * black commit 10bbaf01c0a4164b6f1d2628367befccf8f39c24 Author: Li Bo Date: Sun Jan 28 22:19:13 2024 +0800 [Dataset Check] dataset check and add wandb logging (#29) * Remove unused code and configuration file * Remove docvqa.yaml and update vizwizvqa.yaml * lint * Add dataset_kwargs to vizwizvqa.yaml * Add dataset_kwargs to vizwizvqa.yaml * textvqa (#27) * Update textvqa.yaml and utils.py * Fix YAML formatting in textvqa.yaml and remove unused files * remove useless matric * add textvqa val & test * Update progress bar description in evaluator.py * Update submission file names in VizWizVQA tasks * Update output path to include log samples suffix * Update submission file paths in OKVQA and VizWizVQA tasks * Refactor llava-in-the-wild.yaml and utils.py * Update metric for llava evaluation * Refactor logging message in Task class * Merge commit '0a403e6f5e17c70a50983c83a132edf0fdcd98de' * Fix formatting issues and add progress bar closing statements * Update task from "infovqa_val" to "infovqa_test" in infovqa_test.yaml * Update tqdm progress bar in OtterHD model * Squashed commit of the following: commit eae210c3700a59b7d5cc9de46fcb855f443096aa Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 18e4a19e82357352ab25df77b5ae4f1b011d61ae Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit e899be48f55f95172fdf96bd2a98d3b91ff2aaed Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit a999fc6889c6986c28ec5d95460a4ab5233e5d4f Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task 
list commit 0a403e6f5e17c70a50983c83a132edf0fdcd98de Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * Fix error handling in loading YAML config files * Squashed commit of the following: commit fdb0c6785b0c5d6979d10e7ddf75ce9055038db8 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 12:41:40 2024 +0800 Fix key bugs commit eae210c3700a59b7d5cc9de46fcb855f443096aa Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 18e4a19e82357352ab25df77b5ae4f1b011d61ae Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit e899be48f55f95172fdf96bd2a98d3b91ff2aaed Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit a999fc6889c6986c28ec5d95460a4ab5233e5d4f Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit 0a403e6f5e17c70a50983c83a132edf0fdcd98de Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * List task #num sorted * Update prompt messages for image-related tasks * Delete unused task configuration files * Remove coco_train.yaml 
configuration file * Update task name in mmmu.yaml * Fix error message for missing tasks * Add wandb import and integration --------- Co-authored-by: Fanyi Pu Co-authored-by: kcz358 <92624596+kcz358@users.noreply.github.com> * Remove scienceqa_img task configuration * eval scienceqa with no images --------- Co-authored-by: Bo Li Co-authored-by: kcz358 <92624596+kcz358@users.noreply.github.com> * Update API configuration and file paths * Refactor evaluate_by_chatgpt function in utils.py * Add hallusion_output_vd_model.json to .gitignore * Add timeout to API request * Refactor file path generation and remove unnecessary suffix in log samples output names * Refactor code and add output path handling * Update lmms-eval API and add new models and datasets * Refactor directory structure for RefCOCO+ and RefCOCOg datasets * Fix error logging in get_eval and parse_score functions * Update .gitignore and mme.yaml * Squashed commit of the following: commit 3d44977c9254d1ee5254b2ca24c8cc54984e84b0 Author: jzhang38 Date: Fri Feb 2 13:43:28 2024 +0800 black commit a38ffeb692fbeb9deebe20f65b0f3e041823e695 Author: jzhang38 Date: Fri Feb 2 13:42:03 2024 +0800 adapt qwen to sqa, gqa, ai2d, docvqa commit e24607fd5725aabb7f6db5fa457b5e6a5123c199 Author: Li Bo Date: Thu Feb 1 16:20:27 2024 +0800 [Dataset] fix hallusion benchmark, add saving logic inside aggregate function (#35) * add fuyu * Merge commit '0f183a394426d3bf88884b4e2258ab53406bc705' * Squashed commit of the following: commit b81ed2ce4d0e226df7a41bddd82fe1f9d46a27fc Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Tue Jan 30 19:39:57 2024 +0800 Add hallu bench commit 0f183a394426d3bf88884b4e2258ab53406bc705 Author: Pu Fanyi Date: Tue Jan 30 14:52:51 2024 +0800 scienceqa for full set (#32) * Remove unused code and configuration file * Remove docvqa.yaml and update vizwizvqa.yaml * lint * Add dataset_kwargs to vizwizvqa.yaml * Add dataset_kwargs to vizwizvqa.yaml * textvqa (#27) * Update textvqa.yaml and 
utils.py * Fix YAML formatting in textvqa.yaml and remove unused files * remove useless matric * add textvqa val & test * Update progress bar description in evaluator.py * Update submission file names in VizWizVQA tasks * Update output path to include log samples suffix * Update submission file paths in OKVQA and VizWizVQA tasks * Refactor llava-in-the-wild.yaml and utils.py * Update metric for llava evaluation * Refactor logging message in Task class * Merge commit '0a403e6f5e17c70a50983c83a132edf0fdcd98de' * Fix formatting issues and add progress bar closing statements * Update task from "infovqa_val" to "infovqa_test" in infovqa_test.yaml * Update tqdm progress bar in OtterHD model * Squashed commit of the following: commit eae210c3700a59b7d5cc9de46fcb855f443096aa Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 18e4a19e82357352ab25df77b5ae4f1b011d61ae Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit e899be48f55f95172fdf96bd2a98d3b91ff2aaed Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit a999fc6889c6986c28ec5d95460a4ab5233e5d4f Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit 0a403e6f5e17c70a50983c83a132edf0fdcd98de Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * Fix error handling in loading YAML config files * Squashed commit of the following: commit 
fdb0c6785b0c5d6979d10e7ddf75ce9055038db8 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 12:41:40 2024 +0800 Fix key bugs commit eae210c3700a59b7d5cc9de46fcb855f443096aa Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 18e4a19e82357352ab25df77b5ae4f1b011d61ae Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit e899be48f55f95172fdf96bd2a98d3b91ff2aaed Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit a999fc6889c6986c28ec5d95460a4ab5233e5d4f Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit 0a403e6f5e17c70a50983c83a132edf0fdcd98de Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * List task #num sorted * Update prompt messages for image-related tasks * Delete unused task configuration files * Remove coco_train.yaml configuration file * Update task name in mmmu.yaml * Fix error message for missing tasks * Add wandb import and integration * Update generation kwargs for LMMS tasks * Update lmms_eval MME task configuration and utils * Update generation_kwargs in lmms_eval tasks * Update doc_to_text function in coco and okvqa tasks * Add COCO 2017 version * Update task name in coco_test2017.yaml * Squashed commit of the following: commit 1e2ae936c90a15d684926e43a38aac86935f38c5 Author: Zhang Peiyuan Date: Mon Jan 29 22:41:33 2024 +0800 Add/mmmu test (#30) * 
mmmu_test * black commit 10bbaf01c0a4164b6f1d2628367befccf8f39c24 Author: Li Bo Date: Sun Jan 28 22:19:13 2024 +0800 [Dataset Check] dataset check and add wandb logging (#29) * Remove unused code and configuration file * Remove docvqa.yaml and update vizwizvqa.yaml * lint * Add dataset_kwargs to vizwizvqa.yaml * Add dataset_kwargs to vizwizvqa.yaml * textvqa (#27) * Update textvqa.yaml and utils.py * Fix YAML formatting in textvqa.yaml and remove unused files * remove useless matric * add textvqa val & test * Update progress bar description in evaluator.py * Update submission file names in VizWizVQA tasks * Update output path to include log samples suffix * Update submission file paths in OKVQA and VizWizVQA tasks * Refactor llava-in-the-wild.yaml and utils.py * Update metric for llava evaluation * Refactor logging message in Task class * Merge commit '0a403e6f5e17c70a50983c83a132edf0fdcd98de' * Fix formatting issues and add progress bar closing statements * Update task from "infovqa_val" to "infovqa_test" in infovqa_test.yaml * Update tqdm progress bar in OtterHD model * Squashed commit of the following: commit eae210c3700a59b7d5cc9de46fcb855f443096aa Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 18e4a19e82357352ab25df77b5ae4f1b011d61ae Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit e899be48f55f95172fdf96bd2a98d3b91ff2aaed Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit a999fc6889c6986c28ec5d95460a4ab5233e5d4f Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit 0a403e6f5e17c70a50983c83a132edf0fdcd98de Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * 
black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * Fix error handling in loading YAML config files * Squashed commit of the following: commit fdb0c6785b0c5d6979d10e7ddf75ce9055038db8 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 12:41:40 2024 +0800 Fix key bugs commit eae210c3700a59b7d5cc9de46fcb855f443096aa Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 18e4a19e82357352ab25df77b5ae4f1b011d61ae Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit e899be48f55f95172fdf96bd2a98d3b91ff2aaed Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit a999fc6889c6986c28ec5d95460a4ab5233e5d4f Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit 0a403e6f5e17c70a50983c83a132edf0fdcd98de Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * List task #num sorted * Update prompt messages for image-related tasks * Delete unused task configuration files * Remove coco_train.yaml configuration file * Update task name in mmmu.yaml * Fix error message for missing tasks * Add wandb import and integration --------- Co-authored-by: Fanyi Pu 
Co-authored-by: kcz358 <92624596+kcz358@users.noreply.github.com> * Remove scienceqa_img task configuration * eval scienceqa with no images --------- Co-authored-by: Bo Li Co-authored-by: kcz358 <92624596+kcz358@users.noreply.github.com> * Update hb_doc_to_text function to remove unnecessary line break * Add Fuyu model and update OtterHD model * Refactor model response handling and fix image processing bug * Refactor flatten method to support only getting the first element * Add support for specifying timezone in datetime string Update flatten method in OtterHD class Update get_datetime_str function in utils.py * Fix condition for checking wandb_args_dict in __main__.py * Commented out assertions for batch size in Fuyu model * Add warning message for existing output file * Fix batch size issue in OtterHD model * Squashed commit of the following: commit 4b604e75cfde49df52e4abd90be4876ed9a1b08f Author: Li Bo Date: Wed Jan 31 16:00:22 2024 +0800 [Datasets] add hallubench (#34) * Add hallu bench * Fix hall_b gpt eval bugs --------- Co-authored-by: kcz358 <92624596+kcz358@users.noreply.github.com> commit 799a6bcb9033656115755c5169f8c342eb927d54 Author: Li Bo Date: Wed Jan 31 14:23:15 2024 +0800 [Datasets & Models] Fuyu, HalluBench (w/Kaichen, commit 96d95b3) (#33) * add fuyu * Merge commit '0f183a394426d3bf88884b4e2258ab53406bc705' * Squashed commit of the following: commit b81ed2ce4d0e226df7a41bddd82fe1f9d46a27fc Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Tue Jan 30 19:39:57 2024 +0800 Add hallu bench commit 0f183a394426d3bf88884b4e2258ab53406bc705 Author: Pu Fanyi Date: Tue Jan 30 14:52:51 2024 +0800 scienceqa for full set (#32) * Remove unused code and configuration file * Remove docvqa.yaml and update vizwizvqa.yaml * lint * Add dataset_kwargs to vizwizvqa.yaml * Add dataset_kwargs to vizwizvqa.yaml * textvqa (#27) * Update textvqa.yaml and utils.py * Fix YAML formatting in textvqa.yaml and remove unused files * remove useless matric * add 
textvqa val & test * Update progress bar description in evaluator.py * Update submission file names in VizWizVQA tasks * Update output path to include log samples suffix * Update submission file paths in OKVQA and VizWizVQA tasks * Refactor llava-in-the-wild.yaml and utils.py * Update metric for llava evaluation * Refactor logging message in Task class * Merge commit '0a403e6f5e17c70a50983c83a132edf0fdcd98de' * Fix formatting issues and add progress bar closing statements * Update task from "infovqa_val" to "infovqa_test" in infovqa_test.yaml * Update tqdm progress bar in OtterHD model * Squashed commit of the following: commit eae210c3700a59b7d5cc9de46fcb855f443096aa Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 18e4a19e82357352ab25df77b5ae4f1b011d61ae Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit e899be48f55f95172fdf96bd2a98d3b91ff2aaed Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit a999fc6889c6986c28ec5d95460a4ab5233e5d4f Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit 0a403e6f5e17c70a50983c83a132edf0fdcd98de Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * Fix error handling in loading YAML config files * Squashed commit of the following: commit fdb0c6785b0c5d6979d10e7ddf75ce9055038db8 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 
28 12:41:40 2024 +0800 Fix key bugs commit eae210c3700a59b7d5cc9de46fcb855f443096aa Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 18e4a19e82357352ab25df77b5ae4f1b011d61ae Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit e899be48f55f95172fdf96bd2a98d3b91ff2aaed Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit a999fc6889c6986c28ec5d95460a4ab5233e5d4f Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit 0a403e6f5e17c70a50983c83a132edf0fdcd98de Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * List task #num sorted * Update prompt messages for image-related tasks * Delete unused task configuration files * Remove coco_train.yaml configuration file * Update task name in mmmu.yaml * Fix error message for missing tasks * Add wandb import and integration * Update generation kwargs for LMMS tasks * Update lmms_eval MME task configuration and utils * Update generation_kwargs in lmms_eval tasks * Update doc_to_text function in coco and okvqa tasks * Add COCO 2017 version * Update task name in coco_test2017.yaml * Squashed commit of the following: commit 1e2ae936c90a15d684926e43a38aac86935f38c5 Author: Zhang Peiyuan Date: Mon Jan 29 22:41:33 2024 +0800 Add/mmmu test (#30) * mmmu_test * black commit 10bbaf01c0a4164b6f1d2628367befccf8f39c24 Author: Li Bo Date: Sun Jan 28 22:19:13 2024 
+0800 [Dataset Check] dataset check and add wandb logging (#29) * Remove unused code and configuration file * Remove docvqa.yaml and update vizwizvqa.yaml * lint * Add dataset_kwargs to vizwizvqa.yaml * Add dataset_kwargs to vizwizvqa.yaml * textvqa (#27) * Update textvqa.yaml and utils.py * Fix YAML formatting in textvqa.yaml and remove unused files * remove useless matric * add textvqa val & test * Update progress bar description in evaluator.py * Update submission file names in VizWizVQA tasks * Update output path to include log samples suffix * Update submission file paths in OKVQA and VizWizVQA tasks * Refactor llava-in-the-wild.yaml and utils.py * Update metric for llava evaluation * Refactor logging message in Task class * Merge commit '0a403e6f5e17c70a50983c83a132edf0fdcd98de' * Fix formatting issues and add progress bar closing statements * Update task from "infovqa_val" to "infovqa_test" in infovqa_test.yaml * Update tqdm progress bar in OtterHD model * Squashed commit of the following: commit eae210c3700a59b7d5cc9de46fcb855f443096aa Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 18e4a19e82357352ab25df77b5ae4f1b011d61ae Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit e899be48f55f95172fdf96bd2a98d3b91ff2aaed Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit a999fc6889c6986c28ec5d95460a4ab5233e5d4f Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit 0a403e6f5e17c70a50983c83a132edf0fdcd98de Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print 
table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * Fix error handling in loading YAML config files * Squashed commit of the following: commit fdb0c6785b0c5d6979d10e7ddf75ce9055038db8 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 12:41:40 2024 +0800 Fix key bugs commit eae210c3700a59b7d5cc9de46fcb855f443096aa Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 18e4a19e82357352ab25df77b5ae4f1b011d61ae Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit e899be48f55f95172fdf96bd2a98d3b91ff2aaed Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit a999fc6889c6986c28ec5d95460a4ab5233e5d4f Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit 0a403e6f5e17c70a50983c83a132edf0fdcd98de Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * List task #num sorted * Update prompt messages for image-related tasks * Delete unused task configuration files * Remove coco_train.yaml configuration file * Update task name in mmmu.yaml * Fix error message for missing tasks * Add wandb import and integration --------- … * [Logging] Wandb Logging Support (#38) * add fuyu * Merge commit 'c7ffa8dee96e228c6519154d5a00742b35caa3f2' * Squashed commit of the 
following: commit 96d95b3cb3540cd17bcab31f1a85ad0d04a12f1e Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Tue Jan 30 19:39:57 2024 +0800 Add hallu bench commit c7ffa8dee96e228c6519154d5a00742b35caa3f2 Author: Pu Fanyi Date: Tue Jan 30 14:52:51 2024 +0800 scienceqa for full set (#32) * Remove unused code and configuration file * Remove docvqa.yaml and update vizwizvqa.yaml * lint * Add dataset_kwargs to vizwizvqa.yaml * Add dataset_kwargs to vizwizvqa.yaml * textvqa (#27) * Update textvqa.yaml and utils.py * Fix YAML formatting in textvqa.yaml and remove unused files * remove useless matric * add textvqa val & test * Update progress bar description in evaluator.py * Update submission file names in VizWizVQA tasks * Update output path to include log samples suffix * Update submission file paths in OKVQA and VizWizVQA tasks * Refactor llava-in-the-wild.yaml and utils.py * Update metric for llava evaluation * Refactor logging message in Task class * Merge commit 'f80465fd0f30781c8c36b46c1d6d7bba751f9e33' * Fix formatting issues and add progress bar closing statements * Update task from "infovqa_val" to "infovqa_test" in infovqa_test.yaml * Update tqdm progress bar in OtterHD model * Squashed commit of the following: commit af6c7a2b8c2959495dc351e6f6eb2a442efe4e94 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 26da729c40008f72ce3f10c932874f120f290e26 Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit acbb1a1997c5159709e3b81c3f0292b2f9def109 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit b33ac32f0ff28777204eaaf27a963200024081df Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit 
f80465fd0f30781c8c36b46c1d6d7bba751f9e33 Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * Fix error handling in loading YAML config files * Squashed commit of the following: commit 2df0ce76ef836be1cb8ffbf3c854fe05563647b0 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 12:41:40 2024 +0800 Fix key bugs commit af6c7a2b8c2959495dc351e6f6eb2a442efe4e94 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 26da729c40008f72ce3f10c932874f120f290e26 Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit acbb1a1997c5159709e3b81c3f0292b2f9def109 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit b33ac32f0ff28777204eaaf27a963200024081df Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit f80465fd0f30781c8c36b46c1d6d7bba751f9e33 Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * List task #num sorted * Update prompt messages for image-related tasks * Delete unused task configuration files * Remove coco_train.yaml configuration 
file * Update task name in mmmu.yaml * Fix error message for missing tasks * Add wandb import and integration * Update generation kwargs for LMMS tasks * Update lmms_eval MME task configuration and utils * Update generation_kwargs in lmms_eval tasks * Update doc_to_text function in coco and okvqa tasks * Add COCO 2017 version * Update task name in coco_test2017.yaml * Squashed commit of the following: commit 0390783595c41232352599ab78fbe5949615e982 Author: Zhang Peiyuan Date: Mon Jan 29 22:41:33 2024 +0800 Add/mmmu test (#30) * mmmu_test * black commit 407bc2500c162d8949fbaae3d11d522afd2c9f28 Author: Li Bo Date: Sun Jan 28 22:19:13 2024 +0800 [Dataset Check] dataset check and add wandb logging (#29) * Remove unused code and configuration file * Remove docvqa.yaml and update vizwizvqa.yaml * lint * Add dataset_kwargs to vizwizvqa.yaml * Add dataset_kwargs to vizwizvqa.yaml * textvqa (#27) * Update textvqa.yaml and utils.py * Fix YAML formatting in textvqa.yaml and remove unused files * remove useless matric * add textvqa val & test * Update progress bar description in evaluator.py * Update submission file names in VizWizVQA tasks * Update output path to include log samples suffix * Update submission file paths in OKVQA and VizWizVQA tasks * Refactor llava-in-the-wild.yaml and utils.py * Update metric for llava evaluation * Refactor logging message in Task class * Merge commit 'f80465fd0f30781c8c36b46c1d6d7bba751f9e33' * Fix formatting issues and add progress bar closing statements * Update task from "infovqa_val" to "infovqa_test" in infovqa_test.yaml * Update tqdm progress bar in OtterHD model * Squashed commit of the following: commit af6c7a2b8c2959495dc351e6f6eb2a442efe4e94 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 26da729c40008f72ce3f10c932874f120f290e26 Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' 
into kc/list_tasks_num commit acbb1a1997c5159709e3b81c3f0292b2f9def109 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit b33ac32f0ff28777204eaaf27a963200024081df Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit f80465fd0f30781c8c36b46c1d6d7bba751f9e33 Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * Fix error handling in loading YAML config files * Squashed commit of the following: commit 2df0ce76ef836be1cb8ffbf3c854fe05563647b0 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 12:41:40 2024 +0800 Fix key bugs commit af6c7a2b8c2959495dc351e6f6eb2a442efe4e94 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 26da729c40008f72ce3f10c932874f120f290e26 Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit acbb1a1997c5159709e3b81c3f0292b2f9def109 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit b33ac32f0ff28777204eaaf27a963200024081df Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit f80465fd0f30781c8c36b46c1d6d7bba751f9e33 Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * 
add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * List task #num sorted * Update prompt messages for image-related tasks * Delete unused task configuration files * Remove coco_train.yaml configuration file * Update task name in mmmu.yaml * Fix error message for missing tasks * Add wandb import and integration --------- Co-authored-by: Fanyi Pu Co-authored-by: kcz358 <92624596+kcz358@users.noreply.github.com> * Remove scienceqa_img task configuration * eval scienceqa with no images --------- Co-authored-by: Bo Li Co-authored-by: kcz358 <92624596+kcz358@users.noreply.github.com> * Update hb_doc_to_text function to remove unnecessary line break * Add Fuyu model and update OtterHD model * Refactor model response handling and fix image processing bug * Refactor flatten method to support only getting the first element * Add support for specifying timezone in datetime string Update flatten method in OtterHD class Update get_datetime_str function in utils.py * Fix condition for checking wandb_args_dict in __main__.py * Commented out assertions for batch size in Fuyu model * Add warning message for existing output file * Fix batch size issue in OtterHD model * Squashed commit of the following: commit d0c8c61d9a23686d31c7e014f0c15d802e04ee61 Author: Li Bo Date: Wed Jan 31 16:00:22 2024 +0800 [Datasets] add hallubench (#34) * Add hallu bench * Fix hall_b gpt eval bugs --------- Co-authored-by: kcz358 <92624596+kcz358@users.noreply.github.com> commit f4fd4fd29b45436a96fe65395f0922612f598052 Author: Li Bo Date: Wed Jan 31 14:23:15 2024 +0800 [Datasets & Models] Fuyu, HalluBench (w/Kaichen, commit 96d95b3) (#33) * add fuyu * Merge commit 'c7ffa8dee96e228c6519154d5a00742b35caa3f2' * Squashed commit of the following: commit 96d95b3cb3540cd17bcab31f1a85ad0d04a12f1e Author: kcz358 
<92624596+kcz358@users.noreply.github.com> Date: Tue Jan 30 19:39:57 2024 +0800 Add hallu bench commit c7ffa8dee96e228c6519154d5a00742b35caa3f2 Author: Pu Fanyi Date: Tue Jan 30 14:52:51 2024 +0800 scienceqa for full set (#32) * Remove unused code and configuration file * Remove docvqa.yaml and update vizwizvqa.yaml * lint * Add dataset_kwargs to vizwizvqa.yaml * Add dataset_kwargs to vizwizvqa.yaml * textvqa (#27) * Update textvqa.yaml and utils.py * Fix YAML formatting in textvqa.yaml and remove unused files * remove useless matric * add textvqa val & test * Update progress bar description in evaluator.py * Update submission file names in VizWizVQA tasks * Update output path to include log samples suffix * Update submission file paths in OKVQA and VizWizVQA tasks * Refactor llava-in-the-wild.yaml and utils.py * Update metric for llava evaluation * Refactor logging message in Task class * Merge commit 'f80465fd0f30781c8c36b46c1d6d7bba751f9e33' * Fix formatting issues and add progress bar closing statements * Update task from "infovqa_val" to "infovqa_test" in infovqa_test.yaml * Update tqdm progress bar in OtterHD model * Squashed commit of the following: commit af6c7a2b8c2959495dc351e6f6eb2a442efe4e94 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 26da729c40008f72ce3f10c932874f120f290e26 Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit acbb1a1997c5159709e3b81c3f0292b2f9def109 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit b33ac32f0ff28777204eaaf27a963200024081df Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit f80465fd0f30781c8c36b46c1d6d7bba751f9e33 Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 
Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * Fix error handling in loading YAML config files * Squashed commit of the following: commit 2df0ce76ef836be1cb8ffbf3c854fe05563647b0 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 12:41:40 2024 +0800 Fix key bugs commit af6c7a2b8c2959495dc351e6f6eb2a442efe4e94 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 26da729c40008f72ce3f10c932874f120f290e26 Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit acbb1a1997c5159709e3b81c3f0292b2f9def109 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit b33ac32f0ff28777204eaaf27a963200024081df Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit f80465fd0f30781c8c36b46c1d6d7bba751f9e33 Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * List task #num sorted * Update prompt messages for image-related tasks * Delete unused task configuration files * Remove coco_train.yaml configuration file * Update task name in mmmu.yaml * Fix error message for missing tasks * Add wandb import and 
integration * Update generation kwargs for LMMS tasks * Update lmms_eval MME task configuration and utils * Update generation_kwargs in lmms_eval tasks * Update doc_to_text function in coco and okvqa tasks * Add COCO 2017 version * Update task name in coco_test2017.yaml * Squashed commit of the following: commit 0390783595c41232352599ab78fbe5949615e982 Author: Zhang Peiyuan Date: Mon Jan 29 22:41:33 2024 +0800 Add/mmmu test (#30) * mmmu_test * black commit 407bc2500c162d8949fbaae3d11d522afd2c9f28 Author: Li Bo Date: Sun Jan 28 22:19:13 2024 +0800 [Dataset Check] dataset check and add wandb logging (#29) * Remove unused code and configuration file * Remove docvqa.yaml and update vizwizvqa.yaml * lint * Add dataset_kwargs to vizwizvqa.yaml * Add dataset_kwargs to vizwizvqa.yaml * textvqa (#27) * Update textvqa.yaml and utils.py * Fix YAML formatting in textvqa.yaml and remove unused files * remove useless matric * add textvqa val & test * Update progress bar description in evaluator.py * Update submission file names in VizWizVQA tasks * Update output path to include log samples suffix * Update submission file paths in OKVQA and VizWizVQA tasks * Refactor llava-in-the-wild.yaml and utils.py * Update metric for llava evaluation * Refactor logging message in Task class * Merge commit 'f80465fd0f30781c8c36b46c1d6d7bba751f9e33' * Fix formatting issues and add progress bar closing statements * Update task from "infovqa_val" to "infovqa_test" in infovqa_test.yaml * Update tqdm progress bar in OtterHD model * Squashed commit of the following: commit af6c7a2b8c2959495dc351e6f6eb2a442efe4e94 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 26da729c40008f72ce3f10c932874f120f290e26 Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit acbb1a1997c5159709e3b81c3f0292b2f9def109 Author: kcz358 
<92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit b33ac32f0ff28777204eaaf27a963200024081df Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit f80465fd0f30781c8c36b46c1d6d7bba751f9e33 Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * Fix error handling in loading YAML config files * Squashed commit of the following: commit 2df0ce76ef836be1cb8ffbf3c854fe05563647b0 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 12:41:40 2024 +0800 Fix key bugs commit af6c7a2b8c2959495dc351e6f6eb2a442efe4e94 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 26da729c40008f72ce3f10c932874f120f290e26 Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit acbb1a1997c5159709e3b81c3f0292b2f9def109 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit b33ac32f0ff28777204eaaf27a963200024081df Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit f80465fd0f30781c8c36b46c1d6d7bba751f9e33 Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi 
model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * List task #num sorted * Update prompt messages for image-related tasks * Delete unused task configuration files * Remove coco_train.yaml configuration file * Update task name in mmmu.yaml * Fix error message for missing tasks * Add wandb import and integration --------- Co-authored-by: Fanyi Pu Co-authored-by: kcz358 <92624596+kcz358@users.noreply.github.com> * Remove scienceqa_img task configuration * eval scienceqa with no images --------- Co-authored-by: Bo Li Co-authored-by: kcz358 <92624596+kcz358@users.noreply.github.com> * Update hb_doc_to_text function to remove unnecessary line break * Add Fuyu model and update OtterHD model * Refactor model response handling and fix image processing bug * Refactor flatten method to support only getting the first element * Add support for specifying timezone in datetime string Update flatten method in OtterHD class Update get_datetime_str function in utils.py * Fix condition for checking wandb_args_dict in __main__.py * Commented out assertions for batch size in Fuyu model * Add warning message for existing output file commit c7ffa8dee96e228c6519154d5a00742b35caa3f2 Author: Pu Fanyi Date: Tue Jan 30 14:52:51 2024 +0800 scienceqa for full set (#32) * Remove unused code and configuration file * Remove docvqa.yaml and update vizwizvqa.yaml * lint * Add dataset_kwargs to vizwizvqa.yaml * Add dataset_kwargs to vizwizvqa.yaml * textvqa (#27) * Update textvqa.yaml and utils.py * Fix YAML formatting in textvqa.yaml and remove unused files * remove useless matric * add textvqa val & test * Update progress bar description in evaluator.py * Update submission file names in VizWizVQA tasks * Update output path to include log samples suffix * Update submission file paths in OKVQA and VizWizVQA tasks * Refactor llava-in-the-wild.yaml and utils.py * Update metric for llava evaluation * Refactor 
logging message in Task class * Merge commit 'f80465fd0f30781c8c36b46c1d6d7bba751f9e33' * Fix formatting issues and add progress bar closing statements * Update task from "infovqa_val" to "infovqa_test" in infovqa_test.yaml * Update tqdm progress bar in OtterHD model * Squashed commit of the following: commit af6c7a2b8c2959495dc351e6f6eb2a442efe4e94 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 26da729c40008f72ce3f10c932874f120f290e26 Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit acbb1a1997c5159709e3b81c3f0292b2f9def109 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit b33ac32f0ff28777204eaaf27a963200024081df Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit f80465fd0f30781c8c36b46c1d6d7bba751f9e33 Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * Fix error handling in loading YAML config files * Squashed commit of the following: commit 2df0ce76ef836be1cb8ffbf3c854fe05563647b0 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 12:41:40 2024 +0800 Fix key bugs commit af6c7a2b8c2959495dc351e6f6eb2a442efe4e94 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 26da729c40008f72ce3f10c932874f120f290e26 Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: 
Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit acbb1a1997c5159709e3b81c3f0292b2f9def109 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit b33ac32f0ff28777204eaaf27a963200024081df Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit f80465fd0f30781c8c36b46c1d6d7bba751f9e33 Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * List task #num sorted * Update prompt messages for image-related tasks * Delete unused task configuration files * Remove coco_train.yaml configuration file * Update task name in mmmu.yaml * Fix error message for missing tasks * Add wandb import and integration * Update generation kwargs for LMMS tasks * Update lmms_eval MME task configuration and utils * Update generation_kwargs in lmms_eval tasks * Update doc_to_text function in coco and okvqa tasks * Add COCO 2017 version * Update task name in coco_test2017.yaml * Squashed commit of the following: commit 0390783595c41232352599ab78fbe5949615e982 Author: Zhang Peiyuan Date: Mon Jan 29 22:41:33 2024 +0800 Add/mmmu test (#30) * mmmu_test * black commit 407bc2500c162d8949fbaae3d11d522afd2c9f28 Author: Li Bo Date: Sun Jan 28 22:19:13 2024 +0800 [Dataset Check] dataset check and add wandb logging (#29) * Remove unused code and configuration file * Remove docvqa.yaml and update vizwizvqa.yaml * lint * Add dataset_kwargs to vizwizvqa.yaml * Add dataset_kwargs to vizwizvqa.yaml * textvqa (#27) * Update textvqa.yaml and utils.py * Fix YAML formatting in 
textvqa.yaml and remove unused files * remove useless matric * add textvqa val & test * Update progress bar description in evaluator.py * Update submission file names in VizWizVQA tasks * Update output path to include log samples suffix * Update submission file paths in OKVQA and VizWizVQA tasks * Refactor llava-in-the-wild.yaml and utils.py * Update metric for llava evaluation * Refactor logging message in Task class * Merge commit 'f80465fd0f30781c8c36b46c1d6d7bba751f9e33' * Fix formatting issues and add progress bar closing statements * Update task from "infovqa_val" to "infovqa_test" in infovqa_test.yaml * Update tqdm progress bar in OtterHD model * Squashed commit of the following: commit af6c7a2b8c2959495dc351e6f6eb2a442efe4e94 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 26da729c40008f72ce3f10c932874f120f290e26 Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit acbb1a1997c5159709e3b81c3f0292b2f9def109 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit b33ac32f0ff28777204eaaf27a963200024081df Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit f80465fd0f30781c8c36b46c1d6d7bba751f9e33 Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * Fix error handling in loading YAML config files * Squashed commit of the following: commit 2df0ce76ef836be1cb8ffbf3c854fe05563647b0 
Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 12:41:40 2024 +0800 Fix key bugs commit af6c7a2b8c2959495dc351e6f6eb2a442efe4e94 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 26da729c40008f72ce3f10c932874f120f290e26 Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit acbb1a1997c5159709e3b81c3f0292b2f9def109 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit b33ac32f0ff28777204eaaf27a963200024081df Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit f80465fd0f30781c8c36b46c1d6d7bba751f9e33 Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * List task #num sorted * Update prompt messages for image-related tasks * Delete unused task configuration files * Remove coco_train.yaml configuration file * Update task name in mmmu.yaml * Fix error message for missing tasks * Add wandb import and integration --------- Co-authored-by: Fanyi Pu Co-authored-by: kcz358 <92624596+kcz358@users.noreply.github.com> * Remove scienceqa_img task configuration * eval scienceqa with no images --------- Co-authored-by: Bo Li Co-authored-by: kcz358 <92624596+kcz358@users.noreply.github.com> * Update API configuration and file paths * Refactor evaluate_by_chatgpt function in utils.py * Add hallusion_output_vd_model.json to .gitignore * Add timeout to API request * Refactor file 
path generation and remove unnecessary suffix in log samples output names * Refactor code and add output path handling * Update lmms-eval API and add new models and datasets * Refactor directory structure for RefCOCO+ and RefCOCOg datasets * Fix error logging in get_eval and parse_score functions * Update .gitignore and mme.yaml * Squashed commit of the following: commit 380494bb2417fae1bcc1535ad8b67df7af667619 Author: jzhang38 Date: Fri Feb 2 13:43:28 2024 +0800 black commit e46b937aeeed45f5dd574b852459bfb416d165fd Author: jzhang38 Date: Fri Feb 2 13:42:03 2024 +0800 adapt qwen to sqa, gqa, ai2d, docvqa commit 2da8f918c37495b3447b9c24e74234ad0bba8cbf Author: Li Bo Date: Thu Feb 1 16:20:27 2024 +0800 [Dataset] fix hallusion benchmark, add saving logic inside aggregate function (#35) * add fuyu * Merge commit 'c7ffa8dee96e228c6519154d5a00742b35caa3f2' * Squashed commit of the following: commit 96d95b3cb3540cd17bcab31f1a85ad0d04a12f1e Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Tue Jan 30 19:39:57 2024 +0800 Add hallu bench commit c7ffa8dee96e228c6519154d5a00742b35caa3f2 Author: Pu Fanyi Date: Tue Jan 30 14:52:51 2024 +0800 scienceqa for full set (#32) * Remove unused code and configuration file * Remove docvqa.yaml and update vizwizvqa.yaml * lint * Add dataset_kwargs to vizwizvqa.yaml * Add dataset_kwargs to vizwizvqa.yaml * textvqa (#27) * Update textvqa.yaml and utils.py * Fix YAML formatting in textvqa.yaml and remove unused files * remove useless matric * add textvqa val & test * Update progress bar description in evaluator.py * Update submission file names in VizWizVQA tasks * Update output path to include log samples suffix * Update submission file paths in OKVQA and VizWizVQA tasks * Refactor llava-in-the-wild.yaml and utils.py * Update metric for llava evaluation * Refactor logging message in Task class * Merge commit 'f80465fd0f30781c8c36b46c1d6d7bba751f9e33' * Fix formatting issues and add progress bar closing statements * Update task 
from "infovqa_val" to "infovqa_test" in infovqa_test.yaml * Update tqdm progress bar in OtterHD model * Squashed commit of the following: commit af6c7a2b8c2959495dc351e6f6eb2a442efe4e94 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 26da729c40008f72ce3f10c932874f120f290e26 Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit acbb1a1997c5159709e3b81c3f0292b2f9def109 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit b33ac32f0ff28777204eaaf27a963200024081df Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit f80465fd0f30781c8c36b46c1d6d7bba751f9e33 Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * Fix error handling in loading YAML config files * Squashed commit of the following: commit 2df0ce76ef836be1cb8ffbf3c854fe05563647b0 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 12:41:40 2024 +0800 Fix key bugs commit af6c7a2b8c2959495dc351e6f6eb2a442efe4e94 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 26da729c40008f72ce3f10c932874f120f290e26 Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit acbb1a1997c5159709e3b81c3f0292b2f9def109 Author: kcz358 
<92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit b33ac32f0ff28777204eaaf27a963200024081df Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit f80465fd0f30781c8c36b46c1d6d7bba751f9e33 Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * List task #num sorted * Update prompt messages for image-related tasks * Delete unused task configuration files * Remove coco_train.yaml configuration file * Update task name in mmmu.yaml * Fix error message for missing tasks * Add wandb import and integration * Update generation kwargs for LMMS tasks * Update lmms_eval MME task configuration and utils * Update generation_kwargs in lmms_eval tasks * Update doc_to_text function in coco and okvqa tasks * Add COCO 2017 version * Update task name in coco_test2017.yaml * Squashed commit of the following: commit 0390783595c41232352599ab78fbe5949615e982 Author: Zhang Peiyuan Date: Mon Jan 29 22:41:33 2024 +0800 Add/mmmu test (#30) * mmmu_test * black commit 407bc2500c162d8949fbaae3d11d522afd2c9f28 Author: Li Bo Date: Sun Jan 28 22:19:13 2024 +0800 [Dataset Check] dataset check and add wandb logging (#29) * Remove unused code and configuration file * Remove docvqa.yaml and update vizwizvqa.yaml * lint * Add dataset_kwargs to vizwizvqa.yaml * Add dataset_kwargs to vizwizvqa.yaml * textvqa (#27) * Update textvqa.yaml and utils.py * Fix YAML formatting in textvqa.yaml and remove unused files * remove useless matric * add textvqa val & test * Update progress bar description in evaluator.py * Update 
submission file names in VizWizVQA tasks * Update output path to include log samples suffix * Update submission file paths in OKVQA and VizWizVQA tasks * Refactor llava-in-the-wild.yaml and utils.py * Update metric for llava evaluation * Refactor logging message in Task class * Merge commit 'f80465fd0f30781c8c36b46c1d6d7bba751f9e33' * Fix formatting issues and add progress bar closing statements * Update task from "infovqa_val" to "infovqa_test" in infovqa_test.yaml * Update tqdm progress bar in OtterHD model * Squashed commit of the following: commit af6c7a2b8c2959495dc351e6f6eb2a442efe4e94 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 26da729c40008f72ce3f10c932874f120f290e26 Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit acbb1a1997c5159709e3b81c3f0292b2f9def109 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit b33ac32f0ff28777204eaaf27a963200024081df Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit f80465fd0f30781c8c36b46c1d6d7bba751f9e33 Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * Fix error handling in loading YAML config files * Squashed commit of the following: commit 2df0ce76ef836be1cb8ffbf3c854fe05563647b0 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 12:41:40 2024 +0800 Fix key bugs commit 
af6c7a2b8c2959495dc351e6f6eb2a442efe4e94 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 26da729c40008f72ce3f10c932874f120f290e26 Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit acbb1a1997c5159709e3b81c3f0292b2f9def109 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit b33ac32f0ff28777204eaaf27a963200024081df Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit f80465fd0f30781c8c36b46c1d6d7bba751f9e33 Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * List task #num sorted * Update prompt messages for image-related tasks * Delete unused task configuration files * Remove coco_train.yaml configuration file * Update task name in mmmu.yaml * Fix error message for missing tasks * Add wandb import and integration --------- Co-authored-by: Fanyi Pu Co-authored-by: kcz358 <92624596+kcz358@users.noreply.github.com> * Remove scienceqa_img task configuration * eval scienceqa with no images --------- Co-authored-by: Bo Li Co-authored-by: kcz358 <92624596+kcz358@users.noreply.github.com> * Update hb_doc_to_text function to remove unnecessary line break * Add Fuyu model and update OtterHD model * Refactor model response handling and fix image processing bug * Refactor flatten method to support only getting the first element * Add support for specifying timezone in datetime string Update 
flatten method in OtterHD class Update get_datetime_str function in utils.py * Fix condition for checking wandb_args_dict in __main__.py * Commented out assertions for batch size in Fuyu model * Add warning message for existing output file * Fix batch size issue in OtterHD model * Squashed commit of the following: commit d0c8c61d9a23686d31c7e014f0c15d802e04ee61 Author: Li Bo Date: Wed Jan 31 16:00:22 2024 +0800 [Datasets] add hallubench (#34) * Add hallu bench * Fix hall_b gpt eval bugs --------- Co-authored-by: kcz358 <92624596+kcz358@users.noreply.github.com> commit f4fd4fd29b45436a96fe65395f0922612f598052 Author: Li Bo Date: Wed Jan 31 14:23:15 2024 +0800 [Datasets & Models] Fuyu, HalluBench (w/Kaichen, commit 96d95b3) (#33) * add fuyu * Merge commit 'c7ffa8dee96e228c6519154d5a00742b35caa3f2' * Squashed commit of the following: commit 96d95b3cb3540cd17bcab31f1a85ad0d04a12f1e Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Tue Jan 30 19:39:57 2024 +0800 Add hallu bench commit c7ffa8dee96e228c6519154d5a00742b35caa3f2 Author: Pu Fanyi Date: Tue Jan 30 14:52:51 2024 +0800 scienceqa for full set (#32) * Remove unused code and configuration file * Remove docvqa.yaml and update vizwizvqa.yaml * lint * Add dataset_kwargs to vizwizvqa.yaml * Add dataset_kwargs to vizwizvqa.yaml * textvqa (#27) * Update textvqa.yaml and utils.py * Fix YAML formatting in textvqa.yaml and remove unused files * remove useless matric * add textvqa val & test * Update progress bar description in evaluator.py * Update submission file names in VizWizVQA tasks * Update output path to include log samples suffix * Update submission file paths in OKVQA and VizWizVQA tasks * Refactor llava-in-the-wild.yaml and utils.py * Update metric for llava evaluation * Refactor logging message in Task class * Merge commit 'f80465fd0f30781c8c36b46c1d6d7bba751f9e33' * Fix formatting issues and add progress bar closing statements * Update task from "infovqa_val" to "infovqa_test" in 
infovqa_test.yaml * Update tqdm progress bar in OtterHD model * Squashed commit of the following: commit af6c7a2b8c2959495dc351e6f6eb2a442efe4e94 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 26da729c40008f72ce3f10c932874f120f290e26 Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit acbb1a1997c5159709e3b81c3f0292b2f9def109 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit b33ac32f0ff28777204eaaf27a963200024081df Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit f80465fd0f30781c8c36b46c1d6d7bba751f9e33 Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * Fix error handling in loading YAML config files * Squashed commit of the following: commit 2df0ce76ef836be1cb8ffbf3c854fe05563647b0 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 12:41:40 2024 +0800 Fix key bugs commit af6c7a2b8c2959495dc351e6f6eb2a442efe4e94 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 26da729c40008f72ce3f10c932874f120f290e26 Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit acbb1a1997c5159709e3b81c3f0292b2f9def109 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 
2024 +0800 Enable list all tasks num commit b33ac32f0ff28777204eaaf27a963200024081df Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit f80465fd0f30781c8c36b46c1d6d7bba751f9e33 Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * List task #num sorted * Update prompt messages for image-related tasks * Delete unused task configuration files * Remove coco_train.yaml configuration file * Update task name in mmmu.yaml * Fix error message for missing tasks * Add wandb import and integration * Update generation kwargs for LMMS tasks * Update lmms_eval MME task configuration and utils * Update generation_kwargs in lmms_eval tasks * Update doc_to_text function in coco and okvqa tasks * Add COCO 2017 version * Update task name in coco_test2017.yaml * Squashed commit of the following: commit 0390783595c41232352599ab78fbe5949615e982 Author: Zhang Peiyuan Date: Mon Jan 29 22:41:33 2024 +0800 Add/mmmu test (#30) * mmmu_test * black commit 407bc2500c162d8949fbaae3d11d522afd2c9f28 Author: Li Bo Date: Sun Jan 28 22:19:13 2024 +0800 [Dataset Check] dataset check and add wandb logging (#29) * Remove unused code and configuration file * Remove docvqa.yaml and update vizwizvqa.yaml * lint * Add dataset_kwargs to vizwizvqa.yaml * Add dataset_kwargs to vizwizvqa.yaml * textvqa (#27) * Update textvqa.yaml and utils.py * Fix YAML formatting in textvqa.yaml and remove unused files * remove useless matric * add textvqa val & test * Update progress bar description in evaluator.py * Update submission file names in VizWizVQA tasks * Update output path to 
include log samples suffix * Update submission file paths in OKVQA and VizWizVQA tasks * Refactor llava-in-the-wild.yaml and utils.py * Update metric for llava evaluation * Refactor logging message in Task class * Merge commit 'f80465fd0f30781c8c36b46c1d6d7bba751f9e33' * Fix formatting issues and add progress bar closing statements * Update task from "infovqa_val" to "infovqa_test" in infovqa_test.yaml * Update tqdm progress bar in OtterHD model * Squashed commit of the following: commit af6c7a2b8c2959495dc351e6f6eb2a442efe4e94 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 26da729c40008f72ce3f10c932874f120f290e26 Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit acbb1a1997c5159709e3b81c3f0292b2f9def109 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit b33ac32f0ff28777204eaaf27a963200024081df Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit f80465fd0f30781c8c36b46c1d6d7bba751f9e33 Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * Fix error handling in loading YAML config files * Squashed commit of the following: commit 2df0ce76ef836be1cb8ffbf3c854fe05563647b0 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 12:41:40 2024 +0800 Fix key bugs commit af6c7a2b8c2959495dc351e6f6eb2a442efe4e94 Author: kcz358 <92624596+kcz358@users.noreply.github.com> 
Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 26da729c40008f72ce3f10c932874f120f290e26 Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit acbb1a1997c5159709e3b81c3f0292b2f9def109 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit b33ac32f0ff28777204eaaf27a963200024081df Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit f80465fd0f30781c8c36b46c1d6d7bba751f9e33 Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * List task #num sorted * Update prompt messages for image-related tasks * Delete unused task configuration files * Remove coco_train.yaml configuration file * Update task name in mmmu.yaml * Fix error message for missing tasks * Add wandb import and integration --------- … * [Fix] fix bugs (#41) * add fuyu * Merge commit '0f183a394426d3bf88884b4e2258ab53406bc705' * Squashed commit of the following: commit b81ed2ce4d0e226df7a41bddd82fe1f9d46a27fc Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Tue Jan 30 19:39:57 2024 +0800 Add hallu bench commit 0f183a394426d3bf88884b4e2258ab53406bc705 Author: Pu Fanyi Date: Tue Jan 30 14:52:51 2024 +0800 scienceqa for full set (#32) * Remove unused code and configuration file * Remove docvqa.yaml and update vizwizvqa.yaml * lint * Add dataset_kwargs to vizwizvqa.yaml * Add dataset_kwargs to vizwizvqa.yaml * textvqa (#27) * Update textvqa.yaml and utils.py * Fix YAML formatting 
in textvqa.yaml and remove unused files * remove useless matric * add textvqa val & test * Update progress bar description in evaluator.py * Update submission file names in VizWizVQA tasks * Update output path to include log samples suffix * Update submission file paths in OKVQA and VizWizVQA tasks * Refactor llava-in-the-wild.yaml and utils.py * Update metric for llava evaluation * Refactor logging message in Task class * Merge commit '0a403e6f5e17c70a50983c83a132edf0fdcd98de' * Fix formatting issues and add progress bar closing statements * Update task from "infovqa_val" to "infovqa_test" in infovqa_test.yaml * Update tqdm progress bar in OtterHD model * Squashed commit of the following: commit eae210c3700a59b7d5cc9de46fcb855f443096aa Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 18e4a19e82357352ab25df77b5ae4f1b011d61ae Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit e899be48f55f95172fdf96bd2a98d3b91ff2aaed Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit a999fc6889c6986c28ec5d95460a4ab5233e5d4f Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit 0a403e6f5e17c70a50983c83a132edf0fdcd98de Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * Fix error handling in loading YAML config files * Squashed commit of the following: commit fdb0c6785b0c5d6979d10e7ddf75ce9055038db8 
Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 12:41:40 2024 +0800 Fix key bugs commit eae210c3700a59b7d5cc9de46fcb855f443096aa Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 18e4a19e82357352ab25df77b5ae4f1b011d61ae Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit e899be48f55f95172fdf96bd2a98d3b91ff2aaed Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit a999fc6889c6986c28ec5d95460a4ab5233e5d4f Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit 0a403e6f5e17c70a50983c83a132edf0fdcd98de Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * List task #num sorted * Update prompt messages for image-related tasks * Delete unused task configuration files * Remove coco_train.yaml configuration file * Update task name in mmmu.yaml * Fix error message for missing tasks * Add wandb import and integration * Update generation kwargs for LMMS tasks * Update lmms_eval MME task configuration and utils * Update generation_kwargs in lmms_eval tasks * Update doc_to_text function in coco and okvqa tasks * Add COCO 2017 version * Update task name in coco_test2017.yaml * Squashed commit of the following: commit 1e2ae936c90a15d684926e43a38aac86935f38c5 Author: Zhang Peiyuan Date: Mon Jan 29 22:41:33 2024 +0800 Add/mmmu test (#30) * mmmu_test * black commit 
10bbaf01c0a4164b6f1d2628367befccf8f39c24 Author: Li Bo Date: Sun Jan 28 22:19:13 2024 +0800 [Dataset Check] dataset check and add wandb logging (#29) * Remove unused code and configuration file * Remove docvqa.yaml and update vizwizvqa.yaml * lint * Add dataset_kwargs to vizwizvqa.yaml * Add dataset_kwargs to vizwizvqa.yaml * textvqa (#27) * Update textvqa.yaml and utils.py * Fix YAML formatting in textvqa.yaml and remove unused files * remove useless matric * add textvqa val & test * Update progress bar description in evaluator.py * Update submission file names in VizWizVQA tasks * Update output path to include log samples suffix * Update submission file paths in OKVQA and VizWizVQA tasks * Refactor llava-in-the-wild.yaml and utils.py * Update metric for llava evaluation * Refactor logging message in Task class * Merge commit '0a403e6f5e17c70a50983c83a132edf0fdcd98de' * Fix formatting issues and add progress bar closing statements * Update task from "infovqa_val" to "infovqa_test" in infovqa_test.yaml * Update tqdm progress bar in OtterHD model * Squashed commit of the following: commit eae210c3700a59b7d5cc9de46fcb855f443096aa Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 18e4a19e82357352ab25df77b5ae4f1b011d61ae Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit e899be48f55f95172fdf96bd2a98d3b91ff2aaed Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit a999fc6889c6986c28ec5d95460a4ab5233e5d4f Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit 0a403e6f5e17c70a50983c83a132edf0fdcd98de Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific 
prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * Fix error handling in loading YAML config files * Squashed commit of the following: commit fdb0c6785b0c5d6979d10e7ddf75ce9055038db8 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 12:41:40 2024 +0800 Fix key bugs commit eae210c3700a59b7d5cc9de46fcb855f443096aa Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 18e4a19e82357352ab25df77b5ae4f1b011d61ae Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit e899be48f55f95172fdf96bd2a98d3b91ff2aaed Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit a999fc6889c6986c28ec5d95460a4ab5233e5d4f Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit 0a403e6f5e17c70a50983c83a132edf0fdcd98de Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * List task #num sorted * Update prompt messages for image-related tasks * Delete unused task configuration files * Remove coco_train.yaml configuration file * Update task name in mmmu.yaml * Fix error message for missing tasks * Add wandb import and integration --------- Co-authored-by: Fanyi Pu Co-authored-by: kcz358 
<92624596+kcz358@users.noreply.github.com> * Remove scienceqa_img task configuration * eval scienceqa with no images --------- Co-authored-by: Bo Li Co-authored-by: kcz358 <92624596+kcz358@users.noreply.github.com> * Update hb_doc_to_text function to remove unnecessary line break * Add Fuyu model and update OtterHD model * Refactor model response handling and fix image processing bug * Refactor flatten method to support only getting the first element * Add support for specifying timezone in datetime string Update flatten method in OtterHD class Update get_datetime_str function in utils.py * Fix condition for checking wandb_args_dict in __main__.py * Commented out assertions for batch size in Fuyu model * Add warning message for existing output file * Fix batch size issue in OtterHD model * Squashed commit of the following: commit 4b604e75cfde49df52e4abd90be4876ed9a1b08f Author: Li Bo Date: Wed Jan 31 16:00:22 2024 +0800 [Datasets] add hallubench (#34) * Add hallu bench * Fix hall_b gpt eval bugs --------- Co-authored-by: kcz358 <92624596+kcz358@users.noreply.github.com> commit 799a6bcb9033656115755c5169f8c342eb927d54 Author: Li Bo Date: Wed Jan 31 14:23:15 2024 +0800 [Datasets & Models] Fuyu, HalluBench (w/Kaichen, commit 96d95b3) (#33) * add fuyu * Merge commit '0f183a394426d3bf88884b4e2258ab53406bc705' * Squashed commit of the following: commit b81ed2ce4d0e226df7a41bddd82fe1f9d46a27fc Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Tue Jan 30 19:39:57 2024 +0800 Add hallu bench commit 0f183a394426d3bf88884b4e2258ab53406bc705 Author: Pu Fanyi Date: Tue Jan 30 14:52:51 2024 +0800 scienceqa for full set (#32) * Remove unused code and configuration file * Remove docvqa.yaml and update vizwizvqa.yaml * lint * Add dataset_kwargs to vizwizvqa.yaml * Add dataset_kwargs to vizwizvqa.yaml * textvqa (#27) * Update textvqa.yaml and utils.py * Fix YAML formatting in textvqa.yaml and remove unused files * remove useless matric * add textvqa val & test * Update 
progress bar description in evaluator.py * Update submission file names in VizWizVQA tasks * Update output path to include log samples suffix * Update submission file paths in OKVQA and VizWizVQA tasks * Refactor llava-in-the-wild.yaml and utils.py * Update metric for llava evaluation * Refactor logging message in Task class * Merge commit '0a403e6f5e17c70a50983c83a132edf0fdcd98de' * Fix formatting issues and add progress bar closing statements * Update task from "infovqa_val" to "infovqa_test" in infovqa_test.yaml * Update tqdm progress bar in OtterHD model * Squashed commit of the following: commit eae210c3700a59b7d5cc9de46fcb855f443096aa Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 18e4a19e82357352ab25df77b5ae4f1b011d61ae Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit e899be48f55f95172fdf96bd2a98d3b91ff2aaed Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit a999fc6889c6986c28ec5d95460a4ab5233e5d4f Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit 0a403e6f5e17c70a50983c83a132edf0fdcd98de Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * Fix error handling in loading YAML config files * Squashed commit of the following: commit fdb0c6785b0c5d6979d10e7ddf75ce9055038db8 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 12:41:40 2024 +0800 Fix 
key bugs commit eae210c3700a59b7d5cc9de46fcb855f443096aa Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 18e4a19e82357352ab25df77b5ae4f1b011d61ae Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit e899be48f55f95172fdf96bd2a98d3b91ff2aaed Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit a999fc6889c6986c28ec5d95460a4ab5233e5d4f Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit 0a403e6f5e17c70a50983c83a132edf0fdcd98de Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * List task #num sorted * Update prompt messages for image-related tasks * Delete unused task configuration files * Remove coco_train.yaml configuration file * Update task name in mmmu.yaml * Fix error message for missing tasks * Add wandb import and integration * Update generation kwargs for LMMS tasks * Update lmms_eval MME task configuration and utils * Update generation_kwargs in lmms_eval tasks * Update doc_to_text function in coco and okvqa tasks * Add COCO 2017 version * Update task name in coco_test2017.yaml * Squashed commit of the following: commit 1e2ae936c90a15d684926e43a38aac86935f38c5 Author: Zhang Peiyuan Date: Mon Jan 29 22:41:33 2024 +0800 Add/mmmu test (#30) * mmmu_test * black commit 10bbaf01c0a4164b6f1d2628367befccf8f39c24 Author: Li Bo Date: Sun Jan 28 22:19:13 2024 +0800 [Dataset Check] 
dataset check and add wandb logging (#29) * Remove unused code and configuration file * Remove docvqa.yaml and update vizwizvqa.yaml * lint * Add dataset_kwargs to vizwizvqa.yaml * Add dataset_kwargs to vizwizvqa.yaml * textvqa (#27) * Update textvqa.yaml and utils.py * Fix YAML formatting in textvqa.yaml and remove unused files * remove useless matric * add textvqa val & test * Update progress bar description in evaluator.py * Update submission file names in VizWizVQA tasks * Update output path to include log samples suffix * Update submission file paths in OKVQA and VizWizVQA tasks * Refactor llava-in-the-wild.yaml and utils.py * Update metric for llava evaluation * Refactor logging message in Task class * Merge commit '0a403e6f5e17c70a50983c83a132edf0fdcd98de' * Fix formatting issues and add progress bar closing statements * Update task from "infovqa_val" to "infovqa_test" in infovqa_test.yaml * Update tqdm progress bar in OtterHD model * Squashed commit of the following: commit eae210c3700a59b7d5cc9de46fcb855f443096aa Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 18e4a19e82357352ab25df77b5ae4f1b011d61ae Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit e899be48f55f95172fdf96bd2a98d3b91ff2aaed Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit a999fc6889c6986c28ec5d95460a4ab5233e5d4f Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit 0a403e6f5e17c70a50983c83a132edf0fdcd98de Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * 
refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * Fix error handling in loading YAML config files * Squashed commit of the following: commit fdb0c6785b0c5d6979d10e7ddf75ce9055038db8 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 12:41:40 2024 +0800 Fix key bugs commit eae210c3700a59b7d5cc9de46fcb855f443096aa Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 18e4a19e82357352ab25df77b5ae4f1b011d61ae Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit e899be48f55f95172fdf96bd2a98d3b91ff2aaed Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit a999fc6889c6986c28ec5d95460a4ab5233e5d4f Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit 0a403e6f5e17c70a50983c83a132edf0fdcd98de Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * List task #num sorted * Update prompt messages for image-related tasks * Delete unused task configuration files * Remove coco_train.yaml configuration file * Update task name in mmmu.yaml * Fix error message for missing tasks * Add wandb import and integration --------- Co-authored-by: Fanyi Pu Co-authored-by: kcz358 <92624596+kcz358@users.noreply.github.com> * Remove scienceqa_img task configuration * eval scienceqa with no 
images --------- Co-authored-by: Bo Li Co-authored-by: kcz358 <92624596+kcz358@users.noreply.github.com> * Update hb_doc_to_text function to remove unnecessary line break * Add Fuyu model and update OtterHD model * Refactor model response handling and fix image processing bug * Refactor flatten method to support only getting the first element * Add support for specifying timezone in datetime string Update flatten method in OtterHD class Update get_datetime_str function in utils.py * Fix condition for checking wandb_args_dict in __main__.py * Commented out assertions for batch size in Fuyu model * Add warning message for existing output file commit 0f183a394426d3bf88884b4e2258ab53406bc705 Author: Pu Fanyi Date: Tue Jan 30 14:52:51 2024 +0800 scienceqa for full set (#32) * Remove unused code and configuration file * Remove docvqa.yaml and update vizwizvqa.yaml * lint * Add dataset_kwargs to vizwizvqa.yaml * Add dataset_kwargs to vizwizvqa.yaml * textvqa (#27) * Update textvqa.yaml and utils.py * Fix YAML formatting in textvqa.yaml and remove unused files * remove useless matric * add textvqa val & test * Update progress bar description in evaluator.py * Update submission file names in VizWizVQA tasks * Update output path to include log samples suffix * Update submission file paths in OKVQA and VizWizVQA tasks * Refactor llava-in-the-wild.yaml and utils.py * Update metric for llava evaluation * Refactor logging message in Task class * Merge commit '0a403e6f5e17c70a50983c83a132edf0fdcd98de' * Fix formatting issues and add progress bar closing statements * Update task from "infovqa_val" to "infovqa_test" in infovqa_test.yaml * Update tqdm progress bar in OtterHD model * Squashed commit of the following: commit eae210c3700a59b7d5cc9de46fcb855f443096aa Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 18e4a19e82357352ab25df77b5ae4f1b011d61ae Merge: ab898e4 ad8d9da Author: kcz358 
<92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit e899be48f55f95172fdf96bd2a98d3b91ff2aaed Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit a999fc6889c6986c28ec5d95460a4ab5233e5d4f Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit 0a403e6f5e17c70a50983c83a132edf0fdcd98de Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * Fix error handling in loading YAML config files * Squashed commit of the following: commit fdb0c6785b0c5d6979d10e7ddf75ce9055038db8 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 12:41:40 2024 +0800 Fix key bugs commit eae210c3700a59b7d5cc9de46fcb855f443096aa Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 18e4a19e82357352ab25df77b5ae4f1b011d61ae Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit e899be48f55f95172fdf96bd2a98d3b91ff2aaed Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit a999fc6889c6986c28ec5d95460a4ab5233e5d4f Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit 0a403e6f5e17c70a50983c83a132edf0fdcd98de Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add 
InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * List task #num sorted * Update prompt messages for image-related tasks * Delete unused task configuration files * Remove coco_train.yaml configuration file * Update task name in mmmu.yaml * Fix error message for missing tasks * Add wandb import and integration * Update generation kwargs for LMMS tasks * Update lmms_eval MME task configuration and utils * Update generation_kwargs in lmms_eval tasks * Update doc_to_text function in coco and okvqa tasks * Add COCO 2017 version * Update task name in coco_test2017.yaml * Squashed commit of the following: commit 1e2ae936c90a15d684926e43a38aac86935f38c5 Author: Zhang Peiyuan Date: Mon Jan 29 22:41:33 2024 +0800 Add/mmmu test (#30) * mmmu_test * black commit 10bbaf01c0a4164b6f1d2628367befccf8f39c24 Author: Li Bo Date: Sun Jan 28 22:19:13 2024 +0800 [Dataset Check] dataset check and add wandb logging (#29) * Remove unused code and configuration file * Remove docvqa.yaml and update vizwizvqa.yaml * lint * Add dataset_kwargs to vizwizvqa.yaml * Add dataset_kwargs to vizwizvqa.yaml * textvqa (#27) * Update textvqa.yaml and utils.py * Fix YAML formatting in textvqa.yaml and remove unused files * remove useless matric * add textvqa val & test * Update progress bar description in evaluator.py * Update submission file names in VizWizVQA tasks * Update output path to include log samples suffix * Update submission file paths in OKVQA and VizWizVQA tasks * Refactor llava-in-the-wild.yaml and utils.py * Update metric for llava evaluation * Refactor logging message in Task class * Merge commit '0a403e6f5e17c70a50983c83a132edf0fdcd98de' * Fix formatting issues and add progress bar closing statements * 
Update task from "infovqa_val" to "infovqa_test" in infovqa_test.yaml * Update tqdm progress bar in OtterHD model * Squashed commit of the following: commit eae210c3700a59b7d5cc9de46fcb855f443096aa Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 18e4a19e82357352ab25df77b5ae4f1b011d61ae Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit e899be48f55f95172fdf96bd2a98d3b91ff2aaed Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit a999fc6889c6986c28ec5d95460a4ab5233e5d4f Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit 0a403e6f5e17c70a50983c83a132edf0fdcd98de Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * Fix error handling in loading YAML config files * Squashed commit of the following: commit fdb0c6785b0c5d6979d10e7ddf75ce9055038db8 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 12:41:40 2024 +0800 Fix key bugs commit eae210c3700a59b7d5cc9de46fcb855f443096aa Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 18e4a19e82357352ab25df77b5ae4f1b011d61ae Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit e899be48f55f95172fdf96bd2a98d3b91ff2aaed Author: kcz358 
<92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit a999fc6889c6986c28ec5d95460a4ab5233e5d4f Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit 0a403e6f5e17c70a50983c83a132edf0fdcd98de Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * List task #num sorted * Update prompt messages for image-related tasks * Delete unused task configuration files * Remove coco_train.yaml configuration file * Update task name in mmmu.yaml * Fix error message for missing tasks * Add wandb import and integration --------- Co-authored-by: Fanyi Pu Co-authored-by: kcz358 <92624596+kcz358@users.noreply.github.com> * Remove scienceqa_img task configuration * eval scienceqa with no images --------- Co-authored-by: Bo Li Co-authored-by: kcz358 <92624596+kcz358@users.noreply.github.com> * Update API configuration and file paths * Refactor evaluate_by_chatgpt function in utils.py * Add hallusion_output_vd_model.json to .gitignore * Add timeout to API request * Refactor file path generation and remove unnecessary suffix in log samples output names * Refactor code and add output path handling * Update lmms-eval API and add new models and datasets * Refactor directory structure for RefCOCO+ and RefCOCOg datasets * Fix error logging in get_eval and parse_score functions * Update .gitignore and mme.yaml * Squashed commit of the following: commit 3d44977c9254d1ee5254b2ca24c8cc54984e84b0 Author: jzhang38 Date: Fri Feb 2 13:43:28 2024 +0800 black commit a38ffeb692fbeb9deebe20f65b0f3e041823e695 Author: 
jzhang38 Date: Fri Feb 2 13:42:03 2024 +0800 adapt qwen to sqa, gqa, ai2d, docvqa commit e24607fd5725aabb7f6db5fa457b5e6a5123c199 Author: Li Bo Date: Thu Feb 1 16:20:27 2024 +0800 [Dataset] fix hallusion benchmark, add saving logic inside aggregate function (#35) * add fuyu * Merge commit '0f183a394426d3bf88884b4e2258ab53406bc705' * Squashed commit of the following: commit b81ed2ce4d0e226df7a41bddd82fe1f9d46a27fc Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Tue Jan 30 19:39:57 2024 +0800 Add hallu bench commit 0f183a394426d3bf88884b4e2258ab53406bc705 Author: Pu Fanyi Date: Tue Jan 30 14:52:51 2024 +0800 scienceqa for full set (#32) * Remove unused code and configuration file * Remove docvqa.yaml and update vizwizvqa.yaml * lint * Add dataset_kwargs to vizwizvqa.yaml * Add dataset_kwargs to vizwizvqa.yaml * textvqa (#27) * Update textvqa.yaml and utils.py * Fix YAML formatting in textvqa.yaml and remove unused files * remove useless matric * add textvqa val & test * Update progress bar description in evaluator.py * Update submission file names in VizWizVQA tasks * Update output path to include log samples suffix * Update submission file paths in OKVQA and VizWizVQA tasks * Refactor llava-in-the-wild.yaml and utils.py * Update metric for llava evaluation * Refactor logging message in Task class * Merge commit '0a403e6f5e17c70a50983c83a132edf0fdcd98de' * Fix formatting issues and add progress bar closing statements * Update task from "infovqa_val" to "infovqa_test" in infovqa_test.yaml * Update tqdm progress bar in OtterHD model * Squashed commit of the following: commit eae210c3700a59b7d5cc9de46fcb855f443096aa Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 18e4a19e82357352ab25df77b5ae4f1b011d61ae Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit 
e899be48f55f95172fdf96bd2a98d3b91ff2aaed Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit a999fc6889c6986c28ec5d95460a4ab5233e5d4f Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit 0a403e6f5e17c70a50983c83a132edf0fdcd98de Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * Fix error handling in loading YAML config files * Squashed commit of the following: commit fdb0c6785b0c5d6979d10e7ddf75ce9055038db8 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 12:41:40 2024 +0800 Fix key bugs commit eae210c3700a59b7d5cc9de46fcb855f443096aa Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 18e4a19e82357352ab25df77b5ae4f1b011d61ae Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit e899be48f55f95172fdf96bd2a98d3b91ff2aaed Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit a999fc6889c6986c28ec5d95460a4ab5233e5d4f Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit 0a403e6f5e17c70a50983c83a132edf0fdcd98de Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot 
multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * List task #num sorted * Update prompt messages for image-related tasks * Delete unused task configuration files * Remove coco_train.yaml configuration file * Update task name in mmmu.yaml * Fix error message for missing tasks * Add wandb import and integration * Update generation kwargs for LMMS tasks * Update lmms_eval MME task configuration and utils * Update generation_kwargs in lmms_eval tasks * Update doc_to_text function in coco and okvqa tasks * Add COCO 2017 version * Update task name in coco_test2017.yaml * Squashed commit of the following: commit 1e2ae936c90a15d684926e43a38aac86935f38c5 Author: Zhang Peiyuan Date: Mon Jan 29 22:41:33 2024 +0800 Add/mmmu test (#30) * mmmu_test * black commit 10bbaf01c0a4164b6f1d2628367befccf8f39c24 Author: Li Bo Date: Sun Jan 28 22:19:13 2024 +0800 [Dataset Check] dataset check and add wandb logging (#29) * Remove unused code and configuration file * Remove docvqa.yaml and update vizwizvqa.yaml * lint * Add dataset_kwargs to vizwizvqa.yaml * Add dataset_kwargs to vizwizvqa.yaml * textvqa (#27) * Update textvqa.yaml and utils.py * Fix YAML formatting in textvqa.yaml and remove unused files * remove useless matric * add textvqa val & test * Update progress bar description in evaluator.py * Update submission file names in VizWizVQA tasks * Update output path to include log samples suffix * Update submission file paths in OKVQA and VizWizVQA tasks * Refactor llava-in-the-wild.yaml and utils.py * Update metric for llava evaluation * Refactor logging message in Task class * Merge commit '0a403e6f5e17c70a50983c83a132edf0fdcd98de' * Fix formatting issues and add progress bar closing statements * Update task from "infovqa_val" to "infovqa_test" in infovqa_test.yaml * Update tqdm progress bar in OtterHD model * Squashed commit of 
the following: commit eae210c3700a59b7d5cc9de46fcb855f443096aa Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 18e4a19e82357352ab25df77b5ae4f1b011d61ae Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit e899be48f55f95172fdf96bd2a98d3b91ff2aaed Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit a999fc6889c6986c28ec5d95460a4ab5233e5d4f Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit 0a403e6f5e17c70a50983c83a132edf0fdcd98de Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * Fix error handling in loading YAML config files * Squashed commit of the following: commit fdb0c6785b0c5d6979d10e7ddf75ce9055038db8 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 12:41:40 2024 +0800 Fix key bugs commit eae210c3700a59b7d5cc9de46fcb855f443096aa Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 18e4a19e82357352ab25df77b5ae4f1b011d61ae Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit e899be48f55f95172fdf96bd2a98d3b91ff2aaed Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit 
a999fc6889c6986c28ec5d95460a4ab5233e5d4f Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit 0a403e6f5e17c70a50983c83a132edf0fdcd98de Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * List task #num sorted * Update prompt messages for image-related tasks * Delete unused task configuration files * Remove coco_train.yaml configuration file * Update task name in mmmu.yaml * Fix error message for missing tasks * Add wandb import and integration --------- Co-authored-by: Fanyi Pu Co-authored-by: kcz358 <92624596+kcz358@users.noreply.github.com> * Remove scienceqa_img task configuration * eval scienceqa with no images --------- Co-authored-by: Bo Li Co-authored-by: kcz358 <92624596+kcz358@users.noreply.github.com> * Update hb_doc_to_text function to remove unnecessary line break * Add Fuyu model and update OtterHD model * Refactor model response handling and fix image processing bug * Refactor flatten method to support only getting the first element * Add support for specifying timezone in datetime string Update flatten method in OtterHD class Update get_datetime_str function in utils.py * Fix condition for checking wandb_args_dict in __main__.py * Commented out assertions for batch size in Fuyu model * Add warning message for existing output file * Fix batch size issue in OtterHD model * Squashed commit of the following: commit 4b604e75cfde49df52e4abd90be4876ed9a1b08f Author: Li Bo Date: Wed Jan 31 16:00:22 2024 +0800 [Datasets] add hallubench (#34) * Add hallu bench * Fix hall_b gpt eval bugs --------- Co-authored-by: kcz358 
<92624596+kcz358@users.noreply.github.com> commit 799a6bcb9033656115755c5169f8c342eb927d54 Author: Li Bo Date: Wed Jan 31 14:23:15 2024 +0800 [Datasets & Models] Fuyu, HalluBench (w/Kaichen, commit 96d95b3) (#33) * add fuyu * Merge commit '0f183a394426d3bf88884b4e2258ab53406bc705' * Squashed commit of the following: commit b81ed2ce4d0e226df7a41bddd82fe1f9d46a27fc Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Tue Jan 30 19:39:57 2024 +0800 Add hallu bench commit 0f183a394426d3bf88884b4e2258ab53406bc705 Author: Pu Fanyi Date: Tue Jan 30 14:52:51 2024 +0800 scienceqa for full set (#32) * Remove unused code and configuration file * Remove docvqa.yaml and update vizwizvqa.yaml * lint * Add dataset_kwargs to vizwizvqa.yaml * Add dataset_kwargs to vizwizvqa.yaml * textvqa (#27) * Update textvqa.yaml and utils.py * Fix YAML formatting in textvqa.yaml and remove unused files * remove useless matric * add textvqa val & test * Update progress bar description in evaluator.py * Update submission file names in VizWizVQA tasks * Update output path to include log samples suffix * Update submission file paths in OKVQA and VizWizVQA tasks * Refactor llava-in-the-wild.yaml and utils.py * Update metric for llava evaluation * Refactor logging message in Task class * Merge commit '0a403e6f5e17c70a50983c83a132edf0fdcd98de' * Fix formatting issues and add progress bar closing statements * Update task from "infovqa_val" to "infovqa_test" in infovqa_test.yaml * Update tqdm progress bar in OtterHD model * Squashed commit of the following: commit eae210c3700a59b7d5cc9de46fcb855f443096aa Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 18e4a19e82357352ab25df77b5ae4f1b011d61ae Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit e899be48f55f95172fdf96bd2a98d3b91ff2aaed Author: kcz358 
<92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit a999fc6889c6986c28ec5d95460a4ab5233e5d4f Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit 0a403e6f5e17c70a50983c83a132edf0fdcd98de Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * Fix error handling in loading YAML config files * Squashed commit of the following: commit fdb0c6785b0c5d6979d10e7ddf75ce9055038db8 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 12:41:40 2024 +0800 Fix key bugs commit eae210c3700a59b7d5cc9de46fcb855f443096aa Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 18e4a19e82357352ab25df77b5ae4f1b011d61ae Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit e899be48f55f95172fdf96bd2a98d3b91ff2aaed Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit a999fc6889c6986c28ec5d95460a4ab5233e5d4f Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit 0a403e6f5e17c70a50983c83a132edf0fdcd98de Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi 
model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * List task #num sorted * Update prompt messages for image-related tasks * Delete unused task configuration files * Remove coco_train.yaml configuration file * Update task name in mmmu.yaml * Fix error message for missing tasks * Add wandb import and integration * Update generation kwargs for LMMS tasks * Update lmms_eval MME task configuration and utils * Update generation_kwargs in lmms_eval tasks * Update doc_to_text function in coco and okvqa tasks * Add COCO 2017 version * Update task name in coco_test2017.yaml * Squashed commit of the following: commit 1e2ae936c90a15d684926e43a38aac86935f38c5 Author: Zhang Peiyuan Date: Mon Jan 29 22:41:33 2024 +0800 Add/mmmu test (#30) * mmmu_test * black commit 10bbaf01c0a4164b6f1d2628367befccf8f39c24 Author: Li Bo Date: Sun Jan 28 22:19:13 2024 +0800 [Dataset Check] dataset check and add wandb logging (#29) * Remove unused code and configuration file * Remove docvqa.yaml and update vizwizvqa.yaml * lint * Add dataset_kwargs to vizwizvqa.yaml * Add dataset_kwargs to vizwizvqa.yaml * textvqa (#27) * Update textvqa.yaml and utils.py * Fix YAML formatting in textvqa.yaml and remove unused files * remove useless matric * add textvqa val & test * Update progress bar description in evaluator.py * Update submission file names in VizWizVQA tasks * Update output path to include log samples suffix * Update submission file paths in OKVQA and VizWizVQA tasks * Refactor llava-in-the-wild.yaml and utils.py * Update metric for llava evaluation * Refactor logging message in Task class * Merge commit '0a403e6f5e17c70a50983c83a132edf0fdcd98de' * Fix formatting issues and add progress bar closing statements * Update task from "infovqa_val" to "infovqa_test" in infovqa_test.yaml * Update tqdm progress bar in OtterHD model * Squashed commit of the following: commit 
eae210c3700a59b7d5cc9de46fcb855f443096aa Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 18e4a19e82357352ab25df77b5ae4f1b011d61ae Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit e899be48f55f95172fdf96bd2a98d3b91ff2aaed Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit a999fc6889c6986c28ec5d95460a4ab5233e5d4f Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit 0a403e6f5e17c70a50983c83a132edf0fdcd98de Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * Fix error handling in loading YAML config files * Squashed commit of the following: commit fdb0c6785b0c5d6979d10e7ddf75ce9055038db8 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 12:41:40 2024 +0800 Fix key bugs commit eae210c3700a59b7d5cc9de46fcb855f443096aa Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 18e4a19e82357352ab25df77b5ae4f1b011d61ae Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit e899be48f55f95172fdf96bd2a98d3b91ff2aaed Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit a999fc6889c6986c28ec5d95460a4ab5233e5d4f Author: kcz358 
<92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit 0a403e6f5e17c70a50983c83a132edf0fdcd98de Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * List task #num sorted * Update prompt messages for image-related tasks * Delete unused task configuration files * Remove coco_train.yaml configuration file * Update task name in mmmu.yaml * Fix error message for missing tasks * Add wandb import and integration --------- Co… * [Fix] fix bugs (#41) * add fuyu * Merge commit 'c7ffa8dee96e228c6519154d5a00742b35caa3f2' * Squashed commit of the following: commit 96d95b3cb3540cd17bcab31f1a85ad0d04a12f1e Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Tue Jan 30 19:39:57 2024 +0800 Add hallu bench commit c7ffa8dee96e228c6519154d5a00742b35caa3f2 Author: Pu Fanyi Date: Tue Jan 30 14:52:51 2024 +0800 scienceqa for full set (#32) * Remove unused code and configuration file * Remove docvqa.yaml and update vizwizvqa.yaml * lint * Add dataset_kwargs to vizwizvqa.yaml * Add dataset_kwargs to vizwizvqa.yaml * textvqa (#27) * Update textvqa.yaml and utils.py * Fix YAML formatting in textvqa.yaml and remove unused files * remove useless matric * add textvqa val & test * Update progress bar description in evaluator.py * Update submission file names in VizWizVQA tasks * Update output path to include log samples suffix * Update submission file paths in OKVQA and VizWizVQA tasks * Refactor llava-in-the-wild.yaml and utils.py * Update metric for llava evaluation * Refactor logging message in Task class * Merge commit 'f80465fd0f30781c8c36b46c1d6d7bba751f9e33' * 
Fix formatting issues and add progress bar closing statements * Update task from "infovqa_val" to "infovqa_test" in infovqa_test.yaml * Update tqdm progress bar in OtterHD model * Squashed commit of the following: commit af6c7a2b8c2959495dc351e6f6eb2a442efe4e94 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 26da729c40008f72ce3f10c932874f120f290e26 Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit acbb1a1997c5159709e3b81c3f0292b2f9def109 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit b33ac32f0ff28777204eaaf27a963200024081df Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit f80465fd0f30781c8c36b46c1d6d7bba751f9e33 Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * Fix error handling in loading YAML config files * Squashed commit of the following: commit 2df0ce76ef836be1cb8ffbf3c854fe05563647b0 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 12:41:40 2024 +0800 Fix key bugs commit af6c7a2b8c2959495dc351e6f6eb2a442efe4e94 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 26da729c40008f72ce3f10c932874f120f290e26 Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit 
acbb1a1997c5159709e3b81c3f0292b2f9def109 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit b33ac32f0ff28777204eaaf27a963200024081df Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit f80465fd0f30781c8c36b46c1d6d7bba751f9e33 Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * List task #num sorted * Update prompt messages for image-related tasks * Delete unused task configuration files * Remove coco_train.yaml configuration file * Update task name in mmmu.yaml * Fix error message for missing tasks * Add wandb import and integration * Update generation kwargs for LMMS tasks * Update lmms_eval MME task configuration and utils * Update generation_kwargs in lmms_eval tasks * Update doc_to_text function in coco and okvqa tasks * Add COCO 2017 version * Update task name in coco_test2017.yaml * Squashed commit of the following: commit 0390783595c41232352599ab78fbe5949615e982 Author: Zhang Peiyuan Date: Mon Jan 29 22:41:33 2024 +0800 Add/mmmu test (#30) * mmmu_test * black commit 407bc2500c162d8949fbaae3d11d522afd2c9f28 Author: Li Bo Date: Sun Jan 28 22:19:13 2024 +0800 [Dataset Check] dataset check and add wandb logging (#29) * Remove unused code and configuration file * Remove docvqa.yaml and update vizwizvqa.yaml * lint * Add dataset_kwargs to vizwizvqa.yaml * Add dataset_kwargs to vizwizvqa.yaml * textvqa (#27) * Update textvqa.yaml and utils.py * Fix YAML formatting in textvqa.yaml and remove unused files * remove useless matric * add textvqa val & test * 
Update progress bar description in evaluator.py * Update submission file names in VizWizVQA tasks * Update output path to include log samples suffix * Update submission file paths in OKVQA and VizWizVQA tasks * Refactor llava-in-the-wild.yaml and utils.py * Update metric for llava evaluation * Refactor logging message in Task class * Merge commit 'f80465fd0f30781c8c36b46c1d6d7bba751f9e33' * Fix formatting issues and add progress bar closing statements * Update task from "infovqa_val" to "infovqa_test" in infovqa_test.yaml * Update tqdm progress bar in OtterHD model * Squashed commit of the following: commit af6c7a2b8c2959495dc351e6f6eb2a442efe4e94 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 26da729c40008f72ce3f10c932874f120f290e26 Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit acbb1a1997c5159709e3b81c3f0292b2f9def109 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit b33ac32f0ff28777204eaaf27a963200024081df Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit f80465fd0f30781c8c36b46c1d6d7bba751f9e33 Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * Fix error handling in loading YAML config files * Squashed commit of the following: commit 2df0ce76ef836be1cb8ffbf3c854fe05563647b0 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 12:41:40 2024 
+0800 Fix key bugs commit af6c7a2b8c2959495dc351e6f6eb2a442efe4e94 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 26da729c40008f72ce3f10c932874f120f290e26 Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit acbb1a1997c5159709e3b81c3f0292b2f9def109 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit b33ac32f0ff28777204eaaf27a963200024081df Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit f80465fd0f30781c8c36b46c1d6d7bba751f9e33 Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * List task #num sorted * Update prompt messages for image-related tasks * Delete unused task configuration files * Remove coco_train.yaml configuration file * Update task name in mmmu.yaml * Fix error message for missing tasks * Add wandb import and integration --------- Co-authored-by: Fanyi Pu Co-authored-by: kcz358 <92624596+kcz358@users.noreply.github.com> * Remove scienceqa_img task configuration * eval scienceqa with no images --------- Co-authored-by: Bo Li Co-authored-by: kcz358 <92624596+kcz358@users.noreply.github.com> * Update hb_doc_to_text function to remove unnecessary line break * Add Fuyu model and update OtterHD model * Refactor model response handling and fix image processing bug * Refactor flatten method to support only getting the first element * Add support for specifying timezone in 
datetime string Update flatten method in OtterHD class Update get_datetime_str function in utils.py * Fix condition for checking wandb_args_dict in __main__.py * Commented out assertions for batch size in Fuyu model * Add warning message for existing output file * Fix batch size issue in OtterHD model * Squashed commit of the following: commit d0c8c61d9a23686d31c7e014f0c15d802e04ee61 Author: Li Bo Date: Wed Jan 31 16:00:22 2024 +0800 [Datasets] add hallubench (#34) * Add hallu bench * Fix hall_b gpt eval bugs --------- Co-authored-by: kcz358 <92624596+kcz358@users.noreply.github.com> commit f4fd4fd29b45436a96fe65395f0922612f598052 Author: Li Bo Date: Wed Jan 31 14:23:15 2024 +0800 [Datasets & Models] Fuyu, HalluBench (w/Kaichen, commit 96d95b3) (#33) * add fuyu * Merge commit 'c7ffa8dee96e228c6519154d5a00742b35caa3f2' * Squashed commit of the following: commit 96d95b3cb3540cd17bcab31f1a85ad0d04a12f1e Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Tue Jan 30 19:39:57 2024 +0800 Add hallu bench commit c7ffa8dee96e228c6519154d5a00742b35caa3f2 Author: Pu Fanyi Date: Tue Jan 30 14:52:51 2024 +0800 scienceqa for full set (#32) * Remove unused code and configuration file * Remove docvqa.yaml and update vizwizvqa.yaml * lint * Add dataset_kwargs to vizwizvqa.yaml * Add dataset_kwargs to vizwizvqa.yaml * textvqa (#27) * Update textvqa.yaml and utils.py * Fix YAML formatting in textvqa.yaml and remove unused files * remove useless matric * add textvqa val & test * Update progress bar description in evaluator.py * Update submission file names in VizWizVQA tasks * Update output path to include log samples suffix * Update submission file paths in OKVQA and VizWizVQA tasks * Refactor llava-in-the-wild.yaml and utils.py * Update metric for llava evaluation * Refactor logging message in Task class * Merge commit 'f80465fd0f30781c8c36b46c1d6d7bba751f9e33' * Fix formatting issues and add progress bar closing statements * Update task from "infovqa_val" to 
"infovqa_test" in infovqa_test.yaml * Update tqdm progress bar in OtterHD model * Squashed commit of the following: commit af6c7a2b8c2959495dc351e6f6eb2a442efe4e94 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 26da729c40008f72ce3f10c932874f120f290e26 Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit acbb1a1997c5159709e3b81c3f0292b2f9def109 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit b33ac32f0ff28777204eaaf27a963200024081df Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit f80465fd0f30781c8c36b46c1d6d7bba751f9e33 Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * Fix error handling in loading YAML config files * Squashed commit of the following: commit 2df0ce76ef836be1cb8ffbf3c854fe05563647b0 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 12:41:40 2024 +0800 Fix key bugs commit af6c7a2b8c2959495dc351e6f6eb2a442efe4e94 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 26da729c40008f72ce3f10c932874f120f290e26 Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit acbb1a1997c5159709e3b81c3f0292b2f9def109 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: 
Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit b33ac32f0ff28777204eaaf27a963200024081df Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit f80465fd0f30781c8c36b46c1d6d7bba751f9e33 Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * List task #num sorted * Update prompt messages for image-related tasks * Delete unused task configuration files * Remove coco_train.yaml configuration file * Update task name in mmmu.yaml * Fix error message for missing tasks * Add wandb import and integration * Update generation kwargs for LMMS tasks * Update lmms_eval MME task configuration and utils * Update generation_kwargs in lmms_eval tasks * Update doc_to_text function in coco and okvqa tasks * Add COCO 2017 version * Update task name in coco_test2017.yaml * Squashed commit of the following: commit 0390783595c41232352599ab78fbe5949615e982 Author: Zhang Peiyuan Date: Mon Jan 29 22:41:33 2024 +0800 Add/mmmu test (#30) * mmmu_test * black commit 407bc2500c162d8949fbaae3d11d522afd2c9f28 Author: Li Bo Date: Sun Jan 28 22:19:13 2024 +0800 [Dataset Check] dataset check and add wandb logging (#29) * Remove unused code and configuration file * Remove docvqa.yaml and update vizwizvqa.yaml * lint * Add dataset_kwargs to vizwizvqa.yaml * Add dataset_kwargs to vizwizvqa.yaml * textvqa (#27) * Update textvqa.yaml and utils.py * Fix YAML formatting in textvqa.yaml and remove unused files * remove useless matric * add textvqa val & test * Update progress bar description in evaluator.py * Update submission file names in VizWizVQA tasks * Update 
output path to include log samples suffix * Update submission file paths in OKVQA and VizWizVQA tasks * Refactor llava-in-the-wild.yaml and utils.py * Update metric for llava evaluation * Refactor logging message in Task class * Merge commit 'f80465fd0f30781c8c36b46c1d6d7bba751f9e33' * Fix formatting issues and add progress bar closing statements * Update task from "infovqa_val" to "infovqa_test" in infovqa_test.yaml * Update tqdm progress bar in OtterHD model * Squashed commit of the following: commit af6c7a2b8c2959495dc351e6f6eb2a442efe4e94 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 26da729c40008f72ce3f10c932874f120f290e26 Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit acbb1a1997c5159709e3b81c3f0292b2f9def109 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit b33ac32f0ff28777204eaaf27a963200024081df Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit f80465fd0f30781c8c36b46c1d6d7bba751f9e33 Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * Fix error handling in loading YAML config files * Squashed commit of the following: commit 2df0ce76ef836be1cb8ffbf3c854fe05563647b0 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 12:41:40 2024 +0800 Fix key bugs commit af6c7a2b8c2959495dc351e6f6eb2a442efe4e94 Author: kcz358 
<92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 26da729c40008f72ce3f10c932874f120f290e26 Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit acbb1a1997c5159709e3b81c3f0292b2f9def109 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit b33ac32f0ff28777204eaaf27a963200024081df Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit f80465fd0f30781c8c36b46c1d6d7bba751f9e33 Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * List task #num sorted * Update prompt messages for image-related tasks * Delete unused task configuration files * Remove coco_train.yaml configuration file * Update task name in mmmu.yaml * Fix error message for missing tasks * Add wandb import and integration --------- Co-authored-by: Fanyi Pu Co-authored-by: kcz358 <92624596+kcz358@users.noreply.github.com> * Remove scienceqa_img task configuration * eval scienceqa with no images --------- Co-authored-by: Bo Li Co-authored-by: kcz358 <92624596+kcz358@users.noreply.github.com> * Update hb_doc_to_text function to remove unnecessary line break * Add Fuyu model and update OtterHD model * Refactor model response handling and fix image processing bug * Refactor flatten method to support only getting the first element * Add support for specifying timezone in datetime string Update flatten method in OtterHD class Update get_datetime_str 
function in utils.py * Fix condition for checking wandb_args_dict in __main__.py * Commented out assertions for batch size in Fuyu model * Add warning message for existing output file commit c7ffa8dee96e228c6519154d5a00742b35caa3f2 Author: Pu Fanyi Date: Tue Jan 30 14:52:51 2024 +0800 scienceqa for full set (#32) * Remove unused code and configuration file * Remove docvqa.yaml and update vizwizvqa.yaml * lint * Add dataset_kwargs to vizwizvqa.yaml * Add dataset_kwargs to vizwizvqa.yaml * textvqa (#27) * Update textvqa.yaml and utils.py * Fix YAML formatting in textvqa.yaml and remove unused files * remove useless matric * add textvqa val & test * Update progress bar description in evaluator.py * Update submission file names in VizWizVQA tasks * Update output path to include log samples suffix * Update submission file paths in OKVQA and VizWizVQA tasks * Refactor llava-in-the-wild.yaml and utils.py * Update metric for llava evaluation * Refactor logging message in Task class * Merge commit 'f80465fd0f30781c8c36b46c1d6d7bba751f9e33' * Fix formatting issues and add progress bar closing statements * Update task from "infovqa_val" to "infovqa_test" in infovqa_test.yaml * Update tqdm progress bar in OtterHD model * Squashed commit of the following: commit af6c7a2b8c2959495dc351e6f6eb2a442efe4e94 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 26da729c40008f72ce3f10c932874f120f290e26 Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit acbb1a1997c5159709e3b81c3f0292b2f9def109 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit b33ac32f0ff28777204eaaf27a963200024081df Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit 
f80465fd0f30781c8c36b46c1d6d7bba751f9e33 Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * Fix error handling in loading YAML config files * Squashed commit of the following: commit 2df0ce76ef836be1cb8ffbf3c854fe05563647b0 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 12:41:40 2024 +0800 Fix key bugs commit af6c7a2b8c2959495dc351e6f6eb2a442efe4e94 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 26da729c40008f72ce3f10c932874f120f290e26 Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit acbb1a1997c5159709e3b81c3f0292b2f9def109 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit b33ac32f0ff28777204eaaf27a963200024081df Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit f80465fd0f30781c8c36b46c1d6d7bba751f9e33 Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * List task #num sorted * Update prompt messages for image-related tasks * Delete unused task configuration files * Remove coco_train.yaml configuration 
file * Update task name in mmmu.yaml * Fix error message for missing tasks * Add wandb import and integration * Update generation kwargs for LMMS tasks * Update lmms_eval MME task configuration and utils * Update generation_kwargs in lmms_eval tasks * Update doc_to_text function in coco and okvqa tasks * Add COCO 2017 version * Update task name in coco_test2017.yaml * Squashed commit of the following: commit 0390783595c41232352599ab78fbe5949615e982 Author: Zhang Peiyuan Date: Mon Jan 29 22:41:33 2024 +0800 Add/mmmu test (#30) * mmmu_test * black commit 407bc2500c162d8949fbaae3d11d522afd2c9f28 Author: Li Bo Date: Sun Jan 28 22:19:13 2024 +0800 [Dataset Check] dataset check and add wandb logging (#29) * Remove unused code and configuration file * Remove docvqa.yaml and update vizwizvqa.yaml * lint * Add dataset_kwargs to vizwizvqa.yaml * Add dataset_kwargs to vizwizvqa.yaml * textvqa (#27) * Update textvqa.yaml and utils.py * Fix YAML formatting in textvqa.yaml and remove unused files * remove useless matric * add textvqa val & test * Update progress bar description in evaluator.py * Update submission file names in VizWizVQA tasks * Update output path to include log samples suffix * Update submission file paths in OKVQA and VizWizVQA tasks * Refactor llava-in-the-wild.yaml and utils.py * Update metric for llava evaluation * Refactor logging message in Task class * Merge commit 'f80465fd0f30781c8c36b46c1d6d7bba751f9e33' * Fix formatting issues and add progress bar closing statements * Update task from "infovqa_val" to "infovqa_test" in infovqa_test.yaml * Update tqdm progress bar in OtterHD model * Squashed commit of the following: commit af6c7a2b8c2959495dc351e6f6eb2a442efe4e94 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 26da729c40008f72ce3f10c932874f120f290e26 Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' 
into kc/list_tasks_num commit acbb1a1997c5159709e3b81c3f0292b2f9def109 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit b33ac32f0ff28777204eaaf27a963200024081df Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit f80465fd0f30781c8c36b46c1d6d7bba751f9e33 Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * Fix error handling in loading YAML config files * Squashed commit of the following: commit 2df0ce76ef836be1cb8ffbf3c854fe05563647b0 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 12:41:40 2024 +0800 Fix key bugs commit af6c7a2b8c2959495dc351e6f6eb2a442efe4e94 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 26da729c40008f72ce3f10c932874f120f290e26 Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit acbb1a1997c5159709e3b81c3f0292b2f9def109 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit b33ac32f0ff28777204eaaf27a963200024081df Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit f80465fd0f30781c8c36b46c1d6d7bba751f9e33 Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * 
add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * List task #num sorted * Update prompt messages for image-related tasks * Delete unused task configuration files * Remove coco_train.yaml configuration file * Update task name in mmmu.yaml * Fix error message for missing tasks * Add wandb import and integration --------- Co-authored-by: Fanyi Pu Co-authored-by: kcz358 <92624596+kcz358@users.noreply.github.com> * Remove scienceqa_img task configuration * eval scienceqa with no images --------- Co-authored-by: Bo Li Co-authored-by: kcz358 <92624596+kcz358@users.noreply.github.com> * Update API configuration and file paths * Refactor evaluate_by_chatgpt function in utils.py * Add hallusion_output_vd_model.json to .gitignore * Add timeout to API request * Refactor file path generation and remove unnecessary suffix in log samples output names * Refactor code and add output path handling * Update lmms-eval API and add new models and datasets * Refactor directory structure for RefCOCO+ and RefCOCOg datasets * Fix error logging in get_eval and parse_score functions * Update .gitignore and mme.yaml * Squashed commit of the following: commit 380494bb2417fae1bcc1535ad8b67df7af667619 Author: jzhang38 Date: Fri Feb 2 13:43:28 2024 +0800 black commit e46b937aeeed45f5dd574b852459bfb416d165fd Author: jzhang38 Date: Fri Feb 2 13:42:03 2024 +0800 adapt qwen to sqa, gqa, ai2d, docvqa commit 2da8f918c37495b3447b9c24e74234ad0bba8cbf Author: Li Bo Date: Thu Feb 1 16:20:27 2024 +0800 [Dataset] fix hallusion benchmark, add saving logic inside aggregate function (#35) * add fuyu * Merge commit 'c7ffa8dee96e228c6519154d5a00742b35caa3f2' * Squashed commit of the following: commit 96d95b3cb3540cd17bcab31f1a85ad0d04a12f1e Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Tue Jan 30 19:39:57 2024 +0800 
Add hallu bench commit c7ffa8dee96e228c6519154d5a00742b35caa3f2 Author: Pu Fanyi Date: Tue Jan 30 14:52:51 2024 +0800 scienceqa for full set (#32) * Remove unused code and configuration file * Remove docvqa.yaml and update vizwizvqa.yaml * lint * Add dataset_kwargs to vizwizvqa.yaml * Add dataset_kwargs to vizwizvqa.yaml * textvqa (#27) * Update textvqa.yaml and utils.py * Fix YAML formatting in textvqa.yaml and remove unused files * remove useless matric * add textvqa val & test * Update progress bar description in evaluator.py * Update submission file names in VizWizVQA tasks * Update output path to include log samples suffix * Update submission file paths in OKVQA and VizWizVQA tasks * Refactor llava-in-the-wild.yaml and utils.py * Update metric for llava evaluation * Refactor logging message in Task class * Merge commit 'f80465fd0f30781c8c36b46c1d6d7bba751f9e33' * Fix formatting issues and add progress bar closing statements * Update task from "infovqa_val" to "infovqa_test" in infovqa_test.yaml * Update tqdm progress bar in OtterHD model * Squashed commit of the following: commit af6c7a2b8c2959495dc351e6f6eb2a442efe4e94 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 26da729c40008f72ce3f10c932874f120f290e26 Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit acbb1a1997c5159709e3b81c3f0292b2f9def109 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit b33ac32f0ff28777204eaaf27a963200024081df Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit f80465fd0f30781c8c36b46c1d6d7bba751f9e33 Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific 
prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * Fix error handling in loading YAML config files * Squashed commit of the following: commit 2df0ce76ef836be1cb8ffbf3c854fe05563647b0 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 12:41:40 2024 +0800 Fix key bugs commit af6c7a2b8c2959495dc351e6f6eb2a442efe4e94 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 26da729c40008f72ce3f10c932874f120f290e26 Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit acbb1a1997c5159709e3b81c3f0292b2f9def109 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit b33ac32f0ff28777204eaaf27a963200024081df Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit f80465fd0f30781c8c36b46c1d6d7bba751f9e33 Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * List task #num sorted * Update prompt messages for image-related tasks * Delete unused task configuration files * Remove coco_train.yaml configuration file * Update task name in mmmu.yaml * Fix error message for missing tasks * Add wandb import and integration * Update generation kwargs for LMMS tasks * Update lmms_eval MME 
task configuration and utils * Update generation_kwargs in lmms_eval tasks * Update doc_to_text function in coco and okvqa tasks * Add COCO 2017 version * Update task name in coco_test2017.yaml * Squashed commit of the following: commit 0390783595c41232352599ab78fbe5949615e982 Author: Zhang Peiyuan Date: Mon Jan 29 22:41:33 2024 +0800 Add/mmmu test (#30) * mmmu_test * black commit 407bc2500c162d8949fbaae3d11d522afd2c9f28 Author: Li Bo Date: Sun Jan 28 22:19:13 2024 +0800 [Dataset Check] dataset check and add wandb logging (#29) * Remove unused code and configuration file * Remove docvqa.yaml and update vizwizvqa.yaml * lint * Add dataset_kwargs to vizwizvqa.yaml * Add dataset_kwargs to vizwizvqa.yaml * textvqa (#27) * Update textvqa.yaml and utils.py * Fix YAML formatting in textvqa.yaml and remove unused files * remove useless matric * add textvqa val & test * Update progress bar description in evaluator.py * Update submission file names in VizWizVQA tasks * Update output path to include log samples suffix * Update submission file paths in OKVQA and VizWizVQA tasks * Refactor llava-in-the-wild.yaml and utils.py * Update metric for llava evaluation * Refactor logging message in Task class * Merge commit 'f80465fd0f30781c8c36b46c1d6d7bba751f9e33' * Fix formatting issues and add progress bar closing statements * Update task from "infovqa_val" to "infovqa_test" in infovqa_test.yaml * Update tqdm progress bar in OtterHD model * Squashed commit of the following: commit af6c7a2b8c2959495dc351e6f6eb2a442efe4e94 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 26da729c40008f72ce3f10c932874f120f290e26 Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit acbb1a1997c5159709e3b81c3f0292b2f9def109 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list 
all tasks num commit b33ac32f0ff28777204eaaf27a963200024081df Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit f80465fd0f30781c8c36b46c1d6d7bba751f9e33 Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * Fix error handling in loading YAML config files * Squashed commit of the following: commit 2df0ce76ef836be1cb8ffbf3c854fe05563647b0 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 12:41:40 2024 +0800 Fix key bugs commit af6c7a2b8c2959495dc351e6f6eb2a442efe4e94 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 26da729c40008f72ce3f10c932874f120f290e26 Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit acbb1a1997c5159709e3b81c3f0292b2f9def109 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit b33ac32f0ff28777204eaaf27a963200024081df Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit f80465fd0f30781c8c36b46c1d6d7bba751f9e33 Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d 
dataset * black * add qwenvl * add infovqa and docvqa * List task #num sorted * Update prompt messages for image-related tasks * Delete unused task configuration files * Remove coco_train.yaml configuration file * Update task name in mmmu.yaml * Fix error message for missing tasks * Add wandb import and integration --------- Co-authored-by: Fanyi Pu Co-authored-by: kcz358 <92624596+kcz358@users.noreply.github.com> * Remove scienceqa_img task configuration * eval scienceqa with no images --------- Co-authored-by: Bo Li Co-authored-by: kcz358 <92624596+kcz358@users.noreply.github.com> * Update hb_doc_to_text function to remove unnecessary line break * Add Fuyu model and update OtterHD model * Refactor model response handling and fix image processing bug * Refactor flatten method to support only getting the first element * Add support for specifying timezone in datetime string Update flatten method in OtterHD class Update get_datetime_str function in utils.py * Fix condition for checking wandb_args_dict in __main__.py * Commented out assertions for batch size in Fuyu model * Add warning message for existing output file * Fix batch size issue in OtterHD model * Squashed commit of the following: commit d0c8c61d9a23686d31c7e014f0c15d802e04ee61 Author: Li Bo Date: Wed Jan 31 16:00:22 2024 +0800 [Datasets] add hallubench (#34) * Add hallu bench * Fix hall_b gpt eval bugs --------- Co-authored-by: kcz358 <92624596+kcz358@users.noreply.github.com> commit f4fd4fd29b45436a96fe65395f0922612f598052 Author: Li Bo Date: Wed Jan 31 14:23:15 2024 +0800 [Datasets & Models] Fuyu, HalluBench (w/Kaichen, commit 96d95b3) (#33) * add fuyu * Merge commit 'c7ffa8dee96e228c6519154d5a00742b35caa3f2' * Squashed commit of the following: commit 96d95b3cb3540cd17bcab31f1a85ad0d04a12f1e Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Tue Jan 30 19:39:57 2024 +0800 Add hallu bench commit c7ffa8dee96e228c6519154d5a00742b35caa3f2 Author: Pu Fanyi Date: Tue Jan 30 14:52:51 2024 +0800 
scienceqa for full set (#32) * Remove unused code and configuration file * Remove docvqa.yaml and update vizwizvqa.yaml * lint * Add dataset_kwargs to vizwizvqa.yaml * Add dataset_kwargs to vizwizvqa.yaml * textvqa (#27) * Update textvqa.yaml and utils.py * Fix YAML formatting in textvqa.yaml and remove unused files * remove useless matric * add textvqa val & test * Update progress bar description in evaluator.py * Update submission file names in VizWizVQA tasks * Update output path to include log samples suffix * Update submission file paths in OKVQA and VizWizVQA tasks * Refactor llava-in-the-wild.yaml and utils.py * Update metric for llava evaluation * Refactor logging message in Task class * Merge commit 'f80465fd0f30781c8c36b46c1d6d7bba751f9e33' * Fix formatting issues and add progress bar closing statements * Update task from "infovqa_val" to "infovqa_test" in infovqa_test.yaml * Update tqdm progress bar in OtterHD model * Squashed commit of the following: commit af6c7a2b8c2959495dc351e6f6eb2a442efe4e94 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 26da729c40008f72ce3f10c932874f120f290e26 Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit acbb1a1997c5159709e3b81c3f0292b2f9def109 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit b33ac32f0ff28777204eaaf27a963200024081df Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit f80465fd0f30781c8c36b46c1d6d7bba751f9e33 Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi 
model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * Fix error handling in loading YAML config files * Squashed commit of the following: commit 2df0ce76ef836be1cb8ffbf3c854fe05563647b0 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 12:41:40 2024 +0800 Fix key bugs commit af6c7a2b8c2959495dc351e6f6eb2a442efe4e94 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 26da729c40008f72ce3f10c932874f120f290e26 Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit acbb1a1997c5159709e3b81c3f0292b2f9def109 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit b33ac32f0ff28777204eaaf27a963200024081df Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit f80465fd0f30781c8c36b46c1d6d7bba751f9e33 Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * List task #num sorted * Update prompt messages for image-related tasks * Delete unused task configuration files * Remove coco_train.yaml configuration file * Update task name in mmmu.yaml * Fix error message for missing tasks * Add wandb import and integration * Update generation kwargs for LMMS tasks * Update lmms_eval MME task configuration and utils * Update generation_kwargs in lmms_eval tasks * Update doc_to_text function in coco and 
okvqa tasks * Add COCO 2017 version * Update task name in coco_test2017.yaml * Squashed commit of the following: commit 0390783595c41232352599ab78fbe5949615e982 Author: Zhang Peiyuan Date: Mon Jan 29 22:41:33 2024 +0800 Add/mmmu test (#30) * mmmu_test * black commit 407bc2500c162d8949fbaae3d11d522afd2c9f28 Author: Li Bo Date: Sun Jan 28 22:19:13 2024 +0800 [Dataset Check] dataset check and add wandb logging (#29) * Remove unused code and configuration file * Remove docvqa.yaml and update vizwizvqa.yaml * lint * Add dataset_kwargs to vizwizvqa.yaml * Add dataset_kwargs to vizwizvqa.yaml * textvqa (#27) * Update textvqa.yaml and utils.py * Fix YAML formatting in textvqa.yaml and remove unused files * remove useless matric * add textvqa val & test * Update progress bar description in evaluator.py * Update submission file names in VizWizVQA tasks * Update output path to include log samples suffix * Update submission file paths in OKVQA and VizWizVQA tasks * Refactor llava-in-the-wild.yaml and utils.py * Update metric for llava evaluation * Refactor logging message in Task class * Merge commit 'f80465fd0f30781c8c36b46c1d6d7bba751f9e33' * Fix formatting issues and add progress bar closing statements * Update task from "infovqa_val" to "infovqa_test" in infovqa_test.yaml * Update tqdm progress bar in OtterHD model * Squashed commit of the following: commit af6c7a2b8c2959495dc351e6f6eb2a442efe4e94 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 26da729c40008f72ce3f10c932874f120f290e26 Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit acbb1a1997c5159709e3b81c3f0292b2f9def109 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit b33ac32f0ff28777204eaaf27a963200024081df Author: kcz358 
<92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit f80465fd0f30781c8c36b46c1d6d7bba751f9e33 Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * Fix error handling in loading YAML config files * Squashed commit of the following: commit 2df0ce76ef836be1cb8ffbf3c854fe05563647b0 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 12:41:40 2024 +0800 Fix key bugs commit af6c7a2b8c2959495dc351e6f6eb2a442efe4e94 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:46:19 2024 +0800 Black lint commit 26da729c40008f72ce3f10c932874f120f290e26 Merge: ab898e4 ad8d9da Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:45:31 2024 +0800 Merge branch 'main' into kc/list_tasks_num commit acbb1a1997c5159709e3b81c3f0292b2f9def109 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:44:23 2024 +0800 Enable list all tasks num commit b33ac32f0ff28777204eaaf27a963200024081df Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Jan 28 09:41:32 2024 +0800 Exclude train yaml file in the task list commit f80465fd0f30781c8c36b46c1d6d7bba751f9e33 Author: Zhang Peiyuan Date: Sun Jan 28 02:04:57 2024 +0800 Add InfoVQA, DocVQA, and QwenVL (#28) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * add qwenvl * add infovqa and docvqa * List task #num sorted 
* Update prompt messages for image-related tasks * Delete unused task configuration files * Remove coco_train.yaml configuration file * Update task name in mmmu.yaml * Fix error message for missing tasks * Add wandb import and integration --------- Co… * Add multiple choice output type support (#40) * Add multiple choice output type support (#40) * Fix wandb_args["name"] assignment in WandbLogger.init_run() method * Fix wandb_args["name"] assignment in WandbLogger.init_run() method * Joshua/vizwizvqa refactor (#42) * refactor vizwizvqa task * Merge commit '0cf06439d3c85aee8783034b226f1badd3a08608' * Fix exact_match accuracy calculation in vizwiz_vqa_process_results * Update vizwiz_vqa tasks --------- Co-authored-by: Fanyi Pu * Joshua/vizwizvqa refactor (#42) * refactor vizwizvqa task * Merge commit '41d044cd287adcbcf095afb1a0ef5a96c88c3d9d' * Fix exact_match accuracy calculation in vizwiz_vqa_process_results * Update vizwiz_vqa tasks --------- Co-authored-by: Fanyi Pu * [Feat] Add qwen loglikelihood (#43) * Add qwen loglikelihood * Revise the pyproject dependency. Move tiktoken out from optional-dependencies * Add ferret-bench * Add seedbench 2, test on llava * [Feat] Add qwen loglikelihood (#43) * Add qwen loglikelihood * Revise the pyproject dependency. 
Move tiktoken out from optional-dependencies * Add ferret-bench * Add seedbench 2, test on llava * add cmmmu (#44) Co-authored-by: ygjin11 <1633504509@qq.com> * add cmmmu (#44) Co-authored-by: ygjin11 <1633504509@qq.com> * add stvqa and multidocvqa (#46) * add stvqa and multidocvqa (#46) * Fix seedbench choices bugs (#45) * Fix seedbench choices bugs (#45) * [Model] Add models (#47) * Refactor logging and model initialization * Fix wandb_logger.online() method call * Add error handling during evaluation * Add wait time and error handling in get_chat_response function * Update wait_time in get_chat_response function * Refactor code for improved readability and maintainability * Refactor doc_to_visual function to handle multiple images in ICON-QA tasks * Refactor logging_utils.py and utils.py This commit refactors the `logging_utils.py` and `utils.py` files. It removes unused imports, adjusts code formatting, and updates the `get_chat_response` function to increase the `wait_time` parameter from 5 to 10. 
* Refactor code for wandb logging and generation in OtterHD class * Refactor prepare_report_by_task method in logging_utils.py * Update generation parameters in OtterHD model * Update generation parameters in OtterHD model * Squashed commit of the following: commit 5a44010c0e4dc836a244775750b4da57c55d4d3a Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Tue Feb 13 18:50:37 2024 +0800 Fix seedbench choices bugs (#45) commit cf10a451a301e7655d8e23ad94d349db3cdd02a2 Author: XinrunDu <154438029+XinrunDu@users.noreply.github.com> Date: Tue Feb 13 18:50:23 2024 +0800 add stvqa and multidocvqa (#46) commit caaad1dd8d698a4a613cd7615a824a4ec0a53e1a Author: XinrunDu <154438029+XinrunDu@users.noreply.github.com> Date: Sun Feb 11 00:54:39 2024 +0800 add cmmmu (#44) Co-authored-by: ygjin11 <1633504509@qq.com> commit cfa11b689034ac694d05752c565fd23791e53507 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Feb 11 00:54:23 2024 +0800 [Feat] Add qwen loglikelihood (#43) * Add qwen loglikelihood * Revise the pyproject dependency. 
Move tiktoken out from optional-dependencies * Add ferret-bench * Add seedbench 2, test on llava commit 4d42aa88d6c9054c2b037732d506826215fc59c2 Author: JvThunder <44111143+JvThunder@users.noreply.github.com> Date: Wed Feb 7 00:08:22 2024 +0800 Joshua/vizwizvqa refactor (#42) * refactor vizwizvqa task * Merge commit '0cf06439d3c85aee8783034b226f1badd3a08608' * Fix exact_match accuracy calculation in vizwiz_vqa_process_results * Update vizwiz_vqa tasks --------- Co-authored-by: Fanyi Pu * [Model] Add models (#47) * Refactor logging and model initialization * Fix wandb_logger.online() method call * Add error handling during evaluation * Add wait time and error handling in get_chat_response function * Update wait_time in get_chat_response function * Refactor code for improved readability and maintainability * Refactor doc_to_visual function to handle multiple images in ICON-QA tasks * Refactor logging_utils.py and utils.py This commit refactors the `logging_utils.py` and `utils.py` files. It removes unused imports, adjusts code formatting, and updates the `get_chat_response` function to increase the `wait_time` parameter from 5 to 10. 
* Refactor code for wandb logging and generation in OtterHD class * Refactor prepare_report_by_task method in logging_utils.py * Update generation parameters in OtterHD model * Update generation parameters in OtterHD model * Squashed commit of the following: commit 4011e6cc087714a1a0ca848ec604d307c0761751 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Tue Feb 13 18:50:37 2024 +0800 Fix seedbench choices bugs (#45) commit 16a6c1fef0d023b37e438710393c5162f3891c2c Author: XinrunDu <154438029+XinrunDu@users.noreply.github.com> Date: Tue Feb 13 18:50:23 2024 +0800 add stvqa and multidocvqa (#46) commit 515a7c46754dc55e963d174bdee91907773c5866 Author: XinrunDu <154438029+XinrunDu@users.noreply.github.com> Date: Sun Feb 11 00:54:39 2024 +0800 add cmmmu (#44) Co-authored-by: ygjin11 <1633504509@qq.com> commit b3a013c02934dcf328df18cb792d5907f298de54 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Feb 11 00:54:23 2024 +0800 [Feat] Add qwen loglikelihood (#43) * Add qwen loglikelihood * Revise the pyproject dependency. 
Move tiktoken out from optional-dependencies * Add ferret-bench * Add seedbench 2, test on llava commit 1b4a4776e16bf480cf94ddead55ede5004949541 Author: JvThunder <44111143+JvThunder@users.noreply.github.com> Date: Wed Feb 7 00:08:22 2024 +0800 Joshua/vizwizvqa refactor (#42) * refactor vizwizvqa task * Merge commit '41d044cd287adcbcf095afb1a0ef5a96c88c3d9d' * Fix exact_match accuracy calculation in vizwiz_vqa_process_results * Update vizwiz_vqa tasks --------- Co-authored-by: Fanyi Pu * [Fix] rearrange location of init eval_logger * [Fix] rearrange location of init eval_logger * [Support] Add prepare script deepspeed for llava (#51) * Add prepare script deepspeed for llava * Comment on using ds to prepare your model * [Support] Add prepare script deepspeed for llava (#51) * Add prepare script deepspeed for llava * Comment on using ds to prepare your model * [Feat] GPT4V added interleaved image and text support (#52) * Revise GPT4V to allow interleaved image and text * Use the first char as the answer for seedbench * Save gpt eval's answer * [Feat] GPT4V added interleaved image and text support (#52) * Revise GPT4V to allow interleaved image and text * Use the first char as the answer for seedbench * Save gpt eval's answer * Update README.md * Update README.md * Update README.md * Update README.md * [Loggings] update logging logic (#54) * [Fix] rearrange location of init eval_logger * Ignore DeprecationWarnings in lmms_eval/__main__.py and lmms_eval/models/fuyu.py * Update lmms_eval/__main__.py and lmms_eval/utils.py * update * Update llava.py with LLaVA model imports and error handling * Add and test new datasets * update * Update wandb version and require report-editing:v0 * Add support for logging samples to Weights and Biases This commit adds a new command-line argument `--wandb_log_samples` to enable logging all model outputs and documents for per-sample measurement and post-hoc analysis to Weights and Biases. 
The `cli_evaluate` function has been modified to handle this new argument and log the samples if the argument is set to True. The `wandb_logger` object has also been updated to include a new method `log_eval_samples` for logging the samples. This enhancement improves the functionality of the LMMS evaluation tool. * update * [Loggings] update logging logic (#54) * [Fix] rearrange location of init eval_logger * Ignore DeprecationWarnings in lmms_eval/__main__.py and lmms_eval/models/fuyu.py * Update lmms_eval/__main__.py and lmms_eval/utils.py * update * Update llava.py with LLaVA model imports and error handling * Add and test new datasets * update * Update wandb version and require report-editing:v0 * Add support for logging samples to Weights and Biases This commit adds a new command-line argument `--wandb_log_samples` to enable logging all model outputs and documents for per-sample measurement and post-hoc analysis to Weights and Biases. The `cli_evaluate` function has been modified to handle this new argument and log the samples if the argument is set to True. The `wandb_logger` object has also been updated to include a new method `log_eval_samples` for logging the samples. This enhancement improves the functionality of the LMMS evaluation tool. 
* update * Dataset Refractor (#50) * refactor vizwizvqa task * Delete vqav2_test and vqav2_val YAML files * Refactor vqav2_process_results functions * Add a pack for vqav2 * refactor okvqa * roll back vizwiz_vqa * Fix exact_match calculation in ok_vqa_process_results * Update OKVQA dataset name in readme * add model_specific_prompt_kwargs * add model_specific_prompt_kwargs to vizwiz_vqa * add model_specific_prompt_kwargs for vqav2 * lint * fix a small bug for eval_logger * Refactor make_table function to display points as " - " if value is None * Merge commit 'c5e52a785d3cc87a866be9b880deb477d9f73fb7' * Refactor ok_vqa_aggreate_submissions function * Merge commit 'e5aa0a9601d6d8ce727315e4b0a8f13f06f26bff' * Refactor VQA submission file saving * Update file utils * Merge commit '560deca9f72483ca091795d6dc2537d4c54b32b0' * Refactor file path handling and submission generation * OKVQA path * vizwizvqa file * pack cmmmu * fix a small metric bug for cmmmu * Add higher_is_better flag to submission metric * Add CMMMU dataset to README.md * Add logging and refactor submission file generation in docvqa utils.py * pack docvqa * add traceback to print detailed error * Refactor docvqa_test_aggregate_results to accept additional arguments * Add metric check in evaluator.py and update test.yaml and val.yaml * add common `EvalAIAnswerProcessor` for okvqa, textvqa, vizwizvqa and vqav2 * merge textvqa * textvqa * Modify submission file generation for COCO test results * Update test result storage path * update coco cap file name * Update COCO 2017 Caption dataset name * ferret * Add Ferret dataset * Refactor hb_doc_to_text function to include model-specific prompts * Add IconQA and its subtasks * Refactor image list creation in doc_to_visual function * Add process_results function to default template * Update process_results function in iconqa utils.py * refactor flickr30k * change aggregation function * Fix formatting issues and update logging message * Fix llava can not handle 
only text question (no visuals) * Fix qwen can not handle no image question (no visuals) * Add fuyu prepare accelerator scripts * refactor mme * naming consistency * aggregation_submissions consistency * flickr30k naming consistency * remove submissions for mme * remove unused submission function * Refactor infovqa_test.yaml and infovqa_val.yaml * Refactor code for improved readability and maintainability * stvqa * remane sqa * Update lmms_eval textcaps files and utils.py * Update default prompt for text captions * Refactor textcaps_aggregation_result function * Add generate_submission_file function and update mathvista_aggregate_results signature * Update nocaps_test.yaml and nocaps_val.yaml * refractor internal_eval * Add internal evaluation datasets * pack multidocvqa * mmvet * Fix gpt eval timeout issue for hallubench, restore load from gpt to avoid re evaluating * Refractor llava wild * Refractor llava-bench-coco * Add JSON file generation for gpt evaluation details * mmmu * Remove MMBench English and Chinese tasks * Remove unnecessary return statement in mmbench_aggregate_test_results function * Fix distributed process group initialization * Update dataset paths and group names in mmbench test configs * Update import statements in cc_utils.py, cn_utils.py, and en_utils.py * Add torch module import * lint * Remove IconQA dataset from README.md * Add Multi-DocVQA and its submodules * Add new datasets and update task names --------- Co-authored-by: JvThunder Co-authored-by: kcz358 * Dataset Refractor (#50) * refactor vizwizvqa task * Delete vqav2_test and vqav2_val YAML files * Refactor vqav2_process_results functions * Add a pack for vqav2 * refactor okvqa * roll back vizwiz_vqa * Fix exact_match calculation in ok_vqa_process_results * Update OKVQA dataset name in readme * add model_specific_prompt_kwargs * add model_specific_prompt_kwargs to vizwiz_vqa * add model_specific_prompt_kwargs for vqav2 * lint * fix a small bug for eval_logger * Refactor make_table 
function to display points as " - " if value is None * Merge commit '90f42f0876a4914c5ac0d213b9dffbfb4797ff62' * Refactor ok_vqa_aggreate_submissions function * Merge commit '4afec3303a0a7ed27a8265565343bf2851b9e4c7' * Refactor VQA submission file saving * Update file utils * Merge commit 'c144b75f0c9145a625b2bbdef5123ed81e343a11' * Refactor file path handling and submission generation * OKVQA path * vizwizvqa file * pack cmmmu * fix a small metric bug for cmmmu * Add higher_is_better flag to submission metric * Add CMMMU dataset to README.md * Add logging and refactor submission file generation in docvqa utils.py * pack docvqa * add traceback to print detailed error * Refactor docvqa_test_aggregate_results to accept additional arguments * Add metric check in evaluator.py and update test.yaml and val.yaml * add common `EvalAIAnswerProcessor` for okvqa, textvqa, vizwizvqa and vqav2 * merge textvqa * textvqa * Modify submission file generation for COCO test results * Update test result storage path * update coco cap file name * Update COCO 2017 Caption dataset name * ferret * Add Ferret dataset * Refactor hb_doc_to_text function to include model-specific prompts * Add IconQA and its subtasks * Refactor image list creation in doc_to_visual function * Add process_results function to default template * Update process_results function in iconqa utils.py * refactor flickr30k * change aggregation function * Fix formatting issues and update logging message * Fix llava can not handle only text question (no visuals) * Fix qwen can not handle no image question (no visuals) * Add fuyu prepare accelerator scripts * refactor mme * naming consistency * aggregation_submissions consistency * flickr30k naming consistency * remove submissions for mme * remove unused submission function * Refactor infovqa_test.yaml and infovqa_val.yaml * Refactor code for improved readability and maintainability * stvqa * remane sqa * Update lmms_eval textcaps files and utils.py * Update default prompt 
for text captions * Refactor textcaps_aggregation_result function * Add generate_submission_file function and update mathvista_aggregate_results signature * Update nocaps_test.yaml and nocaps_val.yaml * refactor internal_eval * Add internal evaluation datasets * pack multidocvqa * mmvet * Fix gpt eval timeout issue for hallubench, restore load from gpt to avoid re-evaluating * Refactor llava wild * Refactor llava-bench-coco * Add JSON file generation for gpt evaluation details * mmmu * Remove MMBench English and Chinese tasks * Remove unnecessary return statement in mmbench_aggregate_test_results function * Fix distributed process group initialization * Update dataset paths and group names in mmbench test configs * Update import statements in cc_utils.py, cn_utils.py, and en_utils.py * Add torch module import * lint * Remove IconQA dataset from README.md * Add Multi-DocVQA and its submodules * Add new datasets and update task names --------- Co-authored-by: JvThunder Co-authored-by: kcz358 * [Wandb Logger] add models, and args to wandb tables. 
(#55) * Refactor logging in lmms_eval package * Refactor variable names in lmms_eval package * Pufanyi/flickr30k refractor (#56) * refactor vizwizvqa task * Delete vqav2_test and vqav2_val YAML files * Refactor vqav2_process_results functions * Add a pack for vqav2 * refactor okvqa * roll back vizwiz_vqa * Fix exact_match calculation in ok_vqa_process_results * Update OKVQA dataset name in readme * add model_specific_prompt_kwargs * add model_specific_prompt_kwargs to vizwiz_vqa * add model_specific_prompt_kwargs for vqav2 * lint * fix a small bug for eval_logger * Refactor make_table function to display points as " - " if value is None * Merge commit 'c5e52a785d3cc87a866be9b880deb477d9f73fb7' * Refactor ok_vqa_aggreate_submissions function * Merge commit 'e5aa0a9601d6d8ce727315e4b0a8f13f06f26bff' * Refactor VQA submission file saving * Update file utils * Merge commit '560deca9f72483ca091795d6dc2537d4c54b32b0' * Refactor file path handling and submission generation * OKVQA path * vizwizvqa file * pack cmmmu * fix a small metric bug for cmmmu * Add higher_is_better flag to submission metric * Add CMMMU dataset to README.md * Add logging and refactor submission file generation in docvqa utils.py * pack docvqa * add traceback to print detailed error * Refactor docvqa_test_aggregate_results to accept additional arguments * Add metric check in evaluator.py and update test.yaml and val.yaml * add common `EvalAIAnswerProcessor` for okvqa, textvqa, vizwizvqa and vqav2 * merge textvqa * textvqa * Modify submission file generation for COCO test results * Update test result storage path * update coco cap file name * Update COCO 2017 Caption dataset name * ferret * Add Ferret dataset * Refactor hb_doc_to_text function to include model-specific prompts * Add IconQA and its subtasks * Refactor image list creation in doc_to_visual function * Add process_results function to default template * Update process_results function in iconqa utils.py * refactor flickr30k * change 
aggregation function * Fix formatting issues and update logging message * Fix llava can not handle only text question (no visuals) * Fix qwen can not handle no image question (no visuals) * Add fuyu prepare accelerator scripts * refactor mme * naming consistency * aggregation_submissions consistency * flickr30k naming consistency * remove submissions for mme * remove unused submission function * Refactor infovqa_test.yaml and infovqa_val.yaml * Refactor code for improved readability and maintainability * stvqa * remane sqa * Update lmms_eval textcaps files and utils.py * Update default prompt for text captions * Refactor textcaps_aggregation_result function * Add generate_submission_file function and update mathvista_aggregate_results signature * Update nocaps_test.yaml and nocaps_val.yaml * refractor internal_eval * Add internal evaluation datasets * pack multidocvqa * mmvet * Fix gpt eval timeout issue for hallubench, restore load from gpt to avoid re evaluating * Refractor llava wild * Refractor llava-bench-coco * Add JSON file generation for gpt evaluation details * mmmu * Remove MMBench English and Chinese tasks * Remove unnecessary return statement in mmbench_aggregate_test_results function * Fix distributed process group initialization * Update dataset paths and group names in mmbench test configs * Update import statements in cc_utils.py, cn_utils.py, and en_utils.py * Add torch module import * lint * Remove IconQA dataset from README.md * Add Multi-DocVQA and its submodules * Add new datasets and update task names * Refactor flickr_aggregation_result function to accept additional arguments * Add timeout kwargs in Accelerator constructor * Add encoding to be utf-8 for cmmmu * Fix llava try and catch, remove torch.distributed.init in main * Ds prepare script for llava --------- Co-authored-by: JvThunder Co-authored-by: kcz358 * Pufanyi/flickr30k refractor (#56) * refactor vizwizvqa task * Delete vqav2_test and vqav2_val YAML files * Refactor 
vqav2_process_results functions * Add a pack for vqav2 * refactor okvqa * roll back vizwiz_vqa * Fix exact_match calculation in ok_vqa_process_results * Update OKVQA dataset name in readme * add model_specific_prompt_kwargs * add model_specific_prompt_kwargs to vizwiz_vqa * add model_specific_prompt_kwargs for vqav2 * lint * fix a small bug for eval_logger * Refactor make_table function to display points as " - " if value is None * Merge commit '90f42f0876a4914c5ac0d213b9dffbfb4797ff62' * Refactor ok_vqa_aggreate_submissions function * Merge commit '4afec3303a0a7ed27a8265565343bf2851b9e4c7' * Refactor VQA submission file saving * Update file utils * Merge commit 'c144b75f0c9145a625b2bbdef5123ed81e343a11' * Refactor file path handling and submission generation * OKVQA path * vizwizvqa file * pack cmmmu * fix a small metric bug for cmmmu * Add higher_is_better flag to submission metric * Add CMMMU dataset to README.md * Add logging and refactor submission file generation in docvqa utils.py * pack docvqa * add traceback to print detailed error * Refactor docvqa_test_aggregate_results to accept additional arguments * Add metric check in evaluator.py and update test.yaml and val.yaml * add common `EvalAIAnswerProcessor` for okvqa, textvqa, vizwizvqa and vqav2 * merge textvqa * textvqa * Modify submission file generation for COCO test results * Update test result storage path * update coco cap file name * Update COCO 2017 Caption dataset name * ferret * Add Ferret dataset * Refactor hb_doc_to_text function to include model-specific prompts * Add IconQA and its subtasks * Refactor image list creation in doc_to_visual function * Add process_results function to default template * Update process_results function in iconqa utils.py * refactor flickr30k * change aggregation function * Fix formatting issues and update logging message * Fix llava can not handle only text question (no visuals) * Fix qwen can not handle no image question (no visuals) * Add fuyu prepare accelerator 
scripts * refactor mme * naming consistency * aggregation_submissions consistency * flickr30k naming consistency * remove submissions for mme * remove unused submission function * Refactor infovqa_test.yaml and infovqa_val.yaml * Refactor code for improved readability and maintainability * stvqa * rename sqa * Update lmms_eval textcaps files and utils.py * Update default prompt for text captions * Refactor textcaps_aggregation_result function * Add generate_submission_file function and update mathvista_aggregate_results signature * Update nocaps_test.yaml and nocaps_val.yaml * refactor internal_eval * Add internal evaluation datasets * pack multidocvqa * mmvet * Fix gpt eval timeout issue for hallubench, restore load from gpt to avoid re-evaluating * Refactor llava wild * Refactor llava-bench-coco * Add JSON file generation for gpt evaluation details * mmmu * Remove MMBench English and Chinese tasks * Remove unnecessary return statement in mmbench_aggregate_test_results function * Fix distributed process group initialization * Update dataset paths and group names in mmbench test configs * Update import statements in cc_utils.py, cn_utils.py, and en_utils.py * Add torch module import * lint * Remove IconQA dataset from README.md * Add Multi-DocVQA and its submodules * Add new datasets and update task names * Refactor flickr_aggregation_result function to accept additional arguments * Add timeout kwargs in Accelerator constructor * Add encoding to be utf-8 for cmmmu * Fix llava try and catch, remove torch.distributed.init in main * Ds prepare script for llava --------- Co-authored-by: JvThunder Co-authored-by: kcz358 * Dev/py add models (#57) * add instructblip * minicpm_v * remove from qwen-vl * speed up postprocessing * Optimize build context speed --------- Co-authored-by: Pu Fanyi Co-authored-by: kcz358 * Dev/py add models (#57) * add instructblip * minicpm_v * remove from qwen-vl * speed up postprocessing * Optimize build context speed --------- 
Co-authored-by: Pu Fanyi Co-authored-by: kcz358 * [Fix] refcocog dataset path, record gpt prompt in internal eval, build context issue (#59) * Refactor logging in lmms_eval package * Refactor variable names in lmms_eval package * Update README.md with new features and installation instructions * Update supported models and datasets * Delete otter.py file * Fix capitalization in README.md * Update image sizes and add new features * Refactor README.md to improve readability and add new features * Add description for lmms-eval in README.md * Update accelerator support in README.md * Update lmms-eval README with improved description and additional features * Update README.md with improved task grouping description * change `Otter-AI/MME` to `lmms-lab/MME` * Update README.md * Update README.md * Remove unused code in mme.yaml * Squashed commit of the following: commit 6b20902d94ef9120181cd26cdce1e139046dbdf4 Author: Zhang Peiyuan Date: Thu Feb 29 13:40:02 2024 +0800 Dev/py add models (#57) * add instructblip * minicpm_v * remove from qwen-vl * speed up postprocessing * Optimize build context speed --------- Co-authored-by: Pu Fanyi Co-authored-by: kcz358 commit 21050baf08c4bc87c77d98dea55b6eecfefe1c0a Author: Pu Fanyi Date: Wed Feb 28 14:49:07 2024 +0800 Pufanyi/flickr30k refractor (#56) * refactor vizwizvqa task * Delete vqav2_test and vqav2_val YAML files * Refactor vqav2_process_results functions * Add a pack for vqav2 * refactor okvqa * roll back vizwiz_vqa * Fix exact_match calculation in ok_vqa_process_results * Update OKVQA dataset name in readme * add model_specific_prompt_kwargs * add model_specific_prompt_kwargs to vizwiz_vqa * add model_specific_prompt_kwargs for vqav2 * lint * fix a small bug for eval_logger * Refactor make_table function to display points as " - " if value is None * Merge commit 'c5e52a785d3cc87a866be9b880deb477d9f73fb7' * Refactor ok_vqa_aggreate_submissions function * Merge commit 'e5aa0a9601d6d8ce727315e4b0a8f13f06f26bff' * Refactor VQA 
submission file saving * Update file utils * Merge commit '560deca9f72483ca091795d6dc2537d4c54b32b0' * Refactor file path handling and submission generation * OKVQA path * vizwizvqa file * pack cmmmu * fix a small metric bug for cmmmu * Add higher_is_better flag to submission metric * Add CMMMU dataset to README.md * Add logging and refactor submission file generation in docvqa utils.py * pack docvqa * add traceback to print detailed error * Refactor docvqa_test_aggregate_results to accept additional arguments * Add metric check in evaluator.py and update test.yaml and val.yaml * add common `EvalAIAnswerProcessor` for okvqa, textvqa, vizwizvqa and vqav2 * merge textvqa * textvqa * Modify submission file generation for COCO test results * Update test result storage path * update coco cap file name * Update COCO 2017 Caption dataset name * ferret * Add Ferret dataset * Refactor hb_doc_to_text function to include model-specific prompts * Add IconQA and its subtasks * Refactor image list creation in doc_to_visual function * Add process_results function to default template * Update process_results function in iconqa utils.py * refactor flickr30k * change aggregation function * Fix formatting issues and update logging message * Fix llava can not handle only text question (no visuals) * Fix qwen can not handle no image question (no visuals) * Add fuyu prepare accelerator scripts * refactor mme * naming consistency * aggregation_submissions consistency * flickr30k naming consistency * remove submissions for mme * remove unused submission function * Refactor infovqa_test.yaml and infovqa_val.yaml * Refactor code for improved readability and maintainability * stvqa * remane sqa * Update lmms_eval textcaps files and utils.py * Update default prompt for text captions * Refactor textcaps_aggregation_result function * Add generate_submission_file function and update mathvista_aggregate_results signature * Update nocaps_test.yaml and nocaps_val.yaml * refractor internal_eval * 
Add internal evaluation datasets * pack multidocvqa * mmvet * Fix gpt eval timeout issue for hallubench, restore load from gpt to avoid re evaluating * Refractor llava wild * Refractor llava-bench-coco * Add JSON file generation for gpt evaluation details * mmmu * Remove MMBench English and Chinese tasks * Remove unnecessary return statement in mmbench_aggregate_test_results function * Fix distributed process group initialization * Update dataset paths and group names in mmbench test configs * Update import statements in cc_utils.py, cn_utils.py, and en_utils.py * Add torch module import * lint * Remove IconQA dataset from README.md * Add Multi-DocVQA and its submodules * Add new datasets and update task names * Refactor flickr_aggregation_result function to accept additional arguments * Add timeout kwargs in Accelerator constructor * Add encoding to be utf-8 for cmmmu * Fix llava try and catch, remove torch.distributed.init in main * Ds prepare script for llava --------- Co-authored-by: JvThunder Co-authored-by: kcz358 commit ba0e7f51e255540791d030468f79a126fec40eb5 Author: Li Bo Date: Tue Feb 27 22:52:07 2024 +0800 [Wandb Logger] add models, and args to wandb tables. (#55) * Refactor logging in lmms_eval package * Refactor variable names in lmms_eval package * add llava main in pyproject * Update README.md * Remove unnecessary dependencies and add specific version for llava_repr * Add dependencies for llava_repr*** * Update README.md * add some docs on models and command line commands * remove some lines * typo * Update model_guide.md * Update model_guide.md * Update README.md * Update README.md * Update README.md * Fix refcocog dataset path * Record gpt response in eval info * Resolve conflict * Fix hallusionbench gpt json saving path * Rename hallubench gpt output path * Change remove image to check by type instead of check by names * More robust check by type * Remove unnecessary img in data * Forcing an empty commit. 
* Testing * Delete unnecessary things * Fix seedbench2 image issue in doc_to_text * Add conditional exclude for internal eval * Fix small bugs in list_with_num * Revise list_with_num model args --------- Co-authored-by: Bo Li Co-authored-by: Fanyi Pu Co-authored-by: jzhang38 * [Fix] refcocog dataset path, record gpt prompt in internal eval, build context issue (#59) * Refactor logging in lmms_eval package * Refactor variable names in lmms_eval package * Update README.md with new features and installation instructions * Update supported models and datasets * Delete otter.py file * Fix capitalization in README.md * Update image sizes and add new features * Refactor README.md to improve readability and add new features * Add description for lmms-eval in README.md * Update accelerator support in README.md * Update lmms-eval README with improved description and additional features * Update README.md with improved task grouping description * change `Otter-AI/MME` to `lmms-lab/MME` * Update README.md * Update README.md * Remove unused code in mme.yaml * Squashed commit of the following: commit 2a45079a974c55aeb8e04d367759e3cf383072c1 Author: Zhang Peiyuan Date: Thu Feb 29 13:40:02 2024 +0800 Dev/py add models (#57) * add instructblip * minicpm_v * remove from qwen-vl * speed up postprocessing * Optimize build context speed --------- Co-authored-by: Pu Fanyi Co-authored-by: kcz358 commit 7bdab7a743092cdca66b049e15332d8c21680147 Author: Pu Fanyi Date: Wed Feb 28 14:49:07 2024 +0800 Pufanyi/flickr30k refractor (#56) * refactor vizwizvqa task * Delete vqav2_test and vqav2_val YAML files * Refactor vqav2_process_results functions * Add a pack for vqav2 * refactor okvqa * roll back vizwiz_vqa * Fix exact_match calculation in ok_vqa_process_results * Update OKVQA dataset name in readme * add model_specific_prompt_kwargs * add model_specific_prompt_kwargs to vizwiz_vqa * add model_specific_prompt_kwargs for vqav2 * lint * fix a small bug for eval_logger * Refactor make_table 
function to display points as " - " if value is None * Merge commit '90f42f0876a4914c5ac0d213b9dffbfb4797ff62' * Refactor ok_vqa_aggreate_submissions function * Merge commit '4afec3303a0a7ed27a8265565343bf2851b9e4c7' * Refactor VQA submission file saving * Update file utils * Merge commit 'c144b75f0c9145a625b2bbdef5123ed81e343a11' * Refactor file path handling and submission generation * OKVQA path * vizwizvqa file * pack cmmmu * fix a small metric bug for cmmmu * Add higher_is_better flag to submission metric * Add CMMMU dataset to README.md * Add logging and refactor submission file generation in docvqa utils.py * pack docvqa * add traceback to print detailed error * Refactor docvqa_test_aggregate_results to accept additional arguments * Add metric check in evaluator.py and update test.yaml and val.yaml * add common `EvalAIAnswerProcessor` for okvqa, textvqa, vizwizvqa and vqav2 * merge textvqa * textvqa * Modify submission file generation for COCO test results * Update test result storage path * update coco cap file name * Update COCO 2017 Caption dataset name * ferret * Add Ferret dataset * Refactor hb_doc_to_text function to include model-specific prompts * Add IconQA and its subtasks * Refactor image list creation in doc_to_visual function * Add process_results function to default template * Update process_results function in iconqa utils.py * refactor flickr30k * change aggregation function * Fix formatting issues and update logging message * Fix llava can not handle only text question (no visuals) * Fix qwen can not handle no image question (no visuals) * Add fuyu prepare accelerator scripts * refactor mme * naming consistency * aggregation_submissions consistency * flickr30k naming consistency * remove submissions for mme * remove unused submission function * Refactor infovqa_test.yaml and infovqa_val.yaml * Refactor code for improved readability and maintainability * stvqa * remane sqa * Update lmms_eval textcaps files and utils.py * Update default prompt 
for text captions * Refactor textcaps_aggregation_result function * Add generate_submission_file function and update mathvista_aggregate_results signature * Update nocaps_test.yaml and nocaps_val.yaml * refractor internal_eval * Add internal evaluation datasets * pack multidocvqa * mmvet * Fix gpt eval timeout issue for hallubench, restore load from gpt to avoid re evaluating * Refractor llava wild * Refractor llava-bench-coco * Add JSON file generation for gpt evaluation details * mmmu * Remove MMBench English and Chinese tasks * Remove unnecessary return statement in mmbench_aggregate_test_results function * Fix distributed process group initialization * Update dataset paths and group names in mmbench test configs * Update import statements in cc_utils.py, cn_utils.py, and en_utils.py * Add torch module import * lint * Remove IconQA dataset from README.md * Add Multi-DocVQA and its submodules * Add new datasets and update task names * Refactor flickr_aggregation_result function to accept additional arguments * Add timeout kwargs in Accelerator constructor * Add encoding to be utf-8 for cmmmu * Fix llava try and catch, remove torch.distributed.init in main * Ds prepare script for llava --------- Co-authored-by: JvThunder Co-authored-by: kcz358 commit d3dfd94ec91706587bf9f57dfebaad2cc3b5281c Author: Li Bo Date: Tue Feb 27 22:52:07 2024 +0800 [Wandb Logger] add models, and args to wandb tables. 
(#55) * Refactor logging in lmms_eval package * Refactor variable names in lmms_eval package * add llava main in pyproject * Update README.md * Remove unnecessary dependencies and add specific version for llava_repr * Add dependencies for llava_repr*** * Update README.md * add some docs on models and command line commands * remove some lines * typo * Update model_guide.md * Update model_guide.md * Update README.md * Update README.md * Update README.md * Fix refcocog dataset path * Record gpt response in eval info * Resolve conflict * Fix hallusionbench gpt json saving path * Rename hallubench gpt output path * Change remove image to check by type instead of check by names * More robust check by type * Remove unnecessary img in data * Forcing an empty commit. * Testing * Delete unnecessary things * Fix seedbench2 image issue in doc_to_text * Add conditional exclude for internal eval * Fix small bugs in list_with_num * Revise list_with_num model args --------- Co-authored-by: Bo Li Co-authored-by: Fanyi Pu Co-authored-by: jzhang38 * [Fix] wandb group logging missing columns (#61) * Refactor logging in lmms_eval package * Refactor variable names in lmms_eval package * Update README.md with new features and installation instructions * Update supported models and datasets * Delete otter.py file * Fix capitalization in README.md * Update image sizes and add new features * Refactor README.md to improve readability and add new features * Add description for lmms-eval in README.md * Update accelerator support in README.md * Update lmms-eval README with improved description and additional features * Update README.md with improved task grouping description * change `Otter-AI/MME` to `lmms-lab/MME` * Update README.md * Update README.md * Remove unused code in mme.yaml * Squashed commit of the following: commit 6b20902d94ef9120181cd26cdce1e139046dbdf4 Author: Zhang Peiyuan Date: Thu Feb 29 13:40:02 2024 +0800 Dev/py add models (#57) * add instructblip * minicpm_v * remove from 
qwen-vl * speed up postprocessing * Optimize build context speed --------- Co-authored-by: Pu Fanyi Co-authored-by: kcz358 commit 21050baf08c4bc87c77d98dea55b6eecfefe1c0a Author: Pu Fanyi Date: Wed Feb 28 14:49:07 2024 +0800 Pufanyi/flickr30k refractor (#56) * refactor vizwizvqa task * Delete vqav2_test and vqav2_val YAML files * Refactor vqav2_process_results functions * Add a pack for vqav2 * refactor okvqa * roll back vizwiz_vqa * Fix exact_match calculation in ok_vqa_process_results * Update OKVQA dataset name in readme * add model_specific_prompt_kwargs * add model_specific_prompt_kwargs to vizwiz_vqa * add model_specific_prompt_kwargs for vqav2 * lint * fix a small bug for eval_logger * Refactor make_table function to display points as " - " if value is None * Merge commit 'c5e52a785d3cc87a866be9b880deb477d9f73fb7' * Refactor ok_vqa_aggreate_submissions function * Merge commit 'e5aa0a9601d6d8ce727315e4b0a8f13f06f26bff' * Refactor VQA submission file saving * Update file utils * Merge commit '560deca9f72483ca091795d6dc2537d4c54b32b0' * Refactor file path handling and submission generation * OKVQA path * vizwizvqa file * pack cmmmu * fix a small metric bug for cmmmu * Add higher_is_better flag to submission metric * Add CMMMU dataset to README.md * Add logging and refactor submission file generation in docvqa utils.py * pack docvqa * add traceback to print detailed error * Refactor docvqa_test_aggregate_results to accept additional arguments * Add metric check in evaluator.py and update test.yaml and val.yaml * add common `EvalAIAnswerProcessor` for okvqa, textvqa, vizwizvqa and vqav2 * merge textvqa * textvqa * Modify submission file generation for COCO test results * Update test result storage path * update coco cap file name * Update COCO 2017 Caption dataset name * ferret * Add Ferret dataset * Refactor hb_doc_to_text function to include model-specific prompts * Add IconQA and its subtasks * Refactor image list creation in doc_to_visual function * Add 
process_results function to default template * Update process_results function in iconqa utils.py * refactor flickr30k * change aggregation function * Fix formatting issues and update logging message * Fix llava not handling text-only questions (no visuals) * Fix qwen not handling questions without images (no visuals) * Add fuyu prepare accelerator scripts * refactor mme * naming consistency * aggregation_submissions consistency * flickr30k naming consistency * remove submissions for mme * remove unused submission function * Refactor infovqa_test.yaml and infovqa_val.yaml * Refactor code for improved readability and maintainability * stvqa * rename sqa * Update lmms_eval textcaps files and utils.py * Update default prompt for text captions * Refactor textcaps_aggregation_result function * Add generate_submission_file function and update mathvista_aggregate_results signature * Update nocaps_test.yaml and nocaps_val.yaml * refactor internal_eval * Add internal evaluation datasets * pack multidocvqa * mmvet * Fix gpt eval timeout issue for hallubench, restore load from gpt to avoid re evaluating * Refactor llava wild * Refactor llava-bench-coco * Add JSON file generation for gpt evaluation details * mmmu * Remove MMBench English and Chinese tasks * Remove unnecessary return statement in mmbench_aggregate_test_results function * Fix distributed process group initialization * Update dataset paths and group names in mmbench test configs * Update import statements in cc_utils.py, cn_utils.py, and en_utils.py * Add torch module import * lint * Remove IconQA dataset from README.md * Add Multi-DocVQA and its submodules * Add new datasets and update task names * Refactor flickr_aggregation_result function to accept additional arguments * Add timeout kwargs in Accelerator constructor * Add encoding to be utf-8 for cmmmu * Fix llava try and catch, remove torch.distributed.init in main * Ds prepare script for llava --------- Co-authored-by: JvThunder Co-authored-by: kcz358 commit 
ba0e7f51e255540791d030468f79a126fec40eb5 Author: Li Bo Date: Tue Feb 27 22:52:07 2024 +0800 [Wandb Logger] add models, and args to wandb tables. (#55) * Refactor logging in lmms_eval package * Refactor variable names in lmms_eval package * add llava main in pyproject * Update README.md * Remove unnecessary dependencies and add specific version for llava_repr * Add dependencies for llava_repr*** * Update README.md * add some docs on models and command line commands * remove some lines * typo * Update model_guide.md * Update model_guide.md * Update README.md * Update README.md * Update README.md * Fix refcocog dataset path * Record gpt response in eval info * Resolve conflict * Fix hallusionbench gpt json saving path * Rename hallubench gpt output path * Change remove image to check by type instead of check by names * More robust check by type * Remove unnecessary img in data * Forcing an empty commit. * Testing * Delete unnecessary things * Fix seedbench2 image issue in doc_to_text * Add conditional exclude for internal eval * Fix small bugs in list_with_num * Revise list_with_num model args * Fix logging utils bug on wandb grouping --------- Co-authored-by: Bo Li Co-authored-by: Fanyi Pu Co-authored-by: jzhang38 * [Fix] wandb group logging missing columns (#61) * Refactor logging in lmms_eval package * Refactor variable names in lmms_eval package * Update README.md with new features and installation instructions * Update supported models and datasets * Delete otter.py file * Fix capitalization in README.md * Update image sizes and add new features * Refactor README.md to improve readability and add new features * Add description for lmms-eval in README.md * Update accelerator support in README.md * Update lmms-eval README with improved description and additional features * Update README.md with improved task grouping description * change `Otter-AI/MME` to `lmms-lab/MME` * Update README.md * Update README.md * Remove unused code in mme.yaml * Squashed commit of the 
following: commit 2a45079a974c55aeb8e04d367759e3cf383072c1 Author: Zhang Peiyuan Date: Thu Feb 29 13:40:02 2024 +0800 Dev/py add models (#57) * add instructblip * minicpm_v * remove from qwen-vl * speed up postprocessing * Optimize build context speed --------- Co-authored-by: Pu Fanyi Co-authored-by: kcz358 commit 7bdab7a743092cdca66b049e15332d8c21680147 Author: Pu Fanyi Date: Wed Feb 28 14:49:07 2024 +0800 Pufanyi/flickr30k refractor (#56) * refactor vizwizvqa task * Delete vqav2_test and vqav2_val YAML files * Refactor vqav2_process_results functions * Add a pack for vqav2 * refactor okvqa * roll back vizwiz_vqa * Fix exact_match calculation in ok_vqa_process_results * Update OKVQA dataset name in readme * add model_specific_prompt_kwargs * add model_specific_prompt_kwargs to vizwiz_vqa * add model_specific_prompt_kwargs for vqav2 * lint * fix a small bug for eval_logger * Refactor make_table function to display points as " - " if value is None * Merge commit '90f42f0876a4914c5ac0d213b9dffbfb4797ff62' * Refactor ok_vqa_aggreate_submissions function * Merge commit '4afec3303a0a7ed27a8265565343bf2851b9e4c7' * Refactor VQA submission file saving * Update file utils * Merge commit 'c144b75f0c9145a625b2bbdef5123ed81e343a11' * Refactor file path handling and submission generation * OKVQA path * vizwizvqa file * pack cmmmu * fix a small metric bug for cmmmu * Add higher_is_better flag to submission metric * Add CMMMU dataset to README.md * Add logging and refactor submission file generation in docvqa utils.py * pack docvqa * add traceback to print detailed error * Refactor docvqa_test_aggregate_results to accept additional arguments * Add metric check in evaluator.py and update test.yaml and val.yaml * add common `EvalAIAnswerProcessor` for okvqa, textvqa, vizwizvqa and vqav2 * merge textvqa * textvqa * Modify submission file generation for COCO test results * Update test result storage path * update coco cap file name * Update COCO 2017 Caption dataset name * ferret * 
Add Ferret dataset * Refactor hb_doc_to_text function to include model-specific prompts * Add IconQA and its subtasks * Refactor image list creation in doc_to_visual function * Add process_results function to default template * Update process_results function in iconqa utils.py * refactor flickr30k * change aggregation function * Fix formatting issues and update logging message * Fix llava not handling text-only questions (no visuals) * Fix qwen not handling questions without images (no visuals) * Add fuyu prepare accelerator scripts * refactor mme * naming consistency * aggregation_submissions consistency * flickr30k naming consistency * remove submissions for mme * remove unused submission function * Refactor infovqa_test.yaml and infovqa_val.yaml * Refactor code for improved readability and maintainability * stvqa * rename sqa * Update lmms_eval textcaps files and utils.py * Update default prompt for text captions * Refactor textcaps_aggregation_result function * Add generate_submission_file function and update mathvista_aggregate_results signature * Update nocaps_test.yaml and nocaps_val.yaml * refactor internal_eval * Add internal evaluation datasets * pack multidocvqa * mmvet * Fix gpt eval timeout issue for hallubench, restore load from gpt to avoid re evaluating * Refactor llava wild * Refactor llava-bench-coco * Add JSON file generation for gpt evaluation details * mmmu * Remove MMBench English and Chinese tasks * Remove unnecessary return statement in mmbench_aggregate_test_results function * Fix distributed process group initialization * Update dataset paths and group names in mmbench test configs * Update import statements in cc_utils.py, cn_utils.py, and en_utils.py * Add torch module import * lint * Remove IconQA dataset from README.md * Add Multi-DocVQA and its submodules * Add new datasets and update task names * Refactor flickr_aggregation_result function to accept additional arguments * Add timeout kwargs in Accelerator constructor * Add encoding 
to be utf-8 for cmmmu * Fix llava try and catch, remove torch.distributed.init in main * Ds prepare script for llava --------- Co-authored-by: JvThunder Co-authored-by: kcz358 commit d3dfd94ec91706587bf9f57dfebaad2cc3b5281c Author: Li Bo Date: Tue Feb 27 22:52:07 2024 +0800 [Wandb Logger] add models, and args to wandb tables. (#55) * Refactor logging in lmms_eval package * Refactor variable names in lmms_eval package * add llava main in pyproject * Update README.md * Remove unnecessary dependencies and add specific version for llava_repr * Add dependencies for llava_repr*** * Update README.md * add some docs on models and command line commands * remove some lines * typo * Update model_guide.md * Update model_guide.md * Update README.md * Update README.md * Update README.md * Fix refcocog dataset path * Record gpt response in eval info * Resolve conflict * Fix hallusionbench gpt json saving path * Rename hallubench gpt output path * Change remove image to check by type instead of check by names * More robust check by type * Remove unnecessary img in data * Forcing an empty commit. 
* Testing * Delete unnecessary things * Fix seedbench2 image issue in doc_to_text * Add conditional exclude for internal eval * Fix small bugs in list_with_num * Revise list_with_num model args * Fix logging utils bug on wandb grouping --------- Co-authored-by: Bo Li Co-authored-by: Fanyi Pu Co-authored-by: jzhang38 * [Repr] Provide reproduce environment and descriptions for llava-1.5 (#62) * Refactor logging in lmms_eval package * Refactor variable names in lmms_eval package * Update README.md with new features and installation instructions * Update supported models and datasets * Delete otter.py file * Fix capitalization in README.md * Update image sizes and add new features * Refactor README.md to improve readability and add new features * Add description for lmms-eval in README.md * Update accelerator support in README.md * Update lmms-eval README with improved description and additional features * Update README.md with improved task grouping description * change `Otter-AI/MME` to `lmms-lab/MME` * Update README.md * Update README.md * Remove unused code in mme.yaml * Squashed commit of the following: commit 6b20902d94ef9120181cd26cdce1e139046dbdf4 Author: Zhang Peiyuan Date: Thu Feb 29 13:40:02 2024 +0800 Dev/py add models (#57) * add instructblip * minicpm_v * remove from qwen-vl * speed up postprocessing * Optimize build context speed --------- Co-authored-by: Pu Fanyi Co-authored-by: kcz358 commit 21050baf08c4bc87c77d98dea55b6eecfefe1c0a Author: Pu Fanyi Date: Wed Feb 28 14:49:07 2024 +0800 Pufanyi/flickr30k refractor (#56) * refactor vizwizvqa task * Delete vqav2_test and vqav2_val YAML files * Refactor vqav2_process_results functions * Add a pack for vqav2 * refactor okvqa * roll back vizwiz_vqa * Fix exact_match calculation in ok_vqa_process_results * Update OKVQA dataset name in readme * add model_specific_prompt_kwargs * add model_specific_prompt_kwargs to vizwiz_vqa * add model_specific_prompt_kwargs for vqav2 * lint * fix a small bug for eval_logger * 
Refactor make_table function to display points as " - " if value is None * Merge commit 'c5e52a785d3cc87a866be9b880deb477d9f73fb7' * Refactor ok_vqa_aggreate_submissions function * Merge commit 'e5aa0a9601d6d8ce727315e4b0a8f13f06f26bff' * Refactor VQA submission file saving * Update file utils * Merge commit '560deca9f72483ca091795d6dc2537d4c54b32b0' * Refactor file path handling and submission generation * OKVQA path * vizwizvqa file * pack cmmmu * fix a small metric bug for cmmmu * Add higher_is_better flag to submission metric * Add CMMMU dataset to README.md * Add logging and refactor submission file generation in docvqa utils.py * pack docvqa * add traceback to print detailed error * Refactor docvqa_test_aggregate_results to accept additional arguments * Add metric check in evaluator.py and update test.yaml and val.yaml * add common `EvalAIAnswerProcessor` for okvqa, textvqa, vizwizvqa and vqav2 * merge textvqa * textvqa * Modify submission file generation for COCO test results * Update test result storage path * update coco cap file name * Update COCO 2017 Caption dataset name * ferret * Add Ferret dataset * Refactor hb_doc_to_text function to include model-specific prompts * Add IconQA and its subtasks * Refactor image list creation in doc_to_visual function * Add process_results function to default template * Update process_results function in iconqa utils.py * refactor flickr30k * change aggregation function * Fix formatting issues and update logging message * Fix llava can not handle only text question (no visuals) * Fix qwen can not handle no image question (no visuals) * Add fuyu prepare accelerator scripts * refactor mme * naming consistency * aggregation_submissions consistency * flickr30k naming consistency * remove submissions for mme * remove unused submission function * Refactor infovqa_test.yaml and infovqa_val.yaml * Refactor code for improved readability and maintainability * stvqa * remane sqa * Update lmms_eval textcaps files and utils.py * 
Update default prompt for text captions * Refactor textcaps_aggregation_result function * Add generate_submission_file function and update mathvista_aggregate_results signature * Update nocaps_test.yaml and nocaps_val.yaml * refractor internal_eval * Add internal evaluation datasets * pack multidocvqa * mmvet * Fix gpt eval timeout issue for hallubench, restore load from gpt to avoid re evaluating * Refractor llava wild * Refractor llava-bench-coco * Add JSON file generation for gpt evaluation details * mmmu * Remove MMBench English and Chinese tasks * Remove unnecessary return statement in mmbench_aggregate_test_results function * Fix distributed process group initialization * Update dataset paths and group names in mmbench test configs * Update import statements in cc_utils.py, cn_utils.py, and en_utils.py * Add torch module import * lint * Remove IconQA dataset from README.md * Add Multi-DocVQA and its submodules * Add new datasets and update task names * Refactor flickr_aggregation_result function to accept additional arguments * Add timeout kwargs in Accelerator constructor * Add encoding to be utf-8 for cmmmu * Fix llava try and catch, remove torch.distributed.init in main * Ds prepare script for llava --------- Co-authored-by: JvThunder Co-authored-by: kcz358 commit ba0e7f51e255540791d030468f79a126fec40eb5 Author: Li Bo Date: Tue Feb 27 22:52:07 2024 +0800 [Wandb Logger] add models, and args to wandb tables. 
(#55) * Refactor logging in lmms_eval package * Refactor variable names in lmms_eval package * add llava main in pyproject * Update README.md * Remove unnecessary dependencies and add specific version for llava_repr * Add dependencies for llava_repr*** * Update README.md * add some docs on models and command line commands * remove some lines * typo * Update model_guide.md * Update model_guide.md * Update README.md * Update README.md * Update README.md * Fix refcocog dataset path * Record gpt response in eval info * Resolve conflict * Fix hallusionbench gpt json saving path * Rename hallubench gpt output path * Change remove image to check by type instead of check by names * More robust check by type * Add timeout to API requests * Remove unnecessary img in data * Forcing an empty commit. * Testing * Delete unnecessary things * Fix error logging in get_chat_response function * Fix seedbench2 image issue in doc_to_text * Add conditional exclude for internal eval * Squashed commit of the following: commit faf9cf65cf5b1e036ee3a74428e8bb1490e8b2eb Author: kcz358 Date: Sat Mar 2 03:49:36 2024 +0000 Add conditional exclude for internal eval commit e3729eb925b718a44b6eb225ef9b41c7fd2408e0 Merge: a3cae8e ffb9eb2 Author: kcz358 Date: Sat Mar 2 03:24:29 2024 +0000 Merge branch 'dev/readme' into kc/final_fix commit 50b697a7ae93b0547484e1cd753722c1d2513349 Author: kcz358 Date: Sat Mar 2 02:47:31 2024 +0000 Fix seedbench2 image issue in doc_to_text commit 17425b5dce41cf67b96c5875139b57d6c7a423df Author: kcz358 Date: Fri Mar 1 15:32:49 2024 +0000 Delete unnecessary things commit 1bc17d54e79e79d11419ba89e7d8e55bc8cfa21b Author: kcz358 Date: Fri Mar 1 15:31:42 2024 +0000 Testing commit a20bbc30ab576d3e2a587c70af1b7c06575bcd8b Author: kcz358 Date: Fri Mar 1 15:29:30 2024 +0000 Forcing an empty commit. 
commit e2b657694b888ef59b9f896415e7c4c82497e7bf Merge: 786f2b5 1700786 Author: kcz358 Date: Fri Mar 1 15:24:56 2024 +0000 Merge branch 'kc/final_fix' into dev/readme commit 6447d521842b9f83f5119cdcd7714c8f6053ca73 Author: kcz358 Date: Fri Mar 1 15:24:20 2024 +0000 Remove unnecessary img in data commit 8ac333a2e9ebbe6318d536b6589f767f71fbc092 Merge: 4240785 888c1c1 Author: kcz358 Date: Fri Mar 1 13:41:24 2024 +0000 Merge branch 'kc/final_fix' into dev/readme commit 9e542ce049f68f49a237be165e3ad9cde7408ac0 Author: kcz358 Date: Fri Mar 1 13:40:51 2024 +0000 More robust check by type commit f90ccf7b94b130e118b4eca321f68b81e7ab5850 Author: kcz358 Date: Fri Mar 1 13:00:57 2024 +0000 Change remove image to check by type instead of check by names commit f651a77707a4c723ebffb07f2a87743bf42ecea7 Author: kcz358 Date: Fri Mar 1 12:33:02 2024 +0000 Rename hallubench gpt output path commit a683559c704806b7abde5e4c8355f556f3e65866 Author: kcz358 Date: Fri Mar 1 09:32:52 2024 +0000 Fix hallusionbench gpt json saving path commit 8e246e2466f3dd14a5e34f720269d7991a6dcf6b Author: kcz358 Date: Fri Mar 1 08:51:13 2024 +0000 Resolve conflict commit 67f00dc4652d09c662e5202ff7e5fbf7bebcdaf6 Merge: 9cf86fa 93534dc Author: kcz358 Date: Fri Mar 1 08:37:21 2024 +0000 Merge branch 'kc/final_fix' into dev/readme commit 53b7a845fe8412a652905101ec036c84e77a20c2 Author: kcz358 Date: Fri Mar 1 07:55:03 2024 +0000 Record gpt response in eval info commit 920b4112c4508e9a8afe824678958f2e78189e4e Author: kcz358 Date: Fri Mar 1 07:49:01 2024 +0000 Fix refcocog dataset path commit 6b20902d94ef9120181cd26cdce1e139046dbdf4 Author: Zhang Peiyuan Date: Thu Feb 29 13:40:02 2024 +0800 Dev/py add models (#57) * add instructblip * minicpm_v * remove from qwen-vl * speed up postprocessing * Optimize build context speed --------- Co-authored-by: Pu Fanyi Co-authored-by: kcz358 commit 21050baf08c4bc87c77d98dea55b6eecfefe1c0a Author: Pu Fanyi Date: Wed Feb 28 14:49:07 2024 +0800 Pufanyi/flickr30k refractor (#56) * 
refactor vizwizvqa task * Delete vqav2_test and vqav2_val YAML files * Refactor vqav2_process_results functions * Add a pack for vqav2 * refactor okvqa * roll back vizwiz_vqa * Fix exact_match calculation in ok_vqa_process_results * Update OKVQA dataset name in readme * add model_specific_prompt_kwargs * add model_specific_prompt_kwargs to vizwiz_vqa * add model_specific_prompt_kwargs for vqav2 * lint * fix a small bug for eval_logger * Refactor make_table function to display points as " - " if value is None * Merge commit 'c5e52a785d3cc87a866be9b880deb477d9f73fb7' * Refactor ok_vqa_aggreate_submissions function * Merge commit 'e5aa0a9601d6d8ce727315e4b0a8f13f06f26bff' * Refactor VQA submission file saving * Update file utils * Merge commit '560deca9f72483ca091795d6dc2537d4c54b32b0' * Refactor file path handling and submission generation * OKVQA path * vizwizvqa file * pack cmmmu * fix a small metric bug for cmmmu * Add higher_is_better flag to submission metric * Add CMMMU dataset to README.md * Add logging and refactor submission file generation in docvqa utils.py * pack docvqa * add traceback to print detailed error * Refactor docvqa_test_aggregate_results to accept additional arguments * Add metric check in evaluator.py and update test.yaml and val.yaml * add common `EvalAIAnswerProcessor` for okvqa, textvqa, vizwizvqa and vqav2 * merge textvqa * textvqa * Modify submission file generation for COCO test results * Update test result storage path * update coco cap file name * Update COCO 2017 Caption dataset name * ferret * Add Ferret dataset * Refactor hb_doc_to_text function to include model-specific prompts * Add IconQA and its subtasks * Refactor image list creation in doc_to_visual function * Add process_results function to default template * Update process_results function in iconqa utils.py * refactor flickr30k * change aggregation function * Fix formatting issues and update logging message * Fix llava can not handle only text question (no visuals) * Fix 
qwen can not handle no image question (no visuals) * Add fuyu prepare accelerator scripts * refactor mme * naming consistency * aggregation_submissions consistency * flickr30k naming consistency * remove submissions for mme * remove unused submission function * Refactor infovqa_test.yaml and infovqa_val.yaml * Refactor code for improved readability and maintainability * stvqa * remane sqa * Update lmms_eval textcaps files and utils.py * Update default prompt for text captions * Refactor textcaps_aggregation_result function * Add generate_submission_file function and update mathvista_aggregate_results signature * Update nocaps_test.yaml and nocaps_val.yaml * refractor internal_eval * Add internal evaluation datasets * pack multidocvqa * mmvet * Fix gpt eval timeout issue for hallubench, restore load from gpt to avoid re evaluating * Refractor llava wild * Refractor llava-bench-coco * Add JSON file generation for gpt evaluation details * mmmu * Remove MMBench English and Chinese tasks * Remove unnecessary return statement in mmbench_aggregate_test_results function * Fix distributed process group initialization * Update dataset paths and group names in mmbench test configs * Update import statements in cc_utils.py, cn_utils.py, and en_utils.py * Add torch module import * lint * Remove IconQA dataset from README.md * Add Multi-DocVQA and its submodules * Add new datasets and update task names * Refactor flickr_aggregation_result function to accept additional arguments * Add timeout kwargs in Accelerator constructor * Add encoding to be utf-8 for cmmmu * Fix llava try and catch, remove torch.distributed.init in main * Ds prepare script for llava --------- Co-authored-by: JvThunder Co-authored-by: kcz358 commit ba0e7f51e255540791d030468f79a126fec40eb5 Author: Li Bo Date: Tue Feb 27 22:52:07 2024 +0800 [Wandb Logger] add models, and args to wandb tables. 
(#55) * Refactor logging in lmms_eval package * Refactor variable names in lmms_eval package * Fix small bugs in list_with_num * Revise list_with_num model args * Dev/readme rm rolling (#60) * remove log_likelyhood_rolling * Update time efficiency benchmark in README.md * add task guide --------- Co-authored-by: jzhang38 Co-authored-by: kcz358 <92624596+kcz358@users.noreply.github.com> * Remove unnecessary code and update dependencies * Fix logging utils bug on wandb grouping * Add reproduce envs * Squashed commit of the following: commit 74fff73053b88a90d0f4229a9c748256080fea08 Merge: 2475639 2152f18 Author: kcz358 Date: Sun Mar 3 22:12:12 2024 +0800 Merge branch 'main' into kc/final_fix commit 0c640a636e3882859a17e30a5c3504850a3d02d6 Author: kcz358 Date: Sun Mar 3 22:11:04 2024 +0800 Add reproduce envs commit 7f2b2c38277fcb033c48414d827e96a64e1cac8d Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Mar 3 21:19:15 2024 +0800 [Fix] wandb group logging missing columns (#61) * Refactor logging in lmms_eval package * Refactor variable names in lmms_eval package * Update README.md with new features and installation instructions * Update supported models and datasets * Delete otter.py file * Fix capitalization in README.md * Update image sizes and add new features * Refactor README.md to improve readability and add new features * Add description for lmms-eval in README.md * Update accelerator support in README.md * Update lmms-eval README with improved description and additional features * Update README.md with improved task grouping description * change `Otter-AI/MME` to `lmms-lab/MME` * Update README.md * Update README.md * Remove unused code in mme.yaml * Squashed commit of the following: commit 6b20902d94ef9120181cd26cdce1e139046dbdf4 Author: Zhang Peiyuan Date: Thu Feb 29 13:40:02 2024 +0800 Dev/py add models (#57) * add instructblip * minicpm_v * remove from qwen-vl * speed up postprocessing * Optimize build context speed --------- 
Co-authored-by: Pu Fanyi Co-authored-by: kcz358 commit 21050baf08c4bc87c77d98dea55b6eecfefe1c0a Author: Pu Fanyi Date: Wed Feb 28 14:49:07 2024 +0800 Pufanyi/flickr30k refractor (#56) * refactor vizwizvqa task * Delete vqav2_test and vqav2_val YAML files * Refactor vqav2_process_results functions * Add a pack for vqav2 * refactor okvqa * roll back vizwiz_vqa * Fix exact_match calculation in ok_vqa_process_results * Update OKVQA dataset name in readme * add model_specific_prompt_kwargs * add model_specific_prompt_kwargs to vizwiz_vqa * add model_specific_prompt_kwargs for vqav2 * lint * fix a small bug for eval_logger * Refactor make_table function to display points as " - " if value is None * Merge commit 'c5e52a785d3cc87a866be9b880deb477d9f73fb7' * Refactor ok_vqa_aggreate_submissions function * Merge commit 'e5aa0a9601d6d8ce727315e4b0a8f13f06f26bff' * Refactor VQA submission file saving * Update file utils * Merge commit '560deca9f72483ca091795d6dc2537d4c54b32b0' * Refactor file path handling and submission generation * OKVQA path * vizwizvqa file * pack cmmmu * fix a small metric bug for cmmmu * Add higher_is_better flag to submission metric * Add CMMMU dataset to README.md * Add logging and refactor submission file generation in docvqa utils.py * pack docvqa * add traceback to print detailed error * Refactor docvqa_test_aggregate_results to accept additional arguments * Add metric check in evaluator.py and update test.yaml and val.yaml * add common `EvalAIAnswerProcessor` for okvqa, textvqa, vizwizvqa and vqav2 * merge textvqa * textvqa * Modify submission file generation for COCO test results * Update test result storage path * update coco cap file name * Update COCO 2017 Caption dataset name * ferret * Add Ferret dataset * Refactor hb_doc_to_text function to include model-specific prompts * Add IconQA and its subtasks * Refactor image list creation in doc_to_visual function * Add process_results function to default template * Update process_results function 
in iconqa utils.py * refactor flickr30k * change aggregation function * Fix formatting issues and update logging message * Fix llava can not handle only text question (no visuals) * Fix qwen can not handle no image question (no visuals) * Add fuyu prepare accelerator scripts * refactor mme * naming consistency * aggregation_submissions consistency * flickr30k naming consistency * remove submissions for mme * remove unused submission function * Refactor infovqa_test.yaml and infovqa_val.yaml * Refactor code for improved readability and maintainability * stvqa * remane sqa * Update lmms_eval textcaps files and utils.py * Update default prompt for text captions * Refactor textcaps_aggregation_result function * Add generate_submission_file function and update mathvista_aggregate_results signature * Update nocaps_test.yaml and nocaps_val.yaml * refractor internal_eval * Add internal evaluation datasets * pack multidocvqa * mmvet * Fix gpt eval timeout issue for hallubench, restore load from gpt to avoid re evaluating * Refractor llava wild * Refractor llava-bench-coco * Add JSON file generation for gpt evaluation details * mmmu * Remove MMBench English and Chinese tasks * Remove unnecessary return statement in mmbench_aggregate_test_results function * Fix distributed process group initialization * Update dataset paths and group names in mmbench test configs * Update import statements in cc_utils.py, cn_utils.py, and en_utils.py * Add torch module import * lint * Remove IconQA dataset from README.md * Add Multi-DocVQA and its submodules * Add new datasets and update task names * Refactor flickr_aggregation_result function to accept additional arguments * Add timeout kwargs in Accelerator constructor * Add encoding to be utf-8 for cmmmu * Fix llava try and catch, remove torch.distributed.init in main * Ds prepare script for llava --------- Co-authored-by: JvThunder Co-authored-by: kcz358 commit ba0e7f51e255540791d030468f79a126fec40eb5 Author: Li Bo Date: Tue Feb 27 
22:52:07 2024 +0800 [Wandb Logger] add models, and args to wandb tables. (#55) * Refactor logging in lmms_eval package * Refactor variable names in lmms_eval package * add llava main in pyproject * Update README.md * Remove unnecessary dependencies and add specific version for llava_repr * Add dependencies for llava_repr*** * Update README.md * add some docs on models and command line commands * remove some lines * typo * Update model_guide.md * Update model_guide.md * Update README.md * Update README.md * Update README.md * Fix refcocog dataset path * Record gpt response in eval info * Resolve conflict * Fix hallusionbench gpt json saving path * Rename hallubench gpt output path * Change remove image to check by type instead of check by names * More robust check by type * Remove unnecessary img in data * Forcing an empty commit. * Testing * Delete unnecessary things * Fix seedbench2 image issue in doc_to_text * Add conditional exclude for internal eval * Fix small bugs in list_with_num * Revise list_with_num model args * Fix logging utils bug on wandb grouping --------- Co-authored-by: Bo Li Co-authored-by: Fanyi Pu Co-authored-by: jzhang38 commit bebff9fad2a60bc0ac52ddc430e5d9e4e0ef6c24 Merge: 83358a4 fd7773d Author: kcz358 Date: Sun Mar 3 07:25:48 2024 +0000 Merge branch 'main' into kc/final_fix commit 5042bb0c2ed4f830dda6bcd14231b1f8763aa95f Author: kcz358 Date: Sun Mar 3 07:23:19 2024 +0000 Fix logging utils bug on wandb grouping commit c82042b46e4c922efd5cb3fa441220333d521f6f Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Mar 3 13:01:11 2024 +0800 [Fix] refcocog dataset path, record gpt prompt in internal eval, build context issue (#59) * Refactor logging in lmms_eval package * Refactor variable names in lmms_eval package * Update README.md with new features and installation instructions * Update supported models and datasets * Delete otter.py file * Fix capitalization in README.md * Update image sizes and add new features * Refactor 
README.md to improve readability and add new features * Add description for lmms-eval in README.md * Update accelerator support in README.md * Update lmms-eval README with improved description and additional features * Update README.md with improved task grouping description * change `Otter-AI/MME` to `lmms-lab/MME` * Update README.md * Update README.md * Remove unused code in mme.yaml * Squashed commit of the following: commit 6b20902d94ef9120181cd26cdce1e139046dbdf4 Author: Zhang Peiyuan Date: Thu Feb 29 13:40:02 2024 +0800 Dev/py add models (#57) * add instructblip * minicpm_v * remove from qwen-vl * speed up postprocessing * Optimize build context speed --------- Co-authored-by: Pu Fanyi Co-authored-by: kcz358 commit 21050baf08c4bc87c77d98dea55b6eecfefe1c0a Author: Pu Fanyi Date: Wed Feb 28 14:49:07 2024 +0800 Pufanyi/flickr30k refractor (#56) * refactor vizwizvqa task * Delete vqav2_test and vqav2_val YAML files * Refactor vqav2_process_results functions * Add a pack for vqav2 * refactor okvqa * roll back vizwiz_vqa * Fix exact_match calculation in ok_vqa_process_results * Update OKVQA dataset name in readme * add model_specific_prompt_kwargs * add model_specific_prompt_kwargs to vizwiz_vqa * add model_specific_prompt_kwargs for vqav2 * lint * fix a small bug for eval_logger * Refactor make_table function to display points as " - " if value is None * Merge commit 'c5e52a785d3cc87a866be9b880deb477d9f73fb7' * Refactor ok_vqa_aggreate_submissions function * Merge commit 'e5aa0a9601d6d8ce727315e4b0a8f13f06f26bff' * Refactor VQA submission file saving * Update file utils * Merge commit '560deca9f72483ca091795d6dc2537d4c54b32b0' * Refactor file path handling and submission generation * OKVQA path * vizwizvqa file * pack cmmmu * fix a small metric bug for cmmmu * Add higher_is_better flag to submission metric * Add CMMMU dataset to README.md * Add logging and refactor submission file generation in docvqa utils.py * pack docvqa * add traceback to print detailed error 
* Refactor docvqa_test_aggregate_results to accept additional arguments * Add metric check in evaluator.py and update test.yaml and val.yaml * add common `EvalAIAnswerProcessor` for okvqa, textvqa, vizwizvqa and vqav2 * merge textvqa * textvqa * Modify submission file generation for COCO test results * Update test result storage path * update coco cap file name * Update COCO 2017 Caption dataset name * ferret * Add Ferret dataset * Refactor hb_doc_to_text function to include model-specific prompts * Add IconQA and its subtasks * Refactor image list creation in doc_to_visual function * Add process_results function to default template * Update process_results function in iconqa utils.py * refactor flickr30k * change aggregation function * Fix formatting issues and update logging message * Fix llava can not handle only text question (no visuals) * Fix qwen can not handle no image question (no visuals) * Add fuyu prepare accelerator scripts * refactor mme * naming consistency * aggregation_submissions consistency * flickr30k naming consistency * remove submissions for mme * remove unused submission function * Refactor infovqa_test.yaml and infovqa_val.yaml * Refactor code for improved readability and maintainability * stvqa * remane sqa * Update lmms_eval textcaps files and utils.py * Update default prompt for text captions * Refactor textcaps_aggregation_result function * Add generate_submission_file function and update mathvista_aggregate_results signature * Update nocaps_test.yaml and nocaps_val.yaml * refractor internal_eval * Add internal evaluation datasets * pack multidocvqa * mmvet * Fix gpt eval timeout issue for hallubench, restore load from gpt to avoid re evaluating * Refractor llava wild * Refractor llava-bench-coco * Add JSON file generation for gpt evaluation details * mmmu * Remove MMBench English and Chinese tasks * Remove unnecessary return statement in mmbench_aggregate_test_results function * Fix distributed process group initialization * Update 
dataset paths and group names in mmbench test configs * Update import statements in cc_utils.py, cn_utils.py, and en_utils.py * Add torch module import * lint * Remove IconQA dataset from README.md * Add Multi-DocVQA and its submodules * Add new datasets and update task names * Refactor flickr_aggregation_result function to accept additional arguments * Add timeout kwargs in Accelerator constructor * Add encoding to be utf-8 for cmmmu * Fix llava try and catch, remove torch.distributed.init in main * Ds prepare script for llava --------- Co-authored-by: JvThunder Co-authored-by: kcz358 commit ba0e7f51e255540791d030468f79a126fec40eb5 Author: Li Bo Date: Tue Feb 27 22:52:07 2024 +0800 [Wandb Logger] add models, and args to wandb tables. (#55) * Refactor logging in lmms_eval package * Refactor variable names in lmms_eval package * add llava main in pyproject * Update README.md * Remove unnecessary dependencies and add specific version for llava_repr * Add dependencies for llava_repr*** * Update README.md * add some docs on models and command line commands * remove some lines * typo * Update model_guide.md * Update model_guide.md * Update README.md * Update README.md * Update README.md * Fix refcocog dataset path * Record gpt response in eval info * Resolve conflict * Fix hallusionbench gpt json saving path * Rename hallubench gpt output path * Change remove image to check by type instead of check by names * More robust check by type * Remove unnecessary img in data * Forcing an empty commit. 
* Testing * Delete unnecessary things * Fix seedbench2 image issue in doc_to_text * Add conditional exclude for internal eval * Fix small bugs in list_with_num * Revise list_with_num model args --------- Co-authored-by: Bo Li Co-authored-by: Fanyi Pu Co-authored-by: jzhang38 commit d78a3d7a53f5285a7eac39ce8f04e9854fdb3e73 Author: kcz358 Date: Sat Mar 2 05:58:08 2024 +0000 Revise list_with_num model args commit 8eefaec8489d48613de9395eb8e8150224985e01 Author: kcz358 Date: Sat Mar 2 05:09:15 2024 +0000 Fix small bugs in list_with_num commit faf9cf65cf5b1e036ee3a74428e8bb1490e8b2eb Author: kcz358 Date: Sat Mar 2 03:49:36 2024 +0000 Add conditional exclude for internal eval commit e3729eb925b718a44b6eb225ef9b41c7fd2408e0 Merge: a3cae8e ffb9eb2 Author: kcz358 Date: Sat Mar 2 03:24:29 2024 +0000 Merge branch 'dev/readme' into kc/final_fix commit 50b697a7ae93b0547484e1cd753722c1d2513349 Author: kcz358 Date: Sat Mar 2 02:47:31 2024 +0000 Fix seedbench2 image issue in doc_to_text commit 17425b5dce41cf67b96c5875139b57d6c7a423df Author: kcz358 Date: Fri Mar 1 15:32:49 2024 +0000 Delete unnecessary things commit 1bc17d54e79e79d11419ba89e7d8e55bc8cfa21b Author: kcz358 Date: Fri Mar 1 15:31:42 2024 +0000 Testing commit a20bbc30ab576d3e2a587c70af1b7c06575bcd8b Author: kcz358 Date: Fri Mar 1 15:29:30 2024 +0000 Forcing an empty commit. 
commit e2b657694b888ef59b9f896415e7c4c82497e7bf Merge: 786f2b5 1700786 Author: kcz358 Date: Fri Mar 1 15:24:56 2024 +0000 Merge branch 'kc/final_fix' into dev/readme commit 6447d521842b9f83f5119cdcd7714c8f6053ca73 Author: kcz358 Date: Fri Mar 1 15:24:20 2024 +0000 Remove unnecessary img in data commit 8ac333a2e9ebbe6318d536b6589f767f71fbc092 Merge: 4240785 888c1c1 Author: kcz358 Date: Fri Mar 1 13:41:24 2024 +0000 Merge branch 'kc/final_fix' into dev/readme commit 9e542ce049f68f49a237be165e3ad9cde7408ac0 Author: kcz358 Date: Fri Mar 1 13:40:51 2024 +0000 More robust check by type commit f90ccf7b94b130e118b4eca321f68b81e7ab5850 Author: kcz358 Date: Fri Mar 1 13:00:57 2024 +0000 Change remove image to check by type instead of check by names commit f651a77707a4c723ebffb07f2a87743bf42ecea7 Author: kcz358 Date: Fri Mar 1 12:33:02 2024 +0000 Rename hallubench gpt output path commit a683559c704806b7abde5e4c8355f556f3e65866 Author: kcz358 Date: Fri Mar 1 09:32:52 2024 +0000 Fix hallusionbench gpt json saving path commit 8e246e2466f3dd14a5e34f720269d7991a6dcf6b Author: kcz358 Date: Fri Mar 1 08:51:13 2024 +0000 Resolve conflict commit 67f00dc4652d09c662e5202ff7e5fbf7bebcdaf6 Merge: 9cf86fa 93534dc Author: kcz358 Date: Fri Mar 1 08:37:21 2024 +0000 Merge branch 'kc/final_fix' into dev/readme commit 53b7a845fe8412a652905101ec036c84e77a20c2 Author: kcz358 Date: Fri Mar 1 07:55:03 2024 +0000 Record gpt response in eval info commit 920b4112c4508e9a8afe824678958f2e78189e4e Author: kcz358 Date: Fri Mar 1 07:49:01 2024 +0000 Fix refcocog dataset path commit 6b20902d94ef9120181cd26cdce1e139046dbdf4 Author: Zhang Peiyuan Date: Thu Feb 29 13:40:02 2024 +0800 Dev/py add models (#57) * add instructblip * minicpm_v * remove from qwen-vl * speed up postprocessing * Optimize build context speed --------- Co-authored-by: Pu Fanyi Co-authored-by: kcz358 commit 21050baf08c4bc87c77d98dea55b6eecfefe1c0a Author: Pu Fanyi Date: Wed Feb 28 14:49:07 2024 +0800 Pufanyi/flickr30k refractor (#56) * 
refactor vizwizvqa task * Delete vqav2_test and vqav2_val YAML files * Refactor vqav2_process_results functions * Add a pack for vqav2 * refactor okvqa * roll back vizwiz_vqa * Fix exact_match calculation in ok_vqa_process_results * Update OKVQA dataset name in readme * add model_specific_prompt_kwargs * add model_specific_prompt_kwargs to vizwiz_vqa * add model_specific_prompt_kwargs for vqav2 * lint * fix a small bug for eval_logger * Refactor make_table function to display points as " - " if value is None * Merge commit 'c5e52a785d3cc87a866be9b880deb477d9f73fb7' * Refactor ok_vqa_aggreate_submissions function * Merge commit 'e5aa0a9601d6d8ce727315e4b0a8f13f06f26bff' * Refactor VQA submission file saving * Update file utils * Merge commit '560deca9f72483ca091795d6dc2537d4c54b32b0' * Refactor file path handling and submission generation * OKVQA path * vizwizvqa file * pack cmmmu * fix a small metric bug for cmmmu * Add higher_is_better flag to submission metric * Add CMMMU dataset to README.md * Add logging and refactor submission file generation in docvqa utils.py * pack docvqa * add traceback to print detailed error * Refactor docvqa_test_aggregate_results to accept additional arguments * Add metric check in evaluator.py and update test.yaml and val.yaml * add common `EvalAIAnswerProcessor` for okvqa, textvqa, vizwizvqa and vqav2 * merge textvqa * textvqa * Modify submission file generation for COCO test results * Update test result storage path * update coco cap file name * Update COCO 2017 Caption dataset name * ferret * Add Ferret dataset * Refactor hb_doc_to_text function to include model-specific prompts * Add IconQA and its subtasks * Refactor image list creation in doc_to_visual function * Add process_results function to default template * Update process_results function in iconqa utils.py * refactor flickr30k * change aggregation function * Fix formatting issues and update logging message * Fix llava can not handle only text question (no visuals) * Fix 
qwen can not handle no image question (no visuals) * Add fuyu prepare accelerator scripts * refactor mme * naming consistency * aggregation_submissions consistency * flickr30k naming consistency * remove submissions for mme * remove unused submission function * Refactor infovqa_test.yaml and infovqa_val.yaml * Refactor code for improved readability and maintainability * stvqa * remane sqa * Update lmms_eval textcaps files and utils.py * Update default prompt for text captions * Refactor textcaps_aggregation_result function * Add generate_submission_file function and update mathvista_aggregate_results signature * Update nocaps_test.yaml and nocaps_val.yaml * refractor internal_eval * Add internal evaluation datasets * pack multidocvqa * mmvet * Fix gpt eval timeout issue for hallubench, restore load from gpt to avoid re evaluating * Refractor llava wild * Refractor llava-bench-coco * Add JSON file generation for gpt evaluation details * mmmu * Remove MMBench English and Chinese tasks * Remove unnecessary return statement in mmbench_aggregate_test_results function * Fix distributed process group initialization * Update dataset paths and group names in mmbench test configs * Update import statements in cc_utils.py, cn_utils.py, and en_utils.py * Add torch module import * lint * Remove IconQA dataset from README.md * Add Multi-DocVQA and its submodules * Add new datasets and update task names * Refactor flickr_aggregation_result function to accept additional arguments * Add timeout kwargs in Accelerator constructor * Add encoding to be utf-8 for cmmmu * Fix llava try and catch, remove torch.distributed.init in main * Ds prepare script for llava --------- Co-authored-by: JvThunder Co-authored-by: kcz358 commit ba0e7f51e255540791d030468f79a126fec40eb5 Author: Li Bo Date: Tue Feb 27 22:52:07 2024 +0800 [Wandb Logger] add models, and args to wandb tables. 
(#55) * Refactor logging in lmms_eval package * Refactor variable names in lmms_eval package * Update commands.md * Add repr_scripts for reference * Add timeout for gpt4V * Remove unnecessary dependencies * Add reproduce into readme * Revise seedbench process_result * Fix exclude dc hardcode postprocess logic error * Fix metric repeat issue * Update dataset runtime and add environment info * Revise val submission file saving path * Put the correct query into the gpt extraction * Update sleep time in utils.py * update --------- Co-authored-by: Bo Li Co-authored-by: Fanyi Pu Co-authored-by: jzhang38 * [Repr] Provide reproduce environment and descriptions for llava-1.5 (#62) * Refactor logging in lmms_eval package * Refactor variable names in lmms_eval package * Update README.md with new features and installation instructions * Update supported models and datasets * Delete otter.py file * Fix capitalization in README.md * Update image sizes and add new features * Refactor README.md to improve readability and add new features * Add description for lmms-eval in README.md * Update accelerator support in README.md * Update lmms-eval README with improved description and additional features * Update README.md with improved task grouping description * change `Otter-AI/MME` to `lmms-lab/MME` * Update README.md * Update README.md * Remove unused code in mme.yaml * Squashed commit of the following: commit 2a45079a974c55aeb8e04d367759e3cf383072c1 Author: Zhang Peiyuan Date: Thu Feb 29 13:40:02 2024 +0800 Dev/py add models (#57) * add instructblip * minicpm_v * remove from qwen-vl * speed up postprocessing * Optimize build context speed --------- Co-authored-by: Pu Fanyi Co-authored-by: kcz358 commit 7bdab7a743092cdca66b049e15332d8c21680147 Author: Pu Fanyi Date: Wed Feb 28 14:49:07 2024 +0800 Pufanyi/flickr30k refractor (#56) * refactor vizwizvqa task * Delete vqav2_test and vqav2_val YAML files * Refactor vqav2_process_results functions * Add a pack for vqav2 * refactor okvqa * 
roll back vizwiz_vqa * Fix exact_match calculation in ok_vqa_process_results * Update OKVQA dataset name in readme * add model_specific_prompt_kwargs * add model_specific_prompt_kwargs to vizwiz_vqa * add model_specific_prompt_kwargs for vqav2 * lint * fix a small bug for eval_logger * Refactor make_table function to display points as " - " if value is None * Merge commit '90f42f0876a4914c5ac0d213b9dffbfb4797ff62' * Refactor ok_vqa_aggreate_submissions function * Merge commit '4afec3303a0a7ed27a8265565343bf2851b9e4c7' * Refactor VQA submission file saving * Update file utils * Merge commit 'c144b75f0c9145a625b2bbdef5123ed81e343a11' * Refactor file path handling and submission generation * OKVQA path * vizwizvqa file * pack cmmmu * fix a small metric bug for cmmmu * Add higher_is_better flag to submission metric * Add CMMMU dataset to README.md * Add logging and refactor submission file generation in docvqa utils.py * pack docvqa * add traceback to print detailed error * Refactor docvqa_test_aggregate_results to accept additional arguments * Add metric check in evaluator.py and update test.yaml and val.yaml * add common `EvalAIAnswerProcessor` for okvqa, textvqa, vizwizvqa and vqav2 * merge textvqa * textvqa * Modify submission file generation for COCO test results * Update test result storage path * update coco cap file name * Update COCO 2017 Caption dataset name * ferret * Add Ferret dataset * Refactor hb_doc_to_text function to include model-specific prompts * Add IconQA and its subtasks * Refactor image list creation in doc_to_visual function * Add process_results function to default template * Update process_results function in iconqa utils.py * refactor flickr30k * change aggregation function * Fix formatting issues and update logging message * Fix llava can not handle only text question (no visuals) * Fix qwen can not handle no image question (no visuals) * Add fuyu prepare accelerator scripts * refactor mme * naming consistency * aggregation_submissions 
consistency * flickr30k naming consistency * remove submissions for mme * remove unused submission function * Refactor infovqa_test.yaml and infovqa_val.yaml * Refactor code for improved readability and maintainability * stvqa * remane sqa * Update lmms_eval textcaps files and utils.py * Update default prompt for text captions * Refactor textcaps_aggregation_result function * Add generate_submission_file function and update mathvista_aggregate_results signature * Update nocaps_test.yaml and nocaps_val.yaml * refractor internal_eval * Add internal evaluation datasets * pack multidocvqa * mmvet * Fix gpt eval timeout issue for hallubench, restore load from gpt to avoid re evaluating * Refractor llava wild * Refractor llava-bench-coco * Add JSON file generation for gpt evaluation details * mmmu * Remove MMBench English and Chinese tasks * Remove unnecessary return statement in mmbench_aggregate_test_results function * Fix distributed process group initialization * Update dataset paths and group names in mmbench test configs * Update import statements in cc_utils.py, cn_utils.py, and en_utils.py * Add torch module import * lint * Remove IconQA dataset from README.md * Add Multi-DocVQA and its submodules * Add new datasets and update task names * Refactor flickr_aggregation_result function to accept additional arguments * Add timeout kwargs in Accelerator constructor * Add encoding to be utf-8 for cmmmu * Fix llava try and catch, remove torch.distributed.init in main * Ds prepare script for llava --------- Co-authored-by: JvThunder Co-authored-by: kcz358 commit d3dfd94ec91706587bf9f57dfebaad2cc3b5281c Author: Li Bo Date: Tue Feb 27 22:52:07 2024 +0800 [Wandb Logger] add models, and args to wandb tables. 
(#55) * Refactor logging in lmms_eval package * Refactor variable names in lmms_eval package * add llava main in pyproject * Update README.md * Remove unnecessary dependencies and add specific version for llava_repr * Add dependencies for llava_repr*** * Update README.md * add some docs on models and command line commands * remove some lines * typo * Update model_guide.md * Update model_guide.md * Update README.md * Update README.md * Update README.md * Fix refcocog dataset path * Record gpt response in eval info * Resolve conflict * Fix hallusionbench gpt json saving path * Rename hallubench gpt output path * Change remove image to check by type instead of check by names * More robust check by type * Add timeout to API requests * Remove unnecessary img in data * Forcing an empty commit. * Testing * Delete unnecessary things * Fix error logging in get_chat_response function * Fix seedbench2 image issue in doc_to_text * Add conditional exclude for internal eval * Squashed commit of the following: commit 2fbeafc882c80242a10381abc67629d5d8b7071a Author: kcz358 Date: Sat Mar 2 03:49:36 2024 +0000 Add conditional exclude for internal eval commit f188052450bed2f3a30ab6f9a6f7eb844a64cb33 Merge: a3cae8e ffb9eb2 Author: kcz358 Date: Sat Mar 2 03:24:29 2024 +0000 Merge branch 'dev/readme' into kc/final_fix commit baef5905505892593fe783beb18a2de20991d6af Author: kcz358 Date: Sat Mar 2 02:47:31 2024 +0000 Fix seedbench2 image issue in doc_to_text commit 11b46f3b701b79b361dd5175a263e4d89bd07fb5 Author: kcz358 Date: Fri Mar 1 15:32:49 2024 +0000 Delete unnecessary things commit 0982de2e7a2310429e51ec7828886fd49953f716 Author: kcz358 Date: Fri Mar 1 15:31:42 2024 +0000 Testing commit f840ed80f4ae467fff62b61844854a3a9e8ec8a5 Author: kcz358 Date: Fri Mar 1 15:29:30 2024 +0000 Forcing an empty commit. 
commit 80db78f600d07011188983637c94da84b9475fbf Merge: 786f2b5 1700786 Author: kcz358 Date: Fri Mar 1 15:24:56 2024 +0000 Merge branch 'kc/final_fix' into dev/readme commit 676229de870b8d465cef08867cd272a4b696e630 Author: kcz358 Date: Fri Mar 1 15:24:20 2024 +0000 Remove unnecessary img in data commit d293b96fb3537fea85f10f216d762abf35e05e8d Merge: 4240785 888c1c1 Author: kcz358 Date: Fri Mar 1 13:41:24 2024 +0000 Merge branch 'kc/final_fix' into dev/readme commit 01bbd010590d6b7f105525580209191a1d6d5232 Author: kcz358 Date: Fri Mar 1 13:40:51 2024 +0000 More robust check by type commit 66595ebc073ff9431f2400006196c0645be58ea4 Author: kcz358 Date: Fri Mar 1 13:00:57 2024 +0000 Change remove image to check by type instead of check by names commit 08c2ebad1532fd6c34ac04efb94a268db9862d4f Author: kcz358 Date: Fri Mar 1 12:33:02 2024 +0000 Rename hallubench gpt output path commit aefbd3c6856584135e2dcbe13381db0e0780f063 Author: kcz358 Date: Fri Mar 1 09:32:52 2024 +0000 Fix hallusionbench gpt json saving path commit b9aebc3ff3b122d6d4a81bd2f28e86b2c390c505 Author: kcz358 Date: Fri Mar 1 08:51:13 2024 +0000 Resolve conflict commit c9daa91f2576de69af73c80e263afb085ecd8288 Merge: 9cf86fa 93534dc Author: kcz358 Date: Fri Mar 1 08:37:21 2024 +0000 Merge branch 'kc/final_fix' into dev/readme commit b1c4c88b9b36e02e9ed738ff9217d98a5ef2117b Author: kcz358 Date: Fri Mar 1 07:55:03 2024 +0000 Record gpt response in eval info commit b35bc4a6c8fd6b4b2a68bb3054878807b8b92281 Author: kcz358 Date: Fri Mar 1 07:49:01 2024 +0000 Fix refcocog dataset path commit 2a45079a974c55aeb8e04d367759e3cf383072c1 Author: Zhang Peiyuan Date: Thu Feb 29 13:40:02 2024 +0800 Dev/py add models (#57) * add instructblip * minicpm_v * remove from qwen-vl * speed up postprocessing * Optimize build context speed --------- Co-authored-by: Pu Fanyi Co-authored-by: kcz358 commit 7bdab7a743092cdca66b049e15332d8c21680147 Author: Pu Fanyi Date: Wed Feb 28 14:49:07 2024 +0800 Pufanyi/flickr30k refractor (#56) * 
refactor vizwizvqa task * Delete vqav2_test and vqav2_val YAML files * Refactor vqav2_process_results functions * Add a pack for vqav2 * refactor okvqa * roll back vizwiz_vqa * Fix exact_match calculation in ok_vqa_process_results * Update OKVQA dataset name in readme * add model_specific_prompt_kwargs * add model_specific_prompt_kwargs to vizwiz_vqa * add model_specific_prompt_kwargs for vqav2 * lint * fix a small bug for eval_logger * Refactor make_table function to display points as " - " if value is None * Merge commit '90f42f0876a4914c5ac0d213b9dffbfb4797ff62' * Refactor ok_vqa_aggreate_submissions function * Merge commit '4afec3303a0a7ed27a8265565343bf2851b9e4c7' * Refactor VQA submission file saving * Update file utils * Merge commit 'c144b75f0c9145a625b2bbdef5123ed81e343a11' * Refactor file path handling and submission generation * OKVQA path * vizwizvqa file * pack cmmmu * fix a small metric bug for cmmmu * Add higher_is_better flag to submission metric * Add CMMMU dataset to README.md * Add logging and refactor submission file generation in docvqa utils.py * pack docvqa * add traceback to print detailed error * Refactor docvqa_test_aggregate_results to accept additional arguments * Add metric check in evaluator.py and update test.yaml and val.yaml * add common `EvalAIAnswerProcessor` for okvqa, textvqa, vizwizvqa and vqav2 * merge textvqa * textvqa * Modify submission file generation for COCO test results * Update test result storage path * update coco cap file name * Update COCO 2017 Caption dataset name * ferret * Add Ferret dataset * Refactor hb_doc_to_text function to include model-specific prompts * Add IconQA and its subtasks * Refactor image list creation in doc_to_visual function * Add process_results function to default template * Update process_results function in iconqa utils.py * refactor flickr30k * change aggregation function * Fix formatting issues and update logging message * Fix llava can not handle only text question (no visuals) * Fix 
qwen can not handle no image question (no visuals) * Add fuyu prepare accelerator scripts * refactor mme * naming consistency * aggregation_submissions consistency * flickr30k naming consistency * remove submissions for mme * remove unused submission function * Refactor infovqa_test.yaml and infovqa_val.yaml * Refactor code for improved readability and maintainability * stvqa * remane sqa * Update lmms_eval textcaps files and utils.py * Update default prompt for text captions * Refactor textcaps_aggregation_result function * Add generate_submission_file function and update mathvista_aggregate_results signature * Update nocaps_test.yaml and nocaps_val.yaml * refractor internal_eval * Add internal evaluation datasets * pack multidocvqa * mmvet * Fix gpt eval timeout issue for hallubench, restore load from gpt to avoid re evaluating * Refractor llava wild * Refractor llava-bench-coco * Add JSON file generation for gpt evaluation details * mmmu * Remove MMBench English and Chinese tasks * Remove unnecessary return statement in mmbench_aggregate_test_results function * Fix distributed process group initialization * Update dataset paths and group names in mmbench test configs * Update import statements in cc_utils.py, cn_utils.py, and en_utils.py * Add torch module import * lint * Remove IconQA dataset from README.md * Add Multi-DocVQA and its submodules * Add new datasets and update task names * Refactor flickr_aggregation_result function to accept additional arguments * Add timeout kwargs in Accelerator constructor * Add encoding to be utf-8 for cmmmu * Fix llava try and catch, remove torch.distributed.init in main * Ds prepare script for llava --------- Co-authored-by: JvThunder Co-authored-by: kcz358 commit d3dfd94ec91706587bf9f57dfebaad2cc3b5281c Author: Li Bo Date: Tue Feb 27 22:52:07 2024 +0800 [Wandb Logger] add models, and args to wandb tables. 
(#55) * Refactor logging in lmms_eval package * Refactor variable names in lmms_eval package * Fix small bugs in list_with_num * Revise list_with_num model args * Dev/readme rm rolling (#60) * remove log_likelyhood_rolling * Update time efficiency benchmark in README.md * add task guide --------- Co-authored-by: jzhang38 Co-authored-by: kcz358 <92624596+kcz358@users.noreply.github.com> * Remove unnecessary code and update dependencies * Fix logging utils bug on wandb grouping * Add reproduce envs * Squashed commit of the following: commit 556b12620379d79c9ed5ddba0856063b498f917c Merge: 2475639 2152f18 Author: kcz358 Date: Sun Mar 3 22:12:12 2024 +0800 Merge branch 'main' into kc/final_fix commit 9509a782c9e9824273cefb1dc9671c92b887697d Author: kcz358 Date: Sun Mar 3 22:11:04 2024 +0800 Add reproduce envs commit 0bff98b798c71a361639385c985eccc54c46dfb6 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Mar 3 21:19:15 2024 +0800 [Fix] wandb group logging missing columns (#61) * Refactor logging in lmms_eval package * Refactor variable names in lmms_eval package * Update README.md with new features and installation instructions * Update supported models and datasets * Delete otter.py file * Fix capitalization in README.md * Update image sizes and add new features * Refactor README.md to improve readability and add new features * Add description for lmms-eval in README.md * Update accelerator support in README.md * Update lmms-eval README with improved description and additional features * Update README.md with improved task grouping description * change `Otter-AI/MME` to `lmms-lab/MME` * Update README.md * Update README.md * Remove unused code in mme.yaml * Squashed commit of the following: commit 2a45079a974c55aeb8e04d367759e3cf383072c1 Author: Zhang Peiyuan Date: Thu Feb 29 13:40:02 2024 +0800 Dev/py add models (#57) * add instructblip * minicpm_v * remove from qwen-vl * speed up postprocessing * Optimize build context speed --------- 
Co-authored-by: Pu Fanyi Co-authored-by: kcz358 commit 7bdab7a743092cdca66b049e15332d8c21680147 Author: Pu Fanyi Date: Wed Feb 28 14:49:07 2024 +0800 Pufanyi/flickr30k refractor (#56) * refactor vizwizvqa task * Delete vqav2_test and vqav2_val YAML files * Refactor vqav2_process_results functions * Add a pack for vqav2 * refactor okvqa * roll back vizwiz_vqa * Fix exact_match calculation in ok_vqa_process_results * Update OKVQA dataset name in readme * add model_specific_prompt_kwargs * add model_specific_prompt_kwargs to vizwiz_vqa * add model_specific_prompt_kwargs for vqav2 * lint * fix a small bug for eval_logger * Refactor make_table function to display points as " - " if value is None * Merge commit '90f42f0876a4914c5ac0d213b9dffbfb4797ff62' * Refactor ok_vqa_aggreate_submissions function * Merge commit '4afec3303a0a7ed27a8265565343bf2851b9e4c7' * Refactor VQA submission file saving * Update file utils * Merge commit 'c144b75f0c9145a625b2bbdef5123ed81e343a11' * Refactor file path handling and submission generation * OKVQA path * vizwizvqa file * pack cmmmu * fix a small metric bug for cmmmu * Add higher_is_better flag to submission metric * Add CMMMU dataset to README.md * Add logging and refactor submission file generation in docvqa utils.py * pack docvqa * add traceback to print detailed error * Refactor docvqa_test_aggregate_results to accept additional arguments * Add metric check in evaluator.py and update test.yaml and val.yaml * add common `EvalAIAnswerProcessor` for okvqa, textvqa, vizwizvqa and vqav2 * merge textvqa * textvqa * Modify submission file generation for COCO test results * Update test result storage path * update coco cap file name * Update COCO 2017 Caption dataset name * ferret * Add Ferret dataset * Refactor hb_doc_to_text function to include model-specific prompts * Add IconQA and its subtasks * Refactor image list creation in doc_to_visual function * Add process_results function to default template * Update process_results function 
in iconqa utils.py * refactor flickr30k * change aggregation function * Fix formatting issues and update logging message * Fix llava can not handle only text question (no visuals) * Fix qwen can not handle no image question (no visuals) * Add fuyu prepare accelerator scripts * refactor mme * naming consistency * aggregation_submissions consistency * flickr30k naming consistency * remove submissions for mme * remove unused submission function * Refactor infovqa_test.yaml and infovqa_val.yaml * Refactor code for improved readability and maintainability * stvqa * remane sqa * Update lmms_eval textcaps files and utils.py * Update default prompt for text captions * Refactor textcaps_aggregation_result function * Add generate_submission_file function and update mathvista_aggregate_results signature * Update nocaps_test.yaml and nocaps_val.yaml * refractor internal_eval * Add internal evaluation datasets * pack multidocvqa * mmvet * Fix gpt eval timeout issue for hallubench, restore load from gpt to avoid re evaluating * Refractor llava wild * Refractor llava-bench-coco * Add JSON file generation for gpt evaluation details * mmmu * Remove MMBench English and Chinese tasks * Remove unnecessary return statement in mmbench_aggregate_test_results function * Fix distributed process group initialization * Update dataset paths and group names in mmbench test configs * Update import statements in cc_utils.py, cn_utils.py, and en_utils.py * Add torch module import * lint * Remove IconQA dataset from README.md * Add Multi-DocVQA and its submodules * Add new datasets and update task names * Refactor flickr_aggregation_result function to accept additional arguments * Add timeout kwargs in Accelerator constructor * Add encoding to be utf-8 for cmmmu * Fix llava try and catch, remove torch.distributed.init in main * Ds prepare script for llava --------- Co-authored-by: JvThunder Co-authored-by: kcz358 commit d3dfd94ec91706587bf9f57dfebaad2cc3b5281c Author: Li Bo Date: Tue Feb 27 
22:52:07 2024 +0800 [Wandb Logger] add models, and args to wandb tables. (#55) * Refactor logging in lmms_eval package * Refactor variable names in lmms_eval package * add llava main in pyproject * Update README.md * Remove unnecessary dependencies and add specific version for llava_repr * Add dependencies for llava_repr*** * Update README.md * add some docs on models and command line commands * remove some lines * typo * Update model_guide.md * Update model_guide.md * Update README.md * Update README.md * Update README.md * Fix refcocog dataset path * Record gpt response in eval info * Resolve conflict * Fix hallusionbench gpt json saving path * Rename hallubench gpt output path * Change remove image to check by type instead of check by names * More robust check by type * Remove unnecessary img in data * Forcing an empty commit. * Testing * Delete unnecessary things * Fix seedbench2 image issue in doc_to_text * Add conditional exclude for internal eval * Fix small bugs in list_with_num * Revise list_with_num model args * Fix logging utils bug on wandb grouping --------- Co-authored-by: Bo Li Co-authored-by: Fanyi Pu Co-authored-by: jzhang38 commit 7c4501a32bbb415ba7e62e93194b37ba9a435cf5 Merge: 83358a4 fd7773d Author: kcz358 Date: Sun Mar 3 07:25:48 2024 +0000 Merge branch 'main' into kc/final_fix commit 5c419f9fa23616a63a0bd584f18e509bb7704b50 Author: kcz358 Date: Sun Mar 3 07:23:19 2024 +0000 Fix logging utils bug on wandb grouping commit 0010d0ac308051e977981b83a51d0654488501d9 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Mar 3 13:01:11 2024 +0800 [Fix] refcocog dataset path, record gpt prompt in internal eval, build context issue (#59) * Refactor logging in lmms_eval package * Refactor variable names in lmms_eval package * Update README.md with new features and installation instructions * Update supported models and datasets * Delete otter.py file * Fix capitalization in README.md * Update image sizes and add new features * Refactor 
README.md to improve readability and add new features * Add description for lmms-eval in README.md * Update accelerator support in README.md * Update lmms-eval README with improved description and additional features * Update README.md with improved task grouping description * change `Otter-AI/MME` to `lmms-lab/MME` * Update README.md * Update README.md * Remove unused code in mme.yaml * Squashed commit of the following: commit 2a45079a974c55aeb8e04d367759e3cf383072c1 Author: Zhang Peiyuan Date: Thu Feb 29 13:40:02 2024 +0800 Dev/py add models (#57) * add instructblip * minicpm_v * remove from qwen-vl * speed up postprocessing * Optimize build context speed --------- Co-authored-by: Pu Fanyi Co-authored-by: kcz358 commit 7bdab7a743092cdca66b049e15332d8c21680147 Author: Pu Fanyi Date: Wed Feb 28 14:49:07 2024 +0800 Pufanyi/flickr30k refractor (#56) * refactor vizwizvqa task * Delete vqav2_test and vqav2_val YAML files * Refactor vqav2_process_results functions * Add a pack for vqav2 * refactor okvqa * roll back vizwiz_vqa * Fix exact_match calculation in ok_vqa_process_results * Update OKVQA dataset name in readme * add model_specific_prompt_kwargs * add model_specific_prompt_kwargs to vizwiz_vqa * add model_specific_prompt_kwargs for vqav2 * lint * fix a small bug for eval_logger * Refactor make_table function to display points as " - " if value is None * Merge commit '90f42f0876a4914c5ac0d213b9dffbfb4797ff62' * Refactor ok_vqa_aggreate_submissions function * Merge commit '4afec3303a0a7ed27a8265565343bf2851b9e4c7' * Refactor VQA submission file saving * Update file utils * Merge commit 'c144b75f0c9145a625b2bbdef5123ed81e343a11' * Refactor file path handling and submission generation * OKVQA path * vizwizvqa file * pack cmmmu * fix a small metric bug for cmmmu * Add higher_is_better flag to submission metric * Add CMMMU dataset to README.md * Add logging and refactor submission file generation in docvqa utils.py * pack docvqa * add traceback to print detailed error 
* Refactor docvqa_test_aggregate_results to accept additional arguments * Add metric check in evaluator.py and update test.yaml and val.yaml * add common `EvalAIAnswerProcessor` for okvqa, textvqa, vizwizvqa and vqav2 * merge textvqa * textvqa * Modify submission file generation for COCO test results * Update test result storage path * update coco cap file name * Update COCO 2017 Caption dataset name * ferret * Add Ferret dataset * Refactor hb_doc_to_text function to include model-specific prompts * Add IconQA and its subtasks * Refactor image list creation in doc_to_visual function * Add process_results function to default template * Update process_results function in iconqa utils.py * refactor flickr30k * change aggregation function * Fix formatting issues and update logging message * Fix llava can not handle only text question (no visuals) * Fix qwen can not handle no image question (no visuals) * Add fuyu prepare accelerator scripts * refactor mme * naming consistency * aggregation_submissions consistency * flickr30k naming consistency * remove submissions for mme * remove unused submission function * Refactor infovqa_test.yaml and infovqa_val.yaml * Refactor code for improved readability and maintainability * stvqa * remane sqa * Update lmms_eval textcaps files and utils.py * Update default prompt for text captions * Refactor textcaps_aggregation_result function * Add generate_submission_file function and update mathvista_aggregate_results signature * Update nocaps_test.yaml and nocaps_val.yaml * refractor internal_eval * Add internal evaluation datasets * pack multidocvqa * mmvet * Fix gpt eval timeout issue for hallubench, restore load from gpt to avoid re evaluating * Refractor llava wild * Refractor llava-bench-coco * Add JSON file generation for gpt evaluation details * mmmu * Remove MMBench English and Chinese tasks * Remove unnecessary return statement in mmbench_aggregate_test_results function * Fix distributed process group initialization * Update 
dataset paths and group names in mmbench test configs * Update import statements in cc_utils.py, cn_utils.py, and en_utils.py * Add torch module import * lint * Remove IconQA dataset from README.md * Add Multi-DocVQA and its submodules * Add new datasets and update task names * Refactor flickr_aggregation_result function to accept additional arguments * Add timeout kwargs in Accelerator constructor * Add encoding to be utf-8 for cmmmu * Fix llava try and catch, remove torch.distributed.init in main * Ds prepare script for llava --------- Co-authored-by: JvThunder Co-authored-by: kcz358 commit d3dfd94ec91706587bf9f57dfebaad2cc3b5281c Author: Li Bo Date: Tue Feb 27 22:52:07 2024 +0800 [Wandb Logger] add models, and args to wandb tables. (#55) * Refactor logging in lmms_eval package * Refactor variable names in lmms_eval package * add llava main in pyproject * Update README.md * Remove unnecessary dependencies and add specific version for llava_repr * Add dependencies for llava_repr*** * Update README.md * add some docs on models and command line commands * remove some lines * typo * Update model_guide.md * Update model_guide.md * Update README.md * Update README.md * Update README.md * Fix refcocog dataset path * Record gpt response in eval info * Resolve conflict * Fix hallusionbench gpt json saving path * Rename hallubench gpt output path * Change remove image to check by type instead of check by names * More robust check by type * Remove unnecessary img in data * Forcing an empty commit. 
* Testing * Delete unnecessary things * Fix seedbench2 image issue in doc_to_text * Add conditional exclude for internal eval * Fix small bugs in list_with_num * Revise list_with_num model args --------- Co-authored-by: Bo Li Co-authored-by: Fanyi Pu Co-authored-by: jzhang38 commit b2ca65d1f12d84ae7a37ecc81f760901389a1af0 Author: kcz358 Date: Sat Mar 2 05:58:08 2024 +0000 Revise list_with_num model args commit a262ea1720b2c02839d21dad2a7618bc80725f18 Author: kcz358 Date: Sat Mar 2 05:09:15 2024 +0000 Fix small bugs in list_with_num commit 2fbeafc882c80242a10381abc67629d5d8b7071a Author: kcz358 Date: Sat Mar 2 03:49:36 2024 +0000 Add conditional exclude for internal eval commit f188052450bed2f3a30ab6f9a6f7eb844a64cb33 Merge: a3cae8e ffb9eb2 Author: kcz358 Date: Sat Mar 2 03:24:29 2024 +0000 Merge branch 'dev/readme' into kc/final_fix commit baef5905505892593fe783beb18a2de20991d6af Author: kcz358 Date: Sat Mar 2 02:47:31 2024 +0000 Fix seedbench2 image issue in doc_to_text commit 11b46f3b701b79b361dd5175a263e4d89bd07fb5 Author: kcz358 Date: Fri Mar 1 15:32:49 2024 +0000 Delete unnecessary things commit 0982de2e7a2310429e51ec7828886fd49953f716 Author: kcz358 Date: Fri Mar 1 15:31:42 2024 +0000 Testing commit f840ed80f4ae467fff62b61844854a3a9e8ec8a5 Author: kcz358 Date: Fri Mar 1 15:29:30 2024 +0000 Forcing an empty commit. 
commit 80db78f600d07011188983637c94da84b9475fbf Merge: 786f2b5 1700786 Author: kcz358 Date: Fri Mar 1 15:24:56 2024 +0000 Merge branch 'kc/final_fix' into dev/readme commit 676229de870b8d465cef08867cd272a4b696e630 Author: kcz358 Date: Fri Mar 1 15:24:20 2024 +0000 Remove unnecessary img in data commit d293b96fb3537fea85f10f216d762abf35e05e8d Merge: 4240785 888c1c1 Author: kcz358 Date: Fri Mar 1 13:41:24 2024 +0000 Merge branch 'kc/final_fix' into dev/readme commit 01bbd010590d6b7f105525580209191a1d6d5232 Author: kcz358 Date: Fri Mar 1 13:40:51 2024 +0000 More robust check by type commit 66595ebc073ff9431f2400006196c0645be58ea4 Author: kcz358 Date: Fri Mar 1 13:00:57 2024 +0000 Change remove image to check by type instead of check by names commit 08c2ebad1532fd6c34ac04efb94a268db9862d4f Author: kcz358 Date: Fri Mar 1 12:33:02 2024 +0000 Rename hallubench gpt output path commit aefbd3c6856584135e2dcbe13381db0e0780f063 Author: kcz358 Date: Fri Mar 1 09:32:52 2024 +0000 Fix hallusionbench gpt json saving path commit b9aebc3ff3b122d6d4a81bd2f28e86b2c390c505 Author: kcz358 Date: Fri Mar 1 08:51:13 2024 +0000 Resolve conflict commit c9daa91f2576de69af73c80e263afb085ecd8288 Merge: 9cf86fa 93534dc Author: kcz358 Date: Fri Mar 1 08:37:21 2024 +0000 Merge branch 'kc/final_fix' into dev/readme commit b1c4c88b9b36e02e9ed738ff9217d98a5ef2117b Author: kcz358 Date: Fri Mar 1 07:55:03 2024 +0000 Record gpt response in eval info commit b35bc4a6c8fd6b4b2a68bb3054878807b8b92281 Author: kcz358 Date: Fri Mar 1 07:49:01 2024 +0000 Fix refcocog dataset path commit 2a45079a974c55aeb8e04d367759e3cf383072c1 Author: Zhang Peiyuan Date: Thu Feb 29 13:40:02 2024 +0800 Dev/py add models (#57) * add instructblip * minicpm_v * remove from qwen-vl * speed up postprocessing * Optimize build context speed --------- Co-authored-by: Pu Fanyi Co-authored-by: kcz358 commit 7bdab7a743092cdca66b049e15332d8c21680147 Author: Pu Fanyi Date: Wed Feb 28 14:49:07 2024 +0800 Pufanyi/flickr30k refractor (#56) * 
refactor vizwizvqa task * Delete vqav2_test and vqav2_val YAML files * Refactor vqav2_process_results functions * Add a pack for vqav2 * refactor okvqa * roll back vizwiz_vqa * Fix exact_match calculation in ok_vqa_process_results * Update OKVQA dataset name in readme * add model_specific_prompt_kwargs * add model_specific_prompt_kwargs to vizwiz_vqa * add model_specific_prompt_kwargs for vqav2 * lint * fix a small bug for eval_logger * Refactor make_table function to display points as " - " if value is None * Merge commit '90f42f0876a4914c5ac0d213b9dffbfb4797ff62' * Refactor ok_vqa_aggreate_submissions function * Merge commit '4afec3303a0a7ed27a8265565343bf2851b9e4c7' * Refactor VQA submission file saving * Update file utils * Merge commit 'c144b75f0c9145a625b2bbdef5123ed81e343a11' * Refactor file path handling and submission generation * OKVQA path * vizwizvqa file * pack cmmmu * fix a small metric bug for cmmmu * Add higher_is_better flag to submission metric * Add CMMMU dataset to README.md * Add logging and refactor submission file generation in docvqa utils.py * pack docvqa * add traceback to print detailed error * Refactor docvqa_test_aggregate_results to accept additional arguments * Add metric check in evaluator.py and update test.yaml and val.yaml * add common `EvalAIAnswerProcessor` for okvqa, textvqa, vizwizvqa and vqav2 * merge textvqa * textvqa * Modify submission file generation for COCO test results * Update test result storage path * update coco cap file name * Update COCO 2017 Caption dataset name * ferret * Add Ferret dataset * Refactor hb_doc_to_text function to include model-specific prompts * Add IconQA and its subtasks * Refactor image list creation in doc_to_visual function * Add process_results function to default template * Update process_results function in iconqa utils.py * refactor flickr30k * change aggregation function * Fix formatting issues and update logging message * Fix llava can not handle only text question (no visuals) * Fix 
qwen can not handle no image question (no visuals) * Add fuyu prepare accelerator scripts * refactor mme * naming consistency * aggregation_submissions consistency * flickr30k naming consistency * remove submissions for mme * remove unused submission function * Refactor infovqa_test.yaml and infovqa_val.yaml * Refactor code for improved readability and maintainability * stvqa * remane sqa * Update lmms_eval textcaps files and utils.py * Update default prompt for text captions * Refactor textcaps_aggregation_result function * Add generate_submission_file function and update mathvista_aggregate_results signature * Update nocaps_test.yaml and nocaps_val.yaml * refractor internal_eval * Add internal evaluation datasets * pack multidocvqa * mmvet * Fix gpt eval timeout issue for hallubench, restore load from gpt to avoid re evaluating * Refractor llava wild * Refractor llava-bench-coco * Add JSON file generation for gpt evaluation details * mmmu * Remove MMBench English and Chinese tasks * Remove unnecessary return statement in mmbench_aggregate_test_results function * Fix distributed process group initialization * Update dataset paths and group names in mmbench test configs * Update import statements in cc_utils.py, cn_utils.py, and en_utils.py * Add torch module import * lint * Remove IconQA dataset from README.md * Add Multi-DocVQA and its submodules * Add new datasets and update task names * Refactor flickr_aggregation_result function to accept additional arguments * Add timeout kwargs in Accelerator constructor * Add encoding to be utf-8 for cmmmu * Fix llava try and catch, remove torch.distributed.init in main * Ds prepare script for llava --------- Co-authored-by: JvThunder Co-authored-by: kcz358 commit d3dfd94ec91706587bf9f57dfebaad2cc3b5281c Author: Li Bo Date: Tue Feb 27 22:52:07 2024 +0800 [Wandb Logger] add models, and args to wandb tables. 
(#55) * Refactor logging in lmms_eval package * Refactor variable names in lmms_eval package * Update commands.md * Add repr_scripts for reference * Add timeout for gpt4V * Remove unnecessary dependencies * Add reproduce into readme * Revise seedbench process_result * Fix exclude dc hardcode postprocess logic error * Fix metric repeat issue * Update dataset runtime and add environment info * Revise val submission file saving path * Put the correct query into the gpt extraction * Update sleep time in utils.py * update --------- Co-authored-by: Bo Li Co-authored-by: Fanyi Pu Co-authored-by: jzhang38 * [README] near public (#63) * Refactor logging in lmms_eval package * Refactor variable names in lmms_eval package * Update README.md with new features and installation instructions * Update supported models and datasets * Delete otter.py file * Fix capitalization in README.md * Update image sizes and add new features * Refactor README.md to improve readability and add new features * Add description for lmms-eval in README.md * Update accelerator support in README.md * Update lmms-eval README with improved description and additional features * Update README.md with improved task grouping description * change `Otter-AI/MME` to `lmms-lab/MME` * Update README.md * Update README.md * Remove unused code in mme.yaml * Squashed commit of the following: commit 6b20902d94ef9120181cd26cdce1e139046dbdf4 Author: Zhang Peiyuan Date: Thu Feb 29 13:40:02 2024 +0800 Dev/py add models (#57) * add instructblip * minicpm_v * remove from qwen-vl * speed up postprocessing * Optimize build context speed --------- Co-authored-by: Pu Fanyi Co-authored-by: kcz358 commit 21050baf08c4bc87c77d98dea55b6eecfefe1c0a Author: Pu Fanyi Date: Wed Feb 28 14:49:07 2024 +0800 Pufanyi/flickr30k refractor (#56) * refactor vizwizvqa task * Delete vqav2_test and vqav2_val YAML files * Refactor vqav2_process_results functions * Add a pack for vqav2 * refactor okvqa * roll back vizwiz_vqa * Fix exact_match 
calculation in ok_vqa_process_results * Update OKVQA dataset name in readme * add model_specific_prompt_kwargs * add model_specific_prompt_kwargs to vizwiz_vqa * add model_specific_prompt_kwargs for vqav2 * lint * fix a small bug for eval_logger * Refactor make_table function to display points as " - " if value is None * Merge commit 'c5e52a785d3cc87a866be9b880deb477d9f73fb7' * Refactor ok_vqa_aggreate_submissions function * Merge commit 'e5aa0a9601d6d8ce727315e4b0a8f13f06f26bff' * Refactor VQA submission file saving * Update file utils * Merge commit '560deca9f72483ca091795d6dc2537d4c54b32b0' * Refactor file path handling and submission generation * OKVQA path * vizwizvqa file * pack cmmmu * fix a small metric bug for cmmmu * Add higher_is_better flag to submission metric * Add CMMMU dataset to README.md * Add logging and refactor submission file generation in docvqa utils.py * pack docvqa * add traceback to print detailed error * Refactor docvqa_test_aggregate_results to accept additional arguments * Add metric check in evaluator.py and update test.yaml and val.yaml * add common `EvalAIAnswerProcessor` for okvqa, textvqa, vizwizvqa and vqav2 * merge textvqa * textvqa * Modify submission file generation for COCO test results * Update test result storage path * update coco cap file name * Update COCO 2017 Caption dataset name * ferret * Add Ferret dataset * Refactor hb_doc_to_text function to include model-specific prompts * Add IconQA and its subtasks * Refactor image list creation in doc_to_visual function * Add process_results function to default template * Update process_results function in iconqa utils.py * refactor flickr30k * change aggregation function * Fix formatting issues and update logging message * Fix llava can not handle only text question (no visuals) * Fix qwen can not handle no image question (no visuals) * Add fuyu prepare accelerator scripts * refactor mme * naming consistency * aggregation_submissions consistency * flickr30k naming consistency 
* remove submissions for mme * remove unused submission function * Refactor infovqa_test.yaml and infovqa_val.yaml * Refactor code for improved readability and maintainability * stvqa * rename sqa * Update lmms_eval textcaps files and utils.py * Update default prompt for text captions * Refactor textcaps_aggregation_result function * Add generate_submission_file function and update mathvista_aggregate_results signature * Update nocaps_test.yaml and nocaps_val.yaml * refactor internal_eval * Add internal evaluation datasets * pack multidocvqa * mmvet * Fix gpt eval timeout issue for hallubench, restore load from gpt to avoid re-evaluating * Refactor llava wild * Refactor llava-bench-coco * Add JSON file generation for gpt evaluation details * mmmu * Remove MMBench English and Chinese tasks * Remove unnecessary return statement in mmbench_aggregate_test_results function * Fix distributed process group initialization * Update dataset paths and group names in mmbench test configs * Update import statements in cc_utils.py, cn_utils.py, and en_utils.py * Add torch module import * lint * Remove IconQA dataset from README.md * Add Multi-DocVQA and its submodules * Add new datasets and update task names * Refactor flickr_aggregation_result function to accept additional arguments * Add timeout kwargs in Accelerator constructor * Add encoding to be utf-8 for cmmmu * Fix llava try and catch, remove torch.distributed.init in main * Ds prepare script for llava --------- Co-authored-by: JvThunder Co-authored-by: kcz358 commit ba0e7f51e255540791d030468f79a126fec40eb5 Author: Li Bo Date: Tue Feb 27 22:52:07 2024 +0800 [Wandb Logger] add models, and args to wandb tables. 
(#55) * Refactor logging in lmms_eval package * Refactor variable names in lmms_eval package * add llava main in pyproject * Update README.md * Remove unnecessary dependencies and add specific version for llava_repr * Add dependencies for llava_repr*** * Update README.md * add some docs on models and command line commands * remove some lines * typo * Update model_guide.md * Update model_guide.md * Update README.md * Update README.md * Update README.md * Fix refcocog dataset path * Record gpt response in eval info * Resolve conflict * Fix hallusionbench gpt json saving path * Rename hallubench gpt output path * Change remove image to check by type instead of check by names * More robust check by type * Add timeout to API requests * Remove unnecessary img in data * Forcing an empty commit. * Testing * Delete unnecessary things * Fix error logging in get_chat_response function * Fix seedbench2 image issue in doc_to_text * Add conditional exclude for internal eval * Squashed commit of the following: commit faf9cf65cf5b1e036ee3a74428e8bb1490e8b2eb Author: kcz358 Date: Sat Mar 2 03:49:36 2024 +0000 Add conditional exclude for internal eval commit e3729eb925b718a44b6eb225ef9b41c7fd2408e0 Merge: a3cae8e ffb9eb2 Author: kcz358 Date: Sat Mar 2 03:24:29 2024 +0000 Merge branch 'dev/readme' into kc/final_fix commit 50b697a7ae93b0547484e1cd753722c1d2513349 Author: kcz358 Date: Sat Mar 2 02:47:31 2024 +0000 Fix seedbench2 image issue in doc_to_text commit 17425b5dce41cf67b96c5875139b57d6c7a423df Author: kcz358 Date: Fri Mar 1 15:32:49 2024 +0000 Delete unnecessary things commit 1bc17d54e79e79d11419ba89e7d8e55bc8cfa21b Author: kcz358 Date: Fri Mar 1 15:31:42 2024 +0000 Testing commit a20bbc30ab576d3e2a587c70af1b7c06575bcd8b Author: kcz358 Date: Fri Mar 1 15:29:30 2024 +0000 Forcing an empty commit. 
commit e2b657694b888ef59b9f896415e7c4c82497e7bf Merge: 786f2b5 1700786 Author: kcz358 Date: Fri Mar 1 15:24:56 2024 +0000 Merge branch 'kc/final_fix' into dev/readme commit 6447d521842b9f83f5119cdcd7714c8f6053ca73 Author: kcz358 Date: Fri Mar 1 15:24:20 2024 +0000 Remove unnecessary img in data commit 8ac333a2e9ebbe6318d536b6589f767f71fbc092 Merge: 4240785 888c1c1 Author: kcz358 Date: Fri Mar 1 13:41:24 2024 +0000 Merge branch 'kc/final_fix' into dev/readme commit 9e542ce049f68f49a237be165e3ad9cde7408ac0 Author: kcz358 Date: Fri Mar 1 13:40:51 2024 +0000 More robust check by type commit f90ccf7b94b130e118b4eca321f68b81e7ab5850 Author: kcz358 Date: Fri Mar 1 13:00:57 2024 +0000 Change remove image to check by type instead of check by names commit f651a77707a4c723ebffb07f2a87743bf42ecea7 Author: kcz358 Date: Fri Mar 1 12:33:02 2024 +0000 Rename hallubench gpt output path commit a683559c704806b7abde5e4c8355f556f3e65866 Author: kcz358 Date: Fri Mar 1 09:32:52 2024 +0000 Fix hallusionbench gpt json saving path commit 8e246e2466f3dd14a5e34f720269d7991a6dcf6b Author: kcz358 Date: Fri Mar 1 08:51:13 2024 +0000 Resolve conflict commit 67f00dc4652d09c662e5202ff7e5fbf7bebcdaf6 Merge: 9cf86fa 93534dc Author: kcz358 Date: Fri Mar 1 08:37:21 2024 +0000 Merge branch 'kc/final_fix' into dev/readme commit 53b7a845fe8412a652905101ec036c84e77a20c2 Author: kcz358 Date: Fri Mar 1 07:55:03 2024 +0000 Record gpt response in eval info commit 920b4112c4508e9a8afe824678958f2e78189e4e Author: kcz358 Date: Fri Mar 1 07:49:01 2024 +0000 Fix refcocog dataset path commit 6b20902d94ef9120181cd26cdce1e139046dbdf4 Author: Zhang Peiyuan Date: Thu Feb 29 13:40:02 2024 +0800 Dev/py add models (#57) * add instructblip * minicpm_v * remove from qwen-vl * speed up postprocessing * Optimize build context speed --------- Co-authored-by: Pu Fanyi Co-authored-by: kcz358 commit 21050baf08c4bc87c77d98dea55b6eecfefe1c0a Author: Pu Fanyi Date: Wed Feb 28 14:49:07 2024 +0800 Pufanyi/flickr30k refractor (#56) * 
refactor vizwizvqa task * Delete vqav2_test and vqav2_val YAML files * Refactor vqav2_process_results functions * Add a pack for vqav2 * refactor okvqa * roll back vizwiz_vqa * Fix exact_match calculation in ok_vqa_process_results * Update OKVQA dataset name in readme * add model_specific_prompt_kwargs * add model_specific_prompt_kwargs to vizwiz_vqa * add model_specific_prompt_kwargs for vqav2 * lint * fix a small bug for eval_logger * Refactor make_table function to display points as " - " if value is None * Merge commit 'c5e52a785d3cc87a866be9b880deb477d9f73fb7' * Refactor ok_vqa_aggreate_submissions function * Merge commit 'e5aa0a9601d6d8ce727315e4b0a8f13f06f26bff' * Refactor VQA submission file saving * Update file utils * Merge commit '560deca9f72483ca091795d6dc2537d4c54b32b0' * Refactor file path handling and submission generation * OKVQA path * vizwizvqa file * pack cmmmu * fix a small metric bug for cmmmu * Add higher_is_better flag to submission metric * Add CMMMU dataset to README.md * Add logging and refactor submission file generation in docvqa utils.py * pack docvqa * add traceback to print detailed error * Refactor docvqa_test_aggregate_results to accept additional arguments * Add metric check in evaluator.py and update test.yaml and val.yaml * add common `EvalAIAnswerProcessor` for okvqa, textvqa, vizwizvqa and vqav2 * merge textvqa * textvqa * Modify submission file generation for COCO test results * Update test result storage path * update coco cap file name * Update COCO 2017 Caption dataset name * ferret * Add Ferret dataset * Refactor hb_doc_to_text function to include model-specific prompts * Add IconQA and its subtasks * Refactor image list creation in doc_to_visual function * Add process_results function to default template * Update process_results function in iconqa utils.py * refactor flickr30k * change aggregation function * Fix formatting issues and update logging message * Fix llava can not handle only text question (no visuals) * Fix 
qwen can not handle no image question (no visuals) * Add fuyu prepare accelerator scripts * refactor mme * naming consistency * aggregation_submissions consistency * flickr30k naming consistency * remove submissions for mme * remove unused submission function * Refactor infovqa_test.yaml and infovqa_val.yaml * Refactor code for improved readability and maintainability * stvqa * remane sqa * Update lmms_eval textcaps files and utils.py * Update default prompt for text captions * Refactor textcaps_aggregation_result function * Add generate_submission_file function and update mathvista_aggregate_results signature * Update nocaps_test.yaml and nocaps_val.yaml * refractor internal_eval * Add internal evaluation datasets * pack multidocvqa * mmvet * Fix gpt eval timeout issue for hallubench, restore load from gpt to avoid re evaluating * Refractor llava wild * Refractor llava-bench-coco * Add JSON file generation for gpt evaluation details * mmmu * Remove MMBench English and Chinese tasks * Remove unnecessary return statement in mmbench_aggregate_test_results function * Fix distributed process group initialization * Update dataset paths and group names in mmbench test configs * Update import statements in cc_utils.py, cn_utils.py, and en_utils.py * Add torch module import * lint * Remove IconQA dataset from README.md * Add Multi-DocVQA and its submodules * Add new datasets and update task names * Refactor flickr_aggregation_result function to accept additional arguments * Add timeout kwargs in Accelerator constructor * Add encoding to be utf-8 for cmmmu * Fix llava try and catch, remove torch.distributed.init in main * Ds prepare script for llava --------- Co-authored-by: JvThunder Co-authored-by: kcz358 commit ba0e7f51e255540791d030468f79a126fec40eb5 Author: Li Bo Date: Tue Feb 27 22:52:07 2024 +0800 [Wandb Logger] add models, and args to wandb tables. 
(#55) * Refactor logging in lmms_eval package * Refactor variable names in lmms_eval package * Fix small bugs in list_with_num * Revise list_with_num model args * Dev/readme rm rolling (#60) * remove log_likelyhood_rolling * Update time efficiency benchmark in README.md * add task guide --------- Co-authored-by: jzhang38 Co-authored-by: kcz358 <92624596+kcz358@users.noreply.github.com> * Remove unnecessary code and update dependencies * Fix logging utils bug on wandb grouping * Add reproduce envs * Squashed commit of the following: commit 74fff73053b88a90d0f4229a9c748256080fea08 Merge: 2475639 2152f18 Author: kcz358 Date: Sun Mar 3 22:12:12 2024 +0800 Merge branch 'main' into kc/final_fix commit 0c640a636e3882859a17e30a5c3504850a3d02d6 Author: kcz358 Date: Sun Mar 3 22:11:04 2024 +0800 Add reproduce envs commit 7f2b2c38277fcb033c48414d827e96a64e1cac8d Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Mar 3 21:19:15 2024 +0800 [Fix] wandb group logging missing columns (#61) * Refactor logging in lmms_eval package * Refactor variable names in lmms_eval package * Update README.md with new features and installation instructions * Update supported models and datasets * Delete otter.py file * Fix capitalization in README.md * Update image sizes and add new features * Refactor README.md to improve readability and add new features * Add description for lmms-eval in README.md * Update accelerator support in README.md * Update lmms-eval README with improved description and additional features * Update README.md with improved task grouping description * change `Otter-AI/MME` to `lmms-lab/MME` * Update README.md * Update README.md * Remove unused code in mme.yaml * Squashed commit of the following: commit 6b20902d94ef9120181cd26cdce1e139046dbdf4 Author: Zhang Peiyuan Date: Thu Feb 29 13:40:02 2024 +0800 Dev/py add models (#57) * add instructblip * minicpm_v * remove from qwen-vl * speed up postprocessing * Optimize build context speed --------- 
Co-authored-by: Pu Fanyi Co-authored-by: kcz358 commit 21050baf08c4bc87c77d98dea55b6eecfefe1c0a Author: Pu Fanyi Date: Wed Feb 28 14:49:07 2024 +0800 Pufanyi/flickr30k refractor (#56) * refactor vizwizvqa task * Delete vqav2_test and vqav2_val YAML files * Refactor vqav2_process_results functions * Add a pack for vqav2 * refactor okvqa * roll back vizwiz_vqa * Fix exact_match calculation in ok_vqa_process_results * Update OKVQA dataset name in readme * add model_specific_prompt_kwargs * add model_specific_prompt_kwargs to vizwiz_vqa * add model_specific_prompt_kwargs for vqav2 * lint * fix a small bug for eval_logger * Refactor make_table function to display points as " - " if value is None * Merge commit 'c5e52a785d3cc87a866be9b880deb477d9f73fb7' * Refactor ok_vqa_aggreate_submissions function * Merge commit 'e5aa0a9601d6d8ce727315e4b0a8f13f06f26bff' * Refactor VQA submission file saving * Update file utils * Merge commit '560deca9f72483ca091795d6dc2537d4c54b32b0' * Refactor file path handling and submission generation * OKVQA path * vizwizvqa file * pack cmmmu * fix a small metric bug for cmmmu * Add higher_is_better flag to submission metric * Add CMMMU dataset to README.md * Add logging and refactor submission file generation in docvqa utils.py * pack docvqa * add traceback to print detailed error * Refactor docvqa_test_aggregate_results to accept additional arguments * Add metric check in evaluator.py and update test.yaml and val.yaml * add common `EvalAIAnswerProcessor` for okvqa, textvqa, vizwizvqa and vqav2 * merge textvqa * textvqa * Modify submission file generation for COCO test results * Update test result storage path * update coco cap file name * Update COCO 2017 Caption dataset name * ferret * Add Ferret dataset * Refactor hb_doc_to_text function to include model-specific prompts * Add IconQA and its subtasks * Refactor image list creation in doc_to_visual function * Add process_results function to default template * Update process_results function 
in iconqa utils.py * refactor flickr30k * change aggregation function * Fix formatting issues and update logging message * Fix llava can not handle only text question (no visuals) * Fix qwen can not handle no image question (no visuals) * Add fuyu prepare accelerator scripts * refactor mme * naming consistency * aggregation_submissions consistency * flickr30k naming consistency * remove submissions for mme * remove unused submission function * Refactor infovqa_test.yaml and infovqa_val.yaml * Refactor code for improved readability and maintainability * stvqa * remane sqa * Update lmms_eval textcaps files and utils.py * Update default prompt for text captions * Refactor textcaps_aggregation_result function * Add generate_submission_file function and update mathvista_aggregate_results signature * Update nocaps_test.yaml and nocaps_val.yaml * refractor internal_eval * Add internal evaluation datasets * pack multidocvqa * mmvet * Fix gpt eval timeout issue for hallubench, restore load from gpt to avoid re evaluating * Refractor llava wild * Refractor llava-bench-coco * Add JSON file generation for gpt evaluation details * mmmu * Remove MMBench English and Chinese tasks * Remove unnecessary return statement in mmbench_aggregate_test_results function * Fix distributed process group initialization * Update dataset paths and group names in mmbench test configs * Update import statements in cc_utils.py, cn_utils.py, and en_utils.py * Add torch module import * lint * Remove IconQA dataset from README.md * Add Multi-DocVQA and its submodules * Add new datasets and update task names * Refactor flickr_aggregation_result function to accept additional arguments * Add timeout kwargs in Accelerator constructor * Add encoding to be utf-8 for cmmmu * Fix llava try and catch, remove torch.distributed.init in main * Ds prepare script for llava --------- Co-authored-by: JvThunder Co-authored-by: kcz358 commit ba0e7f51e255540791d030468f79a126fec40eb5 Author: Li Bo Date: Tue Feb 27 
22:52:07 2024 +0800 [Wandb Logger] add models, and args to wandb tables. (#55) * Refactor logging in lmms_eval package * Refactor variable names in lmms_eval package * add llava main in pyproject * Update README.md * Remove unnecessary dependencies and add specific version for llava_repr * Add dependencies for llava_repr*** * Update README.md * add some docs on models and command line commands * remove some lines * typo * Update model_guide.md * Update model_guide.md * Update README.md * Update README.md * Update README.md * Fix refcocog dataset path * Record gpt response in eval info * Resolve conflict * Fix hallusionbench gpt json saving path * Rename hallubench gpt output path * Change remove image to check by type instead of check by names * More robust check by type * Remove unnecessary img in data * Forcing an empty commit. * Testing * Delete unnecessary things * Fix seedbench2 image issue in doc_to_text * Add conditional exclude for internal eval * Fix small bugs in list_with_num * Revise list_with_num model args * Fix logging utils bug on wandb grouping --------- Co-authored-by: Bo Li Co-authored-by: Fanyi Pu Co-authored-by: jzhang38 commit bebff9fad2a60bc0ac52ddc430e5d9e4e0ef6c24 Merge: 83358a4 fd7773d Author: kcz358 Date: Sun Mar 3 07:25:48 2024 +0000 Merge branch 'main' into kc/final_fix commit 5042bb0c2ed4f830dda6bcd14231b1f8763aa95f Author: kcz358 Date: Sun Mar 3 07:23:19 2024 +0000 Fix logging utils bug on wandb grouping commit c82042b46e4c922efd5cb3fa441220333d521f6f Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Mar 3 13:01:11 2024 +0800 [Fix] refcocog dataset path, record gpt prompt in internal eval, build context issue (#59) * Refactor logging in lmms_eval package * Refactor variable names in lmms_eval package * Update README.md with new features and installation instructions * Update supported models and datasets * Delete otter.py file * Fix capitalization in README.md * Update image sizes and add new features * Refactor 
README.md to improve readability and add new features * Add description for lmms-eval in README.md * Update accelerator support in README.md * Update lmms-eval README with improved description and additional features * Update README.md with improved task grouping description * change `Otter-AI/MME` to `lmms-lab/MME` * Update README.md * Update README.md * Remove unused code in mme.yaml * Squashed commit of the following: commit 6b20902d94ef9120181cd26cdce1e139046dbdf4 Author: Zhang Peiyuan Date: Thu Feb 29 13:40:02 2024 +0800 Dev/py add models (#57) * add instructblip * minicpm_v * remove from qwen-vl * speed up postprocessing * Optimize build context speed --------- Co-authored-by: Pu Fanyi Co-authored-by: kcz358 commit 21050baf08c4bc87c77d98dea55b6eecfefe1c0a Author: Pu Fanyi Date: Wed Feb 28 14:49:07 2024 +0800 Pufanyi/flickr30k refractor (#56) * refactor vizwizvqa task * Delete vqav2_test and vqav2_val YAML files * Refactor vqav2_process_results functions * Add a pack for vqav2 * refactor okvqa * roll back vizwiz_vqa * Fix exact_match calculation in ok_vqa_process_results * Update OKVQA dataset name in readme * add model_specific_prompt_kwargs * add model_specific_prompt_kwargs to vizwiz_vqa * add model_specific_prompt_kwargs for vqav2 * lint * fix a small bug for eval_logger * Refactor make_table function to display points as " - " if value is None * Merge commit 'c5e52a785d3cc87a866be9b880deb477d9f73fb7' * Refactor ok_vqa_aggreate_submissions function * Merge commit 'e5aa0a9601d6d8ce727315e4b0a8f13f06f26bff' * Refactor VQA submission file saving * Update file utils * Merge commit '560deca9f72483ca091795d6dc2537d4c54b32b0' * Refactor file path handling and submission generation * OKVQA path * vizwizvqa file * pack cmmmu * fix a small metric bug for cmmmu * Add higher_is_better flag to submission metric * Add CMMMU dataset to README.md * Add logging and refactor submission file generation in docvqa utils.py * pack docvqa * add traceback to print detailed error 
* Refactor docvqa_test_aggregate_results to accept additional arguments * Add metric check in evaluator.py and update test.yaml and val.yaml * add common `EvalAIAnswerProcessor` for okvqa, textvqa, vizwizvqa and vqav2 * merge textvqa * textvqa * Modify submission file generation for COCO test results * Update test result storage path * update coco cap file name * Update COCO 2017 Caption dataset name * ferret * Add Ferret dataset * Refactor hb_doc_to_text function to include model-specific prompts * Add IconQA and its subtasks * Refactor image list creation in doc_to_visual function * Add process_results function to default template * Update process_results function in iconqa utils.py * refactor flickr30k * change aggregation function * Fix formatting issues and update logging message * Fix llava can not handle only text question (no visuals) * Fix qwen can not handle no image question (no visuals) * Add fuyu prepare accelerator scripts * refactor mme * naming consistency * aggregation_submissions consistency * flickr30k naming consistency * remove submissions for mme * remove unused submission function * Refactor infovqa_test.yaml and infovqa_val.yaml * Refactor code for improved readability and maintainability * stvqa * remane sqa * Update lmms_eval textcaps files and utils.py * Update default prompt for text captions * Refactor textcaps_aggregation_result function * Add generate_submission_file function and update mathvista_aggregate_results signature * Update nocaps_test.yaml and nocaps_val.yaml * refractor internal_eval * Add internal evaluation datasets * pack multidocvqa * mmvet * Fix gpt eval timeout issue for hallubench, restore load from gpt to avoid re evaluating * Refractor llava wild * Refractor llava-bench-coco * Add JSON file generation for gpt evaluation details * mmmu * Remove MMBench English and Chinese tasks * Remove unnecessary return statement in mmbench_aggregate_test_results function * Fix distributed process group initialization * Update 
dataset paths and group names in mmbench test configs * Update import statements in cc_utils.py, cn_utils.py, and en_utils.py * Add torch module import * lint * Remove IconQA dataset from README.md * Add Multi-DocVQA and its submodules * Add new datasets and update task names * Refactor flickr_aggregation_result function to accept additional arguments * Add timeout kwargs in Accelerator constructor * Add encoding to be utf-8 for cmmmu * Fix llava try and catch, remove torch.distributed.init in main * Ds prepare script for llava --------- Co-authored-by: JvThunder Co-authored-by: kcz358 commit ba0e7f51e255540791d030468f79a126fec40eb5 Author: Li Bo Date: Tue Feb 27 22:52:07 2024 +0800 [Wandb Logger] add models, and args to wandb tables. (#55) * Refactor logging in lmms_eval package * Refactor variable names in lmms_eval package * add llava main in pyproject * Update README.md * Remove unnecessary dependencies and add specific version for llava_repr * Add dependencies for llava_repr*** * Update README.md * add some docs on models and command line commands * remove some lines * typo * Update model_guide.md * Update model_guide.md * Update README.md * Update README.md * Update README.md * Fix refcocog dataset path * Record gpt response in eval info * Resolve conflict * Fix hallusionbench gpt json saving path * Rename hallubench gpt output path * Change remove image to check by type instead of check by names * More robust check by type * Remove unnecessary img in data * Forcing an empty commit. 
* Testing * Delete unnecessary things * Fix seedbench2 image issue in doc_to_text * Add conditional exclude for internal eval * Fix small bugs in list_with_num * Revise list_with_num model args --------- Co-authored-by: Bo Li Co-authored-by: Fanyi Pu Co-authored-by: jzhang38 commit d78a3d7a53f5285a7eac39ce8f04e9854fdb3e73 Author: kcz358 Date: Sat Mar 2 05:58:08 2024 +0000 Revise list_with_num model args commit 8eefaec8489d48613de9395eb8e8150224985e01 Author: kcz358 Date: Sat Mar 2 05:09:15 2024 +0000 Fix small bugs in list_with_num commit faf9cf65cf5b1e036ee3a74428e8bb1490e8b2eb Author: kcz358 Date: Sat Mar 2 03:49:36 2024 +0000 Add conditional exclude for internal eval commit e3729eb925b718a44b6eb225ef9b41c7fd2408e0 Merge: a3cae8e ffb9eb2 Author: kcz358 Date: Sat Mar 2 03:24:29 2024 +0000 Merge branch 'dev/readme' into kc/final_fix commit 50b697a7ae93b0547484e1cd753722c1d2513349 Author: kcz358 Date: Sat Mar 2 02:47:31 2024 +0000 Fix seedbench2 image issue in doc_to_text commit 17425b5dce41cf67b96c5875139b57d6c7a423df Author: kcz358 Date: Fri Mar 1 15:32:49 2024 +0000 Delete unnecessary things commit 1bc17d54e79e79d11419ba89e7d8e55bc8cfa21b Author: kcz358 Date: Fri Mar 1 15:31:42 2024 +0000 Testing commit a20bbc30ab576d3e2a587c70af1b7c06575bcd8b Author: kcz358 Date: Fri Mar 1 15:29:30 2024 +0000 Forcing an empty commit. 
commit e2b657694b888ef59b9f896415e7c4c82497e7bf Merge: 786f2b5 1700786 Author: kcz358 Date: Fri Mar 1 15:24:56 2024 +0000 Merge branch 'kc/final_fix' into dev/readme commit 6447d521842b9f83f5119cdcd7714c8f6053ca73 Author: kcz358 Date: Fri Mar 1 15:24:20 2024 +0000 Remove unnecessary img in data commit 8ac333a2e9ebbe6318d536b6589f767f71fbc092 Merge: 4240785 888c1c1 Author: kcz358 Date: Fri Mar 1 13:41:24 2024 +0000 Merge branch 'kc/final_fix' into dev/readme commit 9e542ce049f68f49a237be165e3ad9cde7408ac0 Author: kcz358 Date: Fri Mar 1 13:40:51 2024 +0000 More robust check by type commit f90ccf7b94b130e118b4eca321f68b81e7ab5850 Author: kcz358 Date: Fri Mar 1 13:00:57 2024 +0000 Change remove image to check by type instead of check by names commit f651a77707a4c723ebffb07f2a87743bf42ecea7 Author: kcz358 Date: Fri Mar 1 12:33:02 2024 +0000 Rename hallubench gpt output path commit a683559c704806b7abde5e4c8355f556f3e65866 Author: kcz358 Date: Fri Mar 1 09:32:52 2024 +0000 Fix hallusionbench gpt json saving path commit 8e246e2466f3dd14a5e34f720269d7991a6dcf6b Author: kcz358 Date: Fri Mar 1 08:51:13 2024 +0000 Resolve conflict commit 67f00dc4652d09c662e5202ff7e5fbf7bebcdaf6 Merge: 9cf86fa 93534dc Author: kcz358 Date: Fri Mar 1 08:37:21 2024 +0000 Merge branch 'kc/final_fix' into dev/readme commit 53b7a845fe8412a652905101ec036c84e77a20c2 Author: kcz358 Date: Fri Mar 1 07:55:03 2024 +0000 Record gpt response in eval info commit 920b4112c4508e9a8afe824678958f2e78189e4e Author: kcz358 Date: Fri Mar 1 07:49:01 2024 +0000 Fix refcocog dataset path commit 6b20902d94ef9120181cd26cdce1e139046dbdf4 Author: Zhang Peiyuan Date: Thu Feb 29 13:40:02 2024 +0800 Dev/py add models (#57) * add instructblip * minicpm_v * remove from qwen-vl * speed up postprocessing * Optimize build context speed --------- Co-authored-by: Pu Fanyi Co-authored-by: kcz358 commit 21050baf08c4bc87c77d98dea55b6eecfefe1c0a Author: Pu Fanyi Date: Wed Feb 28 14:49:07 2024 +0800 Pufanyi/flickr30k refractor (#56) * 
refactor vizwizvqa task * Delete vqav2_test and vqav2_val YAML files * Refactor vqav2_process_results functions * Add a pack for vqav2 * refactor okvqa * roll back vizwiz_vqa * Fix exact_match calculation in ok_vqa_process_results * Update OKVQA dataset name in readme * add model_specific_prompt_kwargs * add model_specific_prompt_kwargs to vizwiz_vqa * add model_specific_prompt_kwargs for vqav2 * lint * fix a small bug for eval_logger * Refactor make_table function to display points as " - " if value is None * Merge commit 'c5e52a785d3cc87a866be9b880deb477d9f73fb7' * Refactor ok_vqa_aggreate_submissions function * Merge commit 'e5aa0a9601d6d8ce727315e4b0a8f13f06f26bff' * Refactor VQA submission file saving * Update file utils * Merge commit '560deca9f72483ca091795d6dc2537d4c54b32b0' * Refactor file path handling and submission generation * OKVQA path * vizwizvqa file * pack cmmmu * fix a small metric bug for cmmmu * Add higher_is_better flag to submission metric * Add CMMMU dataset to README.md * Add logging and refactor submission file generation in docvqa utils.py * pack docvqa * add traceback to print detailed error * Refactor docvqa_test_aggregate_results to accept additional arguments * Add metric check in evaluator.py and update test.yaml and val.yaml * add common `EvalAIAnswerProcessor` for okvqa, textvqa, vizwizvqa and vqav2 * merge textvqa * textvqa * Modify submission file generation for COCO test results * Update test result storage path * update coco cap file name * Update COCO 2017 Caption dataset name * ferret * Add Ferret dataset * Refactor hb_doc_to_text function to include model-specific prompts * Add IconQA and its subtasks * Refactor image list creation in doc_to_visual function * Add process_results function to default template * Update process_results function in iconqa utils.py * refactor flickr30k * change aggregation function * Fix formatting issues and update logging message * Fix llava can not handle only text question (no visuals) * Fix 
qwen can not handle no image question (no visuals) * Add fuyu prepare accelerator scripts * refactor mme * naming consistency * aggregation_submissions consistency * flickr30k naming consistency * remove submissions for mme * remove unused submission function * Refactor infovqa_test.yaml and infovqa_val.yaml * Refactor code for improved readability and maintainability * stvqa * remane sqa * Update lmms_eval textcaps files and utils.py * Update default prompt for text captions * Refactor textcaps_aggregation_result function * Add generate_submission_file function and update mathvista_aggregate_results signature * Update nocaps_test.yaml and nocaps_val.yaml * refractor internal_eval * Add internal evaluation datasets * pack multidocvqa * mmvet * Fix gpt eval timeout issue for hallubench, restore load from gpt to avoid re evaluating * Refractor llava wild * Refractor llava-bench-coco * Add JSON file generation for gpt evaluation details * mmmu * Remove MMBench English and Chinese tasks * Remove unnecessary return statement in mmbench_aggregate_test_results function * Fix distributed process group initialization * Update dataset paths and group names in mmbench test configs * Update import statements in cc_utils.py, cn_utils.py, and en_utils.py * Add torch module import * lint * Remove IconQA dataset from README.md * Add Multi-DocVQA and its submodules * Add new datasets and update task names * Refactor flickr_aggregation_result function to accept additional arguments * Add timeout kwargs in Accelerator constructor * Add encoding to be utf-8 for cmmmu * Fix llava try and catch, remove torch.distributed.init in main * Ds prepare script for llava --------- Co-authored-by: JvThunder Co-authored-by: kcz358 commit ba0e7f51e255540791d030468f79a126fec40eb5 Author: Li Bo Date: Tue Feb 27 22:52:07 2024 +0800 [Wandb Logger] add models, and args to wandb tables. 
(#55) * Refactor logging in lmms_eval package * Refactor variable names in lmms_eval package * Update commands.md * Add repr_scripts for reference * Add timeout for gpt4V * Remove unnecessary dependencies * Add reproduce into readme * Revise seedbench process_result * Fix exclude dc hardcode postprocess logic error * Fix metric repeat issue * Update dataset runtime and add environment info * Revise val submission file saving path * Put the correct query into the gpt extraction * Update sleep time in utils.py * update --------- Co-authored-by: Fanyi Pu Co-authored-by: kcz358 <92624596+kcz358@users.noreply.github.com> Co-authored-by: jzhang38 Co-authored-by: kcz358 * [README] near public (#63) * Refactor logging in lmms_eval package * Refactor variable names in lmms_eval package * Update README.md with new features and installation instructions * Update supported models and datasets * Delete otter.py file * Fix capitalization in README.md * Update image sizes and add new features * Refactor README.md to improve readability and add new features * Add description for lmms-eval in README.md * Update accelerator support in README.md * Update lmms-eval README with improved description and additional features * Update README.md with improved task grouping description * change `Otter-AI/MME` to `lmms-lab/MME` * Update README.md * Update README.md * Remove unused code in mme.yaml * Squashed commit of the following: commit 2a45079a974c55aeb8e04d367759e3cf383072c1 Author: Zhang Peiyuan Date: Thu Feb 29 13:40:02 2024 +0800 Dev/py add models (#57) * add instructblip * minicpm_v * remove from qwen-vl * speed up postprocessing * Optimize build context speed --------- Co-authored-by: Pu Fanyi Co-authored-by: kcz358 commit 7bdab7a743092cdca66b049e15332d8c21680147 Author: Pu Fanyi Date: Wed Feb 28 14:49:07 2024 +0800 Pufanyi/flickr30k refractor (#56) * refactor vizwizvqa task * Delete vqav2_test and vqav2_val YAML files * Refactor vqav2_process_results functions * Add a pack for 
vqav2 * refactor okvqa * roll back vizwiz_vqa * Fix exact_match calculation in ok_vqa_process_results * Update OKVQA dataset name in readme * add model_specific_prompt_kwargs * add model_specific_prompt_kwargs to vizwiz_vqa * add model_specific_prompt_kwargs for vqav2 * lint * fix a small bug for eval_logger * Refactor make_table function to display points as " - " if value is None * Merge commit '90f42f0876a4914c5ac0d213b9dffbfb4797ff62' * Refactor ok_vqa_aggreate_submissions function * Merge commit '4afec3303a0a7ed27a8265565343bf2851b9e4c7' * Refactor VQA submission file saving * Update file utils * Merge commit 'c144b75f0c9145a625b2bbdef5123ed81e343a11' * Refactor file path handling and submission generation * OKVQA path * vizwizvqa file * pack cmmmu * fix a small metric bug for cmmmu * Add higher_is_better flag to submission metric * Add CMMMU dataset to README.md * Add logging and refactor submission file generation in docvqa utils.py * pack docvqa * add traceback to print detailed error * Refactor docvqa_test_aggregate_results to accept additional arguments * Add metric check in evaluator.py and update test.yaml and val.yaml * add common `EvalAIAnswerProcessor` for okvqa, textvqa, vizwizvqa and vqav2 * merge textvqa * textvqa * Modify submission file generation for COCO test results * Update test result storage path * update coco cap file name * Update COCO 2017 Caption dataset name * ferret * Add Ferret dataset * Refactor hb_doc_to_text function to include model-specific prompts * Add IconQA and its subtasks * Refactor image list creation in doc_to_visual function * Add process_results function to default template * Update process_results function in iconqa utils.py * refactor flickr30k * change aggregation function * Fix formatting issues and update logging message * Fix llava can not handle only text question (no visuals) * Fix qwen can not handle no image question (no visuals) * Add fuyu prepare accelerator scripts * refactor mme * naming consistency * 
aggregation_submissions consistency * flickr30k naming consistency * remove submissions for mme * remove unused submission function * Refactor infovqa_test.yaml and infovqa_val.yaml * Refactor code for improved readability and maintainability * stvqa * remane sqa * Update lmms_eval textcaps files and utils.py * Update default prompt for text captions * Refactor textcaps_aggregation_result function * Add generate_submission_file function and update mathvista_aggregate_results signature * Update nocaps_test.yaml and nocaps_val.yaml * refractor internal_eval * Add internal evaluation datasets * pack multidocvqa * mmvet * Fix gpt eval timeout issue for hallubench, restore load from gpt to avoid re evaluating * Refractor llava wild * Refractor llava-bench-coco * Add JSON file generation for gpt evaluation details * mmmu * Remove MMBench English and Chinese tasks * Remove unnecessary return statement in mmbench_aggregate_test_results function * Fix distributed process group initialization * Update dataset paths and group names in mmbench test configs * Update import statements in cc_utils.py, cn_utils.py, and en_utils.py * Add torch module import * lint * Remove IconQA dataset from README.md * Add Multi-DocVQA and its submodules * Add new datasets and update task names * Refactor flickr_aggregation_result function to accept additional arguments * Add timeout kwargs in Accelerator constructor * Add encoding to be utf-8 for cmmmu * Fix llava try and catch, remove torch.distributed.init in main * Ds prepare script for llava --------- Co-authored-by: JvThunder Co-authored-by: kcz358 commit d3dfd94ec91706587bf9f57dfebaad2cc3b5281c Author: Li Bo Date: Tue Feb 27 22:52:07 2024 +0800 [Wandb Logger] add models, and args to wandb tables. 
(#55) * Refactor logging in lmms_eval package * Refactor variable names in lmms_eval package * add llava main in pyproject * Update README.md * Remove unnecessary dependencies and add specific version for llava_repr * Add dependencies for llava_repr*** * Update README.md * add some docs on models and command line commands * remove some lines * typo * Update model_guide.md * Update model_guide.md * Update README.md * Update README.md * Update README.md * Fix refcocog dataset path * Record gpt response in eval info * Resolve conflict * Fix hallusionbench gpt json saving path * Rename hallubench gpt output path * Change remove image to check by type instead of check by names * More robust check by type * Add timeout to API requests * Remove unnecessary img in data * Forcing an empty commit. * Testing * Delete unnecessary things * Fix error logging in get_chat_response function * Fix seedbench2 image issue in doc_to_text * Add conditional exclude for internal eval * Squashed commit of the following: commit 2fbeafc882c80242a10381abc67629d5d8b7071a Author: kcz358 Date: Sat Mar 2 03:49:36 2024 +0000 Add conditional exclude for internal eval commit f188052450bed2f3a30ab6f9a6f7eb844a64cb33 Merge: a3cae8e ffb9eb2 Author: kcz358 Date: Sat Mar 2 03:24:29 2024 +0000 Merge branch 'dev/readme' into kc/final_fix commit baef5905505892593fe783beb18a2de20991d6af Author: kcz358 Date: Sat Mar 2 02:47:31 2024 +0000 Fix seedbench2 image issue in doc_to_text commit 11b46f3b701b79b361dd5175a263e4d89bd07fb5 Author: kcz358 Date: Fri Mar 1 15:32:49 2024 +0000 Delete unnecessary things commit 0982de2e7a2310429e51ec7828886fd49953f716 Author: kcz358 Date: Fri Mar 1 15:31:42 2024 +0000 Testing commit f840ed80f4ae467fff62b61844854a3a9e8ec8a5 Author: kcz358 Date: Fri Mar 1 15:29:30 2024 +0000 Forcing an empty commit. 
commit 80db78f600d07011188983637c94da84b9475fbf Merge: 786f2b5 1700786 Author: kcz358 Date: Fri Mar 1 15:24:56 2024 +0000 Merge branch 'kc/final_fix' into dev/readme commit 676229de870b8d465cef08867cd272a4b696e630 Author: kcz358 Date: Fri Mar 1 15:24:20 2024 +0000 Remove unnecessary img in data commit d293b96fb3537fea85f10f216d762abf35e05e8d Merge: 4240785 888c1c1 Author: kcz358 Date: Fri Mar 1 13:41:24 2024 +0000 Merge branch 'kc/final_fix' into dev/readme commit 01bbd010590d6b7f105525580209191a1d6d5232 Author: kcz358 Date: Fri Mar 1 13:40:51 2024 +0000 More robust check by type commit 66595ebc073ff9431f2400006196c0645be58ea4 Author: kcz358 Date: Fri Mar 1 13:00:57 2024 +0000 Change remove image to check by type instead of check by names commit 08c2ebad1532fd6c34ac04efb94a268db9862d4f Author: kcz358 Date: Fri Mar 1 12:33:02 2024 +0000 Rename hallubench gpt output path commit aefbd3c6856584135e2dcbe13381db0e0780f063 Author: kcz358 Date: Fri Mar 1 09:32:52 2024 +0000 Fix hallusionbench gpt json saving path commit b9aebc3ff3b122d6d4a81bd2f28e86b2c390c505 Author: kcz358 Date: Fri Mar 1 08:51:13 2024 +0000 Resolve conflict commit c9daa91f2576de69af73c80e263afb085ecd8288 Merge: 9cf86fa 93534dc Author: kcz358 Date: Fri Mar 1 08:37:21 2024 +0000 Merge branch 'kc/final_fix' into dev/readme commit b1c4c88b9b36e02e9ed738ff9217d98a5ef2117b Author: kcz358 Date: Fri Mar 1 07:55:03 2024 +0000 Record gpt response in eval info commit b35bc4a6c8fd6b4b2a68bb3054878807b8b92281 Author: kcz358 Date: Fri Mar 1 07:49:01 2024 +0000 Fix refcocog dataset path commit 2a45079a974c55aeb8e04d367759e3cf383072c1 Author: Zhang Peiyuan Date: Thu Feb 29 13:40:02 2024 +0800 Dev/py add models (#57) * add instructblip * minicpm_v * remove from qwen-vl * speed up postprocessing * Optimize build context speed --------- Co-authored-by: Pu Fanyi Co-authored-by: kcz358 commit 7bdab7a743092cdca66b049e15332d8c21680147 Author: Pu Fanyi Date: Wed Feb 28 14:49:07 2024 +0800 Pufanyi/flickr30k refractor (#56) * 
refactor vizwizvqa task * Delete vqav2_test and vqav2_val YAML files * Refactor vqav2_process_results functions * Add a pack for vqav2 * refactor okvqa * roll back vizwiz_vqa * Fix exact_match calculation in ok_vqa_process_results * Update OKVQA dataset name in readme * add model_specific_prompt_kwargs * add model_specific_prompt_kwargs to vizwiz_vqa * add model_specific_prompt_kwargs for vqav2 * lint * fix a small bug for eval_logger * Refactor make_table function to display points as " - " if value is None * Merge commit '90f42f0876a4914c5ac0d213b9dffbfb4797ff62' * Refactor ok_vqa_aggreate_submissions function * Merge commit '4afec3303a0a7ed27a8265565343bf2851b9e4c7' * Refactor VQA submission file saving * Update file utils * Merge commit 'c144b75f0c9145a625b2bbdef5123ed81e343a11' * Refactor file path handling and submission generation * OKVQA path * vizwizvqa file * pack cmmmu * fix a small metric bug for cmmmu * Add higher_is_better flag to submission metric * Add CMMMU dataset to README.md * Add logging and refactor submission file generation in docvqa utils.py * pack docvqa * add traceback to print detailed error * Refactor docvqa_test_aggregate_results to accept additional arguments * Add metric check in evaluator.py and update test.yaml and val.yaml * add common `EvalAIAnswerProcessor` for okvqa, textvqa, vizwizvqa and vqav2 * merge textvqa * textvqa * Modify submission file generation for COCO test results * Update test result storage path * update coco cap file name * Update COCO 2017 Caption dataset name * ferret * Add Ferret dataset * Refactor hb_doc_to_text function to include model-specific prompts * Add IconQA and its subtasks * Refactor image list creation in doc_to_visual function * Add process_results function to default template * Update process_results function in iconqa utils.py * refactor flickr30k * change aggregation function * Fix formatting issues and update logging message * Fix llava can not handle only text question (no visuals) * Fix 
qwen can not handle no image question (no visuals) * Add fuyu prepare accelerator scripts * refactor mme * naming consistency * aggregation_submissions consistency * flickr30k naming consistency * remove submissions for mme * remove unused submission function * Refactor infovqa_test.yaml and infovqa_val.yaml * Refactor code for improved readability and maintainability * stvqa * remane sqa * Update lmms_eval textcaps files and utils.py * Update default prompt for text captions * Refactor textcaps_aggregation_result function * Add generate_submission_file function and update mathvista_aggregate_results signature * Update nocaps_test.yaml and nocaps_val.yaml * refractor internal_eval * Add internal evaluation datasets * pack multidocvqa * mmvet * Fix gpt eval timeout issue for hallubench, restore load from gpt to avoid re evaluating * Refractor llava wild * Refractor llava-bench-coco * Add JSON file generation for gpt evaluation details * mmmu * Remove MMBench English and Chinese tasks * Remove unnecessary return statement in mmbench_aggregate_test_results function * Fix distributed process group initialization * Update dataset paths and group names in mmbench test configs * Update import statements in cc_utils.py, cn_utils.py, and en_utils.py * Add torch module import * lint * Remove IconQA dataset from README.md * Add Multi-DocVQA and its submodules * Add new datasets and update task names * Refactor flickr_aggregation_result function to accept additional arguments * Add timeout kwargs in Accelerator constructor * Add encoding to be utf-8 for cmmmu * Fix llava try and catch, remove torch.distributed.init in main * Ds prepare script for llava --------- Co-authored-by: JvThunder Co-authored-by: kcz358 commit d3dfd94ec91706587bf9f57dfebaad2cc3b5281c Author: Li Bo Date: Tue Feb 27 22:52:07 2024 +0800 [Wandb Logger] add models, and args to wandb tables. 
(#55) * Refactor logging in lmms_eval package * Refactor variable names in lmms_eval package * Fix small bugs in list_with_num * Revise list_with_num model args * Dev/readme rm rolling (#60) * remove log_likelyhood_rolling * Update time efficiency benchmark in README.md * add task guide --------- Co-authored-by: jzhang38 Co-authored-by: kcz358 <92624596+kcz358@users.noreply.github.com> * Remove unnecessary code and update dependencies * Fix logging utils bug on wandb grouping * Add reproduce envs * Squashed commit of the following: commit 556b12620379d79c9ed5ddba0856063b498f917c Merge: 2475639 2152f18 Author: kcz358 Date: Sun Mar 3 22:12:12 2024 +0800 Merge branch 'main' into kc/final_fix commit 9509a782c9e9824273cefb1dc9671c92b887697d Author: kcz358 Date: Sun Mar 3 22:11:04 2024 +0800 Add reproduce envs commit 0bff98b798c71a361639385c985eccc54c46dfb6 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Mar 3 21:19:15 2024 +0800 [Fix] wandb group logging missing columns (#61) * Refactor logging in lmms_eval package * Refactor variable names in lmms_eval package * Update README.md with new features and installation instructions * Update supported models and datasets * Delete otter.py file * Fix capitalization in README.md * Update image sizes and add new features * Refactor README.md to improve readability and add new features * Add description for lmms-eval in README.md * Update accelerator support in README.md * Update lmms-eval README with improved description and additional features * Update README.md with improved task grouping description * change `Otter-AI/MME` to `lmms-lab/MME` * Update README.md * Update README.md * Remove unused code in mme.yaml * Squashed commit of the following: commit 2a45079a974c55aeb8e04d367759e3cf383072c1 Author: Zhang Peiyuan Date: Thu Feb 29 13:40:02 2024 +0800 Dev/py add models (#57) * add instructblip * minicpm_v * remove from qwen-vl * speed up postprocessing * Optimize build context speed --------- 
Co-authored-by: Pu Fanyi Co-authored-by: kcz358 commit 7bdab7a743092cdca66b049e15332d8c21680147 Author: Pu Fanyi Date: Wed Feb 28 14:49:07 2024 +0800 Pufanyi/flickr30k refractor (#56) * refactor vizwizvqa task * Delete vqav2_test and vqav2_val YAML files * Refactor vqav2_process_results functions * Add a pack for vqav2 * refactor okvqa * roll back vizwiz_vqa * Fix exact_match calculation in ok_vqa_process_results * Update OKVQA dataset name in readme * add model_specific_prompt_kwargs * add model_specific_prompt_kwargs to vizwiz_vqa * add model_specific_prompt_kwargs for vqav2 * lint * fix a small bug for eval_logger * Refactor make_table function to display points as " - " if value is None * Merge commit '90f42f0876a4914c5ac0d213b9dffbfb4797ff62' * Refactor ok_vqa_aggreate_submissions function * Merge commit '4afec3303a0a7ed27a8265565343bf2851b9e4c7' * Refactor VQA submission file saving * Update file utils * Merge commit 'c144b75f0c9145a625b2bbdef5123ed81e343a11' * Refactor file path handling and submission generation * OKVQA path * vizwizvqa file * pack cmmmu * fix a small metric bug for cmmmu * Add higher_is_better flag to submission metric * Add CMMMU dataset to README.md * Add logging and refactor submission file generation in docvqa utils.py * pack docvqa * add traceback to print detailed error * Refactor docvqa_test_aggregate_results to accept additional arguments * Add metric check in evaluator.py and update test.yaml and val.yaml * add common `EvalAIAnswerProcessor` for okvqa, textvqa, vizwizvqa and vqav2 * merge textvqa * textvqa * Modify submission file generation for COCO test results * Update test result storage path * update coco cap file name * Update COCO 2017 Caption dataset name * ferret * Add Ferret dataset * Refactor hb_doc_to_text function to include model-specific prompts * Add IconQA and its subtasks * Refactor image list creation in doc_to_visual function * Add process_results function to default template * Update process_results function 
in iconqa utils.py * refactor flickr30k * change aggregation function * Fix formatting issues and update logging message * Fix llava can not handle only text question (no visuals) * Fix qwen can not handle no image question (no visuals) * Add fuyu prepare accelerator scripts * refactor mme * naming consistency * aggregation_submissions consistency * flickr30k naming consistency * remove submissions for mme * remove unused submission function * Refactor infovqa_test.yaml and infovqa_val.yaml * Refactor code for improved readability and maintainability * stvqa * remane sqa * Update lmms_eval textcaps files and utils.py * Update default prompt for text captions * Refactor textcaps_aggregation_result function * Add generate_submission_file function and update mathvista_aggregate_results signature * Update nocaps_test.yaml and nocaps_val.yaml * refractor internal_eval * Add internal evaluation datasets * pack multidocvqa * mmvet * Fix gpt eval timeout issue for hallubench, restore load from gpt to avoid re evaluating * Refractor llava wild * Refractor llava-bench-coco * Add JSON file generation for gpt evaluation details * mmmu * Remove MMBench English and Chinese tasks * Remove unnecessary return statement in mmbench_aggregate_test_results function * Fix distributed process group initialization * Update dataset paths and group names in mmbench test configs * Update import statements in cc_utils.py, cn_utils.py, and en_utils.py * Add torch module import * lint * Remove IconQA dataset from README.md * Add Multi-DocVQA and its submodules * Add new datasets and update task names * Refactor flickr_aggregation_result function to accept additional arguments * Add timeout kwargs in Accelerator constructor * Add encoding to be utf-8 for cmmmu * Fix llava try and catch, remove torch.distributed.init in main * Ds prepare script for llava --------- Co-authored-by: JvThunder Co-authored-by: kcz358 commit d3dfd94ec91706587bf9f57dfebaad2cc3b5281c Author: Li Bo Date: Tue Feb 27 
22:52:07 2024 +0800 [Wandb Logger] add models, and args to wandb tables. (#55) * Refactor logging in lmms_eval package * Refactor variable names in lmms_eval package * add llava main in pyproject * Update README.md * Remove unnecessary dependencies and add specific version for llava_repr * Add dependencies for llava_repr*** * Update README.md * add some docs on models and command line commands * remove some lines * typo * Update model_guide.md * Update model_guide.md * Update README.md * Update README.md * Update README.md * Fix refcocog dataset path * Record gpt response in eval info * Resolve conflict * Fix hallusionbench gpt json saving path * Rename hallubench gpt output path * Change remove image to check by type instead of check by names * More robust check by type * Remove unnecessary img in data * Forcing an empty commit. * Testing * Delete unnecessary things * Fix seedbench2 image issue in doc_to_text * Add conditional exclude for internal eval * Fix small bugs in list_with_num * Revise list_with_num model args * Fix logging utils bug on wandb grouping --------- Co-authored-by: Bo Li Co-authored-by: Fanyi Pu Co-authored-by: jzhang38 commit 7c4501a32bbb415ba7e62e93194b37ba9a435cf5 Merge: 83358a4 fd7773d Author: kcz358 Date: Sun Mar 3 07:25:48 2024 +0000 Merge branch 'main' into kc/final_fix commit 5c419f9fa23616a63a0bd584f18e509bb7704b50 Author: kcz358 Date: Sun Mar 3 07:23:19 2024 +0000 Fix logging utils bug on wandb grouping commit 0010d0ac308051e977981b83a51d0654488501d9 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun Mar 3 13:01:11 2024 +0800 [Fix] refcocog dataset path, record gpt prompt in internal eval, build context issue (#59) * Refactor logging in lmms_eval package * Refactor variable names in lmms_eval package * Update README.md with new features and installation instructions * Update supported models and datasets * Delete otter.py file * Fix capitalization in README.md * Update image sizes and add new features * Refactor 
README.md to improve readability and add new features * Add description for lmms-eval in README.md * Update accelerator support in README.md * Update lmms-eval README with improved description and additional features * Update README.md with improved task grouping description * change `Otter-AI/MME` to `lmms-lab/MME` * Update README.md * Update README.md * Remove unused code in mme.yaml * Squashed commit of the following: commit 2a45079a974c55aeb8e04d367759e3cf383072c1 Author: Zhang Peiyuan Date: Thu Feb 29 13:40:02 2024 +0800 Dev/py add models (#57) * add instructblip * minicpm_v * remove from qwen-vl * speed up postprocessing * Optimize build context speed --------- Co-authored-by: Pu Fanyi Co-authored-by: kcz358 commit 7bdab7a743092cdca66b049e15332d8c21680147 Author: Pu Fanyi Date: Wed Feb 28 14:49:07 2024 +0800 Pufanyi/flickr30k refractor (#56) * refactor vizwizvqa task * Delete vqav2_test and vqav2_val YAML files * Refactor vqav2_process_results functions * Add a pack for vqav2 * refactor okvqa * roll back vizwiz_vqa * Fix exact_match calculation in ok_vqa_process_results * Update OKVQA dataset name in readme * add model_specific_prompt_kwargs * add model_specific_prompt_kwargs to vizwiz_vqa * add model_specific_prompt_kwargs for vqav2 * lint * fix a small bug for eval_logger * Refactor make_table function to display points as " - " if value is None * Merge commit '90f42f0876a4914c5ac0d213b9dffbfb4797ff62' * Refactor ok_vqa_aggreate_submissions function * Merge commit '4afec3303a0a7ed27a8265565343bf2851b9e4c7' * Refactor VQA submission file saving * Update file utils * Merge commit 'c144b75f0c9145a625b2bbdef5123ed81e343a11' * Refactor file path handling and submission generation * OKVQA path * vizwizvqa file * pack cmmmu * fix a small metric bug for cmmmu * Add higher_is_better flag to submission metric * Add CMMMU dataset to README.md * Add logging and refactor submission file generation in docvqa utils.py * pack docvqa * add traceback to print detailed error 
* Refactor docvqa_test_aggregate_results to accept additional arguments * Add metric check in evaluator.py and update test.yaml and val.yaml * add common `EvalAIAnswerProcessor` for okvqa, textvqa, vizwizvqa and vqav2 * merge textvqa * textvqa * Modify submission file generation for COCO test results * Update test result storage path * update coco cap file name * Update COCO 2017 Caption dataset name * ferret * Add Ferret dataset * Refactor hb_doc_to_text function to include model-specific prompts * Add IconQA and its subtasks * Refactor image list creation in doc_to_visual function * Add process_results function to default template * Update process_results function in iconqa utils.py * refactor flickr30k * change aggregation function * Fix formatting issues and update logging message * Fix llava can not handle only text question (no visuals) * Fix qwen can not handle no image question (no visuals) * Add fuyu prepare accelerator scripts * refactor mme * naming consistency * aggregation_submissions consistency * flickr30k naming consistency * remove submissions for mme * remove unused submission function * Refactor infovqa_test.yaml and infovqa_val.yaml * Refactor code for improved readability and maintainability * stvqa * remane sqa * Update lmms_eval textcaps files and utils.py * Update default prompt for text captions * Refactor textcaps_aggregation_result function * Add generate_submission_file function and update mathvista_aggregate_results signature * Update nocaps_test.yaml and nocaps_val.yaml * refractor internal_eval * Add internal evaluation datasets * pack multidocvqa * mmvet * Fix gpt eval timeout issue for hallubench, restore load from gpt to avoid re evaluating * Refractor llava wild * Refractor llava-bench-coco * Add JSON file generation for gpt evaluation details * mmmu * Remove MMBench English and Chinese tasks * Remove unnecessary return statement in mmbench_aggregate_test_results function * Fix distributed process group initialization * Update 
dataset paths and group names in mmbench test configs * Update import statements in cc_utils.py, cn_utils.py, and en_utils.py * Add torch module import * lint * Remove IconQA dataset from README.md * Add Multi-DocVQA and its submodules * Add new datasets and update task names * Refactor flickr_aggregation_result function to accept additional arguments * Add timeout kwargs in Accelerator constructor * Add encoding to be utf-8 for cmmmu * Fix llava try and catch, remove torch.distributed.init in main * Ds prepare script for llava --------- Co-authored-by: JvThunder Co-authored-by: kcz358 commit d3dfd94ec91706587bf9f57dfebaad2cc3b5281c Author: Li Bo Date: Tue Feb 27 22:52:07 2024 +0800 [Wandb Logger] add models, and args to wandb tables. (#55) * Refactor logging in lmms_eval package * Refactor variable names in lmms_eval package * add llava main in pyproject * Update README.md * Remove unnecessary dependencies and add specific version for llava_repr * Add dependencies for llava_repr*** * Update README.md * add some docs on models and command line commands * remove some lines * typo * Update model_guide.md * Update model_guide.md * Update README.md * Update README.md * Update README.md * Fix refcocog dataset path * Record gpt response in eval info * Resolve conflict * Fix hallusionbench gpt json saving path * Rename hallubench gpt output path * Change remove image to check by type instead of check by names * More robust check by type * Remove unnecessary img in data * Forcing an empty commit. 
* Testing * Delete unnecessary things * Fix seedbench2 image issue in doc_to_text * Add conditional exclude for internal eval * Fix small bugs in list_with_num * Revise list_with_num model args --------- Co-authored-by: Bo Li Co-authored-by: Fanyi Pu Co-authored-by: jzhang38 commit b2ca65d1f12d84ae7a37ecc81f760901389a1af0 Author: kcz358 Date: Sat Mar 2 05:58:08 2024 +0000 Revise list_with_num model args commit a262ea1720b2c02839d21dad2a7618bc80725f18 Author: kcz358 Date: Sat Mar 2 05:09:15 2024 +0000 Fix small bugs in list_with_num commit 2fbeafc882c80242a10381abc67629d5d8b7071a Author: kcz358 Date: Sat Mar 2 03:49:36 2024 +0000 Add conditional exclude for internal eval commit f188052450bed2f3a30ab6f9a6f7eb844a64cb33 Merge: a3cae8e ffb9eb2 Author: kcz358 Date: Sat Mar 2 03:24:29 2024 +0000 Merge branch 'dev/readme' into kc/final_fix commit baef5905505892593fe783beb18a2de20991d6af Author: kcz358 Date: Sat Mar 2 02:47:31 2024 +0000 Fix seedbench2 image issue in doc_to_text commit 11b46f3b701b79b361dd5175a263e4d89bd07fb5 Author: kcz358 Date: Fri Mar 1 15:32:49 2024 +0000 Delete unnecessary things commit 0982de2e7a2310429e51ec7828886fd49953f716 Author: kcz358 Date: Fri Mar 1 15:31:42 2024 +0000 Testing commit f840ed80f4ae467fff62b61844854a3a9e8ec8a5 Author: kcz358 Date: Fri Mar 1 15:29:30 2024 +0000 Forcing an empty commit. 
commit 80db78f600d07011188983637c94da84b9475fbf Merge: 786f2b5 1700786 Author: kcz358 Date: Fri Mar 1 15:24:56 2024 +0000 Merge branch 'kc/final_fix' into dev/readme commit 676229de870b8d465cef08867cd272a4b696e630 Author: kcz358 Date: Fri Mar 1 15:24:20 2024 +0000 Remove unnecessary img in data commit d293b96fb3537fea85f10f216d762abf35e05e8d Merge: 4240785 888c1c1 Author: kcz358 Date: Fri Mar 1 13:41:24 2024 +0000 Merge branch 'kc/final_fix' into dev/readme commit 01bbd010590d6b7f105525580209191a1d6d5232 Author: kcz358 Date: Fri Mar 1 13:40:51 2024 +0000 More robust check by type commit 66595ebc073ff9431f2400006196c0645be58ea4 Author: kcz358 Date: Fri Mar 1 13:00:57 2024 +0000 Change remove image to check by type instead of check by names commit 08c2ebad1532fd6c34ac04efb94a268db9862d4f Author: kcz358 Date: Fri Mar 1 12:33:02 2024 +0000 Rename hallubench gpt output path commit aefbd3c6856584135e2dcbe13381db0e0780f063 Author: kcz358 Date: Fri Mar 1 09:32:52 2024 +0000 Fix hallusionbench gpt json saving path commit b9aebc3ff3b122d6d4a81bd2f28e86b2c390c505 Author: kcz358 Date: Fri Mar 1 08:51:13 2024 +0000 Resolve conflict commit c9daa91f2576de69af73c80e263afb085ecd8288 Merge: 9cf86fa 93534dc Author: kcz358 Date: Fri Mar 1 08:37:21 2024 +0000 Merge branch 'kc/final_fix' into dev/readme commit b1c4c88b9b36e02e9ed738ff9217d98a5ef2117b Author: kcz358 Date: Fri Mar 1 07:55:03 2024 +0000 Record gpt response in eval info commit b35bc4a6c8fd6b4b2a68bb3054878807b8b92281 Author: kcz358 Date: Fri Mar 1 07:49:01 2024 +0000 Fix refcocog dataset path commit 2a45079a974c55aeb8e04d367759e3cf383072c1 Author: Zhang Peiyuan Date: Thu Feb 29 13:40:02 2024 +0800 Dev/py add models (#57) * add instructblip * minicpm_v * remove from qwen-vl * speed up postprocessing * Optimize build context speed --------- Co-authored-by: Pu Fanyi Co-authored-by: kcz358 commit 7bdab7a743092cdca66b049e15332d8c21680147 Author: Pu Fanyi Date: Wed Feb 28 14:49:07 2024 +0800 Pufanyi/flickr30k refractor (#56) * 
refactor vizwizvqa task * Delete vqav2_test and vqav2_val YAML files * Refactor vqav2_process_results functions * Add a pack for vqav2 * refactor okvqa * roll back vizwiz_vqa * Fix exact_match calculation in ok_vqa_process_results * Update OKVQA dataset name in readme * add model_specific_prompt_kwargs * add model_specific_prompt_kwargs to vizwiz_vqa * add model_specific_prompt_kwargs for vqav2 * lint * fix a small bug for eval_logger * Refactor make_table function to display points as " - " if value is None * Merge commit '90f42f0876a4914c5ac0d213b9dffbfb4797ff62' * Refactor ok_vqa_aggreate_submissions function * Merge commit '4afec3303a0a7ed27a8265565343bf2851b9e4c7' * Refactor VQA submission file saving * Update file utils * Merge commit 'c144b75f0c9145a625b2bbdef5123ed81e343a11' * Refactor file path handling and submission generation * OKVQA path * vizwizvqa file * pack cmmmu * fix a small metric bug for cmmmu * Add higher_is_better flag to submission metric * Add CMMMU dataset to README.md * Add logging and refactor submission file generation in docvqa utils.py * pack docvqa * add traceback to print detailed error * Refactor docvqa_test_aggregate_results to accept additional arguments * Add metric check in evaluator.py and update test.yaml and val.yaml * add common `EvalAIAnswerProcessor` for okvqa, textvqa, vizwizvqa and vqav2 * merge textvqa * textvqa * Modify submission file generation for COCO test results * Update test result storage path * update coco cap file name * Update COCO 2017 Caption dataset name * ferret * Add Ferret dataset * Refactor hb_doc_to_text function to include model-specific prompts * Add IconQA and its subtasks * Refactor image list creation in doc_to_visual function * Add process_results function to default template * Update process_results function in iconqa utils.py * refactor flickr30k * change aggregation function * Fix formatting issues and update logging message * Fix llava can not handle only text question (no visuals) * Fix 
qwen can not handle no image question (no visuals) * Add fuyu prepare accelerator scripts * refactor mme * naming consistency * aggregation_submissions consistency * flickr30k naming consistency * remove submissions for mme * remove unused submission function * Refactor infovqa_test.yaml and infovqa_val.yaml * Refactor code for improved readability and maintainability * stvqa * remane sqa * Update lmms_eval textcaps files and utils.py * Update default prompt for text captions * Refactor textcaps_aggregation_result function * Add generate_submission_file function and update mathvista_aggregate_results signature * Update nocaps_test.yaml and nocaps_val.yaml * refractor internal_eval * Add internal evaluation datasets * pack multidocvqa * mmvet * Fix gpt eval timeout issue for hallubench, restore load from gpt to avoid re evaluating * Refractor llava wild * Refractor llava-bench-coco * Add JSON file generation for gpt evaluation details * mmmu * Remove MMBench English and Chinese tasks * Remove unnecessary return statement in mmbench_aggregate_test_results function * Fix distributed process group initialization * Update dataset paths and group names in mmbench test configs * Update import statements in cc_utils.py, cn_utils.py, and en_utils.py * Add torch module import * lint * Remove IconQA dataset from README.md * Add Multi-DocVQA and its submodules * Add new datasets and update task names * Refactor flickr_aggregation_result function to accept additional arguments * Add timeout kwargs in Accelerator constructor * Add encoding to be utf-8 for cmmmu * Fix llava try and catch, remove torch.distributed.init in main * Ds prepare script for llava --------- Co-authored-by: JvThunder Co-authored-by: kcz358 commit d3dfd94ec91706587bf9f57dfebaad2cc3b5281c Author: Li Bo Date: Tue Feb 27 22:52:07 2024 +0800 [Wandb Logger] add models, and args to wandb tables. 
(#55) * Refactor logging in lmms_eval package * Refactor variable names in lmms_eval package * Update commands.md * Add repr_scripts for reference * Add timeout for gpt4V * Remove unnecessary dependencies * Add reproduce into readme * Revise seedbench process_result * Fix exclude dc hardcode postprocess logic error * Fix metric repeat issue * Update dataset runtime and add environment info * Revise val submission file saving path * Put the correct query into the gpt extraction * Update sleep time in utils.py * update --------- Co-authored-by: Fanyi Pu Co-authored-by: kcz358 <92624596+kcz358@users.noreply.github.com> Co-authored-by: jzhang38 Co-authored-by: kcz358 * Update README.md * Update README.md * Fix bug in login functionality * Fix bug in login functionality * Refactor device assignment and remove unnecessary line * Refactor device assignment and remove unnecessary line * Refactor code and remove unnecessary lines * Refactor code and remove unnecessary lines * Refactor output path generation in cli_evaluate_single function * Refactor output path generation in cli_evaluate_single function * Update gpt_eval_model_name in llava-in-the-wild.yaml * Update gpt_eval_model_name in llava-in-the-wild.yaml * Update gpt_eval_model_name in llava-in-the-wild.yaml * Update gpt_eval_model_name in llava-in-the-wild.yaml * Fix loglikelihood error for llava 1.6 (#66) * [ReFactor] make internal_dev as the current main branch. 
(#67) * Add COCO, RefCOCO, RefCOCO+, RefCOCOg (#5) * Update author name and email in pyproject.toml * add mmvet and try to modify llava arch * Add coco, refcoco support * Fix doc_to_visual error * Fix segmentation mask error * Add refcoco+, refcocog * Remove debug code * black lint * Remove unused code and scripts * Fix group stderr N/A error between str and int * Fix letter case issue * Update lmms_eval tasks and utils * Fix coco test_split name * Add llava-bench-in-the-wild support * Black codestyle, lint * Add COCO evaluation metric * Add refcoco, refcocog, refcoco+ evaluation kit * Add llava bench coco support --------- Co-authored-by: Bo Li * VQAv2 eval (#4) * vqav2 * Add vqav2_process_results function and update vqav2_doc_to_text function * Implement vqav2_process_results function to return exact match score * Refactor fewshot_docs() to use config.fewshot_config * Refactor Task class to handle fewshot_docs when training and validation docs are not available * Add answer processing logic in vqav2_process_results function * Refactor vqav2_process_results function and add submission aggregation * Add vqav2_aggreate_submissions function to utils.py * textvqa * Refactor answer processing in textvqa_process_results() function * textvqa eval * Update dataset path and modify textvqa_doc_to_text function * Capitalize the question in textvqa_doc_to_text function * Update textvqa.yaml and utils.py * Fix formatting issues in lmms_eval/api/task.py, lmms_eval/tasks/gqa/utils.py, lmms_eval/tasks/textvqa/utils.py, and lmms_eval/tasks/vqav2/utils.py --------- Co-authored-by: Li Bo * [Big Changes] add LLaVA-1.6, MMVet, LLaVA-W, POPE, and many other changes on logs, model args. 
(#7) * Update author name and email in pyproject.toml * add mmvet and try to modify llava arch * black lint * Remove unused code and scripts * Update lmms_eval tasks and utils * Update LMMS-Eval dependencies and configurations * Squashed commit of the following: commit 209f3904f33210bec0b4b146e96fcbd67a4e1541 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Wed Jan 17 20:27:13 2024 +0800 Add COCO, RefCOCO, RefCOCO+, RefCOCOg (#5) * Update author name and email in pyproject.toml * add mmvet and try to modify llava arch * Add coco, refcoco support * Fix doc_to_visual error * Fix segmentation mask error * Add refcoco+, refcocog * Remove debug code * black lint * Remove unused code and scripts * Fix group stderr N/A error between str and int * Fix letter case issue * Update lmms_eval tasks and utils * Fix coco test_split name * Add llava-bench-in-the-wild support * Black codestyle, lint * Add COCO evaluation metric * Add refcoco, refcocog, refcoco+ evaluation kit * Add llava bench coco support --------- Co-authored-by: Bo Li commit f102f038a161fe667628accd2d9daa33e70fe74f Author: Zhang Peiyuan Date: Wed Jan 17 20:26:58 2024 +0800 Update utils.py (#6) * Fix logging issue and remove unnecessary whitespace * Add openai and pycocoevalcap dependencies * Fix device mapping issue in Llava constructor * Add support for truncating context in generation * Update Llava model and evaluation configuration * Update YAML configuration files * Update YAML configuration files * add otterhd and gemini models * Add support for custom image aspect ratio in Llava model * Add dataset_kwargs and max_gen_toks to YAML files * Fix log_samples suffix typo and use hash for output name * Refactor LMMS evaluation code and update LLAVA model properties * matched response for mistral-llava * Refactor logging in llava_aggregation function * Print evaluation statistics instead of logging them * Fix logging information in llava_aggregation function * Add new models and dataset_kwargs for 
COCO tasks * Update truncate_context parameter in Llava class constructor * Update dataset_kwargs in YAML files * Remove issue type tags from issue and pull request templates * add mmvet and try to modify llava arch * black lint * Update lmms_eval tasks and utils * Update LMMS-Eval dependencies and configurations * Squashed commit of the following: commit 209f3904f33210bec0b4b146e96fcbd67a4e1541 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Wed Jan 17 20:27:13 2024 +0800 Add COCO, RefCOCO, RefCOCO+, RefCOCOg (#5) * Update author name and email in pyproject.toml * add mmvet and try to modify llava arch * Add coco, refcoco support * Fix doc_to_visual error * Fix segmentation mask error * Add refcoco+, refcocog * Remove debug code * black lint * Remove unused code and scripts * Fix group stderr N/A error between str and int * Fix letter case issue * Update lmms_eval tasks and utils * Fix coco test_split name * Add llava-bench-in-the-wild support * Black codestyle, lint * Add COCO evaluation metric * Add refcoco, refcocog, refcoco+ evaluation kit * Add llava bench coco support --------- Co-authored-by: Bo Li commit f102f038a161fe667628accd2d9daa33e70fe74f Author: Zhang Peiyuan Date: Wed Jan 17 20:26:58 2024 +0800 Update utils.py (#6) * Fix logging issue and remove unnecessary whitespace * Add openai and pycocoevalcap dependencies * Fix device mapping issue in Llava constructor * Add support for truncating context in generation * Update Llava model and evaluation configuration * Update YAML configuration files * Update YAML configuration files * add otterhd and gemini models * Add support for custom image aspect ratio in Llava model * Add dataset_kwargs and max_gen_toks to YAML files * Fix log_samples suffix typo and use hash for output name * Refactor LMMS evaluation code and update LLAVA model properties * matched response for mistral-llava * Refactor logging in llava_aggregation function * Print evaluation statistics instead of logging them * Fix 
logging information in llava_aggregation function * Add new models and dataset_kwargs for COCO tasks * Update truncate_context parameter in Llava class constructor * Update dataset_kwargs in YAML files * Remove issue type tags from issue and pull request templates * Refactor pope utils functions * Update transformers dependency to version 4.36.2 * Revise llava-in-the-wild prompt for align * Add default values for gen_kwargs in Llava class * Fix formatting issues and import pdb for debugging * Remove pdb.set_trace() and update default value for max_new_tokens * Add llava loglikelihood * Fix formatting and indentation issues in lmms_eval/api/metrics.py and lmms_eval/models/llava.py * Update function to handle edge cases This commit updates the function to handle edge cases, improving the overall reliability and robustness of the code. * Update black version in pre-commit config * Remove duplicate lines in gqa * Another way to solve memory issue * Handle exception in model generation * Refactor pope_aggregate_results to use "score" key instead of "pope_accuracy" * Update pope metrics aggregation functions * Add model_to_prompt in pope.yaml * Update pope.yaml configuration * Refactor code to simplify construct_requests call --------- Co-authored-by: kcz358 <92624596+kcz358@users.noreply.github.com> * Add datetime to output name in cli_evaluate function Add get_datetime_str function to utils.py * Refactor pope_aggregate_f1_score function * Fix datetime format in get_datetime_str function * Update JSON dump indentation in cli_evaluate function * Add datetime to output name in cli_evaluate function (#10) * Revert "Add datetime to output name in cli_evaluate function" This reverts commit ef26f78c46b50d8769a4fb6990b909162c2881c3. * Add datetime to output name in cli_evaluate function * [Datasets] Added POPE and Aligned. 
(#11) * Update generation_kwargs in pope.yaml * Update pope_doc_to_text function * [Dataset] Add SEED-Bench, TextCaps, NoCaps (#12) * Change coco from print to logger * Add llava loglikelihood * Add Nocaps support * Fix pass through function * Add textcaps support * Fix textcaps eval image_id * Add seedbench support * Add seedbench ppl evaluation * black lint * [Datasets] Add four internal evaluation datasets (#13) * Update generation_kwargs in pope.yaml * Update pope_doc_to_text function * Remove unused variable in mmvet_process_results function * Remove unused imports in utils.py * Refactor get_chat_response function to include retries for API requests * Update gpt_eval_model_name in lmms_eval/tasks/dc100_en.yaml and add retry logic in get_chat_response function * Update prompt variable in lmms_eval tasks * Refactor output_name variable in cli_evaluate function * Fix logging message in mmvet_process_results function * Update sleep time in get_chat_response function * Merge commit 'fec494dbe5971e8fa5a886b191a4781be3ce7a6f' * Refactor get_eval function to include retries * Add token parameter to load_dataset function in gqa_doc_to_visual * Refactor llava_process_results and llava_aggregation functions * [Datasets] Add four internal evaluation datasets (#13) * Update generation_kwargs in pope.yaml * Update pope_doc_to_text function * Remove unused variable in mmvet_process_results function * Remove unused imports in utils.py * Refactor get_chat_response function to include retries for API requests * Update gpt_eval_model_name in lmms_eval/tasks/dc100_en.yaml and add retry logic in get_chat_response function * Update prompt variable in lmms_eval tasks * Refactor output_name variable in cli_evaluate function * Fix logging message in mmvet_process_results function * Update sleep time in get_chat_response function * Merge commit 'fec494dbe5971e8fa5a886b191a4781be3ce7a6f' * Refactor get_eval function to include retries * Add token parameter to load_dataset function in 
gqa_doc_to_visual * Refactor llava_process_results and llava_aggregation functions * add mmmu (#15) * add mmme * black * add mmmu (#15) * add mmme * black * [Memory issue] Solve memory issue for building context (#14) * Update generation_kwargs in pope.yaml * Update pope_doc_to_text function * Remove unused variable in mmvet_process_results function * Remove unused imports in utils.py * Refactor get_chat_response function to include retries for API requests * Update gpt_eval_model_name in lmms_eval/tasks/dc100_en.yaml and add retry logic in get_chat_response function * Update prompt variable in lmms_eval tasks * Refactor output_name variable in cli_evaluate function * Fix logging message in mmvet_process_results function * Update sleep time in get_chat_response function * Merge commit 'fec494dbe5971e8fa5a886b191a4781be3ce7a6f' * Refactor get_eval function to include retries * Add token parameter to load_dataset function in gqa_doc_to_visual * Refactor llava_process_results and llava_aggregation functions * Remove unused function llava_aggregation * Refactor llava-bench aggregation code * Add logs and scripts to .gitignore, and set image_aspect_ratio to original in scienceqa.yaml * Update generation parameters in scienceqa.yaml * Solve memory issue for building context * Solved gather result error * Update lmms_eval scienceqa_img config * Fixed nocaps store results * Revise seedbench prompt * Squashed commit of the following: commit c3cc24a89415aeccad31ccbb10642af677cd6fe5 Author: Zhang Peiyuan Date: Wed Jan 24 14:07:36 2024 +0800 add mmmu (#15) * add mmme * black commit 0dbc5d16c4f45ebea8def5f0bc1a36fcd93f9a05 Author: Li Bo Date: Wed Jan 24 10:00:33 2024 +0800 [Datasets] Add four internal evaluation datasets (#13) * Update generation_kwargs in pope.yaml * Update pope_doc_to_text function * Remove unused variable in mmvet_process_results function * Remove unused imports in utils.py * Refactor get_chat_response function to include retries for API requests * Update 
gpt_eval_model_name in lmms_eval/tasks/dc100_en.yaml and add retry logic in get_chat_response function * Update prompt variable in lmms_eval tasks * Refactor output_name variable in cli_evaluate function * Fix logging message in mmvet_process_results function * Update sleep time in get_chat_response function * Merge commit 'fec494dbe5971e8fa5a886b191a4781be3ce7a6f' * Refactor get_eval function to include retries * Add token parameter to load_dataset function in gqa_doc_to_visual * Refactor llava_process_results and llava_aggregation functions commit fec494dbe5971e8fa5a886b191a4781be3ce7a6f Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Tue Jan 23 19:17:40 2024 +0800 [Dataset] Add SEED-Bench, TextCaps, NoCaps (#12) * Change coco from print to logger * Add llava loglikelihood * Add Nocaps support * Fix pass through function * Add textcaps support * Fix textcaps eval image_id * Add seedbench support * Add seedbench ppl evaluation * black lint commit 4c3c2c63a681f29c537c2467957de1a90568748d Author: Li Bo Date: Tue Jan 23 19:17:12 2024 +0800 [Datasets] Added POPE and Aligned. 
(#11) * Update generation_kwargs in pope.yaml * Update pope_doc_to_text function --------- Co-authored-by: Bo Li * [Memory issue] Solve memory issue for building context (#14) * Update generation_kwargs in pope.yaml * Update pope_doc_to_text function * Remove unused variable in mmvet_process_results function * Remove unused imports in utils.py * Refactor get_chat_response function to include retries for API requests * Update gpt_eval_model_name in lmms_eval/tasks/dc100_en.yaml and add retry logic in get_chat_response function * Update prompt variable in lmms_eval tasks * Refactor output_name variable in cli_evaluate function * Fix logging message in mmvet_process_results function * Update sleep time in get_chat_response function * Merge commit 'fec494dbe5971e8fa5a886b191a4781be3ce7a6f' * Refactor get_eval function to include retries * Add token parameter to load_dataset function in gqa_doc_to_visual * Refactor llava_process_results and llava_aggregation functions * Remove unused function llava_aggregation * Refractor llava-bench aggregation code * Add logs and scripts to .gitignore, and set image_aspect_ratio to original in scienceqa.yaml * Update generation parameters in scienceqa.yaml * Solve memory issue for building context * Solved gather result error * Update lmms_eval scienceqa_img config * Fixed nocaps store results * Revise seedbench prompt * Squashed commit of the following: commit 290126e6a269db4cca9b3544bd017d6c17012793 Author: Zhang Peiyuan Date: Wed Jan 24 14:07:36 2024 +0800 add mmmu (#15) * add mmme * black commit 8b0227cd7b2602d096d773a01b2199d1f4110f22 Author: Li Bo Date: Wed Jan 24 10:00:33 2024 +0800 [Datasets] Add four internal evaluation datasets (#13) * Update generation_kwargs in pope.yaml * Update pope_doc_to_text function * Remove unused variable in mmvet_process_results function * Remove unused imports in utils.py * Refactor get_chat_response function to include retries for API requests * Update gpt_eval_model_name in 
lmms_eval/tasks/dc100_en.yaml and add retry logic in get_chat_response function * Update prompt variable in lmms_eval tasks * Refactor output_name variable in cli_evaluate function * Fix logging message in mmvet_process_results function * Update sleep time in get_chat_response function * Merge commit 'fec494dbe5971e8fa5a886b191a4781be3ce7a6f' * Refactor get_eval function to include retries * Add token parameter to load_dataset function in gqa_doc_to_visual * Refactor llava_process_results and llava_aggregation functions commit fec494dbe5971e8fa5a886b191a4781be3ce7a6f Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Tue Jan 23 19:17:40 2024 +0800 [Dataset] Add SEED-Bench, TextCaps, NoCaps (#12) * Change coco from print to logger * Add llava loglikelihood * Add Nocaps support * Fix pass through function * Add textcaps support * Fix textcaps eval image_id * Add seedbench support * Add seedbench ppl evaluation * black lint commit 4c3c2c63a681f29c537c2467957de1a90568748d Author: Li Bo Date: Tue Jan 23 19:17:12 2024 +0800 [Datasets] Added POPE and Aligned. 
(#11) * Update generation_kwargs in pope.yaml * Update pope_doc_to_text function --------- Co-authored-by: Bo Li * Add output path file naming convention (#16) Update datetime format in get_datetime_str() function * Add output path file naming convention (#16) Update datetime format in get_datetime_str() function * [Datasets] modify NoCaps data path and prompts (#17) * Merge commit '95ef3ea519cbd772924f9a6afa5394979eb00432' * Update dataset paths and improve user prompts * [Datasets] modify NoCaps data path and prompts (#17) * Merge commit '767f7e2cae60cf67ec5878234d84321395a3ed15' * Update dataset paths and improve user prompts * [Dataset] Add flickr30k (#18) * Add flickr30k support * Black lint * Align prompt with NoCaps * [Dataset] Add flickr30k (#18) * Add flickr30k support * Black lint * Align prompt with NoCaps * add model specific prompt and gen kwargs in sqa (#19) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add model specific prompt and gen kwargs in sqa (#19) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * Dev/add chartqa and ai2d (#23) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * Add 'submissions/' directory to .gitignore * Add Python setup and Black version installation workflow Refactor ContextSampler class in samplers.py Remove unnecessary line in DecontaminationFilter class Update dependencies in pyproject.toml * Refactor code in ContextSampler class --------- Co-authored-by: Bo Li * Dev/add chartqa and ai2d (#23) * add mmme * black * add model specific prompt and gen kwargs * black * add 
yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * Add 'submissions/' directory to .gitignore * Add Python setup and Black version installation workflow Refactor ContextSampler class in samplers.py Remove unnecessary line in DecontaminationFilter class Update dependencies in pyproject.toml * Refactor code in ContextSampler class --------- Co-authored-by: Bo Li * [Datasets] Changes for Flickr30K and NoCaps, also merged Peiyuan's Model Specific Prompt. (#20) * Merge commit '95ef3ea519cbd772924f9a6afa5394979eb00432' * Update dataset paths and improve user prompts * Add submission folder and update file paths for storing prediction results * Merge commit 'ecb47d73d6e000b472be6c5c0cdc9413c7734384' * Update dataset_path in flickr30k.yaml * Add coco_val and coco_test tasks to coco.yaml * Squashed commit of the following: commit 4d48d0c9b88e62dfebe05ec909b7f1851e9cd75d Author: jzhang38 Date: Thu Jan 25 11:59:12 2024 +0800 refactor multi model code commit 4a4b7bec200c72332b61a0c277cd8f8a34e4f721 Author: jzhang38 Date: Thu Jan 25 11:51:16 2024 +0800 print table at the end commit 63739fc6fa0a462d807ae81de0db0173102de584 Author: jzhang38 Date: Thu Jan 25 11:20:59 2024 +0800 add yaml config to supprot multi-model eval commit edcc752f97ea3845cefad56624e5d2855066f680 Author: jzhang38 Date: Thu Jan 25 10:39:42 2024 +0800 black commit 41f4b63d3a6e83babe92bac32a7432a8ef740bb5 Merge: 7e8b57d 4d11dce Author: jzhang38 Date: Thu Jan 25 10:37:57 2024 +0800 resolve conflicts in sqa commit 7e8b57d3bcc21d2a049d3abbc8a8201631641db4 Author: jzhang38 Date: Thu Jan 25 10:36:46 2024 +0800 add model specific prompt and gen kwargs commit ecb47d73d6e000b472be6c5c0cdc9413c7734384 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Thu Jan 25 09:47:31 2024 +0800 [Dataset] Add flickr30k (#18) * Add flickr30k support * Black lint * Align prompt with 
NoCaps commit dc23f4b42b1dd60b41904d7ddbee1412d6851077 Author: Li Bo Date: Wed Jan 24 22:10:14 2024 +0800 [Datasets] modify NoCaps data path and prompts (#17) * Merge commit '95ef3ea519cbd772924f9a6afa5394979eb00432' * Update dataset paths and improve user prompts commit 5f55126484a7c9325db586d26cf2052538222804 Author: jzhang38 Date: Wed Jan 24 13:56:51 2024 +0800 black commit aa6f8853cf82384fb3b15306fec4769212fbc5ab Author: jzhang38 Date: Wed Jan 24 13:55:43 2024 +0800 add mmme * Squashed commit of the following: commit 4d48d0c9b88e62dfebe05ec909b7f1851e9cd75d Author: jzhang38 Date: Thu Jan 25 11:59:12 2024 +0800 refactor multi model code commit 4a4b7bec200c72332b61a0c277cd8f8a34e4f721 Author: jzhang38 Date: Thu Jan 25 11:51:16 2024 +0800 print table at the end commit 63739fc6fa0a462d807ae81de0db0173102de584 Author: jzhang38 Date: Thu Jan 25 11:20:59 2024 +0800 add yaml config to supprot multi-model eval commit edcc752f97ea3845cefad56624e5d2855066f680 Author: jzhang38 Date: Thu Jan 25 10:39:42 2024 +0800 black commit 41f4b63d3a6e83babe92bac32a7432a8ef740bb5 Merge: 7e8b57d 4d11dce Author: jzhang38 Date: Thu Jan 25 10:37:57 2024 +0800 resolve conflicts in sqa commit 7e8b57d3bcc21d2a049d3abbc8a8201631641db4 Author: jzhang38 Date: Thu Jan 25 10:36:46 2024 +0800 add model specific prompt and gen kwargs commit 5f55126484a7c9325db586d26cf2052538222804 Author: jzhang38 Date: Wed Jan 24 13:56:51 2024 +0800 black commit aa6f8853cf82384fb3b15306fec4769212fbc5ab Author: jzhang38 Date: Wed Jan 24 13:55:43 2024 +0800 add mmme * Squashed commit of the following: commit 18e984cfe173390843c73048a931baa17800f918 Author: Zhang Peiyuan Date: Thu Jan 25 17:08:25 2024 +0800 add model specific prompt and gen kwargs in sqa (#19) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * Fix cli itself can not run with config file * Fix bug in login functionality Refactor code 
for better performance Add new feature for user authentication Update UI layout for improved user experience Fix typo in variable name Optimize database queries for faster response time Add error handling for edge cases Update dependencies to latest versions Remove unused code Improve code readability and maintainability * Refactor get_task_dict function to handle nested groups * Add submission file for coco, flickr30k, nocaps, and textcaps tasks * Remove unused files and update task configuration * Fix tasks issue for nocaps, refcoco/+/g * Fix file path and raise error if config file does not exist * Exclude train in refcoco/+/g config * Solve doc_iterator_for_counting crashing issue * Black lint * Refactor code to improve performance and readability * Squashed commit of the following: commit 0df825c9e72a06e6acb4c0bd43c2083ffe8b74c0 Author: JvThunder Date: Fri Jan 26 01:03:57 2024 +0800 change okvqa yaml commit b9d9f9896993033b92346e9f47420c55b866c715 Author: JvThunder Date: Fri Jan 26 00:55:40 2024 +0800 change yaml commit 4256bef410e4c8d8761e0cd0d79ac5e57b97651b Author: JvThunder Date: Fri Jan 26 00:42:43 2024 +0800 add okvqa task commit 18e984cfe173390843c73048a931baa17800f918 Author: Zhang Peiyuan Date: Thu Jan 25 17:08:25 2024 +0800 add model specific prompt and gen kwargs in sqa (#19) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * Squashed commit of the following: commit 0c8a3919885b8fe2880bb2892f7a619d060012d1 Author: JvThunder Date: Fri Jan 26 01:06:02 2024 +0800 change ocr reference commit d2bc7c92ac61179b8c4031e11bc31970355252f6 Author: JvThunder Date: Fri Jan 26 01:05:46 2024 +0800 revert example_eval commit c78fa29cd0d161641ee05db57bd39314b998c8c7 Author: JvThunder Date: Fri Jan 26 00:17:28 2024 +0800 edit vizwiz utils commit 397f0906968fd8ba04b883469b96217737c43e09 Author: JvThunder Date: Thu Jan 25 23:49:47 2024 +0800 reorganize 
__init__ commit 52a7ea6c7599adeec2ac2787f500e215ce47cf79 Author: JvThunder Date: Thu Jan 25 23:46:20 2024 +0800 minor fixes commit f706b2aaf9b288c582611191a1841b58feaeb741 Author: JvThunder Date: Thu Jan 25 17:41:03 2024 +0800 add vizwizvqa eval task commit 18e984cfe173390843c73048a931baa17800f918 Author: Zhang Peiyuan Date: Thu Jan 25 17:08:25 2024 +0800 add model specific prompt and gen kwargs in sqa (#19) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to support multi-model eval * print table at the end * refactor multi model code * Refactor mathvista.yaml and utils.py * Add gpt_eval_score to mathvista_process_results * Refactor mathvista_aggregate_results to return average accuracy score * Fix refcoco evaluation error * Fix evaluation problem for refcoco+/g * Refactor mathvista.yaml and mathvista_evals.py * Add dependencies and update YAML files * Refactor mmbench_en/utils.py to save test results to separate Excel file * Fix caption task prompt * Add group field to mmbench_en_test and mmbench_en_val yaml files * Delete mmbench_en_val.yaml file * Update mmbench_cn.yaml and mmbench_cn_test.yaml * Update mmbench_cn_val.yaml and utils.py * Remove unused fields in mmbench_cn_cc_process_results function * Update aggregation function for mmbench_en_dev.yaml * Fix capitalization of L2-category key in utils.py * Fix variable name in mmbench_process_results function * Delete mmbench_cn_val.yaml file * Update mathvista_test.yaml and mathvista_testmini.yaml * Fix warnings and update mathvista.yaml * Remove system message from MathVistaEvaluator * Update GPT model version in MathVistaEvaluator constructor * Update GQA_RAW_IMAGE_DATASET path in utils.py * change vizwiz to test set * Add split flag to mathvista_aggregate_results function * Add higher_is_better: false to gpt_eval_info metric in d170_cn, d170_en, dc100_en, and dc200_cn yaml files * Add download configuration for dataset * Update GQA_RAW_IMAGE_DATASET path in utils.py * 
add datasets * Update gpt_eval_model_name in mathvista.yaml * Merge commit '0d620f98b49f8204d02633f209eedd5d8b7a1f7c' * Update pyproject.toml with dependencies and URLs * Squashed commit of the following: commit 8b600f55b6cf5627504c407871539db59f6085a3 Author: Zhang Peiyuan Date: Sat Jan 27 13:56:37 2024 +0800 Dev/add chartqa and ai2d (#23) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d dataset * black * Add 'submissions/' directory to .gitignore * Add Python setup and Black version installation workflow Refactor ContextSampler class in samplers.py Remove unnecessary line in DecontaminationFilter class Update dependencies in pyproject.toml * Refactor code in ContextSampler class --------- Co-authored-by: Bo Li * Refactor image processing and submission file path * Refactor directory creation logic in cli_evaluate_single function * Update dataset path and test split in vqav2.yaml * Remove "total" column from cap_details_columns DataFrame * Add retry logic for dataset download * Add 'tenacity' to dependencies in pyproject.toml * Refactor code in ContextSampler class * Update Black version and configuration, and improve code readability in ContextSampler * Update Black version and line length --------- Co-authored-by: kcz358 <92624596+kcz358@users.noreply.github.com> Co-authored-by: Fanyi Pu * [Datasets] Changes for Flickr30K and NoCaps, also merged Peiyuan's Model Specific Prompt. 
(#20) * Merge commit '767f7e2cae60cf67ec5878234d84321395a3ed15' * Update dataset paths and improve user prompts * Add submission folder and update file paths for storing prediction results * Merge commit '842fbc6f2da7d9a118adf9ec27c3d8542d74168e' * Update dataset_path in flickr30k.yaml * Add coco_val and coco_test tasks to coco.yaml * Squashed commit of the following: commit 542a34dc5721ecdff6c5c68b0568692ad3a17149 Author: jzhang38 Date: Thu Jan 25 11:59:12 2024 +0800 refactor multi model code commit 3c397b8af85192b1821b3b6a0d8b8df746b5347c Author: jzhang38 Date: Thu Jan 25 11:51:16 2024 +0800 print table at the end commit e7b8a2d1f1e7337f02298efafd2ebf81543f4f85 Author: jzhang38 Date: Thu Jan 25 11:20:59 2024 +0800 add yaml config to supprot multi-model eval commit 2626383d99b5eac59d531ca0f293df960570c524 Author: jzhang38 Date: Thu Jan 25 10:39:42 2024 +0800 black commit 8349935fe145e33af0007ad4fb0d71fd925be7a0 Merge: 7e8b57d 4d11dce Author: jzhang38 Date: Thu Jan 25 10:37:57 2024 +0800 resolve conflicts in sqa commit d4e8e2552d40752bfdc5bbf4cd962c1798096258 Author: jzhang38 Date: Thu Jan 25 10:36:46 2024 +0800 add model specific prompt and gen kwargs commit 842fbc6f2da7d9a118adf9ec27c3d8542d74168e Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Thu Jan 25 09:47:31 2024 +0800 [Dataset] Add flickr30k (#18) * Add flickr30k support * Black lint * Align prompt with NoCaps commit 4bf0504fabc3b62f356c467b2fd1119083d27313 Author: Li Bo Date: Wed Jan 24 22:10:14 2024 +0800 [Datasets] modify NoCaps data path and prompts (#17) * Merge commit '767f7e2cae60cf67ec5878234d84321395a3ed15' * Update dataset paths and improve user prompts commit 520c7a2cafe60810aca79df814ce6829d4576032 Author: jzhang38 Date: Wed Jan 24 13:56:51 2024 +0800 black commit 3a633240327c078fa4f5a75dbd38ad5bc0d468dd Author: jzhang38 Date: Wed Jan 24 13:55:43 2024 +0800 add mmme * Squashed commit of the following: commit 542a34dc5721ecdff6c5c68b0568692ad3a17149 Author: jzhang38 Date: Thu Jan 
25 11:59:12 2024 +0800 refactor multi model code commit 3c397b8af85192b1821b3b6a0d8b8df746b5347c Author: jzhang38 Date: Thu Jan 25 11:51:16 2024 +0800 print table at the end commit e7b8a2d1f1e7337f02298efafd2ebf81543f4f85 Author: jzhang38 Date: Thu Jan 25 11:20:59 2024 +0800 add yaml config to supprot multi-model eval commit 2626383d99b5eac59d531ca0f293df960570c524 Author: jzhang38 Date: Thu Jan 25 10:39:42 2024 +0800 black commit 8349935fe145e33af0007ad4fb0d71fd925be7a0 Merge: 7e8b57d 4d11dce Author: jzhang38 Date: Thu Jan 25 10:37:57 2024 +0800 resolve conflicts in sqa commit d4e8e2552d40752bfdc5bbf4cd962c1798096258 Author: jzhang38 Date: Thu Jan 25 10:36:46 2024 +0800 add model specific prompt and gen kwargs commit 520c7a2cafe60810aca79df814ce6829d4576032 Author: jzhang38 Date: Wed Jan 24 13:56:51 2024 +0800 black commit 3a633240327c078fa4f5a75dbd38ad5bc0d468dd Author: jzhang38 Date: Wed Jan 24 13:55:43 2024 +0800 add mmme * Squashed commit of the following: commit b13a805623dfd9d826ddd440e1b5ecde773fbb12 Author: Zhang Peiyuan Date: Thu Jan 25 17:08:25 2024 +0800 add model specific prompt and gen kwargs in sqa (#19) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * Fix cli itself can not run with config file * Fix bug in login functionality Refactor code for better performance Add new feature for user authentication Update UI layout for improved user experience Fix typo in variable name Optimize database queries for faster response time Add error handling for edge cases Update dependencies to latest versions Remove unused code Improve code readability and maintainability * Refactor get_task_dict function to handle nested groups * Add submission file for coco, flickr30k, nocaps, and textcaps tasks * Remove unused files and update task configuration * Fix tasks issue for nocaps, refcoco/+/g * Fix file path and raise error if config file does not 
exist * Exclude train in refcoco/+/g config * Solve doc_iterator_for_counting crashing issue * Black lint * Refactor code to improve performance and readability * Squashed commit of the following: commit a2cc9303dc72e4d53983bb56e54a32e977c3e270 Author: JvThunder Date: Fri Jan 26 01:03:57 2024 +0800 change okvqa yaml commit 35e87e7c7a480d005abf607c2527a35457d92311 Author: JvThunder Date: Fri Jan 26 00:55:40 2024 +0800 change yaml commit 89755323596b85208ed33aa88c296604a39af6eb Author: JvThunder Date: Fri Jan 26 00:42:43 2024 +0800 add okvqa task commit b13a805623dfd9d826ddd440e1b5ecde773fbb12 Author: Zhang Peiyuan Date: Thu Jan 25 17:08:25 2024 +0800 add model specific prompt and gen kwargs in sqa (#19) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * Squashed commit of the following: commit 0b0d30dfb247c5f0b7b68398b9e9fcde74cf7fa2 Author: JvThunder Date: Fri Jan 26 01:06:02 2024 +0800 change ocr reference commit e273f9cbd91540df86bdbc652bff88a847bd0d2d Author: JvThunder Date: Fri Jan 26 01:05:46 2024 +0800 revert example_eval commit e84126aaaf8a07bd371a0571a914ccbcd3697f20 Author: JvThunder Date: Fri Jan 26 00:17:28 2024 +0800 edit vizwiz utils commit 110deab53dc1a2fd349b1872cd261b69074c5fa8 Author: JvThunder Date: Thu Jan 25 23:49:47 2024 +0800 reorganize __init__ commit 0fa3e0c40075997ea80ed976bdee9615f17d3ece Author: JvThunder Date: Thu Jan 25 23:46:20 2024 +0800 minor fixes commit 2aaca579120def99860f90054233f3358950fa66 Author: JvThunder Date: Thu Jan 25 17:41:03 2024 +0800 add vizwizvqa eval rask commit b13a805623dfd9d826ddd440e1b5ecde773fbb12 Author: Zhang Peiyuan Date: Thu Jan 25 17:08:25 2024 +0800 add model specific prompt and gen kwargs in sqa (#19) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * Refactor 
mathvista.yaml and utils.py * Add gpt_eval_score to mathvista_process_results * Refactor mathvista_aggregate_results to return average accuracy score * Fix refcoco evaluation error * Fix evaluation problem for refcoco+/g * Refactor mathvista.yaml and mathvista_evals.py * Add dependencies and update YAML files * Refactor mmbench_en/utils.py to save test results to separate Excel file * Fix caption task prompt * Add group field to mmbench_en_test and mmbench_en_val yaml files * Delete mmbench_en_val.yaml file * Update mmbench_cn.yaml and mmbench_cn_test.yaml * Update mmbench_cn_val.yaml and utils.py * Remove unused fields in mmbench_cn_cc_process_results function * Update aggregation function for mmbench_en_dev.yaml * Fix capitalization of L2-category key in utils.py * Fix variable name in mmbench_process_results function * Delete mmbench_cn_val.yaml file * Update mathvista_test.yaml and mathvista_testmini.yaml * Fix warnings and update mathvista.yaml * Remove system message from MathVistaEvaluator * Update GPT model version in MathVistaEvaluator constructor * Update GQA_RAW_IMAGE_DATASET path in utils.py * change vizwiz to test set * Add split flag to mathvista_aggregate_results function * Add higher_is_better: false to gpt_eval_info metric in d170_cn, d170_en, dc100_en, and dc200_cn yaml files * Add download configuration for dataset * Update GQA_RAW_IMAGE_DATASET path in utils.py * add datasets * Update gpt_eval_model_name in mathvista.yaml * Merge commit '817eb057bcb61226b33d3ac3c8def01c36c90f96' * Update pyproject.toml with dependencies and URLs * Squashed commit of the following: commit f253968ad703f682a29317bdd51ec6c1fd7c5465 Author: Zhang Peiyuan Date: Sat Jan 27 13:56:37 2024 +0800 Dev/add chartqa and ai2d (#23) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code * add chartqa * black * add ai2d * black * update chartqa * blacl * update ai2d 
dataset * black * Add 'submissions/' directory to .gitignore * Add Python setup and Black version installation workflow Refactor ContextSampler class in samplers.py Remove unnecessary line in DecontaminationFilter class Update dependencies in pyproject.toml * Refactor code in ContextSampler class --------- Co-authored-by: Bo Li * Refactor image processing and submission file path * Refactor directory creation logic in cli_evaluate_single function * Update dataset path and test split in vqav2.yaml * Remove "total" column from cap_details_columns DataFrame * Add retry logic for dataset download * Add 'tenacity' to dependencies in pyproject.toml * Refactor code in ContextSampler class * Update Black version and configuration, and improve code readability in ContextSampler * Update Black version and line length --------- Co-authored-by: kcz358 <92624596+kcz358@users.noreply.github.com> Co-authored-by: Fanyi Pu * vqav2 (#25) * Update tqdm progress bar position * Merge commit 'ecb47d73d6e000b472be6c5c0cdc9413c7734384' * Squashed commit of the following: commit 18e984cfe173390843c73048a931baa17800f918 Author: Zhang Peiyuan Date: Thu Jan 25 17:08:25 2024 +0800 add model specific prompt and gen kwargs in sqa (#19) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code commit ecb47d73d6e000b472be6c5c0cdc9413c7734384 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Thu Jan 25 09:47:31 2024 +0800 [Dataset] Add flickr30k (#18) * Add flickr30k support * Black lint * Align prompt with NoCaps commit dc23f4b42b1dd60b41904d7ddbee1412d6851077 Author: Li Bo Date: Wed Jan 24 22:10:14 2024 +0800 [Datasets] modify NoCaps data path and prompts (#17) * Merge commit '95ef3ea519cbd772924f9a6afa5394979eb00432' * Update dataset paths and improve user prompts commit 95ef3ea519cbd772924f9a6afa5394979eb00432 Author: Li Bo Date: Wed Jan 24 19:51:34 2024 +0800 Add output 
path file naming convention (#16) Update datetime format in get_datetime_str() function * remove useless output file * Update dataset path in vqav2.yaml * Squashed commit of the following: commit 75bb7043ea5a533ab6351fc0f5ab055e86106423 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Fri Jan 26 09:56:45 2024 +0800 Black lint commit 6635a8aa34cfbd3c7a4afb6fcd214a7283ce01cb Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Fri Jan 26 09:55:47 2024 +0800 Solve doc_iterator_for_counting crashing issue commit 080f42b88ea8acacd527b8d67b84ba1d7d135b03 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Fri Jan 26 09:55:13 2024 +0800 Exclude train in refcoco/+/g config commit 4da84069c08c95e49e8ab0e64a1e103ff7ac8730 Merge: 6a1ae69 697a438 Author: Bo Li Date: Thu Jan 25 17:17:13 2024 +0000 Merge branch 'dev/bli_add_datasets' of https://github.com/EvolvingLMMs-Lab/lmms-eval into dev/bli_add_datasets commit 6a1ae69923d79ae32a001edac38206b605274ec3 Author: Bo Li Date: Thu Jan 25 17:17:06 2024 +0000 Fix file path and raise error if config file does not exist commit 697a4387827ceeec3e393237dd1baa217c714c88 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Fri Jan 26 00:47:24 2024 +0800 Fix tasks issue for nocaps, refcoco/+/g commit 47e40437126d39a5f062c9a33b4de426c1a29804 Author: Bo Li Date: Thu Jan 25 10:09:43 2024 +0000 Remove unused files and update task configuration commit 9976eb8e9ed03c8613725fdbd822ef5d8cf70e47 Author: Bo Li Date: Thu Jan 25 09:43:56 2024 +0000 Add submission file for coco, flickr30k, nocaps, and textcaps tasks commit 95f97a69faa6129676e89eee14960fcfe2076b7c Author: Bo Li Date: Thu Jan 25 09:32:54 2024 +0000 Refactor get_task_dict function to handle nested groups commit 3b79ee842b2488714baf92ab34528ef77989d392 Author: Bo Li Date: Thu Jan 25 09:13:46 2024 +0000 Fix bug in login functionality Refactor code for better performance Add new feature for user authentication Update UI layout for improved user 
experience Fix typo in variable name Optimize database queries for faster response time Add error handling for edge cases Update dependencies to latest versions Remove unused code Improve code readability and maintainability commit f5c353f2ce93a2d96add4312b695b57432f68cbb Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Thu Jan 25 17:07:20 2024 +0800 Fix cli itself can not run with config file commit 9a68fec37be74cfe8d4a73390bc83edee147ae24 Author: Bo Li Date: Thu Jan 25 09:09:04 2024 +0000 Squashed commit of the following: commit 18e984cfe173390843c73048a931baa17800f918 Author: Zhang Peiyuan Date: Thu Jan 25 17:08:25 2024 +0800 add model specific prompt and gen kwargs in sqa (#19) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code commit 93f847c5851fd246716367935d6b807b17d53949 Author: Bo Li Date: Thu Jan 25 09:02:57 2024 +0000 Squashed commit of the following: commit 4d48d0c9b88e62dfebe05ec909b7f1851e9cd75d Author: jzhang38 Date: Thu Jan 25 11:59:12 2024 +0800 refactor multi model code commit 4a4b7bec200c72332b61a0c277cd8f8a34e4f721 Author: jzhang38 Date: Thu Jan 25 11:51:16 2024 +0800 print table at the end commit 63739fc6fa0a462d807ae81de0db0173102de584 Author: jzhang38 Date: Thu Jan 25 11:20:59 2024 +0800 add yaml config to supprot multi-model eval commit edcc752f97ea3845cefad56624e5d2855066f680 Author: jzhang38 Date: Thu Jan 25 10:39:42 2024 +0800 black commit 41f4b63d3a6e83babe92bac32a7432a8ef740bb5 Merge: 7e8b57d 4d11dce Author: jzhang38 Date: Thu Jan 25 10:37:57 2024 +0800 resolve conflicts in sqa commit 7e8b57d3bcc21d2a049d3abbc8a8201631641db4 Author: jzhang38 Date: Thu Jan 25 10:36:46 2024 +0800 add model specific prompt and gen kwargs commit 5f55126484a7c9325db586d26cf2052538222804 Author: jzhang38 Date: Wed Jan 24 13:56:51 2024 +0800 black commit aa6f8853cf82384fb3b15306fec4769212fbc5ab Author: jzhang38 Date: Wed Jan 24 
13:55:43 2024 +0800 add mmme commit fa4ad4404e26d8924f55208746dbb9143b464011 Merge: 22c3adf 4d11dce Author: Bo Li Date: Thu Jan 25 08:43:15 2024 +0000 Merge commit 'ecb47d73d6e000b472be6c5c0cdc9413c7734384' into dev/bli_add_datasets commit 22c3adfd0645acc23b6d7c06b487f4ffd47666c4 Author: Bo Li Date: Thu Jan 25 08:38:52 2024 +0000 Squashed commit of the following: commit 4d48d0c9b88e62dfebe05ec909b7f1851e9cd75d Author: jzhang38 Date: Thu Jan 25 11:59:12 2024 +0800 refactor multi model code commit 4a4b7bec200c72332b61a0c277cd8f8a34e4f721 Author: jzhang38 Date: Thu Jan 25 11:51:16 2024 +0800 print table at the end commit 63739fc6fa0a462d807ae81de0db0173102de584 Author: jzhang38 Date: Thu Jan 25 11:20:59 2024 +0800 add yaml config to supprot multi-model eval commit edcc752f97ea3845cefad56624e5d2855066f680 Author: jzhang38 Date: Thu Jan 25 10:39:42 2024 +0800 black commit 41f4b63d3a6e83babe92bac32a7432a8ef740bb5 Merge: 7e8b57d 4d11dce Author: jzhang38 Date: Thu Jan 25 10:37:57 2024 +0800 resolve conflicts in sqa commit 7e8b57d3bcc21d2a049d3abbc8a8201631641db4 Author: jzhang38 Date: Thu Jan 25 10:36:46 2024 +0800 add model specific prompt and gen kwargs commit ecb47d73d6e000b472be6c5c0cdc9413c7734384 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Thu Jan 25 09:47:31 2024 +0800 [Dataset] Add flickr30k (#18) * Add flickr30k support * Black lint * Align prompt with NoCaps commit dc23f4b42b1dd60b41904d7ddbee1412d6851077 Author: Li Bo Date: Wed Jan 24 22:10:14 2024 +0800 [Datasets] modify NoCaps data path and prompts (#17) * Merge commit '95ef3ea519cbd772924f9a6afa5394979eb00432' * Update dataset paths and improve user prompts commit 5f55126484a7c9325db586d26cf2052538222804 Author: jzhang38 Date: Wed Jan 24 13:56:51 2024 +0800 black commit aa6f8853cf82384fb3b15306fec4769212fbc5ab Author: jzhang38 Date: Wed Jan 24 13:55:43 2024 +0800 add mmme commit 4c712336b6f7438e717a865910bb241e413a4688 Author: Bo Li Date: Thu Jan 25 08:38:11 2024 +0000 Add coco_val and 
coco_test tasks to coco.yaml commit b5547126c855927fd4dc8384211e4aceee40870f Author: Bo Li Date: Thu Jan 25 04:58:28 2024 +0000 Update dataset_path in flickr30k.yaml commit f786f61e2559f082072f21aa9030e2080ddaf809 Author: Bo Li Date: Thu Jan 25 02:12:25 2024 +0000 Merge commit 'ecb47d73d6e000b472be6c5c0cdc9413c7734384' commit 796a011000e0df90f66f8e80cb34dc2318ae9ac8 Author: Bo Li Date: Thu Jan 25 02:10:18 2024 +0000 Add submission folder and update file paths for storing prediction results commit ecb47d73d6e000b472be6c5c0cdc9413c7734384 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Thu Jan 25 09:47:31 2024 +0800 [Dataset] Add flickr30k (#18) * Add flickr30k support * Black lint * Align prompt with NoCaps commit dc23f4b42b1dd60b41904d7ddbee1412d6851077 Author: Li Bo Date: Wed Jan 24 22:10:14 2024 +0800 [Datasets] modify NoCaps data path and prompts (#17) * Merge commit '95ef3ea519cbd772924f9a6afa5394979eb00432' * Update dataset paths and improve user prompts commit 118744c63eb2d9724571d85fbbd85fcc9ad05b59 Merge: c6370bf a0b87f5 Author: Li Bo Date: Wed Jan 24 22:10:07 2024 +0800 Merge branch 'main' into dev/bli_add_datasets commit c6370bff65903681f00cf3d07111d8e15a57b619 Author: Bo Li Date: Wed Jan 24 14:08:06 2024 +0000 Update dataset paths and improve user prompts commit 810daf458fa94cb3ec2b4a6cc5ecb1e656a24002 Author: Bo Li Date: Wed Jan 24 11:52:33 2024 +0000 Merge commit '95ef3ea519cbd772924f9a6afa5394979eb00432' commit 95ef3ea519cbd772924f9a6afa5394979eb00432 Author: Li Bo Date: Wed Jan 24 19:51:34 2024 +0800 Add output path file naming convention (#16) Update datetime format in get_datetime_str() function * Fix bug in login functionality * create vqav2_val * Update vqav2_test.yaml * Update vqav2_test.yaml * Update vqav2_val.yaml --------- Co-authored-by: Li Bo * vqav2 (#25) * Update tqdm progress bar position * Merge commit '842fbc6f2da7d9a118adf9ec27c3d8542d74168e' * Squashed commit of the following: commit 
b13a805623dfd9d826ddd440e1b5ecde773fbb12 Author: Zhang Peiyuan Date: Thu Jan 25 17:08:25 2024 +0800 add model specific prompt and gen kwargs in sqa (#19) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code commit 842fbc6f2da7d9a118adf9ec27c3d8542d74168e Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Thu Jan 25 09:47:31 2024 +0800 [Dataset] Add flickr30k (#18) * Add flickr30k support * Black lint * Align prompt with NoCaps commit 4bf0504fabc3b62f356c467b2fd1119083d27313 Author: Li Bo Date: Wed Jan 24 22:10:14 2024 +0800 [Datasets] modify NoCaps data path and prompts (#17) * Merge commit '767f7e2cae60cf67ec5878234d84321395a3ed15' * Update dataset paths and improve user prompts commit 767f7e2cae60cf67ec5878234d84321395a3ed15 Author: Li Bo Date: Wed Jan 24 19:51:34 2024 +0800 Add output path file naming convention (#16) Update datetime format in get_datetime_str() function * remove useless output file * Update dataset path in vqav2.yaml * Squashed commit of the following: commit eeb2b9827502f044ef67d8440f53124baf219ba3 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Fri Jan 26 09:56:45 2024 +0800 Black lint commit 1ce9f0b37e4bc5e6ff5fbfcd23fd339eb14974ae Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Fri Jan 26 09:55:47 2024 +0800 Solve doc_iterator_for_counting crashing issue commit e12b3bb41ed4f51540cfac84e5e96d15777540c4 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Fri Jan 26 09:55:13 2024 +0800 Exclude train in refcoco/+/g config commit 42c56f82bc4ccae12e19e76d09d7e525ca9ef2f4 Merge: 6a1ae69 697a438 Author: Bo Li Date: Thu Jan 25 17:17:13 2024 +0000 Merge branch 'dev/bli_add_datasets' of https://github.com/EvolvingLMMs-Lab/lmms-eval into dev/bli_add_datasets commit aed08303fe87808986d206540a0c0ee6d8764988 Author: Bo Li Date: Thu Jan 25 17:17:06 2024 +0000 Fix file path and raise 
error if config file does not exist commit a105386613c443d9e740c89725cbd1281bbdfef6 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Fri Jan 26 00:47:24 2024 +0800 Fix tasks issue for nocaps, refcoco/+/g commit 21c8119e377760f44c769bed2528d863a8f4333b Author: Bo Li Date: Thu Jan 25 10:09:43 2024 +0000 Remove unused files and update task configuration commit 0ccb2629c2aacdb297b7cf0c9c2bcfa386bb7582 Author: Bo Li Date: Thu Jan 25 09:43:56 2024 +0000 Add submission file for coco, flickr30k, nocaps, and textcaps tasks commit 5365e13e93c702a1e0e259ee6a08d6a427d72470 Author: Bo Li Date: Thu Jan 25 09:32:54 2024 +0000 Refactor get_task_dict function to handle nested groups commit 6773348c807bcfa1b09ceffc90c75e15cad908f7 Author: Bo Li Date: Thu Jan 25 09:13:46 2024 +0000 Fix bug in login functionality Refactor code for better performance Add new feature for user authentication Update UI layout for improved user experience Fix typo in variable name Optimize database queries for faster response time Add error handling for edge cases Update dependencies to latest versions Remove unused code Improve code readability and maintainability commit 31140f9c87dea89ca94c94bc850e3a8d43e5f8b4 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Thu Jan 25 17:07:20 2024 +0800 Fix cli itself can not run with config file commit df1bad47f6ed13f94848d2bee29b28e00c2384b2 Author: Bo Li Date: Thu Jan 25 09:09:04 2024 +0000 Squashed commit of the following: commit b13a805623dfd9d826ddd440e1b5ecde773fbb12 Author: Zhang Peiyuan Date: Thu Jan 25 17:08:25 2024 +0800 add model specific prompt and gen kwargs in sqa (#19) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code commit 06383aa4a5ff59db52fc8d584f3086efd88b7e74 Author: Bo Li Date: Thu Jan 25 09:02:57 2024 +0000 Squashed commit of the following: commit 542a34dc5721ecdff6c5c68b0568692ad3a17149 Author: 
jzhang38 Date: Thu Jan 25 11:59:12 2024 +0800 refactor multi model code commit 3c397b8af85192b1821b3b6a0d8b8df746b5347c Author: jzhang38 Date: Thu Jan 25 11:51:16 2024 +0800 print table at the end commit e7b8a2d1f1e7337f02298efafd2ebf81543f4f85 Author: jzhang38 Date: Thu Jan 25 11:20:59 2024 +0800 add yaml config to supprot multi-model eval commit 2626383d99b5eac59d531ca0f293df960570c524 Author: jzhang38 Date: Thu Jan 25 10:39:42 2024 +0800 black commit 8349935fe145e33af0007ad4fb0d71fd925be7a0 Merge: 7e8b57d 4d11dce Author: jzhang38 Date: Thu Jan 25 10:37:57 2024 +0800 resolve conflicts in sqa commit d4e8e2552d40752bfdc5bbf4cd962c1798096258 Author: jzhang38 Date: Thu Jan 25 10:36:46 2024 +0800 add model specific prompt and gen kwargs commit 520c7a2cafe60810aca79df814ce6829d4576032 Author: jzhang38 Date: Wed Jan 24 13:56:51 2024 +0800 black commit 3a633240327c078fa4f5a75dbd38ad5bc0d468dd Author: jzhang38 Date: Wed Jan 24 13:55:43 2024 +0800 add mmme commit 7a71fd6022ee5985100dda38b94956595cec77a5 Merge: 22c3adf 4d11dce Author: Bo Li Date: Thu Jan 25 08:43:15 2024 +0000 Merge commit '842fbc6f2da7d9a118adf9ec27c3d8542d74168e' into dev/bli_add_datasets commit 6870cba13cb54976480c1d5e8d97602c246f881b Author: Bo Li Date: Thu Jan 25 08:38:52 2024 +0000 Squashed commit of the following: commit 542a34dc5721ecdff6c5c68b0568692ad3a17149 Author: jzhang38 Date: Thu Jan 25 11:59:12 2024 +0800 refactor multi model code commit 3c397b8af85192b1821b3b6a0d8b8df746b5347c Author: jzhang38 Date: Thu Jan 25 11:51:16 2024 +0800 print table at the end commit e7b8a2d1f1e7337f02298efafd2ebf81543f4f85 Author: jzhang38 Date: Thu Jan 25 11:20:59 2024 +0800 add yaml config to supprot multi-model eval commit 2626383d99b5eac59d531ca0f293df960570c524 Author: jzhang38 Date: Thu Jan 25 10:39:42 2024 +0800 black commit 8349935fe145e33af0007ad4fb0d71fd925be7a0 Merge: 7e8b57d 4d11dce Author: jzhang38 Date: Thu Jan 25 10:37:57 2024 +0800 resolve conflicts in sqa commit 
d4e8e2552d40752bfdc5bbf4cd962c1798096258 Author: jzhang38 Date: Thu Jan 25 10:36:46 2024 +0800 add model specific prompt and gen kwargs commit 842fbc6f2da7d9a118adf9ec27c3d8542d74168e Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Thu Jan 25 09:47:31 2024 +0800 [Dataset] Add flickr30k (#18) * Add flickr30k support * Black lint * Align prompt with NoCaps commit 4bf0504fabc3b62f356c467b2fd1119083d27313 Author: Li Bo Date: Wed Jan 24 22:10:14 2024 +0800 [Datasets] modify NoCaps data path and prompts (#17) * Merge commit '767f7e2cae60cf67ec5878234d84321395a3ed15' * Update dataset paths and improve user prompts commit 520c7a2cafe60810aca79df814ce6829d4576032 Author: jzhang38 Date: Wed Jan 24 13:56:51 2024 +0800 black commit 3a633240327c078fa4f5a75dbd38ad5bc0d468dd Author: jzhang38 Date: Wed Jan 24 13:55:43 2024 +0800 add mmme commit b40d522b6bf483ebdfbf5facd4573de0cf8a93f6 Author: Bo Li Date: Thu Jan 25 08:38:11 2024 +0000 Add coco_val and coco_test tasks to coco.yaml commit 5bf643f73d06f1e540897b753450352bb92fd9ec Author: Bo Li Date: Thu Jan 25 04:58:28 2024 +0000 Update dataset_path in flickr30k.yaml commit 95f110f0eef5196205bc501367e3642c57cc7a17 Author: Bo Li Date: Thu Jan 25 02:12:25 2024 +0000 Merge commit '842fbc6f2da7d9a118adf9ec27c3d8542d74168e' commit c844ae49b18c1334711832208b0359c9439fe1c0 Author: Bo Li Date: Thu Jan 25 02:10:18 2024 +0000 Add submission folder and update file paths for storing prediction results commit 842fbc6f2da7d9a118adf9ec27c3d8542d74168e Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Thu Jan 25 09:47:31 2024 +0800 [Dataset] Add flickr30k (#18) * Add flickr30k support * Black lint * Align prompt with NoCaps commit 4bf0504fabc3b62f356c467b2fd1119083d27313 Author: Li Bo Date: Wed Jan 24 22:10:14 2024 +0800 [Datasets] modify NoCaps data path and prompts (#17) * Merge commit '767f7e2cae60cf67ec5878234d84321395a3ed15' * Update dataset paths and improve user prompts commit 
f0446227f0dd93651e9d6c06254bbf5212ede2dd Merge: c6370bf a0b87f5 Author: Li Bo Date: Wed Jan 24 22:10:07 2024 +0800 Merge branch 'main' into dev/bli_add_datasets commit 1e1f6cfccba758dc606fa4217102518fab73c936 Author: Bo Li Date: Wed Jan 24 14:08:06 2024 +0000 Update dataset paths and improve user prompts commit 966933754b9e5179995b3ab41d746603e13e75c6 Author: Bo Li Date: Wed Jan 24 11:52:33 2024 +0000 Merge commit '767f7e2cae60cf67ec5878234d84321395a3ed15' commit 767f7e2cae60cf67ec5878234d84321395a3ed15 Author: Li Bo Date: Wed Jan 24 19:51:34 2024 +0800 Add output path file naming convention (#16) Update datetime format in get_datetime_str() function * Fix bug in login functionality * create vqav2_val * Update vqav2_test.yaml * Update vqav2_test.yaml * Update vqav2_val.yaml --------- Co-authored-by: Li Bo * vizwiz dataset (#24) * Merge commit '95ef3ea519cbd772924f9a6afa5394979eb00432' * Update dataset paths and improve user prompts * Add submission folder and update file paths for storing prediction results * Merge commit 'ecb47d73d6e000b472be6c5c0cdc9413c7734384' * Update dataset_path in flickr30k.yaml * Add coco_val and coco_test tasks to coco.yaml * Squashed commit of the following: commit 4d48d0c9b88e62dfebe05ec909b7f1851e9cd75d Author: jzhang38 Date: Thu Jan 25 11:59:12 2024 +0800 refactor multi model code commit 4a4b7bec200c72332b61a0c277cd8f8a34e4f721 Author: jzhang38 Date: Thu Jan 25 11:51:16 2024 +0800 print table at the end commit 63739fc6fa0a462d807ae81de0db0173102de584 Author: jzhang38 Date: Thu Jan 25 11:20:59 2024 +0800 add yaml config to supprot multi-model eval commit edcc752f97ea3845cefad56624e5d2855066f680 Author: jzhang38 Date: Thu Jan 25 10:39:42 2024 +0800 black commit 41f4b63d3a6e83babe92bac32a7432a8ef740bb5 Merge: 7e8b57d 4d11dce Author: jzhang38 Date: Thu Jan 25 10:37:57 2024 +0800 resolve conflicts i… * Update LLaVA-Bench name in README.md * Make sure that gen_kwargs are added to gpt4v * Merge pull request #13 from 
EvolvingLMMs-Lab/pufanyi/mmbench_fix [Tasks] Fix MMBench * Merge pull request #9 from EvolvingLMMs-Lab/pufanyi/pip_dev update version to 0.1.1 * Make sure that gen_kwargs are added to gpt4v (#68) * Merge commits from lmms-eval/main * Add worldqa task and llava_vid * Adjust snapshot download, we handle the unzip file and the loaded path for the user * Add sft_eval * Generate until strip all response for process results * Squashed commit of the following: commit 75445d4426f491f6369a1138fd3352cb7c9e74fd Author: kcz358 Date: Tue Mar 19 07:49:08 2024 +0000 Add worldqa task and llava_vid * Fix llava repr issue * Fix llava repr error (#70) * Refactor logging and sanitization in WandbLogger * updates * [Feat] Enable world qa evaluation (#71) * Add evaluation metric for worldqa mc and mc_ppl * Fix a small bug in qwen model * put images at the very from MMMU * debug batchsize >1 for llava * Fix image aspect ratio setting in Llava model * Fix setting image aspect ratio in Llava model * change image list * Fix LLaVA import error and update attention implementation * Update dependencies in pyproject.toml and add attn_implementation parameter in Llava class constructor * Squashed commit of the following: commit b1bd1b1644a16f49d1322a2483137254f1711c66 Author: Bo Li Date: Sat Mar 30 17:39:44 2024 +0000 Update dependencies in pyproject.toml and add attn_implementation parameter in Llava class constructor commit 5c0a7ab7d0cee5cab460bfdf674cbd793a1043ca Author: Bo Li Date: Sat Mar 30 16:14:48 2024 +0000 Fix LLaVA import error and update attention implementation * Fix endless warning in fuyu and instructblip * update * updates * remove debug traits * 🐞 fix(): set reasonble patience time * Fix token decoding error in Llava class * Add overwrite config to avoid longer than context length * Delete redundant args * Fix default parameters value error * merge * Remove builder script requirement in video eval * Add videochatgpt support * Add LLaMA-Vid support * Add video-llava support * 
merge llava_vid from kc/vid_eval to kr_ego * [WIP] adding mmbench dev evaluation (#75) * WIP * Update GPT evaluation model name and sys prompt * 🛠️ Scale accuracy to percentage The accuracy value is now multiplied by 100 in the aggregation function to represent it as a percentage. Regarding the evaluation process, `math` module importation and refactoring reduce progress log verbosity by logging every 100 evaluations instead of 10. It prevents potential logging overflow. Handling of NaN values is added to ensure 'default_value' is set in case of missing data, avoiding errors in split, category, and l2-category assignments. Finally, reporting of categorical and l2-categorical accuracies is streamlined through a new `calculate_hit_rates` function, improving code readability and maintenance. Issue refs: #1427, #1533 * Update GPT evaluation model name and API configuration * Refactor MMBench_Evaluator class to handle missing columns * Add print statements for detailed results in MMBench-CN(CC), MMBench-CN(Dev), and MMBench-EN(Dev) evaluations * Refactor MMBench-CN and MMBench-EN evaluation functions * 🔄 Refactor result processing and logging logic - Simplified the result processing functions across different utility modules (`cc_utils.py`, `cn_utils.py`, `en_utils.py`) to unify the handling of multiple-choice options. Now, all options ("A" to "E") are dynamically added to the result data, and default to "nan" if not provided in the document. - Removed redundant keys directly from the process results dict creation to avoid clutter and align with the new dynamic addition of options. - In `mmbench_evals.py`, removed the unnecessary check for all splits being 'dev' and streamlined the evaluation loop by eliminating the progress bar (tqdm) for a cleaner log output. - Commented-out code and verbose logging during evaluation, which may have interfered with performance, has been removed for a more efficient and less intrusive logging experience. 
This cleanup reduces redundancy in the codebase and improves evaluation performance. Refs #2045 --------- Co-authored-by: Bo Li * change frame num to 32 * change submission format * change builder file * complete the huggingface dataset building and downloading * add make_hf script * Remove redundant json files * remove redundant py files * remove redundant txt files * remove redundant file that makes hugginface dataset * remove redundant print * remove redundant tasks * add videochatgpt task * add activitynetqa * enable direct scoring for egoschema subset * add videochatgpt gpt-eval * "add some builder script files for activitynetqa as an example" * Update LMMS-Eval and LLaVA models * Update LMMS-Eval and LLaVA models * LiveBench (#89) * try screen_shot * try screenshot * update * screen shoter * update * update * add sampling * update * Add image formatting and generation retry logic * lint * update * Refactor data_capturer.py and qa_generator.py * update * update * update * Refactor RollingBenchData class to include subtask attribute * chore: Refactor data_capturer.py and qa_generator.py * rolling_bench=>live_bench * Update function to handle edge cases * update * update * update * Refactor code and handle exceptions in LiveBench class * update * Refactor load_driver function and update prompt.md * Add JSON import and update text content in qa_generator.py * Update example output file and handle dataset loading error * lint * update example * Update dataset initialization in example.ipynb * Update lmms_eval/live_bench/example.ipynb * clear outputs * self collect image * fix a bug * fix * lint * prompt * add score * update score * Refactor score calculation in GPT4VScoreGetter * Update website capture settings and fix image size in example.ipynb * Refactor LiveBenchData class and add ClaudeScoreGetter * Update ClaudeScoreGetter to include system prompt * update * update * checker for gpt4v * Add checker attribute to LiveBenchData class * Refactor code and update 
prompt text * update api * clear output * live * bigger max_new_tokens * gemini * fix size * fix --------- Co-authored-by: Bo Li * Update LMMS-Eval and LLaVA models * Update LMMS-Eval and LLaVA models * add cvrr benchmark object count dimension * Fix file path and add unzip functionality * Delete unused files and update model prompt for GPT4V * Update LMMS evaluation tasks and models * Refactor MultiChoiceRegexFilter class to directly extract option letter from model response * Delete unused files and scripts for ActivityNetQA dataset generation * Update NUM_SECONDS_TO_SLEEP value and log response in case of failure * add newest activitynetqa dataset * add newest egoschema dataset * add newest videochatgpt dataset * add newest cvrr dataset * reformat the files --------- Co-authored-by: kcz358 <92624596+kcz358@users.noreply.github.com> Co-authored-by: Pu Fanyi Co-authored-by: Zhang Peiyuan Co-authored-by: JvThunder <44111143+JvThunder@users.noreply.github.com> Co-authored-by: XinrunDu <154438029+XinrunDu@users.noreply.github.com> Co-authored-by: ygjin11 <1633504509@qq.com> Co-authored-by: JvThunder Co-authored-by: kcz358 Co-authored-by: Bo Li Co-authored-by: CarryWho --- lmms_eval/api/task.py | 11 +- lmms_eval/models/__init__.py | 1 + lmms_eval/models/batch_gpt4.py | 202 +++++++ lmms_eval/models/fuyu.py | 4 +- lmms_eval/models/gemini_api.py | 30 +- lmms_eval/models/gpt4v.py | 3 +- lmms_eval/models/idefics2.py | 2 +- lmms_eval/models/qwen_vl.py | 3 +- lmms_eval/models/videoChatGPT.py | 188 ++++++ lmms_eval/models/xcomposer2_4KHD.py | 2 +- .../activitynetqa/_default_template_yaml | 13 + .../activitynetqa_generation.yaml | 25 + lmms_eval/tasks/activitynetqa/utils.py | 312 ++++++++++ lmms_eval/tasks/ai2d/ai2d.yaml | 53 +- lmms_eval/tasks/ai2d/utils.py | 47 +- lmms_eval/tasks/coco_cap/utils.py | 2 +- lmms_eval/tasks/cvrr/_default_template_yaml | 13 + .../cvrr/cvrr_object_instance_count.yaml | 14 + lmms_eval/tasks/cvrr/utils.py | 301 ++++++++++ 
.../tasks/egoschema/_default_template_yaml | 9 + .../tasks/egoschema/egoschema_generation.yaml | 15 + lmms_eval/tasks/egoschema/egoschema_mc.yaml | 15 + .../tasks/egoschema/egoschema_mcppl.yaml | 15 + .../tasks/egoschema/egoschema_subset.yaml | 17 + .../tasks/egoschema/egoschema_subset_gen.yaml | 16 + lmms_eval/tasks/egoschema/utils.py | 156 +++++ lmms_eval/tasks/flickr30k/utils.py | 2 +- lmms_eval/tasks/livebench/livebench.yaml | 2 +- lmms_eval/tasks/livebench/utils.py | 2 + .../mix_evals/mix_evals_video2text_mc.yaml | 1 - lmms_eval/tasks/mix_evals/utils.py | 54 +- .../mmbench/_default_template_mmbench_cn_yaml | 0 .../mmbench/_default_template_mmbench_en_yaml | 0 lmms_eval/tasks/mme/mme.yaml | 3 + lmms_eval/tasks/nocaps/utils.py | 2 +- lmms_eval/tasks/ocrbench/utils.py | 2 +- lmms_eval/tasks/refcoco+/utils.py | 2 +- lmms_eval/tasks/refcoco/utils.py | 2 +- lmms_eval/tasks/refcocog/utils.py | 2 +- lmms_eval/tasks/synthdog/donut_evaluator.py | 8 +- lmms_eval/tasks/textcaps/utils.py | 2 +- .../tasks/video_detail_description/utils.py | 4 + .../tasks/videochatgpt/_default_template_yaml | 13 + lmms_eval/tasks/videochatgpt/utils.py | 551 ++++++++++++++++++ .../videochatgpt_consistency.yaml | 24 + .../videochatgpt/videochatgpt_context.yaml | 13 + .../videochatgpt_correctness.yaml | 14 + .../videochatgpt_detailed_orientation.yaml | 13 + .../videochatgpt/videochatgpt_temporal.yaml | 23 + pyproject.toml | 4 +- tools/make_activitynetqa.ipynb | 120 ++++ 51 files changed, 2233 insertions(+), 99 deletions(-) create mode 100755 lmms_eval/models/batch_gpt4.py create mode 100644 lmms_eval/models/videoChatGPT.py create mode 100644 lmms_eval/tasks/activitynetqa/_default_template_yaml create mode 100755 lmms_eval/tasks/activitynetqa/activitynetqa_generation.yaml create mode 100755 lmms_eval/tasks/activitynetqa/utils.py create mode 100644 lmms_eval/tasks/cvrr/_default_template_yaml create mode 100755 lmms_eval/tasks/cvrr/cvrr_object_instance_count.yaml create mode 100755 
lmms_eval/tasks/cvrr/utils.py create mode 100644 lmms_eval/tasks/egoschema/_default_template_yaml create mode 100755 lmms_eval/tasks/egoschema/egoschema_generation.yaml create mode 100755 lmms_eval/tasks/egoschema/egoschema_mc.yaml create mode 100755 lmms_eval/tasks/egoschema/egoschema_mcppl.yaml create mode 100755 lmms_eval/tasks/egoschema/egoschema_subset.yaml create mode 100755 lmms_eval/tasks/egoschema/egoschema_subset_gen.yaml create mode 100755 lmms_eval/tasks/egoschema/utils.py mode change 100644 => 100755 lmms_eval/tasks/mmbench/_default_template_mmbench_cn_yaml mode change 100644 => 100755 lmms_eval/tasks/mmbench/_default_template_mmbench_en_yaml create mode 100644 lmms_eval/tasks/videochatgpt/_default_template_yaml create mode 100755 lmms_eval/tasks/videochatgpt/utils.py create mode 100755 lmms_eval/tasks/videochatgpt/videochatgpt_consistency.yaml create mode 100755 lmms_eval/tasks/videochatgpt/videochatgpt_context.yaml create mode 100755 lmms_eval/tasks/videochatgpt/videochatgpt_correctness.yaml create mode 100755 lmms_eval/tasks/videochatgpt/videochatgpt_detailed_orientation.yaml create mode 100755 lmms_eval/tasks/videochatgpt/videochatgpt_temporal.yaml create mode 100755 tools/make_activitynetqa.ipynb diff --git a/lmms_eval/api/task.py b/lmms_eval/api/task.py index 8a506048..35716b24 100755 --- a/lmms_eval/api/task.py +++ b/lmms_eval/api/task.py @@ -10,6 +10,7 @@ from glob import glob import shutil from tqdm import tqdm +import subprocess import datasets from datasets import Image, Sequence @@ -19,7 +20,7 @@ from datasets import DownloadConfig from typing import Union, List, Any from collections.abc import Callable -from tenacity import retry, stop_after_attempt, wait_fixed +from tenacity import retry, stop_after_attempt, wait_fixed, stop_after_delay from huggingface_hub import snapshot_download from lmms_eval import utils @@ -679,19 +680,20 @@ def _prepare_metric_and_aggregation(self): eval_logger.warning(f"[Task: {self._config.task}] metric 
{metric_name} is defined, but higher_is_better is not. " f"using default " f"higher_is_better={is_higher_better(metric_name)}") self._higher_is_better[metric_name] = is_higher_better(metric_name) - @retry(stop=stop_after_attempt(5), wait=wait_fixed(2)) + @retry(stop=(stop_after_attempt(5) | stop_after_delay(60)), wait=wait_fixed(2)) def download(self, dataset_kwargs=None) -> None: # If the dataset is a video dataset, # Recursively search whether their is a zip and unzip it to the huggingface home if dataset_kwargs is not None and "video" in dataset_kwargs and dataset_kwargs["video"]: hf_home = os.getenv("HF_HOME", "~/.cache/huggingface/") cache_dir = dataset_kwargs["cache_dir"] - cache_dir = os.path.join(hf_home, cache_dir) + cache_path = snapshot_download(repo_id=self.DATASET_PATH, repo_type="dataset") zip_files = glob(os.path.join(cache_path, "**/*.zip"), recursive=True) - if not os.path.exists(cache_dir): + if not os.path.exists(cache_dir) and len(zip_files) > 0: for zip_file in zip_files: + print(f"Unzipping {zip_file} to {cache_dir}") shutil.unpack_archive(zip_file, cache_dir) if "builder_script" in dataset_kwargs: @@ -995,6 +997,7 @@ def construct_requests(self, doc_id: int, ctx: str, **kwargs) -> Union[List[Inst arguments = (ctx, self.config.generation_kwargs, self.doc_to_visual, doc_id, self.config.task, split) return Instance(request_type=self.OUTPUT_TYPE, arguments=arguments, idx=0, **kwargs) + @retry(stop=(stop_after_attempt(5) | stop_after_delay(1200)), wait=wait_fixed(2)) def process_results(self, doc, results): if self.OUTPUT_TYPE == "generate_until": results[0] = results[0].strip() diff --git a/lmms_eval/models/__init__.py b/lmms_eval/models/__init__.py index 8c683cb4..87292a3d 100755 --- a/lmms_eval/models/__init__.py +++ b/lmms_eval/models/__init__.py @@ -7,6 +7,7 @@ "llava": "Llava", "qwen_vl": "Qwen_VL", "fuyu": "Fuyu", + "batch_gpt4": "BatchGPT4", "gpt4v": "GPT4V", "instructblip": "InstructBLIP", "minicpm_v": "MiniCPM_V", diff --git 
a/lmms_eval/models/batch_gpt4.py b/lmms_eval/models/batch_gpt4.py new file mode 100755 index 00000000..9c6d5ba3 --- /dev/null +++ b/lmms_eval/models/batch_gpt4.py @@ -0,0 +1,202 @@ +# Standard library imports +from copy import deepcopy +from io import BytesIO +import base64 +import logging +import os +import time +import json + +# Related third-party imports +from accelerate import Accelerator, DistributedType, InitProcessGroupKwargs +from accelerate.state import AcceleratorState +import numpy as np +from PIL import Image +import requests as url_requests +from tqdm import tqdm +from openai import OpenAI + +# Local application/library specific imports +from lmms_eval.api.instance import Instance +from lmms_eval.api.model import lmms +from lmms_eval.api.registry import register_model +from lmms_eval import utils + +# Conditional imports +try: + from decord import VideoReader, cpu +except ImportError: + eval_logger = logging.getLogger("lmms-eval") + eval_logger.info("Decord is not installed. 
Video input will not be supported.") + +# Constants and global configurations +API_TYPE = os.getenv("API_TYPE", "openai") +NUM_SECONDS_TO_SLEEP = 5 + +if API_TYPE == "openai": + API_URL = os.getenv("OPENAI_API_URL", "https://api.openai.com/v1/chat/completions") + API_KEY = os.getenv("OPENAI_API_KEY", "YOUR_API_KEY") + headers = { + "Authorization": f"Bearer {API_KEY}", + "Content-Type": "application/json", + } +elif API_TYPE == "azure": + API_URL = os.getenv("AZURE_ENDPOINT", "https://api.cognitive.microsoft.com/sts/v1.0/issueToken") + API_KEY = os.getenv("AZURE_API_KEY", "YOUR_API_KEY") + headers = { + "api-key": API_KEY, + "Content-Type": "application/json", + } + + +@register_model("batch_gpt4") +class BatchGPT4(lmms): + def __init__( + self, + model_version: str = "gpt-4o", + api_key: str = API_KEY, + api_url: str = API_URL, + modality: str = "image", + max_frames_for_video: int = 10, + timeout: int = 120, + **kwargs, + ) -> None: + super().__init__() + # Manually set a image token for GPT4V so that we can search for it + # and split the text and image + # Here we just use the same token as llava for convenient + self.model_version = model_version + self.modality = modality + self.max_frames_for_video = max_frames_for_video + self.image_token = "" + self.timeout = timeout + + self.api_key = api_key + self.api_url = api_url + self.client = OpenAI(api_key=api_key) + + accelerator = Accelerator() + assert accelerator.state.local_process_index == 0, "BatchGPT4 does not support distributed inference." + assert accelerator.state.num_processes == 1, "BatchGPT4 does not support distributed inference." 
+ + # Function to encode the image + def encode_image(self, image: Image): + output_buffer = BytesIO() + image.save(output_buffer, format="PNG") + byte_data = output_buffer.getvalue() + base64_str = base64.b64encode(byte_data).decode("utf-8") + return base64_str + + # Function to encode the video + def encode_video(self, video_path, for_get_frames_num): + vr = VideoReader(video_path, ctx=cpu(0)) + total_frame_num = len(vr) + uniform_sampled_frames = np.linspace(0, total_frame_num - 1, for_get_frames_num, dtype=int) + frame_idx = uniform_sampled_frames.tolist() + frames = vr.get_batch(frame_idx).asnumpy() + + base64_frames = [] + for frame in frames: + img = Image.fromarray(frame) + output_buffer = BytesIO() + img.save(output_buffer, format="PNG") + byte_data = output_buffer.getvalue() + base64_str = base64.b64encode(byte_data).decode("utf-8") + base64_frames.append(base64_str) + + return base64_frames + + def flatten(self, input): + new_list = [] + for i in input: + for j in i: + new_list.append(j) + return new_list + + def generate_until(self, requests): + # Prepare the batch requests data + requests_data = {} + pbar = tqdm(total=len(requests), disable=(self.rank != 0), desc="Batch Preparing") + for idx, (contexts, gen_kwargs, doc_to_visual, doc_id, task, split) in enumerate([reg.args for reg in requests]): + visuals = [doc_to_visual(self.task_dict[task][split][doc_id])] + visuals = self.flatten(visuals) + imgs = [] + for visual in visuals: + if self.modality == "image": + img = self.encode_image(visual) + imgs.append(img) + elif self.modality == "video": + frames = self.encode_video(visual, self.max_frames_for_video) + imgs.extend(frames) + + messages = [] + if self.image_token not in contexts: + messages.append({"role": "user", "content": contexts}) + for img in imgs: + messages.append({"role": "user", "content": f"data:image/jpeg;base64,{img}"}) + else: + contexts_split = contexts.split(self.image_token) + for idx, context in enumerate(contexts_split): + if idx 
< len(imgs): + messages.append({"role": "user", "content": context}) + messages.append({"role": "user", "content": f"data:image/jpeg;base64,{imgs[idx]}"}) + if len(contexts_split) > len(imgs): + messages.append({"role": "user", "content": contexts_split[-1]}) + + requests_data[f"request-{idx}"] = {"model": self.model_version, "messages": messages, "max_tokens": gen_kwargs.get("max_new_tokens", 1024)} + pbar.update(1) + + file_path = os.getenv("HF_HOME", "~/.cache/huggingface") + f"/batchinput_{len(requests_data)}.jsonl" + file_path = self.create_batch_input_file(requests_data, file_path) + file_id = self.upload_input_file(file_path) + + batch_response = self.create_batch(file_id, metadata={"description": "Batch Processing for GPT-4"}) + batch_status = self.check_batch_status(batch_response.id) + while True: + batch_status = self.check_batch_status(batch_response.id) + if batch_status.status == "completed": + eval_logger.info("Batch processing completed.") + batch_results = self.retrieve_batch_results(batch_status.output_file_id) + res = [result["response"]["choices"][0]["message"]["content"] for result in json.loads(batch_results)] + return res + elif batch_status.status == "failed": + eval_logger.info("Batch processing failed.") + res = ["Batch failed"] * len(requests) + return res + else: + eval_logger.info(f"Batch status: {batch_status.status}. 
Retrying in {NUM_SECONDS_TO_SLEEP} seconds.") + time.sleep(NUM_SECONDS_TO_SLEEP) + + def loglikelihood(self, requests): + # TODO + assert False, "GPT4V not support" + + def create_batch_input_file(self, requests_data, file_path="batchinput.jsonl"): + with open(file_path, "w") as file: + for request_id, data in requests_data.items(): + json_record = json.dumps({"custom_id": request_id, "method": "POST", "url": "/v1/chat/completions", "body": data}) + file.write(json_record + "\n") + return file_path + + def upload_input_file(self, file_path): + with open(file_path, "rb") as file: + response = self.client.files.create(file=file, purpose="batch") + return response.id + + def create_batch(self, file_id, metadata=None): + if metadata is None: + metadata = {} + response = self.client.batches.create(input_file_id=file_id, endpoint="/v1/chat/completions", completion_window="24h", metadata=metadata) + return response + + def check_batch_status(self, batch_id): + return self.client.batches.retrieve(batch_id) + + def retrieve_batch_results(self, file_id): + return self.client.files.content(file_id) + + def cancel_batch(self, batch_id): + return self.client.batches.cancel(batch_id) + + def list_batches(self, limit=10): + return self.client.batches.list(limit=limit) diff --git a/lmms_eval/models/fuyu.py b/lmms_eval/models/fuyu.py index 7a50c174..d4f7a483 100755 --- a/lmms_eval/models/fuyu.py +++ b/lmms_eval/models/fuyu.py @@ -85,7 +85,7 @@ def __init__( self._rank = 0 self._word_size = 1 - '''if accelerator.num_processes > 1: + """if accelerator.num_processes > 1: assert accelerator.distributed_type in [ DistributedType.FSDP, DistributedType.MULTI_GPU, @@ -98,7 +98,7 @@ def __init__( if self.accelerator.is_local_main_process: eval_logger.info(f"Using {accelerator.num_processes} devices with data parallelism") self._rank = self.accelerator.local_process_index - self._world_size = self.accelerator.num_processes''' + self._world_size = self.accelerator.num_processes""" @property 
def config(self): diff --git a/lmms_eval/models/gemini_api.py b/lmms_eval/models/gemini_api.py index 7ef1c19d..31a782e4 100644 --- a/lmms_eval/models/gemini_api.py +++ b/lmms_eval/models/gemini_api.py @@ -2,7 +2,6 @@ import os import time import logging -import google.generativeai as genai from PIL import Image from typing import List, Tuple @@ -13,9 +12,16 @@ eval_logger = logging.getLogger("lmms-eval") -NUM_SECONDS_TO_SLEEP = 30 -GOOGLE_API_KEY=os.getenv('GOOGLE_API_KEY') -genai.configure(api_key=GOOGLE_API_KEY) +try: + import google.generativeai as genai + + NUM_SECONDS_TO_SLEEP = 5 + GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY") + genai.configure(api_key=GOOGLE_API_KEY) + +except Exception as e: + eval_logger.error(f"Error importing generativeai: {str(e)}") + genai = None @register_model("gemini_api") @@ -37,7 +43,19 @@ def flatten(self, input): for j in i: new_list.append(j) return new_list - + + def get_image_size(self, image): + # Create a BytesIO object to store the image bytes + img_byte_array = io.BytesIO() + + # Save the image to the BytesIO object + image.save(img_byte_array, format="PNG") + + # Get the size of the BytesIO object + img_size = img_byte_array.tell() + + return img_size + def get_image_size(self, image): # Create a BytesIO object to store the image bytes img_byte_array = io.BytesIO() @@ -67,7 +85,7 @@ def generate_until(self, requests) -> List[str]: visuals = [doc_to_visual(self.task_dict[task][split][doc_id])] visuals = self.flatten(visuals) - + message = [contexts] + visuals for attempt in range(5): diff --git a/lmms_eval/models/gpt4v.py b/lmms_eval/models/gpt4v.py index 3aec2fad..bf8f66ed 100755 --- a/lmms_eval/models/gpt4v.py +++ b/lmms_eval/models/gpt4v.py @@ -25,7 +25,7 @@ from PIL import Image API_TYPE = os.getenv("API_TYPE", "openai") -NUM_SECONDS_TO_SLEEP = 5 +NUM_SECONDS_TO_SLEEP = 30 eval_logger = logging.getLogger("lmms-eval") if API_TYPE == "openai": @@ -178,6 +178,7 @@ def generate_until(self, requests) -> List[str]: 
time.sleep(NUM_SECONDS_TO_SLEEP) else: # If this was the last attempt, log and return empty eval_logger.error(f"All 5 attempts failed. Last error message: {str(e)}") + eval_logger.error(f"Response: {response}") content = "" res.append(content) pbar.update(1) diff --git a/lmms_eval/models/idefics2.py b/lmms_eval/models/idefics2.py index 979ae050..e978f907 100644 --- a/lmms_eval/models/idefics2.py +++ b/lmms_eval/models/idefics2.py @@ -198,7 +198,7 @@ def _collate(x): gen_kwargs = all_gen_kwargs[0] # until = gen_kwargs.pop("until", None) - image_aspect_ratio = gen_kwargs.pop("image_aspect_ratio", None) + image_aspect_ratio = gen_kwargs.pop("image_aspect_ratio", None) if "max_new_tokens" not in gen_kwargs: gen_kwargs["max_new_tokens"] = 1024 if "temperature" not in gen_kwargs: diff --git a/lmms_eval/models/qwen_vl.py b/lmms_eval/models/qwen_vl.py index 4d9cdbb1..e55ad7c9 100755 --- a/lmms_eval/models/qwen_vl.py +++ b/lmms_eval/models/qwen_vl.py @@ -242,12 +242,11 @@ def _collate(x): if len(visual_paths) == 0: for context in contexts: query.append({"text": context}) - else: + else: for visual_path, context in zip(visual_paths, contexts): query.append({"image": visual_path}) query.append({"text": context}) - questions = self.tokenizer.from_list_format(query) input_ids = self.tokenizer(questions, return_tensors="pt", padding="longest") diff --git a/lmms_eval/models/videoChatGPT.py b/lmms_eval/models/videoChatGPT.py new file mode 100644 index 00000000..94f61803 --- /dev/null +++ b/lmms_eval/models/videoChatGPT.py @@ -0,0 +1,188 @@ +import os +from lmms_eval import utils +from lmms_eval.api.instance import Instance +from lmms_eval.api.model import lmms +from lmms_eval.api.registry import register_model + +from accelerate import Accelerator, DistributedType, InitProcessGroupKwargs +from accelerate.state import AcceleratorState +from huggingface_hub import snapshot_download +import torch + +from datetime import timedelta +import logging +from typing import List, Tuple, 
Optional, Union +from tqdm import tqdm + +try: + from video_chatgpt.eval.model_utils import load_video, initialize_model + from video_chatgpt.inference import video_chatgpt_infer, video_chatgpt_infer_ppl, get_spatio_temporal_features_torch +except ImportError: + eval_logger = logging.getLogger("lmms-eval") + eval_logger.info("Failed to import video_chatgpt modules") + +eval_logger = logging.getLogger("lmms-eval") + + +@register_model("video_chatgpt") +class VideoChatGPT(lmms): + def __init__( + self, + batch_size: Optional[Union[int, str]] = 1, + projection_path: str = "MBZUAI/Video-ChatGPT-7B", + model_path: str = "mmaaz60/LLaVA-7B-Lightening-v1-1", + device_map="cuda:0", + device: Optional[str] = "cuda:0", + num_frm: Optional[Union[int, str]] = 100, + ) -> None: + super().__init__() + self.batch_size_per_gpu = int(batch_size) + self.num_frm = int(num_frm) + accelerator_kwargs = InitProcessGroupKwargs(timeout=timedelta(weeks=52)) + accelerator = Accelerator(kwargs_handlers=[accelerator_kwargs]) + if accelerator.num_processes > 1: + self._device = torch.device(f"cuda:{accelerator.local_process_index}") + self.device_map = f"cuda:{accelerator.local_process_index}" + elif accelerator.num_processes == 1 and device_map == "auto": + self._device = torch.device(device) + self.device_map = device_map + else: + self._device = torch.device(f"cuda:{accelerator.local_process_index}") + self.device_map = f"cuda:{accelerator.local_process_index}" + try: + self.model, self.vision_tower, self.tokenizer, self.image_processor, self.video_token_len = initialize_model(model_path, projection_path, device=self.device) + except: + eval_logger.info("Does not find the model from the path you provide, try downloading from the hf repo.") + model_path = snapshot_download(repo_id=model_path) + projection_path = os.path.join(snapshot_download(repo_id=projection_path), "video_chatgpt-7B.bin") + self.model, self.vision_tower, self.tokenizer, self.image_processor, self.video_token_len = 
initialize_model(model_path, projection_path, device=self.device) + + if accelerator.num_processes > 1: + assert accelerator.distributed_type in [DistributedType.FSDP, DistributedType.MULTI_GPU, DistributedType.DEEPSPEED], "Unsupported distributed type provided. Only DDP and FSDP are supported." + # If you want to use DistributedType.DEEPSPEED, you have to run accelerate config before using the model + # Also, you have to select zero stage 0 (equivalent to DDP) in order to make the prepare model works + # I tried to set different parameters in the kwargs to let default zero 2 stage works, but it didn't work. + if accelerator.distributed_type == DistributedType.DEEPSPEED: + kwargs = { + "train_micro_batch_size_per_gpu": self.batch_size_per_gpu, + "train_batch_size": self.batch_size_per_gpu * accelerator.num_processes, + } + AcceleratorState().deepspeed_plugin.deepspeed_config_process(must_match=True, **kwargs) + eval_logger.info("Detected that you are using DistributedType.DEEPSPEED. Make sure you run `accelerate config` and set zero stage to 0") + if accelerator.distributed_type == DistributedType.FSDP or accelerator.distributed_type == DistributedType.DEEPSPEED: + self._model = accelerator.prepare(self.model) + else: + self._model = accelerator.prepare_model(self.model, evaluation_mode=True) + self.accelerator = accelerator + if self.accelerator.is_local_main_process: + eval_logger.info(f"Using {accelerator.num_processes} devices with data parallelism") + self._rank = self.accelerator.local_process_index + self._world_size = self.accelerator.num_processes + elif accelerator.num_processes == 1 and device_map == "auto": + eval_logger.info(f"Using {accelerator.num_processes} devices with tensor parallelism") + self._rank = 0 + self._word_size = 1 + else: + eval_logger.info(f"Using single device: {self._device}") + self.model.to(self._device) + self._rank = 0 + self._world_size = 1 + + def flatten(self, input): + new_list = [] + for i in input: + for j in i: + 
new_list.append(j) + return new_list + + def generate_until(self, requests) -> List[str]: + res = [] + pbar = tqdm(total=len(requests), disable=(self.rank != 0), desc="Model Responding") + + for contexts, gen_kwargs, doc_to_visual, doc_id, task, split in [reg.args for reg in requests]: + # encode, pad, and truncate contexts for this batch + visuals = [doc_to_visual(self.task_dict[task][split][doc_id])] + visuals = self.flatten(visuals) + videos = [] + for visual in visuals: + video_frames = load_video(visual, num_frm=self.num_frm) + # VideoChatGPT load video return a list of PIL Image + videos += video_frames + + output = video_chatgpt_infer( + video_frames, contexts, conv_mode="video-chatgpt_v1", model=self.model, vision_tower=self.vision_tower, tokenizer=self.tokenizer, image_processor=self.image_processor, video_token_len=self.video_token_len + ) + + res.append(output) + pbar.update(1) + + return res + + def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]: + res = [] + pbar = tqdm(total=len(requests), disable=(self.rank != 0), desc="Model Responding") + + for contexts, doc_to_target, doc_to_visual, doc_id, task, split in [reg.args for reg in requests]: + # encode, pad, and truncate contexts for this batch + if type(doc_to_target) == str: + continuation = doc_to_target + else: + continuation = doc_to_target(self.task_dict[task][split][doc_id]) + visuals = [doc_to_visual(self.task_dict[task][split][doc_id])] + visuals = self.flatten(visuals) + videos = [] + for visual in visuals: + video_frames = load_video(visual, num_frm=self.num_frm) + # VideoChatGPT load video return a list of PIL Image + videos += video_frames + image_tensor = self.image_processor.preprocess(videos, return_tensors="pt")["pixel_values"] + + # Move image tensor to GPU and reduce precision to half + image_tensor = image_tensor.half().to(self.device) + + # Generate video spatio-temporal features + with torch.no_grad(): + image_forward_outs = 
self.vision_tower(image_tensor, output_hidden_states=True) + frame_features = image_forward_outs.hidden_states[-2][:, 1:] # Use second to last layer as in LLaVA + video_spatio_temporal_features = get_spatio_temporal_features_torch(frame_features).cuda() + + outputs, input_ids, context_ids = video_chatgpt_infer_ppl( + # video_frames, + contexts, + continuation, + conv_mode="video-chatgpt_v1", + model=self.model, + vision_tower=self.vision_tower, + tokenizer=self.tokenizer, + image_processor=self.image_processor, + video_token_len=self.video_token_len, + video_spatio_temporal_features=video_spatio_temporal_features, + ) + + loss = outputs["loss"] + # loss = torch.exp(loss) + logits = outputs["logits"] + greedy_tokens = logits.argmax(dim=-1) + cont_toks = input_ids[:, context_ids.shape[1] :] # [1, seq] + greedy_tokens = greedy_tokens[:, context_ids.shape[1] : input_ids.shape[1]] # [1, seq] + max_equal = (greedy_tokens == cont_toks).all() + res.append((float(loss.item()), bool(max_equal))) + pbar.update(1) + pbar.close() + return res + + @property + def batch_size(self): + return self.batch_size_per_gpu + + @property + def device(self): + return self._device + + @property + def rank(self): + return self._rank + + @property + def world_size(self): + return self._world_size diff --git a/lmms_eval/models/xcomposer2_4KHD.py b/lmms_eval/models/xcomposer2_4KHD.py index 2c49e43b..b43f12e4 100644 --- a/lmms_eval/models/xcomposer2_4KHD.py +++ b/lmms_eval/models/xcomposer2_4KHD.py @@ -146,7 +146,7 @@ def generate_until(self, requests) -> List[str]: for contexts, gen_kwargs, doc_to_visual, doc_id, task, split in [reg.args for reg in requests]: # encode, pad, and truncate contexts for this batch if "[UNUSED_TOKEN_146]" not in contexts: - contexts = f"[UNUSED_TOKEN_146]user\n{contexts}[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n" + contexts = f"[UNUSED_TOKEN_146]user\n{contexts}[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n" visuals = 
[doc_to_visual(self.task_dict[task][split][doc_id])] visuals = self.flatten(visuals) diff --git a/lmms_eval/tasks/activitynetqa/_default_template_yaml b/lmms_eval/tasks/activitynetqa/_default_template_yaml new file mode 100644 index 00000000..ee83f523 --- /dev/null +++ b/lmms_eval/tasks/activitynetqa/_default_template_yaml @@ -0,0 +1,13 @@ +dataset_path: lmms-lab/ActivityNetQA +dataset_kwargs: + token: True + video: True + cache_dir: activitynetqa +model_specific_prompt_kwargs: + default: + pre_prompt: "" + post_prompt: "" + +metadata: + version: 0.0 + gpt_eval_model_name: gpt-3.5-turbo-0613 \ No newline at end of file diff --git a/lmms_eval/tasks/activitynetqa/activitynetqa_generation.yaml b/lmms_eval/tasks/activitynetqa/activitynetqa_generation.yaml new file mode 100755 index 00000000..a9bb69d2 --- /dev/null +++ b/lmms_eval/tasks/activitynetqa/activitynetqa_generation.yaml @@ -0,0 +1,25 @@ +dataset_name: "Generation" +task: "activitynetqa" +test_split: test +output_type: generate_until +doc_to_visual: !function utils.activitynetqa_doc_to_visual +doc_to_text: !function utils.activitynetqa_doc_to_text +doc_to_target: !function utils.activitynetqa_doc_to_answer +process_results: !function utils.activitynetqa_process_results +metric_list: + - metric: submission + aggregation: !function utils.activitynetqa_aggregate + higher_is_better: true + - metric: exact_match + higher_is_better: true +include: _default_template_yaml + +generation_kwargs: + until: + - "ASSISTANT:" + image_aspect_ratio: original + max_new_tokens: 64 + temperature: 0 + top_p: 0 + num_beams: 1 + do_sample: false diff --git a/lmms_eval/tasks/activitynetqa/utils.py b/lmms_eval/tasks/activitynetqa/utils.py new file mode 100755 index 00000000..b06be6bb --- /dev/null +++ b/lmms_eval/tasks/activitynetqa/utils.py @@ -0,0 +1,312 @@ +from decord import VideoReader, cpu +import numpy as np +import os +import sys +import datetime +import lmms_eval.tasks._task_utils.file_utils as file_utils +import json +import 
logging +import yaml +from pathlib import Path + +import requests +import openai +from openai import OpenAI +import time +import ast + +eval_logger = logging.getLogger("lmms-eval") + +with open(Path(__file__).parent / "_default_template_yaml", "r") as f: + raw_data = f.readlines() + safe_data = [] + for i, line in enumerate(raw_data): + # remove function definition since yaml load cannot handle it + if "!function" not in line: + safe_data.append(line) + + config = yaml.safe_load("".join(safe_data)) + +NUM_SECONDS_TO_SLEEP = 5 + +GPT_EVAL_MODEL_NAME = config["metadata"]["gpt_eval_model_name"] + +API_TYPE = os.getenv("API_TYPE", "openai") + +if API_TYPE == "openai": + API_URL = os.getenv("OPENAI_API_URL", "https://api.openai.com/v1/chat/completions") + API_KEY = os.getenv("OPENAI_API_KEY", "YOUR_API_KEY") + headers = { + "Authorization": f"Bearer {API_KEY}", + "Content-Type": "application/json", + } + +# Unzip all the zip files to HF HOME cache dir +HF_HOME = os.environ["HF_HOME"] +cache_dir = config["dataset_kwargs"]["cache_dir"] +cache_dir = os.path.join(HF_HOME, cache_dir) +cache_dir = os.path.join(cache_dir, "all_test") + + +# Pass in video path here +# Can only work correctly with video dataset +def activitynetqa_doc_to_visual(doc): + video_path = os.path.join(cache_dir, f"v_{doc['video_name']}.mp4") + extensions = ["mp4", "webm", "mkv"] + for ext in extensions: + modified_path = video_path.replace("mp4", ext) + if os.path.exists(modified_path): + return [modified_path] + sys.exit(f"video path:{video_path} does not exist, please check") + + +# This is the place where format the question +def activitynetqa_doc_to_text(doc, model_specific_prompt_kwargs=None): + if model_specific_prompt_kwargs is None: + model_specific_prompt_kwargs = {} + pre_prompt = "" + post_prompt = "" + if "pre_prompt" in model_specific_prompt_kwargs: + pre_prompt = model_specific_prompt_kwargs["pre_prompt"] + if "post_prompt" in model_specific_prompt_kwargs: + post_prompt = 
model_specific_prompt_kwargs["post_prompt"] + + raw_question = doc["question"] + question = raw_question + "?" + + # type_specific_prompts = { + # '3': "Please answer with 'yes' or 'no'.", + # '4': "Please state the color as a single word.", + # '7': "Please give the numerical answer." + # } + + # doc_type = str(doc['type']) + # type_specific_prompt = type_specific_prompts.get(doc_type, "") + + # return f"{pre_prompt}{question} {type_specific_prompt}{post_prompt}" + return f"{pre_prompt}{question}{post_prompt}" + + +def activitynetqa_doc_to_answer(doc): + return doc["answer"] + + +def get_eval(question, answer, pred, max_tokens: int, retries: int = 5): + global headers + + messages = [ + { + "role": "system", + "content": "You are an intelligent chatbot designed for evaluating the correctness of generative outputs for question-answer pairs. " + "Your task is to compare the predicted answer with the correct answer and determine if they match meaningfully. Here's how you can accomplish the task:" + "------" + "##INSTRUCTIONS: " + "- Focus on the meaningful match between the predicted answer and the correct answer.\n" + "- Consider synonyms or paraphrases as valid matches.\n" + "- Evaluate the correctness of the prediction compared to the answer.", + }, + { + "role": "user", + "content": f"Please evaluate the following video-based question-answer pair:\n\n" + f"Question: {question}\n" + f"Correct Answer: {answer}\n" + f"Predicted Answer: {pred}\n\n" + "Provide your evaluation only as a yes/no and score where the score is an integer value between 0 and 5, with 5 indicating the highest meaningful match. " + "Please generate the response in the form of a Python dictionary string with keys 'pred' and 'score', where value of 'pred' is a string of 'yes' or 'no' and value of 'score' is in INTEGER, not STRING." + "DO NOT PROVIDE ANY OTHER OUTPUT TEXT OR EXPLANATION. Only provide the Python dictionary string. 
" + "For example, your response should look like this: {'pred': 'yes', 'score': 4.8}.", + }, + ] + + payload = { + "model": GPT_EVAL_MODEL_NAME, + "messages": messages, + "temperature": 0, + "max_tokens": max_tokens, + } + + for attempt in range(retries): + try: + response = requests.post(API_URL, headers=headers, json=payload, timeout=60) + response.raise_for_status() # Raises HTTPError for bad responses + try: + response_data = response.json() # Attempt to parse JSON + except requests.exceptions.JSONDecodeError: + eval_logger.error(f"JSON decode error on attempt {attempt + 1}. Response text: {response.text}") + continue # Skip to next retry + content = response_data["choices"][0]["message"]["content"].strip() + if content != "": + return content, response_data["model"] + # Handle HTTP errors separately + except requests.exceptions.HTTPError as e: + eval_logger.error(f"HTTP error on attempt {attempt + 1}: {e}") + # Handle other requests-related errors + except requests.exceptions.RequestException as e: + eval_logger.error(f"Request exception on attempt {attempt + 1}: {e}") + except Exception as e: + eval_logger.error(f"Unexpected error on attempt {attempt + 1}: {e}") + + # Handle other unexpected errors + if attempt < retries - 1: + time.sleep(NUM_SECONDS_TO_SLEEP) + else: # If this was the last attempt, log and return empty + eval_logger.error(f"All {retries} attempts failed. Last error message: {e}") + return "", "" + + return "", "" + + +def parse_score(review): + try: + # Convert the string representation of a dictionary to an actual dictionary + review_dict = ast.literal_eval(review) + pred = review_dict.get("pred", "no") + score = review_dict.get("score", 0) + return [pred, float(score)] + except SyntaxError as e: + eval_logger.error(f"Syntax error parsing the review string: {e}. Review content: {review}") + return ["no", 0] + except ValueError as e: + eval_logger.error(f"Value error parsing the review string: {e}. 
Review content: {review}") + return ["no", 0] + except Exception as e: + eval_logger.error(f"Unexpected error parsing the review string: {e}. Review content: {review}") + return ["no", 0] + + +# we process answer and gpt_eval seperately, in case gpt is not stable +# so we obtained a submission file for answer first +# and then feed the submission file to gpt for scoring +def activitynetqa_process_results(doc, result): + """ + Args: + doc: a instance of the eval dataset + results: [pred] + Returns: + a dictionary + """ + # try: + # question = doc.get("question", "") + # answer = doc.get("answer", "") + # pred = result[0] + + # review, model_name = get_eval(question, answer, pred, 64) + # scores = parse_score(review) + # except Exception as e: + # eval_logger.error(f"Error for Question ID: {doc.get('question_id', 'Unknown')}: {e}") + # review = "Failed to Get a Proper Review." + # model_name = "Failed Request" + # scores = ['no', 0] + pred = result[0] + + data_dict = {"submission": {"video_name": doc["video_name"], "Q": doc["question"], "A": doc["answer"], "pred": pred, "question_id": doc["question_id"], "type": doc["type"]}} + + return data_dict + + +def activitynetqa_aggregate_submissions(results, args, task): + now_date_time = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S") + submission_file_name = f"activitynetqa-{task}-{now_date_time}.json" + path = file_utils.generate_submission_file(submission_file_name, args) + + with open(path, "w") as f: + json.dump(results, f, indent=4) + + eval_logger.info(f"Submission file saved to {path}") + + return path + + +def activitynetqa_print_scores(eval_file_path, args): + # Load the predictions from the result file + with open(eval_file_path, "r") as file: + evaluated_list = json.load(file) + + score_file_name = "scores.json" + path = file_utils.generate_submission_file(score_file_name, args) + + # Compute average score and final accuracy + # Initialize counters + yes_count = 0 + no_count = 0 + total_score = 0 + + # 
Iterate over the results to count correctness and sum scores + for result_dict in evaluated_list: + if result_dict["Correctness"] == "yes": + yes_count += 1 + else: + no_count += 1 + total_score += result_dict["score"] + + # Calculate accuracy and average score + accuracy = yes_count / (yes_count + no_count) if (yes_count + no_count) > 0 else 0 + average_score = total_score / len(evaluated_list) if evaluated_list else 0 + + # Print the results + print(f"Accuracy: {accuracy}") + print(f"Average Score: {average_score}") + + # Write the processed data to the scores file + with open(path, "w") as f: + json.dump({"accuracy": accuracy, "average_score": average_score}, f, indent=4) + + eval_logger.info(f"Score file saved to {path}") + + +# we process answer and gpt_eval seperately, in case gpt is not stable +# so we obtained a submission file for answer first +# and then feed the submission file to gpt for scoring + + +def activitynetqa_gpt_eval(result_file_path, args): + """ + Process the result file containing predictions, score them using GPT, + and save the results with added scores and correctness fields to a new file. 
+ + Args: + result_file_path: path to the JSON file with results to be evaluated + eval_file_path: path to save the JSON file with evaluated results + """ + + eval_file_name = "gpt_eval_result.json" + eval_file_path = file_utils.generate_submission_file(eval_file_name, args) + + # Load the predictions from the result file + with open(result_file_path, "r") as file: + result_list = json.load(file) + + evaluated_results = [] + + # Process each result to generate scores + for data_dict in result_list: + try: + question = data_dict.get("Q", "") + answer = data_dict.get("A", "") + pred = data_dict.get("pred", "") + + # Assume get_eval returns a review and the model name, and parse_score parses this review + review, model_name = get_eval(question, answer, pred, 64) + scores = parse_score(review) + except Exception as e: + eval_logger.error(f"Error for Question ID: {data_dict.get('question_id', 'Unknown')}: {e}") + review = "Failed to Get a Proper Review." + model_name = "Failed Request" + scores = ["no", 0] + + # Update the dictionary with the new entries + updated_dict = {"video_name": data_dict["video_name"], "Correctness": scores[0], "score": scores[1], "Q": question, "A": answer, "pred": pred, "question_id": data_dict.get("question_id"), "type": data_dict.get("type")} + evaluated_results.append(updated_dict) + + # Save the evaluated results to a new JSON file + with open(eval_file_path, "w") as f: + json.dump(evaluated_results, f, indent=4) + + return eval_file_path + + +# Factory into different aggregate +def activitynetqa_aggregate(results, args): + result_file_path = activitynetqa_aggregate_submissions(results, args, "Generation") + eval_file_path = activitynetqa_gpt_eval(result_file_path, args) + activitynetqa_print_scores(eval_file_path, args) diff --git a/lmms_eval/tasks/ai2d/ai2d.yaml b/lmms_eval/tasks/ai2d/ai2d.yaml index 58ae6c79..5221ff64 100755 --- a/lmms_eval/tasks/ai2d/ai2d.yaml +++ b/lmms_eval/tasks/ai2d/ai2d.yaml @@ -7,35 +7,16 @@ output_type: 
generate_until doc_to_visual: !function utils.ai2d_doc_to_visual doc_to_text: !function utils.ai2d_doc_to_text doc_to_target: !function utils.ai2d_doc_to_target - -generation_kwargs: - max_new_tokens: 16 - temperature: 0 - do_sample: False - -metric_list: - - metric: exact_match - aggregation: mean - higher_is_better: true - ignore_case: true - ignore_punctuation: true - -# filter_list: -# - name: flexible-extract -# filter: -# - function: !function utils.SimpleMultiChoiceRegexFilter -# group_select: 0 -# regex_pattern: "(\\([A-Z]\\))" - -metadata: - - version: 0.0 model_specific_prompt_kwargs: default: prompt_format: mcq pre_prompt: "" post_prompt: "\nAnswer with the option's letter from the given choices directly." - # qwen formulate ai2d as question answering instead of mcq + gpt4v: + prompt_format: mcq + pre_prompt: "" + post_prompt: "\nAbove choices are given in {option}. {content} format.\nPlease answer with the option letter from the given choices directly." qwen_vl: prompt_format: qa pre_prompt: "" @@ -47,4 +28,28 @@ model_specific_prompt_kwargs: model_specific_target_kwargs: default: "mcq" - qwen_vl: "qa" \ No newline at end of file + qwen_vl: "qa" + +generation_kwargs: + max_new_tokens: 16 + temperature: 0 + do_sample: False + +filter_list: + - name: "flexible-extract" + filter: + - function: !function utils.MultiChoiceRegexFilter + group_select: 0 + ignore_case: true + ignore_punctuation: true + regex_pattern: "([A-Z])\\." 
+ +metric_list: + - metric: exact_match + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true + +metadata: + - version: 0.0 \ No newline at end of file diff --git a/lmms_eval/tasks/ai2d/utils.py b/lmms_eval/tasks/ai2d/utils.py index 04a2cd93..a6e2368d 100755 --- a/lmms_eval/tasks/ai2d/utils.py +++ b/lmms_eval/tasks/ai2d/utils.py @@ -1,4 +1,6 @@ -from lmms_eval.filters.extraction import SimpleMultiChoiceRegexFilter +from lmms_eval.filters.extraction import ExtendedRegexFilter +from lmms_eval.filters.transformation import MapFilter +import re def ai2d_doc_to_text(doc, model_specific_prompt_kwargs=None): @@ -32,3 +34,46 @@ def ai2d_doc_to_target(doc, model_specific_target_kwargs): return options[int(doc["answer"])] elif model_specific_target_kwargs == "qa": return doc["options"][int(doc["answer"])] + + +class MultiChoiceRegexFilter(ExtendedRegexFilter): + def __init__(self, *args, **kwargs): + """ + regex_pattern: The basic regex pattern to use. If fails to match, we will use the customized match procedure + - step 1 : We parse the choices between ([A-Z])s then try to find these choices in the response. + - step 2 : We parse the choice with regex :[\s]*([A-?]), where ? varies by number of choices. + group_select: Selects the (group_select)th match from the findall result. + ignore_case: Ignores the case during step 1 matching + ignore_punctuation: Remove the punctuation during step 1 matching + regexes_to_ignore: Remove these regexes during step 1 matching + """ + super().__init__(*args, **kwargs) + + def apply(self, resps, docs): + # here, we assume we have a list, in which each element is + # a list of model responses for some particular input/target pair. + # so we process each of these (same input/target response sets) + # independently (and keep them a list.) 
+ + filtered_resps = [] + + for r, doc in zip(resps, docs): + # Regex to directly extract the option letter from the model response + option_letter_regex = re.compile(r"^\s*([A-Z])\.") + + # Process each response + filtered = [] + for resp in r: + # Try to match the option letter at the start of the response + match = option_letter_regex.match(resp) + if match: + # If a match is found, append the matched letter + filtered.append(match.group(1)) + else: + # If no match, return the original response + filtered.append(resp) + + # Assuming we need the first response that matches or the original response + filtered_resps.append(filtered[0]) + + return filtered_resps diff --git a/lmms_eval/tasks/coco_cap/utils.py b/lmms_eval/tasks/coco_cap/utils.py index 0102dbef..ab3a736a 100755 --- a/lmms_eval/tasks/coco_cap/utils.py +++ b/lmms_eval/tasks/coco_cap/utils.py @@ -43,7 +43,7 @@ def coco_process_result(doc, result): def coco_aggregation_result(results, metric, args): - scorers = [(Bleu(4), "Bleu_1"), (Bleu(4), "Bleu_2"), (Bleu(4), "Bleu_3"), (Bleu(4), "Bleu_4"), (Meteor(), "METEOR"), (Rouge(), "ROUGE_L"), (Cider(), "CIDEr")]#, (Spice(), "SPICE")] + scorers = [(Bleu(4), "Bleu_1"), (Bleu(4), "Bleu_2"), (Bleu(4), "Bleu_3"), (Bleu(4), "Bleu_4"), (Meteor(), "METEOR"), (Rouge(), "ROUGE_L"), (Cider(), "CIDEr")] # , (Spice(), "SPICE")] scorers_dict = {s[1]: s for s in scorers} stored_results = [] diff --git a/lmms_eval/tasks/cvrr/_default_template_yaml b/lmms_eval/tasks/cvrr/_default_template_yaml new file mode 100644 index 00000000..cce57e12 --- /dev/null +++ b/lmms_eval/tasks/cvrr/_default_template_yaml @@ -0,0 +1,13 @@ +dataset_path: lmms-lab/CVRR-ES +dataset_kwargs: + token: True + video: True + cache_dir: cvrr-es +model_specific_prompt_kwargs: + default: + pre_prompt: "" + post_prompt: "" + +metadata: + version: 0.0 + gpt_eval_model_name: gpt-3.5-turbo-0613 \ No newline at end of file diff --git a/lmms_eval/tasks/cvrr/cvrr_object_instance_count.yaml 
b/lmms_eval/tasks/cvrr/cvrr_object_instance_count.yaml new file mode 100755 index 00000000..fc85014a --- /dev/null +++ b/lmms_eval/tasks/cvrr/cvrr_object_instance_count.yaml @@ -0,0 +1,14 @@ +dataset_name: "continuity_and_object_instance_count" +task: "cvrr_continuity_and_object_instance_count" +test_split: test +output_type: generate_until +doc_to_visual: !function utils.cvrr_doc_to_visual +doc_to_text: !function utils.cvrr_doc_to_text +doc_to_target: !function utils.cvrr_doc_to_answer +process_results: !function utils.cvrr_process_results +metric_list: + - metric: submission + aggregation: !function utils.cvrr_aggregate_results + higher_is_better: true +include: _default_template_yaml + diff --git a/lmms_eval/tasks/cvrr/utils.py b/lmms_eval/tasks/cvrr/utils.py new file mode 100755 index 00000000..0e300d4f --- /dev/null +++ b/lmms_eval/tasks/cvrr/utils.py @@ -0,0 +1,301 @@ +from decord import VideoReader, cpu +import numpy as np +import os +import sys +import datetime +import lmms_eval.tasks._task_utils.file_utils as file_utils +import json +import logging +import yaml +from pathlib import Path + +import requests +import openai +from openai import OpenAI +import time +import ast + +eval_logger = logging.getLogger("lmms-eval") + +with open(Path(__file__).parent / "_default_template_yaml", "r") as f: + raw_data = f.readlines() + safe_data = [] + for i, line in enumerate(raw_data): + # remove function definition since yaml load cannot handle it + if "!function" not in line: + safe_data.append(line) + + config = yaml.safe_load("".join(safe_data)) + +NUM_SECONDS_TO_SLEEP = 5 + +GPT_EVAL_MODEL_NAME = config["metadata"]["gpt_eval_model_name"] + +API_TYPE = os.getenv("API_TYPE", "openai") + +if API_TYPE == "openai": + API_URL = os.getenv("OPENAI_API_URL", "https://api.openai.com/v1/chat/completions") + API_KEY = os.getenv("OPENAI_API_KEY", "YOUR_API_KEY") + headers = { + "Authorization": f"Bearer {API_KEY}", + "Content-Type": "application/json", + } + +# Unzip all the zip 
files to HF HOME cache dir +HF_HOME = os.environ["HF_HOME"] +cache_dir = config["dataset_kwargs"]["cache_dir"] +cache_dir = os.path.join(HF_HOME, cache_dir) +cache_dir = os.path.join(cache_dir, "CVRR-ES/continuity_and_object_instance_count") + + +# Pass in video path here +# Can only work correctly with video llm +def cvrr_doc_to_visual(doc): + video_path = doc["VideoID"] + + if doc["DimensionName"] == "Continuity and Object Instance Count": + + video_path = os.path.join(cache_dir, video_path) + if os.path.exists(video_path): + video_path = video_path + else: + sys.exit(f"video path:{video_path} does not exist, please check") + return [video_path] + + +# format the question +def cvrr_doc_to_text(doc, model_specific_prompt_kwargs=None): + if model_specific_prompt_kwargs is None: + model_specific_prompt_kwargs = {} + pre_prompt = "" + post_prompt = "" + if "pre_prompt" in model_specific_prompt_kwargs: + pre_prompt = model_specific_prompt_kwargs["pre_prompt"] + if "post_prompt" in model_specific_prompt_kwargs: + post_prompt = model_specific_prompt_kwargs["post_prompt"] + + question = doc["Q"] + + return f"{pre_prompt}{question}{post_prompt}" + + +# format answer +def cvrr_doc_to_answer(doc): + return doc["A"] + + +# Note: we process answer and gpt_eval seperately, in case gpt is not stable +# so we obtained a submission file for answer first +# and then feed the submission file to gpt for scoring + + +# Process result for evaluation +def cvrr_process_results(doc, result): + pred = result[0] + + return {"submission": {"VideoID": doc["VideoID"], "Q": doc["Q"], "A": doc["A"], "pred": pred, "DimensionName": doc["DimensionName"]}} + + +def cvrr_aggregate_submissions(results, args, task): + now_date_time = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S") + submission_file_name = f"cvrr-{task}-{now_date_time}.json" + path = file_utils.generate_submission_file(submission_file_name, args) + + with open(path, "w") as f: + json.dump(results, f, indent=4) + + 
eval_logger.info(f"Submission file saved to {path}") + + return path + + +def get_eval(question, answer, pred, task, max_tokens: int, retries: int = 5): + global headers + + if task == "continuity_and_object_instance_count": + messages = [ + { + "role": "system", + "content": "You are an intelligent chatbot designed for evaluating the correctness of AI assistant predictions for question-answer pairs. " + "Your task is to compare the predicted answer with the ground-truth answer and determine if the predicted answer is correct or not. Here's how you can accomplish the task:" + "------" + "##INSTRUCTIONS: " + "- Focus on the correctness and accuracy of the predicted answer with the ground-truth.\n" + "- Consider predictions with less specific details as correct evaluation, unless such details are explicitly asked in the question.\n", + }, + { + "role": "user", + "content": "Please evaluate the following video-based question-answer pair:\n\n" + f"Question: {question}\n" + f"Ground truth correct Answer: {answer}\n" + f"Predicted Answer: {pred}\n\n" + "Provide your evaluation as a correct/incorrect prediction along with the score where the score is an integer value between 0 (fully wrong) and 5 (fully correct). The middle score provides the percentage of correctness." + "Please generate the response in the form of a Python dictionary string with keys 'pred', 'score' and 'reason', where value of 'pred' is a string of 'correct' or 'incorrect', value of 'score' is in INTEGER, not STRING and value of 'reason' should provide the reason behind the decision." + "Only provide the Python dictionary string." 
+ "For example, your response should look like this: {'pred': 'correct', 'score': 4.8, 'reason': reason}.", + }, + ] + + print(messages) + + payload = { + "model": GPT_EVAL_MODEL_NAME, + "messages": messages, + "temperature": 0, + "max_tokens": max_tokens, + } + + for attempt in range(retries): + try: + response = requests.post(API_URL, headers=headers, json=payload, timeout=60) + response.raise_for_status() # Raises HTTPError for bad responses + try: + response_data = response.json() # Attempt to parse JSON + except requests.exceptions.JSONDecodeError: + eval_logger.error(f"JSON decode error on attempt {attempt + 1}. Response text: {response.text}") + continue # Skip to next retry + content = response_data["choices"][0]["message"]["content"].strip() + if content != "": + return content, response_data["model"] + # Handle HTTP errors separately + except requests.exceptions.HTTPError as e: + eval_logger.error(f"HTTP error on attempt {attempt + 1}: {e}") + # Handle other requests-related errors + except requests.exceptions.RequestException as e: + eval_logger.error(f"Request exception on attempt {attempt + 1}: {e}") + except Exception as e: + eval_logger.error(f"Unexpected error on attempt {attempt + 1}: {e}") + + # Handle other unexpected errors + if attempt < retries - 1: + time.sleep(NUM_SECONDS_TO_SLEEP) + else: # If this was the last attempt, log and return empty + eval_logger.error(f"All {retries} attempts failed. Last error message: {e}") + return "", "" + + return "", "" + + +def parse_score(review): + try: + # Convert the string representation of a dictionary to an actual dictionary + review_dict = ast.literal_eval(review) + correctness = review_dict.get("pred", "incorrect") + score = review_dict.get("score", 0) + reason = review_dict.get("reason", "") + return correctness, float(score), reason + except SyntaxError as e: + eval_logger.error(f"Syntax error parsing the review string: {e}. 
Review content: {review}") + return "incorrect", float(0), "" + except ValueError as e: + eval_logger.error(f"Value error parsing the review string: {e}. Review content: {review}") + return "incorrect", float(0), "" + except Exception as e: + eval_logger.error(f"Unexpected error parsing the review string: {e}. Review content: {review}") + return "incorrect", float(0), "" + + +def cvrr_print_scores(eval_file_path, args): + # Load the predictions from the result file + with open(eval_file_path, "r") as file: + evaluated_list = json.load(file) + + score_file_name = "scores.json" + path = file_utils.generate_submission_file(score_file_name, args) + + # Compute average score and final accuracy + # Initialize counters + yes_count = 0 + no_count = 0 + total_score = 0 + + # Iterate over the results to count correctness and sum scores + for result_list in evaluated_list: + eval_dict = result_list[0] + total_score += eval_dict["score"] + + if eval_dict["Correctness"] == "yes": + yes_count += 1 + else: + no_count += 1 + + # Calculate accuracy and average score + accuracy = yes_count / (yes_count + no_count) if (yes_count + no_count) > 0 else 0 + average_score = total_score / len(evaluated_list) if evaluated_list else 0 + + # Print the results + print(f"Accuracy: {accuracy}") + print(f"Average Score: {average_score}") + + # Write the processed data to the scores file + with open(path, "w") as f: + json.dump({"accuracy": accuracy, "average_score": average_score}, f, indent=4) + + eval_logger.info(f"Score file saved to {path}") + + +def cvrr_gpt_eval(result_file_path, args, task): + """ + Process the result file containing predictions, score them using GPT, + and save the results with added scores and correctness fields to a new file. 
+ + Args: + result_file_path: path to the JSON file with results to be evaluated + """ + + eval_file_name = "gpt_eval_result.json" + eval_file_path = file_utils.generate_submission_file(eval_file_name, args) + + # Load the predictions from the result file + with open(result_file_path, "r") as file: + result_list = json.load(file) + + evaluated_results = [] + + # Load the predictions from the result file + with open(result_file_path, "r") as file: + result_list = json.load(file) + + # Process each result to generate scores + for data_dict in result_list: + try: + question = data_dict.get("Q", "") + answer = data_dict.get("A", "") + pred = data_dict.get("pred", "") + + # Assume get_eval returns a review and the model name, and parse_score parses this review + review, model_name = get_eval(question, answer, pred, task, 64) + correctness, score, reason = parse_score(review) + except Exception as e: + eval_logger.error(f"Error for Video Name: {data_dict.get('VideoID', 'Unknown')}: {e}") + review = "Failed to Get a Proper Review." 
+ model_name = "Failed Request" + score = 0 + correctness = "incorrect" + reason = "" + + # Update the dictionary with the new entries + eval_dict = { + "pred": correctness, + "score": score, + "reason": reason, + } + result_dict = { + "Q": question, + "A": answer, + "pred": pred, + } + updated_list = [eval_dict, result_dict] + evaluated_results.append(updated_list) + + # Save the evaluated results to a new JSON file + with open(eval_file_path, "w") as f: + json.dump(evaluated_results, f, indent=4) + + return eval_file_path + + +def cvrr_aggregate_results(results, args): + result_file_path = cvrr_aggregate_submissions(results, args, "continuity_and_object_instance_count") + eval_file_path = cvrr_gpt_eval(result_file_path, args, "continuity_and_object_instance_count") + cvrr_print_scores(eval_file_path, args) diff --git a/lmms_eval/tasks/egoschema/_default_template_yaml b/lmms_eval/tasks/egoschema/_default_template_yaml new file mode 100644 index 00000000..8e030645 --- /dev/null +++ b/lmms_eval/tasks/egoschema/_default_template_yaml @@ -0,0 +1,9 @@ +dataset_path: lmms-lab/egoschema +dataset_kwargs: + token: True + video: True + cache_dir: egoschema +model_specific_prompt_kwargs: + default: + pre_prompt: "" + post_prompt: "" \ No newline at end of file diff --git a/lmms_eval/tasks/egoschema/egoschema_generation.yaml b/lmms_eval/tasks/egoschema/egoschema_generation.yaml new file mode 100755 index 00000000..be7bebe9 --- /dev/null +++ b/lmms_eval/tasks/egoschema/egoschema_generation.yaml @@ -0,0 +1,15 @@ +dataset_name: "GENERATION" +task: "egoschema_gen" +test_split: test +output_type: generate_until +doc_to_visual: !function utils.egoschema_doc_to_visual +doc_to_text: !function utils.egoschema_doc_to_text +doc_to_target: !function utils.egoschema_doc_to_answer +process_results: !function utils.egoschema_process_results_generation +metric_list: + - metric: submission + aggregation: !function utils.egoschema_aggregate_gen + higher_is_better: true + - metric: exact_match 
+ higher_is_better: true +include: _default_template_yaml diff --git a/lmms_eval/tasks/egoschema/egoschema_mc.yaml b/lmms_eval/tasks/egoschema/egoschema_mc.yaml new file mode 100755 index 00000000..7b894da0 --- /dev/null +++ b/lmms_eval/tasks/egoschema/egoschema_mc.yaml @@ -0,0 +1,15 @@ +dataset_name: "MC" +task: "egoschema_mc" +test_split: test +output_type: generate_until +doc_to_visual: !function utils.egoschema_doc_to_visual +doc_to_text: !function utils.egoschema_doc_to_text +doc_to_target: !function utils.egoschema_doc_to_answer_mc +process_results: !function utils.egoschema_process_results +metric_list: + - metric: submission + aggregation: !function utils.egoschema_aggregate_mc + higher_is_better: true + - metric: exact_match + higher_is_better: true +include: _default_template_yaml diff --git a/lmms_eval/tasks/egoschema/egoschema_mcppl.yaml b/lmms_eval/tasks/egoschema/egoschema_mcppl.yaml new file mode 100755 index 00000000..4d13a0fe --- /dev/null +++ b/lmms_eval/tasks/egoschema/egoschema_mcppl.yaml @@ -0,0 +1,15 @@ +dataset_name: "MC_PPL" +task: "egoschema_mc_ppl" +test_split: test +output_type: multiple_choice +doc_to_visual: !function utils.egoschema_doc_to_visual +doc_to_text: "question" +doc_to_target: !function utils.egoschema_doc_to_answer_mc_ppl +doc_to_choice: !function utils.egoschema_doc_to_choice +process_results: !function utils.egoschema_process_results +metric_list: + - metric: submission + aggregation: !function utils.egoschema_aggregate_mc_ppl + higher_is_better: true + - metric: acc +include: _default_template_yaml diff --git a/lmms_eval/tasks/egoschema/egoschema_subset.yaml b/lmms_eval/tasks/egoschema/egoschema_subset.yaml new file mode 100755 index 00000000..b5c7fd14 --- /dev/null +++ b/lmms_eval/tasks/egoschema/egoschema_subset.yaml @@ -0,0 +1,17 @@ +dataset_name: "Subset" +task: "egoschema_subset_mcppl" +test_split: test +output_type: multiple_choice +doc_to_visual: !function utils.egoschema_doc_to_visual +doc_to_text: "question" 
+doc_to_target: !function utils.egoschema_doc_to_answer_mc_ppl +doc_to_choice: !function utils.egoschema_doc_to_choice +process_results: !function utils.egoschema_process_results +metric_list: + - metric: submission + aggregation: !function utils.egoschema_aggregate_mc_ppl + higher_is_better: true + - metric: score + aggregation: !function utils.egoschema_aggregate_score + higher_is_better: true +include: _default_template_yaml diff --git a/lmms_eval/tasks/egoschema/egoschema_subset_gen.yaml b/lmms_eval/tasks/egoschema/egoschema_subset_gen.yaml new file mode 100755 index 00000000..c7455b29 --- /dev/null +++ b/lmms_eval/tasks/egoschema/egoschema_subset_gen.yaml @@ -0,0 +1,16 @@ +dataset_name: "Subset" +task: "egoschema_subset_generation" +test_split: test +output_type: generate_until +doc_to_visual: !function utils.egoschema_doc_to_visual +doc_to_text: !function utils.egoschema_doc_to_text +doc_to_target: !function utils.egoschema_doc_to_answer +process_results: !function utils.egoschema_process_results_generation +metric_list: + - metric: submission + aggregation: !function utils.egoschema_aggregate_gen + higher_is_better: true + - metric: score + aggregation: !function utils.egoschema_aggregate_score + higher_is_better: true +include: _default_template_yaml diff --git a/lmms_eval/tasks/egoschema/utils.py b/lmms_eval/tasks/egoschema/utils.py new file mode 100755 index 00000000..872c452e --- /dev/null +++ b/lmms_eval/tasks/egoschema/utils.py @@ -0,0 +1,156 @@ +from decord import VideoReader, cpu +import numpy as np +import os +import sys +import datetime +import lmms_eval.tasks._task_utils.file_utils as file_utils +import json +import logging +import yaml +from pathlib import Path + +with open(Path(__file__).parent / "_default_template_yaml", "r") as f: + raw_data = f.readlines() + safe_data = [] + for i, line in enumerate(raw_data): + # remove function definition since yaml load cannot handle it + if "!function" not in line: + safe_data.append(line) + + config = 
yaml.safe_load("".join(safe_data)) + +# A bit ugly here +# But the idea is that we will unzip all the zip files +# To HF HOME cache dir +# And load it here +HF_HOME = os.environ["HF_HOME"] +cache_dir = config["dataset_kwargs"]["cache_dir"] +cache_dir = os.path.join(HF_HOME, cache_dir) +cache_dir = os.path.join(cache_dir, "videos") + +eval_logger = logging.getLogger("lmms-eval") + + +# Pass in video path here +# Can only work correctly with video llm +def egoschema_doc_to_visual(doc): + video_path = doc["video_idx"] + ".mp4" + video_path = os.path.join(cache_dir, video_path) + if os.path.exists(video_path): + video_path = video_path + elif os.path.exists(video_path.replace("mp4", "MP4")): + video_path = video_path.replace("mp4", "MP4") + else: + sys.exit(f"video path:{video_path} does not exist, please check") + return [video_path] + + +# This is the place where you format your question +def egoschema_doc_to_text(doc, model_specific_prompt_kwargs=None): + if model_specific_prompt_kwargs is None: + model_specific_prompt_kwargs = {} + pre_prompt = "" + post_prompt = "" + if "pre_prompt" in model_specific_prompt_kwargs: + pre_prompt = model_specific_prompt_kwargs["pre_prompt"] + if "post_prompt" in model_specific_prompt_kwargs: + post_prompt = model_specific_prompt_kwargs["post_prompt"] + + question = doc["question"] + if "option" in doc: + for op in doc["option"]: + question += "\n" + op + post_prompt = "\nAnswer with the option's letter from the given choices directly." 
+ + return f"{pre_prompt}{question}{post_prompt}" + + +def egoschema_doc_to_answer(doc): + return doc["answer"] + + +# If it is mc, keep the option for exact match +def egoschema_doc_to_answer_mc(doc): + # return doc["answer"].split(".")[0].strip() + return doc["answer"] # pseudo answer + + +# If it is mc ppl, keep the option str for perplexity base matching +def egoschema_doc_to_answer_mc_ppl(doc): + # return doc["answer"].split(".")[1].strip() + return doc["answer"] # pseudo answer + + +# Process result for mc_ppl +def egoschema_process_results(doc, result): + # Initialize minimum value and index + min_value = float("inf") + min_index = -1 + + # Iterate through the results to find the index of the lowest value + for i, (value, _) in enumerate(result): + if value < min_value: + min_value = value + min_index = i + + # Return the result with the index of the lowest value + return {"submission": {doc["video_idx"]: min_index}, "score": {"pred": min_index, "ground_truth": doc["answer"]}} + + +# Process result for generation +def egoschema_process_results_generation(doc, result): + pred = result[0] # string prediction "A", "B", "C", "D", or "E" + + # Map the prediction to an index + pred_to_index = {"A": 0, "B": 1, "C": 2, "D": 3, "E": 4} + index = pred_to_index.get(pred, -1) # Default to -1 if the prediction is not found + + return {"submission": {doc["video_idx"]: index}, "score": {"pred": index, "ground_truth": doc["answer"]}} + + +def egoschema_aggregate_submissions(results, args, task): + now_date_time = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S") + submission_file_name = f"egoschema-{task}-{now_date_time}.json" + path = file_utils.generate_submission_file(submission_file_name, args) + + # results is a list of 5031 dict, + # need to convert results into a single dict with 5031 key-value pairs + combined_submission = {} + + for submission_dict in results: + combined_submission.update(submission_dict) + + with open(path, "w") as f: + 
json.dump(combined_submission, f, indent=4) + + eval_logger.info(f"Submission file saved to {path}") + + +# Factory into different aggregate +def egoschema_aggregate_gen(results, args): + egoschema_aggregate_submissions(results, args, "GENERATION") + + +def egoschema_aggregate_mc(results, args): + egoschema_aggregate_submissions(results, args, "MC") + + +def egoschema_aggregate_mc_ppl(results, args): + egoschema_aggregate_submissions(results, args, "MC_PPL") + + +def egoschema_aggregate_score(results, args): + yes_count = 0 + + # results is a list of dict + for answer_dict in results: + if str(answer_dict["ground_truth"]) == str(answer_dict["pred"]): + yes_count = yes_count + 1 + + accuracy = yes_count / len(results) + + return accuracy + + +def egoschema_doc_to_choice(doc): + return [op.split(".")[1].strip() for op in doc["option"]] diff --git a/lmms_eval/tasks/flickr30k/utils.py b/lmms_eval/tasks/flickr30k/utils.py index f5d5c144..8fa1069a 100755 --- a/lmms_eval/tasks/flickr30k/utils.py +++ b/lmms_eval/tasks/flickr30k/utils.py @@ -41,7 +41,7 @@ def flickr_process_result(doc, result): def flickr_aggregation_result(results, metric, args): - scorers = [(Bleu(4), "Bleu_1"), (Bleu(4), "Bleu_2"), (Bleu(4), "Bleu_3"), (Bleu(4), "Bleu_4"), (Meteor(), "METEOR"), (Rouge(), "ROUGE_L"), (Cider(), "CIDEr")]#, (Spice(), "SPICE")] + scorers = [(Bleu(4), "Bleu_1"), (Bleu(4), "Bleu_2"), (Bleu(4), "Bleu_3"), (Bleu(4), "Bleu_4"), (Meteor(), "METEOR"), (Rouge(), "ROUGE_L"), (Cider(), "CIDEr")] # , (Spice(), "SPICE")] scorers_dict = {s[1]: s for s in scorers} stored_results = [] diff --git a/lmms_eval/tasks/livebench/livebench.yaml b/lmms_eval/tasks/livebench/livebench.yaml index 3cf6087c..620c3863 100644 --- a/lmms_eval/tasks/livebench/livebench.yaml +++ b/lmms_eval/tasks/livebench/livebench.yaml @@ -24,6 +24,6 @@ model_specific_prompt_kwargs: pre_prompt: "" post_prompt: "" metadata: - version: 0.0 + version: "2024-05" api_type : openai gpt_eval_model_name: "gpt-4-turbo" diff --git 
a/lmms_eval/tasks/livebench/utils.py b/lmms_eval/tasks/livebench/utils.py index 11a026b1..ecc39539 100644 --- a/lmms_eval/tasks/livebench/utils.py +++ b/lmms_eval/tasks/livebench/utils.py @@ -144,8 +144,10 @@ def livebench_doc_to_text(doc, model_specific_prompt_kwargs=None): post_prompt = model_specific_prompt_kwargs.get("post_prompt", "") return f"{pre_prompt}{doc['question']}{post_prompt}" + SUBTASKS = ("basic understanding", "contextual analysis", "deeper implications", "broader implications", "further insights") + def livebench_process_results(doc, results): base64_images = [image_to_base64(image) for image in livebench_doc_to_visual(doc)] subtask = doc["subtask"] diff --git a/lmms_eval/tasks/mix_evals/mix_evals_video2text_mc.yaml b/lmms_eval/tasks/mix_evals/mix_evals_video2text_mc.yaml index 3852a264..d04dabf4 100755 --- a/lmms_eval/tasks/mix_evals/mix_evals_video2text_mc.yaml +++ b/lmms_eval/tasks/mix_evals/mix_evals_video2text_mc.yaml @@ -21,7 +21,6 @@ filter_list: group_select: 0 ignore_case: true ignore_punctuation: true - regex_pattern: "(\\([A-Z]\\))" model_specific_prompt_kwargs: default: diff --git a/lmms_eval/tasks/mix_evals/utils.py b/lmms_eval/tasks/mix_evals/utils.py index 7b08cab6..88be9656 100755 --- a/lmms_eval/tasks/mix_evals/utils.py +++ b/lmms_eval/tasks/mix_evals/utils.py @@ -239,62 +239,28 @@ def mix_evals_video2text_aggregate_gen(results, args): class MultiChoiceRegexFilter(ExtendedRegexFilter): def __init__(self, *args, **kwargs): - """ - regex_pattern: The basic regex pattern to use. If fails to match, we will use the customized match procedure - - step 1 : We parse the choices between ([A-Z])s then try to find these choices in the response. - - step 2 : We parse the choice with regex :[\s]*([A-?]), where ? varies by number of choices. - group_select: Selects the (group_select)th match from the findall result. 
- ignore_case: Ignores the case during step 1 matching - ignore_punctuation: Remove the punctuation during step 1 matching - regexes_to_ignore: Remove these regexes during step 1 matching - """ super().__init__(*args, **kwargs) def apply(self, resps, docs): - # here, we assume we have a list, in which each element is - # a list of model responses for some particular input/target pair. - # so we process each of these (same input/target response sets) - # independently (and keep them a list.) - filtered_resps = [] for r, doc in zip(resps, docs): - fallback_regexes = [] - choice_to_alpha = {} - next_alpha = "A" - - without_paren_fallback_regexes = [] - without_paren_to_target = {} - - # Regex to extract multiple choice options from the question - multiple_choices_regex = re.compile(r"\b([A-Z])\.\s+([^\n]*)") - matches = multiple_choices_regex.findall(doc["question"]) - - # Build regex patterns and mappings for each choice - for m in matches: - choice_text = m[1].strip() - fallback_regexes.append(f"{re.escape(choice_text)}") - choice_to_alpha[choice_text] = next_alpha - - next_alpha = chr(ord(next_alpha) + 1) - - # Compile regex to match any of the extracted choices - fallback_regex = re.compile("|".join(fallback_regexes)) + # Regex to directly extract the option letter from the model response + option_letter_regex = re.compile(r"\b([A-Z])\.\s+([^\n]*)") # Process each response filtered = [] for resp in r: - # Remove any punctuation and extra spaces - cleaned_resp = re.sub(r"[^\w\s]", "", resp).strip() - # Try to match cleaned response with the choice text - match = fallback_regex.search(cleaned_resp) - if match and match.group() in choice_to_alpha: - # Map the matched choice text back to its corresponding letter - filtered.append(choice_to_alpha[match.group()]) + # Try to match the option letter at the start of the response + match = option_letter_regex.match(resp) + if match: + # If a match is found, append the matched letter + filtered.append(match.group(1)) else: - 
# If no match, return the cleaned response - filtered.append(cleaned_resp) + # If no match, return the original response + filtered.append(resp) + # Assuming we need the first response that matches or the original response filtered_resps.append(filtered[0]) return filtered_resps diff --git a/lmms_eval/tasks/mmbench/_default_template_mmbench_cn_yaml b/lmms_eval/tasks/mmbench/_default_template_mmbench_cn_yaml old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/mmbench/_default_template_mmbench_en_yaml b/lmms_eval/tasks/mmbench/_default_template_mmbench_en_yaml old mode 100644 new mode 100755 diff --git a/lmms_eval/tasks/mme/mme.yaml b/lmms_eval/tasks/mme/mme.yaml index f7a5d229..3d665314 100755 --- a/lmms_eval/tasks/mme/mme.yaml +++ b/lmms_eval/tasks/mme/mme.yaml @@ -27,6 +27,9 @@ model_specific_prompt_kwargs: default: pre_prompt: "" post_prompt: "\nAnswer the question using a single word or phrase." + gpt4v: + pre_prompt: "" + post_prompt: "\nAnswer the question with Yes or No." qwen_vl: pre_prompt: "" post_prompt: " Answer:" diff --git a/lmms_eval/tasks/nocaps/utils.py b/lmms_eval/tasks/nocaps/utils.py index f645b1cc..9b1d4df6 100755 --- a/lmms_eval/tasks/nocaps/utils.py +++ b/lmms_eval/tasks/nocaps/utils.py @@ -42,7 +42,7 @@ def nocaps_process_result(doc, result): def nocaps_aggregation_result(results, metric, args=None): - scorers = [(Bleu(4), "Bleu_1"), (Bleu(4), "Bleu_2"), (Bleu(4), "Bleu_3"), (Bleu(4), "Bleu_4"), (Meteor(), "METEOR"), (Rouge(), "ROUGE_L"), (Cider(), "CIDEr")]#, (Spice(), "SPICE")] + scorers = [(Bleu(4), "Bleu_1"), (Bleu(4), "Bleu_2"), (Bleu(4), "Bleu_3"), (Bleu(4), "Bleu_4"), (Meteor(), "METEOR"), (Rouge(), "ROUGE_L"), (Cider(), "CIDEr")] # , (Spice(), "SPICE")] scorers_dict = {s[1]: s for s in scorers} stored_results = [] diff --git a/lmms_eval/tasks/ocrbench/utils.py b/lmms_eval/tasks/ocrbench/utils.py index c8c8c650..fe1c1fcb 100644 --- a/lmms_eval/tasks/ocrbench/utils.py +++ b/lmms_eval/tasks/ocrbench/utils.py @@ -100,4 +100,4 @@ 
def ocrbench_aggregate_accuracy(results, args): print(f"Final Score(Total 1000): {Final_score}", file=f) logger.info(f"OCR Bench results saved to {file_name}") # return {"Final Score":Final_score,"Text Recognition":recognition_score,'Scene Text-centric VQA':OCRBench_score['Scene Text-centric VQA'],'Doc-oriented VQA':OCRBench_score['Doc-oriented VQA'],'Key Information Extraction':OCRBench_score['Key Information Extraction'],'Handwritten Mathematical Expression Recognition':OCRBench_score['Handwritten Mathematical Expression Recognition']} - return Final_score + return Final_score / 1000 # return the final score as accuracy diff --git a/lmms_eval/tasks/refcoco+/utils.py b/lmms_eval/tasks/refcoco+/utils.py index 4feb71cb..f1d43606 100755 --- a/lmms_eval/tasks/refcoco+/utils.py +++ b/lmms_eval/tasks/refcoco+/utils.py @@ -49,7 +49,7 @@ def refcoco_process_result(doc, result): def refcoco_aggregation_result(results, metric): - scorers = [(Bleu(4), "Bleu_1"), (Bleu(4), "Bleu_2"), (Bleu(4), "Bleu_3"), (Bleu(4), "Bleu_4"), (Meteor(), "METEOR"), (Rouge(), "ROUGE_L"), (Cider(), "CIDEr")]#, (Spice(), "SPICE")] + scorers = [(Bleu(4), "Bleu_1"), (Bleu(4), "Bleu_2"), (Bleu(4), "Bleu_3"), (Bleu(4), "Bleu_4"), (Meteor(), "METEOR"), (Rouge(), "ROUGE_L"), (Cider(), "CIDEr")] # , (Spice(), "SPICE")] scorers_dict = {s[1]: s for s in scorers} stored_results = [] diff --git a/lmms_eval/tasks/refcoco/utils.py b/lmms_eval/tasks/refcoco/utils.py index 4feb71cb..f1d43606 100755 --- a/lmms_eval/tasks/refcoco/utils.py +++ b/lmms_eval/tasks/refcoco/utils.py @@ -49,7 +49,7 @@ def refcoco_process_result(doc, result): def refcoco_aggregation_result(results, metric): - scorers = [(Bleu(4), "Bleu_1"), (Bleu(4), "Bleu_2"), (Bleu(4), "Bleu_3"), (Bleu(4), "Bleu_4"), (Meteor(), "METEOR"), (Rouge(), "ROUGE_L"), (Cider(), "CIDEr")]#, (Spice(), "SPICE")] + scorers = [(Bleu(4), "Bleu_1"), (Bleu(4), "Bleu_2"), (Bleu(4), "Bleu_3"), (Bleu(4), "Bleu_4"), (Meteor(), "METEOR"), (Rouge(), "ROUGE_L"), (Cider(), 
"CIDEr")] # , (Spice(), "SPICE")] scorers_dict = {s[1]: s for s in scorers} stored_results = [] diff --git a/lmms_eval/tasks/refcocog/utils.py b/lmms_eval/tasks/refcocog/utils.py index 4feb71cb..f1d43606 100755 --- a/lmms_eval/tasks/refcocog/utils.py +++ b/lmms_eval/tasks/refcocog/utils.py @@ -49,7 +49,7 @@ def refcoco_process_result(doc, result): def refcoco_aggregation_result(results, metric): - scorers = [(Bleu(4), "Bleu_1"), (Bleu(4), "Bleu_2"), (Bleu(4), "Bleu_3"), (Bleu(4), "Bleu_4"), (Meteor(), "METEOR"), (Rouge(), "ROUGE_L"), (Cider(), "CIDEr")]#, (Spice(), "SPICE")] + scorers = [(Bleu(4), "Bleu_1"), (Bleu(4), "Bleu_2"), (Bleu(4), "Bleu_3"), (Bleu(4), "Bleu_4"), (Meteor(), "METEOR"), (Rouge(), "ROUGE_L"), (Cider(), "CIDEr")] # , (Spice(), "SPICE")] scorers_dict = {s[1]: s for s in scorers} stored_results = [] diff --git a/lmms_eval/tasks/synthdog/donut_evaluator.py b/lmms_eval/tasks/synthdog/donut_evaluator.py index e0e52e12..b24c4258 100644 --- a/lmms_eval/tasks/synthdog/donut_evaluator.py +++ b/lmms_eval/tasks/synthdog/donut_evaluator.py @@ -5,12 +5,16 @@ from typing import Any, Dict, List, Tuple, Union import torch -import zss from datasets import load_dataset from nltk import edit_distance from torch.utils.data import Dataset from transformers.modeling_utils import PreTrainedModel -from zss import Node + +try: + import zss + from zss import Node +except ImportError: + print("Please install zss library. 
You can install it by running 'pip install zss'") class JSONParseEvaluator: diff --git a/lmms_eval/tasks/textcaps/utils.py b/lmms_eval/tasks/textcaps/utils.py index c63feae5..12dc277f 100755 --- a/lmms_eval/tasks/textcaps/utils.py +++ b/lmms_eval/tasks/textcaps/utils.py @@ -38,7 +38,7 @@ def textcaps_process_result(doc, result): def textcaps_aggregation_result(results, metric, args=None): - scorers = [(Bleu(4), "Bleu_1"), (Bleu(4), "Bleu_2"), (Bleu(4), "Bleu_3"), (Bleu(4), "Bleu_4"), (Meteor(), "METEOR"), (Rouge(), "ROUGE_L"), (Cider(), "CIDEr")]#, (Spice(), "SPICE")] + scorers = [(Bleu(4), "Bleu_1"), (Bleu(4), "Bleu_2"), (Bleu(4), "Bleu_3"), (Bleu(4), "Bleu_4"), (Meteor(), "METEOR"), (Rouge(), "ROUGE_L"), (Cider(), "CIDEr")] # , (Spice(), "SPICE")] scorers_dict = {s[1]: s for s in scorers} stored_results = [] diff --git a/lmms_eval/tasks/video_detail_description/utils.py b/lmms_eval/tasks/video_detail_description/utils.py index 1c5b0c70..9fb8c4b2 100755 --- a/lmms_eval/tasks/video_detail_description/utils.py +++ b/lmms_eval/tasks/video_detail_description/utils.py @@ -163,6 +163,10 @@ def get_eval_generic(question, answer, pred, max_tokens: int, retries: int = 5): except Exception as e: eval_logger.error(f"Unexpected error on attempt {attempt + 1}: {e}") + if "Sorry! We've encountered an issue with repetitive patterns in your prompt. Please try again with a different prompt." in json.loads(response.content)["error"]["message"]: + eval_logger.error(f"Repetitive patterns in prompt. 
Drop this data.") + return "", "" + # Handle other unexpected errors if attempt < retries - 1: time.sleep(NUM_SECONDS_TO_SLEEP) diff --git a/lmms_eval/tasks/videochatgpt/_default_template_yaml b/lmms_eval/tasks/videochatgpt/_default_template_yaml new file mode 100644 index 00000000..816297cc --- /dev/null +++ b/lmms_eval/tasks/videochatgpt/_default_template_yaml @@ -0,0 +1,13 @@ +dataset_path: lmms-lab/VideoChatGPT +dataset_kwargs: + token: True + video: True + cache_dir: videochatgpt +model_specific_prompt_kwargs: + default: + pre_prompt: "" + post_prompt: "" + +metadata: + version: 0.0 + gpt_eval_model_name: gpt-3.5-turbo-0613 \ No newline at end of file diff --git a/lmms_eval/tasks/videochatgpt/utils.py b/lmms_eval/tasks/videochatgpt/utils.py new file mode 100755 index 00000000..b763b630 --- /dev/null +++ b/lmms_eval/tasks/videochatgpt/utils.py @@ -0,0 +1,551 @@ +from decord import VideoReader, cpu +import numpy as np +import os +import sys +import datetime +import lmms_eval.tasks._task_utils.file_utils as file_utils +import json +import logging +import yaml +from pathlib import Path + +import requests +import openai +from openai import OpenAI +import time +import ast + +eval_logger = logging.getLogger("lmms-eval") + +with open(Path(__file__).parent / "_default_template_yaml", "r") as f: + raw_data = f.readlines() + safe_data = [] + for i, line in enumerate(raw_data): + # remove function definition since yaml load cannot handle it + if "!function" not in line: + safe_data.append(line) + + config = yaml.safe_load("".join(safe_data)) + +NUM_SECONDS_TO_SLEEP = 5 + +GPT_EVAL_MODEL_NAME = config["metadata"]["gpt_eval_model_name"] + +API_TYPE = os.getenv("API_TYPE", "openai") + +if API_TYPE == "openai": + API_URL = os.getenv("OPENAI_API_URL", "https://api.openai.com/v1/chat/completions") + API_KEY = os.getenv("OPENAI_API_KEY", "YOUR_API_KEY") + headers = { + "Authorization": f"Bearer {API_KEY}", + "Content-Type": "application/json", + } + +# Unzip all the zip files to 
HF HOME cache dir +HF_HOME = os.environ["HF_HOME"] +cache_dir = config["dataset_kwargs"]["cache_dir"] +cache_dir = os.path.join(HF_HOME, cache_dir) +cache_dir = os.path.join(cache_dir, "Test_Videos") + + +# Pass in video path here +# Can only work correctly with video llm +def videochatgpt_doc_to_visual(doc): + video_path = doc["video_name"] + ".mp4" + video_path = os.path.join(cache_dir, video_path) + if os.path.exists(video_path): + video_path = video_path + elif os.path.exists(video_path.replace("mp4", "MP4")): + video_path = video_path.replace("mp4", "MP4") + elif os.path.exists(video_path.replace("mp4", "mkv")): + video_path = video_path.replace("mp4", "mkv") + else: + sys.exit(f"video path:{video_path} does not exist, please check") + return [video_path] + + +# format the question +def videochatgpt_doc_to_text(doc, model_specific_prompt_kwargs=None): + if model_specific_prompt_kwargs is None: + model_specific_prompt_kwargs = {} + pre_prompt = "" + post_prompt = "" + if "pre_prompt" in model_specific_prompt_kwargs: + pre_prompt = model_specific_prompt_kwargs["pre_prompt"] + if "post_prompt" in model_specific_prompt_kwargs: + post_prompt = model_specific_prompt_kwargs["post_prompt"] + + question = doc["question"] + + return f"{pre_prompt}{question}{post_prompt}" + + +# format the question for consistency +def videochatgpt_doc_to_text_consistency(doc, model_specific_prompt_kwargs=None): + if model_specific_prompt_kwargs is None: + model_specific_prompt_kwargs = {} + pre_prompt = "" + post_prompt = "" + if "pre_prompt" in model_specific_prompt_kwargs: + pre_prompt = model_specific_prompt_kwargs["pre_prompt"] + if "post_prompt" in model_specific_prompt_kwargs: + post_prompt = model_specific_prompt_kwargs["post_prompt"] + + if "question_1" in doc: + question = doc["question_1"] + else: + question = doc["question_2"] + + return f"{pre_prompt}{question}{post_prompt}" + + +# format answer +def videochatgpt_doc_to_answer(doc): + return doc["answer"] + + +# Note: we 
process answer and gpt_eval seperately, in case gpt is not stable +# so we obtained a submission file for answer first +# and then feed the submission file to gpt for scoring + + +# Process result for evaluation in generic task +def videochatgpt_process_results_generic(doc, result): + pred = result[0] + + return {"submission": {"video_name": doc["video_name"], "Q": doc["question"], "A": doc["answer"], "pred": pred}} + + +# Process result for evaluation in temporal task +def videochatgpt_process_results_temporal(doc, result): + pred = result[0] + + return {"submission": {"video_name": doc["video_name"], "Q": doc["question"], "A": doc["answer"], "pred": pred}} + + +# Process result for generation in consistency task +def videochatgpt_process_results_consistency(doc, result): + pred = result[0] + + # if it is question_1, then assign prediction for the 1st question + # else assign prediction for the 2nd question + if doc["question_1"] != "None": + return {"submission": {"video_name": doc["video_name"], "Q1": doc["question_1"], "A": doc["answer"], "pred1": pred}} + else: + return {"submission": {"video_name": doc["video_name"], "Q2": doc["question_2"], "A": doc["answer"], "pred2": pred}} + + +def videochatgpt_aggregate_submissions(results, args, task): + now_date_time = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S") + submission_file_name = f"videochatgpt-{task}-{now_date_time}.json" + path = file_utils.generate_submission_file(submission_file_name, args) + + with open(path, "w") as f: + json.dump(results, f, indent=4) + + eval_logger.info(f"Submission file saved to {path}") + + return path + + +def videochatgpt_aggregate_submissions_consistency(results, args, task): + now_date_time = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S") + submission_file_name = f"videochatgpt-{task}-{now_date_time}.json" + path = file_utils.generate_submission_file(submission_file_name, args) + + combined_results = [] + + # Iterate over the results list in steps of 2 + for i in 
range(0, len(results), 2): + # Merge the current dict with the next one + first_dict = results[i] + second_dict = results[i + 1] if i + 1 < len(results) else {} + + # If 'video_name' is the same in both and is the key we use to match them + if first_dict.get("video_name") == second_dict.get("video_name"): + # Combine q2 and pred2 from the even dict into the odd dict + first_dict["Q2"] = second_dict.get("Q2") + first_dict["pred2"] = second_dict.get("pred2") + combined_results.append(first_dict) + + # Save the combined results to a file + with open(path, "w") as f: + json.dump(combined_results, f, indent=4) + + eval_logger.info(f"Submission file saved to {path}") + + return path + + +def get_eval_generic(question, answer, pred, task, max_tokens: int, retries: int = 5): + global headers + + if task == "correctness": + messages = [ + { + "role": "system", + "content": "You are an intelligent chatbot designed for evaluating the factual accuracy of generative outputs for video-based question-answer pairs. " + "Your task is to compare the predicted answer with the correct answer and determine if they are factually consistent. Here's how you can accomplish the task:" + "------" + "##INSTRUCTIONS: " + "- Focus on the factual consistency between the predicted answer and the correct answer. 
The predicted answer should not contain any misinterpretations or misinformation.\n" + "- The predicted answer must be factually accurate and align with the video content.\n" + "- Consider synonyms or paraphrases as valid matches.\n" + "- Evaluate the factual accuracy of the prediction compared to the answer.", + }, + { + "role": "user", + "content": "Please evaluate the following video-based question-answer pair:\n\n" + f"Question: {question}\n" + f"Correct Answer: {answer}\n" + f"Predicted Answer: {pred}\n\n" + "Provide your evaluation only as a factual accuracy score where the factual accuracy score is an integer value between 0 and 5, with 5 indicating the highest level of factual consistency. " + "Please generate the response in the form of a Python dictionary string with keys 'score', where its value is the factual accuracy score in INTEGER, not STRING." + "DO NOT PROVIDE ANY OTHER OUTPUT TEXT OR EXPLANATION. Only provide the Python dictionary string. " + "For example, your response should look like this: {''score': 4.8}.", + }, + ] + elif task == "detailed_orientation": + messages = [ + { + "role": "system", + "content": "You are an intelligent chatbot designed for evaluating the detail orientation of generative outputs for video-based question-answer pairs. " + "Your task is to compare the predicted answer with the correct answer and determine its level of detail, considering both completeness and specificity. Here's how you can accomplish the task:" + "------" + "##INSTRUCTIONS: " + "- Check if the predicted answer covers all major points from the video. The response should not leave out any key aspects.\n" + "- Evaluate whether the predicted answer includes specific details rather than just generic points. 
It should provide comprehensive information that is tied to specific elements of the video.\n" + "- Consider synonyms or paraphrases as valid matches.\n" + "- Provide a single evaluation score that reflects the level of detail orientation of the prediction, considering both completeness and specificity.", + }, + { + "role": "user", + "content": "Please evaluate the following video-based question-answer pair:\n\n" + f"Question: {question}\n" + f"Correct Answer: {answer}\n" + f"Predicted Answer: {pred}\n\n" + "Provide your evaluation only as a detail orientation score where the detail orientation score is an integer value between 0 and 5, with 5 indicating the highest level of detail orientation. " + "Please generate the response in the form of a Python dictionary string with keys 'score', where its value is the detail orientation score in INTEGER, not STRING." + "DO NOT PROVIDE ANY OTHER OUTPUT TEXT OR EXPLANATION. Only provide the Python dictionary string. " + "For example, your response should look like this: {''score': 4.8}.", + }, + ] + elif task == "context": + messages = [ + { + "role": "system", + "content": "You are an intelligent chatbot designed for evaluating the contextual understanding of generative outputs for video-based question-answer pairs. " + "Your task is to compare the predicted answer with the correct answer and determine if the generated response aligns with the overall context of the video content. Here's how you can accomplish the task:" + "------" + "##INSTRUCTIONS: " + "- Evaluate whether the predicted answer aligns with the overall context of the video content. 
It should not provide information that is out of context or misaligned.\n" + "- The predicted answer must capture the main themes and sentiments of the video.\n" + "- Consider synonyms or paraphrases as valid matches.\n" + "- Provide your evaluation of the contextual understanding of the prediction compared to the answer.", + }, + { + "role": "user", + "content": "Please evaluate the following video-based question-answer pair:\n\n" + f"Question: {question}\n" + f"Correct Answer: {answer}\n" + f"Predicted Answer: {pred}\n\n" + "Provide your evaluation only as a contextual understanding score where the contextual understanding score is an integer value between 0 and 5, with 5 indicating the highest level of contextual understanding. " + "Please generate the response in the form of a Python dictionary string with keys 'score', where its value is contextual understanding score in INTEGER, not STRING." + "DO NOT PROVIDE ANY OTHER OUTPUT TEXT OR EXPLANATION. Only provide the Python dictionary string. " + "For example, your response should look like this: {''score': 4.8}.", + }, + ] + elif task == "temporal": + messages = [ + { + "role": "system", + "content": "You are an intelligent chatbot designed for evaluating the temporal understanding of generative outputs for video-based question-answer pairs. " + "Your task is to compare the predicted answer with the correct answer and determine if they correctly reflect the temporal sequence of events in the video content. Here's how you can accomplish the task:" + "------" + "##INSTRUCTIONS: " + "- Focus on the temporal consistency between the predicted answer and the correct answer. 
The predicted answer should correctly reflect the sequence of events or details as they are presented in the video content.\n" + "- Consider synonyms or paraphrases as valid matches, but only if the temporal order is maintained.\n" + "- Evaluate the temporal accuracy of the prediction compared to the answer.", + }, + { + "role": "user", + "content": "Please evaluate the following video-based question-answer pair:\n\n" + f"Question: {question}\n" + f"Correct Answer: {answer}\n" + f"Predicted Answer: {pred}\n\n" + "Provide your evaluation only as a temporal accuracy score where the temporal accuracy score is an integer value between 0 and 5, with 5 indicating the highest level of temporal consistency. " + "Please generate the response in the form of a Python dictionary string with keys 'score', where its value is the temporal accuracy score in INTEGER, not STRING." + "DO NOT PROVIDE ANY OTHER OUTPUT TEXT OR EXPLANATION. Only provide the Python dictionary string. " + "For example, your response should look like this: {''score': 4.8}.", + }, + ] + print(messages) + payload = { + "model": GPT_EVAL_MODEL_NAME, + "messages": messages, + "temperature": 0, + "max_tokens": max_tokens, + } + + for attempt in range(retries): + try: + response = requests.post(API_URL, headers=headers, json=payload, timeout=60) + response.raise_for_status() # Raises HTTPError for bad responses + try: + response_data = response.json() # Attempt to parse JSON + except requests.exceptions.JSONDecodeError: + eval_logger.error(f"JSON decode error on attempt {attempt + 1}. 
Response text: {response.text}") + continue # Skip to next retry + content = response_data["choices"][0]["message"]["content"].strip() + if content != "": + return content, response_data["model"] + # Handle HTTP errors separately + except requests.exceptions.HTTPError as e: + eval_logger.error(f"HTTP error on attempt {attempt + 1}: {e}") + # Handle other requests-related errors + except requests.exceptions.RequestException as e: + eval_logger.error(f"Request exception on attempt {attempt + 1}: {e}") + except Exception as e: + eval_logger.error(f"Unexpected error on attempt {attempt + 1}: {e}") + + # Handle other unexpected errors + if attempt < retries - 1: + time.sleep(NUM_SECONDS_TO_SLEEP) + else: # If this was the last attempt, log and return empty + eval_logger.error(f"All {retries} attempts failed. Last error message: {e}") + return "", "" + + return "", "" + + +def get_eval_consistency(question1, question2, answer, pred1, pred2, max_tokens: int, retries: int = 5): + global headers + + messages = [ + { + "role": "system", + "content": "You are an intelligent chatbot designed for evaluating the consistency of generative outputs for similar video-based question-answer pairs. " + "You will be given two very similar questions, a common answer common to both the questions and predicted answers for the two questions ." + "Your task is to compare the predicted answers for two very similar question, with a common correct answer and determine if they are consistent. Here's how you can accomplish the task:" + "------" + "##INSTRUCTIONS: " + "- Focus on the consistency between the two predicted answers and the correct answer. 
Both predicted answers should correspond to the correct answer and to each other, and should not contain any contradictions or significant differences in the conveyed information.\n" + "- Both predicted answers must be consistent with each other and the correct answer, in terms of the information they provide about the video content.\n" + "- Consider synonyms or paraphrases as valid matches, but only if they maintain the consistency in the conveyed information.\n" + "- Evaluate the consistency of the two predicted answers compared to the correct answer.", + }, + { + "role": "user", + "content": "Please evaluate the following video-based question-answer pair:\n\n" + f"Question 1: {question1}\n" + f"Question 2: {question2}\n" + f"Correct Answer: {answer}\n" + f"Predicted Answer to Question 1: {pred1}\n" + f"Predicted Answer to Question 2: {pred2}\n\n" + "Provide your evaluation only as a consistency score where the consistency score is an integer value between 0 and 5, with 5 indicating the highest level of consistency. " + "Please generate the response in the form of a Python dictionary string with keys 'score', where its value is the consistency score in INTEGER, not STRING." + "DO NOT PROVIDE ANY OTHER OUTPUT TEXT OR EXPLANATION. Only provide the Python dictionary string. " + "For example, your response should look like this: {''score': 4.8}.", + }, + ] + print(messages) + payload = { + "model": GPT_EVAL_MODEL_NAME, + "messages": messages, + "temperature": 0, + "max_tokens": max_tokens, + } + + for attempt in range(retries): + try: + response = requests.post(API_URL, headers=headers, json=payload, timeout=60) + response.raise_for_status() # Raises HTTPError for bad responses + try: + response_data = response.json() # Attempt to parse JSON + except requests.exceptions.JSONDecodeError: + eval_logger.error(f"JSON decode error on attempt {attempt + 1}. 
Response text: {response.text}") + continue # Skip to next retry + content = response_data["choices"][0]["message"]["content"].strip() + if content != "": + return content, response_data["model"] + # Handle HTTP errors separately + except requests.exceptions.HTTPError as e: + eval_logger.error(f"HTTP error on attempt {attempt + 1}: {e}") + # Handle other requests-related errors + except requests.exceptions.RequestException as e: + eval_logger.error(f"Request exception on attempt {attempt + 1}: {e}") + except Exception as e: + eval_logger.error(f"Unexpected error on attempt {attempt + 1}: {e}") + + # Handle other unexpected errors + if attempt < retries - 1: + time.sleep(NUM_SECONDS_TO_SLEEP) + else: # If this was the last attempt, log and return empty + eval_logger.error(f"All {retries} attempts failed. Last error message: {e}") + return "", "" + + return "", "" + + +def parse_score(review): + try: + # Convert the string representation of a dictionary to an actual dictionary + review_dict = ast.literal_eval(review) + score = review_dict.get("score", 0) + return float(score) + except SyntaxError as e: + eval_logger.error(f"Syntax error parsing the review string: {e}. Review content: {review}") + return 0 + except ValueError as e: + eval_logger.error(f"Value error parsing the review string: {e}. Review content: {review}") + return 0 + except Exception as e: + eval_logger.error(f"Unexpected error parsing the review string: {e}. 
Review content: {review}") + return 0 + + +def videochatgpt_print_scores(eval_file_path, args): + # Load the predictions from the result file + with open(eval_file_path, "r") as file: + evaluated_list = json.load(file) + + score_file_name = "scores.json" + path = file_utils.generate_submission_file(score_file_name, args) + + # Compute average score + total_score = 0 + + # Iterate over the results to sum scores + for result_dict in evaluated_list: + total_score += result_dict["score"] + + # Calculate accuracy and average score + average_score = total_score / len(evaluated_list) if evaluated_list else 0 + + # Print the results + print(f"Average Score: {average_score}") + + # Write the processed data to the scores file + with open(path, "w") as f: + json.dump({"average_score": average_score}, f, indent=4) + + eval_logger.info(f"Score file saved to {path}") + + +def videochatgpt_gpt_eval(result_file_path, args, task): + """ + Process the result file containing predictions, score them using GPT, + and save the results with added scores and correctness fields to a new file. 
+ + Args: + result_file_path: path to the JSON file with results to be evaluated + """ + + eval_file_name = "gpt_eval_result.json" + eval_file_path = file_utils.generate_submission_file(eval_file_name, args) + + # Load the predictions from the result file + with open(result_file_path, "r") as file: + result_list = json.load(file) + + evaluated_results = [] + + # Load the predictions from the result file + with open(result_file_path, "r") as file: + result_list = json.load(file) + + # Process each result to generate scores + # If task is consistency (2 questions with 2 answers) + if task == "consistency": + for data_dict in result_list: + try: + question1 = data_dict.get("Q1", "") + question2 = data_dict.get("Q2", "") + answer = data_dict.get("A", "") + pred1 = data_dict.get("pred1", "") + pred2 = data_dict.get("pred2", "") + + # Assume get_eval returns a review and the model name, and parse_score parses this review + review, model_name = get_eval_consistency(question1, question2, answer, pred1, pred2, 64) + score = parse_score(review) + except Exception as e: + eval_logger.error(f"Error for Video Name: {data_dict.get('video_name', 'Unknown')}: {e}") + review = "Failed to Get a Proper Review." 
+ model_name = "Failed Request" + score = 0 + + # Update the dictionary with the new entries + updated_dict = { + "video_name": data_dict["video_name"], + "score": score, + "Q1": question1, + "Q2": question2, + "A": answer, + "pred1": pred1, + "pred2": pred2, + } + evaluated_results.append(updated_dict) + # If task is correctness, context, detail, temporal (1 question with 1 answer) + else: + # Process each result to generate scores + for data_dict in result_list: + try: + question = data_dict.get("Q", "") + answer = data_dict.get("A", "") + pred = data_dict.get("pred", "") + + # Assume get_eval returns a review and the model name, and parse_score parses this review + review, model_name = get_eval_generic(question, answer, pred, task, 64) + score = parse_score(review) + except Exception as e: + eval_logger.error(f"Error for Video Name: {data_dict.get('video_name', 'Unknown')}: {e}") + review = "Failed to Get a Proper Review." + model_name = "Failed Request" + score = 0 + + # Update the dictionary with the new entries + updated_dict = { + "video_name": data_dict["video_name"], + "score": score, + "Q": question, + "A": answer, + "pred": pred, + } + evaluated_results.append(updated_dict) + + # Save the evaluated results to a new JSON file + with open(eval_file_path, "w") as f: + json.dump(evaluated_results, f, indent=4) + + return eval_file_path + + +# Factory into different aggregate +def videochatgpt_aggregate_correctness(results, args): + result_file_path = videochatgpt_aggregate_submissions(results, args, "correctness") + eval_file_path = videochatgpt_gpt_eval(result_file_path, args, "correctness") + videochatgpt_print_scores(eval_file_path, args) + + +def videochatgpt_aggregate_detailed_orientation(results, args): + result_file_path = videochatgpt_aggregate_submissions(results, args, "detailed_orientation") + eval_file_path = videochatgpt_gpt_eval(result_file_path, args, "detailed_orientation") + videochatgpt_print_scores(eval_file_path, args) + + +def 
videochatgpt_aggregate_context(results, args): + result_file_path = videochatgpt_aggregate_submissions(results, args, "context") + eval_file_path = videochatgpt_gpt_eval(result_file_path, args, "context") + videochatgpt_print_scores(eval_file_path, args) + + +def videochatgpt_aggregate_temporal(results, args): + result_file_path = videochatgpt_aggregate_submissions(results, args, "temporal") + eval_file_path = videochatgpt_gpt_eval(result_file_path, args, "temporal") + videochatgpt_print_scores(eval_file_path, args) + + +def videochatgpt_aggregate_consistency(results, args): + result_file_path = videochatgpt_aggregate_submissions_consistency(results, args, "consistency") + eval_file_path = videochatgpt_gpt_eval(result_file_path, args, "consistency") + videochatgpt_print_scores(eval_file_path, args) diff --git a/lmms_eval/tasks/videochatgpt/videochatgpt_consistency.yaml b/lmms_eval/tasks/videochatgpt/videochatgpt_consistency.yaml new file mode 100755 index 00000000..84e6a8c0 --- /dev/null +++ b/lmms_eval/tasks/videochatgpt/videochatgpt_consistency.yaml @@ -0,0 +1,24 @@ +dataset_name: "Consistency" +task: "videochatgpt_consistency" +test_split: test +output_type: generate_until +doc_to_visual: !function utils.videochatgpt_doc_to_visual +doc_to_text: !function utils.videochatgpt_doc_to_text_consistency +doc_to_target: !function utils.videochatgpt_doc_to_answer +process_results: !function utils.videochatgpt_process_results_consistency +metric_list: + - metric: submission + aggregation: !function utils.videochatgpt_aggregate_consistency + higher_is_better: true +include: _default_template_yaml + +generation_kwargs: + until: + - "ASSISTANT:" + image_aspect_ratio: original + max_new_tokens: 1024 + temperature: 0 + top_p: 0 + num_beams: 1 + do_sample: false + \ No newline at end of file diff --git a/lmms_eval/tasks/videochatgpt/videochatgpt_context.yaml b/lmms_eval/tasks/videochatgpt/videochatgpt_context.yaml new file mode 100755 index 00000000..01d8a259 --- /dev/null +++ 
b/lmms_eval/tasks/videochatgpt/videochatgpt_context.yaml @@ -0,0 +1,13 @@ +dataset_name: "Generic" +task: "videochatgpt_context" +test_split: test +output_type: generate_until +doc_to_visual: !function utils.videochatgpt_doc_to_visual +doc_to_text: !function utils.videochatgpt_doc_to_text +doc_to_target: !function utils.videochatgpt_doc_to_answer +process_results: !function utils.videochatgpt_process_results_generic +metric_list: + - metric: submission + aggregation: !function utils.videochatgpt_aggregate_context + higher_is_better: true +include: _default_template_yaml diff --git a/lmms_eval/tasks/videochatgpt/videochatgpt_correctness.yaml b/lmms_eval/tasks/videochatgpt/videochatgpt_correctness.yaml new file mode 100755 index 00000000..11fedc27 --- /dev/null +++ b/lmms_eval/tasks/videochatgpt/videochatgpt_correctness.yaml @@ -0,0 +1,14 @@ +dataset_name: "Generic" +task: "videochatgpt_correctness" +test_split: test +output_type: generate_until +doc_to_visual: !function utils.videochatgpt_doc_to_visual +doc_to_text: !function utils.videochatgpt_doc_to_text +doc_to_target: !function utils.videochatgpt_doc_to_answer +process_results: !function utils.videochatgpt_process_results_generic +metric_list: + - metric: submission + aggregation: !function utils.videochatgpt_aggregate_correctness + higher_is_better: true +include: _default_template_yaml + diff --git a/lmms_eval/tasks/videochatgpt/videochatgpt_detailed_orientation.yaml b/lmms_eval/tasks/videochatgpt/videochatgpt_detailed_orientation.yaml new file mode 100755 index 00000000..b93c0ecd --- /dev/null +++ b/lmms_eval/tasks/videochatgpt/videochatgpt_detailed_orientation.yaml @@ -0,0 +1,13 @@ +dataset_name: "Generic" +task: "videochatgpt_detailed_orientation" +test_split: test +output_type: generate_until +doc_to_visual: !function utils.videochatgpt_doc_to_visual +doc_to_text: !function utils.videochatgpt_doc_to_text +doc_to_target: !function utils.videochatgpt_doc_to_answer +process_results: !function 
utils.videochatgpt_process_results_generic +metric_list: + - metric: submission + aggregation: !function utils.videochatgpt_aggregate_detailed_orientation + higher_is_better: true +include: _default_template_yaml diff --git a/lmms_eval/tasks/videochatgpt/videochatgpt_temporal.yaml b/lmms_eval/tasks/videochatgpt/videochatgpt_temporal.yaml new file mode 100755 index 00000000..1bf336ce --- /dev/null +++ b/lmms_eval/tasks/videochatgpt/videochatgpt_temporal.yaml @@ -0,0 +1,23 @@ +dataset_name: "Temporal" +task: "videochatgpt_temporal" +test_split: test +output_type: generate_until +doc_to_visual: !function utils.videochatgpt_doc_to_visual +doc_to_text: !function utils.videochatgpt_doc_to_text +doc_to_target: !function utils.videochatgpt_doc_to_answer +process_results: !function utils.videochatgpt_process_results_temporal +metric_list: + - metric: submission + aggregation: !function utils.videochatgpt_aggregate_temporal + higher_is_better: true +include: _default_template_yaml + +generation_kwargs: + until: + - "ASSISTANT:" + image_aspect_ratio: original + max_new_tokens: 1024 + temperature: 0 + top_p: 0 + num_beams: 1 + do_sample: false diff --git a/pyproject.toml b/pyproject.toml index b66d5e9f..398b35ff 100755 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,9 +36,11 @@ dependencies = [ "sqlitedict", "torch>=2.1.0", # to enable sdpa mode for running 34B model on one 80GB GPU "openai>=1.0.0", + "google-generativeai", "pycocoevalcap", "tqdm-multiprocess", "transformers>=4.37.2", + "transformers-stream-generator", "zstandard", "pillow", "pyyaml", @@ -50,12 +52,12 @@ dependencies = [ "hf_transfer", "tenacity", "wandb>=0.16.0", - "transformers-stream-generator", "tiktoken", "pre-commit", "pydantic", "packaging", "decord", + "zss", ] [tool.setuptools.packages.find] diff --git a/tools/make_activitynetqa.ipynb b/tools/make_activitynetqa.ipynb new file mode 100755 index 00000000..efbef00e --- /dev/null +++ b/tools/make_activitynetqa.ipynb @@ -0,0 +1,120 @@ +{ + "cells": [ + { 
+ "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# This notebook shows how to convert a dataset into the correct Hugging Face format: proper parquet files that can be previewed on the Hugging Face dataset hub.\n", + "# We use the dataset \"Otter-AI/MMVet\" as the example and convert it to that format." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from datasets import load_dataset\n", + "\n", + "data_path = \"Otter-AI/MMVet\"\n", + "df = load_dataset(data_path, split=\"test\").to_pandas()\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from datasets import Dataset, Features, Value, Image\n", + "import pandas as pd\n", + "\n", + "# Define the features for the dataset\n", + "features = Features(\n", + " {\n", + " \"question_id\": Value(dtype=\"string\"),\n", + " \"image\": Image(),\n", + " \"question\": Value(dtype=\"string\"),\n", + " \"answer\": Value(dtype=\"string\"),\n", + " \"image_source\": Value(dtype=\"string\"),\n", + " \"capability\": Value(dtype=\"string\"),\n", + " # Add other fields as necessary\n", + " }\n", + ")\n", + "\n", + "df_items = {\n", + " \"question_id\": [],\n", + " \"image\": [],\n", + " \"question\": [],\n", + " \"answer\": [],\n", + " \"image_source\": [],\n", + " \"capability\": [],\n", + "}\n", + "\n", + "for idx, row in df.iterrows():\n", + " df_items[\"question_id\"].append(str(row[\"id\"]))\n", + " image = {\"bytes\": row[\"images\"][0][\"bytes\"], \"path\": \"\"}\n", + " df_items[\"image\"].append(image)\n", + " df_items[\"question\"].append(str(row[\"instruction\"]))\n", + " df_items[\"answer\"].append(str(row[\"answer\"]))\n", + " df_items[\"image_source\"].append(str(row[\"image_source\"]))\n", + " df_items[\"capability\"].append(\",\".join(list(row[\"capability\"])))\n", + " # Add other fields as necessary\n", +
"df_items = pd.DataFrame(df_items)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df_items.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dataset = Dataset.from_pandas(df_items, features=features)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "hub_dataset_path = \"lmms-lab/MMVet\"\n", + "dataset.push_to_hub(repo_id=hub_dataset_path, split=\"test\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "lmms-eval", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.18" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}