Skip to content

Commit

Permalink
[WIP] adding mmbench dev evaluation (#75)
Browse files Browse the repository at this point in the history
* WIP

* Update GPT evaluation model name and sys prompt

* 🛠️ Scale accuracy to percentage

The accuracy value is now multiplied by 100 in the aggregation function so that it is reported as a percentage. In the evaluation process, the `math` module is now imported, and progress is logged every 100 evaluations instead of every 10, which reduces log verbosity and prevents potential logging overflow. NaN values are now handled so that 'default_value' is set when data is missing, avoiding errors in split, category, and l2-category assignments. Finally, reporting of categorical and l2-categorical accuracies is streamlined through a new `calculate_hit_rates` function, improving code readability and maintenance.

Issue refs: #1427, #1533

* Update GPT evaluation model name and API configuration

* Refactor MMBench_Evaluator class to handle missing columns

* Add print statements for detailed results in MMBench-CN(CC), MMBench-CN(Dev), and MMBench-EN(Dev) evaluations

* Refactor MMBench-CN and MMBench-EN evaluation functions

* 🔄 Refactor result processing and logging logic

- Simplified the result processing functions across different utility modules (`cc_utils.py`, `cn_utils.py`, `en_utils.py`) to unify the handling of multiple-choice options. Now, all options ("A" to "E") are dynamically added to the result data, and default to "nan" if not provided in the document.
- Removed redundant keys directly from the process results dict creation to avoid clutter and align with the new dynamic addition of options.
- In `mmbench_evals.py`, removed the unnecessary check for all splits being 'dev' and streamlined the evaluation loop by eliminating the progress bar (tqdm) for a cleaner log output.
- Commented-out code and verbose logging during evaluation, which may have interfered with performance, have been removed for a more efficient and less intrusive logging experience.

This cleanup reduces redundancy in the codebase and improves evaluation performance.

Refs #2045

---------

Co-authored-by: Bo Li <[email protected]>
(cherry picked from commit a19278c)
  • Loading branch information
Bo Li committed Apr 4, 2024
1 parent 70cc773 commit 22a4958
Show file tree
Hide file tree
Showing 10 changed files with 439 additions and 19 deletions.
40 changes: 37 additions & 3 deletions lmms_eval/tasks/mmbench/cc_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from lmms_eval.tasks.mmbench.mmbench_evals import MMBench_Evaluator
from lmms_eval.tasks._task_utils.file_utils import generate_submission_file

with open(Path(__file__).parent / "mmbench_cn.yaml", "r") as f:
with open(Path(__file__).parent / "mmbench.yaml", "r") as f:
raw_data = f.readlines()
safe_data = []
for i, line in enumerate(raw_data):
Expand All @@ -19,7 +19,18 @@

config = yaml.safe_load("".join(safe_data))

mmbench_evaluator = MMBench_Evaluator(sys_prompt=config["metadata"]["sys_prompt"])
# GPT model name read from the task YAML metadata; passed to the evaluator as model_version.
GPT_EVAL_MODEL_NAME = config["metadata"]["gpt_eval_model_name"]
# Backend selector; falls back to "openai" when the API_TYPE environment variable is unset.
API_TYPE = os.getenv("API_TYPE", "openai")

if API_TYPE == "openai":
    API_URL = os.getenv("OPENAI_API_URL", "https://api.openai.com/v1/chat/completions")
    API_KEY = os.getenv("OPENAI_API_KEY", "YOUR_API_KEY")
elif API_TYPE == "azure":
    API_URL = os.getenv("AZURE_ENDPOINT", "https://api.cognitive.microsoft.com/sts/v1.0/issueToken")
    API_KEY = os.getenv("AZURE_API_KEY", "YOUR_API_KEY")
else:
    # Without this branch API_URL / API_KEY stay unbound and the evaluator
    # construction below fails with an opaque NameError at import time.
    raise ValueError(f"Unsupported API_TYPE {API_TYPE!r}; expected 'openai' or 'azure'.")


mmbench_evaluator = MMBench_Evaluator(sys_prompt=config["metadata"]["sys_prompt"], API_KEY=API_KEY, API_URL=API_URL, model_version=GPT_EVAL_MODEL_NAME)


def mmbench_doc_to_visual(doc):
Expand Down Expand Up @@ -52,21 +63,44 @@ def mmbench_cn_cc_doc_to_text(doc, model_specific_prompt_kwargs=None):
def mmbench_cn_cc_process_results(doc, results):
model_response = results[0].strip()
data = {
"gpt_eval_score": {
"index": doc["index"],
"question": doc["question"],
"answer": doc["answer"],
"prediction": model_response,
"source": doc["source"],
"category": doc["category"],
},
"submission": {
"index": doc["index"],
"question": doc["question"],
"answer": doc["answer"],
"prediction": model_response,
"source": doc["source"],
"category": doc["category"],
}
},
}
option_candidate = ["A", "B", "C", "D", "E"]
for c in option_candidate:
data["submission"][c] = doc.get(c, "nan")
data["gpt_eval_score"][c] = doc.get(c, "nan")
return data


def mmbench_cn_cc_aggregate_dev_results_eval(results, args):
    """Score MMBench-CN(CC) results with the GPT evaluator and dump the breakdown to JSON.

    Args:
        results: Per-sample entries handed to ``mmbench_evaluator.eval_result``.
        args: Forwarded to ``generate_submission_file`` to resolve the output path.

    Returns:
        The overall accuracy multiplied by 100.
    """
    print("============= MMBench-CN(CC) Detailed Results =============")
    overall, per_category, per_l2_category = mmbench_evaluator.eval_result(results, eval_method="openai")
    output_path = generate_submission_file("mmbench_cn_cc_results.json", args)
    # Persist all three accuracy views; only the overall figure is returned as the metric.
    summary = dict(
        overall_acc=overall,
        category_acc=per_category,
        l2_category_acc=per_l2_category,
    )
    with open(output_path, "w") as handle:
        json.dump(summary, handle)
    return overall * 100


def mmbench_cn_cc_aggregate_results(results, args):
df = pd.DataFrame(results)
file = generate_submission_file("mmbench_cn_cc_results.xlsx", args)
Expand Down
36 changes: 33 additions & 3 deletions lmms_eval/tasks/mmbench/cn_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,9 @@

eval_logger = logging.getLogger("lmms-eval")
from lmms_eval.tasks.mmbench.mmbench_evals import MMBench_Evaluator
from lmms_eval.tasks._task_utils.file_utils import generate_submission_file

with open(Path(__file__).parent / "mmbench_cn.yaml", "r") as f:
with open(Path(__file__).parent / "mmbench.yaml", "r") as f:
raw_data = f.readlines()
safe_data = []
for i, line in enumerate(raw_data):
Expand All @@ -19,7 +20,18 @@

config = yaml.safe_load("".join(safe_data))

mmbench_evaluator = MMBench_Evaluator(sys_prompt=config["metadata"]["sys_prompt"])
# GPT model name read from the task YAML metadata; passed to the evaluator as model_version.
GPT_EVAL_MODEL_NAME = config["metadata"]["gpt_eval_model_name"]
# Backend selector; falls back to "openai" when the API_TYPE environment variable is unset.
API_TYPE = os.getenv("API_TYPE", "openai")

if API_TYPE == "openai":
    API_URL = os.getenv("OPENAI_API_URL", "https://api.openai.com/v1/chat/completions")
    API_KEY = os.getenv("OPENAI_API_KEY", "YOUR_API_KEY")
elif API_TYPE == "azure":
    API_URL = os.getenv("AZURE_ENDPOINT", "https://api.cognitive.microsoft.com/sts/v1.0/issueToken")
    API_KEY = os.getenv("AZURE_API_KEY", "YOUR_API_KEY")
else:
    # Without this branch API_URL / API_KEY stay unbound and the evaluator
    # construction below fails with an opaque NameError at import time.
    raise ValueError(f"Unsupported API_TYPE {API_TYPE!r}; expected 'openai' or 'azure'.")


mmbench_evaluator = MMBench_Evaluator(sys_prompt=config["metadata"]["sys_prompt"], API_KEY=API_KEY, API_URL=API_URL, model_version=GPT_EVAL_MODEL_NAME)


def mmbench_doc_to_visual(doc):
Expand Down Expand Up @@ -55,6 +67,17 @@ def mmbench_doc_to_text(doc, model_specific_prompt_kwargs=None):
def mmbench_process_results(doc, results):
model_response = results[0].strip()
data = {
"gpt_eval_score": {
"index": doc["index"],
"question": doc["question"],
"answer": doc["answer"],
"prediction": model_response,
"hint": doc["hint"],
"source": doc["source"],
"split": doc["split"],
"category": doc["category"],
"L2-category": doc["L2-category"],
},
"submission": {
"index": doc["index"],
"question": doc["question"],
Expand All @@ -65,14 +88,21 @@ def mmbench_process_results(doc, results):
"split": doc["split"],
"category": doc["category"],
"L2-category": doc["L2-category"],
}
},
}
option_candidate = ["A", "B", "C", "D", "E"]
for c in option_candidate:
data["submission"][c] = doc.get(c, "nan")
data["gpt_eval_score"][c] = doc.get(c, "nan")
return data


def mmbench_aggregate_dev_results_eval(results, args):
    """Score MMBench-CN(Dev) results with the GPT evaluator and return a percentage.

    Args:
        results: Per-sample entries handed to ``mmbench_evaluator.eval_result``.
        args: Unused here; accepted to match the aggregation-function signature.

    Returns:
        The overall accuracy multiplied by 100.
    """
    print("============= MMBench-CN(Dev) Detailed Results =============")
    # The CC and EN aggregators unpack eval_result into (overall, per-category,
    # per-L2-category). Multiplying the whole return value by 100 would repeat
    # the tuple instead of scaling the overall accuracy, so unpack it here too.
    overall_acc, _category_acc, _l2_category_acc = mmbench_evaluator.eval_result(results, eval_method="openai")
    return overall_acc * 100


def mmbench_aggregate_dev_results(results, args):
df = pd.DataFrame(results)
excel_write_path = generate_submission_file("mmbench_cn_dev_results.xlsx", args)
Expand Down
45 changes: 41 additions & 4 deletions lmms_eval/tasks/mmbench/en_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from lmms_eval.tasks.mmbench.mmbench_evals import MMBench_Evaluator
from lmms_eval.tasks._task_utils.file_utils import generate_submission_file

with open(Path(__file__).parent / "mmbench_en.yaml", "r") as f:
with open(Path(__file__).parent / "mmbench.yaml", "r") as f:
raw_data = f.readlines()
safe_data = []
for i, line in enumerate(raw_data):
Expand All @@ -19,7 +19,18 @@

config = yaml.safe_load("".join(safe_data))

mmbench_evaluator = MMBench_Evaluator(sys_prompt=config["metadata"]["sys_prompt"])
# GPT model name read from the task YAML metadata; passed to the evaluator as model_version.
GPT_EVAL_MODEL_NAME = config["metadata"]["gpt_eval_model_name"]
# Backend selector; falls back to "openai" when the API_TYPE environment variable is unset.
API_TYPE = os.getenv("API_TYPE", "openai")

if API_TYPE == "openai":
    API_URL = os.getenv("OPENAI_API_URL", "https://api.openai.com/v1/chat/completions")
    API_KEY = os.getenv("OPENAI_API_KEY", "YOUR_API_KEY")
elif API_TYPE == "azure":
    API_URL = os.getenv("AZURE_ENDPOINT", "https://api.cognitive.microsoft.com/sts/v1.0/issueToken")
    API_KEY = os.getenv("AZURE_API_KEY", "YOUR_API_KEY")
else:
    # Without this branch API_URL / API_KEY stay unbound and the evaluator
    # construction below fails with an opaque NameError at import time.
    raise ValueError(f"Unsupported API_TYPE {API_TYPE!r}; expected 'openai' or 'azure'.")


mmbench_evaluator = MMBench_Evaluator(sys_prompt=config["metadata"]["sys_prompt"], API_KEY=API_KEY, API_URL=API_URL, model_version=GPT_EVAL_MODEL_NAME)


def mmbench_doc_to_visual(doc):
Expand Down Expand Up @@ -55,6 +66,17 @@ def mmbench_doc_to_text(doc, model_specific_prompt_kwargs=None):
def mmbench_process_results(doc, results):
model_response = results[0].strip()
data = {
"gpt_eval_score": {
"index": doc["index"],
"question": doc["question"],
"answer": doc["answer"],
"prediction": model_response,
"hint": doc["hint"],
"source": doc["source"],
"split": doc["split"],
"category": doc["category"],
"L2-category": doc["L2-category"],
},
"submission": {
"index": doc["index"],
"question": doc["question"],
Expand All @@ -65,15 +87,30 @@ def mmbench_process_results(doc, results):
"split": doc["split"],
"category": doc["category"],
"L2-category": doc["L2-category"],
}
},
}
option_candidate = ["A", "B", "C", "D", "E"]
for c in option_candidate:
data["submission"][c] = doc.get(c, "nan")
data["gpt_eval_score"][c] = doc.get(c, "nan")
return data


def mmbench_aggregate_dev_results(results, args):
def mmbench_aggregate_dev_results_eval(results, args):
    """Score MMBench-EN(Dev) results with the GPT evaluator and dump the breakdown to JSON.

    Args:
        results: Per-sample entries handed to ``mmbench_evaluator.eval_result``.
        args: Forwarded to ``generate_submission_file`` to resolve the output path.

    Returns:
        The overall accuracy multiplied by 100.
    """
    print("============= MMBench-EN(Dev) Detailed Results =============")
    overall, per_category, per_l2_category = mmbench_evaluator.eval_result(results, eval_method="openai")
    output_path = generate_submission_file("mmbench_en_dev_results.json", args)
    # Persist all three accuracy views; only the overall figure is returned as the metric.
    summary = dict(
        overall_acc=overall,
        category_acc=per_category,
        l2_category_acc=per_l2_category,
    )
    with open(output_path, "w") as handle:
        json.dump(summary, handle)
    return overall * 100


def mmbench_aggregate_dev_results_submission(results, args):
df = pd.DataFrame(results)
excel_write_path = generate_submission_file("mmbench_en_dev_results.xlsx", args)
with pd.ExcelWriter(excel_write_path) as writer:
Expand Down
6 changes: 5 additions & 1 deletion lmms_eval/tasks/mmbench/mmbench.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,8 @@ task:
- mmbench_en_test
- mmbench_cn_dev
- mmbench_cn_test
- mmbench_cn_cc
- mmbench_cn_cc
metadata:
version: 0.0
sys_prompt: "There are several options:"
gpt_eval_model_name: "gpt-3.5-turbo-0613"
6 changes: 4 additions & 2 deletions lmms_eval/tasks/mmbench/mmbench_cc.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,14 @@ generation_kwargs:
do_sample: false
process_results: !function cc_utils.mmbench_cn_cc_process_results
metric_list:
- metric: gpt_eval_score
aggregation: !function cc_utils.mmbench_cn_cc_aggregate_dev_results_eval
higher_is_better: true
- metric: submission
aggregation: !function cc_utils.mmbench_cn_cc_aggregate_results
metadata:
version: 0.0
gpt_eval_model_name: "gpt-3.5-turbo"
quick_extract: true
gpt_eval_model_name: "gpt-3.5-turbo-0613"

model_specific_prompt_kwargs:
default:
Expand Down
5 changes: 2 additions & 3 deletions lmms_eval/tasks/mmbench/mmbench_cn.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,5 @@ task:
- mmbench_cn_cc
metadata:
version: 0.0
gpt_eval_model_name: "gpt-3.5-turbo"
quick_extract: true
sys_prompt: "有如下几个选项:"
gpt_eval_model_name: "gpt-3.5-turbo-0613"
sys_prompt: "有如下几个选项:"
3 changes: 3 additions & 0 deletions lmms_eval/tasks/mmbench/mmbench_cn_dev.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
task: "mmbench_cn_dev"
test_split: "dev"
metric_list:
- metric: gpt_eval_score
aggregation: !function cn_utils.mmbench_aggregate_dev_results_eval
higher_is_better: true
- metric: submission
higher_is_better: true
aggregation: !function cn_utils.mmbench_aggregate_dev_results
Expand Down
1 change: 1 addition & 0 deletions lmms_eval/tasks/mmbench/mmbench_en.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,4 @@ task:
metadata:
version: 0.0
sys_prompt: "There are several options:"
gpt_eval_model_name: "gpt-3.5-turbo-0613"
7 changes: 5 additions & 2 deletions lmms_eval/tasks/mmbench/mmbench_en_dev.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@ task: "mmbench_en_dev"
test_split: dev
include: _default_template_mmbench_en_yaml
metric_list:
- metric: submission
aggregation: !function en_utils.mmbench_aggregate_dev_results
- metric: gpt_eval_score
aggregation: !function en_utils.mmbench_aggregate_dev_results_eval
higher_is_better: true
- metric: submission
aggregation: !function en_utils.mmbench_aggregate_dev_results_submission
higher_is_better: true
Loading

0 comments on commit 22a4958

Please sign in to comment.