Merge pull request EvolvingLMMs-Lab#46 from EvolvingLMMs-Lab/dev/public/mmbench

[WIP] adding mmbench dev evaluation (EvolvingLMMs-Lab#75)
pufanyi authored Apr 7, 2024
2 parents 234b6cb + fce4a5f commit 61a33cd
Showing 10 changed files with 447 additions and 19 deletions.
40 changes: 37 additions & 3 deletions lmms_eval/tasks/mmbench/cc_utils.py
@@ -9,7 +9,7 @@
from lmms_eval.tasks.mmbench.mmbench_evals import MMBench_Evaluator
from lmms_eval.tasks._task_utils.file_utils import generate_submission_file

with open(Path(__file__).parent / "mmbench_cn.yaml", "r") as f:
with open(Path(__file__).parent / "mmbench.yaml", "r") as f:
raw_data = f.readlines()
safe_data = []
for i, line in enumerate(raw_data):
@@ -19,7 +19,18 @@

config = yaml.safe_load("".join(safe_data))

mmbench_evaluator = MMBench_Evaluator(sys_prompt=config["metadata"]["sys_prompt"])
GPT_EVAL_MODEL_NAME = config["metadata"]["gpt_eval_model_name"]
API_TYPE = os.getenv("API_TYPE", "openai")

if API_TYPE == "openai":
API_URL = os.getenv("OPENAI_API_URL", "https://api.openai.com/v1/chat/completions")
API_KEY = os.getenv("OPENAI_API_KEY", "YOUR_API_KEY")
elif API_TYPE == "azure":
API_URL = os.getenv("AZURE_ENDPOINT", "https://api.cognitive.microsoft.com/sts/v1.0/issueToken")
API_KEY = os.getenv("AZURE_API_KEY", "YOUR_API_KEY")


mmbench_evaluator = MMBench_Evaluator(sys_prompt=config["metadata"]["sys_prompt"], API_KEY=API_KEY, API_URL=API_URL, model_version=GPT_EVAL_MODEL_NAME)


def mmbench_doc_to_visual(doc):
@@ -52,21 +63,44 @@ def mmbench_cn_cc_doc_to_text(doc, model_specific_prompt_kwargs=None):
def mmbench_cn_cc_process_results(doc, results):
model_response = results[0].strip()
data = {
"gpt_eval_score": {
"index": doc["index"],
"question": doc["question"],
"answer": doc["answer"],
"prediction": model_response,
"source": doc["source"],
"category": doc["category"],
},
"submission": {
"index": doc["index"],
"question": doc["question"],
"answer": doc["answer"],
"prediction": model_response,
"source": doc["source"],
"category": doc["category"],
}
},
}
option_candidate = ["A", "B", "C", "D", "E"]
for c in option_candidate:
data["submission"][c] = doc.get(c, "nan")
data["gpt_eval_score"][c] = doc.get(c, "nan")
return data


def mmbench_cn_cc_aggregate_dev_results_eval(results, args):
print(f"============= MMBench-CN(CC) Detailed Results =============")
overall_acc, category_acc, l2_category_acc = mmbench_evaluator.eval_result(results, eval_method="openai")
file = generate_submission_file("mmbench_cn_cc_results.json", args)
details_info = {
"overall_acc": overall_acc,
"category_acc": category_acc,
"l2_category_acc": l2_category_acc,
}
with open(file, "w") as f:
json.dump(details_info, f)
return overall_acc * 100


def mmbench_cn_cc_aggregate_results(results, args):
df = pd.DataFrame(results)
file = generate_submission_file("mmbench_cn_cc_results.xlsx", args)
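
For reference, this is the shape of the per-document payload that mmbench_cn_cc_process_results now returns for both metrics. The sketch below uses a hypothetical doc record and model response (the field values are made up; only the keys follow the code above). Both the gpt_eval_score and submission entries carry the same fields, and any option letter missing from the record falls back to the string "nan".

# Illustrative sketch only: a hypothetical MMBench-CN (CC) record and model answer,
# shaped like the dicts built in mmbench_cn_cc_process_results above.
doc = {
    "index": 42,
    "question": "Which option best matches the image?",
    "answer": "A",
    "source": "ccbench",
    "category": "traditional_show",
    "A": "option one", "B": "option two", "C": "option three", "D": "option four",
}
model_response = "A"

entry = {
    "index": doc["index"],
    "question": doc["question"],
    "answer": doc["answer"],
    "prediction": model_response,
    "source": doc["source"],
    "category": doc["category"],
}
# Options absent from the record fall back to the string "nan", as in the task code.
for option in ["A", "B", "C", "D", "E"]:
    entry[option] = doc.get(option, "nan")

data = {"gpt_eval_score": dict(entry), "submission": dict(entry)}
print(data["gpt_eval_score"]["E"])  # -> "nan"
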
44 changes: 41 additions & 3 deletions lmms_eval/tasks/mmbench/cn_utils.py
@@ -8,8 +8,9 @@

eval_logger = logging.getLogger("lmms-eval")
from lmms_eval.tasks.mmbench.mmbench_evals import MMBench_Evaluator
from lmms_eval.tasks._task_utils.file_utils import generate_submission_file

with open(Path(__file__).parent / "mmbench_cn.yaml", "r") as f:
with open(Path(__file__).parent / "mmbench.yaml", "r") as f:
raw_data = f.readlines()
safe_data = []
for i, line in enumerate(raw_data):
@@ -19,7 +20,18 @@

config = yaml.safe_load("".join(safe_data))

mmbench_evaluator = MMBench_Evaluator(sys_prompt=config["metadata"]["sys_prompt"])
GPT_EVAL_MODEL_NAME = config["metadata"]["gpt_eval_model_name"]
API_TYPE = os.getenv("API_TYPE", "openai")

if API_TYPE == "openai":
API_URL = os.getenv("OPENAI_API_URL", "https://api.openai.com/v1/chat/completions")
API_KEY = os.getenv("OPENAI_API_KEY", "YOUR_API_KEY")
elif API_TYPE == "azure":
API_URL = os.getenv("AZURE_ENDPOINT", "https://api.cognitive.microsoft.com/sts/v1.0/issueToken")
API_KEY = os.getenv("AZURE_API_KEY", "YOUR_API_KEY")


mmbench_evaluator = MMBench_Evaluator(sys_prompt=config["metadata"]["sys_prompt"], API_KEY=API_KEY, API_URL=API_URL, model_version=GPT_EVAL_MODEL_NAME)


def mmbench_doc_to_visual(doc):
@@ -55,6 +67,17 @@ def mmbench_doc_to_text(doc, model_specific_prompt_kwargs=None):
def mmbench_process_results(doc, results):
model_response = results[0].strip()
data = {
"gpt_eval_score": {
"index": doc["index"],
"question": doc["question"],
"answer": doc["answer"],
"prediction": model_response,
"hint": doc["hint"],
"source": doc["source"],
"split": doc["split"],
"category": doc["category"],
"L2-category": doc["L2-category"],
},
"submission": {
"index": doc["index"],
"question": doc["question"],
@@ -65,14 +88,29 @@ def mmbench_process_results(doc, results):
"split": doc["split"],
"category": doc["category"],
"L2-category": doc["L2-category"],
}
},
}
option_candidate = ["A", "B", "C", "D", "E"]
for c in option_candidate:
data["submission"][c] = doc.get(c, "nan")
data["gpt_eval_score"][c] = doc.get(c, "nan")
return data


def mmbench_aggregate_dev_results_eval(results, args):
print(f"============= MMBench-CN(Dev) Detailed Results =============")
overall_acc, category_acc, l2_category_acc = mmbench_evaluator.eval_result(results, eval_method="openai")
file = generate_submission_file("mmbench_cn_dev_results.json", args)
details_info = {
"overall_acc": overall_acc,
"category_acc": category_acc,
"l2_category_acc": l2_category_acc,
}
with open(file, "w") as f:
json.dump(details_info, f)
return overall_acc * 100


def mmbench_aggregate_dev_results(results, args):
df = pd.DataFrame(results)
excel_write_path = generate_submission_file("mmbench_cn_dev_results.xlsx", args)
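
All three utils share the same API-selection branch shown above. Below is a minimal standalone sketch of that logic, assuming only the two API_TYPE values handled in the diff; the resolve_api helper name is illustrative and not part of the repository.

import os

def resolve_api(api_type=None):
    """Pick the chat-completions endpoint and key from environment variables."""
    api_type = api_type or os.getenv("API_TYPE", "openai")
    if api_type == "openai":
        return (
            os.getenv("OPENAI_API_URL", "https://api.openai.com/v1/chat/completions"),
            os.getenv("OPENAI_API_KEY", "YOUR_API_KEY"),
        )
    if api_type == "azure":
        return (
            os.getenv("AZURE_ENDPOINT", "https://api.cognitive.microsoft.com/sts/v1.0/issueToken"),
            os.getenv("AZURE_API_KEY", "YOUR_API_KEY"),
        )
    raise ValueError(f"Unsupported API_TYPE: {api_type}")

api_url, api_key = resolve_api()

Unlike the module-level code in the diff, this sketch raises for an unknown API_TYPE instead of leaving API_URL and API_KEY undefined.
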
45 changes: 41 additions & 4 deletions lmms_eval/tasks/mmbench/en_utils.py
@@ -9,7 +9,7 @@
from lmms_eval.tasks.mmbench.mmbench_evals import MMBench_Evaluator
from lmms_eval.tasks._task_utils.file_utils import generate_submission_file

with open(Path(__file__).parent / "mmbench_en.yaml", "r") as f:
with open(Path(__file__).parent / "mmbench.yaml", "r") as f:
raw_data = f.readlines()
safe_data = []
for i, line in enumerate(raw_data):
@@ -19,7 +19,18 @@

config = yaml.safe_load("".join(safe_data))

mmbench_evaluator = MMBench_Evaluator(sys_prompt=config["metadata"]["sys_prompt"])
GPT_EVAL_MODEL_NAME = config["metadata"]["gpt_eval_model_name"]
API_TYPE = os.getenv("API_TYPE", "openai")

if API_TYPE == "openai":
API_URL = os.getenv("OPENAI_API_URL", "https://api.openai.com/v1/chat/completions")
API_KEY = os.getenv("OPENAI_API_KEY", "YOUR_API_KEY")
elif API_TYPE == "azure":
API_URL = os.getenv("AZURE_ENDPOINT", "https://api.cognitive.microsoft.com/sts/v1.0/issueToken")
API_KEY = os.getenv("AZURE_API_KEY", "YOUR_API_KEY")


mmbench_evaluator = MMBench_Evaluator(sys_prompt=config["metadata"]["sys_prompt"], API_KEY=API_KEY, API_URL=API_URL, model_version=GPT_EVAL_MODEL_NAME)


def mmbench_doc_to_visual(doc):
@@ -55,6 +66,17 @@ def mmbench_doc_to_text(doc, model_specific_prompt_kwargs=None):
def mmbench_process_results(doc, results):
model_response = results[0].strip()
data = {
"gpt_eval_score": {
"index": doc["index"],
"question": doc["question"],
"answer": doc["answer"],
"prediction": model_response,
"hint": doc["hint"],
"source": doc["source"],
"split": doc["split"],
"category": doc["category"],
"L2-category": doc["L2-category"],
},
"submission": {
"index": doc["index"],
"question": doc["question"],
@@ -65,15 +87,30 @@ def mmbench_process_results(doc, results):
"split": doc["split"],
"category": doc["category"],
"L2-category": doc["L2-category"],
}
},
}
option_candidate = ["A", "B", "C", "D", "E"]
for c in option_candidate:
data["submission"][c] = doc.get(c, "nan")
data["gpt_eval_score"][c] = doc.get(c, "nan")
return data


def mmbench_aggregate_dev_results(results, args):
def mmbench_aggregate_dev_results_eval(results, args):
print(f"============= MMBench-EN(Dev) Detailed Results =============")
overall_acc, category_acc, l2_category_acc = mmbench_evaluator.eval_result(results, eval_method="openai")
file = generate_submission_file("mmbench_en_dev_results.json", args)
details_info = {
"overall_acc": overall_acc,
"category_acc": category_acc,
"l2_category_acc": l2_category_acc,
}
with open(file, "w") as f:
json.dump(details_info, f)
return overall_acc * 100


def mmbench_aggregate_dev_results_submission(results, args):
df = pd.DataFrame(results)
excel_write_path = generate_submission_file("mmbench_en_dev_results.xlsx", args)
with pd.ExcelWriter(excel_write_path) as writer:
6 changes: 5 additions & 1 deletion lmms_eval/tasks/mmbench/mmbench.yaml
@@ -4,4 +4,8 @@ task:
- mmbench_en_test
- mmbench_cn_dev
- mmbench_cn_test
- mmbench_cn_cc
- mmbench_cn_cc
metadata:
version: 0.0
sys_prompt: "There are several options:"
gpt_eval_model_name: "gpt-3.5-turbo-0613"
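
The utils modules read this group YAML by first dropping any line that carries a !function tag (which yaml.safe_load cannot construct) and then parsing the remainder. The filter body is collapsed in the hunks above, so the exact condition is an assumption; the YAML text below is likewise a hypothetical stand-in for the real file.

import yaml

# Hypothetical YAML text standing in for mmbench.yaml; the task code reads
# Path(__file__).parent / "mmbench.yaml" instead.
raw_data = """\
task:
  - mmbench_en_dev
metadata:
  version: 0.0
  sys_prompt: "There are several options:"
  gpt_eval_model_name: "gpt-3.5-turbo-0613"
process_results: !function en_utils.mmbench_process_results
""".splitlines(keepends=True)

# Drop lines carrying the custom !function tag so yaml.safe_load does not choke on them;
# the filter body is collapsed in the diff above, so this condition is assumed.
safe_data = [line for line in raw_data if "!function" not in line]
config = yaml.safe_load("".join(safe_data))

print(config["metadata"]["gpt_eval_model_name"])  # -> gpt-3.5-turbo-0613
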
6 changes: 4 additions & 2 deletions lmms_eval/tasks/mmbench/mmbench_cc.yaml
@@ -16,12 +16,14 @@ generation_kwargs:
do_sample: false
process_results: !function cc_utils.mmbench_cn_cc_process_results
metric_list:
- metric: gpt_eval_score
aggregation: !function cc_utils.mmbench_cn_cc_aggregate_dev_results_eval
higher_is_better: true
- metric: submission
aggregation: !function cc_utils.mmbench_cn_cc_aggregate_results
metadata:
version: 0.0
gpt_eval_model_name: "gpt-3.5-turbo"
quick_extract: true
gpt_eval_model_name: "gpt-3.5-turbo-0613"

model_specific_prompt_kwargs:
default:
5 changes: 2 additions & 3 deletions lmms_eval/tasks/mmbench/mmbench_cn.yaml
@@ -5,6 +5,5 @@ task:
- mmbench_cn_cc
metadata:
version: 0.0
gpt_eval_model_name: "gpt-3.5-turbo"
quick_extract: true
sys_prompt: "有如下几个选项:"
gpt_eval_model_name: "gpt-3.5-turbo-0613"
sys_prompt: "有如下几个选项:"
3 changes: 3 additions & 0 deletions lmms_eval/tasks/mmbench/mmbench_cn_dev.yaml
@@ -1,6 +1,9 @@
task: "mmbench_cn_dev"
test_split: "dev"
metric_list:
- metric: gpt_eval_score
aggregation: !function cn_utils.mmbench_aggregate_dev_results_eval
higher_is_better: true
- metric: submission
higher_is_better: true
aggregation: !function cn_utils.mmbench_aggregate_dev_results
1 change: 1 addition & 0 deletions lmms_eval/tasks/mmbench/mmbench_en.yaml
@@ -5,3 +5,4 @@ task:
metadata:
version: 0.0
sys_prompt: "There are several options:"
gpt_eval_model_name: "gpt-3.5-turbo-0613"
7 changes: 5 additions & 2 deletions lmms_eval/tasks/mmbench/mmbench_en_dev.yaml
@@ -2,6 +2,9 @@ task: "mmbench_en_dev"
test_split: dev
include: _default_template_mmbench_en_yaml
metric_list:
- metric: submission
aggregation: !function en_utils.mmbench_aggregate_dev_results
- metric: gpt_eval_score
aggregation: !function en_utils.mmbench_aggregate_dev_results_eval
higher_is_better: true
- metric: submission
aggregation: !function en_utils.mmbench_aggregate_dev_results_submission
higher_is_better: true