[WIP] adding mmbench dev evaluation (#75) #46

Merged 1 commit on Apr 7, 2024
lmms_eval/tasks/mmbench/cc_utils.py (37 additions, 3 deletions)
@@ -9,7 +9,7 @@
 from lmms_eval.tasks.mmbench.mmbench_evals import MMBench_Evaluator
 from lmms_eval.tasks._task_utils.file_utils import generate_submission_file

-with open(Path(__file__).parent / "mmbench_cn.yaml", "r") as f:
+with open(Path(__file__).parent / "mmbench.yaml", "r") as f:
     raw_data = f.readlines()
     safe_data = []
     for i, line in enumerate(raw_data):
@@ -19,7 +19,18 @@

 config = yaml.safe_load("".join(safe_data))

-mmbench_evaluator = MMBench_Evaluator(sys_prompt=config["metadata"]["sys_prompt"])
+GPT_EVAL_MODEL_NAME = config["metadata"]["gpt_eval_model_name"]
+API_TYPE = os.getenv("API_TYPE", "openai")
+
+if API_TYPE == "openai":
+    API_URL = os.getenv("OPENAI_API_URL", "https://api.openai.com/v1/chat/completions")
+    API_KEY = os.getenv("OPENAI_API_KEY", "YOUR_API_KEY")
+elif API_TYPE == "azure":
+    API_URL = os.getenv("AZURE_ENDPOINT", "https://api.cognitive.microsoft.com/sts/v1.0/issueToken")
+    API_KEY = os.getenv("AZURE_API_KEY", "YOUR_API_KEY")
+
+
+mmbench_evaluator = MMBench_Evaluator(sys_prompt=config["metadata"]["sys_prompt"], API_KEY=API_KEY, API_URL=API_URL, model_version=GPT_EVAL_MODEL_NAME)


 def mmbench_doc_to_visual(doc):
@@ -52,21 +63,44 @@ def mmbench_cn_cc_doc_to_text(doc, model_specific_prompt_kwargs=None):
 def mmbench_cn_cc_process_results(doc, results):
     model_response = results[0].strip()
     data = {
+        "gpt_eval_score": {
+            "index": doc["index"],
+            "question": doc["question"],
+            "answer": doc["answer"],
+            "prediction": model_response,
+            "source": doc["source"],
+            "category": doc["category"],
+        },
         "submission": {
             "index": doc["index"],
             "question": doc["question"],
             "answer": doc["answer"],
             "prediction": model_response,
             "source": doc["source"],
             "category": doc["category"],
-        }
+        },
     }
     option_candidate = ["A", "B", "C", "D", "E"]
     for c in option_candidate:
         data["submission"][c] = doc.get(c, "nan")
+        data["gpt_eval_score"][c] = doc.get(c, "nan")
     return data


+def mmbench_cn_cc_aggregate_dev_results_eval(results, args):
+    print(f"============= MMBench-CN(CC) Detailed Results =============")
+    overall_acc, category_acc, l2_category_acc = mmbench_evaluator.eval_result(results, eval_method="openai")
+    file = generate_submission_file("mmbench_cn_cc_results.json", args)
+    details_info = {
+        "overall_acc": overall_acc,
+        "category_acc": category_acc,
+        "l2_category_acc": l2_category_acc,
+    }
+    with open(file, "w") as f:
+        json.dump(details_info, f)
+    return overall_acc * 100
+
+
 def mmbench_cn_cc_aggregate_results(results, args):
     df = pd.DataFrame(results)
     file = generate_submission_file("mmbench_cn_cc_results.xlsx", args)
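
Note: cc_utils.py, cn_utils.py, and en_utils.py now read the GPT judge configuration from environment variables at import time. Below is a minimal sketch of pointing the evaluator at OpenAI (the default) or Azure, assuming the variables are set before the mmbench task modules are imported; the key value is a placeholder.

# Minimal sketch (assumed usage): select the GPT evaluation backend before
# lmms_eval imports the mmbench task utils, which call os.getenv() at import time.
import os

os.environ["API_TYPE"] = "openai"        # or "azure"
os.environ["OPENAI_API_KEY"] = "sk-..."  # placeholder, not a real key
# Optional override; this default endpoint is baked into the task utils.
os.environ["OPENAI_API_URL"] = "https://api.openai.com/v1/chat/completions"
# For Azure, set AZURE_ENDPOINT and AZURE_API_KEY instead.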
lmms_eval/tasks/mmbench/cn_utils.py (41 additions, 3 deletions)
@@ -8,8 +8,9 @@

 eval_logger = logging.getLogger("lmms-eval")
 from lmms_eval.tasks.mmbench.mmbench_evals import MMBench_Evaluator
+from lmms_eval.tasks._task_utils.file_utils import generate_submission_file

-with open(Path(__file__).parent / "mmbench_cn.yaml", "r") as f:
+with open(Path(__file__).parent / "mmbench.yaml", "r") as f:
     raw_data = f.readlines()
     safe_data = []
     for i, line in enumerate(raw_data):
@@ -19,7 +20,18 @@

 config = yaml.safe_load("".join(safe_data))

-mmbench_evaluator = MMBench_Evaluator(sys_prompt=config["metadata"]["sys_prompt"])
+GPT_EVAL_MODEL_NAME = config["metadata"]["gpt_eval_model_name"]
+API_TYPE = os.getenv("API_TYPE", "openai")
+
+if API_TYPE == "openai":
+    API_URL = os.getenv("OPENAI_API_URL", "https://api.openai.com/v1/chat/completions")
+    API_KEY = os.getenv("OPENAI_API_KEY", "YOUR_API_KEY")
+elif API_TYPE == "azure":
+    API_URL = os.getenv("AZURE_ENDPOINT", "https://api.cognitive.microsoft.com/sts/v1.0/issueToken")
+    API_KEY = os.getenv("AZURE_API_KEY", "YOUR_API_KEY")
+
+
+mmbench_evaluator = MMBench_Evaluator(sys_prompt=config["metadata"]["sys_prompt"], API_KEY=API_KEY, API_URL=API_URL, model_version=GPT_EVAL_MODEL_NAME)


 def mmbench_doc_to_visual(doc):
@@ -55,6 +67,17 @@ def mmbench_doc_to_text(doc, model_specific_prompt_kwargs=None):
 def mmbench_process_results(doc, results):
     model_response = results[0].strip()
     data = {
+        "gpt_eval_score": {
+            "index": doc["index"],
+            "question": doc["question"],
+            "answer": doc["answer"],
+            "prediction": model_response,
+            "hint": doc["hint"],
+            "source": doc["source"],
+            "split": doc["split"],
+            "category": doc["category"],
+            "L2-category": doc["L2-category"],
+        },
         "submission": {
             "index": doc["index"],
             "question": doc["question"],
@@ -65,14 +88,29 @@ def mmbench_process_results(doc, results):
             "split": doc["split"],
             "category": doc["category"],
             "L2-category": doc["L2-category"],
-        }
+        },
     }
     option_candidate = ["A", "B", "C", "D", "E"]
     for c in option_candidate:
         data["submission"][c] = doc.get(c, "nan")
+        data["gpt_eval_score"][c] = doc.get(c, "nan")
     return data


+def mmbench_aggregate_dev_results_eval(results, args):
+    print(f"============= MMBench-CN(Dev) Detailed Results =============")
+    overall_acc, category_acc, l2_category_acc = mmbench_evaluator.eval_result(results, eval_method="openai")
+    file = generate_submission_file("mmbench_cn_dev_results.json", args)
+    details_info = {
+        "overall_acc": overall_acc,
+        "category_acc": category_acc,
+        "l2_category_acc": l2_category_acc,
+    }
+    with open(file, "w") as f:
+        json.dump(details_info, f)
+    return overall_acc * 100
+
+
 def mmbench_aggregate_dev_results(results, args):
     df = pd.DataFrame(results)
     excel_write_path = generate_submission_file("mmbench_cn_dev_results.xlsx", args)
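
Note: the new *_aggregate_dev_results_eval functions write a small JSON accuracy summary alongside the existing xlsx submission file and report overall_acc * 100 as the metric value. A minimal sketch of reading that summary after a run; the path below is hypothetical, since generate_submission_file decides the actual output location.

# Minimal sketch: inspect the accuracy summary written by the GPT-eval aggregation.
import json

with open("logs/mmbench_cn_dev_results.json") as f:  # hypothetical output path
    details = json.load(f)

print(details["overall_acc"])      # overall accuracy; the reported metric is this value * 100
print(details["category_acc"])     # per-category accuracy
print(details["l2_category_acc"])  # per-L2-category accuracy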
lmms_eval/tasks/mmbench/en_utils.py (41 additions, 4 deletions)
@@ -9,7 +9,7 @@
 from lmms_eval.tasks.mmbench.mmbench_evals import MMBench_Evaluator
 from lmms_eval.tasks._task_utils.file_utils import generate_submission_file

-with open(Path(__file__).parent / "mmbench_en.yaml", "r") as f:
+with open(Path(__file__).parent / "mmbench.yaml", "r") as f:
     raw_data = f.readlines()
     safe_data = []
     for i, line in enumerate(raw_data):
@@ -19,7 +19,18 @@

 config = yaml.safe_load("".join(safe_data))

-mmbench_evaluator = MMBench_Evaluator(sys_prompt=config["metadata"]["sys_prompt"])
+GPT_EVAL_MODEL_NAME = config["metadata"]["gpt_eval_model_name"]
+API_TYPE = os.getenv("API_TYPE", "openai")
+
+if API_TYPE == "openai":
+    API_URL = os.getenv("OPENAI_API_URL", "https://api.openai.com/v1/chat/completions")
+    API_KEY = os.getenv("OPENAI_API_KEY", "YOUR_API_KEY")
+elif API_TYPE == "azure":
+    API_URL = os.getenv("AZURE_ENDPOINT", "https://api.cognitive.microsoft.com/sts/v1.0/issueToken")
+    API_KEY = os.getenv("AZURE_API_KEY", "YOUR_API_KEY")
+
+
+mmbench_evaluator = MMBench_Evaluator(sys_prompt=config["metadata"]["sys_prompt"], API_KEY=API_KEY, API_URL=API_URL, model_version=GPT_EVAL_MODEL_NAME)


 def mmbench_doc_to_visual(doc):
@@ -55,6 +66,17 @@ def mmbench_doc_to_text(doc, model_specific_prompt_kwargs=None):
 def mmbench_process_results(doc, results):
     model_response = results[0].strip()
     data = {
+        "gpt_eval_score": {
+            "index": doc["index"],
+            "question": doc["question"],
+            "answer": doc["answer"],
+            "prediction": model_response,
+            "hint": doc["hint"],
+            "source": doc["source"],
+            "split": doc["split"],
+            "category": doc["category"],
+            "L2-category": doc["L2-category"],
+        },
         "submission": {
             "index": doc["index"],
             "question": doc["question"],
@@ -65,15 +87,30 @@ def mmbench_process_results(doc, results):
             "split": doc["split"],
             "category": doc["category"],
             "L2-category": doc["L2-category"],
-        }
+        },
     }
     option_candidate = ["A", "B", "C", "D", "E"]
     for c in option_candidate:
         data["submission"][c] = doc.get(c, "nan")
+        data["gpt_eval_score"][c] = doc.get(c, "nan")
     return data


-def mmbench_aggregate_dev_results(results, args):
+def mmbench_aggregate_dev_results_eval(results, args):
+    print(f"============= MMBench-EN(Dev) Detailed Results =============")
+    overall_acc, category_acc, l2_category_acc = mmbench_evaluator.eval_result(results, eval_method="openai")
+    file = generate_submission_file("mmbench_en_dev_results.json", args)
+    details_info = {
+        "overall_acc": overall_acc,
+        "category_acc": category_acc,
+        "l2_category_acc": l2_category_acc,
+    }
+    with open(file, "w") as f:
+        json.dump(details_info, f)
+    return overall_acc * 100
+
+
+def mmbench_aggregate_dev_results_submission(results, args):
     df = pd.DataFrame(results)
     excel_write_path = generate_submission_file("mmbench_en_dev_results.xlsx", args)
     with pd.ExcelWriter(excel_write_path) as writer:
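
Note: mmbench_process_results in cn_utils.py and en_utils.py now emits one record per metric, so each metric key declared in the task yaml gets its own copy of the document fields plus the option columns A through E. Roughly, the returned structure looks like the sketch below (all values are illustrative placeholders); "gpt_eval_score" records are consumed by the *_aggregate_dev_results_eval functions, while "submission" records feed the xlsx writer.

# Illustrative shape of the dict returned for a single document (placeholder values only).
example = {
    "gpt_eval_score": {
        "index": 0,
        "question": "<question text>",
        "answer": "<ground-truth letter>",
        "prediction": "<model response>",
        "hint": "<hint>",
        "source": "<source>",
        "split": "<split>",
        "category": "<category>",
        "L2-category": "<L2 category>",
        # plus "A".."E": option text, or "nan" when an option is absent
    },
    "submission": {
        # same fields and option columns, kept separate so both metrics aggregate independently
    },
}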
lmms_eval/tasks/mmbench/mmbench.yaml (5 additions, 1 deletion)
@@ -4,4 +4,8 @@ task:
   - mmbench_en_test
   - mmbench_cn_dev
   - mmbench_cn_test
-  - mmbench_cn_cc
+  - mmbench_cn_cc
+metadata:
+  version: 0.0
+  sys_prompt: "There are several options:"
+  gpt_eval_model_name: "gpt-3.5-turbo-0613"
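
Note: moving sys_prompt and gpt_eval_model_name into this shared mmbench.yaml is what lets cc_utils.py, cn_utils.py, and en_utils.py all open the same file at import time. A minimal sketch of that load is below; the "!function" filter condition is an assumption, since the corresponding lines are collapsed in the diffs above.

# Minimal sketch of the shared config read (the "!function" check is assumed;
# yaml.safe_load cannot parse the custom !function tags used in the task configs).
from pathlib import Path
import yaml

with open(Path(__file__).parent / "mmbench.yaml", "r") as f:
    raw_data = f.readlines()
    safe_data = [line for line in raw_data if "!function" not in line]

config = yaml.safe_load("".join(safe_data))
sys_prompt = config["metadata"]["sys_prompt"]          # "There are several options:"
gpt_model = config["metadata"]["gpt_eval_model_name"]  # "gpt-3.5-turbo-0613"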
lmms_eval/tasks/mmbench/mmbench_cc.yaml (4 additions, 2 deletions)
@@ -16,12 +16,14 @@ generation_kwargs:
   do_sample: false
 process_results: !function cc_utils.mmbench_cn_cc_process_results
 metric_list:
+  - metric: gpt_eval_score
+    aggregation: !function cc_utils.mmbench_cn_cc_aggregate_dev_results_eval
+    higher_is_better: true
   - metric: submission
     aggregation: !function cc_utils.mmbench_cn_cc_aggregate_results
 metadata:
   version: 0.0
-  gpt_eval_model_name: "gpt-3.5-turbo"
-  quick_extract: true
+  gpt_eval_model_name: "gpt-3.5-turbo-0613"

 model_specific_prompt_kwargs:
   default:
lmms_eval/tasks/mmbench/mmbench_cn.yaml (2 additions, 3 deletions)
@@ -5,6 +5,5 @@ task:
   - mmbench_cn_cc
 metadata:
   version: 0.0
-  gpt_eval_model_name: "gpt-3.5-turbo"
-  quick_extract: true
-  sys_prompt: "有如下几个选项:"
+  gpt_eval_model_name: "gpt-3.5-turbo-0613"
+  sys_prompt: "有如下几个选项:"
lmms_eval/tasks/mmbench/mmbench_cn_dev.yaml (3 additions, 0 deletions)
@@ -1,6 +1,9 @@
 task: "mmbench_cn_dev"
 test_split: "dev"
 metric_list:
+  - metric: gpt_eval_score
+    aggregation: !function cn_utils.mmbench_aggregate_dev_results_eval
+    higher_is_better: true
   - metric: submission
     higher_is_better: true
     aggregation: !function cn_utils.mmbench_aggregate_dev_results
lmms_eval/tasks/mmbench/mmbench_en.yaml (1 addition, 0 deletions)
@@ -5,3 +5,4 @@ task:
 metadata:
   version: 0.0
   sys_prompt: "There are several options:"
+  gpt_eval_model_name: "gpt-3.5-turbo-0613"
lmms_eval/tasks/mmbench/mmbench_en_dev.yaml (5 additions, 2 deletions)
@@ -2,6 +2,9 @@ task: "mmbench_en_dev"
 test_split: dev
 include: _default_template_mmbench_en_yaml
 metric_list:
-  - metric: submission
-    aggregation: !function en_utils.mmbench_aggregate_dev_results
+  - metric: gpt_eval_score
+    aggregation: !function en_utils.mmbench_aggregate_dev_results_eval
+    higher_is_better: true
+  - metric: submission
+    aggregation: !function en_utils.mmbench_aggregate_dev_results_submission
     higher_is_better: true