diff --git a/lmms_eval/tasks/mmvetv2/arial.ttf b/lmms_eval/tasks/mmvetv2/arial.ttf
new file mode 100644
index 00000000..7ff88f22
Binary files /dev/null and b/lmms_eval/tasks/mmvetv2/arial.ttf differ
diff --git a/lmms_eval/tasks/mmvetv2/utils.py b/lmms_eval/tasks/mmvetv2/utils.py
index 5cc426c9..1c92c3fd 100644
--- a/lmms_eval/tasks/mmvetv2/utils.py
+++ b/lmms_eval/tasks/mmvetv2/utils.py
@@ -7,8 +7,8 @@
 import requests
 import yaml
 from loguru import logger as eval_logger
-from openai import AzureOpenAI, OpenAI
 from PIL import Image, ImageDraw, ImageFont
+from openai import AzureOpenAI, OpenAI
 
 
 def add_order_label(image, label, font_size=40):
@@ -18,7 +18,11 @@ def add_order_label(image, label, font_size=40):
     # Define font for the label
     # font_path = fm.findfont(fm.FontProperties(family=font_family))
     font_path = os.path.join(__file__, os.pardir, "arial.ttf")
-    font = ImageFont.truetype(font_path, font_size)
+    try:
+        font = ImageFont.truetype(font_path, font_size)
+    except Exception as e:
+        eval_logger.error(f"Error: {e}")
+        font = ImageFont.load_default()
 
     # Calculate text size and position
     text_width = text_height = font_size
@@ -187,11 +191,17 @@ def doc_to_text(doc, lmms_eval_specific_kwargs=None):
 API_TYPE = os.getenv("API_TYPE", "openai")
 
 if API_TYPE == "openai":
-    client = OpenAI()
+    API_URL = os.getenv("OPENAI_API_URL", "https://api.openai.com/v1/chat/completions")
+    API_KEY = os.getenv("OPENAI_API_KEY", "YOUR_API_KEY")
+    client = OpenAI(api_key=API_KEY)
+
 elif API_TYPE == "azure":
-    client = AzureOpenAI()
-
-GPT_EVAL_MODEL_NAME = config["metadata"]["gpt_eval_model_name"]
+    API_URL = os.getenv("AZURE_ENDPOINT", "https://api.cognitive.microsoft.com/sts/v1.0/issueToken")
+    API_KEY = os.getenv("AZURE_API_KEY", "YOUR_API_KEY")
+    API_VERSION = os.getenv("AZURE_API_VERSION", "2023-07-01-preview")
+    client = AzureOpenAI(api_key=API_KEY, azure_endpoint=API_URL, api_version=API_VERSION)
+
+MODEL_VERSION = os.getenv("GPT_EVAL_MODEL_NAME", "gpt-4o")
 
 MM_VET_PROMPT = """Compare the ground truth and prediction from AI models, to give a correctness score for the prediction. <AND> in the ground truth means it is totally right only when all elements in the ground truth are present in the prediction, and <OR> means it is totally right when any one element in the ground truth is present in the prediction. The correctness score is 0.0 (totally wrong), 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, or 1.0 (totally right). Just complete the last space of the correctness score.
 
 gpt_query_prompt | Ground truth | Prediction | Correctness
 --- | --- | --- | ---
@@ -207,7 +217,7 @@ def doc_to_text(doc, lmms_eval_specific_kwargs=None):
 
 def get_chat_response(
     prompt,
-    model=GPT_EVAL_MODEL_NAME,
+    model=MODEL_VERSION,
     temperature=0.0,
     max_tokens=128,
     patience=3,
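
Usage note: a minimal sketch of how the reworked, environment-variable-driven judge configuration above would be exercised. The variable names come from the diff; the endpoint, key, and model values below are placeholders, not values the patch prescribes.

    # Sketch only: configure the mmvetv2 judge client through environment
    # variables. The module builds the client at import time, so the
    # variables must be set before importing it. All values are placeholders.
    import os

    os.environ["API_TYPE"] = "azure"  # or "openai" (the default)
    os.environ["AZURE_ENDPOINT"] = "https://example.openai.azure.com"  # placeholder
    os.environ["AZURE_API_KEY"] = "YOUR_API_KEY"  # placeholder
    os.environ["AZURE_API_VERSION"] = "2023-07-01-preview"
    os.environ["GPT_EVAL_MODEL_NAME"] = "gpt-4o"  # judge model; gpt-4o is the fallback default

    # Importing after the variables are set picks up the configuration.
    from lmms_eval.tasks.mmvetv2 import utils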