From 3f960966c17c1724399510a652b523f0ea25d82c Mon Sep 17 00:00:00 2001
From: Li Bo
Date: Wed, 24 Jan 2024 10:00:33 +0800
Subject: [PATCH] [Datasets] Add four internal evaluation datasets (#13)

* Update generation_kwargs in pope.yaml
* Update pope_doc_to_text function
* Remove unused variable in mmvet_process_results function
* Remove unused imports in utils.py
* Refactor get_chat_response function to include retries for API requests
* Update gpt_eval_model_name in lmms_eval/tasks/dc100_en.yaml and add retry logic in get_chat_response function
* Update prompt variable in lmms_eval tasks
* Refactor output_name variable in cli_evaluate function
* Fix logging message in mmvet_process_results function
* Update sleep time in get_chat_response function
* Merge commit 'b5ad3edceaeb5c234d3244f99aa49a6b4b4572a1'
* Refactor get_eval function to include retries
* Add token parameter to load_dataset function in gqa_doc_to_visual
* Refactor llava_process_results and llava_aggregation functions
---
 lmms_eval/__main__.py                         |   2 +-
 lmms_eval/tasks/d170_cn/utils.py              | 138 ++++++++++++++++++
 lmms_eval/tasks/d170_en/utils.py              | 138 ++++++++++++++++++
 lmms_eval/tasks/dc100_en/dc100_en.yaml        |  27 ++++
 lmms_eval/tasks/dc100_en/utils.py             | 120 +++++++++++++++
 lmms_eval/tasks/dc200_cn/dc200_cn.yaml        |  27 ++++
 lmms_eval/tasks/dc200_cn/utils.py             | 120 +++++++++++++++
 lmms_eval/tasks/gqa/utils.py                  |   2 +-
 .../llava-in-the-wild/llava-in-the-wild.yaml  |  18 +--
 lmms_eval/tasks/llava-in-the-wild/utils.py    | 115 ++++++++++-----
 lmms_eval/tasks/mmvet/utils.py                |  28 +---
 11 files changed, 658 insertions(+), 77 deletions(-)
 create mode 100644 lmms_eval/tasks/d170_cn/utils.py
 create mode 100644 lmms_eval/tasks/d170_en/utils.py
 create mode 100644 lmms_eval/tasks/dc100_en/dc100_en.yaml
 create mode 100644 lmms_eval/tasks/dc100_en/utils.py
 create mode 100644 lmms_eval/tasks/dc200_cn/dc200_cn.yaml
 create mode 100644 lmms_eval/tasks/dc200_cn/utils.py

diff --git a/lmms_eval/__main__.py b/lmms_eval/__main__.py
index 44db0d58..1e68589f 100644
--- a/lmms_eval/__main__.py
+++ b/lmms_eval/__main__.py
@@ -197,7 +197,7 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
             hash_input = f"{args.model_args}_{task_name}".encode("utf-8")
             hash_output = hashlib.sha256(hash_input).hexdigest()[:10]  # Take the first 10 characters for brevity
             datetime_str = utils.get_datetime_str()
-            output_name = f"{args.model}_{args.tasks.replace(',', '_')}_{hash_output}_{datetime_str}_{args.log_samples_suffix}"
+            output_name = f"{args.model}_{task_name}_{hash_output}_{datetime_str}_{args.log_samples_suffix}"
             filename = path.joinpath(f"{output_name}.json")
             # Structure the data with 'args' and 'logs' keys
             data_to_dump = {"args": vars(args), "config": config, "logs": sorted(samples[task_name], key=lambda x: x["doc_id"])}  # Convert Namespace to dict
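With this change each task gets its own sample log, named from the individual task rather than the full comma-separated task list. A minimal, self-contained sketch of the resulting naming scheme (illustrative only; the model name, model_args, suffix, and timestamp format are placeholder values, and the timestamp call stands in for utils.get_datetime_str()):

    import hashlib
    from datetime import datetime

    def make_output_name(model: str, model_args: str, task_name: str, suffix: str) -> str:
        # Same idea as cli_evaluate: hash model_args plus the task name, keep 10 hex chars.
        hash_input = f"{model_args}_{task_name}".encode("utf-8")
        hash_output = hashlib.sha256(hash_input).hexdigest()[:10]
        datetime_str = datetime.now().strftime("%Y%m%d_%H%M%S")  # stand-in for utils.get_datetime_str()
        return f"{model}_{task_name}_{hash_output}_{datetime_str}_{suffix}"

    # e.g. "llava_dc100_en_<10-char-hash>_20240124_100033_run1.json"
    print(make_output_name("llava", "pretrained=llava-v1.5-7b", "dc100_en", "run1") + ".json")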
"https://api.openai.com/v1/chat/completions") +API_KEY = os.getenv("OPENAI_API_KEY", "YOUR_API_KEY") +GPT_EVAL_MODEL_NAME = config["metadata"]["gpt_eval_model_name"] + +# The EVALUATION_PROMPT_TEMPLATE_SIMPLE_V2 constant should be defined here +EVALUATION_PROMPT_TEMPLATE_SIMPLE_V2 = """You are an expert in judging the quality of a model response compared with given ground truth. The model response is in English while the ground truth can be in English or Chinese, or both. You should only judge the relevance of the model response to the ground truth based on meanings, not the language. +If the model response and ground truth are about grounding object coordinates, you may pay attention that the model responses are in format of [x_min, y_min, x_max, y_max]. You could judge the grounding quality by the IoU of the model response and the ground truth, or the distance between the center of the model response and the ground truth. If IoU is above 0.5 or the distance is below 0.3, you could give a score of 2. If IoU is below 0.2 or the distance is above 0.5, you could give a score of 0. If IoU is between 0.2 and 0.5 or the distance is between 0.2 and 0.5, you could give a score of 1. +Your response should be an integer score in [0, 1, 2], where 0 means the model response is completely irrelevant to the ground truth, and 2 means the model response completely matches the ground truth. You would have specific score criteria in the ground truth. You also need to explain your score in English. +Text: {prompt} +Ground Truth: {ground_truth} +You should response by following format: +Score: +Explanation:""" + + +def get_chat_response(prompt, model=GPT_EVAL_MODEL_NAME, max_tokens=512, patience=3, sleep_time=15): + headers = { + "Authorization": f"Bearer {API_KEY}", + "Content-Type": "application/json", + } + + messages = [ + {"role": "user", "content": prompt}, + ] + + payload = { + "model": model, + "messages": messages, + "max_tokens": max_tokens, + "temperature": 0.0, + } + + while patience > 0: + patience -= 1 + try: + response = requests.post( + API_URL, + headers=headers, + json=payload, + ) + response.raise_for_status() + response_data = response.json() + + content = response_data["choices"][0]["message"]["content"].strip() + if content != "": + return content, response_data["model"] + + except Exception as e: + eval_logger.info(f"Error in response: {response.json()['error']['message']}") + if "Rate limit" in str(e): + eval_logger.info("Sleeping due to rate limit...") + time.sleep(sleep_time) + eval_logger.info(f"Retrying...Patience left: {patience}") + + return "", "" + + +def doc_to_visual(doc): + if doc["image"] is None: + return [] + return [doc["image"].convert("RGB")] + + +def process_results(doc, results): + # get pred and ground truth here + pred = results[0] + question = doc["question"] + answer = doc["annotation"] + gpt_query_prompt = EVALUATION_PROMPT_TEMPLATE_SIMPLE_V2.format(prompt=pred, ground_truth=answer) + grade_sample_run_complete = False + while not grade_sample_run_complete: + try: + response, model_name = get_chat_response(gpt_query_prompt) + grade_sample_run_complete = True + except Exception as e: + eval_logger.info(f"Error in response: {e}") + eval_logger.info(f"Retrying...") + + try: + score = int(re.findall(r"Score:\s*(\d)", response)[0]) + except IndexError: + score = 0 # Assign score 0 if the score wasn't parsed correctly + + return { + "gpt_eval_info": { + "question_id": doc["question_id"], + "prediction": pred, + "ground_truth": answer, + "eval_model": model_name, + }, 
+ "gpt_eval_avg_score": { + "score": score, + }, + "gpt_eval_score2_rate": { + "score": score, + }, + } + + +def d170_cn_aggregate_info(results): + return 0 + + +def d170_cn_aggregate_avg_score(results): + total_score = 0 + for result in results: + total_score += result["score"] + avg_score = total_score / len(results) + return avg_score + + +def d170_cn_aggregate_score2_rate(results): + score2_count = 0 + for result in results: + if result["score"] == 2: + score2_count += 1 + score2_rate = score2_count / len(results) + return score2_rate diff --git a/lmms_eval/tasks/d170_en/utils.py b/lmms_eval/tasks/d170_en/utils.py new file mode 100644 index 00000000..49226ce2 --- /dev/null +++ b/lmms_eval/tasks/d170_en/utils.py @@ -0,0 +1,138 @@ +import os +import requests +import time +import logging +import yaml +from pathlib import Path +import re + +eval_logger = logging.getLogger("lmms-eval") + +with open(Path(__file__).parent / "d170_en.yaml", "r") as f: + raw_data = f.readlines() + safe_data = [] + for i, line in enumerate(raw_data): + # remove function definition since yaml load cannot handle it + if "!function" not in line: + safe_data.append(line) + + config = yaml.safe_load("".join(safe_data)) + +API_URL = os.getenv("OPENAI_API_URL", "https://api.openai.com/v1/chat/completions") +API_KEY = os.getenv("OPENAI_API_KEY", "YOUR_API_KEY") +GPT_EVAL_MODEL_NAME = config["metadata"]["gpt_eval_model_name"] + +# The EVALUATION_PROMPT_TEMPLATE_SIMPLE_V2 constant should be defined here +EVALUATION_PROMPT_TEMPLATE_SIMPLE_V2 = """You are an expert in judging the quality of a model response compared with given ground truth. The model response is in English while the ground truth can be in English or Chinese, or both. You should only judge the relevance of the model response to the ground truth based on meanings, not the language. +If the model response and ground truth are about grounding object coordinates, you may pay attention that the model responses are in format of [x_min, y_min, x_max, y_max]. You could judge the grounding quality by the IoU of the model response and the ground truth, or the distance between the center of the model response and the ground truth. If IoU is above 0.5 or the distance is below 0.3, you could give a score of 2. If IoU is below 0.2 or the distance is above 0.5, you could give a score of 0. If IoU is between 0.2 and 0.5 or the distance is between 0.2 and 0.5, you could give a score of 1. +Your response should be an integer score in [0, 1, 2], where 0 means the model response is completely irrelevant to the ground truth, and 2 means the model response completely matches the ground truth. You would have specific score criteria in the ground truth. You also need to explain your score in English. 
diff --git a/lmms_eval/tasks/d170_en/utils.py b/lmms_eval/tasks/d170_en/utils.py
new file mode 100644
index 00000000..49226ce2
--- /dev/null
+++ b/lmms_eval/tasks/d170_en/utils.py
@@ -0,0 +1,138 @@
+import os
+import requests
+import time
+import logging
+import yaml
+from pathlib import Path
+import re
+
+eval_logger = logging.getLogger("lmms-eval")
+
+with open(Path(__file__).parent / "d170_en.yaml", "r") as f:
+    raw_data = f.readlines()
+    safe_data = []
+    for i, line in enumerate(raw_data):
+        # remove function definition since yaml load cannot handle it
+        if "!function" not in line:
+            safe_data.append(line)
+
+    config = yaml.safe_load("".join(safe_data))
+
+API_URL = os.getenv("OPENAI_API_URL", "https://api.openai.com/v1/chat/completions")
+API_KEY = os.getenv("OPENAI_API_KEY", "YOUR_API_KEY")
+GPT_EVAL_MODEL_NAME = config["metadata"]["gpt_eval_model_name"]
+
+# The EVALUATION_PROMPT_TEMPLATE_SIMPLE_V2 constant should be defined here
+EVALUATION_PROMPT_TEMPLATE_SIMPLE_V2 = """You are an expert in judging the quality of a model response compared with a given ground truth. The model response is in English while the ground truth can be in English or Chinese, or both. You should only judge the relevance of the model response to the ground truth based on meaning, not language.
+If the model response and ground truth are about grounding object coordinates, note that the model responses are in the format [x_min, y_min, x_max, y_max]. You could judge the grounding quality by the IoU of the model response and the ground truth, or the distance between the center of the model response and the ground truth. If IoU is above 0.5 or the distance is below 0.3, you could give a score of 2. If IoU is below 0.2 or the distance is above 0.5, you could give a score of 0. If IoU is between 0.2 and 0.5 or the distance is between 0.2 and 0.5, you could give a score of 1.
+Your response should be an integer score in [0, 1, 2], where 0 means the model response is completely irrelevant to the ground truth, and 2 means the model response completely matches the ground truth. You may be given specific score criteria in the ground truth. You also need to explain your score in English.
+Text: {prompt}
+Ground Truth: {ground_truth}
+You should respond in the following format:
+Score:
+Explanation:"""
+
+
+def get_chat_response(prompt, model=GPT_EVAL_MODEL_NAME, max_tokens=512, patience=3, sleep_time=15):
+    headers = {
+        "Authorization": f"Bearer {API_KEY}",
+        "Content-Type": "application/json",
+    }
+
+    messages = [
+        {"role": "user", "content": prompt},
+    ]
+
+    payload = {
+        "model": model,
+        "messages": messages,
+        "max_tokens": max_tokens,
+        "temperature": 0.0,
+    }
+
+    while patience > 0:
+        patience -= 1
+        try:
+            response = requests.post(
+                API_URL,
+                headers=headers,
+                json=payload,
+            )
+            response.raise_for_status()
+            response_data = response.json()
+
+            content = response_data["choices"][0]["message"]["content"].strip()
+            if content != "":
+                return content, response_data["model"]
+
+        except Exception as e:
+            eval_logger.info(f"Error in response: {response.json()['error']['message']}")
+            if "Rate limit" in str(e):
+                eval_logger.info("Sleeping due to rate limit...")
+                time.sleep(sleep_time)
+            eval_logger.info(f"Retrying...Patience left: {patience}")
+
+    return "", ""
+
+
+def doc_to_visual(doc):
+    if doc["image"] is None:
+        return []
+    return [doc["image"].convert("RGB")]
+
+
+def process_results(doc, results):
+    # get pred and ground truth here
+    pred = results[0]
+    question = doc["question"]
+    answer = doc["annotation"]
+    gpt_query_prompt = EVALUATION_PROMPT_TEMPLATE_SIMPLE_V2.format(prompt=pred, ground_truth=answer)
+    grade_sample_run_complete = False
+    while not grade_sample_run_complete:
+        try:
+            response, model_name = get_chat_response(gpt_query_prompt)
+            grade_sample_run_complete = True
+        except Exception as e:
+            eval_logger.info(f"Error in response: {e}")
+            eval_logger.info("Retrying...")
+
+    try:
+        score = int(re.findall(r"Score:\s*(\d)", response)[0])
+    except IndexError:
+        score = 0  # Assign score 0 if the score wasn't parsed correctly
+
+    return {
+        "gpt_eval_info": {
+            "question_id": doc["question_id"],
+            "prediction": pred,
+            "ground_truth": answer,
+            "eval_model": model_name,
+        },
+        "gpt_eval_avg_score": {
+            "score": score,
+        },
+        "gpt_eval_score2_rate": {
+            "score": score,
+        },
+    }
+
+
+def d170_en_aggregate_info(results):
+    return 0
+
+
+def d170_en_aggregate_avg_score(results):
+    total_score = 0
+    for result in results:
+        total_score += result["score"]
+    avg_score = total_score / len(results)
+    return avg_score
+
+
+def d170_en_aggregate_score2_rate(results):
+    score2_count = 0
+    for result in results:
+        if result["score"] == 2:
+            score2_count += 1
+    score2_rate = score2_count / len(results)
+    return score2_rate
diff --git a/lmms_eval/tasks/dc100_en/dc100_en.yaml b/lmms_eval/tasks/dc100_en/dc100_en.yaml
new file mode 100644
index 00000000..d55d9bed
--- /dev/null
+++ b/lmms_eval/tasks/dc100_en/dc100_en.yaml
@@ -0,0 +1,27 @@
+dataset_path: lmms-lab/DC100_EN
+dataset_kwargs:
+  token: True
+task: "dc100_en"
+test_split: test
+output_type: generate_until
+doc_to_visual: !function utils.doc_to_visual
+doc_to_text: "{{question}}"  # Such that {{prompt}} will be replaced by doc["question"]
+doc_to_target: "answer"
+generation_kwargs:
+  until:
+    - "ASSISTANT:"
+  max_new_tokens: 1024
+  temperature: 0
+  top_p: 0
+  num_beams: 1
+  do_sample: false
+process_results: !function utils.process_results  # apply gpt eval here
+metric_list:
+  - metric: gpt_eval_info
+    aggregation: !function utils.dc100_en_aggregate_info
+  - metric: gpt_eval_avg_score
+    aggregation: !function utils.dc100_en_aggregate_avg_score
+    higher_is_better: true
+metadata:
+  version: 0.0
+  gpt_eval_model_name: "gpt-4-vision-preview"
\ No newline at end of file
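The dc100_en task scores captions on a 0-100 scale: the judge is asked to end its reply with a line of the form "Final Score: <n>", which the utils module below extracts with a regex. A self-contained illustration of that parsing step (the reply text here is made up):

    import re

    sample_reply = "The caption names the objects and layout correctly but misses the background.\nFinal Score: 85"
    match = re.search(r"Final Score: (\d+)", sample_reply)
    score = int(match.group(1)) if match else 0  # fall back to 0 when parsing fails
    print(score)  # 85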
gpt_eval_model_name: "gpt-4-vision-preview" \ No newline at end of file diff --git a/lmms_eval/tasks/dc100_en/utils.py b/lmms_eval/tasks/dc100_en/utils.py new file mode 100644 index 00000000..318f4c21 --- /dev/null +++ b/lmms_eval/tasks/dc100_en/utils.py @@ -0,0 +1,120 @@ +import base64 +import requests +import re +import logging +import os +import yaml +from pathlib import Path +from io import BytesIO + + +def doc_to_visual(doc): + if doc["image"] is None: + return [] + return [doc["image"].convert("RGB")] + + +eval_logger = logging.getLogger("lmms-eval") + +# Assuming the config is loaded similarly as in d170_en/utils.py +with open(Path(__file__).parent / "dc100_en.yaml", "r") as f: + raw_data = f.readlines() + safe_data = [] + for i, line in enumerate(raw_data): + if "!function" not in line: + safe_data.append(line) + config = yaml.safe_load("".join(safe_data)) + +API_URL = os.getenv("OPENAI_API_URL", "https://api.openai.com/v1/chat/completions") +API_KEY = os.getenv("OPENAI_API_KEY", "YOUR_API_KEY") +GPT_EVAL_MODEL_NAME = config["metadata"]["gpt_eval_model_name"] + +EVALUATION_PROMPT_TEMPLATE_SIMPLE_V1 = """Text Caption: {caption} +From 0 to 100, how much do you rate for this Text Caption in terms of the correct and comprehensive description of the image? +Do not dominant the rating by a single attribute such as recognition correctness, but a overall rating on the object/scene appearance, position, pose, action, shape, etc., and contents in the background. +Do not consider the appropriateness or sensitive descriptors, such as "middle-aged western man", judge based on if it has correct specifications of the object and scenes in image. +Provide a few lines for explanation and the rate number at last after "Final Score:".""" + + +def get_chat_response(base64_image, prompt, max_retries=3): + headers = { + "Authorization": f"Bearer {API_KEY}", + "Content-Type": "application/json", + } + + payload = { + "model": GPT_EVAL_MODEL_NAME, + "messages": [ + { + "role": "user", + "content": [ + {"type": "text", "text": prompt}, + { + "type": "image_url", + "image_url": f"data:image/jpeg;base64,{base64_image}", + }, + ], + } + ], + "max_tokens": 1024, + "temperature": 0.0, + } + + for attempt in range(max_retries): + try: + response = requests.post(API_URL, headers=headers, json=payload) + response.raise_for_status() + response_data = response.json() + return response_data["choices"][0]["message"]["content"] + except requests.exceptions.RequestException as e: + eval_logger.warning(f"Request failed on attempt {attempt+1}: {e}") + if attempt == max_retries - 1: + raise + + +def image_to_base64(pil_image): + buffered = BytesIO() + pil_image.save(buffered, format="PNG") + return base64.b64encode(buffered.getvalue()).decode("utf-8") + + +def process_results(doc, results): + prediction = results[0] + question_id = doc["question_id"] + image_path = doc["image"] + base64_image = image_to_base64(image_path) + prompt = EVALUATION_PROMPT_TEMPLATE_SIMPLE_V1.format(caption=prediction) + try: + response = get_chat_response(base64_image, prompt) + score_value = re.search(r"Final Score: (\d+)", response) + score = int(score_value.group(1)) if score_value else 0 + except Exception as e: + eval_logger.error(f"Error for Question ID: {question_id}: {e}") + response = "" + score = 0 + + return { + "gpt_eval_info": { + "question_id": question_id, + "question": doc["question"], + "model_caption": prediction, + "explanation": response, + "eval_model": GPT_EVAL_MODEL_NAME, + "score": score, + }, + "gpt_eval_avg_score": { + 
"score": score, + }, + } + + +def dc100_en_aggregate_info(results): + return 0 + + +def dc100_en_aggregate_avg_score(results): + total_score = 0 + for result in results: + total_score += result["score"] + avg_score = total_score / len(results) + return avg_score diff --git a/lmms_eval/tasks/dc200_cn/dc200_cn.yaml b/lmms_eval/tasks/dc200_cn/dc200_cn.yaml new file mode 100644 index 00000000..dcdf8320 --- /dev/null +++ b/lmms_eval/tasks/dc200_cn/dc200_cn.yaml @@ -0,0 +1,27 @@ +dataset_path: lmms-lab/DC200_CN +dataset_kwargs: + token: True +task: "dc200_cn" +test_split: test +output_type: generate_until +doc_to_visual: !function utils.doc_to_visual +doc_to_text: "{{question}}" # Such that {{prompt}} will be replaced by doc["question"] +doc_to_target: "answer" +generation_kwargs: + until: + - "ASSISTANT:" + max_new_tokens: 1024 + temperature: 0 + top_p: 0 + num_beams: 1 + do_sample: false +process_results: !function utils.process_results # apply gpt eval here +metric_list: + - metric: gpt_eval_info + aggregation: !function utils.dc200_cn_aggregate_info + - metric: gpt_eval_avg_score + aggregation: !function utils.dc200_cn_aggregate_avg_score + higher_is_better: true +metadata: + version: 0.0 + gpt_eval_model_name: "gpt-4-vision-preview" \ No newline at end of file diff --git a/lmms_eval/tasks/dc200_cn/utils.py b/lmms_eval/tasks/dc200_cn/utils.py new file mode 100644 index 00000000..bd00f4b5 --- /dev/null +++ b/lmms_eval/tasks/dc200_cn/utils.py @@ -0,0 +1,120 @@ +import base64 +import requests +import re +import logging +import os +import yaml +from pathlib import Path +from io import BytesIO + + +def doc_to_visual(doc): + if doc["image"] is None: + return [] + return [doc["image"].convert("RGB")] + + +eval_logger = logging.getLogger("lmms-eval") + +# Assuming the config is loaded similarly as in d170_en/utils.py +with open(Path(__file__).parent / "dc200_cn.yaml", "r") as f: + raw_data = f.readlines() + safe_data = [] + for i, line in enumerate(raw_data): + if "!function" not in line: + safe_data.append(line) + config = yaml.safe_load("".join(safe_data)) + +API_URL = os.getenv("OPENAI_API_URL", "https://api.openai.com/v1/chat/completions") +API_KEY = os.getenv("OPENAI_API_KEY", "YOUR_API_KEY") +GPT_EVAL_MODEL_NAME = config["metadata"]["gpt_eval_model_name"] + +EVALUATION_PROMPT_TEMPLATE_SIMPLE_V1 = """Text Caption: {caption} +From 0 to 100, how much do you rate for this Text Caption in terms of the correct and comprehensive description of the image? +Do not dominant the rating by a single attribute such as recognition correctness, but a overall rating on the object/scene appearance, position, pose, action, shape, etc., and contents in the background. +Do not consider the appropriateness or sensitive descriptors, such as "middle-aged western man", judge based on if it has correct specifications of the object and scenes in image. 
+Provide a few lines for explanation and the rate number at last after "Final Score:"."""
+
+
+def get_chat_response(base64_image, prompt, max_retries=3):
+    headers = {
+        "Authorization": f"Bearer {API_KEY}",
+        "Content-Type": "application/json",
+    }
+
+    payload = {
+        "model": GPT_EVAL_MODEL_NAME,
+        "messages": [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": prompt},
+                    {
+                        "type": "image_url",
+                        "image_url": f"data:image/png;base64,{base64_image}",
+                    },
+                ],
+            }
+        ],
+        "max_tokens": 1024,
+        "temperature": 0.0,
+    }
+
+    for attempt in range(max_retries):
+        try:
+            response = requests.post(API_URL, headers=headers, json=payload)
+            response.raise_for_status()
+            response_data = response.json()
+            return response_data["choices"][0]["message"]["content"]
+        except requests.exceptions.RequestException as e:
+            eval_logger.warning(f"Request failed on attempt {attempt+1}: {e}")
+            if attempt == max_retries - 1:
+                raise
+
+
+def image_to_base64(pil_image):
+    buffered = BytesIO()
+    pil_image.save(buffered, format="PNG")
+    return base64.b64encode(buffered.getvalue()).decode("utf-8")
+
+
+def process_results(doc, results):
+    prediction = results[0]
+    question_id = doc["question_id"]
+    image_path = doc["image"]
+    base64_image = image_to_base64(image_path)
+    prompt = EVALUATION_PROMPT_TEMPLATE_SIMPLE_V1.format(caption=prediction)
+    try:
+        response = get_chat_response(base64_image, prompt)
+        score_value = re.search(r"Final Score: (\d+)", response)
+        score = int(score_value.group(1)) if score_value else 0
+    except Exception as e:
+        eval_logger.error(f"After retrying, still error for Question ID: {question_id}: {e}")
+        score = 0
+        response = "Failed to get GPT4 eval response."
+
+    return {
+        "gpt_eval_info": {
+            "question_id": question_id,
+            "question": doc["question"],
+            "model_caption": prediction,
+            "explanation": response,
+            "eval_model": GPT_EVAL_MODEL_NAME,
+            "score": score,
+        },
+        "gpt_eval_avg_score": {
+            "score": score,
+        },
+    }
+
+
+def dc200_cn_aggregate_info(results):
+    return 0
+
+
+def dc200_cn_aggregate_avg_score(results):
+    total_score = 0
+    for result in results:
+        total_score += result["score"]
+    avg_score = total_score / len(results)
+    return avg_score
diff --git a/lmms_eval/tasks/gqa/utils.py b/lmms_eval/tasks/gqa/utils.py
index b83a13de..4413fb97 100644
--- a/lmms_eval/tasks/gqa/utils.py
+++ b/lmms_eval/tasks/gqa/utils.py
@@ -9,7 +9,7 @@ def gqa_doc_to_visual(doc):
     global GQA_RAW_IMAGE_DATASET
     global GQA_ID2IMAGE
     if GQA_RAW_IMAGE_DATASET is None:
-        GQA_RAW_IMAGE_DATASET = load_dataset("lmms-lab/GQA", "testdev_balanced_images", split="testdev")
+        GQA_RAW_IMAGE_DATASET = load_dataset("lmms-lab/GQA", "testdev_balanced_images", split="testdev", token=True)
     GQA_ID2IMAGE = {}
     for row in GQA_RAW_IMAGE_DATASET:
         GQA_ID2IMAGE[row["id"]] = row["image"].convert("RGB")
diff --git a/lmms_eval/tasks/llava-in-the-wild/llava-in-the-wild.yaml b/lmms_eval/tasks/llava-in-the-wild/llava-in-the-wild.yaml
index 5c504a47..aa7b4fe4 100644
--- a/lmms_eval/tasks/llava-in-the-wild/llava-in-the-wild.yaml
+++ b/lmms_eval/tasks/llava-in-the-wild/llava-in-the-wild.yaml
@@ -18,18 +18,18 @@ generation_kwargs:
   do_sample: false
 process_results: !function utils.llava_process_results
 metric_list:
-  - metric: gpt_eval_llava_conv
-    aggregation: !function utils.llava_aggregation
-    higher_is_better: true
-  - metric: gpt_eval_llava_detail
-    aggregation: !function utils.llava_aggregation
-    higher_is_better: true
-  - metric: gpt_eval_llava_complex
-    aggregation: !function utils.llava_aggregation
-    higher_is_better: true
   - metric: gpt_eval_llava_all
     aggregation: !function utils.llava_aggregation
     higher_is_better: true
+  # - metric: gpt_eval_llava_conv
+  #   aggregation: !function utils.llava_aggregation
+  #   higher_is_better: true
+  # - metric: gpt_eval_llava_detail
+  #   aggregation: !function utils.llava_aggregation
+  #   higher_is_better: true
+  # - metric: gpt_eval_llava_complex
+  #   aggregation: !function utils.llava_aggregation
+  #   higher_is_better: true
 metadata:
   version: 0.0
   gpt_eval_model_name: "gpt-4-0314"
\ No newline at end of file
diff --git a/lmms_eval/tasks/llava-in-the-wild/utils.py b/lmms_eval/tasks/llava-in-the-wild/utils.py
index 5970caaa..903270a2 100644
--- a/lmms_eval/tasks/llava-in-the-wild/utils.py
+++ b/lmms_eval/tasks/llava-in-the-wild/utils.py
@@ -29,17 +29,28 @@ GPT_EVAL_MODEL_NAME = config["metadata"]["gpt_eval_model_name"]
 
 
-def get_eval(content: str, max_tokens: int):
+def get_eval(content: str, max_tokens: int, retries: int = 3):
     headers = {
         "Authorization": f"Bearer {API_KEY}",
         "Content-Type": "application/json",
     }
 
-    messages = [{"role": "system", "content": "You are a helpful and precise assistant for checking the quality of the answer."}, {"role": "user", "content": content}]
-
-    payload = {"model": GPT_EVAL_MODEL_NAME, "messages": messages, "temperature": 0.2, "max_tokens": max_tokens}
+    messages = [
+        {
+            "role": "system",
+            "content": "You are a helpful and precise assistant for checking the quality of the answer.",
+        },
+        {"role": "user", "content": content},
+    ]
+
+    payload = {
+        "model": GPT_EVAL_MODEL_NAME,
+        "messages": messages,
+        "temperature": 0.2,
+        "max_tokens": max_tokens,
+    }
 
-    while True:
+    for attempt in range(retries):
         try:
             response = requests.post(API_URL, headers=headers, json=payload)
             response.raise_for_status()
@@ -48,12 +59,15 @@ def get_eval(content: str, max_tokens: int):
             content = response_data["choices"][0]["message"]["content"].strip()
             if content != "":
                 return content, response_data["model"]
+            break  # If successful, break out of the loop
         except Exception as e:
-            eval_logger.info(f"Error in response : {response.json()['error']['message']}")
-            if "Rate limit" in str(e):
-                eval_logger.info("Sleeping due to rate limit...")
+            eval_logger.info(f"Attempt {attempt + 1} failed with error: {str(e)}")
+            if attempt < retries - 1:  # If we have retries left, sleep and then continue to next attempt
                 time.sleep(NUM_SECONDS_TO_SLEEP)
+            else:  # If this was the last attempt, log and return empty
+                eval_logger.error(f"All {retries} attempts failed. Last error message: {str(e)}")
Last error message: {str(e)}") + return "", "" return "", "" @@ -90,38 +104,59 @@ def llava_process_results(doc, result): Returns: a dictionary with key: metric name (in this case coco_bleu), value: metric value """ - question = doc["question"] - ans1 = doc["gpt_answer"] - ans2 = result[0] - if isinstance(doc["caption"], list): - context = "\n".join(doc["caption"]) - else: - context = doc["caption"] - category = "llava_bench_" + doc["category"] - rule = rule_dict[category] - prompt = rule["prompt"] - role = rule["role"] - content = f"[Context]\n{context}\n\n" f"[Question]\n{question}\n\n" f"[{role} 1]\n{ans1}\n\n[End of {role} 1]\n\n" f"[{role} 2]\n{ans2}\n\n[End of {role} 2]\n\n" f"[System]\n{prompt}\n\n" - - review, model_name = get_eval(content, 1024) - scores = parse_score(review) - metric = f"gpt_eval_llava_{doc['category']}" - review_dict = {"question": question, "ans1": ans1, "ans2": ans2, "context": context, "category": category, "review": review, "scores": scores, "eval_model": model_name} - - return {metric: review_dict, "gpt_eval_llava_all": review_dict} + try: + question = doc.get("question", "") + ans1 = doc.get("gpt_answer", "") + ans2 = result[0] if result else "" + captions = doc.get("caption", []) + context = "\n".join(captions) if isinstance(captions, list) else captions + category = "llava_bench_" + doc.get("category", "") + rule = rule_dict.get(category, {}) + prompt = rule.get("prompt", "") + role = rule.get("role", "user") + content = f"[Context]\n{context}\n\n" f"[Question]\n{question}\n\n" f"[{role} 1]\n{ans1}\n\n[End of {role} 1]\n\n" f"[{role} 2]\n{ans2}\n\n[End of {role} 2]\n\n" f"[System]\n{prompt}\n\n" + + review, model_name = get_eval(content, 1024) + scores = parse_score(review) + except Exception as e: + eval_logger.error(f"Error for Question ID: {doc.get('question_id', 'Unknown')}: {e}") + review = "Failed to Get a Proper Review." + model_name = "Failed Request" + scores = [-1, -1] + + # metric = f"gpt_eval_llava_{doc.get('category', 'unknown')}" + review_dict = { + "question": question, + "ans1": ans1, + "ans2": ans2, + "context": context, + "category": category, + "review": review, + "scores": scores, + "eval_model": model_name, + } + + return {"gpt_eval_llava_all": review_dict} + # return {metric: review_dict, "gpt_eval_llava_all": review_dict} def llava_aggregation(results): - scores = [] - category = results[0]["category"] - for result in results: - scores.append(result["scores"]) - - stats = np.asarray(scores).mean(0).tolist() - stats = [round(x, 3) for x in stats] - eval_logger.info(f"Model/GPT4 Score for {category}: {stats[1] / stats[0] * 100:.1f}%") - eval_logger.info(f"GPT4 Score for {category}: {stats[0] * 10:.1f}%") - eval_logger.info(f"Model Score for {category}: {stats[1] * 10:.1f}%") - # TODO: For KC, Please make the logging information more clear. e.g. GPT4 Score: 0.8, Model Score: 0.7... 
- eval_logger.info("=========================") - return round(stats[1] / stats[0] * 100, 1) + return 0 + try: + scores = [] + category = results[0]["category"] + for result in results: + scores.append(result["scores"]) + + stats = np.asarray(scores).mean(0).tolist() + stats = [round(x, 3) for x in stats] + gpt4_score_percentage = stats[0] * 10 + model_score_percentage = stats[1] * 10 + eval_logger.info(f"Category: {category}") + eval_logger.info(f"GPT4 Score: {gpt4_score_percentage:.1f}%") + eval_logger.info(f"Model Score: {model_score_percentage:.1f}%") + eval_logger.info("=========================") + return round(stats[1] / stats[0] * 100, 1) + except Exception as e: + eval_logger.error(f"Error in llava_aggregation: {e}") + return None diff --git a/lmms_eval/tasks/mmvet/utils.py b/lmms_eval/tasks/mmvet/utils.py index 4e7a78b6..2972c91d 100644 --- a/lmms_eval/tasks/mmvet/utils.py +++ b/lmms_eval/tasks/mmvet/utils.py @@ -34,7 +34,7 @@ """ -def get_chat_response(prompt, model=GPT_EVAL_MODEL_NAME, temperature=0.0, max_tokens=128, patience=3, sleep_time=15): +def get_chat_response(prompt, model=GPT_EVAL_MODEL_NAME, temperature=0.0, max_tokens=128, patience=3, sleep_time=5): headers = { "Authorization": f"Bearer {API_KEY}", "Content-Type": "application/json", @@ -84,7 +84,6 @@ def mmvet_doc_to_visual(doc): def mmvet_process_results(doc, results): # get pred and ground truth here - target = doc["answer"] pred = results[0] question = doc["question"] answer = doc["answer"] @@ -114,7 +113,7 @@ def mmvet_process_results(doc, results): if temperature >= 2: # Assuming a max temperature threshold score = 0.0 grade_sample_run_complete = True - eval_logger.info(f"{doc['question_id']} failed to get a score.") + eval_logger.info(f"Reach to max trials, {doc['question_id']} failed to get a score.") else: score = 0.0 grade_sample_run_complete = True @@ -133,29 +132,6 @@ def mmvet_process_results(doc, results): } -# # all appearances of combination of cap needed for each question in the dataset -# capability_list = [ -# {"ocr", "math"}, -# {"ocr", "spat", "math"}, -# {"rec", "ocr", "spat", "math"}, -# {"rec", "spat"}, -# {"ocr", "spat"}, -# {"rec", "ocr", "spat"}, -# {"know", "ocr", "spat"}, -# {"rec", "ocr"}, -# {"rec", "spat", "know"}, -# {"ocr"}, -# {"rec"}, -# {"rec", "know"}, -# {"rec", "gen", "know"}, -# {"rec", "ocr", "gen", "know"}, -# {"rec", "ocr", "gen", "spat"}, -# {"ocr", "gen", "spat"}, -# ] - -# # count of each capability in the dataset -# capability_counter_list = [11, 14, 1, 12, 26, 7, 3, 4, 2, 12, 37, 9, 62, 8, 8, 2] - cap_columns = pd.DataFrame(["rec", "ocr", "know", "gen", "spat", "math", "total"]) cap_details_columns = pd.DataFrame( [