From 801829a25da6876ae3d41f171fbd463ca957ed77 Mon Sep 17 00:00:00 2001 From: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun, 11 Feb 2024 00:54:23 +0800 Subject: [PATCH] [Feat] Add qwen loglikelihood (#43) * Add qwen loglikelihood * Revise the pyproject dependency. Move tiktoken out from optional-dependencies * Add ferret-bench * Add seedbench 2, test on llava --- lmms_eval/models/model_utils/__init__.py | 0 .../model_utils/qwen/qwen_generate_utils.py | 419 ++++++++++++++++++ lmms_eval/models/qwen_vl.py | 71 +++ lmms_eval/tasks/ferret/ferret.yaml | 35 ++ lmms_eval/tasks/ferret/rule.json | 5 + lmms_eval/tasks/ferret/utils.py | 202 +++++++++ lmms_eval/tasks/seedbench_2/seedbench_2.yaml | 46 ++ lmms_eval/tasks/seedbench_2/utils.py | 53 +++ pyproject.toml | 4 +- 9 files changed, 834 insertions(+), 1 deletion(-) create mode 100644 lmms_eval/models/model_utils/__init__.py create mode 100644 lmms_eval/models/model_utils/qwen/qwen_generate_utils.py create mode 100644 lmms_eval/tasks/ferret/ferret.yaml create mode 100644 lmms_eval/tasks/ferret/rule.json create mode 100644 lmms_eval/tasks/ferret/utils.py create mode 100644 lmms_eval/tasks/seedbench_2/seedbench_2.yaml create mode 100644 lmms_eval/tasks/seedbench_2/utils.py diff --git a/lmms_eval/models/model_utils/__init__.py b/lmms_eval/models/model_utils/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/lmms_eval/models/model_utils/qwen/qwen_generate_utils.py b/lmms_eval/models/model_utils/qwen/qwen_generate_utils.py new file mode 100644 index 00000000..f198c267 --- /dev/null +++ b/lmms_eval/models/model_utils/qwen/qwen_generate_utils.py @@ -0,0 +1,419 @@ +# Copyright (c) Alibaba Cloud. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +"""Generation support.""" + +from typing import Tuple, List, Union, Iterable + +import numpy as np +import torch +import torch.nn.functional as F +from transformers import PreTrainedTokenizer +from transformers import logging +from transformers.generation import LogitsProcessor + +logger = logging.get_logger(__name__) + +# Types. +HistoryType = List[Tuple[str, str]] +TokensType = List[int] +BatchTokensType = List[List[int]] + + +def pad_batch(batch: BatchTokensType, pad_id: int, seq_length: int) -> BatchTokensType: + for tokens in batch: + context_length = len(tokens) + if context_length < seq_length: + tokens.extend([pad_id] * (seq_length - context_length)) + return batch + + +def get_ltor_masks_and_position_ids( + data, + eod_token, + reset_position_ids, + reset_attention_mask, + eod_mask_loss, +): + """Build masks and position id for left to right model.""" + + # Extract batch size and sequence length. + micro_batch_size, seq_length = data.size() + + # Attention mask (lower triangular). + if reset_attention_mask: + att_mask_batch = micro_batch_size + else: + att_mask_batch = 1 + attention_mask = torch.tril( + torch.ones((att_mask_batch, seq_length, seq_length), device=data.device) + ).view(att_mask_batch, 1, seq_length, seq_length) + + # Loss mask. + loss_mask = torch.ones(data.size(), dtype=torch.float, device=data.device) + if eod_mask_loss: + loss_mask[data == eod_token] = 0.0 + + # Position ids. + position_ids = torch.arange(seq_length, dtype=torch.long, device=data.device) + position_ids = position_ids.unsqueeze(0).expand_as(data) + # We need to clone as the ids will be modifed based on batch index. 
+ if reset_position_ids: + position_ids = position_ids.clone() + + if reset_position_ids or reset_attention_mask: + # Loop through the batches: + for b in range(micro_batch_size): + + # Find indecies where EOD token is. + eod_index = position_ids[b, data[b] == eod_token] + # Detach indecies from positions if going to modify positions. + if reset_position_ids: + eod_index = eod_index.clone() + + # Loop through EOD indecies: + prev_index = 0 + for j in range(eod_index.size()[0]): + i = eod_index[j] + # Mask attention loss. + if reset_attention_mask: + attention_mask[b, 0, (i + 1) :, : (i + 1)] = 0 + # Reset positions. + if reset_position_ids: + position_ids[b, (i + 1) :] -= i + 1 - prev_index + prev_index = i + 1 + + # Convert attention mask to binary: + attention_mask = attention_mask < 0.5 + + return attention_mask, loss_mask, position_ids + + +def get_batch(context_tokens: torch.LongTensor, eod_id: int): + """Generate batch from context tokens.""" + # Move to GPU. + tokens = context_tokens.contiguous().to(context_tokens.device) + # Get the attention mask and postition ids. + attention_mask, _, position_ids = get_ltor_masks_and_position_ids( + tokens, + eod_id, + reset_position_ids=False, + reset_attention_mask=False, + eod_mask_loss=False, + ) + return tokens, attention_mask, position_ids + + +def get_stop_words_ids(chat_format, tokenizer): + if chat_format == "raw": + stop_words_ids = [tokenizer.encode("Human:"), [tokenizer.eod_id]] + elif chat_format == "chatml": + stop_words_ids = [[tokenizer.im_end_id], [tokenizer.im_start_id]] + else: + raise NotImplementedError(f"Unknown chat format {chat_format!r}") + return stop_words_ids + + +def make_context( + tokenizer: PreTrainedTokenizer, + query: str, + history: List[Tuple[str, str]] = None, + system: str = "", + max_window_size: int = 6144, + chat_format: str = "chatml", +): + if history is None: + history = [] + + if chat_format == "chatml": + im_start, im_end = "<|im_start|>", "<|im_end|>" + im_start_tokens = [tokenizer.im_start_id] + im_end_tokens = [tokenizer.im_end_id] + nl_tokens = tokenizer.encode("\n") + + def _tokenize_str(role, content): + return f"{role}\n{content}", tokenizer.encode( + role, allowed_special=set(tokenizer.IMAGE_ST) + ) + nl_tokens + tokenizer.encode(content, allowed_special=set(tokenizer.IMAGE_ST)) + + system_text, system_tokens_part = _tokenize_str("system", system) + system_tokens = im_start_tokens + system_tokens_part + im_end_tokens + + raw_text = "" + context_tokens = [] + + for turn_query, turn_response in reversed(history): + query_text, query_tokens_part = _tokenize_str("user", turn_query) + query_tokens = im_start_tokens + query_tokens_part + im_end_tokens + if turn_response is not None: + response_text, response_tokens_part = _tokenize_str( + "assistant", turn_response + ) + response_tokens = im_start_tokens + response_tokens_part + im_end_tokens + + next_context_tokens = nl_tokens + query_tokens + nl_tokens + response_tokens + prev_chat = ( + f"\n{im_start}{query_text}{im_end}\n{im_start}{response_text}{im_end}" + ) + else: + next_context_tokens = nl_tokens + query_tokens + nl_tokens + prev_chat = f"\n{im_start}{query_text}{im_end}\n" + + current_context_size = ( + len(system_tokens) + len(next_context_tokens) + len(context_tokens) + ) + if current_context_size < max_window_size: + context_tokens = next_context_tokens + context_tokens + raw_text = prev_chat + raw_text + else: + break + + context_tokens = system_tokens + context_tokens + raw_text = f"{im_start}{system_text}{im_end}" + raw_text + 
context_tokens += ( + nl_tokens + + im_start_tokens + + _tokenize_str("user", query)[1] + + im_end_tokens + + nl_tokens + + im_start_tokens + + tokenizer.encode("assistant") + + nl_tokens + ) + raw_text += f"\n{im_start}user\n{query}{im_end}\n{im_start}assistant\n" + + elif chat_format == "raw": + raw_text = query + context_tokens = tokenizer.encode(raw_text) + else: + raise NotImplementedError(f"Unknown chat format {chat_format!r}") + + return raw_text, context_tokens + + +def _decode_default( + tokens: List[int], + *, + stop_words: List[str], + eod_words: List[str], + tokenizer: PreTrainedTokenizer, + raw_text_len: int, + verbose: bool = False, + return_end_reason: bool = False, + errors: str='replace', +): + trim_decode_tokens = tokenizer.decode(tokens, errors=errors)[raw_text_len:] + if verbose: + print("\nRaw Generate: ", trim_decode_tokens) + + end_reason = f"Gen length {len(tokens)}" + for stop_word in stop_words: + trim_decode_tokens = trim_decode_tokens.replace(stop_word, "").strip() + for eod_word in eod_words: + if eod_word in trim_decode_tokens: + end_reason = f"Gen {eod_word!r}" + trim_decode_tokens = trim_decode_tokens.split(eod_word)[0] + trim_decode_tokens = trim_decode_tokens.strip() + if verbose: + print("\nEnd Reason:", end_reason) + print("\nGenerate: ", trim_decode_tokens) + + if return_end_reason: + return trim_decode_tokens, end_reason + else: + return trim_decode_tokens + + +def _decode_chatml( + tokens: List[int], + *, + stop_words: List[str], + eod_token_ids: List[int], + tokenizer: PreTrainedTokenizer, + raw_text_len: int, + context_length: int, + verbose: bool = False, + return_end_reason: bool = False, + errors: str='replace' +): + end_reason = f"Gen length {len(tokens)}" + eod_token_idx = context_length + for eod_token_idx in range(context_length, len(tokens)): + if tokens[eod_token_idx] in eod_token_ids: + end_reason = f"Gen {tokenizer.decode([tokens[eod_token_idx]])!r}" + break + + trim_decode_tokens = tokenizer.decode(tokens[:eod_token_idx], errors=errors)[raw_text_len:] + if verbose: + print("\nRaw Generate w/o EOD:", tokenizer.decode(tokens, errors=errors)[raw_text_len:]) + print("\nRaw Generate:", trim_decode_tokens) + print("\nEnd Reason:", end_reason) + for stop_word in stop_words: + trim_decode_tokens = trim_decode_tokens.replace(stop_word, "").strip() + trim_decode_tokens = trim_decode_tokens.strip() + if verbose: + print("\nGenerate:", trim_decode_tokens) + + if return_end_reason: + return trim_decode_tokens, end_reason + else: + return trim_decode_tokens + + +def decode_tokens( + tokens: Union[torch.LongTensor, TokensType], + tokenizer: PreTrainedTokenizer, + raw_text_len: int, + context_length: int, + chat_format: str, + verbose: bool = False, + return_end_reason: bool = False, + errors: str="replace", +) -> str: + if torch.is_tensor(tokens): + tokens = tokens.cpu().numpy().tolist() + + if chat_format == "chatml": + return _decode_chatml( + tokens, + stop_words=[], + eod_token_ids=[tokenizer.im_start_id, tokenizer.im_end_id], + tokenizer=tokenizer, + raw_text_len=raw_text_len, + context_length=context_length, + verbose=verbose, + return_end_reason=return_end_reason, + errors=errors, + ) + elif chat_format == "raw": + return _decode_default( + tokens, + stop_words=["<|endoftext|>"], + eod_words=["<|endoftext|>"], + tokenizer=tokenizer, + raw_text_len=raw_text_len, + verbose=verbose, + return_end_reason=return_end_reason, + errors=errors, + ) + else: + raise NotImplementedError(f"Unknown chat format {chat_format!r}") + + +class 
StopWordsLogitsProcessor(LogitsProcessor): + """ + :class:`transformers.LogitsProcessor` that enforces that when specified sequences appear, stop geration. + Args: + stop_words_ids (:obj:`List[List[int]]`): + List of list of token ids of stop ids. In order to get the tokens of the words + that should not appear in the generated text, use :obj:`tokenizer(bad_word, + add_prefix_space=True).input_ids`. + eos_token_id (:obj:`int`): + The id of the `end-of-sequence` token. + """ + + def __init__(self, stop_words_ids: Iterable[Iterable[int]], eos_token_id: int): + + if not isinstance(stop_words_ids, List) or len(stop_words_ids) == 0: + raise ValueError( + f"`stop_words_ids` has to be a non-emtpy list, but is {stop_words_ids}." + ) + if any(not isinstance(bad_word_ids, list) for bad_word_ids in stop_words_ids): + raise ValueError( + f"`stop_words_ids` has to be a list of lists, but is {stop_words_ids}." + ) + if any( + any( + (not isinstance(token_id, (int, np.integer)) or token_id < 0) + for token_id in stop_word_ids + ) + for stop_word_ids in stop_words_ids + ): + raise ValueError( + f"Each list in `stop_words_ids` has to be a list of positive integers, but is {stop_words_ids}." + ) + + self.stop_words_ids = list( + filter( + lambda bad_token_seq: bad_token_seq != [eos_token_id], stop_words_ids + ) + ) + self.eos_token_id = eos_token_id + for stop_token_seq in self.stop_words_ids: + assert ( + len(stop_token_seq) > 0 + ), "Stop words token sequences {} cannot have an empty list".format( + stop_words_ids + ) + + def __call__( + self, input_ids: torch.LongTensor, scores: torch.FloatTensor + ) -> torch.FloatTensor: + stopped_samples = self._calc_stopped_samples(input_ids) + for i, should_stop in enumerate(stopped_samples): + if should_stop: + scores[i, self.eos_token_id] = float(2**15) + return scores + + def _tokens_match(self, prev_tokens: torch.LongTensor, tokens: List[int]) -> bool: + if len(tokens) == 0: + # if bad word tokens is just one token always ban it + return True + elif len(tokens) > len(prev_tokens): + # if bad word tokens are longer then prev input_ids they can't be equal + return False + elif prev_tokens[-len(tokens) :].tolist() == tokens: + # if tokens match + return True + else: + return False + + def _calc_stopped_samples(self, prev_input_ids: Iterable[int]) -> Iterable[int]: + stopped_samples = [] + for prev_input_ids_slice in prev_input_ids: + match = False + for stop_token_seq in self.stop_words_ids: + if self._tokens_match(prev_input_ids_slice, stop_token_seq): + # if tokens do not match continue + match = True + break + stopped_samples.append(match) + + return stopped_samples + + +def top_k_logits(logits, top_k=0, top_p=0.0, filter_value=-float("Inf")): + """This function has been mostly taken from huggingface conversational + ai code at + https://medium.com/huggingface/how-to-build-a-state-of-the-art- + conversational-ai-with-transfer-learning-2d818ac26313""" + + if top_k > 0: + # Remove all tokens with a probability less than the + # last token of the top-k + indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None] + logits[indices_to_remove] = filter_value + + if top_p > 0.0: + # Cconvert to 1D + sorted_logits, sorted_indices = torch.sort(logits, descending=True, dim=-1) + cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1) + + # Remove tokens with cumulative probability above the threshold + sorted_indices_to_remove = cumulative_probs > top_p + # Shift the indices to the right to keep also the first token + # above the threshold + 
sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone() + sorted_indices_to_remove[..., 0] = 0 + for i in range(sorted_indices.size(0)): + indices_to_remove = sorted_indices[i][sorted_indices_to_remove[i]] + logits[i][indices_to_remove] = filter_value + + return logits + + +def switch(val1, val2, boolean): + boolean = boolean.type_as(val1) + return (1 - boolean) * val1 + boolean * val2 \ No newline at end of file diff --git a/lmms_eval/models/qwen_vl.py b/lmms_eval/models/qwen_vl.py index 7ad36300..01334c51 100644 --- a/lmms_eval/models/qwen_vl.py +++ b/lmms_eval/models/qwen_vl.py @@ -5,6 +5,7 @@ from lmms_eval.api.instance import Instance from lmms_eval.api.model import lmms from lmms_eval.api.registry import register_model +from lmms_eval.models.model_utils.qwen.qwen_generate_utils import make_context from accelerate import Accelerator, DistributedType from typing import List, Optional, Union, Tuple import uuid @@ -115,6 +116,76 @@ def world_size(self): return self._world_size def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]: + res = [] + pbar = tqdm(total=len(requests), disable=(self.rank != 0), desc="Model Responding") + + for contexts, doc_to_target, doc_to_visual, doc_id, task, split in [reg.args for reg in requests]: + # encode, pad, and truncate contexts for this batch + if type(doc_to_target) == str: + continuation = doc_to_target + else: + continuation = doc_to_target(self.task_dict[task][split][doc_id]) + visuals = [doc_to_visual(self.task_dict[task][split][doc_id])] + visuals = self.flatten(visuals) + query = [] + visual_paths = [] + for visual in visuals: + name = uuid.uuid4().hex.upper()[0:6] + visual.save(f"/tmp/{name}.png") + visual_paths.append(f"/tmp/{name}.png") + query.append({ + 'image' : f"/tmp/{name}.png" + }) + + # Make a copy for query to save context (text that needs to be masked) + context_query = [ _ for _ in query] + context_query.append({'text' : contexts}) + query.append({'text' : contexts + continuation}) + + context_query = self.tokenizer.from_list_format(context_query) + query = self.tokenizer.from_list_format(query) + + raw_contxt_text, context_tokens = make_context( + self.tokenizer, + context_query, + history=None, + system='You are a helpful assistant', + max_window_size=self.model.generation_config.max_window_size, + chat_format=self.model.generation_config.chat_format + ) + context_tokens = torch.tensor([context_tokens]) + + raw_continuation_text, continuation_tokens = make_context( + self.tokenizer, + query, + history=None, + system='You are a helpful assistant', + max_window_size=self.model.generation_config.max_window_size, + chat_format=self.model.generation_config.chat_format + ) + continuation_tokens = torch.tensor([continuation_tokens]).to(self.model.device) + attn_mask = torch.ones_like(continuation_tokens).to(self.model.device) + labels = continuation_tokens.clone().to(self.model.device) + labels[:, :context_tokens.shape[1]] = -100 + with torch.inference_mode(): + outputs = self.model( + input_ids = continuation_tokens, + labels=labels, + attention_mask = attn_mask + ) + loss = outputs.loss + logits = outputs["logits"] + greedy_tokens = logits.argmax(dim=-1) + cont_toks = continuation_tokens[:, context_tokens.shape[1] :] + greedy_tokens = greedy_tokens[:, context_tokens.shape[1] : continuation_tokens.shape[1]] # [1, seq] + max_equal = (greedy_tokens == cont_toks).all() + res.append((float(loss.item()), bool(max_equal))) + pbar.update(1) + + pbar.close() + return res + + assert False, "We 
have not implemented this function for Qwen VL yet" def loglikelihood_rolling(self, requests: List[Instance]) -> List[float]: diff --git a/lmms_eval/tasks/ferret/ferret.yaml b/lmms_eval/tasks/ferret/ferret.yaml new file mode 100644 index 00000000..ad78c6b6 --- /dev/null +++ b/lmms_eval/tasks/ferret/ferret.yaml @@ -0,0 +1,35 @@ +dataset_path: lmms-lab/Ferret-Bench +dataset_kwargs: + token: True +task: "ferret" +test_split: test +output_type: generate_until +doc_to_visual: !function utils.ferret_doc_to_visual +doc_to_text: !function utils.ferret_doc_to_text +doc_to_target: "gpt_answer" +generation_kwargs: + until: + - "ASSISTANT:" + image_aspect_ratio: original + max_new_tokens: 1024 + temperature: 0 + top_p: 0 + num_beams: 1 + do_sample: false +process_results: !function utils.ferret_process_results +metric_list: + - metric: gpt_eval_ferret_all + aggregation: !function utils.ferret_all_aggregation + higher_is_better: true + - metric: gpt_eval_ferret_refer_desc + aggregation: !function utils.ferret_refer_desc_aggregation + higher_is_better: true + - metric: gpt_eval_ferret_refer_reason + aggregation: !function utils.ferret_refer_reason_aggregation + higher_is_better: true + - metric: gpt_eval_ferret_ground_conv + aggregation: !function utils.ferret_ground_conv_aggregation + higher_is_better: true +metadata: + version: 0.0 + gpt_eval_model_name: "gpt-4-0314" \ No newline at end of file diff --git a/lmms_eval/tasks/ferret/rule.json b/lmms_eval/tasks/ferret/rule.json new file mode 100644 index 00000000..7294372c --- /dev/null +++ b/lmms_eval/tasks/ferret/rule.json @@ -0,0 +1,5 @@ +{ + "refer_desc": {"role": "Assistant", "prompt": "We would like to request your feedback on the performance of two AI assistants in response to the user question displayed above. The user asks the question about specific region of an image. For your reference, the visual content in the image is represented with five descriptive sentences describing the same image. In addition, specific object locations within the image are given, along with detailed coordinates. These coordinates are in the form of bounding boxes, represented as (x1, y1, x2, y2) with floating numbers ranging from 0 to 1. These values correspond to the top left x, top left y, bottom right x, and bottom right y. Also, the relationships between pairs of objects are provided, in the format of object -> relationship -> subject, where the object/subject are indexed by object id from previous object lists as well as the object names. Also, several region description are given, each describing a box region of image, with detailed coordinates. \nPlease rate the spatial correspondence, helpfulness, relevance, accuracy, level of details of their responses. Each assistant receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance.\nPlease first output a single line containing only two values indicating the scores for Assistant 1 and 2, respectively. The two scores are separated by a space.\nIn the subsequent line, please provide a comprehensive explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment."}, + "refer_reason": {"role": "Assistant", "prompt": "We would like to request your feedback on the performance of two AI assistants in response to the user question displayed above. The user asks the question about specific region of an image. 
For your reference, the visual content in the image is represented with five descriptive sentences describing the same image. In addition, specific object locations within the image are given, along with detailed coordinates. These coordinates are in the form of bounding boxes, represented as (x1, y1, x2, y2) with floating numbers ranging from 0 to 1. These values correspond to the top left x, top left y, bottom right x, and bottom right y. Also, the relationships between pairs of objects are provided, in the format of object -> relationship -> subject, where the object/subject are indexed by object id from previous object lists as well as the object names. Also, several region description are given, each describing a box region of image, with detailed coordinates. \nPlease rate the spatial correspondence, helpfulness, relevance, accuracy, level of details of their responses. Each assistant receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance.\nPlease first output a single line containing only two values indicating the scores for Assistant 1 and 2, respectively. The two scores are separated by a space.\nIn the subsequent line, please provide a comprehensive explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment."}, + "ground_conv": {"role": "Assistant", "prompt": "We would like to request your feedback on the performance of two AI assistants in response to the user question displayed above. The user asks the question that requires model to predict the coordinates of relevant object. For your reference, the visual content in the image is represented with five descriptive sentences describing the same image. In addition, specific object locations within the image are given, along with detailed coordinates. These coordinates are in the form of bounding boxes, represented as (x1, y1, x2, y2) with floating numbers ranging from 0 to 1. These values correspond to the top left x, top left y, bottom right x, and bottom right y. Also, the relationships between pairs of objects are provided, in the format of object -> relationship -> subject, where the object/subject are indexed by object id from previous object lists as well as the object names. Also, several region description are given, each describing a box region of image, with detailed coordinates. \nPlease rate the predicted coordinates, helpfulness, relevance, accuracy, level of details of their responses. Specifically, pay your attention to the precision of the coordinates and whether it matches the object. Small deviation (<20% of ground-truth box width or height) of coordinates is allowed and shouldn't be punished. More than that, the degree of deviation should be reflected in scoring too. Each assistant receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance.\nPlease first output a single line containing only two values indicating the scores for Assistant 1 and 2, respectively. 
The two scores are separated by a space.\nIn the subsequent line, please provide a comprehensive explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment."} +} \ No newline at end of file diff --git a/lmms_eval/tasks/ferret/utils.py b/lmms_eval/tasks/ferret/utils.py new file mode 100644 index 00000000..c04f2bb9 --- /dev/null +++ b/lmms_eval/tasks/ferret/utils.py @@ -0,0 +1,202 @@ +import json +import logging +import os +import requests +import numpy as np +import openai +from openai import OpenAI +import time +import yaml +from pathlib import Path +from copy import deepcopy + +eval_logger = logging.getLogger("lmms-eval") +NUM_SECONDS_TO_SLEEP = 0.5 + +FERRET_W_METRICS = ["gpt_eval_ferret_refer_desc", "gpt_eval_ferret_refer_reason", "gpt_eval_ferret_ground_conv"] + +rule_dict = json.load(open(os.path.join(os.path.dirname(os.path.abspath(__file__)), "rule.json"), "r")) + +with open(Path(__file__).parent / "ferret.yaml", "r") as f: + raw_data = f.readlines() + safe_data = [] + for i, line in enumerate(raw_data): + # remove function definition since yaml load cannot handle it + if "!function" not in line: + safe_data.append(line) + + config = yaml.safe_load("".join(safe_data)) + +GPT_EVAL_MODEL_NAME = config["metadata"]["gpt_eval_model_name"] + +API_TYPE = os.getenv("API_TYPE", "openai") + +if API_TYPE == "openai": + API_URL = os.getenv("OPENAI_API_URL", "https://api.openai.com/v1/chat/completions") + API_KEY = os.getenv("OPENAI_API_KEY", "YOUR_API_KEY") + headers = { + "Authorization": f"Bearer {API_KEY}", + "Content-Type": "application/json", + } +elif API_TYPE == "azure": + API_URL = os.getenv("AZURE_ENDPOINT", "https://api.cognitive.microsoft.com/sts/v1.0/issueToken") + API_KEY = os.getenv("AZURE_API_KEY", "YOUR_API_KEY") + headers = { + "api-key": API_KEY, + "Content-Type": "application/json", + } + + +def get_eval(content: str, max_tokens: int, retries: int = 3): + global headers + + messages = [ + { + "role": "system", + "content": "You are a helpful and precise assistant for checking the quality of the answer.", + }, + {"role": "user", "content": content}, + ] + + payload = { + "model": GPT_EVAL_MODEL_NAME, + "messages": messages, + "temperature": 0.2, + "max_tokens": max_tokens, + } + + for attempt in range(retries): + try: + response = requests.post(API_URL, headers=headers, json=payload) + response.raise_for_status() + response_data = response.json() + + content = response_data["choices"][0]["message"]["content"].strip() + if content != "": + return content, response_data["model"] + break # If successful, break out of the loop + + except Exception as e: + eval_logger.info(f"Attempt {attempt + 1} failed with error: {e}") + if attempt < retries - 1: # If we have retries left, sleep and then continue to next attempt + time.sleep(NUM_SECONDS_TO_SLEEP) + else: # If this was the last attempt, log and return empty + eval_logger.error(f"All {retries} attempts failed. Last error message: {e}") + return "", "" + return "", "" + + +def parse_score(review): + try: + score_pair = review.split("\n")[0] + score_pair = score_pair.replace(",", " ") + sp = score_pair.split(" ") + if len(sp) == 2: + return [float(sp[0]), float(sp[1])] + else: + eval_logger.debug(f"Can not split: {review}. Returning [-1, -1]") + return [-1, -1] + except Exception as e: + eval_logger.debug(f"Error: {e}. 
Returning [-1, -1]") + return [-1, -1] + + +def ferret_doc_to_visual(doc): + return [doc["image"].convert("RGB")] + + +def ferret_doc_to_text(doc): + question = doc["question"] + return question + + +def ferret_process_results(doc, result): + """ + Args: + doc: a instance of the eval dataset + results: [pred] + Returns: + a dictionary with key: metric name (in this case coco_bleu), value: metric value + """ + try: + question = doc.get("question", "") + ans1 = doc.get("gpt_answer", "") + ans2 = result[0] if result else "" + context = doc.get("context", []) + context = "\n".join(context) if isinstance(context, list) else context + category = doc.get("category", "") + rule = rule_dict.get(category, {}) + prompt = rule.get("prompt", "") + role = rule.get("role", "user") + content = f"[Context]\n{context}\n\n" f"[Question]\n{question}\n\n" f"[{role} 1]\n{ans1}\n\n[End of {role} 1]\n\n" f"[{role} 2]\n{ans2}\n\n[End of {role} 2]\n\n" f"[System]\n{prompt}\n\n" + review, model_name = get_eval(content, 1024) + scores = parse_score(review) + except Exception as e: + eval_logger.error(f"Error for Question ID: {doc.get('question_id', 'Unknown')}: {e}") + review = "Failed to Get a Proper Review." + model_name = "Failed Request" + scores = [-1, -1] + + metric = f"gpt_eval_ferret_{doc.get('category', 'all')}" + category_review_dict = { + "question": question, + "ans1": ans1, + "ans2": ans2, + "context": context, + "category": category, + "review": review, + "scores": scores, + "eval_model": model_name, + } + + non_category_review_dict = deepcopy(category_review_dict) + non_category_review_dict["scores"] = [-999, -999] + + data_dict = {} + for m in FERRET_W_METRICS: + if m == metric: + data_dict[m] = category_review_dict + else: + data_dict[m] = non_category_review_dict + data_dict["gpt_eval_ferret_all"] = category_review_dict + + # return {"gpt_eval_ferret_all": review_dict} + return data_dict + + +def ferret_refer_desc_aggregation(results): + return ferret_aggregation(results, "refer_desc") + + +def ferret_refer_reason_aggregation(results): + return ferret_aggregation(results, "refer_reason") + + +def ferret_ground_conv_aggregation(results): + return ferret_aggregation(results, "ground_conv") + + +def ferret_all_aggregation(results): + return ferret_aggregation(results, "all") + + +def ferret_aggregation(results, category): + try: + scores = [] + for result in results: + if -999 in result["scores"]: + continue + scores.append(result["scores"]) + + stats = np.asarray(scores).mean(0).tolist() + stats = [round(x, 3) for x in stats] + # gpt4_score_percentage = stats[0] * 10 + # model_score_percentage = stats[1] * 10 + # eval_logger.info(f"Category: {category}") + # eval_logger.info(f"GPT4 Score: {gpt4_score_percentage:.1f}%") + # eval_logger.info(f"Model Score: {model_score_percentage:.1f}%") + # eval_logger.info("=========================") + return round(stats[1] / stats[0] * 100, 1) + except Exception as e: + eval_logger.info(f"Error in ferret_aggregation: {e}, and in category: {category}") + return None diff --git a/lmms_eval/tasks/seedbench_2/seedbench_2.yaml b/lmms_eval/tasks/seedbench_2/seedbench_2.yaml new file mode 100644 index 00000000..6113f284 --- /dev/null +++ b/lmms_eval/tasks/seedbench_2/seedbench_2.yaml @@ -0,0 +1,46 @@ +dataset_path: lmms-lab/SEED-Bench-2 +dataset_kwargs: + token: True +task: "seedbench-2" +test_split: test +output_type: generate_until +doc_to_visual: !function utils.seed_doc_to_visual +doc_to_text: !function utils.seed_doc_to_text +doc_to_target: "answer" 
+generation_kwargs:
+  until:
+    - "ASSISTANT:"
+  max_new_tokens: 16
+  image_aspect_ratio: original
+# The return value of process_results will be used by metrics
+process_results: !function utils.seed_process_result
+# Note that the metric name can be either a registered metric function (such as the case for GQA) or a key name returned by process_results
+metric_list:
+  - metric: seed_Video
+    aggregation: !function utils.seed_aggregation_result
+    higher_is_better: true
+  - metric: seed_Multiple_Images
+    aggregation: !function utils.seed_aggregation_result
+    higher_is_better: true
+  - metric: seed_Image_&_Text_Generation
+    aggregation: !function utils.seed_aggregation_result
+    higher_is_better: true
+  - metric: seed_Single_Image
+    aggregation: !function utils.seed_aggregation_result
+    higher_is_better: true
+  - metric: seed_Image_Generation
+    aggregation: !function utils.seed_aggregation_result
+    higher_is_better: true
+  - metric: seed_Interleaved_Image
+    aggregation: !function utils.seed_aggregation_result
+    higher_is_better: true
+  - metric: seed_all
+    aggregation: !function utils.seed_aggregation_result
+    higher_is_better: true
+metadata:
+  - version: 0.0
+
+model_specific_prompt_kwargs:
+  llava :
+    img_token : 
+    post_prompt : "Answer with the option's letter from the given choices directly."
\ No newline at end of file
diff --git a/lmms_eval/tasks/seedbench_2/utils.py b/lmms_eval/tasks/seedbench_2/utils.py
new file mode 100644
index 00000000..0f78cfe2
--- /dev/null
+++ b/lmms_eval/tasks/seedbench_2/utils.py
@@ -0,0 +1,53 @@
+import json
+
+
+def seed_doc_to_visual(doc):
+    return [image.convert("RGB") for image in doc["image"]]
+
+def parse_choice_img(choice : str, img_token : str):
+    if "jpg" in choice or "png" in choice:
+        return img_token
+    # fall back to the plain text choice when it is not an image path
+    return choice
+
+def seed_doc_to_text(doc, model_specific_kwargs = None):
+    question = doc["question"]
+    question.replace("", model_specific_kwargs['img_token'])
+    question += "\n" + f"A. {parse_choice_img(doc['choice_a'], model_specific_kwargs['img_token'])}\n"
+    question += f"B. {parse_choice_img(doc['choice_b'], model_specific_kwargs['img_token'])}\n"
+    question += f"C. {parse_choice_img(doc['choice_c'], model_specific_kwargs['img_token'])}\n"
+    question += f"D. {parse_choice_img(doc['choice_d'], model_specific_kwargs['img_token'])}"
+    if (doc['data_type'] == "Image Generation"):
+        num_img_in_question = len(doc['image']) - 4
+        prepend_tokens = [model_specific_kwargs['img_token']] * num_img_in_question
+        question = " ".join(prepend_tokens) + "\n" + question
+    return f"{question}\n{model_specific_kwargs['post_prompt']}"
+
+
+def seed_process_result(doc, result):
+    pred = result[0].strip()
+    answer = doc["answer"]
+    data_type = doc["data_type"].split(" ")
+    data_type = "_".join(data_type)
+
+    return {f"seed_{data_type}": {"pred": pred, "answer": answer, "question_id": doc["question_id"]}, f"seed_all": {"pred": pred, "answer": answer, "question_id": doc["question_id"]}}
+
+
+def seed_aggregation_result(results):
+    total_count = 0
+    total_correct = 0
+    for result in results:
+        if result["pred"] == result["answer"]:
+            total_correct += 1
+        total_count += 1
+    return total_correct / total_count if total_count != 0 else 0
+
+
+def seed_aggregation_result_all(results):
+    score = seed_aggregation_result(results)
+    stored_results = []
+    for result in results:
+        stored_results.append({"question_id": result["question_id"], "prediction": result["pred"]})
+    with open("./seed_submission.json", "w") as f:
+        json.dump(stored_results, f, indent=4)
+        print("Storing files for seed_submission ...")
+
+    return score
diff --git a/pyproject.toml b/pyproject.toml
index 2c64a8bc..80da628f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -51,6 +51,8 @@ dependencies = [
     "hf_transfer",
     "tenacity",
     "wandb>=0.16.0",
+    "transformers-stream-generator",
+    "tiktoken"
 ]
 
 [tool.setuptools.packages.find]
@@ -77,7 +79,7 @@ math = ["sympy>=1.12", "antlr4-python3-runtime==4.11"]
 sentencepiece = ["sentencepiece>=0.1.98", "protobuf>=4.22.1"]
 gptq = ["auto-gptq[triton] @ git+https://github.com/PanQiWei/AutoGPTQ"]
 anthropic = ["anthropic"]
-openai = ["openai==1.3.9", "tiktoken"]
+openai = ["openai==1.3.9"]
 vllm = ["vllm"]
 ifeval = ["langdetect", "immutabledict"]
 all = ["lmms_eval[dev]", "lmms_eval[testing]", "lmms_eval[linting]"]
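For reference, the scoring pattern introduced in `Qwen_VL.loglikelihood` above can be summarized with a short text-only sketch: tokenize the context and the context+continuation, mask the context positions in `labels` with `-100` so the loss is computed only over the continuation, and check whether greedy decoding reproduces the continuation. This is a minimal illustration, not the actual implementation: `gpt2` stands in for Qwen-VL, and plain-text tokenization replaces the `tokenizer.from_list_format`/`make_context` path used for image-text prompts.

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer


def score_continuation(model, tokenizer, context: str, continuation: str):
    """Return (mean NLL over the continuation, whether greedy decoding matches it)."""
    ctx_ids = tokenizer(context, return_tensors="pt").input_ids
    full_ids = tokenizer(context + continuation, return_tensors="pt").input_ids
    labels = full_ids.clone()
    labels[:, : ctx_ids.shape[1]] = -100  # ignore context tokens in the loss
    with torch.inference_mode():
        out = model(input_ids=full_ids, labels=labels)
    # logits at position i predict token i+1, so shift by one before comparing
    greedy = out.logits.argmax(dim=-1)[:, ctx_ids.shape[1] - 1 : full_ids.shape[1] - 1]
    cont_toks = full_ids[:, ctx_ids.shape[1] :]
    return float(out.loss.item()), bool((greedy == cont_toks).all())


model = AutoModelForCausalLM.from_pretrained("gpt2")
tokenizer = AutoTokenizer.from_pretrained("gpt2")
print(score_continuation(model, tokenizer, "The capital of France is", " Paris"))
```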