Merge pull request #60 from bigscience-workshop/batch-gen
Support batching for API & Refactor model scripts
StellaAthena authored May 18, 2022
2 parents 0c8b216 + 0d3bfad commit 4a78c75
Showing 14 changed files with 519 additions and 972 deletions.
32 changes: 20 additions & 12 deletions README.md
@@ -1,22 +1,23 @@
# Language Model Evaluation Harness
# Promptsource X Language Model Evaluation Harness

![](https://github.com/EleutherAI/lm-evaluation-harness/workflows/Build/badge.svg)
[![codecov](https://codecov.io/gh/EleutherAI/lm-evaluation-harness/branch/master/graph/badge.svg?token=JSG3O2427J)](https://codecov.io/gh/EleutherAI/lm-evaluation-harness)

## Overview

This project provides a unified framework to test autoregressive language models (GPT-2, GPT-3, GPTNeo, etc) on a large number of different evaluation tasks.
This project provides a unified framework to test language models (GPT-2, GPT-3, GPT-Neo, etc.) and seq2seq models (T5, T0) via prompt-based evaluation.

Features:
As of now, all prompts are provided via `promptsource` and all datasets come from Hugging Face `datasets`; neither is required for new tasks.

- 200+ tasks implemented
- Support for GPT-2, GPT-3, GPT-Neo, GPT-NeoX, and GPT-J, with flexible tokenization-agnostic interface
- Task versioning to ensure reproducibility
This fork is not (currently) backwards compatible with the original evaluation harness.

## Install
## Installation

```bash
pip install lm-eval
git clone https://github.com/bigscience-workshop/lm-evaluation-harness
cd lm-evaluation-harness
pip install "promptsource @ git+https://github.com/bigscience-workshop/promptsource@eval-hackathon"
pip install -e ".[dev]"
```

## Basic Usage
@@ -25,11 +26,18 @@ To evaluate a model, (e.g. GPT-2) on NLU tasks (e.g. LAMBADA, HellaSwag), you ca

```bash
python main.py \
--model gpt2 \
--device cuda:0 \
--tasks lambada,hellaswag
--model hf-causal \
--model_args pretrained=gpt2 \
--tasks mrpc,gsarti/flores_101_afr
```
(This uses the 117M-parameter `gpt2` checkpoint by default, per the HF default; use `--model_args` to specify other GPT-2 sizes.)
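
The other backends registered in this commit are invoked the same way. The commands below are illustrative sketches only: the checkpoint name, the engine name, and the `engine=` argument are assumptions, not documented defaults.

```bash
# Hugging Face seq2seq model (checkpoint name is a placeholder)
python main.py \
    --model hf-seq2seq \
    --model_args pretrained=t5-small \
    --tasks mrpc

# OpenAI Completions API (engine argument is an assumption; an OpenAI
# API key must be set in the environment)
python main.py \
    --model openai \
    --model_args engine=davinci \
    --tasks mrpc
```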

Features:

- Growing number of tasks integrated with `promptsource` (20+).
- Support for Hugging Face causal language models, Hugging Face seq2seq models, and the OpenAI Completions API (GPT-3), with a flexible, tokenization-agnostic interface
- Task versioning to ensure reproducibility

# Original Notes from Eval Harness

Additional arguments can be provided to the model constructor using the `--model_args` flag. Most importantly, the `gpt2` model can be used to load an arbitrary HuggingFace model. For example, to run GPTNeo use the following:
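
The concrete command is elided here; a minimal sketch in the spirit of the upstream harness README (checkpoint name illustrative):

```bash
# Sketch per the upstream harness README; checkpoint name illustrative.
# (In this fork, --model hf-causal with the same --model_args plays
# the equivalent role.)
python main.py \
    --model gpt2 \
    --model_args pretrained=EleutherAI/gpt-neo-2.7B \
    --device cuda:0 \
    --tasks lambada,hellaswag
```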

41 changes: 24 additions & 17 deletions lm_eval/base.py
@@ -342,7 +346,6 @@ def _collate(x):
def greedy_until(self, requests):
# TODO: implement fully general `until` that handles untils that are
# multiple tokens or that span multiple tokens correctly

# TODO: extract to TokenizedLM?
res = []

@@ -351,8 +350,11 @@ def _collate(x):
return len(toks), x[0]

reord = utils.Reorderer(requests, _collate)

for context, request_args in tqdm(reord.get_reordered()):
for chunk in utils.chunks(
tqdm(reord.get_reordered(), disable=False), self.batch_size
):
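# All requests in a chunk are assumed to share the same generation
# arguments, so the contexts are batched together and the arguments
# are read from the chunk's first entry.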
context = [c[0] for c in chunk]
request_args = chunk[0][1]
stopping_criteria = request_args["stopping_criteria"]
max_generation_length = request_args["max_generation_length"]
num_fewshot = request_args["num_fewshot"]
@@ -366,38 +368,43 @@ def _collate(x):
if stopping_criteria is None:
until = [self.eot_token]
else:
until = [stopping_criteria]
until = [stopping_criteria, self.eot_token]
primary_until = self.tok_encode(until[0])

if len(primary_until) == 0:
primary_until = torch.tensor([self.eot_token_id])

context_enc = torch.tensor(
[self.tok_encode(context)[self.max_gen_toks - self.max_length :]]
).to(self.device)
# Ensure that the context does not encroach into the space
# reserved for the generation.
tok_context = self.tok_encode_batch(context)
input_ids = tok_context["input_ids"][
:, self.max_gen_toks - self.max_length :
].to(self.device)
attention_mask = tok_context["attention_mask"][
:, self.max_gen_toks - self.max_length :
].to(self.device)

if max_generation_length is None:
max_length = self.max_gen_toks
else:
max_length = max_generation_length

cont = self._model_generate(
context_enc,
input_ids,
attention_mask,
max_length,
torch.tensor(primary_until),
num_fewshot,
)

s = self.tok_decode(cont.tolist())

for term in until:
s = s.split(term)[0]

# partial caching
self.cache_hook.add_partial("greedy_until", (context, until), s)

res.append(s)
sentences = self.tok_decode(cont.tolist())

for sentence, sentence_context in zip(sentences, context):
for term in until:
sentence = sentence.split(term)[0]
# partial caching, keyed on each request's own context
self.cache_hook.add_partial("greedy_until", (sentence_context, until), sentence)
res.append(sentence)
return reord.get_original(res)


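For reference, the batched loop above relies on a chunking helper. A minimal sketch of the assumed `utils.chunks(iterable, n)` behavior (the actual helper lives in `lm_eval/utils.py` and may differ in detail):

```python
# Minimal sketch of the assumed utils.chunks(iterable, n) helper:
# yields successive lists of at most n items, including a final
# partial batch. The committed implementation may differ in detail.
def chunks(iterable, n):
    batch = []
    for item in iterable:
        batch.append(item)
        if len(batch) == n:
            yield batch
            batch = []
    if batch:
        yield batch
```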
19 changes: 5 additions & 14 deletions lm_eval/models/__init__.py
@@ -1,20 +1,11 @@
from . import gpt2
from . import gptj
from . import gpt3
from . import mgpt
from . import t5
from . import t0
from . import dummy
from . import openai_completions
from . import huggingface

MODEL_REGISTRY = {
"hf": gpt2.HFLM,
"gpt2": gpt2.GPT2LM,
"gptj": gptj.GPTJLM,
"gpt3": gpt3.GPT3LM,
"mgpt": mgpt.MGPTLM,
"t5": t5.T5LM,
"mt5": t5.T5LM,
"t0": t0.T0LM,
"hf-causal": huggingface.AutoCausalLM,
"hf-seq2seq": huggingface.AutoSeq2SeqLM,
"openai": openai_completions.OpenAICompletionsLM,
"dummy": dummy.DummyLM,
}

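The rest of the file is not shown in the diff; presumably the registry is consumed by a small accessor along these lines (a sketch, not the committed code):

```python
# Sketch of an assumed registry accessor (not shown in this diff);
# the upstream harness exposes a helper along these lines.
def get_model(model_name):
    return MODEL_REGISTRY[model_name]
```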
181 changes: 0 additions & 181 deletions lm_eval/models/gpt2.py

This file was deleted.

