Improve: Extend benchmarks

ashvardanian committed Apr 24, 2024
1 parent 6d5f1ce commit 1f556b8

Showing 3 changed files with 75 additions and 15 deletions.
33 changes: 25 additions & 8 deletions BENCHMARKS.md
@@ -100,6 +100,8 @@ Results for VQAv2 evaluation.
## Speed

### Embedding Models

UForm comes pre-packaged with speed benchmarks for the models.

```bash
@@ -141,14 +143,6 @@ On Nvidia RTX 3090:
| `sentence-transformers/all-MiniLM-L12-v2` | __Yes__ | 3'604 sequences/second | x 2.24 |
| `unum-cloud/uform3-image-text-multilingual-base` | __Yes__ | 6'809 sequences/second | __x 4.22__ |

Given the small size of the model, it also works well on mobile devices.
On Apple M2 Arm chips, the energy efficiency of inference can exceed that of the RTX 3090 GPU and other Ampere-generation cards.

@@ -158,3 +152,26 @@
| Apple M2 Pro unplugged | ~ 19 tokens/second | < 20W | 0.95 tokens/joule |
| Apple M2 Max unplugged | ~ 38 tokens/second | < 36W | 1.06 tokens/joule |
| Apple M2 Max plugged | ~ 56 tokens/second | < 89W | 0.63 tokens/joule |

### Generative Models

```bash
$ python python/scripts/bench_decoders.py --help
usage: bench_decoders.py [-h] [--filter-out FILTER_OUT] [--batch-size BATCH_SIZE]

options:
-h, --help show this help message and exit
--filter-out FILTER_OUT
Filter out models, backends, or devices with a Regular Expression.
--batch-size BATCH_SIZE
Batch size for the benchmark. Batch size 1 measures latency. Large batch sizes may not fit on every GPU.
```
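
For a quick latency check you might, for example, restrict the run to the UForm models only (the regular expression below is purely illustrative):

```bash
# Batch size 1 measures latency; the regex drops the LLaVA and InstructBLIP baselines
python python/scripts/bench_decoders.py --batch-size 1 --filter-out "llava|instructblip"
```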

On Nvidia RTX 3090, the following performance is expected on text token generation using `float16`, equivalent PyTorch settings, and greedy decoding.

| Model | Size | Speed | Speedup |
| :---------------------------------- | ---: | ------------------: | --------: |
| `llava-hf/llava-1.5-7b-hf` | 7B | ~ 40 tokens/second | |
| `Salesforce/instructblip-vicuna-7b` | 7B | ~ 40 tokens/second | |
| `unum-cloud/uform-gen` | 1.5B | ~ 140 tokens/second | __x 3.5__ |

45 changes: 44 additions & 1 deletion python/scripts/bench_decoders.py
@@ -2,6 +2,7 @@
from time import perf_counter
from dataclasses import dataclass
from typing import List
import argparse

import requests
import torch
@@ -11,6 +12,8 @@
InstructBlipForConditionalGeneration,
InstructBlipProcessor,
LlavaForConditionalGeneration,
AutoModel,
AutoProcessor,
)

from uform.torch_decoders import VLMForCausalLM, VLMProcessor
@@ -57,6 +60,7 @@ def caption(model, processor, prompt: str, image: Image.Image) -> str:


def duration(callable):
"""Profile the duration of a callable and return the duration and the result."""
start = perf_counter()
result = callable()
stop = perf_counter()
@@ -86,7 +90,8 @@ def caption_image(image, model=model, processor=processor, prompt=prompt):
print(f"Throughput: {total_length/total_duration:.2f} tokens/s")


def main(filter_out: str = None, batch_size: int = 10):

image_urls = [
"https://images.unsplash.com/photo-1697665666330-7acf230fa830?q=80&w=2787&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D",
"https://images.unsplash.com/photo-1695653422543-7da6d6744364?q=80&w=2940&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDF8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D",
@@ -103,12 +108,30 @@ def caption_image(image, model=model, processor=processor, prompt=prompt):
"a few food containers, with past, corn, olives, and sliced red & green peppers, with a man pouring sous on top of it",
]

print("UForm-Gen2")
bench_captions(
model=AutoModel.from_pretrained(
"unum-cloud/uform-gen2-dpo",
trust_remote_code=True,
torch_dtype=dtype,
low_cpu_mem_usage=low_cpu_mem_usage,
ignore_mismatched_sizes=True,
).to(device),
processor=AutoProcessor.from_pretrained(
"unum-cloud/uform-gen2-dpo",
trust_remote_code=True,
),
prompt="Describe the picture in great detail",
images=images,
)

print("UForm-Gen")
bench_captions(
model=VLMForCausalLM.from_pretrained(
"unum-cloud/uform-gen",
torch_dtype=dtype,
low_cpu_mem_usage=low_cpu_mem_usage,
ignore_mismatched_sizes=True,
).to(device),
processor=VLMProcessor.from_pretrained(
"unum-cloud/uform-gen",
@@ -144,3 +167,23 @@ def caption_image(image, model=model, processor=processor, prompt=prompt):
prompt="Summarize the visual content of the image.",
images=images,
)


if __name__ == "__main__":

parser = argparse.ArgumentParser()
parser.add_argument(
"--filter-out",
type=str,
default=None,
help="Filter out models, backends, or devices with a Regular Expression.",
)
parser.add_argument(
"--batch-size",
type=int,
default=10,
help="Batch size for the benchmark. Batch size 1 measures latency. Large batch sizes may not fit on every GPU.",
)
args = parser.parse_args()

main(filter_out=args.filter_out, batch_size=args.batch_size)
12 changes: 6 additions & 6 deletions python/uform/chat.py
@@ -13,10 +13,10 @@
def parse_args():
parser = ArgumentParser(description="Chat with UForm generative model")

parser.add_argument("--model", type=str, default="unum-cloud/uform-gen-chat")
parser.add_argument("--image", type=str, help="", required=True)
parser.add_argument("--device", type=str, required=True)
parser.add_argument("--fp16", action="store_true")
parser.add_argument("--model", type=str, default="unum-cloud/uform-gen-chat", help="Model name or path")
parser.add_argument("--image", type=str, required=True, help="Path to image or URL")
parser.add_argument("--device", type=str, required=True, help="Device to run on, like `cpu` or `cuda:0`")
parser.add_argument("--fp16", action="store_true", help="Use half-precision math for faster inference")

return parser.parse_args()

@@ -95,16 +95,16 @@ def run_chat(opts, model, processor):
def main():
try:
opts = parse_args()

model = (
VLMForCausalLM.from_pretrained(
opts.model,
torch_dtype=torch.bfloat16 if opts.fp16 else torch.float32,
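# note: when --fp16 is passed, bfloat16 is selected rather than float16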
ignore_mismatched_sizes=True,
)
.eval()
.to(opts.device)
)
processor = VLMProcessor.from_pretrained(opts.model)

run_chat(opts, model, processor)
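
For reference, given the argument descriptions above, a typical invocation of the chat script could look like this; the image path and device identifier are placeholders:

```bash
# Chat about a local image on the first CUDA device, using half precision
python python/uform/chat.py --image example.jpg --device cuda:0 --fp16
```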

