From affdcddbf97da24e7f5061f650e3e8878ac652ab Mon Sep 17 00:00:00 2001 From: Ankith Gunapal Date: Thu, 27 Jun 2024 13:16:00 -0700 Subject: [PATCH] BERT with torch.compile (#3201) * Added support for torch.compile with BERT * Added support for torch.compile with BERT * Added support for torch.compile with BERT * Added support for torch.compile with BERT * Added support for torch.compile with BERT * Added support for torch.compile with BERT * Update examples/Huggingface_Transformers/README.md Co-authored-by: Matthias Reso <13337103+mreso@users.noreply.github.com> * Updated based on review comments --------- Co-authored-by: Matthias Reso <13337103+mreso@users.noreply.github.com> --- .../Download_Transformer_models.py | 7 +- examples/Huggingface_Transformers/README.md | 167 ++++++++++-------- .../Transformer_handler_generalized.py | 49 +++-- .../model-config.yaml | 18 ++ .../setup_config.json | 13 -- ts_scripts/spellcheck_conf/wordlist.txt | 2 + 6 files changed, 147 insertions(+), 109 deletions(-) create mode 100644 examples/Huggingface_Transformers/model-config.yaml delete mode 100644 examples/Huggingface_Transformers/setup_config.json diff --git a/examples/Huggingface_Transformers/Download_Transformer_models.py b/examples/Huggingface_Transformers/Download_Transformer_models.py index 1ae3c6fd55..5b434dffc3 100644 --- a/examples/Huggingface_Transformers/Download_Transformer_models.py +++ b/examples/Huggingface_Transformers/Download_Transformer_models.py @@ -1,9 +1,9 @@ -import json import os import sys import torch import transformers +import yaml from transformers import ( AutoConfig, AutoModelForCausalLM, @@ -151,9 +151,10 @@ def transformers_model_dowloader( if len(sys.argv) > 1: filename = os.path.join(dirname, sys.argv[1]) else: - filename = os.path.join(dirname, "setup_config.json") + filename = os.path.join(dirname, "model-config.yaml") f = open(filename) - settings = json.load(f) + model_yaml_config = yaml.safe_load(f) + settings = model_yaml_config["handler"] mode = settings["mode"] model_name = settings["model_name"] num_labels = int(settings["num_labels"]) diff --git a/examples/Huggingface_Transformers/README.md b/examples/Huggingface_Transformers/README.md index 6b659ec737..c93c7eae95 100644 --- a/examples/Huggingface_Transformers/README.md +++ b/examples/Huggingface_Transformers/README.md @@ -18,22 +18,37 @@ To get started [install Torchserve](https://github.com/pytorch/serve) and then ### **Getting Started with the Demo** -If you're finetuning an existing model then you need to save your model and tokenizer with `save_pretrained()` which will create a `pytorch_model.bin`, `vocab.txt` and `config.json` file. Make sure to create them then run +If you're finetuning an existing model then you need to save your model and tokenizer with `save_pretrained()` which will create a `model.safetensors`, `vocab.txt` and `config.json` file. Make sure to create them then run ``` mkdir Transformer_model -mv pytorch_model.bin vocab.txt config.json Transformer_model/ +mv model.safetensors vocab.txt config.json Transformer_model/ ``` -If you'd like to download a pretrained model without fine tuning we've provided a simple helper script which will do the above for you. All you need to do is change [setup.config.json](https://github.com/pytorch/serve/blob/master/examples/Huggingface_Transformers/setup_config.json) to your liking and run +If you'd like to download a pretrained model without fine tuning we've provided a simple helper script which will do the above for you. 
All you need to do is change [model-config.yaml](https://github.com/pytorch/serve/blob/master/examples/Huggingface_Transformers/model-config.yaml) to your liking and run `python Download_Transformer_models.py` +In this example, we are using `torch.compile` by default + +This is enabled by the following config in `model-config.yaml` file + +``` +pt2: + compile: + enable: True + backend: inductor + mode: reduce-overhead +``` +When batch_size is 1, for BERT models, the operations are memory bound. Hence, we make use of `reduce-overhead` mode to make use of CUDAGraph and get better performance. + +To use PyTorch Eager or TorchScript, you can remove the above config. + For Torchscript support, check out [torchscript.md](torchscript.md) -#### Setting the setup_config.json +#### Setting the handler config in model-config.yaml -In the setup_config.json : +In `model-config.yaml` : *model_name* : bert-base-uncased , roberta-base or other available pre-trained models. @@ -55,7 +70,7 @@ In the setup_config.json : *batch_size* : Input batch size when tracing the model for `neuron` or `neuronx` as target hardware. -Once, `setup_config.json` has been set properly, the next step is to run +Once, `model-config.yaml` has been set properly, the next step is to run `python Download_Transformer_models.py` @@ -78,17 +93,17 @@ For examples of how to configure a model for a use case and what the input forma ## Sequence Classification -### Create model archive eager mode +### Create model archive for eager mode or torch.compile ``` -torch-model-archiver --model-name BERTSeqClassification --version 1.0 --serialized-file Transformer_model/pytorch_model.bin --handler ./Transformer_handler_generalized.py --extra-files "Transformer_model/config.json,./setup_config.json,./Seq_classification_artifacts/index_to_name.json" +torch-model-archiver --model-name BERTSeqClassification --version 1.0 --serialized-file Transformer_model/model.safetensors --handler ./Transformer_handler_generalized.py --config-file model-config.yaml --extra-files "Transformer_model/config.json,./Seq_classification_artifacts/index_to_name.json" ``` ### Create model archive Torchscript mode ``` -torch-model-archiver --model-name BERTSeqClassification --version 1.0 --serialized-file Transformer_model/traced_model.pt --handler ./Transformer_handler_generalized.py --extra-files "./setup_config.json,./Seq_classification_artifacts/index_to_name.json" +torch-model-archiver --model-name BERTSeqClassification --version 1.0 --serialized-file Transformer_model/traced_model.pt --handler ./Transformer_handler_generalized.py --config-file model-config.yaml --extra-files "./Seq_classification_artifacts/index_to_name.json" ``` @@ -99,7 +114,7 @@ To register the model on TorchServe using the above model archive file, we run t ``` mkdir model_store mv BERTSeqClassification.mar model_store/ -torchserve --start --model-store model_store --models my_tc=BERTSeqClassification.mar --ncs +torchserve --start --model-store model_store --models my_tc=BERTSeqClassification.mar --disable-token --ncs ``` @@ -113,20 +128,20 @@ To get an explanation: `curl -X POST http://127.0.0.1:8080/explanations/my_tc -T ## Token Classification -Change `setup_config.json` to +Change the `handler` section in `model-config.yaml` to ``` -{ - "model_name":"bert-base-uncased", - "mode":"token_classification", - "do_lower_case":true, - "num_labels":"9", - "save_mode":"pretrained", - "max_length":"150", - "captum_explanation":true, - "FasterTransformer":false, - "embedding_name": "bert" -} +handler: + 
model_name: bert-base-uncased + mode: token_classification + do_lower_case: true + num_labels: 9 + save_mode: pretrained + max_length: 150 + captum_explanation: true + embedding_name: bert + BetterTransformer: false + model_parallel: false ``` ``` @@ -134,14 +149,14 @@ rm -r Transformer_model python Download_Transformer_models.py ``` -### Create model archive eager mode +### Create model archive for eager mode or torch.compile ``` -torch-model-archiver --model-name BERTTokenClassification --version 1.0 --serialized-file Transformer_model/pytorch_model.bin --handler ./Transformer_handler_generalized.py --extra-files "Transformer_model/config.json,./setup_config.json,./Token_classification_artifacts/index_to_name.json" +torch-model-archiver --model-name BERTTokenClassification --version 1.0 --serialized-file Transformer_model/model.safetensors --handler ./Transformer_handler_generalized.py --config-file model-config.yaml --extra-files "Transformer_model/config.json,./Token_classification_artifacts/index_to_name.json" ``` ### Create model archive Torchscript mode ``` -torch-model-archiver --model-name BERTTokenClassification --version 1.0 --serialized-file Transformer_model/traced_model.pt --handler ./Transformer_handler_generalized.py --extra-files "./setup_config.json,./Token_classification_artifacts/index_to_name.json" +torch-model-archiver --model-name BERTTokenClassification --version 1.0 --serialized-file Transformer_model/traced_model.pt --handler ./Transformer_handler_generalized.py --config-file model-config.yaml --extra-files "./Token_classification_artifacts/index_to_name.json" ``` ### Register the model @@ -149,7 +164,7 @@ torch-model-archiver --model-name BERTTokenClassification --version 1.0 --serial ``` mkdir model_store mv BERTTokenClassification.mar model_store -torchserve --start --model-store model_store --models my_tc=BERTTokenClassification.mar --ncs +torchserve --start --model-store model_store --models my_tc=BERTTokenClassification.mar --disable-token --ncs ``` ### Run an inference @@ -158,19 +173,19 @@ To get an explanation: `curl -X POST http://127.0.0.1:8080/explanations/my_tc -T ## Question Answering -Change `setup_config.json` to +Change the `handler` section in `model-config.yaml` to ``` -{ - "model_name":"distilbert-base-cased-distilled-squad", - "mode":"question_answering", - "do_lower_case":true, - "num_labels":"0", - "save_mode":"pretrained", - "max_length":"128", - "captum_explanation":true, - "FasterTransformer":false, - "embedding_name": "distilbert" -} +handler: + model_name: distilbert-base-cased-distilled-squad + mode: question_answering + do_lower_case: true + num_labels: 0 + save_mode: pretrained + max_length: 150 + captum_explanation: true + embedding_name: distilbert + BetterTransformer: false + model_parallel: false ``` ``` @@ -178,14 +193,14 @@ rm -r Transformer_model python Download_Transformer_models.py ``` -### Create model archive eager mode +### Create model archive for eager mode or torch.compile ``` -torch-model-archiver --model-name BERTQA --version 1.0 --serialized-file Transformer_model/pytorch_model.bin --handler ./Transformer_handler_generalized.py --extra-files "Transformer_model/config.json,./setup_config.json" +torch-model-archiver --model-name BERTQA --version 1.0 --serialized-file Transformer_model/model.safetensors --handler ./Transformer_handler_generalized.py --config-file model-config.yaml --extra-files "Transformer_model/config.json" ``` ### Create model archive Torchscript mode ``` -torch-model-archiver --model-name BERTQA 
--version 1.0 --serialized-file Transformer_model/traced_model.pt --handler ./Transformer_handler_generalized.py --extra-files "./setup_config.json" +torch-model-archiver --model-name BERTQA --version 1.0 --serialized-file Transformer_model/traced_model.pt --handler ./Transformer_handler_generalized.py --config-file model-config.yaml ``` ### Register the model @@ -193,7 +208,7 @@ torch-model-archiver --model-name BERTQA --version 1.0 --serialized-file Transfo ``` mkdir model_store mv BERTQA.mar model_store -torchserve --start --model-store model_store --models my_tc=BERTQA.mar --ncs +torchserve --start --model-store model_store --models my_tc=BERTQA.mar --disable-token --ncs ``` ### Run an inference To run an inference: `curl -X POST http://127.0.0.1:8080/predictions/my_tc -T QA_artifacts/sample_text_captum_input.txt` @@ -201,20 +216,19 @@ To get an explanation: `curl -X POST http://127.0.0.1:8080/explanations/my_tc -T ## Text Generation -Change `setup_config.json` to - +Change the `handler` section in `model-config.yaml` to ``` -{ - "model_name":"gpt2", - "mode":"text_generation", - "do_lower_case":true, - "num_labels":"0", - "save_mode":"pretrained", - "max_length":"150", - "captum_explanation":true, - "FasterTransformer":false, - "embedding_name": "gpt2" -} +handler: + model_name: gpt2 + mode: text_generation + do_lower_case: true + num_labels: 0 + save_mode: pretrained + max_length: 150 + captum_explanation: true + embedding_name: gpt2 + BetterTransformer: false + model_parallel: false ``` ``` @@ -225,13 +239,13 @@ python Download_Transformer_models.py ### Create model archive eager mode ``` -torch-model-archiver --model-name Textgeneration --version 1.0 --serialized-file Transformer_model/pytorch_model.bin --handler ./Transformer_handler_generalized.py --extra-files "Transformer_model/config.json,./setup_config.json" +torch-model-archiver --model-name Textgeneration --version 1.0 --serialized-file Transformer_model/model.safetensors --handler ./Transformer_handler_generalized.py --config-file model-config.yaml --extra-files "Transformer_model/config.json" ``` ### Create model archive Torchscript mode ``` -torch-model-archiver --model-name Textgeneration --version 1.0 --serialized-file Transformer_model/traced_model.pt --handler ./Transformer_handler_generalized.py --extra-files "./setup_config.json" +torch-model-archiver --model-name Textgeneration --version 1.0 --serialized-file Transformer_model/traced_model.pt --handler ./Transformer_handler_generalized.py --config-file model-config.yaml ``` ### Register the model @@ -241,7 +255,7 @@ To register the model on TorchServe using the above model archive file, we run t ``` mkdir model_store mv Textgeneration.mar model_store/ -torchserve --start --model-store model_store --models my_tc=Textgeneration.mar --ncs +torchserve --start --model-store model_store --models my_tc=Textgeneration.mar --disable-token --ncs ``` ### Run an inference @@ -258,7 +272,7 @@ For batch inference the main difference is that you need set the batch size whil ``` mkdir model_store mv BERTSeqClassification.mar model_store/ - torchserve --start --model-store model_store --ncs + torchserve --start --model-store model_store --disable-token --ncs curl -X POST "localhost:8081/models?model_name=BERTSeqClassification&url=BERTSeqClassification.mar&batch_size=4&max_batch_delay=5000&initial_workers=3&synchronous=true" ``` @@ -316,36 +330,35 @@ When a json file is passed as a request format to the curl, Torchserve unwraps t ## Speed up inference with Better Transformer (Flash 
Attentions/ Xformer Memory Efficient kernels) -In the setup_config.json, specify `"BetterTransformer":true,`. +In the `model-config.yaml`, specify `"BetterTransformer":true,`. -[Better Transformer(Accelerated Transformer)](https://pytorch.org/blog/a-better-transformer-for-fast-transformer-encoder-inference/) from PyTorch is integrated into [Huggingface Optimum](https://huggingface.co/docs/optimum/bettertransformer/overview) that bring major speedups for many of encoder models on different modalities (text, image, audio). It is a one liner API that we have also added in the `Transformer_handler_generalized.py` in this example as well. That as shown above you just need to set `"BetterTransformer":true,` in the setup_config.json. +[Better Transformer(Accelerated Transformer)](https://pytorch.org/blog/a-better-transformer-for-fast-transformer-encoder-inference/) from PyTorch is integrated into [Huggingface Optimum](https://huggingface.co/docs/optimum/bettertransformer/overview) that bring major speedups for many of encoder models on different modalities (text, image, audio). It is a one liner API that we have also added in the `Transformer_handler_generalized.py` in this example as well. That as shown above you just need to set `"BetterTransformer":true,` in the `model-config.yaml`. Main speed ups in the Better Transformer comes from kernel fusion in the [TransformerEncoder] (https://pytorch.org/docs/stable/generated/torch.nn.TransformerEncoder.html) and making use of sparsity with [nested tensors](https://pytorch.org/tutorials/prototype/nestedtensor.html) when input sequences are padded to avoid unnecessary computation on padded tensors. We have seen up to 4.5x speed up with distill_bert when used higher batch sizes with padding. Please read more about it in this [blog post](https://medium.com/pytorch/bettertransformer-out-of-the-box-performance-for-huggingface-transformers-3fbe27d50ab2). You get some speedups even with Batch size = 1 and no padding however, major speed ups will show up when running inference with higher batch sizes (8.16,32) with padding. -The Accelerated Transformer integration with HuggingFace also added the support for decoder models, please read more about it [here](https://pytorch.org/blog/out-of-the-box-acceleration/). This adds the native support for Flash Attentions and Xformer Memory Efficient kernels in PyTorch and make it available on HuggingFace deocder models. This will brings significant speed up and memory savings with just one line of the code as before. +The Accelerated Transformer integration with HuggingFace also added the support for decoder models, please read more about it [here](https://pytorch.org/blog/out-of-the-box-acceleration/). This adds the native support for Flash Attentions and Xformer Memory Efficient kernels in PyTorch and make it available on HuggingFace decoder models. This will brings significant speed up and memory savings with just one line of the code as before. ## Model Parallelism [Parallelize] (https://huggingface.co/docs/transformers/model_doc/gpt2#transformers.GPT2Model.parallelize) is a an experimental feature that HuggingFace recently added to support large model inference for some very large models, GPT2 and T5. GPT2 model choices based on their size are gpt2-medium, gpt2-large, gpt2-xl. This feature only supports LMHeadModel that could be used for text generation, other application such as sequence, token classification and question answering are not supported. 
We have added parallelize support for GPT2 model in the custom handler in this example that will enable you to perform model parallel inference for GPT2 models used for text generation. The same logic in the handler can be extended to T5 and the applications it supports. Make sure that you register your model with one worker using this feature. To run this example, a machine with #gpus > 1 is required. The number of required gpus depends on the size of the model. This feature only supports single node, one machine with multi-gpus. -Change `setup_config.json` to - +Change the `handler` section in `model-config.yaml` to ``` -{ - "model_name":"gpt2", - "mode":"text_generation", - "do_lower_case":true, - "num_labels":"0", - "save_mode":"pretrained", - "max_length":"150", - "captum_explanation":true, - "embedding_name": "gpt2", - "FasterTransformer":false, - "model_parallel":true -} +handler: + model_name: gpt2 + mode: text_generation + do_lower_case: true + num_labels: 0 + save_mode: pretrained + max_length: 150 + captum_explanation: true + embedding_name: gpt2 + BetterTransformer: false + model_parallel: true ``` + ``` rm -r Transformer_model python Download_Transformer_models.py @@ -364,7 +377,7 @@ To register the model on TorchServe using the above model archive file, we run t ``` mkdir model_store mv Textgeneration.mar model_store/ -torchserve --start --model-store model_store +torchserve --start --model-store model_store --disable-token curl -X POST "localhost:8081/models?model_name=Textgeneration&url=Textgeneration.mar&batch_size=1&max_batch_delay=5000&initial_workers=1&synchronous=true" ``` diff --git a/examples/Huggingface_Transformers/Transformer_handler_generalized.py b/examples/Huggingface_Transformers/Transformer_handler_generalized.py index 5865bf1123..d8c3c11782 100644 --- a/examples/Huggingface_Transformers/Transformer_handler_generalized.py +++ b/examples/Huggingface_Transformers/Transformer_handler_generalized.py @@ -2,7 +2,6 @@ import json import logging import os -from abc import ABC import torch import transformers @@ -22,13 +21,14 @@ logger.info("Transformers version %s", transformers.__version__) -class TransformersSeqClassifierHandler(BaseHandler, ABC): +class TransformersSeqClassifierHandler(BaseHandler): """ Transformers handler class for sequence, token classification and question answering. """ def __init__(self): super(TransformersSeqClassifierHandler, self).__init__() + self.setup_config = None self.initialized = False def initialize(self, ctx): @@ -40,6 +40,11 @@ def initialize(self, ctx): pertaining to the model artifacts parameters. """ self.manifest = ctx.manifest + self.model_yaml_config = ( + ctx.model_yaml_config + if ctx is not None and hasattr(ctx, "model_yaml_config") + else {} + ) properties = ctx.system_properties model_dir = properties.get("model_dir") serialized_file = self.manifest["model"]["serializedFile"] @@ -50,20 +55,12 @@ def initialize(self, ctx): if torch.cuda.is_available() and properties.get("gpu_id") is not None else "cpu" ) - # read configs for the mode, model_name, etc. 
from setup_config.json - setup_config_path = os.path.join(model_dir, "setup_config.json") - if os.path.isfile(setup_config_path): - with open(setup_config_path) as setup_config_file: - self.setup_config = json.load(setup_config_file) - else: - logger.warning("Missing the setup_config.json file.") - # Loading the shared object of compiled Faster Transformer Library if Faster Transformer is set - if self.setup_config["FasterTransformer"]: - faster_transformer_complied_path = os.path.join( - model_dir, "libpyt_fastertransformer.so" - ) - torch.classes.load_library(faster_transformer_complied_path) + # read configs for the mode, model_name, etc. from the handler config + self.setup_config = self.model_yaml_config.get("handler", {}) + if not self.setup_config: + logger.warning("Missing the handler config") + # Loading the model and tokenizer from checkpoint and config files based on the user's choice of mode # further setup config can be added. if self.setup_config["save_mode"] == "torchscript": @@ -125,6 +122,21 @@ def initialize(self, ctx): ) self.model.eval() + + pt2_value = self.model_yaml_config.get("pt2", {}) + if "compile" in pt2_value: + compile_options = pt2_value["compile"] + if compile_options["enable"] == True: + del compile_options["enable"] + + compile_options_str = ", ".join( + [f"{k} {v}" for k, v in compile_options.items()] + ) + self.model = torch.compile( + self.model, + **compile_options, + ) + logger.info(f"Compiled model with {compile_options_str}") logger.info("Transformer model from path %s loaded successfully", model_dir) # Read the mapping file, index to object name @@ -216,6 +228,7 @@ def preprocess(self, requests): ) return (input_ids_batch, attention_mask_batch) + @torch.inference_mode def inference(self, input_batch): """Predict the class (or classes) of the received text using the serialized transformers checkpoint. 
@@ -316,7 +329,11 @@ def inference(self, input_batch): # https://github.com/huggingface/transformers/blob/v4.17.0/src/transformers/models/gpt2/modeling_gpt2.py#L970 input_ids_batch = input_ids_batch.to("cuda:0") outputs = self.model.generate( - input_ids_batch, max_length=50, do_sample=True, top_p=0.95, top_k=60 + input_ids_batch, + max_new_tokens=self.setup_config["max_length"], + do_sample=True, + top_p=0.95, + top_k=60, ) for i, x in enumerate(outputs): inferences.append( diff --git a/examples/Huggingface_Transformers/model-config.yaml b/examples/Huggingface_Transformers/model-config.yaml new file mode 100644 index 0000000000..4001ca55f0 --- /dev/null +++ b/examples/Huggingface_Transformers/model-config.yaml @@ -0,0 +1,18 @@ +minWorkers: 1 +maxWorkers: 1 +handler: + model_name: bert-base-uncased + mode: sequence_classification + do_lower_case: true + num_labels: 2 + save_mode: pretrained + max_length: 150 + captum_explanation: true + embedding_name: bert + BetterTransformer: false + model_parallel: false +pt2: + compile: + enable: True + backend: inductor + mode: reduce-overhead diff --git a/examples/Huggingface_Transformers/setup_config.json b/examples/Huggingface_Transformers/setup_config.json deleted file mode 100644 index 44053165df..0000000000 --- a/examples/Huggingface_Transformers/setup_config.json +++ /dev/null @@ -1,13 +0,0 @@ -{ - "model_name":"bert-base-uncased", - "mode":"sequence_classification", - "do_lower_case":true, - "num_labels":"2", - "save_mode":"pretrained", - "max_length":"150", - "captum_explanation":true, - "embedding_name": "bert", - "FasterTransformer":false, - "BetterTransformer":false, - "model_parallel":false -} diff --git a/ts_scripts/spellcheck_conf/wordlist.txt b/ts_scripts/spellcheck_conf/wordlist.txt index 43446f168f..77b46c9669 100644 --- a/ts_scripts/spellcheck_conf/wordlist.txt +++ b/ts_scripts/spellcheck_conf/wordlist.txt @@ -1250,6 +1250,8 @@ smoothquant woq TokenAuthorizationHandler TorchText +safetensors +CUDAGraph parallelLevel parallelType parallelization
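For a quick, standalone illustration of the config flow this patch introduces, the sketch below loads `model-config.yaml`, reads the `handler` section for the model settings and the `pt2.compile` section for the `torch.compile` options, and applies them the same way the updated handler does. This is a hedged sketch rather than part of the example itself: it assumes `torch`, `transformers`, and `pyyaml` are installed, expects `model-config.yaml` in the working directory, and the script name and sample sentence are invented for illustration.

```
# compile_config_check.py -- illustrative sketch only, not part of the patch.
import torch
import yaml
from transformers import AutoModelForSequenceClassification, AutoTokenizer

with open("model-config.yaml") as f:
    model_yaml_config = yaml.safe_load(f)

# The `handler` section replaces the old setup_config.json.
handler_cfg = model_yaml_config.get("handler", {})
model_name = handler_cfg["model_name"]        # e.g. bert-base-uncased
num_labels = int(handler_cfg["num_labels"])   # e.g. 2 for sequence_classification

model = AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=num_labels
)
tokenizer = AutoTokenizer.from_pretrained(
    model_name, do_lower_case=handler_cfg.get("do_lower_case", True)
)
model.eval()

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# Mirror the handler's pt2 handling: drop `enable` and pass the remaining
# options (backend: inductor, mode: reduce-overhead) to torch.compile.
compile_options = dict(model_yaml_config.get("pt2", {}).get("compile", {}))
if compile_options.pop("enable", False):
    model = torch.compile(model, **compile_options)

with torch.inference_mode():
    inputs = tokenizer(
        "Bloomberg has decided to publish a new report on the global economy.",
        return_tensors="pt",
        max_length=int(handler_cfg.get("max_length", 150)),
        truncation=True,
    )
    inputs = {k: v.to(device) for k, v in inputs.items()}
    # The first call is slow because inductor compiles the graph;
    # repeated calls with the same input shapes reuse the compiled artifact.
    logits = model(**inputs).logits
    print("Predicted label index:", logits.argmax(dim=-1).item())
```

As the README change notes, `reduce-overhead` mode targets the memory-bound batch-size-1 BERT case by leveraging CUDA graphs; on a CPU-only machine inductor compilation still applies, but the CUDA-graph benefit does not.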