
Commit

feat: update v1.3 codes
byshiue committed Dec 2, 2022
1 parent 225b578 commit 36bf558
Showing 64 changed files with 8,577 additions and 580 deletions.
4 changes: 2 additions & 2 deletions CMakeLists.txt
@@ -54,7 +54,6 @@ set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake/Modules)
# Python.h needed by torch headers.
find_package(Python3 REQUIRED COMPONENTS Development)

-find_package(FasterTransformer)
find_package(CUDA 10.1 REQUIRED)
if (BUILD_MULTI_GPU)
message(STATUS "Enable BUILD_MULTI_GPU.")
@@ -111,7 +110,7 @@ else()
FetchContent_Declare(
repo-ft
GIT_REPOSITORY https://github.com/NVIDIA/FasterTransformer.git
-GIT_TAG main
+GIT_TAG v5.2
GIT_SHALLOW ON
)
endif()
@@ -159,6 +158,7 @@ target_include_directories(
${TRITON_PYTORCH_INCLUDE_PATHS}
${Python3_INCLUDE_DIRS}
${repo-ft_SOURCE_DIR}
+${repo-ft_SOURCE_DIR}/3rdparty/cutlass/include
${repo-core_SOURCE_DIR}/include
)

19 changes: 15 additions & 4 deletions README.md
@@ -55,9 +55,10 @@ Note that this is a research and prototyping tool, not a formal product or maint

| Models | FP16 | BF16 | Tensor parallel | Pipeline parallel |
| -------- | ---- | ---- | --------------- | ----------------- |
-| GPT | Yes | Yes | Yes | Yes |
+| GPT/OPT | Yes | Yes | Yes | Yes |
+| BLOOM | Yes | Yes | Yes | Yes |
| GPT-J | Yes | Yes | Yes | Yes |
-| T5 | Yes | Yes | Yes | Yes |
+| T5/UL2 | Yes | Yes | Yes | Yes |
| GPT-NeoX | Yes | Yes | Yes | Yes |
| BERT | Yes | Yes | Yes | Yes |

@@ -136,6 +137,7 @@ But also you can build it manually in interactive session (ex during fixing code

```bash
docker run -it \
+--shm-size=1g --ulimit memlock=-1 \
-v ${WORKSPACE}:/workspace \
--name ft_backend_builder \
${TRITON_DOCKER_IMAGE} bash
@@ -242,8 +244,16 @@ Specifically `tools/issue_request.py` is a simple script that sends a request co

## Changelog

+Oct 2022
+- Support IA3 in T5 and T5-Encoder
+
+Sep 2022
+- Support T5-Encoder only backend
+- Support T5 prompt tuning and p-tuning
+- Support factual-nucleus sampling ([link](https://arxiv.org/pdf/2206.04624.pdf))
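For orientation, a minimal sketch of the factual-nucleus idea from the linked paper, as I read it: the nucleus threshold p decays within a sentence and resets at each sentence boundary, with a lower bound so sampling never becomes fully greedy. Function names and the default values below are illustrative, not taken from the FasterTransformer implementation.

```python
def factual_nucleus_p(step_in_sentence, p=0.9, decay=0.9, floor=0.3):
    # Nucleus p decays multiplicatively within a sentence and is
    # clamped at a floor; the caller resets step_in_sentence to 0 at
    # each sentence boundary. All defaults are illustrative.
    return max(floor, p * decay ** step_in_sentence)

def top_p_filter(probs, p):
    # Standard nucleus filtering: keep the smallest high-probability
    # prefix whose cumulative mass reaches p, then renormalize.
    order = sorted(range(len(probs)), key=lambda i: -probs[i])
    kept, cum = [], 0.0
    for i in order:
        kept.append(i)
        cum += probs[i]
        if cum >= p:
            break
    mass = sum(probs[i] for i in kept)
    return {i: probs[i] / mass for i in kept}
```

Combining the two, a sampler would call `factual_nucleus_p` each step and feed the result into `top_p_filter`, so early tokens of a sentence sample broadly while later tokens are drawn from a tighter nucleus.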

Aug 2022
-- Release **FasterTransformer backend v1.2**
+- **Release the FasterTransformer backend 1.2**.
- Support for interactive generation

July 2022
@@ -260,6 +270,7 @@ May 2022
- Support optional input. (triton version must be after 22.05)

April 2022
+- **Release the FasterTransformer backend 1.1**.
- Support bfloat16 inference in GPT model.
- Support Nemo Megatron T5 and Megatron-LM T5 model.
- Support optional input in fastertransformer backends. (Only supported after Triton 22.01)
@@ -278,4 +289,4 @@ Sep 2021

Apr 2021
- **Release the FasterTransformer backend 1.0**.
-- Support Multi-GPU on GPT.
\ No newline at end of file
+- Support Multi-GPU on GPT.
4 changes: 1 addition & 3 deletions all_models/bert/fastertransformer/config.pbtxt
@@ -32,7 +32,6 @@ max_batch_size: 1024
input [
{
name: "input_hidden_state"
-data_type: TYPE_FP16
dims: [ -1, -1 ]
},
{
@@ -45,7 +44,6 @@ input [
output [
{
name: "output_hidden_state"
-data_type: TYPE_FP16
dims: [ -1, -1 ]
}
]
@@ -88,7 +86,7 @@ parameters {
parameters {
key: "model_checkpoint_path"
value: {
-string_value: "../all_models/bert/fastertransformer/1/2-gpu/"
+string_value: "all_models/bert/fastertransformer/1/2-gpu/"
}
}
parameters {
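As a client-side illustration of the BERT config above: its `dims: [ -1, -1 ]` leave both axes of `input_hidden_state` dynamic, and `max_batch_size: 1024` means Triton prepends a batch axis. A hedged numpy sketch of preparing such an input follows; the sequence length and hidden size are assumptions for the example, not values from this repository.

```python
import numpy as np

# dims: [ -1, -1 ] in the config leave both axes dynamic;
# the 32 x 768 shape here is purely illustrative.
seq_len, hidden_size = 32, 768
input_hidden_state = np.random.randn(seq_len, hidden_size).astype(np.float16)

# With max_batch_size: 1024, Triton prepends a batch axis, so a
# request payload carries shape [batch, seq_len, hidden_size].
batched = input_hidden_state[np.newaxis, ...]
```

A real request would hand `batched` to a Triton client as the `input_hidden_state` tensor and read back `output_hidden_state` with the same layout.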
