Skip to content

Commit

Permalink
update llama triton example (#1153)
Browse files Browse the repository at this point in the history
* update llama triton example

* update start_ids.csv

* fix lint
  • Loading branch information
zhyncs authored Feb 21, 2024
1 parent 24beeb6 commit 685070f
Show file tree
Hide file tree
Showing 6 changed files with 26 additions and 5 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ work_dir*/
*.log
*.out
*.csv
!start_ids.csv
*.pkl

!CMakeLists.txt
Expand Down
7 changes: 7 additions & 0 deletions examples/cpp/llama/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# How to generate start_ids.csv

```bash
# adjust the `model_file` path and the `encode_line` text to match your setup
python3 tokenizer.py --model_file /workdir/llama2_13b_chat/tokenizer.model --encode_line 'LMDeploy is a toolkit for compressing, deploying, and serving LLMs.'
# refer to tokenizer.py for more usage scenarios
```
1 change: 1 addition & 0 deletions examples/cpp/llama/llama_config.ini
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ data_type=fp16
enable_custom_all_reduce=0
pipeline_para_size=1
tensor_para_size=1
; set model_dir to the actual path of your converted model weights
model_dir=/workspace/models/triton_models/weights/


Expand Down
11 changes: 7 additions & 4 deletions examples/cpp/llama/llama_triton_example.cc
Original file line number Diff line number Diff line change
Expand Up @@ -255,7 +255,7 @@ int read_start_ids(size_t batch_size,
std::string file_name);

std::vector<std::shared_ptr<std::unordered_map<std::string, triton::Tensor>>>
prepareRequest(std::string ini_name, const int node_id, const int gpu_count, std::vector<void*>* pointer_record)
prepareRequest(std::string ini_name, const int node_id, const int gpu_count, std::vector<void*>* pointer_record, const std::string& csv_name)
{
INIReader reader = INIReader(ini_name);
if (reader.ParseError() < 0) {
Expand All @@ -279,7 +279,7 @@ prepareRequest(std::string ini_name, const int node_id, const int gpu_count, std
max_input_len,
end_id,
1,
"../examples/cpp/llama/start_ids.csv");
csv_name);
// drop requests > request_batch_size
if (v_start_lengths.size() > request_batch_size) {
v_start_lengths.resize(request_batch_size);
Expand Down Expand Up @@ -363,6 +363,7 @@ int main(int argc, char* argv[])
// Note: Only supports that all nodes have same gpu count
const int gpu_count = ft::getDeviceCount();
const int world_size = node_num * gpu_count;
printf("Recommend to specify the first parameter on the command line as the path to llama_config.ini\n");
std::string ini_name = argc >= 2 ? std::string(argv[1]) : "../examples/cpp/llama/llama_config.ini";

// step 1: Create model
Expand All @@ -372,7 +373,7 @@ int main(int argc, char* argv[])
printf(
"world_size=%d tensor_para_size=%d pipeline_para_size=%d\n", world_size, tensor_para_size, pipeline_para_size);
FT_CHECK_WITH_INFO(world_size == (tensor_para_size * pipeline_para_size),
"World Size != Tensor Parallel Size * Pipeline Parallel Size !");
"World Size != Tensor Parallel Size * Pipeline Parallel Size ! Maybe you can use CUDA_VISIBLE_DEVICES.");

std::cout << model->toString();

Expand Down Expand Up @@ -402,10 +403,12 @@ int main(int argc, char* argv[])
}

// step 4: prepare request
printf("Recommend to specify the second parameter on the command line as the path to start_ids.csv\n");
std::string csv_name = argc >= 3 ? std::string(argv[2]) : "../examples/cpp/llama/start_ids.csv";
std::vector<void*> pointer_record; // Used to prevent the pointers are
// release after leaving functions
std::vector<std::shared_ptr<std::unordered_map<std::string, triton::Tensor>>> request_list =
prepareRequest(ini_name, node_id, gpu_count, &pointer_record);
prepareRequest(ini_name, node_id, gpu_count, &pointer_record, csv_name);
printf("[INFO] request is created \n");

// step 5: Forward
Expand Down
1 change: 1 addition & 0 deletions examples/cpp/llama/start_ids.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
1,365,5773,1022,2376,338,263,5780,7354,363,27122,292,29892,7246,292,29892,322,16330,365,26369,29879,29889
10 changes: 9 additions & 1 deletion examples/cpp/llama/tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,8 @@ def decode(self, t: List[int]):

def main(model_file: str = '/data/llama/model/tokenizer.model',
encode_file: str = None,
decode_file: str = None):
decode_file: str = None,
encode_line: str = None):
tokenizer = Tokenizer(model_file)
if encode_file:
with open(encode_file, 'r') as f:
Expand All @@ -59,6 +60,13 @@ def main(model_file: str = '/data/llama/model/tokenizer.model',
_token_ids = [int(token_id) for token_id in _token_ids]
ys = tokenizer.decode(_token_ids)
print(ys)
elif encode_line:
xs = tokenizer.encode(encode_line)
xs = ','.join(map(str, xs))
print(xs)
output_dir = osp.dirname(osp.abspath(__file__))
with open(osp.join(output_dir, 'start_ids.csv'), 'w') as f:
f.write(xs)
else:
first = True
while True:
Expand Down

0 comments on commit 685070f

Please sign in to comment.