[ci] Update correctness testing

deepjavalibrary · Jul 23, 2024 · 277f5b0 · 277f5b0
1 parent f67f678
commit 277f5b0
Show file tree

Hide file tree

Showing 4 changed files with 73 additions and 54 deletions.
diff --git a/.github/workflows/correctness.yml b/.github/workflows/correctness.yml
@@ -95,7 +95,7 @@ jobs:
         env:
           TEST_DJL_VERSION: ${{ inputs.djl-version }}
         run: |
-          python -m pytest --capture=tee-sys -vv -k ${{ matrix.test.test }} tests.py
+          python -m pytest -k ${{ matrix.test.test }} tests.py
       - name: Cleanup
         working-directory: tests/integration
         run: |

diff --git a/tests/integration/llm/client.py b/tests/integration/llm/client.py
@@ -700,44 +700,59 @@ def get_model_name():
 }
 
 correctness_model_spec = {
-    "trtllm-codellama-13b": {
-        "batch_size": [32],
-        "seq_length": [256],
-        "tokenizer": "codellama/CodeLlama-13b-hf",
+    "trtllm-codestral-22b": {
+        "batch_size": [41],
+        "seq_length": [512],
+        "num_run": 4,
+        "tokenizer": "bullerwins/Codestral-22B-v0.1-hf",
         "dataset": "humaneval",
-        "score": 0.25
+        "score": 0.5,
+        "parameters": {
+            "return_full_text": True
+        }
     },
-    "lmi-dist-codellama-13b": {
-        "batch_size": [32],
-        "seq_length": [256],
-        "tokenizer": "codellama/CodeLlama-13b-hf",
+    "lmi-dist-codestral-22b": {
+        "batch_size": [41],
+        "seq_length": [512],
+        "num_run": 4,
+        "tokenizer": "bullerwins/Codestral-22B-v0.1-hf",
         "dataset": "humaneval",
-        "score": 0.25
+        "score": 0.5,
+        "parameters": {
+            "return_full_text": True
+        }
     },
-    "neuronx-codellama-13b": {
-        "batch_size": [32],
-        "seq_length": [256],
-        "tokenizer": "codellama/CodeLlama-13b-hf",
+    "neuronx-codestral-22b": {
+        "batch_size": [41],
+        "seq_length": [512],
+        "num_run": 4,
+        "tokenizer": "bullerwins/Codestral-22B-v0.1-hf",
         "dataset": "humaneval",
-        "score": 0.25
+        "score": 0.5,
+        "parameters": {
+            "return_full_text": True
+        }
     },
     "trtllm-llama3-1-8b": {
-        "batch_size": [32],
+        "batch_size": [66],
         "seq_length": [1],
+        "num_run": 4,
         "tokenizer": "TheBloke/Llama-2-7B-fp16",
         "dataset": "mmlu",
         "score": 0.6
     },
     "lmi-dist-llama3-1-8b": {
-        "batch_size": [32],
+        "batch_size": [66],
         "seq_length": [1],
+        "num_run": 213,
         "tokenizer": "TheBloke/Llama-2-7B-fp16",
         "dataset": "mmlu",
         "score": 0.6
     },
     "neuronx-llama3-1-8b": {
-        "batch_size": [32],
+        "batch_size": [66],
         "seq_length": [1],
+        "num_run": 213,
         "tokenizer": "TheBloke/Llama-2-7B-fp16",
         "dataset": "mmlu",
         "score": 0.6
@@ -801,13 +816,16 @@ def validate_correctness(type, tasks, expected):
         with ThreadPoolExecutor(max_workers=N_WORKERS) as executor:
             futures = []
             for i, out in enumerate(outputs):
+                logging.info(f"!!!out {out['generated_text']}")
                 task = tasks[inputs[i]]
+                logging.info(f"!!!task {task}")
                 completion = out.get('generated_text', '')
                 args = (task, completion, TIMEOUT)
                 future = executor.submit(check_correctness, *args)
                 futures.append(future)
             for future in as_completed(futures):
                 result = future.result()
+                logging.info(f"!!!result: {result}")
                 if result['passed']:
                     total_pass += 1
     elif type == "mmlu":
@@ -1044,12 +1062,14 @@ def t5_batch_generation(batch_size):
     return input_sentences[:batch_size]
 
 
-def load_dataset(dataset, key):
+def load_dataset(dataset):
     res = {}
     if dataset == "humaneval":
         url = "https://raw.githubusercontent.com/ymwangg/vllm-test/main/dataset/humaneval.jsonl"
+        key = "prompt"
     elif dataset == "mmlu":
         url = "https://djl-ai.s3.amazonaws.com/resources/benchmark/datasets/mmlu_djlserving.jsonl"
+        key = "inputs"
     else:
         raise ValueError(f"Unsupported dataset: {dataset}")
 
@@ -1377,17 +1397,14 @@ def test_transformers_neuronx_handler(model, model_spec):
 def test_correctness(model, model_spec):
     if model not in model_spec:
         raise ValueError(
-            f"{args.model} is not one of the supporting models {list(model_spec.keys())}"
+            f"{model} is not one of the supporting models {list(model_spec.keys())}"
         )
-    spec = model_spec[args.model]
-    score = int(spec.get("score", 0.4))
-    dataset = spec.get("dataset", "humaneval")
-    parameters = {}
-    if dataset == "humaneval":
-        data = load_dataset(dataset, "prompt")
-        parameters["return_full_text"] = True
-    elif dataset == "mmlu":
-        data = load_dataset(dataset, "inputs")
+    spec = model_spec[model]
+    score = float(spec.get("score", 0.3))
+    parameters = spec.get("parameters", {})
+    num_run = int(spec.get("num_run", 5))
+    dataset = spec.get("dataset", "mmlu")
+    data = load_dataset(dataset)
 
     for i, batch_size in enumerate(spec["batch_size"]):
         for seq_length in spec["seq_length"]:
@@ -1402,6 +1419,7 @@ def test_correctness(model, model_spec):
             awscurl_run(reqs,
                         spec.get("tokenizer", None),
                         batch_size,
+                        num_run=num_run,
                         dataset=True,
                         output=True)
             validate_correctness(dataset, data, score)

diff --git a/tests/integration/llm/prepare.py b/tests/integration/llm/prepare.py
@@ -898,30 +898,30 @@
 }
 
 correctness_model_list = {
-    "trtllm-codellama-13b": {
+    "trtllm-codestral-22b": {
         "engine": "Python",
         "option.task": "text-generation",
-        "option.model_id": "codellama/CodeLlama-13b-hf",
+        "option.model_id": "bullerwins/Codestral-22B-v0.1-hf",
         "option.rolling_batch": "trtllm",
         "option.tensor_parallel_degree": 4,
-        "option.max_rolling_batch_size": 32,
+        "option.max_rolling_batch_size": 41,
         "option.output_formatter": "json"
     },
-    "lmi-dist-codellama-13b": {
+    "lmi-dist-codestral-22b": {
         "engine": "MPI",
         "option.task": "text-generation",
-        "option.model_id": "codellama/CodeLlama-13b-hf",
+        "option.model_id": "bullerwins/Codestral-22B-v0.1-hf",
         "option.rolling_batch": "lmi-dist",
         "option.tensor_parallel_degree": 4,
-        "option.max_rolling_batch_size": 32,
+        "option.max_rolling_batch_size": 41,
         "option.output_formatter": "json"
     },
-    "neuronx-codellama-13b": {
-        "option.model_id": "codellama/CodeLlama-13b-hf",
+    "neuronx-codestral-22b": {
+        "option.model_id": "bullerwins/Codestral-22B-v0.1-hf",
         "option.tensor_parallel_degree": 4,
         "option.n_positions": 1024,
         "option.rolling_batch": "auto",
-        "option.max_rolling_batch_size": 32,
+        "option.max_rolling_batch_size": 41,
         "option.output_formatter": "json"
     },
     "trtllm-llama3-1-8b": {
@@ -930,7 +930,7 @@
         "option.model_id": "s3://djl-llm/llama-3.1-8b-hf/",
         "option.rolling_batch": "trtllm",
         "option.tensor_parallel_degree": 4,
-        "option.max_rolling_batch_size": 32,
+        "option.max_rolling_batch_size": 66,
         "option.output_formatter": "json"
     },
     "lmi-dist-llama3-1-8b": {
@@ -939,15 +939,16 @@
         "option.model_id": "s3://djl-llm/llama-3.1-8b-hf/",
         "option.rolling_batch": "lmi-dist",
         "option.tensor_parallel_degree": 4,
-        "option.max_rolling_batch_size": 32,
+        "option.max_rolling_batch_size": 66,
         "option.output_formatter": "json"
     },
     "neuronx-llama3-1-8b": {
         "option.model_id": "s3://djl-llm/llama-3.1-8b-hf/",
         "option.tensor_parallel_degree": 4,
         "option.n_positions": 1024,
         "option.rolling_batch": "auto",
-        "option.max_rolling_batch_size": 32
+        "option.max_rolling_batch_size": 66,
+        "option.output_formatter": "json"
     }
 }
 

diff --git a/tests/integration/tests.py b/tests/integration/tests.py
@@ -837,11 +837,11 @@ def test_llama_speculative_compiled(self):
 @pytest.mark.gpu_4
 class TestCorrectnessTrtLlm:
 
-    def test_codellama_13b(self):
-        with Runner('tensorrt-llm', 'codellama-13b') as r:
-            prepare.build_correctness_model("trtllm-codellama-13b")
+    def test_codestral_22b(self):
+        with Runner('tensorrt-llm', 'codestral-22b') as r:
+            prepare.build_correctness_model("trtllm-codestral-22b")
             r.launch("CUDA_VISIBLE_DEVICES=0,1,2,3")
-            client.run("correctness trtllm-codellama-13b".split())
+            client.run("correctness trtllm-codestral-22b".split())
 
     def test_llama3_1_8b(self):
         with Runner('tensorrt-llm', 'llama3-1-8b') as r:
@@ -855,11 +855,11 @@ def test_llama3_1_8b(self):
 @pytest.mark.gpu_4
 class TestCorrectnessLmiDist:
 
-    def test_codellama_13b(self):
-        with Runner('lmi', 'codellama-13b') as r:
-            prepare.build_correctness_model("lmi-dist-codellama-13b")
+    def test_codestral_22b(self):
+        with Runner('lmi', 'codestral-22b') as r:
+            prepare.build_correctness_model("lmi-dist-codestral-22b")
             r.launch()
-            client.run("correctness lmi-dist-codellama-13b".split())
+            client.run("correctness lmi-dist-codestral-22b".split())
 
     def test_llama3_1_8b(self):
         with Runner('lmi', 'llama3-1-8b') as r:
@@ -872,11 +872,11 @@ def test_llama3_1_8b(self):
 @pytest.mark.inf
 class TestCorrectnessNeuronx:
 
-    def test_codellama_13b(self):
-        with Runner('pytorch-inf2', 'codellama-13b') as r:
-            prepare.build_correctness_model("neuronx-codellama-13b")
+    def test_codestral_22b(self):
+        with Runner('pytorch-inf2', 'codestral-22b') as r:
+            prepare.build_correctness_model("neuronx-codestral-22b")
             r.launch()
-            client.run("correctness neuronx-codellama-13b".split())
+            client.run("correctness neuronx-codestral-22b".split())
 
     def test_llama3_1_8b(self):
         with Runner('pytorch-inf2', 'llama3-1-8b') as r: