diff --git a/.github/workflows/correctness.yml b/.github/workflows/correctness.yml
index c6d4828f8f..eb070f97d1 100644
--- a/.github/workflows/correctness.yml
+++ b/.github/workflows/correctness.yml
@@ -95,7 +95,7 @@ jobs:
         env:
           TEST_DJL_VERSION: ${{ inputs.djl-version }}
         run: |
-          python -m pytest --capture=tee-sys -vv -k ${{ matrix.test.test }} tests.py
+          python -m pytest -k ${{ matrix.test.test }} tests.py
       - name: Cleanup
         working-directory: tests/integration
         run: |
diff --git a/tests/integration/llm/client.py b/tests/integration/llm/client.py
index aa10d76434..b57cc1cd07 100644
--- a/tests/integration/llm/client.py
+++ b/tests/integration/llm/client.py
@@ -700,44 +700,59 @@ def get_model_name():
 }
 
 correctness_model_spec = {
-    "trtllm-codellama-13b": {
-        "batch_size": [32],
-        "seq_length": [256],
-        "tokenizer": "codellama/CodeLlama-13b-hf",
+    "trtllm-codestral-22b": {
+        "batch_size": [41],
+        "seq_length": [512],
+        "num_run": 4,
+        "tokenizer": "bullerwins/Codestral-22B-v0.1-hf",
         "dataset": "humaneval",
-        "score": 0.25
+        "score": 0.5,
+        "parameters": {
+            "return_full_text": True
+        }
     },
-    "lmi-dist-codellama-13b": {
-        "batch_size": [32],
-        "seq_length": [256],
-        "tokenizer": "codellama/CodeLlama-13b-hf",
+    "lmi-dist-codestral-22b": {
+        "batch_size": [41],
+        "seq_length": [512],
+        "num_run": 4,
+        "tokenizer": "bullerwins/Codestral-22B-v0.1-hf",
         "dataset": "humaneval",
-        "score": 0.25
+        "score": 0.5,
+        "parameters": {
+            "return_full_text": True
+        }
     },
-    "neuronx-codellama-13b": {
-        "batch_size": [32],
-        "seq_length": [256],
-        "tokenizer": "codellama/CodeLlama-13b-hf",
+    "neuronx-codestral-22b": {
+        "batch_size": [41],
+        "seq_length": [512],
+        "num_run": 4,
+        "tokenizer": "bullerwins/Codestral-22B-v0.1-hf",
         "dataset": "humaneval",
-        "score": 0.25
+        "score": 0.5,
+        "parameters": {
+            "return_full_text": True
+        }
     },
     "trtllm-llama3-1-8b": {
-        "batch_size": [32],
+        "batch_size": [66],
         "seq_length": [1],
+        "num_run": 213,
         "tokenizer": "TheBloke/Llama-2-7B-fp16",
         "dataset": "mmlu",
         "score": 0.6
     },
     "lmi-dist-llama3-1-8b": {
-        "batch_size": [32],
+        "batch_size": [66],
         "seq_length": [1],
+        "num_run": 213,
         "tokenizer": "TheBloke/Llama-2-7B-fp16",
         "dataset": "mmlu",
         "score": 0.6
     },
     "neuronx-llama3-1-8b": {
-        "batch_size": [32],
+        "batch_size": [66],
         "seq_length": [1],
+        "num_run": 213,
         "tokenizer": "TheBloke/Llama-2-7B-fp16",
         "dataset": "mmlu",
         "score": 0.6
@@ -801,13 +816,16 @@ def validate_correctness(type, tasks, expected):
         with ThreadPoolExecutor(max_workers=N_WORKERS) as executor:
             futures = []
             for i, out in enumerate(outputs):
+                logging.info(f"!!!out {out.get('generated_text', '')}")
                 task = tasks[inputs[i]]
+                logging.info(f"!!!task {task}")
                 completion = out.get('generated_text', '')
                 args = (task, completion, TIMEOUT)
                 future = executor.submit(check_correctness, *args)
                 futures.append(future)
             for future in as_completed(futures):
                 result = future.result()
+                logging.info(f"!!!result: {result}")
                 if result['passed']:
                     total_pass += 1
     elif type == "mmlu":
@@ -1044,12 +1062,14 @@ def t5_batch_generation(batch_size):
     return input_sentences[:batch_size]
 
 
-def load_dataset(dataset, key):
+def load_dataset(dataset):
     res = {}
     if dataset == "humaneval":
         url = "https://raw.githubusercontent.com/ymwangg/vllm-test/main/dataset/humaneval.jsonl"
+        key = "prompt"
     elif dataset == "mmlu":
         url = "https://djl-ai.s3.amazonaws.com/resources/benchmark/datasets/mmlu_djlserving.jsonl"
+        key = "inputs"
     else:
         raise ValueError(f"Unsupported dataset: {dataset}")
 
@@ -1377,17 +1397,14 @@ def test_transformers_neuronx_handler(model, model_spec):
 def test_correctness(model, model_spec):
     if model not in model_spec:
         raise ValueError(
-            f"{args.model} is not one of the supporting models {list(model_spec.keys())}"
+            f"{model} is not one of the supporting models {list(model_spec.keys())}"
         )
-    spec = model_spec[args.model]
-    score = int(spec.get("score", 0.4))
-    dataset = spec.get("dataset", "humaneval")
-    parameters = {}
-    if dataset == "humaneval":
-        data = load_dataset(dataset, "prompt")
-        parameters["return_full_text"] = True
-    elif dataset == "mmlu":
-        data = load_dataset(dataset, "inputs")
+    spec = model_spec[model]
+    score = float(spec.get("score", 0.3))
+    parameters = spec.get("parameters", {})
+    num_run = int(spec.get("num_run", 5))
+    dataset = spec.get("dataset", "mmlu")
+    data = load_dataset(dataset)
 
     for i, batch_size in enumerate(spec["batch_size"]):
         for seq_length in spec["seq_length"]:
@@ -1402,6 +1419,7 @@ def test_correctness(model, model_spec):
             awscurl_run(reqs,
                         spec.get("tokenizer", None),
                         batch_size,
+                        num_run=num_run,
                         dataset=True,
                         output=True)
             validate_correctness(dataset, data, score)
diff --git a/tests/integration/llm/prepare.py b/tests/integration/llm/prepare.py
index 055d30cf0e..cdc6c80b17 100644
--- a/tests/integration/llm/prepare.py
+++ b/tests/integration/llm/prepare.py
@@ -898,30 +898,30 @@
 }
 
 correctness_model_list = {
-    "trtllm-codellama-13b": {
+    "trtllm-codestral-22b": {
         "engine": "Python",
         "option.task": "text-generation",
-        "option.model_id": "codellama/CodeLlama-13b-hf",
+        "option.model_id": "bullerwins/Codestral-22B-v0.1-hf",
         "option.rolling_batch": "trtllm",
         "option.tensor_parallel_degree": 4,
-        "option.max_rolling_batch_size": 32,
+        "option.max_rolling_batch_size": 41,
         "option.output_formatter": "json"
     },
-    "lmi-dist-codellama-13b": {
+    "lmi-dist-codestral-22b": {
         "engine": "MPI",
         "option.task": "text-generation",
-        "option.model_id": "codellama/CodeLlama-13b-hf",
+        "option.model_id": "bullerwins/Codestral-22B-v0.1-hf",
         "option.rolling_batch": "lmi-dist",
         "option.tensor_parallel_degree": 4,
-        "option.max_rolling_batch_size": 32,
+        "option.max_rolling_batch_size": 41,
         "option.output_formatter": "json"
     },
-    "neuronx-codellama-13b": {
-        "option.model_id": "codellama/CodeLlama-13b-hf",
+    "neuronx-codestral-22b": {
+        "option.model_id": "bullerwins/Codestral-22B-v0.1-hf",
         "option.tensor_parallel_degree": 4,
         "option.n_positions": 1024,
         "option.rolling_batch": "auto",
-        "option.max_rolling_batch_size": 32,
+        "option.max_rolling_batch_size": 41,
         "option.output_formatter": "json"
     },
     "trtllm-llama3-1-8b": {
@@ -930,7 +930,7 @@
         "option.model_id": "s3://djl-llm/llama-3.1-8b-hf/",
         "option.rolling_batch": "trtllm",
         "option.tensor_parallel_degree": 4,
-        "option.max_rolling_batch_size": 32,
+        "option.max_rolling_batch_size": 66,
         "option.output_formatter": "json"
     },
     "lmi-dist-llama3-1-8b": {
@@ -939,7 +939,7 @@
         "option.model_id": "s3://djl-llm/llama-3.1-8b-hf/",
         "option.rolling_batch": "lmi-dist",
         "option.tensor_parallel_degree": 4,
-        "option.max_rolling_batch_size": 32,
+        "option.max_rolling_batch_size": 66,
         "option.output_formatter": "json"
     },
     "neuronx-llama3-1-8b": {
@@ -947,7 +947,8 @@
         "option.tensor_parallel_degree": 4,
         "option.n_positions": 1024,
         "option.rolling_batch": "auto",
-        "option.max_rolling_batch_size": 32
+        "option.max_rolling_batch_size": 66,
+        "option.output_formatter": "json"
     }
 }
 
diff --git a/tests/integration/tests.py b/tests/integration/tests.py
index 0bb0086c91..f1131158fe 100644
--- a/tests/integration/tests.py
+++ b/tests/integration/tests.py
@@ -837,11 +837,11 @@ def test_llama_speculative_compiled(self):
 @pytest.mark.gpu_4
 class TestCorrectnessTrtLlm:
 
-    def test_codellama_13b(self):
-        with Runner('tensorrt-llm', 'codellama-13b') as r:
-            prepare.build_correctness_model("trtllm-codellama-13b")
+    def test_codestral_22b(self):
+        with Runner('tensorrt-llm', 'codestral-22b') as r:
+            prepare.build_correctness_model("trtllm-codestral-22b")
             r.launch("CUDA_VISIBLE_DEVICES=0,1,2,3")
-            client.run("correctness trtllm-codellama-13b".split())
+            client.run("correctness trtllm-codestral-22b".split())
 
     def test_llama3_1_8b(self):
         with Runner('tensorrt-llm', 'llama3-1-8b') as r:
@@ -855,11 +855,11 @@ def test_llama3_1_8b(self):
 @pytest.mark.gpu_4
 class TestCorrectnessLmiDist:
 
-    def test_codellama_13b(self):
-        with Runner('lmi', 'codellama-13b') as r:
-            prepare.build_correctness_model("lmi-dist-codellama-13b")
+    def test_codestral_22b(self):
+        with Runner('lmi', 'codestral-22b') as r:
+            prepare.build_correctness_model("lmi-dist-codestral-22b")
             r.launch()
-            client.run("correctness lmi-dist-codellama-13b".split())
+            client.run("correctness lmi-dist-codestral-22b".split())
 
     def test_llama3_1_8b(self):
         with Runner('lmi', 'llama3-1-8b') as r:
@@ -872,11 +872,11 @@ def test_llama3_1_8b(self):
 @pytest.mark.inf
 class TestCorrectnessNeuronx:
 
-    def test_codellama_13b(self):
-        with Runner('pytorch-inf2', 'codellama-13b') as r:
-            prepare.build_correctness_model("neuronx-codellama-13b")
+    def test_codestral_22b(self):
+        with Runner('pytorch-inf2', 'codestral-22b') as r:
+            prepare.build_correctness_model("neuronx-codestral-22b")
             r.launch()
-            client.run("correctness neuronx-codellama-13b".split())
+            client.run("correctness neuronx-codestral-22b".split())
 
     def test_llama3_1_8b(self):
         with Runner('pytorch-inf2', 'llama3-1-8b') as r: