diff --git a/.github/workflows/correctness.yml b/.github/workflows/correctness.yml index c6d4828f8f..eb070f97d1 100644 --- a/.github/workflows/correctness.yml +++ b/.github/workflows/correctness.yml @@ -95,7 +95,7 @@ jobs: env: TEST_DJL_VERSION: ${{ inputs.djl-version }} run: | - python -m pytest --capture=tee-sys -vv -k ${{ matrix.test.test }} tests.py + python -m pytest -k ${{ matrix.test.test }} tests.py - name: Cleanup working-directory: tests/integration run: | diff --git a/tests/integration/llm/client.py b/tests/integration/llm/client.py index aa10d76434..b57cc1cd07 100644 --- a/tests/integration/llm/client.py +++ b/tests/integration/llm/client.py @@ -700,44 +700,59 @@ def get_model_name(): } correctness_model_spec = { - "trtllm-codellama-13b": { - "batch_size": [32], - "seq_length": [256], - "tokenizer": "codellama/CodeLlama-13b-hf", + "trtllm-codestral-22b": { + "batch_size": [41], + "seq_length": [512], + "num_run": 4, + "tokenizer": "bullerwins/Codestral-22B-v0.1-hf", "dataset": "humaneval", - "score": 0.25 + "score": 0.5, + "parameters": { + "return_full_text": True + } }, - "lmi-dist-codellama-13b": { - "batch_size": [32], - "seq_length": [256], - "tokenizer": "codellama/CodeLlama-13b-hf", + "lmi-dist-codestral-22b": { + "batch_size": [41], + "seq_length": [512], + "num_run": 4, + "tokenizer": "bullerwins/Codestral-22B-v0.1-hf", "dataset": "humaneval", - "score": 0.25 + "score": 0.5, + "parameters": { + "return_full_text": True + } }, - "neuronx-codellama-13b": { - "batch_size": [32], - "seq_length": [256], - "tokenizer": "codellama/CodeLlama-13b-hf", + "neuronx-codestral-22b": { + "batch_size": [41], + "seq_length": [512], + "num_run": 4, + "tokenizer": "bullerwins/Codestral-22B-v0.1-hf", "dataset": "humaneval", - "score": 0.25 + "score": 0.5, + "parameters": { + "return_full_text": True + } }, "trtllm-llama3-1-8b": { - "batch_size": [32], + "batch_size": [66], "seq_length": [1], + "num_run": 4, "tokenizer": "TheBloke/Llama-2-7B-fp16", "dataset": "mmlu", "score": 0.6 }, "lmi-dist-llama3-1-8b": { - "batch_size": [32], + "batch_size": [66], "seq_length": [1], + "num_run": 213, "tokenizer": "TheBloke/Llama-2-7B-fp16", "dataset": "mmlu", "score": 0.6 }, "neuronx-llama3-1-8b": { - "batch_size": [32], + "batch_size": [66], "seq_length": [1], + "num_run": 213, "tokenizer": "TheBloke/Llama-2-7B-fp16", "dataset": "mmlu", "score": 0.6 @@ -801,13 +816,16 @@ def validate_correctness(type, tasks, expected): with ThreadPoolExecutor(max_workers=N_WORKERS) as executor: futures = [] for i, out in enumerate(outputs): + logging.info(f"!!!out {out['generated_text']}") task = tasks[inputs[i]] + logging.info(f"!!!task {task}") completion = out.get('generated_text', '') args = (task, completion, TIMEOUT) future = executor.submit(check_correctness, *args) futures.append(future) for future in as_completed(futures): result = future.result() + logging.info(f"!!!result: {result}") if result['passed']: total_pass += 1 elif type == "mmlu": @@ -1044,12 +1062,14 @@ def t5_batch_generation(batch_size): return input_sentences[:batch_size] -def load_dataset(dataset, key): +def load_dataset(dataset): res = {} if dataset == "humaneval": url = "https://raw.githubusercontent.com/ymwangg/vllm-test/main/dataset/humaneval.jsonl" + key = "prompt" elif dataset == "mmlu": url = "https://djl-ai.s3.amazonaws.com/resources/benchmark/datasets/mmlu_djlserving.jsonl" + key = "inputs" else: raise ValueError(f"Unsupported dataset: {dataset}") @@ -1377,17 +1397,14 @@ def test_transformers_neuronx_handler(model, model_spec): def test_correctness(model, model_spec): if model not in model_spec: raise ValueError( - f"{args.model} is not one of the supporting models {list(model_spec.keys())}" + f"{model} is not one of the supporting models {list(model_spec.keys())}" ) - spec = model_spec[args.model] - score = int(spec.get("score", 0.4)) - dataset = spec.get("dataset", "humaneval") - parameters = {} - if dataset == "humaneval": - data = load_dataset(dataset, "prompt") - parameters["return_full_text"] = True - elif dataset == "mmlu": - data = load_dataset(dataset, "inputs") + spec = model_spec[model] + score = float(spec.get("score", 0.3)) + parameters = spec.get("parameters", {}) + num_run = int(spec.get("num_run", 5)) + dataset = spec.get("dataset", "mmlu") + data = load_dataset(dataset) for i, batch_size in enumerate(spec["batch_size"]): for seq_length in spec["seq_length"]: @@ -1402,6 +1419,7 @@ def test_correctness(model, model_spec): awscurl_run(reqs, spec.get("tokenizer", None), batch_size, + num_run=num_run, dataset=True, output=True) validate_correctness(dataset, data, score) diff --git a/tests/integration/llm/prepare.py b/tests/integration/llm/prepare.py index 055d30cf0e..cdc6c80b17 100644 --- a/tests/integration/llm/prepare.py +++ b/tests/integration/llm/prepare.py @@ -898,30 +898,30 @@ } correctness_model_list = { - "trtllm-codellama-13b": { + "trtllm-codestral-22b": { "engine": "Python", "option.task": "text-generation", - "option.model_id": "codellama/CodeLlama-13b-hf", + "option.model_id": "bullerwins/Codestral-22B-v0.1-hf", "option.rolling_batch": "trtllm", "option.tensor_parallel_degree": 4, - "option.max_rolling_batch_size": 32, + "option.max_rolling_batch_size": 41, "option.output_formatter": "json" }, - "lmi-dist-codellama-13b": { + "lmi-dist-codestral-22b": { "engine": "MPI", "option.task": "text-generation", - "option.model_id": "codellama/CodeLlama-13b-hf", + "option.model_id": "bullerwins/Codestral-22B-v0.1-hf", "option.rolling_batch": "lmi-dist", "option.tensor_parallel_degree": 4, - "option.max_rolling_batch_size": 32, + "option.max_rolling_batch_size": 41, "option.output_formatter": "json" }, - "neuronx-codellama-13b": { - "option.model_id": "codellama/CodeLlama-13b-hf", + "neuronx-codestral-22b": { + "option.model_id": "bullerwins/Codestral-22B-v0.1-hf", "option.tensor_parallel_degree": 4, "option.n_positions": 1024, "option.rolling_batch": "auto", - "option.max_rolling_batch_size": 32, + "option.max_rolling_batch_size": 41, "option.output_formatter": "json" }, "trtllm-llama3-1-8b": { @@ -930,7 +930,7 @@ "option.model_id": "s3://djl-llm/llama-3.1-8b-hf/", "option.rolling_batch": "trtllm", "option.tensor_parallel_degree": 4, - "option.max_rolling_batch_size": 32, + "option.max_rolling_batch_size": 66, "option.output_formatter": "json" }, "lmi-dist-llama3-1-8b": { @@ -939,7 +939,7 @@ "option.model_id": "s3://djl-llm/llama-3.1-8b-hf/", "option.rolling_batch": "lmi-dist", "option.tensor_parallel_degree": 4, - "option.max_rolling_batch_size": 32, + "option.max_rolling_batch_size": 66, "option.output_formatter": "json" }, "neuronx-llama3-1-8b": { @@ -947,7 +947,8 @@ "option.tensor_parallel_degree": 4, "option.n_positions": 1024, "option.rolling_batch": "auto", - "option.max_rolling_batch_size": 32 + "option.max_rolling_batch_size": 66, + "option.output_formatter": "json" } } diff --git a/tests/integration/tests.py b/tests/integration/tests.py index 0bb0086c91..f1131158fe 100644 --- a/tests/integration/tests.py +++ b/tests/integration/tests.py @@ -837,11 +837,11 @@ def test_llama_speculative_compiled(self): @pytest.mark.gpu_4 class TestCorrectnessTrtLlm: - def test_codellama_13b(self): - with Runner('tensorrt-llm', 'codellama-13b') as r: - prepare.build_correctness_model("trtllm-codellama-13b") + def test_codestral_22b(self): + with Runner('tensorrt-llm', 'codestral-22b') as r: + prepare.build_correctness_model("trtllm-codestral-22b") r.launch("CUDA_VISIBLE_DEVICES=0,1,2,3") - client.run("correctness trtllm-codellama-13b".split()) + client.run("correctness trtllm-codestral-22b".split()) def test_llama3_1_8b(self): with Runner('tensorrt-llm', 'llama3-1-8b') as r: @@ -855,11 +855,11 @@ def test_llama3_1_8b(self): @pytest.mark.gpu_4 class TestCorrectnessLmiDist: - def test_codellama_13b(self): - with Runner('lmi', 'codellama-13b') as r: - prepare.build_correctness_model("lmi-dist-codellama-13b") + def test_codestral_22b(self): + with Runner('lmi', 'codestral-22b') as r: + prepare.build_correctness_model("lmi-dist-codestral-22b") r.launch() - client.run("correctness lmi-dist-codellama-13b".split()) + client.run("correctness lmi-dist-codestral-22b".split()) def test_llama3_1_8b(self): with Runner('lmi', 'llama3-1-8b') as r: @@ -872,11 +872,11 @@ def test_llama3_1_8b(self): @pytest.mark.inf class TestCorrectnessNeuronx: - def test_codellama_13b(self): - with Runner('pytorch-inf2', 'codellama-13b') as r: - prepare.build_correctness_model("neuronx-codellama-13b") + def test_codestral_22b(self): + with Runner('pytorch-inf2', 'codestral-22b') as r: + prepare.build_correctness_model("neuronx-codestral-22b") r.launch() - client.run("correctness neuronx-codellama-13b".split()) + client.run("correctness neuronx-codestral-22b".split()) def test_llama3_1_8b(self): with Runner('pytorch-inf2', 'llama3-1-8b') as r: