Skip to content

Commit

Permalink
[ci] Update correctness testing
Browse files (browse the repository at this point in the history)
  • Loading branch information
xyang16 committed Jul 23, 2024
1 parent f67f678 commit 277f5b0
Show file tree
Hide file tree
Showing 4 changed files with 73 additions and 54 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/correctness.yml
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ jobs:
env:
TEST_DJL_VERSION: ${{ inputs.djl-version }}
run: |
python -m pytest --capture=tee-sys -vv -k ${{ matrix.test.test }} tests.py
python -m pytest -k ${{ matrix.test.test }} tests.py
- name: Cleanup
working-directory: tests/integration
run: |
Expand Down
76 changes: 47 additions & 29 deletions tests/integration/llm/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -700,44 +700,59 @@ def get_model_name():
}

correctness_model_spec = {
"trtllm-codellama-13b": {
"batch_size": [32],
"seq_length": [256],
"tokenizer": "codellama/CodeLlama-13b-hf",
"trtllm-codestral-22b": {
"batch_size": [41],
"seq_length": [512],
"num_run": 4,
"tokenizer": "bullerwins/Codestral-22B-v0.1-hf",
"dataset": "humaneval",
"score": 0.25
"score": 0.5,
"parameters": {
"return_full_text": True
}
},
"lmi-dist-codellama-13b": {
"batch_size": [32],
"seq_length": [256],
"tokenizer": "codellama/CodeLlama-13b-hf",
"lmi-dist-codestral-22b": {
"batch_size": [41],
"seq_length": [512],
"num_run": 4,
"tokenizer": "bullerwins/Codestral-22B-v0.1-hf",
"dataset": "humaneval",
"score": 0.25
"score": 0.5,
"parameters": {
"return_full_text": True
}
},
"neuronx-codellama-13b": {
"batch_size": [32],
"seq_length": [256],
"tokenizer": "codellama/CodeLlama-13b-hf",
"neuronx-codestral-22b": {
"batch_size": [41],
"seq_length": [512],
"num_run": 4,
"tokenizer": "bullerwins/Codestral-22B-v0.1-hf",
"dataset": "humaneval",
"score": 0.25
"score": 0.5,
"parameters": {
"return_full_text": True
}
},
"trtllm-llama3-1-8b": {
"batch_size": [32],
"batch_size": [66],
"seq_length": [1],
"num_run": 4,
"tokenizer": "TheBloke/Llama-2-7B-fp16",
"dataset": "mmlu",
"score": 0.6
},
"lmi-dist-llama3-1-8b": {
"batch_size": [32],
"batch_size": [66],
"seq_length": [1],
"num_run": 213,
"tokenizer": "TheBloke/Llama-2-7B-fp16",
"dataset": "mmlu",
"score": 0.6
},
"neuronx-llama3-1-8b": {
"batch_size": [32],
"batch_size": [66],
"seq_length": [1],
"num_run": 213,
"tokenizer": "TheBloke/Llama-2-7B-fp16",
"dataset": "mmlu",
"score": 0.6
Expand Down Expand Up @@ -801,13 +816,16 @@ def validate_correctness(type, tasks, expected):
with ThreadPoolExecutor(max_workers=N_WORKERS) as executor:
futures = []
for i, out in enumerate(outputs):
logging.info(f"!!!out {out['generated_text']}")
task = tasks[inputs[i]]
logging.info(f"!!!task {task}")
completion = out.get('generated_text', '')
args = (task, completion, TIMEOUT)
future = executor.submit(check_correctness, *args)
futures.append(future)
for future in as_completed(futures):
result = future.result()
logging.info(f"!!!result: {result}")
if result['passed']:
total_pass += 1
elif type == "mmlu":
Expand Down Expand Up @@ -1044,12 +1062,14 @@ def t5_batch_generation(batch_size):
return input_sentences[:batch_size]


def load_dataset(dataset, key):
def load_dataset(dataset):
res = {}
if dataset == "humaneval":
url = "https://raw.githubusercontent.com/ymwangg/vllm-test/main/dataset/humaneval.jsonl"
key = "prompt"
elif dataset == "mmlu":
url = "https://djl-ai.s3.amazonaws.com/resources/benchmark/datasets/mmlu_djlserving.jsonl"
key = "inputs"
else:
raise ValueError(f"Unsupported dataset: {dataset}")

Expand Down Expand Up @@ -1377,17 +1397,14 @@ def test_transformers_neuronx_handler(model, model_spec):
def test_correctness(model, model_spec):
if model not in model_spec:
raise ValueError(
f"{args.model} is not one of the supporting models {list(model_spec.keys())}"
f"{model} is not one of the supporting models {list(model_spec.keys())}"
)
spec = model_spec[args.model]
score = int(spec.get("score", 0.4))
dataset = spec.get("dataset", "humaneval")
parameters = {}
if dataset == "humaneval":
data = load_dataset(dataset, "prompt")
parameters["return_full_text"] = True
elif dataset == "mmlu":
data = load_dataset(dataset, "inputs")
spec = model_spec[model]
score = float(spec.get("score", 0.3))
parameters = spec.get("parameters", {})
num_run = int(spec.get("num_run", 5))
dataset = spec.get("dataset", "mmlu")
data = load_dataset(dataset)

for i, batch_size in enumerate(spec["batch_size"]):
for seq_length in spec["seq_length"]:
Expand All @@ -1402,6 +1419,7 @@ def test_correctness(model, model_spec):
awscurl_run(reqs,
spec.get("tokenizer", None),
batch_size,
num_run=num_run,
dataset=True,
output=True)
validate_correctness(dataset, data, score)
Expand Down
25 changes: 13 additions & 12 deletions tests/integration/llm/prepare.py
Original file line number Diff line number Diff line change
Expand Up @@ -898,30 +898,30 @@
}

correctness_model_list = {
"trtllm-codellama-13b": {
"trtllm-codestral-22b": {
"engine": "Python",
"option.task": "text-generation",
"option.model_id": "codellama/CodeLlama-13b-hf",
"option.model_id": "bullerwins/Codestral-22B-v0.1-hf",
"option.rolling_batch": "trtllm",
"option.tensor_parallel_degree": 4,
"option.max_rolling_batch_size": 32,
"option.max_rolling_batch_size": 41,
"option.output_formatter": "json"
},
"lmi-dist-codellama-13b": {
"lmi-dist-codestral-22b": {
"engine": "MPI",
"option.task": "text-generation",
"option.model_id": "codellama/CodeLlama-13b-hf",
"option.model_id": "bullerwins/Codestral-22B-v0.1-hf",
"option.rolling_batch": "lmi-dist",
"option.tensor_parallel_degree": 4,
"option.max_rolling_batch_size": 32,
"option.max_rolling_batch_size": 41,
"option.output_formatter": "json"
},
"neuronx-codellama-13b": {
"option.model_id": "codellama/CodeLlama-13b-hf",
"neuronx-codestral-22b": {
"option.model_id": "bullerwins/Codestral-22B-v0.1-hf",
"option.tensor_parallel_degree": 4,
"option.n_positions": 1024,
"option.rolling_batch": "auto",
"option.max_rolling_batch_size": 32,
"option.max_rolling_batch_size": 41,
"option.output_formatter": "json"
},
"trtllm-llama3-1-8b": {
Expand All @@ -930,7 +930,7 @@
"option.model_id": "s3://djl-llm/llama-3.1-8b-hf/",
"option.rolling_batch": "trtllm",
"option.tensor_parallel_degree": 4,
"option.max_rolling_batch_size": 32,
"option.max_rolling_batch_size": 66,
"option.output_formatter": "json"
},
"lmi-dist-llama3-1-8b": {
Expand All @@ -939,15 +939,16 @@
"option.model_id": "s3://djl-llm/llama-3.1-8b-hf/",
"option.rolling_batch": "lmi-dist",
"option.tensor_parallel_degree": 4,
"option.max_rolling_batch_size": 32,
"option.max_rolling_batch_size": 66,
"option.output_formatter": "json"
},
"neuronx-llama3-1-8b": {
"option.model_id": "s3://djl-llm/llama-3.1-8b-hf/",
"option.tensor_parallel_degree": 4,
"option.n_positions": 1024,
"option.rolling_batch": "auto",
"option.max_rolling_batch_size": 32
"option.max_rolling_batch_size": 66,
"option.output_formatter": "json"
}
}

Expand Down
24 changes: 12 additions & 12 deletions tests/integration/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -837,11 +837,11 @@ def test_llama_speculative_compiled(self):
@pytest.mark.gpu_4
class TestCorrectnessTrtLlm:

def test_codellama_13b(self):
with Runner('tensorrt-llm', 'codellama-13b') as r:
prepare.build_correctness_model("trtllm-codellama-13b")
def test_codestral_22b(self):
with Runner('tensorrt-llm', 'codestral-22b') as r:
prepare.build_correctness_model("trtllm-codestral-22b")
r.launch("CUDA_VISIBLE_DEVICES=0,1,2,3")
client.run("correctness trtllm-codellama-13b".split())
client.run("correctness trtllm-codestral-22b".split())

def test_llama3_1_8b(self):
with Runner('tensorrt-llm', 'llama3-1-8b') as r:
Expand All @@ -855,11 +855,11 @@ def test_llama3_1_8b(self):
@pytest.mark.gpu_4
class TestCorrectnessLmiDist:

def test_codellama_13b(self):
with Runner('lmi', 'codellama-13b') as r:
prepare.build_correctness_model("lmi-dist-codellama-13b")
def test_codestral_22b(self):
with Runner('lmi', 'codestral-22b') as r:
prepare.build_correctness_model("lmi-dist-codestral-22b")
r.launch()
client.run("correctness lmi-dist-codellama-13b".split())
client.run("correctness lmi-dist-codestral-22b".split())

def test_llama3_1_8b(self):
with Runner('lmi', 'llama3-1-8b') as r:
Expand All @@ -872,11 +872,11 @@ def test_llama3_1_8b(self):
@pytest.mark.inf
class TestCorrectnessNeuronx:

def test_codellama_13b(self):
with Runner('pytorch-inf2', 'codellama-13b') as r:
prepare.build_correctness_model("neuronx-codellama-13b")
def test_codestral_22b(self):
with Runner('pytorch-inf2', 'codestral-22b') as r:
prepare.build_correctness_model("neuronx-codestral-22b")
r.launch()
client.run("correctness neuronx-codellama-13b".split())
client.run("correctness neuronx-codestral-22b".split())

def test_llama3_1_8b(self):
with Runner('pytorch-inf2', 'llama3-1-8b') as r:
Expand Down

0 comments on commit 277f5b0

Please sign in to comment.