diff --git a/.github/workflows/ci-llama.yaml b/.github/workflows/ci-llama.yaml
index 999c3fbcc..e9098316f 100644
--- a/.github/workflows/ci-llama.yaml
+++ b/.github/workflows/ci-llama.yaml
@@ -76,7 +76,13 @@ jobs:
           "numpy<2.0"
 
       - name: Run llama test
-        run: pytest sharktank/tests/models/llama/benchmark_amdgpu_test.py -v -s --longrun --iree-hip-target=gfx942
+        run: pytest sharktank/tests/models/llama/benchmark_amdgpu_test.py -v -s --longrun --iree-hip-target=gfx942 --html=out/index.html
+
+      - name: Deploy to GitHub Pages
+        uses: peaceiris/actions-gh-pages@v3
+        with:
+          github_token: ${{ secrets.SHARK_PLATFORM_GH_TOKEN }}
+          publish_dir: ./out
 
       - name: Upload llama executable files
         uses: actions/upload-artifact@v4
diff --git a/sharktank/conftest.py b/sharktank/conftest.py
index afbc93e46..e46466959 100644
--- a/sharktank/conftest.py
+++ b/sharktank/conftest.py
@@ -256,3 +256,24 @@ def get_iree_flags(request: FixtureRequest):
     model_path["iree_hal_target_backends"] = set_fixture_from_cli_option(
         request, "--iree-hal-target-backends", "iree_hal_target_backends"
     )
+
+
+# The following three functions allow us to add an "XFail Reason" column to the HTML report for each test
+def pytest_html_results_table_header(cells):
+    cells.insert(2, "XFail Reason")
+
+
+def pytest_html_results_table_row(report, cells):
+    if hasattr(report, "wasxfail"):
+        cells.insert(2, f"{report.wasxfail}")
+    else:
+        cells.insert(2, f"")
+
+
+@pytest.hookimpl(hookwrapper=True)
+def pytest_runtest_makereport(item, call):
+    outcome = yield
+    report = outcome.get_result()
+
+    if report.when == "call" and hasattr(item, "wasxfail"):
+        report.wasxfail = item.wasxfail
diff --git a/sharktank/requirements-tests.txt b/sharktank/requirements-tests.txt
index 4be48fdde..d5b4b0c0e 100644
--- a/sharktank/requirements-tests.txt
+++ b/sharktank/requirements-tests.txt
@@ -1,3 +1,4 @@
 datasets==3.0.0
 parameterized
 pytest==8.0.0
+pytest-html
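Note: the three conftest.py hooks above work together. The makereport wrapper copies the xfail reason onto the test report, and the two pytest_html_* hooks render it as an extra table column. A minimal, hypothetical test file to sanity-check the column (the file name and reason string are illustrative; assumes pytest-html is installed):

# demo_xfail.py -- run `pytest demo_xfail.py --html=out/index.html`; the
# "XFail Reason" column should show this reason string on the test's row.
import pytest


@pytest.mark.xfail(reason="demonstrates the XFail Reason column", strict=True)
def test_expected_failure():
    raise RuntimeError("intentional failure so the xfail is recorded")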
diff --git a/sharktank/sharktank/utils/export_artifacts.py b/sharktank/sharktank/utils/export_artifacts.py
index 84c206d7a..dac0ee21e 100644
--- a/sharktank/sharktank/utils/export_artifacts.py
+++ b/sharktank/sharktank/utils/export_artifacts.py
@@ -24,6 +24,63 @@
 )
 
 
+class ExportMlirException(Exception):
+    """SHARK-Platform export MLIR exception that preserves the command line and error output."""
+
+    def __init__(self, process: subprocess.CompletedProcess, cwd: str):
+        try:
+            errs = process.stderr.decode("utf-8")
+        except Exception:
+            errs = str(process.stderr)
+        super().__init__(
+            f"Error invoking export_paged_llm_v1.py\n"
+            f"Error code: {process.returncode}\n"
+            f"Stderr diagnostics:\n{errs}\n\n"
+            f"Invoked with:\n"
+            f"  cd {cwd} && {process.args}\n\n"
+        )
+
+
+class IreeCompileException(Exception):
+    """Compiler exception that preserves the command line and error output."""
+
+    def __init__(self, process: subprocess.CompletedProcess, cwd: str):
+        try:
+            errs = process.stderr.decode("utf-8")
+        except Exception:
+            errs = str(process.stderr)
+        super().__init__(
+            f"Error invoking iree-compile\n"
+            f"Error code: {process.returncode}\n"
+            f"Stderr diagnostics:\n{errs}\n\n"
+            f"Invoked with:\n"
+            f"  cd {cwd} && {process.args}\n\n"
+        )
+
+
+class IreeBenchmarkException(Exception):
+    """Runtime exception that preserves the command line and error output."""
+
+    def __init__(self, process: subprocess.CompletedProcess, cwd: str):
+        # iree-benchmark-module sends output to both stdout and stderr
+        try:
+            errs = process.stderr.decode("utf-8")
+        except Exception:
+            errs = str(process.stderr)
+        try:
+            outs = process.stdout.decode("utf-8")
+        except Exception:
+            outs = str(process.stdout)
+        super().__init__(
+            f"Error invoking iree-benchmark-module\n"
+            f"Error code: {process.returncode}\n"
+            f"Stderr diagnostics:\n{errs}\n"
+            f"Stdout diagnostics:\n{outs}\n"
+            f"Run with:\n"
+            f"  cd {cwd} && {process.args}\n\n"
+        )
+
+
 class ExportArtifacts:
     def __init__(
         self,
@@ -127,37 +184,27 @@ def export_to_mlir(
 
         proc = subprocess.run(cmd, shell=True, capture_output=True, cwd=cwd)
         if proc.returncode != 0:
-            logger.error(
-                f"Error exporting mlir with export_paged_llm_v1.py\n"
-                f"{proc.stdout+proc.stderr}"
-            )
+            raise ExportMlirException(proc, cwd)
         else:
             logger.info(f"Exported to mlir successfully:\n" f"{proc.stdout}")
         return proc.returncode
 
     @timeit
-    def compile_to_vmfb(
-        self,
-        *,
-        mlir_path,
-        vmfb_path,
-        hal_dump_path,
-    ):
-        # TODO: Control flag to enable multiple backends
+    def compile_to_vmfb(self, *, mlir_path, vmfb_path, hal_dump_path, cwd):
         compile_flags = ["--iree-hip-target=" + self.iree_hip_target]
+        compile_flags += ["--iree-hal-target-backends=rocm"]
         compile_flags += [f"--iree-hal-dump-executable-files-to={hal_dump_path}/files"]
-        try:
-            ireec.compile_file(
-                input_file=mlir_path,
-                target_backends=[self.iree_hal_target_backends],
-                extra_args=compile_flags,
-                output_file=vmfb_path,
-            )
-        except Exception as error:
-            logger.error(f"Error running iree-compile:\n" f"{error}")
-        else:
-            logger.info(f"Compiled to vmfb successfully:\n" f"{vmfb_path}")
+        cmd = self.get_compile_cmd(
+            output_mlir_path=mlir_path,
+            output_vmfb_path=vmfb_path,
+            args=compile_flags,
+        )
+        logging.getLogger().info(f"Launching compile command:\n" f"cd {cwd} && {cmd}")
+        proc = subprocess.run(cmd, shell=True, capture_output=True, cwd=cwd)
+        return_code = proc.returncode
+        if return_code != 0:
+            raise IreeCompileException(proc, cwd)
 
     def iree_benchmark_vmfb(
         self,
@@ -193,13 +240,22 @@ def iree_benchmark_vmfb(
         proc = subprocess.run(cmd, shell=True, stdout=sys.stdout, cwd=cwd)
         return_code = proc.returncode
         if return_code != 0:
-            raise RuntimeError(f"Error running benchmark {cmd} in cwd {cwd}")
+            raise IreeBenchmarkException(proc, cwd)
 
     def create_file(self, *, suffix, prefix):
         file_path = Path(prefix).with_suffix(suffix)
         f = open(file_path, "w")
         return file_path
 
+    def get_compile_cmd(
+        self, *, output_mlir_path: str, output_vmfb_path: str, args: list[str]
+    ):
+        compile_args = ["iree-compile", output_mlir_path]
+        compile_args += args
+        compile_args += ["-o", output_vmfb_path]
+        cmd = subprocess.list2cmdline(compile_args)
+        return cmd
+
     def get_artifacts(self):
         self.dir_path = self.sharktank_dir + "/" + "tmp_perplexity_ci_artifacts/"
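Note: the three new exception classes follow one pattern -- capture the subprocess output, decode it defensively, and embed the exact reproduction command in the message. A standalone sketch of that pattern (run_or_raise and the example arguments are hypothetical, not part of this change):

import subprocess


def run_or_raise(argv: list[str], cwd: str) -> None:
    # Build a single shell command string, mirroring get_compile_cmd above.
    cmd = subprocess.list2cmdline(argv)
    proc = subprocess.run(cmd, shell=True, capture_output=True, cwd=cwd)
    if proc.returncode != 0:
        # Decode defensively: stderr may be empty or undecodable bytes.
        stderr = proc.stderr.decode("utf-8", errors="replace") if proc.stderr else ""
        raise RuntimeError(
            f"Error code: {proc.returncode}\n"
            f"Stderr diagnostics:\n{stderr}\n"
            f"Reproduce with:\n  cd {cwd} && {cmd}"
        )


# e.g. run_or_raise(["iree-compile", "model.mlir", "-o", "model.vmfb"], cwd=".")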
diff --git a/sharktank/tests/models/llama/benchmark_amdgpu_test.py b/sharktank/tests/models/llama/benchmark_amdgpu_test.py
index c99bbc7e1..d9757f8ab 100644
--- a/sharktank/tests/models/llama/benchmark_amdgpu_test.py
+++ b/sharktank/tests/models/llama/benchmark_amdgpu_test.py
@@ -13,7 +13,12 @@
 import subprocess
 from pathlib import Path
 from typing import List
-from sharktank.utils.export_artifacts import ExportArtifacts
+from sharktank.utils.export_artifacts import (
+    ExportArtifacts,
+    ExportMlirException,
+    IreeBenchmarkException,
+    IreeCompileException,
+)
 
 longrun = pytest.mark.skipif("not config.getoption('longrun')")
 is_mi300x = pytest.mark.skipif("config.getoption('iree_hip_target') != 'gfx942'")
@@ -136,6 +141,7 @@ def testBenchmark8B_f16_Decomposed(self):
             mlir_path=str(output_mlir),
             vmfb_path=output_vmfb,
             hal_dump_path=output_file_name,
+            cwd=self.repo_root,
         )
         # benchmark prefill
         self.llama8b_f16_artifacts.iree_benchmark_vmfb(
@@ -156,7 +162,7 @@ def testBenchmark8B_f16_Decomposed(self):
 
     @longrun
     @is_mi300x
-    @pytest.mark.xfail(reason="torch_sdpa not yet plumbed through", strict=True)
+    @pytest.mark.xfail(reason="Compile Error", strict=True, raises=IreeCompileException)
     def testBenchmark8B_f16_Non_Decomposed(self):
         output_file_name = self.dir_path_8b / "f16_torch"
         output_mlir = self.llama8b_f16_artifacts.create_file(
@@ -187,6 +193,7 @@ def testBenchmark8B_f16_Non_Decomposed(self):
             mlir_path=str(output_mlir),
             vmfb_path=output_vmfb,
             hal_dump_path=output_file_name,
+            cwd=self.repo_root,
         )
         # benchmark prefill
         self.llama8b_f16_artifacts.iree_benchmark_vmfb(
@@ -207,7 +214,9 @@
 
     @longrun
     @is_mi300x
-    @pytest.mark.xfail(reason="8B fp8 irpa path not stored yet", strict=True)
+    @pytest.mark.xfail(
+        reason="Test not yet implemented", strict=True, raises=AttributeError
+    )
     def testBenchmark8B_fp8_Decomposed(self):
         output_file_name = self.dir_path_8b / "fp8_decomposed"
         output_mlir = self.create_file(suffix=".mlir", prefix=output_file_name)
@@ -249,7 +258,9 @@ def testBenchmark8B_fp8_Decomposed(self):
 
     @longrun
     @is_mi300x
-    @pytest.mark.xfail(reason="torch_sdpa not yet plumbed through", strict=True)
+    @pytest.mark.xfail(
+        reason="Test not yet implemented", strict=True, raises=AttributeError
+    )
     def testBenchmark8B_fp8_Non_Decomposed(self):
         output_file_name = self.dir_path_8b / "fp8_torch_sdpa"
         output_mlir = self.create_file(suffix=".mlir", prefix=output_file_name)
@@ -346,7 +357,9 @@ def setUp(self):
 
     @longrun
     @is_mi300x
-    @pytest.mark.xfail(reason="70b f16 irpa path not stored yet", strict=True)
+    @pytest.mark.xfail(
+        reason="Test not yet implemented", strict=True, raises=AttributeError
+    )
     def testBenchmark70B_f16_Decomposed(self):
         output_file_name = self.dir_path_70b / "f16_decomposed"
         output_mlir = self.create_file(suffix=".mlir", prefix=output_file_name)
@@ -388,7 +401,9 @@ def testBenchmark70B_f16_Decomposed(self):
 
     @longrun
     @is_mi300x
-    @pytest.mark.xfail(reason="torch_sdpa not yet plumbed through", strict=True)
+    @pytest.mark.xfail(
+        reason="Test not yet implemented", strict=True, raises=AttributeError
+    )
     def testBenchmark70B_f16_Non_Decomposed(self):
         output_file_name = self.dir_path_70b / "f16_torch_sdpa"
         output_mlir = self.create_file(suffix=".mlir", prefix=output_file_name)
@@ -430,7 +445,9 @@ def testBenchmark70B_f16_Non_Decomposed(self):
 
     @longrun
     @is_mi300x
-    @pytest.mark.xfail(reason="70B fp8 irpa path not stored yet", strict=True)
+    @pytest.mark.xfail(
+        reason="Test not yet implemented", strict=True, raises=AttributeError
+    )
     def testBenchmark70B_fp8_Decomposed(self):
         output_file_name = self.dir_path_70b / "fp8_decomposed"
         output_mlir = self.create_file(suffix=".mlir", prefix=output_file_name)
@@ -472,7 +489,9 @@ def testBenchmark70B_fp8_Decomposed(self):
 
     @longrun
     @is_mi300x
-    @pytest.mark.xfail(reason="torch_sdpa not yet plumbed through", strict=True)
+    @pytest.mark.xfail(
+        reason="Test not yet implemented", strict=True, raises=AttributeError
+    )
     def testBenchmark70B_fp8_Non_Decomposed(self):
         output_file_name = self.dir_path_70b / "fp8_torch_sdpa"
         output_mlir = self.create_file(suffix=".mlir", prefix=output_file_name)
@@ -569,7 +588,9 @@ def setUp(self):
 
     @longrun
     @is_mi300x
-    @pytest.mark.xfail(reason="405B f16 irpa path not stored yet", strict=True)
+    @pytest.mark.xfail(
+        reason="Test not yet implemented", strict=True, raises=AttributeError
+    )
     def testBenchmark405B_f16_Decomposed(self):
         output_file_name = self.dir_path_405b / "f16_decomposed"
         output_mlir = self.create_file(suffix=".mlir", prefix=output_file_name)
@@ -611,7 +632,9 @@ def testBenchmark405B_f16_Decomposed(self):
 
     @longrun
     @is_mi300x
-    @pytest.mark.xfail(reason="torch_sdpa not yet plumbed through", strict=True)
+    @pytest.mark.xfail(
+        reason="Test not yet implemented", strict=True, raises=AttributeError
+    )
     def testBenchmark405B_f16_Non_Decomposed(self):
         output_file_name = self.dir_path_405b / "f16_torch_sdpa"
         output_mlir = self.create_file(suffix=".mlir", prefix=output_file_name)
@@ -653,7 +676,9 @@ def testBenchmark405B_f16_Non_Decomposed(self):
 
     @longrun
     @is_mi300x
-    @pytest.mark.xfail(reason="405B fp8 irpa path not stored yet", strict=True)
+    @pytest.mark.xfail(
+        reason="Test not yet implemented", strict=True, raises=AttributeError
+    )
     def testBenchmark405B_fp8_Decomposed(self):
         output_file_name = self.dir_path_405b / "fp8_decomposed"
         output_mlir = self.create_file(suffix=".mlir", prefix=output_file_name)
@@ -695,7 +720,9 @@ def testBenchmark405B_fp8_Decomposed(self):
 
     @longrun
     @is_mi300x
-    @pytest.mark.xfail(reason="torch_sdpa not yet plumbed through", strict=True)
+    @pytest.mark.xfail(
+        reason="Test not yet implemented", strict=True, raises=AttributeError
+    )
     def testBenchmark405B_fp8_Non_Decomposed(self):
         output_file_name = self.dir_path_405b / "fp8_torch_sdpa"
         output_mlir = self.create_file(suffix=".mlir", prefix=output_file_name)
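Note: adding raises= tightens these xfail markers. With strict=True, a test now only counts as xfailed when it fails with the named exception type; an unexpected pass or a different exception fails the run, so a change in failure mode is caught instead of silently absorbed. A toy illustration of the semantics (ToyCompileError is hypothetical and stands in for IreeCompileException):

import pytest


class ToyCompileError(Exception):
    pass


@pytest.mark.xfail(reason="Compile Error", strict=True, raises=ToyCompileError)
def test_fails_for_the_expected_reason():
    # Raising any other exception type here would be reported as a real
    # failure rather than an expected one, because of raises=ToyCompileError.
    raise ToyCompileError("mirrors IreeCompileException in the real suite")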