diff --git a/.github/workflows/ci-llama.yaml b/.github/workflows/ci-llama.yaml
index 999c3fbcc..e9098316f 100644
--- a/.github/workflows/ci-llama.yaml
+++ b/.github/workflows/ci-llama.yaml
@@ -76,7 +76,13 @@ jobs:
"numpy<2.0"
- name: Run llama test
- run: pytest sharktank/tests/models/llama/benchmark_amdgpu_test.py -v -s --longrun --iree-hip-target=gfx942
+ run: pytest sharktank/tests/models/llama/benchmark_amdgpu_test.py -v -s --longrun --iree-hip-target=gfx942 --html=out/index.html
+
+ - name: Deploy to GitHub Pages
+ uses: peaceiris/actions-gh-pages@v3
+ with:
+ github_token: ${{ secrets.SHARK_PLATFORM_GH_TOKEN }}
+ publish_dir: ./out
- name: Upload llama executable files
uses: actions/upload-artifact@v4
diff --git a/sharktank/conftest.py b/sharktank/conftest.py
index afbc93e46..e46466959 100644
--- a/sharktank/conftest.py
+++ b/sharktank/conftest.py
@@ -256,3 +256,24 @@ def get_iree_flags(request: FixtureRequest):
    model_path["iree_hal_target_backends"] = set_fixture_from_cli_option(
        request, "--iree-hal-target-backends", "iree_hal_target_backends"
    )
+
+
+# The following three functions allow us to add a "XFail Reason" column to the html reports for each test
+def pytest_html_results_table_header(cells):
+    cells.insert(2, "<th>XFail Reason</th>")
+
+
+def pytest_html_results_table_row(report, cells):
+    if hasattr(report, "wasxfail"):
+        cells.insert(2, f"<td>{report.wasxfail}</td>")
+    else:
+        cells.insert(2, f"<td></td>")
+
+
+@pytest.hookimpl(hookwrapper=True)
+def pytest_runtest_makereport(item, call):
+    outcome = yield
+    report = outcome.get_result()
+
+    if report.when == "call" and hasattr(item, "wasxfail"):
+        report.wasxfail = item.wasxfail
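
As an aside, here is a minimal sketch (not part of this change; the test file below is hypothetical) of how these hooks surface a reason once pytest-html is installed:

# illustrative_test.py -- hypothetical file, only to show the report flow
import pytest


@pytest.mark.xfail(reason="Compile Error", strict=True, raises=RuntimeError)
def test_known_compile_failure():
    raise RuntimeError("simulated failure")


# Running e.g. `pytest illustrative_test.py --html=out/index.html` should mark the
# test as XFAIL; pytest stores the reason on report.wasxfail, and the two table
# hooks above render it in the new "XFail Reason" column of the HTML report.
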
diff --git a/sharktank/requirements-tests.txt b/sharktank/requirements-tests.txt
index 4be48fdde..d5b4b0c0e 100644
--- a/sharktank/requirements-tests.txt
+++ b/sharktank/requirements-tests.txt
@@ -1,3 +1,4 @@
datasets==3.0.0
parameterized
pytest==8.0.0
+pytest-html
diff --git a/sharktank/sharktank/utils/export_artifacts.py b/sharktank/sharktank/utils/export_artifacts.py
index 84c206d7a..dac0ee21e 100644
--- a/sharktank/sharktank/utils/export_artifacts.py
+++ b/sharktank/sharktank/utils/export_artifacts.py
@@ -24,6 +24,63 @@
)
+class ExportMlirException(Exception):
+    """SHARK-Platform export MLIR exception that preserves the command line and error output."""
+
+    def __init__(self, process: subprocess.CompletedProcess, cwd: str):
+        try:
+            errs = process.stderr.decode("utf-8")
+        except:
+            errs = str(process.stderr)
+        super().__init__(
+            f"Error invoking export_paged_llm_v1.py\n"
+            f"Error code: {process.returncode}\n"
+            f"Stderr diagnostics:\n{errs}\n\n"
+            f"Invoked with:\n"
+            f" cd {cwd} && {process.args}\n\n"
+        )
+
+
+class IreeCompileException(Exception):
+    """Compiler exception that preserves the command line and error output."""
+
+    def __init__(self, process: subprocess.CompletedProcess, cwd: str):
+        try:
+            errs = process.stderr.decode("utf-8")
+        except:
+            errs = str(process.stderr)
+        super().__init__(
+            f"Error invoking iree-compile\n"
+            f"Error code: {process.returncode}\n"
+            f"Stderr diagnostics:\n{errs}\n\n"
+            f"Invoked with:\n"
+            f" cd {cwd} && {process.args}\n\n"
+        )
+
+
+class IreeBenchmarkException(Exception):
+    """Runtime exception that preserves the command line and error output."""
+
+    def __init__(self, process: subprocess.CompletedProcess, cwd: str):
+        # iree-run-module sends output to both stdout and stderr
+        try:
+            errs = process.stderr.decode("utf-8")
+        except:
+            errs = str(process.stderr)
+        try:
+            outs = process.stdout.decode("utf-8")
+        except:
+            outs = str(process.stdout)
+        super().__init__(
+            f"Error invoking iree-benchmark-module\n"
+            f"Error code: {process.returncode}\n"
+            f"Stderr diagnostics:\n{errs}\n"
+            f"Stdout diagnostics:\n{outs}\n"
+            f"Run with:\n"
+            f" cd {cwd} && {process.args}\n\n"
+        )
+
+
class ExportArtifacts:
def __init__(
self,
@@ -127,37 +184,27 @@ def export_to_mlir(
        proc = subprocess.run(cmd, shell=True, capture_output=True, cwd=cwd)
        if proc.returncode != 0:
-            logger.error(
-                f"Error exporting mlir with export_paged_llm_v1.py\n"
-                f"{proc.stdout+proc.stderr}"
-            )
+            raise ExportMlirException(proc, cwd)
        else:
            logger.info(f"Exported to mlir successfully:\n" f"{proc.stdout}")
        return proc.returncode
    @timeit
-    def compile_to_vmfb(
-        self,
-        *,
-        mlir_path,
-        vmfb_path,
-        hal_dump_path,
-    ):
-        # TODO: Control flag to enable multiple backends
+    def compile_to_vmfb(self, *, mlir_path, vmfb_path, hal_dump_path, cwd):
        compile_flags = ["--iree-hip-target=" + self.iree_hip_target]
+        compile_flags += ["--iree-hal-target-backends=rocm"]
        compile_flags += [f"--iree-hal-dump-executable-files-to={hal_dump_path}/files"]
-        try:
-            ireec.compile_file(
-                input_file=mlir_path,
-                target_backends=[self.iree_hal_target_backends],
-                extra_args=compile_flags,
-                output_file=vmfb_path,
-            )
-        except Exception as error:
-            logger.error(f"Error running iree-compile:\n" f"{error}")
-        else:
-            logger.info(f"Compiled to vmfb successfully:\n" f"{vmfb_path}")
+        cmd = self.get_compile_cmd(
+            output_mlir_path=mlir_path,
+            output_vmfb_path=vmfb_path,
+            args=compile_flags,
+        )
+        logging.getLogger().info(f"Launching compile command:\n" f"cd {cwd} && {cmd}")
+        proc = subprocess.run(cmd, shell=True, capture_output=True, cwd=cwd)
+        return_code = proc.returncode
+        if return_code != 0:
+            raise IreeCompileException(proc, cwd)
    def iree_benchmark_vmfb(
        self,
@@ -193,13 +240,22 @@ def iree_benchmark_vmfb(
        proc = subprocess.run(cmd, shell=True, stdout=sys.stdout, cwd=cwd)
        return_code = proc.returncode
        if return_code != 0:
-            raise RuntimeError(f"Error running benchmark {cmd} in cwd {cwd}")
+            raise IreeBenchmarkException(proc, cwd)
    def create_file(self, *, suffix, prefix):
        file_path = Path(prefix).with_suffix(suffix)
        f = open(file_path, "w")
        return file_path
+    def get_compile_cmd(
+        self, *, output_mlir_path: str, output_vmfb_path: str, args: [str]
+    ):
+        compile_args = ["iree-compile", output_mlir_path]
+        compile_args += args
+        compile_args += ["-o", output_vmfb_path]
+        cmd = subprocess.list2cmdline(compile_args)
+        return cmd
+
    def get_artifacts(self):
        self.dir_path = self.sharktank_dir + "/" + "tmp_perplexity_ci_artifacts/"
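
A rough usage sketch of the new exception classes (assuming a checkout where sharktank is importable; the failing command here is a stand-in for the real iree-compile invocation):

# sketch only: demonstrate the diagnostics an IreeCompileException carries
import subprocess

from sharktank.utils.export_artifacts import IreeCompileException

proc = subprocess.run(
    "ls /definitely/not/a/real/path",  # placeholder command that fails
    shell=True,
    capture_output=True,
    cwd="/tmp",
)
if proc.returncode != 0:
    exc = IreeCompileException(proc, "/tmp")
    # The message embeds the return code, the decoded stderr, and the exact
    # "cd /tmp && <command>" line needed to reproduce the failure by hand.
    print(exc)
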
diff --git a/sharktank/tests/models/llama/benchmark_amdgpu_test.py b/sharktank/tests/models/llama/benchmark_amdgpu_test.py
index c99bbc7e1..d9757f8ab 100644
--- a/sharktank/tests/models/llama/benchmark_amdgpu_test.py
+++ b/sharktank/tests/models/llama/benchmark_amdgpu_test.py
@@ -13,7 +13,12 @@
import subprocess
from pathlib import Path
from typing import List
-from sharktank.utils.export_artifacts import ExportArtifacts
+from sharktank.utils.export_artifacts import (
+    ExportArtifacts,
+    ExportMlirException,
+    IreeBenchmarkException,
+    IreeCompileException,
+)
longrun = pytest.mark.skipif("not config.getoption('longrun')")
is_mi300x = pytest.mark.skipif("config.getoption('iree_hip_target') != 'gfx942'")
@@ -136,6 +141,7 @@ def testBenchmark8B_f16_Decomposed(self):
            mlir_path=str(output_mlir),
            vmfb_path=output_vmfb,
            hal_dump_path=output_file_name,
+            cwd=self.repo_root,
        )
# benchmark prefill
self.llama8b_f16_artifacts.iree_benchmark_vmfb(
@@ -156,7 +162,7 @@ def testBenchmark8B_f16_Decomposed(self):
    @longrun
    @is_mi300x
-    @pytest.mark.xfail(reason="torch_sdpa not yet plumbed through", strict=True)
+    @pytest.mark.xfail(reason="Compile Error", strict=True, raises=IreeCompileException)
def testBenchmark8B_f16_Non_Decomposed(self):
output_file_name = self.dir_path_8b / "f16_torch"
output_mlir = self.llama8b_f16_artifacts.create_file(
@@ -187,6 +193,7 @@ def testBenchmark8B_f16_Non_Decomposed(self):
            mlir_path=str(output_mlir),
            vmfb_path=output_vmfb,
            hal_dump_path=output_file_name,
+            cwd=self.repo_root,
        )
# benchmark prefill
self.llama8b_f16_artifacts.iree_benchmark_vmfb(
@@ -207,7 +214,9 @@ def testBenchmark8B_f16_Non_Decomposed(self):
    @longrun
    @is_mi300x
-    @pytest.mark.xfail(reason="8B fp8 irpa path not stored yet", strict=True)
+    @pytest.mark.xfail(
+        reason="Test not yet implemented", strict=True, raises=AttributeError
+    )
def testBenchmark8B_fp8_Decomposed(self):
output_file_name = self.dir_path_8b / "fp8_decomposed"
output_mlir = self.create_file(suffix=".mlir", prefix=output_file_name)
@@ -249,7 +258,9 @@ def testBenchmark8B_fp8_Decomposed(self):
    @longrun
    @is_mi300x
-    @pytest.mark.xfail(reason="torch_sdpa not yet plumbed through", strict=True)
+    @pytest.mark.xfail(
+        reason="Test not yet implemented", strict=True, raises=AttributeError
+    )
def testBenchmark8B_fp8_Non_Decomposed(self):
output_file_name = self.dir_path_8b / "fp8_torch_sdpa"
output_mlir = self.create_file(suffix=".mlir", prefix=output_file_name)
@@ -346,7 +357,9 @@ def setUp(self):
    @longrun
    @is_mi300x
-    @pytest.mark.xfail(reason="70b f16 irpa path not stored yet", strict=True)
+    @pytest.mark.xfail(
+        reason="Test not yet implemented", strict=True, raises=AttributeError
+    )
def testBenchmark70B_f16_Decomposed(self):
output_file_name = self.dir_path_70b / "f16_decomposed"
output_mlir = self.create_file(suffix=".mlir", prefix=output_file_name)
@@ -388,7 +401,9 @@ def testBenchmark70B_f16_Decomposed(self):
    @longrun
    @is_mi300x
-    @pytest.mark.xfail(reason="torch_sdpa not yet plumbed through", strict=True)
+    @pytest.mark.xfail(
+        reason="Test not yet implemented", strict=True, raises=AttributeError
+    )
def testBenchmark70B_f16_Non_Decomposed(self):
output_file_name = self.dir_path_70b / "f16_torch_sdpa"
output_mlir = self.create_file(suffix=".mlir", prefix=output_file_name)
@@ -430,7 +445,9 @@ def testBenchmark70B_f16_Non_Decomposed(self):
    @longrun
    @is_mi300x
-    @pytest.mark.xfail(reason="70B fp8 irpa path not stored yet", strict=True)
+    @pytest.mark.xfail(
+        reason="Test not yet implemented", strict=True, raises=AttributeError
+    )
def testBenchmark70B_fp8_Decomposed(self):
output_file_name = self.dir_path_70b / "fp8_decomposed"
output_mlir = self.create_file(suffix=".mlir", prefix=output_file_name)
@@ -472,7 +489,9 @@ def testBenchmark70B_fp8_Decomposed(self):
    @longrun
    @is_mi300x
-    @pytest.mark.xfail(reason="torch_sdpa not yet plumbed through", strict=True)
+    @pytest.mark.xfail(
+        reason="Test not yet implemented", strict=True, raises=AttributeError
+    )
def testBenchmark70B_fp8_Non_Decomposed(self):
output_file_name = self.dir_path_70b / "fp8_torch_sdpa"
output_mlir = self.create_file(suffix=".mlir", prefix=output_file_name)
@@ -569,7 +588,9 @@ def setUp(self):
    @longrun
    @is_mi300x
-    @pytest.mark.xfail(reason="405B f16 irpa path not stored yet", strict=True)
+    @pytest.mark.xfail(
+        reason="Test not yet implemented", strict=True, raises=AttributeError
+    )
def testBenchmark405B_f16_Decomposed(self):
output_file_name = self.dir_path_405b / "f16_decomposed"
output_mlir = self.create_file(suffix=".mlir", prefix=output_file_name)
@@ -611,7 +632,9 @@ def testBenchmark405B_f16_Decomposed(self):
    @longrun
    @is_mi300x
-    @pytest.mark.xfail(reason="torch_sdpa not yet plumbed through", strict=True)
+    @pytest.mark.xfail(
+        reason="Test not yet implemented", strict=True, raises=AttributeError
+    )
def testBenchmark405B_f16_Non_Decomposed(self):
output_file_name = self.dir_path_405b / "f16_torch_sdpa"
output_mlir = self.create_file(suffix=".mlir", prefix=output_file_name)
@@ -653,7 +676,9 @@ def testBenchmark405B_f16_Non_Decomposed(self):
    @longrun
    @is_mi300x
-    @pytest.mark.xfail(reason="405B fp8 irpa path not stored yet", strict=True)
+    @pytest.mark.xfail(
+        reason="Test not yet implemented", strict=True, raises=AttributeError
+    )
def testBenchmark405B_fp8_Decomposed(self):
output_file_name = self.dir_path_405b / "fp8_decomposed"
output_mlir = self.create_file(suffix=".mlir", prefix=output_file_name)
@@ -695,7 +720,9 @@ def testBenchmark405B_fp8_Decomposed(self):
    @longrun
    @is_mi300x
-    @pytest.mark.xfail(reason="torch_sdpa not yet plumbed through", strict=True)
+    @pytest.mark.xfail(
+        reason="Test not yet implemented", strict=True, raises=AttributeError
+    )
def testBenchmark405B_fp8_Non_Decomposed(self):
output_file_name = self.dir_path_405b / "fp8_torch_sdpa"
output_mlir = self.create_file(suffix=".mlir", prefix=output_file_name)
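
A closing note on the raises= arguments introduced above: with strict=True, pytest records an XFAIL only when the test raises the named exception type, and reports any other error as a genuine failure, so a change in failure mode no longer hides behind the xfail. A toy illustration (the exception class here is a stand-in, not the real IreeCompileException):

import pytest


class FakeCompileError(Exception):
    """Stand-in for IreeCompileException in this illustration."""


@pytest.mark.xfail(reason="Compile Error", strict=True, raises=FakeCompileError)
def test_expected_failure_mode():
    raise FakeCompileError("known issue")  # reported as XFAIL


@pytest.mark.xfail(reason="Compile Error", strict=True, raises=FakeCompileError)
def test_changed_failure_mode():
    raise ValueError("regression elsewhere")  # reported as FAILED, not XFAIL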