diff --git a/.github/workflows/ci-llama.yaml b/.github/workflows/ci-llama.yaml
index 999c3fbcc..e9098316f 100644
--- a/.github/workflows/ci-llama.yaml
+++ b/.github/workflows/ci-llama.yaml
@@ -76,7 +76,13 @@ jobs:
           "numpy<2.0"
 
       - name: Run llama test
-        run: pytest sharktank/tests/models/llama/benchmark_amdgpu_test.py -v -s --longrun --iree-hip-target=gfx942
+        run: pytest sharktank/tests/models/llama/benchmark_amdgpu_test.py -v -s --longrun --iree-hip-target=gfx942 --html=out/index.html
+
+      - name: Deploy to GitHub Pages
+        uses: peaceiris/actions-gh-pages@v3
+        with:
+          github_token: ${{ secrets.SHARK_PLATFORM_GH_TOKEN }}
+          publish_dir: ./out
 
       - name: Upload llama executable files
         uses: actions/upload-artifact@v4
diff --git a/sharktank/conftest.py b/sharktank/conftest.py
index afbc93e46..e46466959 100644
--- a/sharktank/conftest.py
+++ b/sharktank/conftest.py
@@ -256,3 +256,24 @@ def get_iree_flags(request: FixtureRequest):
     model_path["iree_hal_target_backends"] = set_fixture_from_cli_option(
         request, "--iree-hal-target-backends", "iree_hal_target_backends"
     )
+
+
+# The following three functions allow us to add an "XFail Reason" column to the HTML report for each test
+def pytest_html_results_table_header(cells):
+    cells.insert(2, "XFail Reason")
+
+
+def pytest_html_results_table_row(report, cells):
+    if hasattr(report, "wasxfail"):
+        cells.insert(2, f"{report.wasxfail}")
+    else:
+        cells.insert(2, f"")
+
+
+@pytest.hookimpl(hookwrapper=True)
+def pytest_runtest_makereport(item, call):
+    outcome = yield
+    report = outcome.get_result()
+
+    if report.when == "call" and hasattr(item, "wasxfail"):
+        report.wasxfail = item.wasxfail
diff --git a/sharktank/requirements-tests.txt b/sharktank/requirements-tests.txt
index 4be48fdde..d5b4b0c0e 100644
--- a/sharktank/requirements-tests.txt
+++ b/sharktank/requirements-tests.txt
@@ -1,3 +1,4 @@
 datasets==3.0.0
 parameterized
 pytest==8.0.0
+pytest-html
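Note: the three conftest.py hooks above work together. The makereport wrapper copies the xfail reason onto the test report, and the two pytest_html_* hooks render it as an extra table column. A minimal, hypothetical test file to sanity-check the column (the file name and reason string are illustrative; assumes pytest-html is installed):

# demo_xfail.py -- run `pytest demo_xfail.py --html=out/index.html`; the
# "XFail Reason" column should show this reason string on the test's row.
import pytest


@pytest.mark.xfail(reason="demonstrates the XFail Reason column", strict=True)
def test_expected_failure():
    raise RuntimeError("intentional failure so the xfail is recorded")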
diff --git a/sharktank/sharktank/utils/export_artifacts.py b/sharktank/sharktank/utils/export_artifacts.py
index 84c206d7a..dac0ee21e 100644
--- a/sharktank/sharktank/utils/export_artifacts.py
+++ b/sharktank/sharktank/utils/export_artifacts.py
@@ -24,6 +24,63 @@
 )
 
 
+class ExportMlirException(Exception):
+    """SHARK-Platform export MLIR exception that preserves the command line and error output."""
+
+    def __init__(self, process: subprocess.CompletedProcess, cwd: str):
+        try:
+            errs = process.stderr.decode("utf-8")
+        except Exception:
+            errs = str(process.stderr)
+        super().__init__(
+            f"Error invoking export_paged_llm_v1.py\n"
+            f"Error code: {process.returncode}\n"
+            f"Stderr diagnostics:\n{errs}\n\n"
+            f"Invoked with:\n"
+            f"  cd {cwd} && {process.args}\n\n"
+        )
+
+
+class IreeCompileException(Exception):
+    """Compiler exception that preserves the command line and error output."""
+
+    def __init__(self, process: subprocess.CompletedProcess, cwd: str):
+        try:
+            errs = process.stderr.decode("utf-8")
+        except Exception:
+            errs = str(process.stderr)
+        super().__init__(
+            f"Error invoking iree-compile\n"
+            f"Error code: {process.returncode}\n"
+            f"Stderr diagnostics:\n{errs}\n\n"
+            f"Invoked with:\n"
+            f"  cd {cwd} && {process.args}\n\n"
+        )
+
+
+class IreeBenchmarkException(Exception):
+    """Runtime exception that preserves the command line and error output."""
+
+    def __init__(self, process: subprocess.CompletedProcess, cwd: str):
+        # iree-benchmark-module sends output to both stdout and stderr
+        try:
+            errs = process.stderr.decode("utf-8")
+        except Exception:
+            errs = str(process.stderr)
+        try:
+            outs = process.stdout.decode("utf-8")
+        except Exception:
+            outs = str(process.stdout)
+        super().__init__(
+            f"Error invoking iree-benchmark-module\n"
+            f"Error code: {process.returncode}\n"
+            f"Stderr diagnostics:\n{errs}\n"
+            f"Stdout diagnostics:\n{outs}\n"
+            f"Run with:\n"
+            f"  cd {cwd} && {process.args}\n\n"
+        )
+
+
 class ExportArtifacts:
     def __init__(
         self,
@@ -127,37 +184,27 @@ def export_to_mlir(
 
         proc = subprocess.run(cmd, shell=True, capture_output=True, cwd=cwd)
         if proc.returncode != 0:
-            logger.error(
-                f"Error exporting mlir with export_paged_llm_v1.py\n"
-                f"{proc.stdout+proc.stderr}"
-            )
+            raise ExportMlirException(proc, cwd)
         else:
             logger.info(f"Exported to mlir successfully:\n" f"{proc.stdout}")
         return proc.returncode
 
     @timeit
-    def compile_to_vmfb(
-        self,
-        *,
-        mlir_path,
-        vmfb_path,
-        hal_dump_path,
-    ):
-        # TODO: Control flag to enable multiple backends
+    def compile_to_vmfb(self, *, mlir_path, vmfb_path, hal_dump_path, cwd):
         compile_flags = ["--iree-hip-target=" + self.iree_hip_target]
+        compile_flags += ["--iree-hal-target-backends=rocm"]
         compile_flags += [f"--iree-hal-dump-executable-files-to={hal_dump_path}/files"]
-        try:
-            ireec.compile_file(
-                input_file=mlir_path,
-                target_backends=[self.iree_hal_target_backends],
-                extra_args=compile_flags,
-                output_file=vmfb_path,
-            )
-        except Exception as error:
-            logger.error(f"Error running iree-compile:\n" f"{error}")
-        else:
-            logger.info(f"Compiled to vmfb successfully:\n" f"{vmfb_path}")
+        cmd = self.get_compile_cmd(
+            output_mlir_path=mlir_path,
+            output_vmfb_path=vmfb_path,
+            args=compile_flags,
+        )
+        logging.getLogger().info(f"Launching compile command:\n" f"cd {cwd} && {cmd}")
+        proc = subprocess.run(cmd, shell=True, capture_output=True, cwd=cwd)
+        return_code = proc.returncode
+        if return_code != 0:
+            raise IreeCompileException(proc, cwd)
 
     def iree_benchmark_vmfb(
         self,
@@ -193,13 +240,22 @@ def iree_benchmark_vmfb(
         proc = subprocess.run(cmd, shell=True, stdout=sys.stdout, cwd=cwd)
         return_code = proc.returncode
         if return_code != 0:
-            raise RuntimeError(f"Error running benchmark {cmd} in cwd {cwd}")
+            raise IreeBenchmarkException(proc, cwd)
 
     def create_file(self, *, suffix, prefix):
         file_path = Path(prefix).with_suffix(suffix)
         f = open(file_path, "w")
         return file_path
 
+    def get_compile_cmd(
+        self, *, output_mlir_path: str, output_vmfb_path: str, args: list[str]
+    ):
+        compile_args = ["iree-compile", output_mlir_path]
+        compile_args += args
+        compile_args += ["-o", output_vmfb_path]
+        cmd = subprocess.list2cmdline(compile_args)
+        return cmd
+
     def get_artifacts(self):
         self.dir_path = self.sharktank_dir + "/" + "tmp_perplexity_ci_artifacts/"
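Note: the three new exception classes follow one pattern -- capture the subprocess output, decode it defensively, and embed the exact reproduction command in the message. A standalone sketch of that pattern (run_or_raise and the example arguments are hypothetical, not part of this change):

import subprocess


def run_or_raise(argv: list[str], cwd: str) -> None:
    # Build a single shell command string, mirroring get_compile_cmd above.
    cmd = subprocess.list2cmdline(argv)
    proc = subprocess.run(cmd, shell=True, capture_output=True, cwd=cwd)
    if proc.returncode != 0:
        # Decode defensively: stderr may be empty or undecodable bytes.
        stderr = proc.stderr.decode("utf-8", errors="replace") if proc.stderr else ""
        raise RuntimeError(
            f"Error code: {proc.returncode}\n"
            f"Stderr diagnostics:\n{stderr}\n"
            f"Reproduce with:\n  cd {cwd} && {cmd}"
        )


# e.g. run_or_raise(["iree-compile", "model.mlir", "-o", "model.vmfb"], cwd=".")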
diff --git a/sharktank/tests/models/llama/benchmark_amdgpu_test.py b/sharktank/tests/models/llama/benchmark_amdgpu_test.py
index c99bbc7e1..d9757f8ab 100644
--- a/sharktank/tests/models/llama/benchmark_amdgpu_test.py
+++ b/sharktank/tests/models/llama/benchmark_amdgpu_test.py
@@ -13,7 +13,12 @@
 import subprocess
 from pathlib import Path
 from typing import List
-from sharktank.utils.export_artifacts import ExportArtifacts
+from sharktank.utils.export_artifacts import (
+    ExportArtifacts,
+    ExportMlirException,
+    IreeBenchmarkException,
+    IreeCompileException,
+)
 
 longrun = pytest.mark.skipif("not config.getoption('longrun')")
 is_mi300x = pytest.mark.skipif("config.getoption('iree_hip_target') != 'gfx942'")
@@ -136,6 +141,7 @@ def testBenchmark8B_f16_Decomposed(self):
             mlir_path=str(output_mlir),
             vmfb_path=output_vmfb,
             hal_dump_path=output_file_name,
+            cwd=self.repo_root,
         )
         # benchmark prefill
         self.llama8b_f16_artifacts.iree_benchmark_vmfb(
@@ -156,7 +162,7 @@ def testBenchmark8B_f16_Decomposed(self):
 
     @longrun
     @is_mi300x
-    @pytest.mark.xfail(reason="torch_sdpa not yet plumbed through", strict=True)
+    @pytest.mark.xfail(reason="Compile Error", strict=True, raises=IreeCompileException)
     def testBenchmark8B_f16_Non_Decomposed(self):
         output_file_name = self.dir_path_8b / "f16_torch"
         output_mlir = self.llama8b_f16_artifacts.create_file(
@@ -187,6 +193,7 @@ def testBenchmark8B_f16_Non_Decomposed(self):
             mlir_path=str(output_mlir),
             vmfb_path=output_vmfb,
             hal_dump_path=output_file_name,
+            cwd=self.repo_root,
         )
         # benchmark prefill
         self.llama8b_f16_artifacts.iree_benchmark_vmfb(
@@ -207,7 +214,9 @@
 
     @longrun
     @is_mi300x
-    @pytest.mark.xfail(reason="8B fp8 irpa path not stored yet", strict=True)
+    @pytest.mark.xfail(
+        reason="Test not yet implemented", strict=True, raises=AttributeError
+    )
     def testBenchmark8B_fp8_Decomposed(self):
         output_file_name = self.dir_path_8b / "fp8_decomposed"
         output_mlir = self.create_file(suffix=".mlir", prefix=output_file_name)
@@ -249,7 +258,9 @@ def testBenchmark8B_fp8_Decomposed(self):
 
     @longrun
     @is_mi300x
-    @pytest.mark.xfail(reason="torch_sdpa not yet plumbed through", strict=True)
+    @pytest.mark.xfail(
+        reason="Test not yet implemented", strict=True, raises=AttributeError
+    )
     def testBenchmark8B_fp8_Non_Decomposed(self):
         output_file_name = self.dir_path_8b / "fp8_torch_sdpa"
         output_mlir = self.create_file(suffix=".mlir", prefix=output_file_name)
@@ -346,7 +357,9 @@ def setUp(self):
 
     @longrun
     @is_mi300x
-    @pytest.mark.xfail(reason="70b f16 irpa path not stored yet", strict=True)
+    @pytest.mark.xfail(
+        reason="Test not yet implemented", strict=True, raises=AttributeError
+    )
     def testBenchmark70B_f16_Decomposed(self):
         output_file_name = self.dir_path_70b / "f16_decomposed"
         output_mlir = self.create_file(suffix=".mlir", prefix=output_file_name)
@@ -388,7 +401,9 @@ def testBenchmark70B_f16_Decomposed(self):
 
     @longrun
     @is_mi300x
-    @pytest.mark.xfail(reason="torch_sdpa not yet plumbed through", strict=True)
+    @pytest.mark.xfail(
+        reason="Test not yet implemented", strict=True, raises=AttributeError
+    )
     def testBenchmark70B_f16_Non_Decomposed(self):
         output_file_name = self.dir_path_70b / "f16_torch_sdpa"
         output_mlir = self.create_file(suffix=".mlir", prefix=output_file_name)
@@ -430,7 +445,9 @@ def testBenchmark70B_f16_Non_Decomposed(self):
 
     @longrun
     @is_mi300x
-    @pytest.mark.xfail(reason="70B fp8 irpa path not stored yet", strict=True)
+    @pytest.mark.xfail(
+        reason="Test not yet implemented", strict=True, raises=AttributeError
+    )
     def testBenchmark70B_fp8_Decomposed(self):
         output_file_name = self.dir_path_70b / "fp8_decomposed"
         output_mlir = self.create_file(suffix=".mlir", prefix=output_file_name)
@@ -472,7 +489,9 @@ def testBenchmark70B_fp8_Decomposed(self):
 
     @longrun
     @is_mi300x
-    @pytest.mark.xfail(reason="torch_sdpa not yet plumbed through", strict=True)
+    @pytest.mark.xfail(
+        reason="Test not yet implemented", strict=True, raises=AttributeError
+    )
     def testBenchmark70B_fp8_Non_Decomposed(self):
         output_file_name = self.dir_path_70b / "fp8_torch_sdpa"
         output_mlir = self.create_file(suffix=".mlir", prefix=output_file_name)
@@ -569,7 +588,9 @@ def setUp(self):
 
     @longrun
     @is_mi300x
-    @pytest.mark.xfail(reason="405B f16 irpa path not stored yet", strict=True)
+    @pytest.mark.xfail(
+        reason="Test not yet implemented", strict=True, raises=AttributeError
+    )
     def testBenchmark405B_f16_Decomposed(self):
         output_file_name = self.dir_path_405b / "f16_decomposed"
         output_mlir = self.create_file(suffix=".mlir", prefix=output_file_name)
@@ -611,7 +632,9 @@ def testBenchmark405B_f16_Decomposed(self):
 
     @longrun
     @is_mi300x
-    @pytest.mark.xfail(reason="torch_sdpa not yet plumbed through", strict=True)
+    @pytest.mark.xfail(
+        reason="Test not yet implemented", strict=True, raises=AttributeError
+    )
     def testBenchmark405B_f16_Non_Decomposed(self):
         output_file_name = self.dir_path_405b / "f16_torch_sdpa"
         output_mlir = self.create_file(suffix=".mlir", prefix=output_file_name)
@@ -653,7 +676,9 @@ def testBenchmark405B_f16_Non_Decomposed(self):
 
     @longrun
     @is_mi300x
-    @pytest.mark.xfail(reason="405B fp8 irpa path not stored yet", strict=True)
+    @pytest.mark.xfail(
+        reason="Test not yet implemented", strict=True, raises=AttributeError
+    )
     def testBenchmark405B_fp8_Decomposed(self):
         output_file_name = self.dir_path_405b / "fp8_decomposed"
         output_mlir = self.create_file(suffix=".mlir", prefix=output_file_name)
@@ -695,7 +720,9 @@ def testBenchmark405B_fp8_Decomposed(self):
 
     @longrun
     @is_mi300x
-    @pytest.mark.xfail(reason="torch_sdpa not yet plumbed through", strict=True)
+    @pytest.mark.xfail(
+        reason="Test not yet implemented", strict=True, raises=AttributeError
+    )
     def testBenchmark405B_fp8_Non_Decomposed(self):
         output_file_name = self.dir_path_405b / "fp8_torch_sdpa"
         output_mlir = self.create_file(suffix=".mlir", prefix=output_file_name)
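Note: adding raises= tightens these xfail markers. With strict=True, a test now only counts as xfailed when it fails with the named exception type; an unexpected pass or a different exception fails the run, so a change in failure mode is caught instead of silently absorbed. A toy illustration of the semantics (ToyCompileError is hypothetical and stands in for IreeCompileException):

import pytest


class ToyCompileError(Exception):
    pass


@pytest.mark.xfail(reason="Compile Error", strict=True, raises=ToyCompileError)
def test_fails_for_the_expected_reason():
    # Raising any other exception type here would be reported as a real
    # failure rather than an expected one, because of raises=ToyCompileError.
    raise ToyCompileError("mirrors IreeCompileException in the real suite")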