diff --git a/.gitignore b/.gitignore deleted file mode 100644 index 82f9275..0000000 --- a/.gitignore +++ /dev/null @@ -1,162 +0,0 @@ -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[cod] -*$py.class - -# C extensions -*.so - -# Distribution / packaging -.Python -build/ -develop-eggs/ -dist/ -downloads/ -eggs/ -.eggs/ -lib/ -lib64/ -parts/ -sdist/ -var/ -wheels/ -share/python-wheels/ -*.egg-info/ -.installed.cfg -*.egg -MANIFEST - -# PyInstaller -# Usually these files are written by a python script from a template -# before PyInstaller builds the exe, so as to inject date/other infos into it. -*.manifest -*.spec - -# Installer logs -pip-log.txt -pip-delete-this-directory.txt - -# Unit test / coverage reports -htmlcov/ -.tox/ -.nox/ -.coverage -.coverage.* -.cache -nosetests.xml -coverage.xml -*.cover -*.py,cover -.hypothesis/ -.pytest_cache/ -cover/ - -# Translations -*.mo -*.pot - -# Django stuff: -*.log -local_settings.py -db.sqlite3 -db.sqlite3-journal - -# Flask stuff: -instance/ -.webassets-cache - -# Scrapy stuff: -.scrapy - -# Sphinx documentation -docs/_build/ - -# PyBuilder -.pybuilder/ -target/ - -# Jupyter Notebook -.ipynb_checkpoints - -# IPython -profile_default/ -ipython_config.py - -# pyenv -# For a library or package, you might want to ignore these files since the code is -# intended to run in multiple environments; otherwise, check them in: -# .python-version - -# pipenv -# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. -# However, in case of collaboration, if having platform-specific dependencies or dependencies -# having no cross-platform support, pipenv may install dependencies that don't work, or not -# install all needed dependencies. -#Pipfile.lock - -# poetry -# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. -# This is especially recommended for binary packages to ensure reproducibility, and is more -# commonly ignored for libraries. -# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control -#poetry.lock - -# pdm -# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. -#pdm.lock -# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it -# in version control. -# https://pdm.fming.dev/latest/usage/project/#working-with-version-control -.pdm.toml -.pdm-python -.pdm-build/ - -# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm -__pypackages__/ - -# Celery stuff -celerybeat-schedule -celerybeat.pid - -# SageMath parsed files -*.sage.py - -# Environments -.env -.venv -env/ -venv/ -ENV/ -env.bak/ -venv.bak/ - -# Spyder project settings -.spyderproject -.spyproject - -# Rope project settings -.ropeproject - -# mkdocs documentation -/site - -# mypy -.mypy_cache/ -.dmypy.json -dmypy.json - -# Pyre type checker -.pyre/ - -# pytype static type analyzer -.pytype/ - -# Cython debug symbols -cython_debug/ - -# PyCharm -# JetBrains specific template is maintained in a separate JetBrains.gitignore that can -# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore -# and can be added to the global gitignore or merged into this file. For a more nuclear -# option (not recommended) you can uncomment the following to ignore the entire idea folder. 
-#.idea/
diff --git a/Deepspeed-MII/README.md b/Deepspeed-MII/README.md
deleted file mode 100644
index 6a1d524..0000000
--- a/Deepspeed-MII/README.md
+++ /dev/null
@@ -1,2 +0,0 @@
-# Deepspeed-MII
-Deepspeed-MII
diff --git a/InferenceGraphPlotter/README.md b/InferenceGraphPlotter/README.md
deleted file mode 100644
index 182ea8e..0000000
--- a/InferenceGraphPlotter/README.md
+++ /dev/null
@@ -1,8 +0,0 @@
-# InferenceGraphPlotter
-
-## How to run?
-1. Clone the repo and cd into it
-2. Spin up a simple webserver to serve the files. One way is by using Python.
-   - for python 2: python -m SimpleHTTPServer
-   - for python 3: python -m http.server
-3. Open a web browser and go to http://localhost:8000
\ No newline at end of file
diff --git a/README.md b/README.md
index db4a354..182ea8e 100644
--- a/README.md
+++ b/README.md
@@ -1,29 +1,8 @@
-# LLaMA-Inference-Bench
-
-LLM-Inference-Bench: Inference Benchmarking of Large Language Models on AI Accelerators
-
-## Matrix of Evaluated Frameworks and Hardware:
-
-| Framework/ Hardware | NVIDIA A100 | NVIDIA H100 | NVIDIA GH200 | AMD MI250 | Intel PVC | Habana Gaudi2 | Sambanova SN40L |
-|:-----------------------:|:---------------:|:---------------:|:------------:|:---------:|:---------:|:-------------:|:---------------:|
-| [vLLM](./vLLM/README.md) | [Link]() | [Link]() | Yes | [Link]() | [Link]() | No | N/A |
-| [llama.cpp](./llama.cpp/README.md) | [Link]() | [Link]() | Yes | [Link]() | [Link]() | N/A | N/A |
-| [TensorRT-LLM](./TensorRT-LLM/README.md) | [Link]() | [Link]() | [Link]() | N/A | N/A | N/A | N/A |
-| [DeepSpeed-MII](./Deepspeed-MII/README.md) | No | No | No | No | No | [Link]() | N/A |
-
-## Key Insights
-
-
- Cite this work:
- ```
- @INPROCEEDINGS{####,
- author={Krishna Teja Chitty-Venkata and Siddhisanket Raskar and Bharat Kale and Farah Ferdaus and Aditya Tanikanti and Ken Raffenetti and Valerie Taylor and Murali Emani and Venkatram Vishwanath},
- booktitle={2024 IEEE/ACM International Workshop on Performance Modeling, Benchmarking and Simulation of High Performance Computer Systems (PMBS)},
- title={LLM-Inference-Bench: Inference Benchmarking of Large Language Models on AI Accelerators},
- year={2024},
- volume={},
- number={},
- pages={},
- keywords={Large Language Models, AI Accelerators, Performance Evaluation, Benchmarking},
- doi={}}
- ```
+# InferenceGraphPlotter
+
+## How to run?
+1. Clone the repo and cd into it
+2. Spin up a simple webserver to serve the files. One way is by using Python.
+   - for python 2: python -m SimpleHTTPServer
+   - for python 3: python -m http.server
+3. Open a web browser and go to http://localhost:8000
\ No newline at end of file
diff --git a/TensorRT-LLM/A100/README.MD b/TensorRT-LLM/A100/README.MD
deleted file mode 100644
index e69de29..0000000
diff --git a/TensorRT-LLM/GH200/README.MD b/TensorRT-LLM/GH200/README.MD
deleted file mode 100644
index e69de29..0000000
diff --git a/TensorRT-LLM/H100/README.MD b/TensorRT-LLM/H100/README.MD
deleted file mode 100644
index 1c2d0c6..0000000
--- a/TensorRT-LLM/H100/README.MD
+++ /dev/null
@@ -1,56 +0,0 @@
-# TRT-LLM on H100
-
-1. Set up a Virtual Environment
-
-   ```bash
-   module use /soft/modulefiles/
-   module load conda
-   module load openmpi/4.1.1-nvhpc
-
-   conda create -n TensorRT_LLM python=3.10
-   conda activate TensorRT_LLM
-   conda install -c conda-forge mpi4py openmpi
-
-   ```
-
-2. Install Dependencies
-   ```bash
-   git clone https://github.com/NVIDIA/TensorRT-LLM.git
-
-   cd TensorRT-LLM
-   cd examples/llama/
-
-   MPICC=$(which mpicc) MPICXX=$(which mpicxx) pip install -r requirements.txt
-   ```
-
-3. Run a single Benchmark
-   ```bash
-
-   export dir_1=
-   export dir_2=
-   export dir_3=
-
-   python convert_checkpoint.py --tp_size=1 --model_dir=$dir_1 --output_dir=$dir_2 --dtype=float16
-
-   trtllm-build --checkpoint_dir=$dir_2 --output_dir=$dir_3 --gemm_plugin=float16 --max_batch_size=1 --max_input_len=128 --max_output_len=128
-
-   python3 ../run.py --model_name="mistral_7b" --tokenizer_dir=$dir_1 --engine_dir=$dir_3 --max_output_len=128 --max_input_length=$input_output_length --run_profiling --batch_size=1
-   ```
-
-4. Replace or copy the files `run_power.py`, `run_precision_bench.py`, `utils.py`, and `run.py` from this directory into the cloned TensorRT-LLM directory.
-
-5. Run benchmarks.
-Use `p-llama2-7b.sh` to run the power benchmarks.
-Use `q-llama2-7b.sh` to run the precision benchmarks.
-
-
-
-
-
-
-
-
-
-
-
-
-
diff --git a/TensorRT-LLM/H100/p-llama2-7b.sh b/TensorRT-LLM/H100/p-llama2-7b.sh
deleted file mode 100755
index 2b846ca..0000000
--- a/TensorRT-LLM/H100/p-llama2-7b.sh
+++ /dev/null
@@ -1,32 +0,0 @@
-export HF_TOKEN="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"
-export HF_HOME="/vast/users/sraskar/mi250/hf/hub"
-export HF_DATASETS_CACHE="/vast/users/sraskar/mi250/hf/hub"
-
-pip install pynvml==11.5.0
-pip install pydantic-core==2.18.1
-pip install psutil
-pip install py3nvml
-
-cd /vast/users/sraskar/h100/llm_research/tensorRT/new/TensorRT-LLM/examples/llama/
-
-model_name="meta-llama/Llama-2-7b-hf"
-dir_1="/vast/users/sraskar/mi250/hf/hub/models--meta-llama--Llama-2-7b-hf/snapshots/01c7f73d771dfac7d292323805ebc428287df4f9"
-dir_2="/vast/users/sraskar/h100/llm_research/tensorRT/model_weights/TensorRT_weights/trt_weights/Llama2-7b"
-dir_3="/vast/users/sraskar/h100/llm_research/tensorRT/model_weights/TensorRT_weights/trt_binaries/Llama2-7b"
-
-
-
-
-for tensor_parallel in 1; do
-  for precision in "float16"; do
-    rm -rf $dir_2/*
-    rm -rf $dir_3/*
-    python convert_checkpoint.py --workers=64 --tp_size=$tensor_parallel --model_dir=$dir_1 --output_dir=$dir_2 --dtype=$precision
-    for batch_size in 1 16 32 64; do
-      for input_output_length in 1024; do
-        trtllm-build --workers=64 --tp_size=$tensor_parallel --checkpoint_dir=$dir_2 --output_dir=$dir_3 --gemm_plugin=$precision --gpt_attention_plugin=$precision --max_batch_size=$batch_size --max_input_len=$input_output_length
-        mpirun -np $tensor_parallel python3 ../run_power.py --model_name=$model_name --tp_size=$tensor_parallel --tokenizer_dir=$dir_1 --engine_dir=$dir_3 --max_output_len=$input_output_length --max_input_length=$input_output_length --run_profiling --batch_size=$batch_size
-      done
-    done
-  done
-done
\ No newline at end of file
diff --git a/TensorRT-LLM/H100/q-llama2-7b.sh b/TensorRT-LLM/H100/q-llama2-7b.sh
deleted file mode 100755
index 60249ce..0000000
--- a/TensorRT-LLM/H100/q-llama2-7b.sh
+++ /dev/null
@@ -1,37 +0,0 @@
-export HF_TOKEN="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"
-export HF_HOME="/vast/users/sraskar/mi250/hf/hub"
-export HF_DATASETS_CACHE="/vast/users/sraskar/mi250/hf/hub"
-
-pip install pynvml==11.5.0
-pip install pydantic-core==2.18.1
-# pip install psutil
-pip install psutil==5.9.8
-
-pip install pydantic==2.7.0
-pip install regex==2024.5.15
-
-cd /vast/users/sraskar/h100/llm_research/tensorRT/new/TensorRT-LLM/examples/llama/
-
-model_name="meta-llama/Llama-2-7b-hf"
-dir_1="/vast/users/sraskar/mi250/hf/hub/models--meta-llama--Llama-2-7b-hf/snapshots/01c7f73d771dfac7d292323805ebc428287df4f9" -# dir_2="/vast/users/sraskar/h100/llm_research/tensorRT/model_weights/TensorRT_weights/trt_weights/Llama2-7b" -# dir_3="/vast/users/sraskar/h100/llm_research/tensorRT/model_weights/TensorRT_weights/trt_binaries/Llama2-7b" -dir_2="." -dir_3="." - - -for tensor_parallel in 1; do - for precision in "full_prec" "int8_sq" "int4_awq"; do - for kv_cache_precision in "int8" "fp8"; do - # rm -rf $dir_2/* - # rm -rf $dir_3/* - python ../quantization/quantize.py --model_dir $dir_1 --dtype float16 --qformat $precision --kv_cache_dtype $kv_cache_precision --output_dir $dir_2 --calib_size 10 --tp_size $tensor_parallel --batch_size=1 - for batch_size in 1 16 32 64; do - for input_output_length in 1024; do - trtllm-build --workers=48 --tp_size=$tensor_parallel --checkpoint_dir=$dir_2 --output_dir=$dir_3 --max_batch_size=$batch_size --max_input_len=$input_output_length - mpirun -np $tensor_parallel python3 ../run_precision.py --qformat $precision --kv_cache_dtype $kv_cache_precision --model_name=$model_name --tp_size=$tensor_parallel --tokenizer_dir=$dir_1 --engine_dir=$dir_3 --max_output_len=$input_output_length --max_input_length=$input_output_length --run_profiling --batch_size=$batch_size - done - done - done - done -done \ No newline at end of file diff --git a/TensorRT-LLM/H100/run.py b/TensorRT-LLM/H100/run.py deleted file mode 100644 index f1994d3..0000000 --- a/TensorRT-LLM/H100/run.py +++ /dev/null @@ -1,550 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import argparse -import ast -import csv -import os -from pathlib import Path - -import numpy as np -import torch -from utils import (DEFAULT_HF_MODEL_DIRS, DEFAULT_PROMPT_TEMPLATES, - add_common_args, load_tokenizer, read_decoder_start_token_id, - read_model_name, supports_inflight_batching, - throttle_generator) - -import tensorrt_llm -import tensorrt_llm.profiler -from tensorrt_llm.logger import logger -from tensorrt_llm.runtime import PYTHON_BINDINGS, ModelRunner - -if PYTHON_BINDINGS: - from tensorrt_llm.runtime import ModelRunnerCpp - - -def parse_arguments(args=None): - # see `add_common_args` for extended list of arguments - parser = argparse.ArgumentParser() - - parser.add_argument('--pp_size', type=int, default = 1) - parser.add_argument('--tp_size', type=int, default = 1) - parser.add_argument('--moe_ep_size', type=int, default = 1) - parser.add_argument('--moe_tp_size', type=int, default = 1) - parser.add_argument('--model_name', type=str, required=True) - parser.add_argument('--batch_size', type=int, default=1, help='Batch Size') - parser.add_argument('--precision', type=str, default="float16", help="precision") - parser.add_argument('--int8_kv_cache', default=False, action='store_true', help="Int8 KV Cache.") - - parser.add_argument('--qformat', type=str, default="float16", help="precision") - parser.add_argument('--kv_cache_dtype', type=str, default="float16", help="precision") - - parser.add_argument('--max_input_length', type=int, default=923) - parser.add_argument('--max_output_len', type=int, required=True) - parser.add_argument( - '--input_text', - type=str, - nargs='+', - default=["Born in north-east France, Soyer trained as a"]) - parser.add_argument( - '--input_file', - type=str, - help= - 'CSV or Numpy file containing tokenized input. Alternative to text input.', - default=None) - parser.add_argument('--output_csv', - type=str, - help='CSV file where the tokenized output is stored.', - default=None) - parser.add_argument('--output_npy', - type=str, - help='Numpy file where the tokenized output is stored.', - default=None) - parser.add_argument( - '--output_logits_npy', - type=str, - help= - 'Numpy file where the generation logits are stored. 
Use only when num_beams==1', - default=None) - parser.add_argument('--output_log_probs_npy', - type=str, - help='Numpy file where the log_probs are stored', - default=None) - parser.add_argument('--output_cum_log_probs_npy', - type=str, - help='Numpy file where the cum_log_probs are stored', - default=None) - parser.add_argument( - '--run_profiling', - default=False, - action='store_true', - help="Run several 10 iterations to profile the inference latencies.") - parser = add_common_args(parser) - - return parser.parse_args(args=args) - - -def parse_input(tokenizer, - input_text=None, - prompt_template=None, - input_file=None, - add_special_tokens=True, - max_input_length=923, - pad_id=None, - num_prepend_vtokens=[], - model_name=None, - model_version=None): - if pad_id is None: - pad_id = tokenizer.pad_token_id - - batch_input_ids = [] - if input_file is None: - for curr_text in input_text: - if prompt_template is not None: - curr_text = prompt_template.format(input_text=curr_text) - input_ids = tokenizer.encode(curr_text, - add_special_tokens=add_special_tokens, - truncation=True, - max_length=max_input_length) - batch_input_ids.append(input_ids) - else: - if input_file.endswith('.csv'): - with open(input_file, 'r') as csv_file: - csv_reader = csv.reader(csv_file, delimiter=',') - for line in csv_reader: - input_ids = np.array(line, dtype='int32') - batch_input_ids.append(input_ids[-max_input_length:]) - elif input_file.endswith('.npy'): - inputs = np.load(input_file) - for row in inputs: - input_ids = row[row != pad_id] - batch_input_ids.append(input_ids[-max_input_length:]) - elif input_file.endswith('.txt'): - with open(input_file, 'r', encoding='utf-8', - errors='replace') as txt_file: - input_text = txt_file.readlines() - batch_input_ids = tokenizer( - input_text, - add_special_tokens=add_special_tokens, - truncation=True, - max_length=max_input_length)["input_ids"] - else: - print('Input file format not supported.') - raise SystemExit - - if num_prepend_vtokens: - assert len(num_prepend_vtokens) == len(batch_input_ids) - base_vocab_size = tokenizer.vocab_size - len( - tokenizer.special_tokens_map.get('additional_special_tokens', [])) - for i, length in enumerate(num_prepend_vtokens): - batch_input_ids[i] = list( - range(base_vocab_size, - base_vocab_size + length)) + batch_input_ids[i] - - if input_file is None and 'GLM' in model_name and model_version == 'glm': - for ids in batch_input_ids: - ids.append(tokenizer.sop_token_id) - - batch_input_ids = [ - torch.tensor(x, dtype=torch.int32) for x in batch_input_ids - ] - return batch_input_ids - - -def print_output(tokenizer, - output_ids, - input_lengths, - sequence_lengths, - output_csv=None, - output_npy=None, - context_logits=None, - generation_logits=None, - cum_log_probs=None, - log_probs=None, - output_logits_npy=None, - output_cum_log_probs_npy=None, - output_log_probs_npy=None): - batch_size, num_beams, _ = output_ids.size() - if output_csv is None and output_npy is None: - for batch_idx in range(batch_size): - inputs = output_ids[batch_idx][0][:input_lengths[batch_idx]].tolist( - ) - input_text = tokenizer.decode(inputs) - print(f'Input [Text {batch_idx}]: \"{input_text}\"') - for beam in range(num_beams): - output_begin = input_lengths[batch_idx] - output_end = sequence_lengths[batch_idx][beam] - outputs = output_ids[batch_idx][beam][ - output_begin:output_end].tolist() - output_text = tokenizer.decode(outputs) - print( - f'Output [Text {batch_idx} Beam {beam}]: \"{output_text}\"') - - output_ids = output_ids.reshape((-1, 
output_ids.size(2))) - if output_csv is not None: - output_file = Path(output_csv) - output_file.parent.mkdir(exist_ok=True, parents=True) - outputs = output_ids.tolist() - with open(output_file, 'w') as csv_file: - writer = csv.writer(csv_file, delimiter=',') - writer.writerows(outputs) - - if output_npy is not None: - output_file = Path(output_npy) - output_file.parent.mkdir(exist_ok=True, parents=True) - outputs = np.array(output_ids.cpu().contiguous(), dtype='int32') - np.save(output_file, outputs) - - # Save context logits - if context_logits is not None and output_logits_npy is not None: - context_logits = torch.cat(context_logits, axis=0) - vocab_size_padded = context_logits.shape[-1] - context_logits = context_logits.reshape([1, -1, vocab_size_padded]) - - output_context_logits_npy = output_logits_npy.split( - '.npy')[0] + "_context" - output_context_logits_file = Path(output_context_logits_npy) - context_outputs = np.array( - context_logits.squeeze(0).cpu().contiguous(), - dtype='float32') # [promptLengthSum, vocabSize] - np.save(output_context_logits_file, context_outputs) - - # Save generation logits - if generation_logits is not None and output_logits_npy is not None and num_beams == 1: - output_generation_logits_npy = output_logits_npy.split( - '.npy')[0] + "_generation" - output_generation_logits_file = Path(output_generation_logits_npy) - generation_outputs = np.array(generation_logits.cpu().contiguous(), - dtype='float32') - np.save(output_generation_logits_file, generation_outputs) - - # Save cum log probs - if cum_log_probs is not None and output_cum_log_probs_npy is not None: - cum_log_probs_file = Path(output_cum_log_probs_npy) - cum_log_probs_outputs = np.array(cum_log_probs.cpu().contiguous(), - dtype='float32') - np.save(cum_log_probs_file, cum_log_probs_outputs) - - # Save cum log probs - if log_probs is not None and output_log_probs_npy is not None: - log_probs_file = Path(output_log_probs_npy) - log_probs_outputs = np.array(log_probs.cpu().contiguous(), - dtype='float32') - np.save(log_probs_file, log_probs_outputs) - - -def main(args): - runtime_rank = tensorrt_llm.mpi_rank() - logger.set_level(args.log_level) - - # different handling if encoder-decoder models - is_enc_dec = { - name - for name in os.listdir(args.engine_dir) - if os.path.isdir(os.path.join(args.engine_dir, name)) - } == {'encoder', 'decoder'} - if is_enc_dec: - logger.warning( - "This path is an encoder-decoder model. Using different handling.") - assert not args.use_py_session, "Encoder-decoder models don't have a unified python runtime, please use its own examples/enc_dec/run.py instead." - - model_name, model_version = read_model_name( - args.engine_dir) if not is_enc_dec else ("", "") - if args.tokenizer_dir is None and model_name in DEFAULT_HF_MODEL_DIRS: - logger.warning( - "tokenizer_dir is not specified. Try to infer from model_name, but this may be incorrect." 
- ) - args.tokenizer_dir = DEFAULT_HF_MODEL_DIRS[model_name] - - tokenizer, pad_id, end_id = load_tokenizer( - tokenizer_dir=args.tokenizer_dir, - vocab_file=args.vocab_file, - model_name=model_name, - model_version=model_version, - tokenizer_type=args.tokenizer_type, - ) - - if args.end_id: - end_id = args.end_id - - stop_words_list = None - if args.stop_words: - stop_words_list = tensorrt_llm.runtime.decode_words_list( - args.stop_words, tokenizer) - - bad_words_list = None - if args.bad_words: - bad_words_list = tensorrt_llm.runtime.decode_words_list( - args.bad_words, tokenizer) - - prompt_template = None - if args.use_prompt_template and model_name in DEFAULT_PROMPT_TEMPLATES: - prompt_template = DEFAULT_PROMPT_TEMPLATES[model_name] - batch_input_ids = parse_input(tokenizer=tokenizer, - input_text=args.input_text, - prompt_template=prompt_template, - input_file=args.input_file, - add_special_tokens=args.add_special_tokens, - max_input_length=args.max_input_length, - pad_id=pad_id, - num_prepend_vtokens=args.num_prepend_vtokens, - model_name=model_name, - model_version=model_version) - - if is_enc_dec: - encoder_input_ids = batch_input_ids - decoder_start_token_id = read_decoder_start_token_id( - os.path.join(args.engine_dir, "decoder")) - decoder_input_ids = [ - torch.tensor([decoder_start_token_id], dtype=torch.int32) - for _ in batch_input_ids - ] - - input_lengths = [x.size(0) for x in decoder_input_ids - ] if is_enc_dec else [x.size(0) for x in batch_input_ids] - encoder_input_lengths = [x.size(0) - for x in encoder_input_ids] if is_enc_dec else None - - if not args.use_py_session and not supports_inflight_batching( - os.path.join(args.engine_dir, "decoder") if is_enc_dec else args. - engine_dir): - logger.warning( - "The given engine does not support in-flight batching, fallback to python session" - ) - args.use_py_session = True - - if not PYTHON_BINDINGS and not args.use_py_session: - logger.warning( - "Python bindings of C++ session is unavailable, fallback to Python session." - ) - args.use_py_session = True - if args.debug_mode and not args.use_py_session: - logger.warning( - "Debug mode is not supported in C++ session for now, fallback to Python session." - ) - args.use_py_session = True - if args.return_all_generated_tokens and args.use_py_session: - raise ValueError( - "Returning all the generated tokens at each step is not supported in the Python session, use C++ session instead." - ) - if (not args.return_all_generated_tokens) and args.streaming and ( - args.num_beams > 1): - logger.warning( - "Setting return_all_generated_tokens to True since streaming AND beam search are done simultaneously. " - "Returning the full beams at each streaming step is needed because beam search + streaming can change previous outputs. " - "WARNING: using this option may increase network usage significantly (quadratically w.r.t output length)." 
- ) - args.return_all_generated_tokens = True - runner_cls = ModelRunner if args.use_py_session else ModelRunnerCpp - runner_kwargs = dict( - engine_dir=args.engine_dir, - lora_dir=args.lora_dir, - rank=runtime_rank, - debug_mode=args.debug_mode, - lora_ckpt_source=args.lora_ckpt_source, - gpu_weights_percent=args.gpu_weights_percent, - ) - if not args.use_py_session: - runner_kwargs.update(is_enc_dec=is_enc_dec) - if args.medusa_choices is not None: - args.medusa_choices = ast.literal_eval(args.medusa_choices) - assert args.temperature == 1.0, "Medusa should use temperature == 1.0" - assert args.num_beams == 1, "Medusa should use num_beams == 1" - runner_kwargs.update(medusa_choices=args.medusa_choices) - if not args.use_py_session: - runner_kwargs.update( - max_batch_size=len(batch_input_ids), - max_input_len=max( - encoder_input_lengths if is_enc_dec else input_lengths), - max_output_len=args.max_output_len, - max_beam_width=args.num_beams, - max_attention_window_size=args.max_attention_window_size, - sink_token_length=args.sink_token_length, - max_tokens_in_paged_kv_cache=args.max_tokens_in_paged_kv_cache, - kv_cache_enable_block_reuse=args.kv_cache_enable_block_reuse, - kv_cache_free_gpu_memory_fraction=args. - kv_cache_free_gpu_memory_fraction, - enable_chunked_context=args.enable_chunked_context, - multi_block_mode=args.multi_block_mode) - runner = runner_cls.from_dir(**runner_kwargs) - - with torch.no_grad(): - outputs = runner.generate( - batch_input_ids=decoder_input_ids - if is_enc_dec else batch_input_ids, - encoder_input_ids=encoder_input_ids if is_enc_dec else None, - max_new_tokens=args.max_output_len, - max_attention_window_size=args.max_attention_window_size, - sink_token_length=args.sink_token_length, - end_id=end_id, - pad_id=pad_id, - temperature=args.temperature, - top_k=args.top_k, - top_p=args.top_p, - num_beams=args.num_beams, - length_penalty=args.length_penalty, - early_stopping=args.early_stopping, - repetition_penalty=args.repetition_penalty, - presence_penalty=args.presence_penalty, - frequency_penalty=args.frequency_penalty, - stop_words_list=stop_words_list, - bad_words_list=bad_words_list, - output_cum_log_probs=(args.output_cum_log_probs_npy != None), - output_log_probs=(args.output_log_probs_npy != None), - random_seed=args.random_seed, - lora_uids=args.lora_task_uids, - prompt_table=args.prompt_table_path, - prompt_tasks=args.prompt_tasks, - streaming=args.streaming, - output_sequence_lengths=True, - no_repeat_ngram_size=args.no_repeat_ngram_size, - return_dict=True, - medusa_choices=args.medusa_choices, - return_all_generated_tokens=args.return_all_generated_tokens) - torch.cuda.synchronize() - - if args.streaming: - for curr_outputs in throttle_generator(outputs, - args.streaming_interval): - if runtime_rank == 0: - output_ids = curr_outputs['output_ids'] - sequence_lengths = curr_outputs['sequence_lengths'] - cum_log_probs = None - log_probs = None - if args.output_cum_log_probs_npy != None: - cum_log_probs = outputs['cum_log_probs'] - if args.output_log_probs_npy != None: - log_probs = outputs['log_probs'] - print_output( - tokenizer, - output_ids, - input_lengths, - sequence_lengths, - output_csv=args.output_csv, - output_npy=args.output_npy, - cum_log_probs=cum_log_probs, - log_probs=log_probs, - output_cum_log_probs_npy=args.output_cum_log_probs_npy, - output_log_probs_npy=args.output_log_probs_npy) - else: - if runtime_rank == 0: - output_ids = outputs['output_ids'] - sequence_lengths = outputs['sequence_lengths'] - context_logits = None - 
generation_logits = None - cum_log_probs = None - log_probs = None - if runner.gather_context_logits: - context_logits = outputs['context_logits'] - if runner.gather_generation_logits: - generation_logits = outputs['generation_logits'] - if args.output_cum_log_probs_npy != None: - cum_log_probs = outputs['cum_log_probs'] - if args.output_log_probs_npy != None: - log_probs = outputs['log_probs'] - print_output(tokenizer, - output_ids, - input_lengths, - sequence_lengths, - output_csv=args.output_csv, - output_npy=args.output_npy, - context_logits=context_logits, - generation_logits=generation_logits, - output_logits_npy=args.output_logits_npy, - cum_log_probs=cum_log_probs, - log_probs=log_probs, - output_cum_log_probs_npy=args.output_cum_log_probs_npy, - output_log_probs_npy=args.output_log_probs_npy) - - if args.run_profiling: - ite = 1 - # warmup - for _ in range(ite): - with torch.no_grad(): - outputs = runner.generate( - batch_input_ids, - max_new_tokens=args.max_output_len, - max_attention_window_size=args.max_attention_window_size, - end_id=end_id, - pad_id=pad_id, - temperature=args.temperature, - top_k=args.top_k, - top_p=args.top_p, - num_beams=args.num_beams, - length_penalty=args.length_penalty, - early_stopping=args.early_stopping, - repetition_penalty=args.repetition_penalty, - presence_penalty=args.presence_penalty, - frequency_penalty=args.frequency_penalty, - stop_words_list=stop_words_list, - bad_words_list=bad_words_list, - output_cum_log_probs=(args.output_cum_log_probs_npy != - None), - output_log_probs=(args.output_log_probs_npy != None), - random_seed=args.random_seed, - lora_uids=args.lora_task_uids, - prompt_table=args.prompt_table_path, - prompt_tasks=args.prompt_tasks, - streaming=args.streaming, - output_sequence_lengths=True, - return_dict=True, - return_all_generated_tokens=args.return_all_generated_tokens - ) - torch.cuda.synchronize() - - tensorrt_llm.profiler.start("tmp") - ite=1 - for _ in range(ite): - with torch.no_grad(): - outputs = runner.generate( - batch_input_ids, - max_new_tokens=args.max_output_len, - max_attention_window_size=args.max_attention_window_size, - end_id=end_id, - pad_id=pad_id, - temperature=args.temperature, - top_k=args.top_k, - top_p=args.top_p, - num_beams=args.num_beams, - length_penalty=args.length_penalty, - early_stopping=args.early_stopping, - repetition_penalty=args.repetition_penalty, - presence_penalty=args.presence_penalty, - frequency_penalty=args.frequency_penalty, - stop_words_list=stop_words_list, - bad_words_list=bad_words_list, - output_cum_log_probs=(args.output_cum_log_probs_npy != - None), - output_log_probs=(args.output_log_probs_npy != None), - random_seed=args.random_seed, - lora_uids=args.lora_task_uids, - prompt_table=args.prompt_table_path, - prompt_tasks=args.prompt_tasks, - streaming=args.streaming, - output_sequence_lengths=True, - return_dict=True, - return_all_generated_tokens=args.return_all_generated_tokens - ) - torch.cuda.synchronize() - tensorrt_llm.profiler.stop("tmp") - - print( - f"batch_size: {len(batch_input_ids)}, avg latency of {ite} iterations: : {tensorrt_llm.profiler.elapsed_time_in_sec('tmp') / ite} sec" - ) - - -if __name__ == '__main__': - args = parse_arguments() - main(args) diff --git a/TensorRT-LLM/H100/run_power.py b/TensorRT-LLM/H100/run_power.py deleted file mode 100644 index 9b3de62..0000000 --- a/TensorRT-LLM/H100/run_power.py +++ /dev/null @@ -1,619 +0,0 @@ - -# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
-# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import ast -import csv -import os -from pathlib import Path - -# from huggingface_hub import login -# login("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") - -from power_utils import gpuPowerProbe -power_profile = gpuPowerProbe(interval=0.10) - - -import numpy as np -import torch -from utils import (DEFAULT_HF_MODEL_DIRS, DEFAULT_PROMPT_TEMPLATES, - add_common_args, load_tokenizer, read_model_name, - throttle_generator) - -import tensorrt_llm -import tensorrt_llm.profiler -from tensorrt_llm.logger import logger -from tensorrt_llm.runtime import PYTHON_BINDINGS, ModelRunner - -if PYTHON_BINDINGS: - from tensorrt_llm.runtime import ModelRunnerCpp - - -def parse_arguments(args=None): - parser = argparse.ArgumentParser() - - parser.add_argument('--pp_size', type=int, default = 1) - parser.add_argument('--tp_size', type=int, default = 1) - parser.add_argument('--moe_ep_size', type=int, default = 1) - parser.add_argument('--moe_tp_size', type=int, default = 1) - parser.add_argument('--model_name', type=str, required=True) - parser.add_argument('--batch_size', type=int, default=1, help='Batch Size') - parser.add_argument('--precision', type=str, default="float16", help="precision") - parser.add_argument('--int8_kv_cache', default=False, action='store_true', help="Int8 KV Cache.") - - parser.add_argument('--max_input_length', type=int, default=923) - parser.add_argument('--max_output_len', type=int, required=True) - parser.add_argument( - '--input_text', - type=str, - nargs='+', - default=["Born in north-east France, Soyer trained as a"]) - parser.add_argument( - '--input_file', - type=str, - help= - 'CSV or Numpy file containing tokenized input. Alternative to text input.', - default=None) - parser.add_argument('--output_csv', - type=str, - help='CSV file where the tokenized output is stored.', - default=None) - parser.add_argument('--output_npy', - type=str, - help='Numpy file where the tokenized output is stored.', - default=None) - parser.add_argument( - '--output_logits_npy', - type=str, - help= - 'Numpy file where the generation logits are stored. 
Use only when num_beams==1', - default=None) - parser.add_argument('--output_log_probs_npy', - type=str, - help='Numpy file where the log_probs are stored', - default=None) - parser.add_argument('--output_cum_log_probs_npy', - type=str, - help='Numpy file where the cum_log_probs are stored', - default=None) - parser.add_argument( - '--run_profiling', - default=False, - action='store_true', - help="Run several 10 iterations to profile the inference latencies.") - parser = add_common_args(parser) - - return parser.parse_args(args=args) - - -def parse_input(tokenizer, - input_text=None, - prompt_template=None, - input_file=None, - add_special_tokens=True, - max_input_length=923, - pad_id=None, - num_prepend_vtokens=[], - model_name=None, - model_version=None): - if pad_id is None: - pad_id = tokenizer.pad_token_id - - batch_input_ids = [] - if input_file is None: - for curr_text in input_text: - if prompt_template is not None: - curr_text = prompt_template.format(input_text=curr_text) - input_ids = tokenizer.encode(curr_text, - add_special_tokens=add_special_tokens, - truncation=True, - max_length=max_input_length) - batch_input_ids.append(input_ids) - else: - if input_file.endswith('.csv'): - with open(input_file, 'r') as csv_file: - csv_reader = csv.reader(csv_file, delimiter=',') - for line in csv_reader: - input_ids = np.array(line, dtype='int32') - batch_input_ids.append(input_ids[-max_input_length:]) - elif input_file.endswith('.npy'): - inputs = np.load(input_file) - for row in inputs: - input_ids = row[row != pad_id] - batch_input_ids.append(input_ids[-max_input_length:]) - elif input_file.endswith('.txt'): - with open(input_file, 'r', encoding='utf-8', - errors='replace') as txt_file: - input_text = txt_file.readlines() - batch_input_ids = tokenizer( - input_text, - add_special_tokens=add_special_tokens, - truncation=True, - max_length=max_input_length)["input_ids"] - else: - print('Input file format not supported.') - raise SystemExit - - if num_prepend_vtokens: - assert len(num_prepend_vtokens) == len(batch_input_ids) - base_vocab_size = tokenizer.vocab_size - len( - tokenizer.special_tokens_map.get('additional_special_tokens', [])) - for i, length in enumerate(num_prepend_vtokens): - batch_input_ids[i] = list( - range(base_vocab_size, - base_vocab_size + length)) + batch_input_ids[i] - - if model_name == 'ChatGLMForCausalLM' and model_version == 'glm': - for ids in batch_input_ids: - ids.append(tokenizer.sop_token_id) - - batch_input_ids = [ - torch.tensor(x, dtype=torch.int32) for x in batch_input_ids - ] - return batch_input_ids - - -def print_output(tokenizer, - output_ids, - input_lengths, - sequence_lengths, - output_csv=None, - output_npy=None, - context_logits=None, - generation_logits=None, - cum_log_probs=None, - log_probs=None, - output_logits_npy=None, - output_cum_log_probs_npy=None, - output_log_probs_npy=None): - batch_size, num_beams, _ = output_ids.size() - if output_csv is None and output_npy is None: - for batch_idx in range(batch_size): - inputs = output_ids[batch_idx][0][:input_lengths[batch_idx]].tolist( - ) - input_text = tokenizer.decode(inputs) - print(f'Input [Text {batch_idx}]: \"{input_text}\"') - for beam in range(num_beams): - output_begin = input_lengths[batch_idx] - output_end = sequence_lengths[batch_idx][beam] - outputs = output_ids[batch_idx][beam][ - output_begin:output_end].tolist() - output_text = tokenizer.decode(outputs) - print( - f'Output [Text {batch_idx} Beam {beam}]: \"{output_text}\"') - - output_ids = output_ids.reshape((-1, 
output_ids.size(2))) - if output_csv is not None: - output_file = Path(output_csv) - output_file.parent.mkdir(exist_ok=True, parents=True) - outputs = output_ids.tolist() - with open(output_file, 'w') as csv_file: - writer = csv.writer(csv_file, delimiter=',') - writer.writerows(outputs) - - if output_npy is not None: - output_file = Path(output_npy) - output_file.parent.mkdir(exist_ok=True, parents=True) - outputs = np.array(output_ids.cpu().contiguous(), dtype='int32') - np.save(output_file, outputs) - - # Save context logits - if context_logits is not None and output_logits_npy is not None: - context_logits = torch.cat(context_logits, axis=0) - vocab_size_padded = context_logits.shape[-1] - context_logits = context_logits.reshape([1, -1, vocab_size_padded]) - - output_context_logits_npy = output_logits_npy.split( - '.npy')[0] + "_context" - output_context_logits_file = Path(output_context_logits_npy) - context_outputs = np.array( - context_logits.squeeze(0).cpu().contiguous(), - dtype='float32') # [promptLengthSum, vocabSize] - np.save(output_context_logits_file, context_outputs) - - # Save generation logits - if generation_logits is not None and output_logits_npy is not None and num_beams == 1: - output_generation_logits_npy = output_logits_npy.split( - '.npy')[0] + "_generation" - output_generation_logits_file = Path(output_generation_logits_npy) - generation_outputs = np.array(generation_logits.cpu().contiguous(), - dtype='float32') - np.save(output_generation_logits_file, generation_outputs) - - # Save cum log probs - if cum_log_probs is not None and output_cum_log_probs_npy is not None: - cum_log_probs_file = Path(output_cum_log_probs_npy) - cum_log_probs_outputs = np.array(cum_log_probs.cpu().contiguous(), - dtype='float32') - np.save(cum_log_probs_file, cum_log_probs_outputs) - - # Save cum log probs - if log_probs is not None and output_log_probs_npy is not None: - log_probs_file = Path(output_log_probs_npy) - log_probs_outputs = np.array(log_probs.cpu().contiguous(), - dtype='float32') - np.save(log_probs_file, log_probs_outputs) - - -def main(args): - runtime_rank = tensorrt_llm.mpi_rank() - logger.set_level(args.log_level) - - # different handling if encoder-decoder models - import os - is_enc_dec = { - name - for name in os.listdir(args.engine_dir) - if os.path.isdir(os.path.join(args.engine_dir, name)) - } == {'encoder', 'decoder'} - if is_enc_dec: - logger.warning( - "This path is an encoder-decoder model. Using different handling.") - assert not args.use_py_session, "Encoder-decoder models don't have a unified python runtime, please use its own examples/enc_dec/run.py instead." - - model_name, model_version = read_model_name( - args.engine_dir) if not is_enc_dec else ("", "") - if args.tokenizer_dir is None: - logger.warning( - "tokenizer_dir is not specified. Try to infer from model_name, but this may be incorrect." 
- ) - args.tokenizer_dir = DEFAULT_HF_MODEL_DIRS[model_name] - - tokenizer, pad_id, end_id = load_tokenizer(args, - tokenizer_dir=args.tokenizer_dir, - vocab_file=args.vocab_file, - model_name=model_name, - model_version=model_version, - tokenizer_type=args.tokenizer_type, - ) - - stop_words_list = None - if args.stop_words: - stop_words_list = tensorrt_llm.runtime.decode_words_list( - args.stop_words, tokenizer) - - bad_words_list = None - if args.bad_words: - bad_words_list = tensorrt_llm.runtime.decode_words_list( - args.bad_words, tokenizer) - - prompt_template = None - if args.use_prompt_template and model_name in DEFAULT_PROMPT_TEMPLATES: - prompt_template = DEFAULT_PROMPT_TEMPLATES[model_name] - batch_input_ids = parse_input(tokenizer=tokenizer, - input_text=args.input_text, - prompt_template=prompt_template, - input_file=args.input_file, - add_special_tokens=args.add_special_tokens, - max_input_length=args.max_input_length, - pad_id=pad_id, - num_prepend_vtokens=args.num_prepend_vtokens, - model_name=model_name, - model_version=model_version) - - if is_enc_dec: - encoder_input_ids = batch_input_ids - decoder_input_ids = [ - torch.tensor([pad_id], dtype=torch.int32) for _ in batch_input_ids - ] # by default decoder_start_token_id for T5 - - input_lengths = [x.size(0) for x in decoder_input_ids - ] if is_enc_dec else [x.size(0) for x in batch_input_ids] - encoder_input_lengths = [x.size(0) - for x in encoder_input_ids] if is_enc_dec else None - - if not PYTHON_BINDINGS and not args.use_py_session: - logger.warning( - "Python bindings of C++ session is unavailable, fallback to Python session." - ) - args.use_py_session = True - if args.debug_mode and not args.use_py_session: - logger.warning( - "Debug mode is not supported in C++ session for now, fallback to Python session." - ) - args.use_py_session = True - runner_cls = ModelRunner if args.use_py_session else ModelRunnerCpp - runner_kwargs = dict( - engine_dir=args.engine_dir, - lora_dir=args.lora_dir, - rank=runtime_rank, - debug_mode=args.debug_mode, - lora_ckpt_source=args.lora_ckpt_source, - gpu_weights_percent=args.gpu_weights_percent, - ) - if not args.use_py_session: - runner_kwargs.update(is_enc_dec=is_enc_dec) - if args.medusa_choices is not None: - args.medusa_choices = ast.literal_eval(args.medusa_choices) - assert args.temperature == 1.0, "Medusa should use temperature == 1.0" - assert args.num_beams == 1, "Medusa should use num_beams == 1" - runner_kwargs.update(medusa_choices=args.medusa_choices) - if not args.use_py_session: - runner_kwargs.update( - max_batch_size=len(batch_input_ids), - max_input_len=max( - encoder_input_lengths if is_enc_dec else input_lengths), - max_output_len=args.max_output_len, - max_beam_width=args.num_beams, - max_attention_window_size=args.max_attention_window_size, - sink_token_length=args.sink_token_length, - max_tokens_in_paged_kv_cache=args.max_tokens_in_paged_kv_cache, - kv_cache_enable_block_reuse=args.kv_cache_enable_block_reuse, - kv_cache_free_gpu_memory_fraction=args. 
- kv_cache_free_gpu_memory_fraction, - enable_chunked_context=args.enable_chunked_context, - ) - runner = runner_cls.from_dir(**runner_kwargs) - - with torch.no_grad(): - outputs = runner.generate( - batch_input_ids=decoder_input_ids - if is_enc_dec else batch_input_ids, - encoder_input_ids=encoder_input_ids if is_enc_dec else None, - max_new_tokens=args.max_output_len, - max_attention_window_size=args.max_attention_window_size, - sink_token_length=args.sink_token_length, - end_id=end_id, - pad_id=pad_id, - temperature=args.temperature, - top_k=args.top_k, - top_p=args.top_p, - num_beams=args.num_beams, - length_penalty=args.length_penalty, - early_stopping=args.early_stopping, - repetition_penalty=args.repetition_penalty, - presence_penalty=args.presence_penalty, - frequency_penalty=args.frequency_penalty, - stop_words_list=stop_words_list, - bad_words_list=bad_words_list, - output_cum_log_probs=(args.output_cum_log_probs_npy != None), - output_log_probs=(args.output_log_probs_npy != None), - random_seed=args.random_seed, - lora_uids=args.lora_task_uids, - prompt_table=args.prompt_table_path, - prompt_tasks=args.prompt_tasks, - streaming=args.streaming, - output_sequence_lengths=True, - no_repeat_ngram_size=args.no_repeat_ngram_size, - return_dict=True, - medusa_choices=args.medusa_choices) - torch.cuda.synchronize() - - if args.streaming: - for curr_outputs in throttle_generator(outputs, - args.streaming_interval): - if runtime_rank == 0: - output_ids = curr_outputs['output_ids'] - sequence_lengths = curr_outputs['sequence_lengths'] - cum_log_probs = None - log_probs = None - if args.output_cum_log_probs_npy != None: - cum_log_probs = outputs['cum_log_probs'] - if args.output_log_probs_npy != None: - log_probs = outputs['log_probs'] - print_output( - tokenizer, - output_ids, - input_lengths, - sequence_lengths, - output_csv=args.output_csv, - output_npy=args.output_npy, - cum_log_probs=cum_log_probs, - log_probs=log_probs, - output_cum_log_probs_npy=args.output_cum_log_probs_npy, - output_log_probs_npy=args.output_log_probs_npy) - else: - if runtime_rank == 0: - output_ids = outputs['output_ids'] - sequence_lengths = outputs['sequence_lengths'] - context_logits = None - generation_logits = None - cum_log_probs = None - log_probs = None - if runner.gather_context_logits: - context_logits = outputs['context_logits'] - if runner.gather_generation_logits: - generation_logits = outputs['generation_logits'] - if args.output_cum_log_probs_npy != None: - cum_log_probs = outputs['cum_log_probs'] - if args.output_log_probs_npy != None: - log_probs = outputs['log_probs'] - print_output(tokenizer, - output_ids, - input_lengths, - sequence_lengths, - output_csv=args.output_csv, - output_npy=args.output_npy, - context_logits=context_logits, - generation_logits=generation_logits, - output_logits_npy=args.output_logits_npy, - cum_log_probs=cum_log_probs, - log_probs=log_probs, - output_cum_log_probs_npy=args.output_cum_log_probs_npy, - output_log_probs_npy=args.output_log_probs_npy) - - if args.run_profiling: - ite = 1 - # warmup - for _ in range(ite): - with torch.no_grad(): - outputs = runner.generate( - batch_input_ids, - max_new_tokens=args.max_output_len, - max_attention_window_size=args.max_attention_window_size, - end_id=end_id, - pad_id=pad_id, - temperature=args.temperature, - top_k=args.top_k, - top_p=args.top_p, - num_beams=args.num_beams, - length_penalty=args.length_penalty, - early_stopping=args.early_stopping, - repetition_penalty=args.repetition_penalty, - 
presence_penalty=args.presence_penalty, - frequency_penalty=args.frequency_penalty, - stop_words_list=stop_words_list, - bad_words_list=bad_words_list, - output_cum_log_probs=(args.output_cum_log_probs_npy != - None), - output_log_probs=(args.output_log_probs_npy != None), - random_seed=args.random_seed, - lora_uids=args.lora_task_uids, - prompt_table=args.prompt_table_path, - prompt_tasks=args.prompt_tasks, - streaming=args.streaming, - output_sequence_lengths=True, - return_dict=True) - torch.cuda.synchronize() - - # tensorrt_llm.profiler.start("tmp") - for _ in range(ite): - with torch.no_grad(): - power_profile.start() - outputs = runner.generate( - batch_input_ids, - max_new_tokens=args.max_output_len, - max_attention_window_size=args.max_attention_window_size, - end_id=end_id, - pad_id=pad_id, - temperature=args.temperature, - top_k=args.top_k, - top_p=args.top_p, - num_beams=args.num_beams, - length_penalty=args.length_penalty, - early_stopping=args.early_stopping, - repetition_penalty=args.repetition_penalty, - presence_penalty=args.presence_penalty, - frequency_penalty=args.frequency_penalty, - stop_words_list=stop_words_list, - bad_words_list=bad_words_list, - output_cum_log_probs=(args.output_cum_log_probs_npy != - None), - output_log_probs=(args.output_log_probs_npy != None), - random_seed=args.random_seed, - lora_uids=args.lora_task_uids, - prompt_table=args.prompt_table_path, - prompt_tasks=args.prompt_tasks, - streaming=args.streaming, - output_sequence_lengths=True, - return_dict=True) - training_powers, training_powers_time = power_profile.stop() - power_profile.destroy() - torch.cuda.synchronize() - - list_1 = ["Hardware", - "Num of Hardware", - "Framework", - "Model", - "Input Output Length", - "Batch Size", - "training_powers", - "training_powers_time" - ] - - list_2 = ["Nvidia A100 GPU", - args.tp_size, - "TensorRT-LLM", - args.model_name, - args.max_input_length, - args.batch_size, - list(training_powers), - list(training_powers_time) - ] - - assert len(list_1) == len(list_2) - - import csv, os - - def split_string(model_name): - if "/" in model_name: - return model_name.split("/")[-1] - else: - return model_name - # csv_file = "power_results.csv" - csv_file = "power_" + str(split_string(args.model_name)) + ".csv" - file_exists = os.path.exists(csv_file) - - if runtime_rank == 0: - with open(csv_file, 'a', newline = '') as csvfile: - writer = csv.writer(csvfile) - - if not file_exists: - writer.writerow(list_1) - - writer.writerow(list_2) - - csvfile.close() - - # tensorrt_llm.profiler.stop("tmp") - - # print( - # f"batch_size: {len(batch_input_ids)}, avg latency of {ite} iterations: : {tensorrt_llm.profiler.elapsed_time_in_sec('tmp') / ite} sec" - # ) - - # file_path = "/home/krishnat95/llama-bench/Inference/TensorRT-LLM/examples/batch_input.csv" - - # if runtime_rank == 0: - # with open(file_path, 'a', newline='') as file: - # writer = csv.writer(file) - # latency = tensorrt_llm.profiler.elapsed_time_in_sec('tmp') / ite - # throughput = (args.batch_size*(args.max_input_length + args.max_output_len))/latency - # data = [[args.model_name, str(args.tp_size), str(args.pp_size), str(args.moe_ep_size), str(args.moe_tp_size), str(args.precision), str(args.max_input_length), str(args.max_output_len), str(args.batch_size), str(args.int8_kv_cache), str(latency), str(throughput)]] - # writer.writerows(data) - - - - -import random -import string - - -def generate_random_word(length): - letters = string.ascii_letters - return ''.join(random.choice(letters) for i in 
range(length)) - -def generate_input(args): - random_words = ["France" for _ in range(args.max_input_length)] - - input_id = "" - - for word in random_words: - input_id = input_id + word + " " - - input_id = input_id[:-1] - - input_list = [] - - for batch_size in range(args.batch_size): - input_list.append(input_id) - - return input_list - - -if __name__ == '__main__': - args = parse_arguments() - args.input_text = generate_input(args) - main(args) - - - - - -power_profile.start() - -training_powers, training_powers_time = power_profile.stop() -power_profile.destroy() - diff --git a/TensorRT-LLM/H100/run_precision_bench.py b/TensorRT-LLM/H100/run_precision_bench.py deleted file mode 100644 index c8bc0f4..0000000 --- a/TensorRT-LLM/H100/run_precision_bench.py +++ /dev/null @@ -1,563 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import ast -import csv -import os -from pathlib import Path - -import numpy as np -import torch -from utils import (DEFAULT_HF_MODEL_DIRS, DEFAULT_PROMPT_TEMPLATES, - add_common_args, load_tokenizer, read_model_name, - throttle_generator) - -import tensorrt_llm -import tensorrt_llm.profiler -from tensorrt_llm.logger import logger -from tensorrt_llm.runtime import PYTHON_BINDINGS, ModelRunner - -if PYTHON_BINDINGS: - from tensorrt_llm.runtime import ModelRunnerCpp - - -def parse_arguments(args=None): - parser = argparse.ArgumentParser() - - parser.add_argument('--pp_size', type=int, default = 1) - parser.add_argument('--tp_size', type=int, default = 1) - parser.add_argument('--moe_ep_size', type=int, default = 1) - parser.add_argument('--moe_tp_size', type=int, default = 1) - parser.add_argument('--model_name', type=str, required=True) - parser.add_argument('--batch_size', type=int, default=1, help='Batch Size') - parser.add_argument('--precision', type=str, default="float16", help="precision") - parser.add_argument('--int8_kv_cache', default=False, action='store_true', help="Int8 KV Cache.") - - parser.add_argument('--qformat', type=str, default="float16", help="precision") - parser.add_argument('--kv_cache_dtype', type=str, default="float16", help="precision") - - parser.add_argument('--max_input_length', type=int, default=923) - parser.add_argument('--max_output_len', type=int, required=True) - parser.add_argument( - '--input_text', - type=str, - nargs='+', - default=["Born in north-east France, Soyer trained as a"]) - parser.add_argument( - '--input_file', - type=str, - help= - 'CSV or Numpy file containing tokenized input. 
Alternative to text input.', - default=None) - parser.add_argument('--output_csv', - type=str, - help='CSV file where the tokenized output is stored.', - default=None) - parser.add_argument('--output_npy', - type=str, - help='Numpy file where the tokenized output is stored.', - default=None) - parser.add_argument( - '--output_logits_npy', - type=str, - help= - 'Numpy file where the generation logits are stored. Use only when num_beams==1', - default=None) - parser.add_argument('--output_log_probs_npy', - type=str, - help='Numpy file where the log_probs are stored', - default=None) - parser.add_argument('--output_cum_log_probs_npy', - type=str, - help='Numpy file where the cum_log_probs are stored', - default=None) - parser.add_argument( - '--run_profiling', - default=False, - action='store_true', - help="Run several 10 iterations to profile the inference latencies.") - parser = add_common_args(parser) - - return parser.parse_args(args=args) - - -def parse_input(tokenizer, - input_text=None, - prompt_template=None, - input_file=None, - add_special_tokens=True, - max_input_length=923, - pad_id=None, - num_prepend_vtokens=[], - model_name=None, - model_version=None): - if pad_id is None: - pad_id = tokenizer.pad_token_id - - batch_input_ids = [] - if input_file is None: - for curr_text in input_text: - if prompt_template is not None: - curr_text = prompt_template.format(input_text=curr_text) - input_ids = tokenizer.encode(curr_text, - add_special_tokens=add_special_tokens, - truncation=True, - max_length=max_input_length) - batch_input_ids.append(input_ids) - else: - if input_file.endswith('.csv'): - with open(input_file, 'r') as csv_file: - csv_reader = csv.reader(csv_file, delimiter=',') - for line in csv_reader: - input_ids = np.array(line, dtype='int32') - batch_input_ids.append(input_ids[-max_input_length:]) - elif input_file.endswith('.npy'): - inputs = np.load(input_file) - for row in inputs: - input_ids = row[row != pad_id] - batch_input_ids.append(input_ids[-max_input_length:]) - elif input_file.endswith('.txt'): - with open(input_file, 'r', encoding='utf-8', - errors='replace') as txt_file: - input_text = txt_file.readlines() - batch_input_ids = tokenizer( - input_text, - add_special_tokens=add_special_tokens, - truncation=True, - max_length=max_input_length)["input_ids"] - else: - print('Input file format not supported.') - raise SystemExit - - if num_prepend_vtokens: - assert len(num_prepend_vtokens) == len(batch_input_ids) - base_vocab_size = tokenizer.vocab_size - len( - tokenizer.special_tokens_map.get('additional_special_tokens', [])) - for i, length in enumerate(num_prepend_vtokens): - batch_input_ids[i] = list( - range(base_vocab_size, - base_vocab_size + length)) + batch_input_ids[i] - - if model_name == 'ChatGLMForCausalLM' and model_version == 'glm': - for ids in batch_input_ids: - ids.append(tokenizer.sop_token_id) - - batch_input_ids = [ - torch.tensor(x, dtype=torch.int32) for x in batch_input_ids - ] - return batch_input_ids - - -def print_output(tokenizer, - output_ids, - input_lengths, - sequence_lengths, - output_csv=None, - output_npy=None, - context_logits=None, - generation_logits=None, - cum_log_probs=None, - log_probs=None, - output_logits_npy=None, - output_cum_log_probs_npy=None, - output_log_probs_npy=None): - batch_size, num_beams, _ = output_ids.size() - if output_csv is None and output_npy is None: - for batch_idx in range(batch_size): - inputs = output_ids[batch_idx][0][:input_lengths[batch_idx]].tolist( - ) - input_text = tokenizer.decode(inputs) - 
print(f'Input [Text {batch_idx}]: \"{input_text}\"') - for beam in range(num_beams): - output_begin = input_lengths[batch_idx] - output_end = sequence_lengths[batch_idx][beam] - outputs = output_ids[batch_idx][beam][ - output_begin:output_end].tolist() - output_text = tokenizer.decode(outputs) - print( - f'Output [Text {batch_idx} Beam {beam}]: \"{output_text}\"') - - output_ids = output_ids.reshape((-1, output_ids.size(2))) - if output_csv is not None: - output_file = Path(output_csv) - output_file.parent.mkdir(exist_ok=True, parents=True) - outputs = output_ids.tolist() - with open(output_file, 'w') as csv_file: - writer = csv.writer(csv_file, delimiter=',') - writer.writerows(outputs) - - if output_npy is not None: - output_file = Path(output_npy) - output_file.parent.mkdir(exist_ok=True, parents=True) - outputs = np.array(output_ids.cpu().contiguous(), dtype='int32') - np.save(output_file, outputs) - - # Save context logits - if context_logits is not None and output_logits_npy is not None: - context_logits = torch.cat(context_logits, axis=0) - vocab_size_padded = context_logits.shape[-1] - context_logits = context_logits.reshape([1, -1, vocab_size_padded]) - - output_context_logits_npy = output_logits_npy.split( - '.npy')[0] + "_context" - output_context_logits_file = Path(output_context_logits_npy) - context_outputs = np.array( - context_logits.squeeze(0).cpu().contiguous(), - dtype='float32') # [promptLengthSum, vocabSize] - np.save(output_context_logits_file, context_outputs) - - # Save generation logits - if generation_logits is not None and output_logits_npy is not None and num_beams == 1: - output_generation_logits_npy = output_logits_npy.split( - '.npy')[0] + "_generation" - output_generation_logits_file = Path(output_generation_logits_npy) - generation_outputs = np.array(generation_logits.cpu().contiguous(), - dtype='float32') - np.save(output_generation_logits_file, generation_outputs) - - # Save cum log probs - if cum_log_probs is not None and output_cum_log_probs_npy is not None: - cum_log_probs_file = Path(output_cum_log_probs_npy) - cum_log_probs_outputs = np.array(cum_log_probs.cpu().contiguous(), - dtype='float32') - np.save(cum_log_probs_file, cum_log_probs_outputs) - - # Save cum log probs - if log_probs is not None and output_log_probs_npy is not None: - log_probs_file = Path(output_log_probs_npy) - log_probs_outputs = np.array(log_probs.cpu().contiguous(), - dtype='float32') - np.save(log_probs_file, log_probs_outputs) - - -def main(args): - runtime_rank = tensorrt_llm.mpi_rank() - logger.set_level(args.log_level) - - # different handling if encoder-decoder models - is_enc_dec = { - name - for name in os.listdir(args.engine_dir) - if os.path.isdir(os.path.join(args.engine_dir, name)) - } == {'encoder', 'decoder'} - if is_enc_dec: - logger.warning( - "This path is an encoder-decoder model. Using different handling.") - assert not args.use_py_session, "Encoder-decoder models don't have a unified python runtime, please use its own examples/enc_dec/run.py instead." - - model_name, model_version = read_model_name( - args.engine_dir) if not is_enc_dec else ("", "") - if args.tokenizer_dir is None: - logger.warning( - "tokenizer_dir is not specified. Try to infer from model_name, but this may be incorrect." 
- ) - args.tokenizer_dir = DEFAULT_HF_MODEL_DIRS[model_name] - - tokenizer, pad_id, end_id = load_tokenizer(args, - tokenizer_dir=args.tokenizer_dir, - vocab_file=args.vocab_file, - model_name=model_name, - model_version=model_version, - tokenizer_type=args.tokenizer_type, - ) - - stop_words_list = None - if args.stop_words: - stop_words_list = tensorrt_llm.runtime.decode_words_list( - args.stop_words, tokenizer) - - bad_words_list = None - if args.bad_words: - bad_words_list = tensorrt_llm.runtime.decode_words_list( - args.bad_words, tokenizer) - - prompt_template = None - if args.use_prompt_template and model_name in DEFAULT_PROMPT_TEMPLATES: - prompt_template = DEFAULT_PROMPT_TEMPLATES[model_name] - batch_input_ids = parse_input(tokenizer=tokenizer, - input_text=args.input_text, - prompt_template=prompt_template, - input_file=args.input_file, - add_special_tokens=args.add_special_tokens, - max_input_length=args.max_input_length, - pad_id=pad_id, - num_prepend_vtokens=args.num_prepend_vtokens, - model_name=model_name, - model_version=model_version) - - if is_enc_dec: - encoder_input_ids = batch_input_ids - decoder_input_ids = [ - torch.tensor([pad_id], dtype=torch.int32) for _ in batch_input_ids - ] # by default decoder_start_token_id for T5 - - input_lengths = [x.size(0) for x in decoder_input_ids - ] if is_enc_dec else [x.size(0) for x in batch_input_ids] - encoder_input_lengths = [x.size(0) - for x in encoder_input_ids] if is_enc_dec else None - - if not PYTHON_BINDINGS and not args.use_py_session: - logger.warning( - "Python bindings of C++ session is unavailable, fallback to Python session." - ) - args.use_py_session = True - if args.debug_mode and not args.use_py_session: - logger.warning( - "Debug mode is not supported in C++ session for now, fallback to Python session." - ) - args.use_py_session = True - runner_cls = ModelRunner if args.use_py_session else ModelRunnerCpp - runner_kwargs = dict( - engine_dir=args.engine_dir, - lora_dir=args.lora_dir, - rank=runtime_rank, - debug_mode=args.debug_mode, - lora_ckpt_source=args.lora_ckpt_source, - gpu_weights_percent=args.gpu_weights_percent, - ) - if not args.use_py_session: - runner_kwargs.update(is_enc_dec=is_enc_dec) - if args.medusa_choices is not None: - args.medusa_choices = ast.literal_eval(args.medusa_choices) - assert args.temperature == 1.0, "Medusa should use temperature == 1.0" - assert args.num_beams == 1, "Medusa should use num_beams == 1" - runner_kwargs.update(medusa_choices=args.medusa_choices) - if not args.use_py_session: - runner_kwargs.update( - max_batch_size=len(batch_input_ids), - max_input_len=max( - encoder_input_lengths if is_enc_dec else input_lengths), - max_output_len=args.max_output_len, - max_beam_width=args.num_beams, - max_attention_window_size=args.max_attention_window_size, - sink_token_length=args.sink_token_length, - max_tokens_in_paged_kv_cache=args.max_tokens_in_paged_kv_cache, - kv_cache_enable_block_reuse=args.kv_cache_enable_block_reuse, - kv_cache_free_gpu_memory_fraction=args. 
- kv_cache_free_gpu_memory_fraction, - enable_chunked_context=args.enable_chunked_context, - ) - runner = runner_cls.from_dir(**runner_kwargs) - - with torch.no_grad(): - outputs = runner.generate( - batch_input_ids=decoder_input_ids - if is_enc_dec else batch_input_ids, - encoder_input_ids=encoder_input_ids if is_enc_dec else None, - max_new_tokens=args.max_output_len, - max_attention_window_size=args.max_attention_window_size, - sink_token_length=args.sink_token_length, - end_id=end_id, - pad_id=pad_id, - temperature=args.temperature, - top_k=args.top_k, - top_p=args.top_p, - num_beams=args.num_beams, - length_penalty=args.length_penalty, - early_stopping=args.early_stopping, - repetition_penalty=args.repetition_penalty, - presence_penalty=args.presence_penalty, - frequency_penalty=args.frequency_penalty, - stop_words_list=stop_words_list, - bad_words_list=bad_words_list, - output_cum_log_probs=(args.output_cum_log_probs_npy != None), - output_log_probs=(args.output_log_probs_npy != None), - random_seed=args.random_seed, - lora_uids=args.lora_task_uids, - prompt_table=args.prompt_table_path, - prompt_tasks=args.prompt_tasks, - streaming=args.streaming, - output_sequence_lengths=True, - no_repeat_ngram_size=args.no_repeat_ngram_size, - return_dict=True, - medusa_choices=args.medusa_choices) - torch.cuda.synchronize() - - if args.streaming: - for curr_outputs in throttle_generator(outputs, - args.streaming_interval): - if runtime_rank == 0: - output_ids = curr_outputs['output_ids'] - sequence_lengths = curr_outputs['sequence_lengths'] - cum_log_probs = None - log_probs = None - if args.output_cum_log_probs_npy != None: - cum_log_probs = outputs['cum_log_probs'] - if args.output_log_probs_npy != None: - log_probs = outputs['log_probs'] - print_output( - tokenizer, - output_ids, - input_lengths, - sequence_lengths, - output_csv=args.output_csv, - output_npy=args.output_npy, - cum_log_probs=cum_log_probs, - log_probs=log_probs, - output_cum_log_probs_npy=args.output_cum_log_probs_npy, - output_log_probs_npy=args.output_log_probs_npy) - else: - if runtime_rank == 0: - output_ids = outputs['output_ids'] - sequence_lengths = outputs['sequence_lengths'] - context_logits = None - generation_logits = None - cum_log_probs = None - log_probs = None - if runner.gather_context_logits: - context_logits = outputs['context_logits'] - if runner.gather_generation_logits: - generation_logits = outputs['generation_logits'] - if args.output_cum_log_probs_npy != None: - cum_log_probs = outputs['cum_log_probs'] - if args.output_log_probs_npy != None: - log_probs = outputs['log_probs'] - print_output(tokenizer, - output_ids, - input_lengths, - sequence_lengths, - output_csv=args.output_csv, - output_npy=args.output_npy, - context_logits=context_logits, - generation_logits=generation_logits, - output_logits_npy=args.output_logits_npy, - cum_log_probs=cum_log_probs, - log_probs=log_probs, - output_cum_log_probs_npy=args.output_cum_log_probs_npy, - output_log_probs_npy=args.output_log_probs_npy) - - if args.run_profiling: - ite = 10 - # warmup - for _ in range(ite): - with torch.no_grad(): - outputs = runner.generate( - batch_input_ids, - max_new_tokens=args.max_output_len, - max_attention_window_size=args.max_attention_window_size, - end_id=end_id, - pad_id=pad_id, - temperature=args.temperature, - top_k=args.top_k, - top_p=args.top_p, - num_beams=args.num_beams, - length_penalty=args.length_penalty, - early_stopping=args.early_stopping, - repetition_penalty=args.repetition_penalty, - 
presence_penalty=args.presence_penalty, - frequency_penalty=args.frequency_penalty, - stop_words_list=stop_words_list, - bad_words_list=bad_words_list, - output_cum_log_probs=(args.output_cum_log_probs_npy != - None), - output_log_probs=(args.output_log_probs_npy != None), - random_seed=args.random_seed, - lora_uids=args.lora_task_uids, - prompt_table=args.prompt_table_path, - prompt_tasks=args.prompt_tasks, - streaming=args.streaming, - output_sequence_lengths=True, - return_dict=True) - torch.cuda.synchronize() - - tensorrt_llm.profiler.start("tmp") - ite = 1 - for _ in range(ite): - with torch.no_grad(): - outputs = runner.generate( - batch_input_ids, - max_new_tokens=args.max_output_len, - max_attention_window_size=args.max_attention_window_size, - end_id=end_id, - pad_id=pad_id, - temperature=args.temperature, - top_k=args.top_k, - top_p=args.top_p, - num_beams=args.num_beams, - length_penalty=args.length_penalty, - early_stopping=args.early_stopping, - repetition_penalty=args.repetition_penalty, - presence_penalty=args.presence_penalty, - frequency_penalty=args.frequency_penalty, - stop_words_list=stop_words_list, - bad_words_list=bad_words_list, - output_cum_log_probs=(args.output_cum_log_probs_npy != - None), - output_log_probs=(args.output_log_probs_npy != None), - random_seed=args.random_seed, - lora_uids=args.lora_task_uids, - prompt_table=args.prompt_table_path, - prompt_tasks=args.prompt_tasks, - streaming=args.streaming, - output_sequence_lengths=True, - return_dict=True) - torch.cuda.synchronize() - tensorrt_llm.profiler.stop("tmp") - - print( - f"batch_size: {len(batch_input_ids)}, avg latency of {ite} iterations: : {tensorrt_llm.profiler.elapsed_time_in_sec('tmp') / ite} sec" - ) - - def split_string(model_name): - if "/" in model_name: - return model_name.split("/")[-1] - else: - return model_name - - file_path = "precision_results_" + str(split_string(args.model_name)) + ".csv" - - if runtime_rank == 0: - - latency = tensorrt_llm.profiler.elapsed_time_in_sec('tmp') / ite - throughput = (args.batch_size*(args.max_input_length + args.max_output_len))/latency - Weight_KV_dtype = f"Weight = {args.qformat}, KV Cache = {args.kv_cache_dtype}" - data = [["Nvidia A100 GPU",str(args.tp_size),"TensorRT-LLM",args.model_name,str(args.max_input_length),str(args.batch_size),Weight_KV_dtype,str(latency),str(throughput)]] - - with open(file_path, 'a', newline='') as file: - writer = csv.writer(file) - writer.writerows(data) - - -import random -import string - - -def generate_random_word(length): - letters = string.ascii_letters - return ''.join(random.choice(letters) for i in range(length)) - -def generate_input(args): - random_words = ["France" for _ in range(args.max_input_length)] - - input_id = "" - - for word in random_words: - input_id = input_id + word + " " - - input_id = input_id[:-1] - - input_list = [] - - for batch_size in range(args.batch_size): - input_list.append(input_id) - - return input_list - - -if __name__ == '__main__': - args = parse_arguments() - args.input_text = generate_input(args) - main(args) diff --git a/TensorRT-LLM/H100/utils.py b/TensorRT-LLM/H100/utils.py deleted file mode 100644 index bf0057f..0000000 --- a/TensorRT-LLM/H100/utils.py +++ /dev/null @@ -1,373 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import json -from pathlib import Path -from typing import Optional - -from transformers import AutoTokenizer, LlamaTokenizer, T5Tokenizer - -from tensorrt_llm.bindings import GptJsonConfig -from tensorrt_llm.builder import get_engine_version - -DEFAULT_HF_MODEL_DIRS = { - 'BaichuanForCausalLM': 'baichuan-inc/Baichuan-13B-Chat', - 'BloomForCausalLM': 'bigscience/bloom-560m', - 'GLMModel': 'THUDM/glm-10b', - 'ChatGLMModel': 'THUDM/chatglm3-6b', - 'ChatGLMForCausalLM': 'THUDM/chatglm3-6b', - 'FalconForCausalLM': 'tiiuae/falcon-rw-1b', - 'GPTForCausalLM': 'gpt2-medium', - 'GPTJForCausalLM': 'EleutherAI/gpt-j-6b', - 'GPTNeoXForCausalLM': 'EleutherAI/gpt-neox-20b', - 'InternLMForCausalLM': 'internlm/internlm-chat-7b', - 'InternLM2ForCausalLM': 'internlm/internlm2-chat-7b', - 'LlamaForCausalLM': 'meta-llama/Llama-2-7b-hf', - 'MPTForCausalLM': 'mosaicml/mpt-7b', - 'PhiForCausalLM': 'microsoft/phi-2', - 'OPTForCausalLM': 'facebook/opt-350m', - 'QWenLMHeadModel': 'Qwen/Qwen-7B', - 'QWenForCausalLM': 'Qwen/Qwen-7B', - 'Qwen2ForCausalLM': 'Qwen/Qwen1.5-7B', - 'Qwen2MoeForCausalLM': 'Qwen/Qwen1.5-MoE-A2.7B', - 'RecurrentGemmaForCausalLM': 'google/recurrentgemma-2b', -} - -INTERNLM_META_INSTRUCTION = """You are an AI assistant whose name is InternLM (书生·浦语). -- InternLM (书生·浦语) is a conversational language model that is developed by Shanghai AI Laboratory (上海人工智能实验室). It is designed to be helpful, honest, and harmless. -- InternLM (书生·浦语) can understand and communicate fluently in the language chosen by the user such as English and 中文. 
-""" - -QWEN_PROMPT_TEMPLATE = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{input_text}<|im_end|>\n<|im_start|>assistant\n" - -DEFAULT_PROMPT_TEMPLATES = { - 'InternLMForCausalLM': "<|User|>:{input_text}\n<|Bot|>:", - 'InternLM2ForCausalLM': "<|im_start|>system\n" + INTERNLM_META_INSTRUCTION + - "<|im_end|>\n<|im_start|>user\n{input_text}<|im_end|>\n<|im_start|>assistant\n", - 'QWenLMHeadModel': QWEN_PROMPT_TEMPLATE, - 'QWenForCausalLM': QWEN_PROMPT_TEMPLATE, - 'Qwen2ForCausalLM': QWEN_PROMPT_TEMPLATE, - 'Qwen2MoeForCausalLM': QWEN_PROMPT_TEMPLATE, -} - - -def supports_inflight_batching(engine_dir): - config_path = Path(engine_dir) / "config.json" - json_config = GptJsonConfig.parse_file(config_path) - model_config = json_config.model_config - return model_config.supports_inflight_batching - - -def read_decoder_start_token_id(engine_dir): - with open(Path(engine_dir) / "config.json", 'r') as f: - config = json.load(f) - return config['pretrained_config']['decoder_start_token_id'] - - -def read_model_name(engine_dir: str): - engine_version = get_engine_version(engine_dir) - - with open(Path(engine_dir) / "config.json", 'r') as f: - config = json.load(f) - - if engine_version is None: - return config['builder_config']['name'], None - - model_arch = config['pretrained_config']['architecture'] - model_version = None - if 'GLM' in model_arch: - model_version = config['pretrained_config']['chatglm_version'] - if 'qwen' in model_arch.lower(): - model_version = config['pretrained_config']['qwen_type'] - return model_arch, model_version - - -def throttle_generator(generator, stream_interval): - for i, out in enumerate(generator): - if not i % stream_interval: - yield out - - if i % stream_interval: - yield out - - -def load_tokenizer(args, tokenizer_dir: Optional[str] = None, - vocab_file: Optional[str] = None, - model_name: str = 'GPTForCausalLM', - model_version: Optional[str] = None, - tokenizer_type: Optional[str] = None): - if vocab_file is None: - use_fast = True - if tokenizer_type is not None and tokenizer_type == "llama": - use_fast = False - # Should set both padding_side and truncation_side to be 'left' - if "Llama-3-8B" in args.model_name: - tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B", - cache_dir = "/vast/users/sraskar/mi250/hf/hub/models--meta-llama--Meta-Llama-3-8B/snapshots/62bd457b6fe961a42a631306577e622c83876cb6", - legacy=False, - padding_side='left', - truncation_side='left', - trust_remote_code=True, - use_fast=False - ) - - else: - tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir, - legacy=False, - padding_side='left', - truncation_side='left', - trust_remote_code=True, - tokenizer_type=tokenizer_type, - use_fast=use_fast) - elif model_name == 'GemmaForCausalLM' or model_name == 'RecurrentGemmaForCausalLM': - from transformers import GemmaTokenizer - - # Initialize tokenizer from vocab file. 
- tokenizer = GemmaTokenizer(vocab_file=vocab_file, - padding_side='left', - truncation_side='left', - legacy=False) - elif model_name == 'Grok1ModelForCausalLM': - tokenizer = LlamaTokenizer(vocab_file=vocab_file, - padding_side='left', - truncation_side='left', - legacy=False, - use_fast=False) - else: - # For gpt-next, directly load from tokenizer.model - tokenizer = T5Tokenizer(vocab_file=vocab_file, - padding_side='left', - truncation_side='left', - legacy=False) - if 'qwen' in model_name.lower() and model_version == 'qwen': - with open(Path(tokenizer_dir) / "generation_config.json") as f: - gen_config = json.load(f) - pad_id = gen_config['pad_token_id'] - end_id = gen_config['eos_token_id'] - elif 'GLM' in model_name and model_version == 'glm': - pad_id = tokenizer.pad_token_id - end_id = tokenizer.eop_token_id - else: - if tokenizer.pad_token_id is None: - tokenizer.pad_token_id = tokenizer.eos_token_id - pad_id = tokenizer.pad_token_id - end_id = tokenizer.eos_token_id - - return tokenizer, pad_id, end_id - - -def add_common_args(parser): - # sampling arguments - parser.add_argument('--num_beams', - type=int, - help="Use beam search if num_beams > 1", - default=1) - parser.add_argument('--temperature', type=float, default=1.0) - parser.add_argument('--top_k', type=int, default=1) - parser.add_argument('--top_p', type=float, default=0.0) - parser.add_argument('--length_penalty', type=float, default=1.0) - parser.add_argument('--repetition_penalty', type=float, default=1.0) - parser.add_argument('--presence_penalty', type=float, default=0.0) - parser.add_argument('--frequency_penalty', type=float, default=0.0) - parser.add_argument('--beam_search_diversity_rate', type=float, default=0.0) - parser.add_argument('--random_seed', type=int, default=0) - parser.add_argument('--early_stopping', - type=int, - help='Use early stopping if num_beams > 1' - '1 for early-stopping, 0 for non-early-stopping' - 'other values for stopping by length', - default=1) - parser.add_argument( - '--end_id', - default=None, - type=int, - help="Override tokenizer end_id to stop on given end_id token.") - parser.add_argument( - '--stop_words', - default=None, - type=str, - nargs="+", - action='append', - help= - 'Set stop words for a batch. Successive invocations of --stop_words set stop words for other batches.' - ' E.g.: --stop_words " London" " chef" --stop_words "eventually became" "was not"', - ) - parser.add_argument( - '--bad_words', - default=None, - type=str, - nargs="+", - action='append', - help= - 'Set bad words for a batch. Successive invocations of --bad_words set bad words for other batches.' - ' E.g.: --bad_words " London" " chef" --bad_words "eventually became" "was not"', - ) - parser.add_argument('--no_repeat_ngram_size', type=int, default=None) - - # common runtime arguments - parser.add_argument('--sink_token_length', - type=int, - default=None, - help='The sink token length.') - parser.add_argument( - '--max_attention_window_size', - type=int, - default=None, - help= - 'The attention window size that controls the sliding window attention / cyclic kv cache behavior' - ) - parser.add_argument( - '--multi_block_mode', - action='store_true', - help= - "Distribute the work across multiple CUDA thread-blocks on the GPU for masked MHA kernel." 
- ) - parser.add_argument('--log_level', type=str, default='info') - parser.add_argument( - '--no_prompt_template', - dest='use_prompt_template', - default=True, - action='store_false', - help= - "Whether or not to use default prompt template to wrap the input text.") - parser.add_argument('--use_py_session', - default=False, - action='store_true', - help="Whether or not to use Python runtime session") - parser.add_argument('--debug_mode', - default=False, - action='store_true', - help="Whether or not to turn on the debug mode") - parser.add_argument('--streaming', default=False, action='store_true') - parser.add_argument('--streaming_interval', - type=int, - help="How often to return tokens when streaming.", - default=5) - parser.add_argument( - '--prompt_table_path', - type=str, - help="Path to .npy file, exported by nemo_prompt_convert.py") - parser.add_argument( - '--prompt_tasks', - help="Comma-separated list of tasks for prompt tuning, e.g., 0,3,1,0") - parser.add_argument('--lora_dir', - type=str, - default=None, - nargs="+", - help="The directory of LoRA weights") - parser.add_argument('--lora_ckpt_source', - type=str, - default="hf", - choices=["hf", "nemo"], - help="The source of lora checkpoint.") - parser.add_argument( - '--lora_task_uids', - type=str, - default=None, - nargs="+", - help="The list of LoRA task uids; use -1 to disable the LoRA module") - parser.add_argument( - '--num_prepend_vtokens', - nargs="+", - type=int, - help="Number of (default) virtual tokens to prepend to each sentence." - " For example, '--num_prepend_vtokens=10' will prepend the tokens" - " [vocab_size, vocab_size + 1, ..., vocab_size + 9] to the sentence.") - parser.add_argument( - '--medusa_choices', - type=str, - default=None, - help="Medusa choice to use, if not none, will use Medusa decoding." - " E.g.: [[0, 0, 0, 0], [0, 1, 0], [1, 0], [1, 1]] for 9 medusa tokens." - ) - - # model arguments - parser.add_argument('--engine_dir', type=str, default='engine_outputs') - parser.add_argument( - '--tokenizer_type', - help= - 'Specify that argument when providing a .model file as the tokenizer_dir. 
' - 'It allows AutoTokenizer to instantiate the correct tokenizer type.') - parser.add_argument('--vocab_file', - help="Used for sentencepiece tokenizers") - parser.add_argument('--no_add_special_tokens', - dest='add_special_tokens', - default=True, - action='store_false', - help="Whether or not to add special tokens") - parser.add_argument('--hf_model_dir', '--model_dir', type=str, default=None) - parser.add_argument( - '--tokenizer_dir', - default=None, - help='tokenizer path; defaults to hf_model_dir if left unspecified') - - # memory argument - parser.add_argument( - '--gpu_weights_percent', - default=1, - type=float, - help= - 'Specify the percentage of weights that reside on GPU instead of CPU and streaming load during runtime.', - ) - parser.add_argument( - '--max_tokens_in_paged_kv_cache', - default=None, - type=int, - help= - 'Specify the maximum number of tokens in a kv cache page (only available with cpp session).', - ) - parser.add_argument( - '--kv_cache_enable_block_reuse', - action='store_true', - help= - 'Enables block reuse in kv cache (only available with cpp session).', - ) - parser.add_argument( - '--kv_cache_free_gpu_memory_fraction', - default=0.9, - type=float, - help='Specify the free gpu memory fraction.', - ) - parser.add_argument( - '--enable_chunked_context', - action='store_true', - help='Enables chunked context (only available with cpp session).', - ) - - # hf model argument (if use hf model) - parser.add_argument( - '--hf_data_type', - '--data_type', - type=str, - choices=['fp32', 'fp16', 'bf16', 'float32', 'float16', 'bfloat16'], - default='fp16', - help="The data type for hf model.") - parser.add_argument( - '--hf_device_map_auto', - action='store_true', - help="Use device map 'auto' to load a pretrained HF model. This may " - "help to test a large model that cannot fit into a singlue GPU.") - - parser.add_argument( - "--return_all_generated_tokens", - default=False, - action="store_true", - help="This option changes the token output only for streaming. " - "If not specified, return only generated tokens at each step. " - "If specified, return the full beams/outputs at each step. " - "It is automatically enabled for num_beams>1 (only available with cpp session). " - "WARNING: using this option may increase network usage significantly (quadratically w.r.t output length)." 
- ) - - return parser diff --git a/TensorRT-LLM/README.md b/TensorRT-LLM/README.md deleted file mode 100644 index 7ed0f40..0000000 --- a/TensorRT-LLM/README.md +++ /dev/null @@ -1,2 +0,0 @@ -# TensorRT-LLM -TensorRT-LLM diff --git a/InferenceGraphPlotter/data/.DS_Store b/data/.DS_Store similarity index 100% rename from InferenceGraphPlotter/data/.DS_Store rename to data/.DS_Store diff --git a/InferenceGraphPlotter/data/Best_Hardware_and_Framework/All_results.csv b/data/Best_Hardware_and_Framework/All_results.csv similarity index 100% rename from InferenceGraphPlotter/data/Best_Hardware_and_Framework/All_results.csv rename to data/Best_Hardware_and_Framework/All_results.csv diff --git a/InferenceGraphPlotter/data/Best_Hardware_and_Framework/config.json b/data/Best_Hardware_and_Framework/config.json similarity index 100% rename from InferenceGraphPlotter/data/Best_Hardware_and_Framework/config.json rename to data/Best_Hardware_and_Framework/config.json diff --git a/InferenceGraphPlotter/data/KV_Cache/All_results.csv b/data/KV_Cache/All_results.csv similarity index 100% rename from InferenceGraphPlotter/data/KV_Cache/All_results.csv rename to data/KV_Cache/All_results.csv diff --git a/InferenceGraphPlotter/data/KV_Cache/config.json b/data/KV_Cache/config.json similarity index 100% rename from InferenceGraphPlotter/data/KV_Cache/config.json rename to data/KV_Cache/config.json diff --git a/InferenceGraphPlotter/data/KV_Cache_Block_Size/All_results.csv b/data/KV_Cache_Block_Size/All_results.csv similarity index 100% rename from InferenceGraphPlotter/data/KV_Cache_Block_Size/All_results.csv rename to data/KV_Cache_Block_Size/All_results.csv diff --git a/InferenceGraphPlotter/data/KV_Cache_Block_Size/config.json b/data/KV_Cache_Block_Size/config.json similarity index 100% rename from InferenceGraphPlotter/data/KV_Cache_Block_Size/config.json rename to data/KV_Cache_Block_Size/config.json diff --git a/InferenceGraphPlotter/data/KV_Cache_Dtype/All_results.csv b/data/KV_Cache_Dtype/All_results.csv similarity index 100% rename from InferenceGraphPlotter/data/KV_Cache_Dtype/All_results.csv rename to data/KV_Cache_Dtype/All_results.csv diff --git a/InferenceGraphPlotter/data/KV_Cache_Dtype/config.json b/data/KV_Cache_Dtype/config.json similarity index 100% rename from InferenceGraphPlotter/data/KV_Cache_Dtype/config.json rename to data/KV_Cache_Dtype/config.json diff --git a/InferenceGraphPlotter/data/Parallelism/All_results.csv b/data/Parallelism/All_results.csv similarity index 100% rename from InferenceGraphPlotter/data/Parallelism/All_results.csv rename to data/Parallelism/All_results.csv diff --git a/InferenceGraphPlotter/data/Parallelism/config.json b/data/Parallelism/config.json similarity index 100% rename from InferenceGraphPlotter/data/Parallelism/config.json rename to data/Parallelism/config.json diff --git a/InferenceGraphPlotter/data/Perplexity/All_results.csv b/data/Perplexity/All_results.csv similarity index 100% rename from InferenceGraphPlotter/data/Perplexity/All_results.csv rename to data/Perplexity/All_results.csv diff --git a/InferenceGraphPlotter/data/Perplexity/config.json b/data/Perplexity/config.json similarity index 100% rename from InferenceGraphPlotter/data/Perplexity/config.json rename to data/Perplexity/config.json diff --git a/InferenceGraphPlotter/data/Power/All_results.csv b/data/Power/All_results.csv similarity index 100% rename from InferenceGraphPlotter/data/Power/All_results.csv rename to data/Power/All_results.csv diff --git 
a/InferenceGraphPlotter/data/Power/config.json b/data/Power/config.json similarity index 100% rename from InferenceGraphPlotter/data/Power/config.json rename to data/Power/config.json diff --git a/InferenceGraphPlotter/data/Speculative_Decoding/All_results.csv b/data/Speculative_Decoding/All_results.csv similarity index 100% rename from InferenceGraphPlotter/data/Speculative_Decoding/All_results.csv rename to data/Speculative_Decoding/All_results.csv diff --git a/InferenceGraphPlotter/data/Speculative_Decoding/config.json b/data/Speculative_Decoding/config.json similarity index 100% rename from InferenceGraphPlotter/data/Speculative_Decoding/config.json rename to data/Speculative_Decoding/config.json diff --git a/InferenceGraphPlotter/data/Throughput/All_results.csv b/data/Throughput/All_results.csv similarity index 100% rename from InferenceGraphPlotter/data/Throughput/All_results.csv rename to data/Throughput/All_results.csv diff --git a/InferenceGraphPlotter/data/Throughput/config.json b/data/Throughput/config.json similarity index 100% rename from InferenceGraphPlotter/data/Throughput/config.json rename to data/Throughput/config.json diff --git a/InferenceGraphPlotter/data/alias.json b/data/alias.json similarity index 100% rename from InferenceGraphPlotter/data/alias.json rename to data/alias.json diff --git a/InferenceGraphPlotter/data/color_coding.json b/data/color_coding.json similarity index 100% rename from InferenceGraphPlotter/data/color_coding.json rename to data/color_coding.json diff --git a/InferenceGraphPlotter/data/graphs_list.txt b/data/graphs_list.txt similarity index 100% rename from InferenceGraphPlotter/data/graphs_list.txt rename to data/graphs_list.txt diff --git a/InferenceGraphPlotter/index.html b/index.html similarity index 100% rename from InferenceGraphPlotter/index.html rename to index.html diff --git a/llama.cpp/A100/README.MD b/llama.cpp/A100/README.MD deleted file mode 100644 index e69de29..0000000 diff --git a/llama.cpp/GH200/README.MD b/llama.cpp/GH200/README.MD deleted file mode 100644 index e69de29..0000000 diff --git a/llama.cpp/H100/README.MD b/llama.cpp/H100/README.MD deleted file mode 100644 index e69de29..0000000 diff --git a/llama.cpp/MI250/README.MD b/llama.cpp/MI250/README.MD deleted file mode 100644 index e69de29..0000000 diff --git a/llama.cpp/README.md b/llama.cpp/README.md deleted file mode 100644 index 9487602..0000000 --- a/llama.cpp/README.md +++ /dev/null @@ -1,2 +0,0 @@ -# llama.cpp -llama.cpp diff --git a/InferenceGraphPlotter/src/css/style.css b/src/css/style.css similarity index 100% rename from InferenceGraphPlotter/src/css/style.css rename to src/css/style.css diff --git a/InferenceGraphPlotter/src/js/display.js b/src/js/display.js similarity index 100% rename from InferenceGraphPlotter/src/js/display.js rename to src/js/display.js diff --git a/InferenceGraphPlotter/src/js/download_SVG.js b/src/js/download_SVG.js similarity index 100% rename from InferenceGraphPlotter/src/js/download_SVG.js rename to src/js/download_SVG.js diff --git a/vLLM/A100/README.MD b/vLLM/A100/README.MD deleted file mode 100644 index e69de29..0000000 diff --git a/vLLM/GH200/README.MD b/vLLM/GH200/README.MD deleted file mode 100644 index 3269baa..0000000 --- a/vLLM/GH200/README.MD +++ /dev/null @@ -1,15 +0,0 @@ -# Setup vLLM on GH200 - -1. Build a container -```bash -$ source build-container.sh -``` - -2. Run container. -```bash -sourece run-container.sh -``` -This will run the container and execute the `run-models.sh` script. - -3. 
Run models -First `run-models.sh` will install the dependacnies by calling `wheels/setup_wheel.sh` script. It will run benchmakrs following it. \ No newline at end of file diff --git a/vLLM/GH200/benchmark_latency.py b/vLLM/GH200/benchmark_latency.py deleted file mode 100644 index b5048f7..0000000 --- a/vLLM/GH200/benchmark_latency.py +++ /dev/null @@ -1,334 +0,0 @@ -"""Benchmark the latency of processing a single batch of requests.""" -import argparse -import json -import time -from pathlib import Path -from typing import List, Optional - -import numpy as np -import torch -from tqdm import tqdm - -from vllm import LLM, SamplingParams -from vllm.engine.arg_utils import EngineArgs -from vllm.inputs import PromptInputs -from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS -from vllm.utils import FlexibleArgumentParser - -import os -import csv - -def main(args: argparse.Namespace): - print(args) - - # NOTE(woosuk): If the request cannot be processed in a single batch, - # the engine will automatically process the request in multiple batches. - llm = LLM( - model=args.model, - speculative_model=args.speculative_model, - num_speculative_tokens=args.num_speculative_tokens, - speculative_draft_tensor_parallel_size=\ - args.speculative_draft_tensor_parallel_size, - tokenizer=args.tokenizer, - quantization=args.quantization, - tensor_parallel_size=args.tensor_parallel_size, - trust_remote_code=args.trust_remote_code, - dtype=args.dtype, - max_model_len=args.max_model_len, - enforce_eager=args.enforce_eager, - kv_cache_dtype=args.kv_cache_dtype, - quantization_param_path=args.quantization_param_path, - device=args.device, - ray_workers_use_nsight=args.ray_workers_use_nsight, - use_v2_block_manager=args.use_v2_block_manager, - enable_chunked_prefill=args.enable_chunked_prefill, - download_dir=args.download_dir, - block_size=args.block_size, - gpu_memory_utilization=args.gpu_memory_utilization, - load_format=args.load_format, - distributed_executor_backend=args.distributed_executor_backend, - otlp_traces_endpoint=args.otlp_traces_endpoint, - enable_prefix_caching=args.enable_prefix_caching, - ) - - sampling_params = SamplingParams( - n=args.n, - temperature=0.0 if args.use_beam_search else 1.0, - top_p=1.0, - use_beam_search=args.use_beam_search, - ignore_eos=True, - max_tokens=args.output_len, - ) - print(sampling_params) - dummy_prompt_token_ids = np.random.randint(10000, - size=(args.batch_size, - args.input_len)) - dummy_inputs: List[PromptStrictInputs] = [{ - "prompt_token_ids": batch - } for batch in dummy_prompt_token_ids.tolist()] - - def run_to_completion(profile_dir: Optional[str] = None): - if profile_dir: - with torch.profiler.profile( - activities=[ - torch.profiler.ProfilerActivity.CPU, - torch.profiler.ProfilerActivity.CUDA, - ], - on_trace_ready=torch.profiler.tensorboard_trace_handler( - str(profile_dir))) as p: - llm.generate(dummy_inputs, - sampling_params=sampling_params, - use_tqdm=False) - print(p.key_averages()) - else: - start_time = time.perf_counter() - llm.generate(dummy_inputs, - sampling_params=sampling_params, - use_tqdm=False) - end_time = time.perf_counter() - latency = end_time - start_time - return latency - - print("Warming up...") - for _ in tqdm(range(args.num_iters_warmup), desc="Warmup iterations"): - run_to_completion(profile_dir=None) - - if args.profile: - profile_dir = args.profile_result_dir - if not profile_dir: - profile_dir = Path( - "." 
- ) / "vllm_benchmark_result" / f"latency_result_{time.time()}" - print(f"Profiling (results will be saved to '{profile_dir}')...") - run_to_completion(profile_dir=profile_dir) - return - - # Benchmark. - latencies = [] - for _ in tqdm(range(args.num_iters), desc="Profiling iterations"): - latencies.append(run_to_completion(profile_dir=None)) - latencies = np.array(latencies) - percentages = [10, 25, 50, 75, 90, 99] - percentiles = np.percentile(latencies, percentages) - print(f'Avg latency: {np.mean(latencies)} seconds') - for percentage, percentile in zip(percentages, percentiles): - print(f'{percentage}% percentile latency: {percentile} seconds') - # output to csv file - avg_latency = np.mean(latencies) - print(f'Avg latency: {avg_latency} seconds') - - total_num_tokens = args.batch_size*(args.input_len + args.output_len) - print("Total Number of Tokens = ", total_num_tokens) - - throughput = total_num_tokens/avg_latency - print("Throughput = ", throughput) - - list_1 = ["Model Name", - "throughput", - "latency", - "batch size", - "tensor_parallel", - "input length", - "output length" - ] - - list_2 = [args.model, - throughput, - avg_latency, - args.batch_size, - args.tensor_parallel_size, - args.input_len, - args.output_len - ] - - assert len(list_1) == len(list_2) - - def split_string(model_name): - if "/" in model_name: - return model_name.split("/")[-1] - else: - return model_name - - csv_file = "results_" + str(split_string(args.model)) + ".csv" - file_exists = os.path.exists(csv_file) - - with open(csv_file, 'a', newline = '') as csvfile: - writer = csv.writer(csvfile) - - if not file_exists: - writer.writerow(list_1) - - writer.writerow(list_2) - - csvfile.close() - # Output JSON results if specified - if args.output_json: - results = { - "avg_latency": np.mean(latencies), - "latencies": latencies.tolist(), - "percentiles": dict(zip(percentages, percentiles.tolist())), - } - with open(args.output_json, "w") as f: - json.dump(results, f, indent=4) - - -if __name__ == '__main__': - parser = FlexibleArgumentParser( - description='Benchmark the latency of processing a single batch of ' - 'requests till completion.') - parser.add_argument('--model', type=str, default='facebook/opt-125m') - parser.add_argument('--speculative-model', type=str, default=None) - parser.add_argument('--num-speculative-tokens', type=int, default=None) - parser.add_argument('--speculative-draft-tensor-parallel-size', - '-spec-draft-tp', - type=int, - default=None) - parser.add_argument('--tokenizer', type=str, default=None) - parser.add_argument('--quantization', - '-q', - choices=[*QUANTIZATION_METHODS, None], - default=None) - parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1) - parser.add_argument('--input-len', type=int, default=32) - parser.add_argument('--output-len', type=int, default=128) - parser.add_argument('--batch-size', type=int, default=8) - parser.add_argument('--n', - type=int, - default=1, - help='Number of generated sequences per prompt.') - parser.add_argument('--use-beam-search', action='store_true') - parser.add_argument('--num-iters-warmup', - type=int, - default=10, - help='Number of iterations to run for warmup.') - parser.add_argument('--num-iters', - type=int, - default=30, - help='Number of iterations to run.') - parser.add_argument('--trust-remote-code', - action='store_true', - help='trust remote code from huggingface') - parser.add_argument( - '--max-model-len', - type=int, - default=None, - help='Maximum length of a sequence (including prompt and output). 
' - 'If None, will be derived from the model.') - parser.add_argument( - '--dtype', - type=str, - default='auto', - choices=['auto', 'half', 'float16', 'bfloat16', 'float', 'float32'], - help='data type for model weights and activations. ' - 'The "auto" option will use FP16 precision ' - 'for FP32 and FP16 models, and BF16 precision ' - 'for BF16 models.') - parser.add_argument('--enforce-eager', - action='store_true', - help='enforce eager mode and disable CUDA graph') - parser.add_argument( - '--kv-cache-dtype', - type=str, - choices=['auto', 'fp8', 'fp8_e5m2', 'fp8_e4m3'], - default="auto", - help='Data type for kv cache storage. If "auto", will use model ' - 'data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. ' - 'ROCm (AMD GPU) supports fp8 (=fp8_e4m3)') - parser.add_argument( - '--quantization-param-path', - type=str, - default=None, - help='Path to the JSON file containing the KV cache scaling factors. ' - 'This should generally be supplied, when KV cache dtype is FP8. ' - 'Otherwise, KV cache scaling factors default to 1.0, which may cause ' - 'accuracy issues. FP8_E5M2 (without scaling) is only supported on ' - 'cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is ' - 'instead supported for common inference criteria.') - parser.add_argument( - '--profile', - action='store_true', - help='profile the generation process of a single batch') - parser.add_argument( - '--profile-result-dir', - type=str, - default=None, - help=('path to save the pytorch profiler output. Can be visualized ' - 'with ui.perfetto.dev or Tensorboard.')) - parser.add_argument( - "--device", - type=str, - default="auto", - choices=["auto", "cuda", "cpu", "openvino", "tpu", "xpu"], - help='device type for vLLM execution, supporting CUDA, OpenVINO and ' - 'CPU.') - parser.add_argument('--block-size', - type=int, - default=16, - help='block size of key/value cache') - parser.add_argument( - '--enable-chunked-prefill', - action='store_true', - help='If True, the prefill requests can be chunked based on the ' - 'max_num_batched_tokens') - parser.add_argument("--enable-prefix-caching", - action='store_true', - help="Enable automatic prefix caching") - parser.add_argument('--use-v2-block-manager', action='store_true') - parser.add_argument( - "--ray-workers-use-nsight", - action='store_true', - help="If specified, use nsight to profile ray workers", - ) - parser.add_argument('--download-dir', - type=str, - default=None, - help='directory to download and load the weights, ' - 'default to the default cache dir of huggingface') - parser.add_argument( - '--output-json', - type=str, - default=None, - help='Path to save the latency results in JSON format.') - parser.add_argument('--gpu-memory-utilization', - type=float, - default=0.9, - help='the fraction of GPU memory to be used for ' - 'the model executor, which can range from 0 to 1.' 
- 'If unspecified, will use the default value of 0.9.') - parser.add_argument( - '--load-format', - type=str, - default=EngineArgs.load_format, - choices=[ - 'auto', 'pt', 'safetensors', 'npcache', 'dummy', 'tensorizer', - 'bitsandbytes' - ], - help='The format of the model weights to load.\n\n' - '* "auto" will try to load the weights in the safetensors format ' - 'and fall back to the pytorch bin format if safetensors format ' - 'is not available.\n' - '* "pt" will load the weights in the pytorch bin format.\n' - '* "safetensors" will load the weights in the safetensors format.\n' - '* "npcache" will load the weights in pytorch format and store ' - 'a numpy cache to speed up the loading.\n' - '* "dummy" will initialize the weights with random values, ' - 'which is mainly for profiling.\n' - '* "tensorizer" will load the weights using tensorizer from ' - 'CoreWeave. See the Tensorize vLLM Model script in the Examples' - 'section for more information.\n' - '* "bitsandbytes" will load the weights using bitsandbytes ' - 'quantization.\n') - parser.add_argument( - '--distributed-executor-backend', - choices=['ray', 'mp'], - default=None, - help='Backend to use for distributed serving. When more than 1 GPU ' - 'is used, will be automatically set to "ray" if installed ' - 'or "mp" (multiprocessing) otherwise.') - parser.add_argument( - '--otlp-traces-endpoint', - type=str, - default=None, - help='Target URL to which OpenTelemetry traces will be sent.') - args = parser.parse_args() - main(args) diff --git a/vLLM/GH200/benchmark_throughput_power_gh200.py b/vLLM/GH200/benchmark_throughput_power_gh200.py deleted file mode 100644 index d3833fa..0000000 --- a/vLLM/GH200/benchmark_throughput_power_gh200.py +++ /dev/null @@ -1,331 +0,0 @@ -"""Benchmark the latency of processing a single batch of requests.""" -import argparse -import json -import time -from pathlib import Path -from typing import List, Optional -import numpy as np -import torch -from tqdm import tqdm - -from vllm import LLM, SamplingParams -from vllm.engine.arg_utils import EngineArgs -from vllm.inputs import PromptInputs -from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS -from vllm.utils import FlexibleArgumentParser -import csv -import os - -# from huggingface_hub import login -# login("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") - -perplexity_dict = {"huggyllama/llama-7b":3.1271538213640806, - "huggyllama/llama-13b":2.9614621865686885, - "mistralai/Mixtral-8x7B-v0.1":2.7458531353012336, - "meta-llama/Llama-2-13b-hf":2.811120439876313, - "huggyllama/llama-30b":2.744248104044345, - "facebook/opt-13b":3.870206998984964, - "Nexusflow/NexusRaven-V2-13B":3.3361824356186327, - "mistralai/Mixtral-8x22B-v0.1":2.5427975971657135, - "meta-llama/Llama-2-7b-hf":2.9624337637748193, - "mistralai/Mistral-7B-v0.3":3.0581070650881257, - "Qwen/Qwen1.5-7B":4.249519567986975, - "google/gemma-1.1-7b-it":13.892232459056668, - "meta-llama/Meta-Llama-3-8B":3.718509102406137, - "facebook/opt-6.7b":4.137426439363523, - "Qwen/Qwen2-7B":4.019159671931102, - "tiiuae/falcon-7b":3.534306161370759, - "bigscience/bloom-7b1":5.208961745879341, - "EleutherAI/gpt-j-6b":3.4668491651555446, - "huggyllama/llama-65b":2.6334970265470727, - "meta-llama/Llama-2-70b-hf":2.491589054514988, - "meta-llama/Meta-Llama-3-70B":2.9904107267016, - "google/gemma-7b":4.149094819615527, - "BAAI/Aquila-7B":4.671358785970369, - "Deci/DeciLM-7B":3.4662699434809126 - } - -def split_string(model_name): - if "/" in model_name: - return model_name.split("/")[-1] - else: - 
return model_name - -def dump_results(list_1, list_2, model_name, csv_file_name): - - assert len(list_1) == len(list_2) - - csv_file = csv_file_name + str(split_string(model_name)) + ".csv" - file_exists = os.path.exists(csv_file) - - with open(csv_file, 'a', newline = '') as csvfile: - writer = csv.writer(csvfile) - - if not file_exists: - writer.writerow(list_1) - - writer.writerow(list_2) - - csvfile.close() - - - -def main(args: argparse.Namespace): - print(args) - - llm = LLM( - model=args.model, - speculative_model=args.speculative_model, - num_speculative_tokens=args.num_speculative_tokens, - speculative_draft_tensor_parallel_size=args.speculative_draft_tensor_parallel_size, - tokenizer=args.tokenizer, - quantization=args.quantization, - tensor_parallel_size=args.tensor_parallel_size, - trust_remote_code=args.trust_remote_code, - dtype=args.dtype, - max_model_len=args.max_model_len, - enforce_eager=args.enforce_eager, - kv_cache_dtype=args.kv_cache_dtype, - quantization_param_path=args.quantization_param_path, - device=args.device, - ray_workers_use_nsight=args.ray_workers_use_nsight, - use_v2_block_manager=args.use_v2_block_manager, - enable_chunked_prefill=args.enable_chunked_prefill, - block_size=args.block_size, - gpu_memory_utilization=args.gpu_memory_utilization, - load_format=args.load_format, - distributed_executor_backend=args.distributed_executor_backend, - otlp_traces_endpoint=args.otlp_traces_endpoint, - enable_prefix_caching=args.enable_prefix_caching, - ) - - #warm up - sampling_params = SamplingParams( - n=args.n, - temperature=0.0 if args.use_beam_search else 1.0, - top_p=1.0, - use_beam_search=args.use_beam_search, - ignore_eos=True, - max_tokens=256, - ) - dummy_prompt_token_ids = np.random.randint(10000, size=(16, 256)) - dummy_inputs: List[PromptInputs] = [{"prompt_token_ids": batch} for batch in dummy_prompt_token_ids.tolist()] - - print("Warming up...") - for _ in tqdm(range(3), desc="Warmup iterations"): - llm.generate(dummy_inputs, sampling_params=sampling_params, use_tqdm=False) - - batch_size_list = [1,16,32,64] - input_output_list = [128,256,512,1024,2048] - - for bs in batch_size_list: - for input_output in input_output_list: - - args.batch_size = bs - args.output_len = input_output - args.input_len = input_output - - sampling_params = SamplingParams( - n=args.n, - temperature=0.0 if args.use_beam_search else 1.0, - top_p=1.0, - use_beam_search=args.use_beam_search, - ignore_eos=True, - max_tokens=args.output_len, - ) - dummy_prompt_token_ids = np.random.randint(10000, size=(args.batch_size, args.input_len)) - dummy_inputs: List[PromptInputs] = [{"prompt_token_ids": batch} for batch in dummy_prompt_token_ids.tolist()] - - start_time = time.perf_counter() - llm.generate(dummy_inputs, sampling_params=sampling_params, use_tqdm=False) - end_time = time.perf_counter() - latency = end_time - start_time - total_tokens = args.batch_size*(args.input_len + args.input_len) - throughput = total_tokens/latency - - from power_utils import gpuPowerProbe - power_profile = gpuPowerProbe(interval=0.10) - - power_profile.start() - llm.generate(dummy_inputs, sampling_params=sampling_params, use_tqdm=False) - training_powers, training_powers_time = power_profile.stop() - power_profile.destroy() - - avg_power = np.mean(training_powers) - throughput_per_watt_avg_power = float(throughput/avg_power) - - perplexity = perplexity_dict[args.model] - - list_1 = ["Hardware","Num of Hardware","Framework","Model","Input Output Length","Batch 
Size","Latency","Throughput","area","avg_power","sum_power","Throughput_per_watt_area","Throughput_per_watt_avg","Throughput_per_watt_sum"] - list_2 = ["Nvidia GH200 GPU", args.tensor_parallel_size, "vLLM", args.model, args.input_len, args.batch_size, latency, throughput, None, avg_power, None, None, throughput_per_watt_avg_power,None] - dump_results(list_1, list_2, args.model, csv_file_name = "7b-power_results_") - - list_1 = ["Hardware","Num of Hardware","Framework","Model","Input Output Length","Batch Size","Latency","Throughput"] - list_2 = ["Nvidia GH200 GPU", args.tensor_parallel_size, "vLLM", args.model, args.input_len, args.batch_size, latency, throughput] - dump_results(list_1, list_2, args.model, csv_file_name = "7b-throughput_results_") - - list_1 = ["Hardware","Num of Hardware","Framework","Model","Input Output Length","Batch Size","Latency","Throughput","avg_power","Throughput_per_watt_avg", "Perplexity"] - list_2 = ["Nvidia GH200 GPU", args.tensor_parallel_size, "vLLM", args.model, args.input_len, args.batch_size, latency, throughput, avg_power, throughput_per_watt_avg_power,perplexity] - dump_results(list_1, list_2, args.model, csv_file_name = "7b-perplexity_results_") - - -if __name__ == '__main__': - parser = FlexibleArgumentParser( - description='Benchmark the latency of processing a single batch of ' - 'requests till completion.') - parser.add_argument('--model', type=str, default='facebook/opt-125m') - parser.add_argument('--speculative-model', type=str, default=None) - parser.add_argument('--num-speculative-tokens', type=int, default=None) - parser.add_argument('--speculative-draft-tensor-parallel-size', - '-spec-draft-tp', - type=int, - default=None) - parser.add_argument('--tokenizer', type=str, default=None) - parser.add_argument('--quantization', - '-q', - choices=[*QUANTIZATION_METHODS, None], - default=None) - parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1) - parser.add_argument('--input-len', type=int, default=32) - parser.add_argument('--output-len', type=int, default=128) - parser.add_argument('--batch-size', type=int, default=8) - parser.add_argument('--n', - type=int, - default=1, - help='Number of generated sequences per prompt.') - parser.add_argument('--use-beam-search', action='store_true') - parser.add_argument('--num-iters-warmup', - type=int, - default=10, - help='Number of iterations to run for warmup.') - parser.add_argument('--num-iters', - type=int, - default=30, - help='Number of iterations to run.') - parser.add_argument('--trust-remote-code', - action='store_true', - help='trust remote code from huggingface') - parser.add_argument( - '--max-model-len', - type=int, - default=None, - help='Maximum length of a sequence (including prompt and output). ' - 'If None, will be derived from the model.') - parser.add_argument( - '--dtype', - type=str, - default='auto', - choices=['auto', 'half', 'float16', 'bfloat16', 'float', 'float32'], - help='data type for model weights and activations. ' - 'The "auto" option will use FP16 precision ' - 'for FP32 and FP16 models, and BF16 precision ' - 'for BF16 models.') - parser.add_argument('--enforce-eager', - action='store_true', - help='enforce eager mode and disable CUDA graph') - parser.add_argument( - '--kv-cache-dtype', - type=str, - choices=['auto', 'fp8', 'fp8_e5m2', 'fp8_e4m3'], - default="auto", - help='Data type for kv cache storage. If "auto", will use model ' - 'data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. 
' - 'ROCm (AMD GPU) supports fp8 (=fp8_e4m3)') - parser.add_argument( - '--quantization-param-path', - type=str, - default=None, - help='Path to the JSON file containing the KV cache scaling factors. ' - 'This should generally be supplied, when KV cache dtype is FP8. ' - 'Otherwise, KV cache scaling factors default to 1.0, which may cause ' - 'accuracy issues. FP8_E5M2 (without scaling) is only supported on ' - 'cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is ' - 'instead supported for common inference criteria.') - parser.add_argument( - '--profile', - action='store_true', - help='profile the generation process of a single batch') - parser.add_argument( - '--profile-result-dir', - type=str, - default=None, - help=('path to save the pytorch profiler output. Can be visualized ' - 'with ui.perfetto.dev or Tensorboard.')) - parser.add_argument( - "--device", - type=str, - default="auto", - choices=["auto", "cuda", "cpu", "openvino", "tpu", "xpu"], - help='device type for vLLM execution, supporting CUDA, OpenVINO and ' - 'CPU.') - parser.add_argument('--block-size', - type=int, - default=16, - help='block size of key/value cache') - parser.add_argument( - '--enable-chunked-prefill', - action='store_true', - help='If True, the prefill requests can be chunked based on the ' - 'max_num_batched_tokens') - parser.add_argument("--enable-prefix-caching", - action='store_true', - help="Enable automatic prefix caching") - parser.add_argument('--use-v2-block-manager', action='store_true') - parser.add_argument( - "--ray-workers-use-nsight", - action='store_true', - help="If specified, use nsight to profile ray workers", - ) - parser.add_argument('--download-dir', - type=str, - default=None, - help='directory to download and load the weights, ' - 'default to the default cache dir of huggingface') - parser.add_argument( - '--output-json', - type=str, - default=None, - help='Path to save the latency results in JSON format.') - parser.add_argument('--gpu-memory-utilization', - type=float, - default=0.9, - help='the fraction of GPU memory to be used for ' - 'the model executor, which can range from 0 to 1.' - 'If unspecified, will use the default value of 0.9.') - parser.add_argument( - '--load-format', - type=str, - default=EngineArgs.load_format, - choices=[ - 'auto', 'pt', 'safetensors', 'npcache', 'dummy', 'tensorizer', - 'bitsandbytes' - ], - help='The format of the model weights to load.\n\n' - '* "auto" will try to load the weights in the safetensors format ' - 'and fall back to the pytorch bin format if safetensors format ' - 'is not available.\n' - '* "pt" will load the weights in the pytorch bin format.\n' - '* "safetensors" will load the weights in the safetensors format.\n' - '* "npcache" will load the weights in pytorch format and store ' - 'a numpy cache to speed up the loading.\n' - '* "dummy" will initialize the weights with random values, ' - 'which is mainly for profiling.\n' - '* "tensorizer" will load the weights using tensorizer from ' - 'CoreWeave. See the Tensorize vLLM Model script in the Examples' - 'section for more information.\n' - '* "bitsandbytes" will load the weights using bitsandbytes ' - 'quantization.\n') - parser.add_argument( - '--distributed-executor-backend', - choices=['ray', 'mp'], - default=None, - help='Backend to use for distributed serving. 
When more than 1 GPU ' - 'is used, will be automatically set to "ray" if installed ' - 'or "mp" (multiprocessing) otherwise.') - parser.add_argument( - '--otlp-traces-endpoint', - type=str, - default=None, - help='Target URL to which OpenTelemetry traces will be sent.') - args = parser.parse_args() - main(args) \ No newline at end of file diff --git a/vLLM/GH200/build-container.sh b/vLLM/GH200/build-container.sh deleted file mode 100644 index 7deaf3c..0000000 --- a/vLLM/GH200/build-container.sh +++ /dev/null @@ -1,9 +0,0 @@ - -# Obtain your credentials by following instructions at -# https://docs.nvidia.com/ngc/gpu-cloud/ngc-private-registry-user-guide/index.html -export SINGULARITY_DOCKER_USERNAME="$oauthtoken" -export SINGULARITY_DOCKER_PASSWORD=YOUR_PASSWORD -export APPTAINER_DOCKER_USERNAME="$oauthtoken" -export APPTAINER_DOCKER_PASSWORD=YOUR_PASSWORD - -apptainer build vllm-gh200.sif vllm-gh200.def diff --git a/vLLM/GH200/power_utils.py b/vLLM/GH200/power_utils.py deleted file mode 100644 index 5d7906c..0000000 --- a/vLLM/GH200/power_utils.py +++ /dev/null @@ -1,68 +0,0 @@ -import multiprocessing -import os -import time -from py3nvml.py3nvml import nvmlDeviceGetPowerUsage, \ - nvmlDeviceGetCount, \ - nvmlDeviceGetHandleByIndex, \ - nvmlInit, \ - nvmlShutdown - -class gpuPowerProbe(object): - def __init__(self, interval, gpu_id=-1): - self.interval = multiprocessing.Value('d', interval) - self.len = int(7200/interval) - self.powers = multiprocessing.Array('d', self.len) - self.times = multiprocessing.Array('d', self.len) - self.gpu_id = multiprocessing.Value('i', gpu_id) - self.process = None - self.prevTime = multiprocessing.Value('d',time.time()) - self.halt = multiprocessing.Value('i',1) - self.count = multiprocessing.Value('i',0) - self.isrunning = multiprocessing.Value('i',0) - self.alive = multiprocessing.Value('i',0) - self.init() - - def _getGpuPower(self, powers, times, gpu_id, count, halt, alive, isrunning, prevTime, interval): - nvmlInit() - while (alive.value): - while (not halt.value): - isrunning.value = 1 - if gpu_id.value > -1: - power = nvmlDeviceGetPowerUsage(nvmlDeviceGetHandleByIndex(gpu_id.value)) - else: - power = 0 - num_gpus = nvmlDeviceGetCount() - for i in range(num_gpus): - power += nvmlDeviceGetPowerUsage(nvmlDeviceGetHandleByIndex(i)) - - new_time = time.time() - while (new_time-prevTime.value < interval.value): - new_time = time.time() - powers[count.value] = power - times[count.value] = new_time-prevTime.value - count.value += 1 - prevTime.value = new_time - isrunning.value = 0 - nvmlShutdown() - - def init(self): - self.halt.value = 1 - self.alive.value = 1 - self.process = multiprocessing.Process(target = self._getGpuPower, args = (self.powers, self.times, self.gpu_id, - self.count, self.halt, self.alive, self.isrunning, self.prevTime, self.interval)) - self.process.start() - - def start(self): - self.count.value = 0 - self.prevTime.value = time.time() - self.halt.value = 0 - - def stop(self): - self.halt.value = 1 - while (self.isrunning.value): - pass - return self.powers[:self.count.value], self.times[:self.count.value] - - def destroy(self): - self.alive.value = 0 - self.process.join() \ No newline at end of file diff --git a/vLLM/GH200/run-container.sh b/vLLM/GH200/run-container.sh deleted file mode 100644 index 8d79095..0000000 --- a/vLLM/GH200/run-container.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/bin/bash -l -#COBALT -t 6:00:00 -n 1 -q gpu_gh200 --jobname v_7b-models - -module use /soft/modulefiles/ -module load conda/2024.03.04 -source 
/soft/datascience/miniconda3/bin/activate - -apptainer exec --nv --no-mount /gpfs/jlse-fs0 \ - --bind /vast/users/sraskar/gh200/llm_research/:/vast/users/sraskar/gh200/llm_research/ \ - --bind /vast/users/sraskar/model_weights/GGUF_weights/:/vast/users/sraskar/model_weights/GGUF_weights \ - --bind /vast/users/sraskar/h100/tensorRT/trt_weights:/vast/users/sraskar/h100/tensorRT/trt_weights \ - --bind /vast/users/sraskar/mi250/hf:/vast/users/sraskar/mi250/hf vllm-gh200.sif \ - /vast/users/sraskar/gh200/llm_research/vllm/benchmarks/run-models.sh - - diff --git a/vLLM/GH200/run-models.sh b/vLLM/GH200/run-models.sh deleted file mode 100644 index c0df6be..0000000 --- a/vLLM/GH200/run-models.sh +++ /dev/null @@ -1,18 +0,0 @@ -export HF_TOKEN="your_hugging_face_token" -export HF_HOME="/hf" -export HF_DATASETS_CACHE="/hf" - - -source /vast/users/sraskar/gh200/llm_research/container-build/vllm/wheels/setup_wheels.sh -cd /vast/users/sraskar/gh200/llm_research/vllm/benchmarks/ - -for model_name in "meta-llama/Meta-Llama-3-8B";do - for tensor_parallel in 1; do - for batch_size in 1 16 32 64; do - for input_output_length in 128 256 512 1024 2048; do - python3 benchmark_throughput_power_gh200.py --device cuda --model=$model_name --tensor-parallel-size=$tensor_parallel --input-len=$input_output_length --output-len=$input_output_length --batch-size=$batch_size --dtype="float16" --trust-remote-code - done - done - done -done - diff --git a/vLLM/GH200/vllm-gh200.def b/vLLM/GH200/vllm-gh200.def deleted file mode 100644 index 89f79c4..0000000 --- a/vLLM/GH200/vllm-gh200.def +++ /dev/null @@ -1,49 +0,0 @@ -Bootstrap: docker -From: nvcr.io/nvidia/pytorch:24.06-py3 - - -%post - -NOW=`date` -echo "export NOW=\"${NOW}\"" >> $SINGULARITY_ENVIRONMENT - -mkdir /extra -mkdir /xdisk - -chown root:root /usr/lib -apt update -y && apt install -y build-essential curl openssh-server openssh-client pdsh - -pip install --upgrade pip wheel - -pip install \ - accelerate \ - deepspeed \ - openai \ - peft \ - pyarrow==14.0.2 \ - sentencepiece \ - tiktoken \ - transformers \ - trl - -pip install stanford-stk --no-deps - -pip install \ - aioprometheus \ - fastapi \ - fschat[model_worker,webui] \ - lm-format-enforcer==0.10.3 \ - outlines \ - prometheus-fastapi-instrumentator \ - protobuf==3.20.3 \ - ray==2.9.2 \ - typer==0.9.4 \ - uvicorn - - - - - -%label -This container uses nvidia ubuntu 22.04 as base and installs requirements to run vLLM on GH200. -Maintaier: Sid Raskar(sraskar@anl.gov) \ No newline at end of file diff --git a/vLLM/Gaudi2/README.MD b/vLLM/Gaudi2/README.MD deleted file mode 100644 index e69de29..0000000 diff --git a/vLLM/H100/README.MD b/vLLM/H100/README.MD deleted file mode 100644 index e69de29..0000000 diff --git a/vLLM/MI250/README.MD b/vLLM/MI250/README.MD deleted file mode 100644 index e69de29..0000000 diff --git a/vLLM/README.md b/vLLM/README.md deleted file mode 100644 index 4cf1b20..0000000 --- a/vLLM/README.md +++ /dev/null @@ -1,3 +0,0 @@ -# vLLM - -vLLM
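
The deleted `benchmark_latency.py` and `benchmark_throughput_power_gh200.py` share one measurement pattern: build a batch of random dummy token ids, generate with `ignore_eos=True` so every request emits exactly `max_tokens`, time the call with `time.perf_counter()`, and report token throughput as total prompt-plus-generated tokens divided by latency. A condensed sketch of that loop, assuming a working vLLM install; the model id, batch size, and sequence lengths are illustrative defaults, not the benchmarked configurations:

```python
# Condensed sketch of the timing loop used by the deleted vLLM benchmark scripts.
# Assumes vLLM is installed; model id, batch size, and lengths are illustrative only.
import time

import numpy as np
from vllm import LLM, SamplingParams

batch_size, input_len, output_len = 8, 32, 128

llm = LLM(model="facebook/opt-125m", dtype="float16")
sampling_params = SamplingParams(
    n=1,
    temperature=1.0,
    top_p=1.0,
    ignore_eos=True,        # force exactly output_len generated tokens per request
    max_tokens=output_len,
)

# Random token ids stand in for real prompts, as in the original scripts.
dummy_prompt_token_ids = np.random.randint(10000, size=(batch_size, input_len))
dummy_inputs = [{"prompt_token_ids": ids} for ids in dummy_prompt_token_ids.tolist()]

start = time.perf_counter()
llm.generate(dummy_inputs, sampling_params=sampling_params, use_tqdm=False)
latency = time.perf_counter() - start

# Throughput as reported by the scripts: prompt + generated tokens per second.
throughput = batch_size * (input_len + output_len) / latency
print(f"latency: {latency:.3f} s, throughput: {throughput:.1f} tokens/s")
```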
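
The deleted `vLLM/GH200/power_utils.py` defines the `gpuPowerProbe` helper that the GH200 throughput script wraps around the timed `llm.generate(...)` call to sample GPU power via NVML in a background process. A minimal usage sketch of that pattern, assuming `py3nvml` is installed, an NVIDIA GPU is visible, and `power_utils.py` is on the import path; the `time.sleep` call is only a stand-in for the real workload:

```python
# Minimal sketch of the gpuPowerProbe sampling pattern from the deleted scripts.
# Assumes py3nvml, a visible NVIDIA GPU, and power_utils.py on the import path.
import time

import numpy as np
from power_utils import gpuPowerProbe

power_profile = gpuPowerProbe(interval=0.10)  # sample total GPU power every 100 ms

power_profile.start()
time.sleep(2.0)                               # stand-in for llm.generate(...) in the real scripts
powers, power_times = power_profile.stop()    # per-sample power (milliwatts via NVML) and sample intervals
power_profile.destroy()                       # terminate the background sampling process

avg_power = np.mean(powers)                   # the scripts divide throughput by this average
print(f"{len(powers)} samples, average power: {avg_power:.1f} mW")
```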