diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml
index c1a0d175d0..9d50420e9f 100644
--- a/.github/workflows/trunk.yml
+++ b/.github/workflows/trunk.yml
@@ -351,3 +351,93 @@ jobs:
           PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/test_model.sh "${MODEL_NAME}" "${BUILD_TOOL}" "${BACKEND}"
           echo "::endgroup::"
         done
+
+  test-huggingface-transformers:
+    name: test-huggingface-transformers
+    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    secrets: inherit
+    strategy:
+      matrix:
+        hf_model_repo: [google/gemma-2b]
+      fail-fast: false
+    with:
+      secrets-env: EXECUTORCH_HF_TOKEN
+      runner: linux.12xlarge
+      docker-image: executorch-ubuntu-22.04-clang12
+      submodules: 'true'
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      timeout: 90
+      script: |
+        echo "::group::Set up ExecuTorch"
+        # The generic Linux job chooses to use base env, not the one setup by the image
+        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
+        conda activate "${CONDA_ENV}"
+        PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh cmake
+
+        echo "Installing libexecutorch.a, libextension_module.so, libportable_ops_lib.a"
+        rm -rf cmake-out
+        cmake \
+            -DCMAKE_INSTALL_PREFIX=cmake-out \
+            -DCMAKE_BUILD_TYPE=Release \
+            -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
+            -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
+            -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
+            -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \
+            -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
+            -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
+            -DEXECUTORCH_BUILD_XNNPACK=ON \
+            -DPYTHON_EXECUTABLE=python \
+            -Bcmake-out .
+        cmake --build cmake-out -j9 --target install --config Release
+
+        echo "Build llama runner"
+        dir="examples/models/llama2"
+        cmake \
+            -DCMAKE_INSTALL_PREFIX=cmake-out \
+            -DCMAKE_BUILD_TYPE=Release \
+            -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \
+            -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
+            -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
+            -DEXECUTORCH_BUILD_XNNPACK=ON \
+            -DPYTHON_EXECUTABLE=python \
+            -Bcmake-out/${dir} \
+            ${dir}
+        cmake --build cmake-out/${dir} -j9 --config Release
+        echo "::endgroup::"
+
+        echo "::group::Set up HuggingFace Dependencies"
+        pip install -U "huggingface_hub[cli]"
+        huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN
+        pip install accelerate sentencepiece
+        # TODO(guangyang): Switch to use released transformers library after all required patches are included
+        pip install "git+https://github.com/huggingface/transformers.git@6cc4dfe3f1e8d421c6d6351388e06e9b123cbfe1"
+        pip list
+        echo "::endgroup::"
+
+        echo "::group::Export to ExecuTorch"
+        TOKENIZER_FILE=tokenizer.model
+        TOKENIZER_BIN_FILE=tokenizer.bin
+        ET_MODEL_NAME=et_model
+        # Fetch the file using a Python one-liner
+        DOWNLOADED_TOKENIZER_FILE_PATH=$(python -c "
+        from huggingface_hub import hf_hub_download
+        # Download the file from the Hugging Face Hub
+        downloaded_path = hf_hub_download(
+            repo_id='${{ matrix.hf_model_repo }}',
+            filename='${TOKENIZER_FILE}'
+        )
+        print(downloaded_path)
+        ")
+        if [ -f "$DOWNLOADED_TOKENIZER_FILE_PATH" ]; then
+          echo "${TOKENIZER_FILE} downloaded successfully at: $DOWNLOADED_TOKENIZER_FILE_PATH"
+          python -m extension.llm.tokenizer.tokenizer -t $DOWNLOADED_TOKENIZER_FILE_PATH -o ./${TOKENIZER_BIN_FILE}
+          ls ./tokenizer.bin
+        else
+          echo "Failed to download ${TOKENIZER_FILE} from ${{ matrix.hf_model_repo }}."
+          exit 1
+        fi
+
+        python -m extension.export_util.export_hf_model -hfm=${{ matrix.hf_model_repo }} -o ${ET_MODEL_NAME}
+
+        cmake-out/examples/models/llama2/llama_main --model_path=${ET_MODEL_NAME}.pte --tokenizer_path=${TOKENIZER_BIN_FILE} --prompt="My name is"
+        echo "::endgroup::"
diff --git a/extension/export_util/export_hf_model.py b/extension/export_util/export_hf_model.py
new file mode 100644
index 0000000000..12ed202988
--- /dev/null
+++ b/extension/export_util/export_hf_model.py
@@ -0,0 +1,110 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import argparse
+import os
+
+import torch
+import torch.export._trace
+from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner
+from executorch.exir import EdgeCompileConfig, ExecutorchBackendConfig, to_edge
+from torch.nn.attention import SDPBackend
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers.generation.configuration_utils import GenerationConfig
+from transformers.integrations.executorch import convert_and_export_with_cache
+from transformers.modeling_utils import PreTrainedModel
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "-hfm",
+        "--hf_model_repo",
+        required=True,
+        default=None,
+        help="a valid huggingface model repo name",
+    )
+    parser.add_argument(
+        "-o",
+        "--output_name",
+        required=False,
+        default=None,
+        help="output name of the exported model",
+    )
+
+    args = parser.parse_args()
+
+    # Configs to HF model
+    device = "cpu"
+    dtype = torch.float32
+    batch_size = 1
+    max_length = 123
+    cache_implementation = "static"
+    attn_implementation = "sdpa"
+
+    # Load and configure a HF model
+    model = AutoModelForCausalLM.from_pretrained(
+        args.hf_model_repo,
+        attn_implementation=attn_implementation,
+        device_map=device,
+        torch_dtype=dtype,
+        generation_config=GenerationConfig(
+            use_cache=True,
+            cache_implementation=cache_implementation,
+            max_length=max_length,
+            cache_config={
+                "batch_size": batch_size,
+                "max_cache_len": max_length,
+            },
+        ),
+    )
+    print(f"{model.config}")
+    print(f"{model.generation_config}")
+
+    tokenizer = AutoTokenizer.from_pretrained(args.hf_model_repo)
+    input_ids = tokenizer([""], return_tensors="pt").to(device)["input_ids"]
+    cache_position = torch.tensor([0], dtype=torch.long)
+
+    def _get_constant_methods(model: PreTrainedModel):
+        return {
+            "get_dtype": 5 if model.config.torch_dtype == torch.float16 else 6,
+            "get_bos_id": model.config.bos_token_id,
+            "get_eos_id": model.config.eos_token_id,
+            "get_head_dim": model.config.hidden_size / model.config.num_attention_heads,
+            "get_max_batch_size": model.generation_config.cache_config.batch_size,
+            "get_max_seq_len": model.generation_config.cache_config.max_cache_len,
+            "get_n_bos": 1,
+            "get_n_eos": 1,
+            "get_n_kv_heads": model.config.num_key_value_heads,
+            "get_n_layers": model.config.num_hidden_layers,
+            "get_vocab_size": model.config.vocab_size,
+            "use_kv_cache": model.generation_config.use_cache,
+        }
+
+    with torch.nn.attention.sdpa_kernel([SDPBackend.MATH]), torch.no_grad():
+
+        exported_prog = convert_and_export_with_cache(model, input_ids, cache_position)
+        prog = (
+            to_edge(
+                exported_prog,
+                compile_config=EdgeCompileConfig(
+                    _check_ir_validity=False,
+                    _skip_dim_order=True,
+                ),
+                constant_methods=_get_constant_methods(model),
+            )
+            .to_backend(XnnpackPartitioner())
+            .to_executorch(ExecutorchBackendConfig(extract_delegate_segments=True))
+        )
+        out_name = args.output_name if args.output_name else model.config.model_type
+        filename = os.path.join("./", f"{out_name}.pte")
+        with open(filename, "wb") as f:
+            prog.write_to_file(f)
+        print(f"Saved exported program to {filename}")
+
+
+if __name__ == "__main__":
+    main()
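
For a quick local check of the artifact produced by export_hf_model.py, the .pte file can also be loaded through the ExecuTorch Python bindings rather than the llama_main binary built above. The snippet below is a minimal sketch, not part of this patch: it assumes a pybindings build with the XNNPACK backend enabled (executorch.extension.pybindings.portable_lib), and the placeholder token id, output indexing, and "et_model.pte" path are assumptions taken from the workflow, not guarantees of this change.

    # A minimal sketch: load the exported et_model.pte and run one decode step
    # through the ExecuTorch pybindings (assumes they were built with the
    # XNNPACK backend, matching the XnnpackPartitioner used by the exporter).
    import torch
    from executorch.extension.pybindings.portable_lib import _load_for_executorch

    module = _load_for_executorch("et_model.pte")

    # The exported "forward" mirrors the example inputs passed to
    # convert_and_export_with_cache: (input_ids, cache_position).
    input_ids = torch.tensor([[1]], dtype=torch.long)      # placeholder token id
    cache_position = torch.tensor([0], dtype=torch.long)   # position in the static KV cache
    outputs = module.forward((input_ids, cache_position))
    print(outputs[0].shape)  # logits for the supplied position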